├── .editorconfig ├── .gitignore ├── README.md ├── infogain_loss_matrix ├── Makefile ├── config └── data ├── net-audio ├── README.md ├── experiments │ └── experiments.csv ├── net.prototxt ├── plot_log.gnuplot ├── solver.prototxt ├── testing.sh └── training.sh ├── net ├── README.md ├── create_net_lstm.py ├── net.prototxt ├── plot_log.gnuplot ├── solver.prototxt ├── testing.sh ├── training.sh └── xiaoyin │ ├── A2_4_test.prototxt │ ├── A2_4_train.prototxt │ └── solver_xiaoyin.prototxt ├── paper ├── Makefile ├── chapters │ ├── acoustic_model.tex │ ├── caffe.tex │ ├── data.tex │ ├── demo.tex │ ├── evaluation.tex │ ├── fusion.tex │ ├── future.tex │ ├── introduction.tex │ ├── lexical_model.tex │ ├── parameters.tex │ └── related_work.tex ├── img │ ├── audio_parameter_eval.png │ ├── audio_parameter_eval.svg │ ├── demo_l.png │ ├── demo_l_a.png │ ├── drawing.svg │ ├── fusion_1.pdf │ ├── fusion_1.svg │ ├── fusion_2.pdf │ ├── fusion_2.svg │ ├── fusion_eval.pdf │ ├── fusion_eval.png │ ├── fusion_eval.svg │ ├── fusion_eval2.svg │ ├── hpi_logo.png │ ├── net_acoustic.pdf │ ├── net_acoustic.svg │ ├── net_lexical.pdf │ ├── net_lexical.svg │ ├── overview_accoustic.pdf │ ├── overview_accoustic.svg │ ├── overview_lexical.pdf │ ├── overview_lexical.svg │ ├── sliding_window.pdf │ ├── sliding_window.svg │ ├── window_eval.png │ ├── window_eval.svg │ ├── window_eval_width.svg │ ├── window_pos_eval.png │ ├── window_pos_eval.svg │ ├── window_wiki_eval.png │ └── window_wiki_eval.svg ├── main.bib ├── main.tex └── notes │ ├── Makefile │ ├── auswertung.ods │ ├── auswertung2.ods │ ├── auswertung_fusion.txt │ ├── auswertung_onlypos.ods │ ├── auswertung_wiki.ods │ ├── experiments.csv │ ├── experiments2.csv │ ├── fusion_eval.ods │ ├── fusion_eval.txt │ └── results.tex ├── python ├── README.md ├── common │ ├── __init__.py │ ├── argparse_util.py │ ├── sbd_config.py │ └── send_email.py ├── config.ini.default ├── console_demo │ ├── README.md │ ├── __init__.py │ ├── demo.py │ └── demo_preparation.py ├── demo_data │ ├── .gitignore │ ├── audio_examples │ │ └── .gitignore │ ├── audio_models │ │ └── .gitignore │ ├── download_all.sh │ ├── download_google_vector.sh │ ├── download_models.sh │ ├── folders.txt │ ├── lexical_models.txt │ ├── lexical_models │ │ └── .gitignore │ └── text_data │ │ └── .gitignore ├── email.ini.default ├── evaluation │ └── evaluation.py ├── evaluation_data │ ├── .gitignore │ ├── download_all.sh │ └── folders.txt ├── experiments │ ├── README.md │ ├── audio_databases.sh │ ├── audio_training.sh │ ├── databases.sh │ └── training.sh ├── parsing │ ├── __init__.py │ ├── abstract_parser.py │ ├── audio_parser.py │ ├── ctm_parser.py │ ├── get_parser.py │ ├── line_parser.py │ ├── plaintext_parser.py │ └── xml_parser.py ├── preprocessing │ ├── __init__.py │ ├── audio.py │ ├── glove_file.py │ ├── nlp_pipeline.py │ ├── sliding_window.py │ ├── text.py │ ├── tokens.py │ ├── training_instance.py │ └── word2vec_file.py ├── sbd_classification │ ├── __init__.py │ ├── audio_classification.py │ ├── classification_input.py │ ├── fusion.py │ ├── lexical_classification.py │ └── util.py ├── sbd_leveldb │ ├── __init__.py │ ├── audio_training_instance_generator.py │ ├── level_db_creator.py │ └── training_instance_generator.py ├── tools │ ├── __init__.py │ ├── comparison.py │ ├── look_into_leveldb.py │ ├── netconfig.py │ ├── parse_result.py │ └── text_converter.py └── web_demo │ ├── README.md │ ├── __init__.py │ ├── file_io.py │ ├── json_converter.py │ ├── static │ ├── main.css │ └── main.js │ ├── templates │ ├── audio_lexical.html │ └── 
index.html │ └── web.py └── requirements.txt /.editorconfig: -------------------------------------------------------------------------------- 1 | # top-most EditorConfig file 2 | root = true 3 | 4 | # Unix-style newlines with a newline ending every file 5 | [*] 6 | end_of_line = lf 7 | insert_final_newline = true 8 | charset = utf-8 9 | indent_style = space 10 | trim_trailing_whitespace = true 11 | 12 | [Makefile] 13 | indent_style = tab 14 | 15 | [*.md] 16 | trim_trailing_whitespace = false 17 | 18 | [*.py] 19 | indent_size = 4 20 | 21 | [*.sh] 22 | indent_size = 4 23 | 24 | [*.html] 25 | indent_size = 4 26 | 27 | [*.js] 28 | indent_size = 4 29 | 30 | [*.css] 31 | indent_size = 4 32 | 33 | [*.prototxt] 34 | indent_size = 2 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Sentence files 2 | train_instances.txt 3 | test_instances.txt 4 | config.ini 5 | email.ini 6 | lineparsing 7 | configurations/ 8 | audio_configurations/ 9 | auto.prototxt 10 | 11 | # Experiment files 12 | *.run 13 | 14 | # Caffe files 15 | *.tlog 16 | *.tstlog 17 | leveldbs 18 | net/experiments 19 | net/snapshots/ 20 | net-audio/experiments 21 | net-audio/snapshots/ 22 | 23 | # IDE files 24 | .idea/ 25 | *.iml 26 | __pycache__/ 27 | 28 | # Python virtual environment 29 | .env 30 | p3/ 31 | p2/ 32 | *.pyc 33 | 34 | # Database file 35 | hdf5/ 36 | infogain_loss_matrix/*.h5 37 | 38 | # LaTeX 39 | .output/ 40 | main.pdf 41 | #*.svg 42 | *.txss 43 | 44 | # Lock files 45 | .~lock.*# 46 | 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Sentence Boundary Detection using Deep Neural Networks 2 | 3 | We try to detect sentence boundaries using deep learning. 4 | Created as part of the "Practical Applications of Multimedia Retrieval" seminar at the Hasso-Plattner-Institute, Potsdam, Germany. 5 | 6 | ### Setup Demo 7 | We built a Python-based demo using Caffe. 8 | 9 | ##### Prerequisites: 10 | 1. Clone this repository 11 | 2. Install Python 2.7 and the packages from requirements.txt: 12 | 13 | `pip install -r requirements.txt` 14 | 15 | 3. Use the nltk downloader to download the `averaged_perceptron_tagger` and `punkt` models: 16 | 17 | `python -m nltk.downloader` 18 | 19 | 4. Set up Caffe as described [here](http://caffe.berkeleyvision.org/installation.html) 20 | 5. Add the path to the repository to your Python path: 21 | 22 | `export PYTHONPATH=/path/to/sentence-boundary-detection-nn/python:$PYTHONPATH` 23 | 24 | 6. Download the Google word vectors (GoogleNews-vectors-negative300.bin.gz) from [here](https://code.google.com/p/word2vec/) or directly from this [url](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing) and extract the result into the `sentence-boundary-detection-nn/python/demo_data` directory 25 | 7. Place your trained models in a demo data folder, for example `sentence-boundary-detection-nn/python/demo_data`, with the following structure: 26 | * lexical_models: containing all pretrained models you want to use, each in a separate directory. Each model needs a 27 | * .ini 28 | * .caffemodel 29 | * net.prototxt file. 30 | * text_data: containing all text files that should be available as prediction input 31 | * audio_models: containing all pretrained audio models, each in a separate directory.
Each needs the same files as described for the lexical models. 32 | * audio_examples: containing all audio files that should be available during the demo, each in a separate directory containing the ctm, energy, and pitch files. 33 | 34 | ##### Start up 35 | 36 | Change into the repository directory and execute the following command; it should work right out of the box unless you are using a custom `demo_data` folder: 37 | ``` 38 | python web_demo/web.py 39 | ``` 40 | Optionally, you can specify the location of the word vector and the demo data. Otherwise, default values are used. 41 | For further information execute: 42 | ``` 43 | python web_demo/web.py -h 44 | ``` 45 | -------------------------------------------------------------------------------- /infogain_loss_matrix/Makefile: -------------------------------------------------------------------------------- 1 | 2 | hdf5: 3 | rm -f infogain_loss_matrix.h5 4 | h5import data -c config -o infogain_loss_matrix.h5 5 | 6 | scp: 7 | scp infogain_loss_matrix.h5 sentence:/mnt/naruto/sentence/hdf5s/ 8 | -------------------------------------------------------------------------------- /infogain_loss_matrix/config: -------------------------------------------------------------------------------- 1 | RANK 4 2 | DIMENSION-SIZES 1 1 2 2 3 | INPUT-CLASS TEXTFP 4 | INPUT-SIZE 32 5 | -------------------------------------------------------------------------------- /infogain_loss_matrix/data: -------------------------------------------------------------------------------- 1 | 1.0 0.0 2 | 0.0 1.0 3 | -------------------------------------------------------------------------------- /net-audio/README.md: -------------------------------------------------------------------------------- 1 | # Training 2 | 3 | For training the acoustic neural network, you have to execute the following steps: 4 | 5 | 1. Adapt the `net.prototxt`: Change the network layout and make sure you enter the correct path to the LevelDB. 6 | 2. Adapt the `solver.prototxt`. 7 | 3. Make sure an `experiments` and a `snapshots` folder exist in this folder. 8 | 4. Execute `training.sh <experiment_name>`. 9 | 10 | The `training.sh` script does several things: 11 | 12 | * Creates a folder in the `experiments` folder with the name you gave your experiment 13 | * The following files are copied to that folder: 14 | * `config.ini`, which is located in your database folder 15 | * `net.prototxt` 16 | * `solver.prototxt` 17 | * log files from the training 18 | * Starts the training of the neural network 19 | * The latest `.solverstate` and `.caffemodel` are copied to the `experiments` folder after the training is finished 20 | * After training, different graphs are created and put into the `experiments` folder.
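The `net.prototxt` of the acoustic net uses an `InfogainLoss` layer whose class weight matrix comes from the HDF5 file built in `infogain_loss_matrix/` via `h5import` (see its `Makefile` and `config`). As a rough sketch of what that step produces, assuming `h5py` and `numpy` are available, the same file could also be written directly from Python; the dataset name `dataset0` follows the `top` of the `HDF5Data` layer in `net.prototxt`:

```python
# Sketch only: recreate infogain_loss_matrix.h5 without h5import.
# Shape (1, 1, 2, 2) mirrors RANK 4 / DIMENSION-SIZES 1 1 2 2 from the config;
# the 2x2 identity weights both classes (None, Period) equally in the loss.
import h5py
import numpy as np

matrix = np.eye(2, dtype=np.float32).reshape(1, 1, 2, 2)

with h5py.File("infogain_loss_matrix.h5", "w") as f:
    f.create_dataset("dataset0", data=matrix)  # name matches the HDF5Data top
```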
21 | -------------------------------------------------------------------------------- /net-audio/experiments/experiments.csv: -------------------------------------------------------------------------------- 1 | window_size,punctuation_position,accuracy_0,loss_1,precision_per_class_2,precision_per_class_3,recall_per_class_4,recall_per_class_5 2 | 1,0,0.927,0.176455,0.983368,0.46225,0.937801,0.771208 3 | 1,1,0.931583,0.180177,0.983442,0.4856,0.942666,0.773248 4 | 3,1,0.935583,0.178618,0.983526,0.502092,0.946979,0.771208 5 | 3,2,0.938417,0.165755,0.981621,0.517889,0.951965,0.743261 6 | 5,1,0.945083,0.162986,0.982374,0.55619,0.958478,0.751609 7 | 5,2,0.947167,0.156467,0.981449,0.571286,0.961686,0.737452 8 | 5,3,0.942833,0.161973,0.981886,0.543499,0.95651,0.745828 9 | 5,4,0.94625,0.155126,0.98064,0.567134,0.961501,0.726573 10 | 5,5,0.948333,0.162826,0.978675,0.588424,0.96576,0.699363 11 | 8,2,0.950333,0.156283,0.98064,0.596195,0.96596,0.724936 12 | 8,3,0.95,0.159289,0.980893,0.593521,0.965333,0.72914 13 | 8,4,0.950833,0.153095,0.980477,0.600427,0.966673,0.722365 14 | 8,5,0.95025,0.153548,0.981163,0.593946,0.965339,0.732304 15 | 8,6,0.94975,0.157002,0.980279,0.593023,0.965689,0.720154 16 | -------------------------------------------------------------------------------- /net-audio/net.prototxt: -------------------------------------------------------------------------------- 1 | name: "sentence_boundary_detection" 2 | # 3 | # Data 4 | # 5 | layer { 6 | name: "data" 7 | type: "Data" 8 | top: "data" 9 | top: "label" 10 | include { 11 | phase: TRAIN 12 | } 13 | data_param { 14 | source: "/home/ms2015t3/sentence-boundary-detection-nn/leveldbs/audio_window-5-3/train" 15 | batch_size: 1024 16 | backend: LEVELDB 17 | } 18 | } 19 | layer { 20 | name: "data" 21 | type: "Data" 22 | top: "data" 23 | top: "label" 24 | include { 25 | phase: TEST 26 | } 27 | data_param { 28 | source: "/home/ms2015t3/sentence-boundary-detection-nn/leveldbs/audio_window-5-3/test" 29 | batch_size: 12000 30 | backend: LEVELDB 31 | } 32 | } 33 | layer { 34 | name: "infogain_loss_matrix" 35 | type: "HDF5Data" 36 | top: "dataset0" 37 | hdf5_data_param { 38 | source: "/mnt/naruto/sentence/hdf5s/infogain_loss_matrix.txt" 39 | batch_size: 1 40 | } 41 | } 42 | # 43 | # Fully Connected Layer 1 44 | # 45 | layer { 46 | name: "fc1" 47 | type: "InnerProduct" 48 | bottom: "data" 49 | top: "fc1" 50 | inner_product_param { 51 | num_output: 2048 52 | weight_filler { 53 | type: "xavier" 54 | } 55 | bias_filler { 56 | type: "constant" 57 | } 58 | } 59 | } 60 | layer { 61 | name: "relu1" 62 | type: "ReLU" 63 | bottom: "fc1" 64 | top: "fc1" 65 | } 66 | layer { 67 | name: "drop1" 68 | type: "Dropout" 69 | bottom: "fc1" 70 | top: "fc1" 71 | dropout_param { 72 | dropout_ratio: 0.5 73 | } 74 | } 75 | # 76 | # Fully Connected Layer 2 77 | # 78 | layer { 79 | name: "fc2" 80 | type: "InnerProduct" 81 | bottom: "fc1" 82 | top: "fc2" 83 | inner_product_param { 84 | num_output: 4096 85 | weight_filler { 86 | type: "xavier" 87 | } 88 | bias_filler { 89 | type: "constant" 90 | } 91 | } 92 | } 93 | layer { 94 | name: "relu2" 95 | type: "ReLU" 96 | bottom: "fc2" 97 | top: "fc2" 98 | } 99 | layer { 100 | name: "drop2" 101 | type: "Dropout" 102 | bottom: "fc2" 103 | top: "fc2" 104 | dropout_param { 105 | dropout_ratio: 0.5 106 | } 107 | } 108 | 109 | # 110 | # Fully Connected Layer 3 111 | # 112 | layer { 113 | name: "fc3" 114 | type: "InnerProduct" 115 | bottom: "fc2" 116 | top: "fc3" 117 | inner_product_param { 118 | num_output: 2048 119 | weight_filler { 120 | 
type: "xavier" 121 | } 122 | bias_filler { 123 | type: "constant" 124 | } 125 | } 126 | } 127 | layer { 128 | name: "relu3" 129 | type: "ReLU" 130 | bottom: "fc3" 131 | top: "fc3" 132 | } 133 | # layer { 134 | # name: "drop3" 135 | # type: "Dropout" 136 | # bottom: "fc3" 137 | # top: "fc3" 138 | # dropout_param { 139 | # dropout_ratio: 0.5 140 | # } 141 | # } 142 | 143 | # 144 | # Fully Connected Layer Final - Preparation for Output 145 | # 146 | layer { 147 | name: "fc_final" 148 | type: "InnerProduct" 149 | bottom: "fc3" 150 | top: "fc_final" 151 | inner_product_param { 152 | num_output: 2 153 | weight_filler { 154 | type: "xavier" 155 | } 156 | bias_filler { 157 | type: "constant" 158 | } 159 | } 160 | } 161 | 162 | # 163 | # Loss, Accuracy 164 | # 165 | layer { 166 | name: "softmax" 167 | type: "Softmax" 168 | bottom: "fc_final" 169 | top: "softmax" 170 | } 171 | layer { 172 | name: "loss" 173 | type: "InfogainLoss" 174 | bottom: "softmax" 175 | bottom: "label" 176 | bottom: "dataset0" 177 | top: "loss" 178 | } 179 | layer { 180 | name: "accuracy" 181 | type: "Accuracy" 182 | bottom: "fc_final" 183 | bottom: "label" 184 | top: "accuracy" 185 | top: "recall_per_class" 186 | top: "precision_per_class" 187 | include { 188 | phase: TEST 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /net-audio/plot_log.gnuplot: -------------------------------------------------------------------------------- 1 | # Please generate the neccessary data files with 2 | # /path/to/caffe/tools/extra/parse_log.sh before plotting. 3 | # Example usage: 4 | # ./parse_log.sh mnist.log 5 | # Now you have mnist.log.train and mnist.log.test. 6 | # gnuplot mnist.gnuplot 7 | 8 | # The fields present in the data files that are usually proper to plot along 9 | # the y axis are test accuracy, test loss, training loss, and learning rate. 10 | # Those should plot along the x axis are training iterations and seconds. 11 | # Possible combinations: 12 | # 1. Test accuracy (test score 0) vs. training iterations / time; 13 | # 2. Test loss (test score 1) time; 14 | # 3. Training loss vs. training iterations / time; 15 | # 4. Learning rate vs. training iterations / time; 16 | # A rarer one: Training time vs. iterations. 17 | 18 | reset 19 | #set terminal dumb 20 | set style data lines 21 | set key right center 22 | 23 | file(test_or_train) = sprintf("%s.%s", filename, test_or_train) 24 | ucf_101_title = "Learning on six classes of UCF 101" 25 | 26 | ###### Fields in the training data 27 | ###### Iters Seconds TrainingLoss LearningRate 28 | 29 | # Training loss vs. training iterations 30 | set terminal png 31 | set output "it_vs_train-loss.png" 32 | set title "Training loss vs. training iterations" 33 | set xlabel "Training iterations" 34 | set ylabel "Training loss" 35 | plot file("train") using 1:3 title "loss" 36 | 37 | # Training loss vs. training time 38 | #set terminal png 39 | #set output "time_vs_train-loss.png" 40 | #set title "Training time vs. training loss" 41 | #set xlabel "Training time" 42 | #set ylabel "Training loss" 43 | #plot file("train") using 2:3 title "loss" 44 | 45 | # Learning rate vs. training iterations; 46 | set terminal png 47 | set output "it_vs_lr.png" 48 | set xlabel "Training iterations" 49 | set ylabel "Learning rate" 50 | plot file("train") using 1:4 title "learning rate" 51 | 52 | ###### Fields in the test data 53 | ###### Iters Seconds TestAccuracy TestLoss 54 | 55 | # Test loss vs. 
training iterations 56 | set terminal png 57 | set output "it_vs_test-acc.png" 58 | set title "Training iterations vs. test accuracy" 59 | set xlabel "Training iterations" 60 | set ylabel "Test accuracy" 61 | plot file("test") using 1:3 title "accuracy" 62 | -------------------------------------------------------------------------------- /net-audio/solver.prototxt: -------------------------------------------------------------------------------- 1 | net: "auto.prototxt" 2 | 3 | # Test before training? 4 | test_initialization: true 5 | # Test every nth iteration 6 | test_interval: 10000 7 | # How many iterations per test 8 | test_iter: 1 9 | 10 | # Base learning rate 11 | base_lr: 0.00001 12 | # Policy for changing the learning rate - multiply by gamma every stepsize iterations 13 | lr_policy: "step" 14 | gamma: 0.1 15 | stepsize: 300000 16 | momentum: 0.9 17 | # Regularization parameter for the weights 18 | weight_decay: 0.0005 19 | 20 | # Display training loss every nth iteration 21 | display: 200 22 | # After how many iterations to stop 23 | max_iter: 100000 24 | 25 | # Snapshot every nth iteration in the specified directory 26 | snapshot: 100000 27 | snapshot_prefix: "snapshots/" 28 | 29 | random_seed: 1701 30 | # Display the loss averaged over the last average_loss iterations - this does not work for accuracy 31 | average_loss: 100 32 | #clip_gradients: 10 33 | 34 | # GPU for the win! 35 | solver_mode: GPU 36 | -------------------------------------------------------------------------------- /net-audio/testing.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | PROJECT="sentence" 4 | TESTING_LOG_NAME="${PROJECT}.tstlog" 5 | 6 | # Check if called with name 7 | if [ $# -ne 1 ]; then 8 | echo "Usage: $0 [experiment_name]" 9 | echo " experiment_name: Name of the subfolder in ./experiments/ for the current experiment." 10 | echo "Exiting." 11 | exit 1 12 | fi 13 | 14 | # We need the output/error redirection, because caffe outputs to standard error, and we want to pipe to grep's standard in 15 | # See http://stackoverflow.com/questions/1507816/with-bash-how-can-i-pipe-standard-error-into-another-process 16 | ($CAFFE_ROOT/build/tools/caffe test -model net.prototxt -weights experiments/$1/*.caffemodel -iterations 1 3>&1 1>&2- 2>&3-) | grep --invert-match "Waiting for data" > $TESTING_LOG_NAME 17 | 18 | -------------------------------------------------------------------------------- /net-audio/training.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Check if called with name 4 | if [ $# -ne 1 ]; then 5 | echo "Usage: $0 [experiment_name]" 6 | echo " experiment_name: Name of the subfolder in ./experiments/ for the current experiment." 7 | echo "Exiting." 
8 | exit 1 9 | fi 10 | 11 | PROJECT="audio" 12 | SOLVER="solver.prototxt" 13 | # Find out net from the solver 14 | NET=$(grep --only-matching "\w\+\.prototxt" solver.prototxt) 15 | DATABASE=$(python $SENTENCE_HOME/python/tools/netconfig.py -p $NET) 16 | 17 | echo "Using solver ${SOLVER} with net ${NET} and database ${DATABASE}" 18 | 19 | # Set Vars 20 | DATE=`date +%Y%m%d-%H%M%S` 21 | FOLDER_NAME="${DATE}_$1" 22 | TRAINING_LOG_NAME="${PROJECT}_${NET}.tlog" 23 | 24 | echo "Saving experiment in experiments/$FOLDER_NAME" 25 | mkdir experiments/$FOLDER_NAME 26 | 27 | # Function for saving results and making plots 28 | function cleanup() { 29 | echo $1 30 | 31 | echo "Copying snapshots" 32 | ls -v -1 snapshots/ | tail -n 2 | xargs -i mv snapshots/{} experiments/$FOLDER_NAME 33 | 34 | echo "Parsing logs" 35 | $CAFFE_ROOT/tools/extra/parse_log.sh $TRAINING_LOG_NAME 36 | 37 | echo "Copying logs" 38 | cp $TRAINING_LOG_NAME $TRAINING_LOG_NAME.train $TRAINING_LOG_NAME.test experiments/$FOLDER_NAME 39 | 40 | echo "Building plots" 41 | gnuplot -e "filename='$TRAINING_LOG_NAME'" -p plot_log.gnuplot 42 | mv *.png experiments/$FOLDER_NAME 43 | 44 | rm ${TRAINING_LOG_NAME}.test ${TRAINING_LOG_NAME}.train 45 | echo "Clean up finished" 46 | } 47 | 48 | # Clean snapshots 49 | rm snapshots/* 2> /dev/null 50 | 51 | # Saving setup 52 | cp *.prototxt $SOLVER training.sh experiments/$FOLDER_NAME 53 | # Copy database configuration 54 | cp $DATABASE/*.ini experiments/$FOLDER_NAME 55 | 56 | # Setting interrupt trap 57 | trap 'cleanup "Training interrupted"; exit 1' INT 58 | 59 | # Calling caffe 60 | # export CAFFE_ROOT="$HOME/caffe-tmbo" 61 | 62 | $CAFFE_ROOT/build/tools/caffe train \ 63 | -solver ./experiments/$FOLDER_NAME/$SOLVER 2> $TRAINING_LOG_NAME 64 | 65 | # Check if Training successful 66 | if [ $? -ne 0 ]; then 67 | # Send Email Notification 68 | cd "${SENTENCE_HOME}/python" 69 | python "common/send_email.py" "Training failed" "$FOLDER_NAME" "../net-audio/$TRAINING_LOG_NAME" 70 | cd - 71 | echo "Training not successful. Exiting." 72 | 73 | # Resetting interrupt handling 74 | trap - INT 75 | exit 2 76 | fi 77 | 78 | # Resetting interrupt handling 79 | trap - INT 80 | 81 | cleanup "Training finished" 82 | 83 | -------------------------------------------------------------------------------- /net/README.md: -------------------------------------------------------------------------------- 1 | # Training 2 | 3 | For training the lexical neural network, you have to execute the following steps: 4 | 5 | 1. Adapt the `net.prototxt`: Change the network layout and make sure you enter the correct path to the LevelDB. 6 | 2. Adapt the `solver.prototxt`. 7 | 3. Make sure an `experiments` and a `snapshots` folder exist in this folder. 8 | 4. Execute `training.sh <experiment_name>`. 9 | 10 | The `training.sh` script does several things: 11 | 12 | * Creates a folder in the `experiments` folder with the name you gave your experiment 13 | * The following files are copied to that folder: 14 | * `config.ini`, which is located in your database folder 15 | * `net.prototxt` 16 | * `solver.prototxt` 17 | * log files from the training 18 | * Starts the training of the neural network 19 | * The latest `.solverstate` and `.caffemodel` are copied to the `experiments` folder after the training is finished 20 | * After training, different graphs are created and put into the `experiments` folder.
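The most common pitfall is step 1: a stale LevelDB path in `net.prototxt`. As a quick, hand-rolled sanity check (a sketch, not part of the repository's tooling, which instead uses `tools/netconfig.py`), the referenced database paths can be listed before a run is started:

```python
# Sketch: print every LevelDB "source:" path referenced by net.prototxt and
# whether it exists, so a wrong or missing database is caught before training.
import os
import re

with open("net.prototxt") as f:
    for source in re.findall(r'source:\s*"([^"]+)"', f.read()):
        print(source, "exists" if os.path.exists(source) else "MISSING")
```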
21 | -------------------------------------------------------------------------------- /net/net.prototxt: -------------------------------------------------------------------------------- 1 | name: "sentence_boundary_detection" 2 | # 3 | # Data 4 | # 5 | layer { 6 | name: "data" 7 | type: "Data" 8 | top: "data" 9 | top: "label" 10 | include { 11 | phase: TRAIN 12 | } 13 | data_param { 14 | source: "/home/ms2015t3/sentence-boundary-detection-nn/leveldbs/google_ted_wiki_window-8-4_pos-true_qm-false_balanced-false_nr-rep-true_word-this_wiki-test/train" 15 | batch_size: 128 16 | backend: LEVELDB 17 | } 18 | } 19 | layer { 20 | name: "data" 21 | type: "Data" 22 | top: "data" 23 | top: "label" 24 | include { 25 | phase: TEST 26 | } 27 | data_param { 28 | source: "/home/ms2015t3/sentence-boundary-detection-nn/leveldbs/google_ted_wiki_window-8-4_pos-true_qm-false_balanced-false_nr-rep-true_word-this_wiki-test/test" 29 | batch_size: 12000 30 | backend: LEVELDB 31 | } 32 | } 33 | # 34 | # Fully Connected Layer 1 35 | # 36 | layer { 37 | name: "fc1" 38 | type: "InnerProduct" 39 | bottom: "data" 40 | top: "fc1" 41 | inner_product_param { 42 | num_output: 2048 43 | weight_filler { 44 | type: "xavier" 45 | } 46 | bias_filler { 47 | type: "constant" 48 | } 49 | } 50 | } 51 | layer { 52 | name: "relu1" 53 | type: "ReLU" 54 | bottom: "fc1" 55 | top: "fc1" 56 | } 57 | layer { 58 | name: "drop1" 59 | type: "Dropout" 60 | bottom: "fc1" 61 | top: "fc1" 62 | dropout_param { 63 | dropout_ratio: 0.5 64 | } 65 | } 66 | # 67 | # Fully Connected Layer 2 68 | # 69 | layer { 70 | name: "fc2" 71 | type: "InnerProduct" 72 | bottom: "fc1" 73 | top: "fc2" 74 | inner_product_param { 75 | num_output: 4096 76 | weight_filler { 77 | type: "xavier" 78 | } 79 | bias_filler { 80 | type: "constant" 81 | } 82 | } 83 | } 84 | layer { 85 | name: "relu2" 86 | type: "ReLU" 87 | bottom: "fc2" 88 | top: "fc2" 89 | } 90 | layer { 91 | name: "drop2" 92 | type: "Dropout" 93 | bottom: "fc2" 94 | top: "fc2" 95 | dropout_param { 96 | dropout_ratio: 0.5 97 | } 98 | } 99 | # 100 | # Fully Connected Layer 3 101 | # 102 | layer { 103 | name: "fc3" 104 | type: "InnerProduct" 105 | bottom: "fc2" 106 | top: "fc3" 107 | inner_product_param { 108 | num_output: 2048 109 | weight_filler { 110 | type: "xavier" 111 | } 112 | bias_filler { 113 | type: "constant" 114 | } 115 | } 116 | } 117 | layer { 118 | name: "relu3" 119 | type: "ReLU" 120 | bottom: "fc3" 121 | top: "fc3" 122 | } 123 | # layer { 124 | # name: "drop3" 125 | # type: "Dropout" 126 | # bottom: "fc3" 127 | # top: "fc3" 128 | # dropout_param { 129 | # dropout_ratio: 0.5 130 | # } 131 | # } 132 | 133 | # 134 | # Fully Connected Layer Final - Preparation for Output 135 | # 136 | layer { 137 | name: "fc_final" 138 | type: "InnerProduct" 139 | bottom: "fc3" 140 | top: "fc_final" 141 | inner_product_param { 142 | num_output: 3 143 | weight_filler { 144 | type: "xavier" 145 | } 146 | bias_filler { 147 | type: "constant" 148 | } 149 | } 150 | } 151 | 152 | # 153 | # Loss, Accuracy 154 | # 155 | layer { 156 | name: "loss" 157 | type: "SoftmaxWithLoss" 158 | bottom: "fc_final" 159 | bottom: "label" 160 | top: "loss" 161 | } 162 | layer { 163 | name: "accuracy" 164 | type: "Accuracy" 165 | bottom: "fc_final" 166 | bottom: "label" 167 | top: "accuracy" 168 | top: "recall_per_class" 169 | top: "precision_per_class" 170 | include { 171 | phase: TEST 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /net/plot_log.gnuplot: 
-------------------------------------------------------------------------------- 1 | # Please generate the neccessary data files with 2 | # /path/to/caffe/tools/extra/parse_log.sh before plotting. 3 | # Example usage: 4 | # ./parse_log.sh mnist.log 5 | # Now you have mnist.log.train and mnist.log.test. 6 | # gnuplot mnist.gnuplot 7 | 8 | # The fields present in the data files that are usually proper to plot along 9 | # the y axis are test accuracy, test loss, training loss, and learning rate. 10 | # Those should plot along the x axis are training iterations and seconds. 11 | # Possible combinations: 12 | # 1. Test accuracy (test score 0) vs. training iterations / time; 13 | # 2. Test loss (test score 1) time; 14 | # 3. Training loss vs. training iterations / time; 15 | # 4. Learning rate vs. training iterations / time; 16 | # A rarer one: Training time vs. iterations. 17 | 18 | reset 19 | #set terminal dumb 20 | set style data lines 21 | set key right center 22 | 23 | file(test_or_train) = sprintf("%s.%s", filename, test_or_train) 24 | ucf_101_title = "Learning on six classes of UCF 101" 25 | 26 | ###### Fields in the training data 27 | ###### Iters Seconds TrainingLoss LearningRate 28 | 29 | # Training loss vs. training iterations 30 | set terminal png 31 | set output "it_vs_train-loss.png" 32 | set title "Training loss vs. training iterations" 33 | set xlabel "Training iterations" 34 | set ylabel "Training loss" 35 | plot file("train") using 1:3 title "loss" 36 | 37 | # Training loss vs. training time 38 | #set terminal png 39 | #set output "time_vs_train-loss.png" 40 | #set title "Training time vs. training loss" 41 | #set xlabel "Training time" 42 | #set ylabel "Training loss" 43 | #plot file("train") using 2:3 title "loss" 44 | 45 | # Learning rate vs. training iterations; 46 | set terminal png 47 | set output "it_vs_lr.png" 48 | set xlabel "Training iterations" 49 | set ylabel "Learning rate" 50 | plot file("train") using 1:4 title "learning rate" 51 | 52 | ###### Fields in the test data 53 | ###### Iters Seconds TestAccuracy TestLoss 54 | 55 | # Test loss vs. training iterations 56 | set terminal png 57 | set output "it_vs_test-acc.png" 58 | set title "Training iterations vs. test accuracy" 59 | set xlabel "Training iterations" 60 | set ylabel "Test accuracy" 61 | plot file("test") using 1:3 title "accuracy" 62 | -------------------------------------------------------------------------------- /net/solver.prototxt: -------------------------------------------------------------------------------- 1 | net: "net.prototxt" 2 | 3 | # Test before training? 4 | test_initialization: true 5 | # Test every nth iteration 6 | test_interval: 5000 7 | # How many iterations per test 8 | test_iter: 1 9 | 10 | # Base learning rate 11 | base_lr: 0.01 12 | # Policy for changing the learning rate - multiply by gamma every stepsize iterations 13 | lr_policy: "step" 14 | gamma: 0.1 15 | stepsize: 300000 16 | momentum: 0.9 17 | # Regularization parameter for the weights 18 | weight_decay: 0.0005 19 | 20 | # Display training loss every nth iteration 21 | display: 200 22 | # After how many iterations to stop 23 | max_iter: 125000 24 | 25 | # Snapshot every nth iteration in the specified directory 26 | snapshot: 100000 27 | snapshot_prefix: "snapshots/" 28 | 29 | random_seed: 1701 30 | # Display the loss averaged over the last average_loss iterations - this does not work for accuracy 31 | average_loss: 100 32 | #clip_gradients: 10 33 | 34 | # GPU for the win! 
35 | solver_mode: GPU 36 | -------------------------------------------------------------------------------- /net/testing.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | PROJECT="sentence" 4 | TESTING_LOG_NAME="${PROJECT}.tstlog" 5 | 6 | # Check if called with name 7 | if [ $# -ne 1 ]; then 8 | echo "Usage: $0 [experiment_name]" 9 | echo " experiment_name: Name of the subfolder in ./experiments/ for the current experiment." 10 | echo "Exiting." 11 | exit 1 12 | fi 13 | 14 | # We need the output/error redirection, because caffe outputs to standard error, and we want to pipe to grep's standard in 15 | # See http://stackoverflow.com/questions/1507816/with-bash-how-can-i-pipe-standard-error-into-another-process 16 | ($CAFFE_ROOT/build/tools/caffe test -model net.prototxt -weights experiments/$1/*.caffemodel -iterations 1 3>&1 1>&2- 2>&3-) | grep --invert-match "Waiting for data" > $TESTING_LOG_NAME 17 | 18 | -------------------------------------------------------------------------------- /net/training.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Check if called with name 4 | if [ $# -ne 1 ]; then 5 | echo "Usage: $0 [experiment_name]" 6 | echo " experiment_name: Name of the subfolder in ./experiments/ for the current experiment." 7 | echo "Exiting." 8 | exit 1 9 | fi 10 | 11 | PROJECT="sentence" 12 | SOLVER="solver.prototxt" 13 | # Find out net from the solver 14 | NET=$(grep --only-matching "\w\+\.prototxt" solver.prototxt) 15 | #DATABASE=$(python $SENTENCE_HOME/python/tools/netconfig.py -p $NET) 16 | 17 | echo "Using solver ${SOLVER} with net ${NET} and database ${DATABASE}" 18 | 19 | # Set Vars 20 | DATE=`date +%Y%m%d-%H%M%S` 21 | FOLDER_NAME="${DATE}_$1" 22 | TRAINING_LOG_NAME="${PROJECT}_${NET}.tlog" 23 | 24 | echo "Saving experiment in experiments/$FOLDER_NAME" 25 | mkdir experiments/$FOLDER_NAME 26 | 27 | # Function for saving results and making plots 28 | function cleanup() { 29 | echo $1 30 | 31 | echo "Copying snapshots" 32 | ls -v -1 snapshots/ | tail -n 2 | xargs -i mv snapshots/{} experiments/$FOLDER_NAME 33 | 34 | echo "Parsing logs" 35 | $CAFFE_ROOT/tools/extra/parse_log.sh $TRAINING_LOG_NAME 36 | 37 | echo "Copying logs" 38 | cp $TRAINING_LOG_NAME $TRAINING_LOG_NAME.train $TRAINING_LOG_NAME.test experiments/$FOLDER_NAME 39 | 40 | echo "Building plots" 41 | gnuplot -e "filename='$TRAINING_LOG_NAME'" -p plot_log.gnuplot 42 | mv *.png experiments/$FOLDER_NAME 43 | 44 | rm ${TRAINING_LOG_NAME}.test ${TRAINING_LOG_NAME}.train 45 | echo "Clean up finished" 46 | } 47 | 48 | # Clean snapshots 49 | rm snapshots/* 2> /dev/null 50 | 51 | # Saving setup 52 | cp *.prototxt $SOLVER training.sh experiments/$FOLDER_NAME 53 | # Copy database configuration 54 | #cp $DATABASE/*.ini experiments/$FOLDER_NAME 55 | 56 | # Setting interrupt trap 57 | trap 'cleanup "Training interrupted"; exit 1' INT 58 | 59 | # Calling caffe 60 | # export CAFFE_ROOT="$HOME/caffe-tmbo" 61 | 62 | $CAFFE_ROOT/build/tools/caffe train \ 63 | -solver ./experiments/$FOLDER_NAME/$SOLVER 2> $TRAINING_LOG_NAME 64 | 65 | # Check if Training successful 66 | if [ $? -ne 0 ]; then 67 | # Send Email Notification 68 | cd "${SENTENCE_HOME}/python" 69 | python "common/send_email.py" "Training failed" "$FOLDER_NAME" "../net/$TRAINING_LOG_NAME" 70 | cd - 71 | echo "Training not successful. Exiting." 
72 | 73 | # Resetting interrupt handling 74 | trap - INT 75 | exit 2 76 | fi 77 | 78 | # Resetting interrupt handling 79 | trap - INT 80 | 81 | cleanup "Training finished" 82 | 83 | -------------------------------------------------------------------------------- /net/xiaoyin/A2_4_test.prototxt: -------------------------------------------------------------------------------- 1 | layer { 2 | name: "data" 3 | type: "HDF5Data" 4 | top: "data" 5 | top: "label" 6 | hdf5_data_param { 7 | source: "/home/ms2015t3/sentence-boundary-detection-nn/net/test_hdf5s" 8 | batch_size: 12000 9 | } 10 | } 11 | layer { 12 | name: "reshape" 13 | type: "Reshape" 14 | bottom: "data" 15 | top: "dataR" 16 | reshape_param { 17 | shape { 18 | dim: 0 # copy the dimension from below 19 | dim: 1 20 | dim: 5 21 | dim: -1 # infer it from the other dimensions 22 | } 23 | } 24 | } 25 | layer { 26 | name: "ip1" 27 | type: "InnerProduct" 28 | bottom: "dataR" 29 | top: "ip1" 30 | inner_product_param { 31 | num_output: 2048 32 | weight_filler { 33 | type: "xavier" 34 | } 35 | } 36 | } 37 | layer { 38 | name: "relu1" 39 | type: "ReLU" 40 | bottom: "ip1" 41 | top: "ip1" 42 | } 43 | layer { 44 | name: "dropout1" 45 | type: "Dropout" 46 | bottom: "ip1" 47 | top: "ip1" 48 | dropout_param { 49 | dropout_ratio: 0.5 50 | } 51 | } 52 | 53 | layer { 54 | name: "ip2" 55 | type: "InnerProduct" 56 | bottom: "ip1" 57 | top: "ip2" 58 | inner_product_param { 59 | num_output: 4096 60 | weight_filler { 61 | type: "xavier" 62 | } 63 | } 64 | } 65 | layer { 66 | name: "relu2" 67 | type: "ReLU" 68 | bottom: "ip2" 69 | top: "ip2" 70 | } 71 | layer { 72 | name: "dropout2" 73 | type: "Dropout" 74 | bottom: "ip2" 75 | top: "ip2" 76 | dropout_param { 77 | dropout_ratio: 0.5 78 | } 79 | } 80 | 81 | layer { 82 | name: "ip3" 83 | type: "InnerProduct" 84 | bottom: "ip2" 85 | top: "ip3" 86 | inner_product_param { 87 | num_output: 2048 88 | weight_filler { 89 | type: "xavier" 90 | } 91 | } 92 | } 93 | layer { 94 | name: "relu3" 95 | type: "ReLU" 96 | bottom: "ip3" 97 | top: "ip3" 98 | } 99 | layer { 100 | name: "ip4" 101 | type: "InnerProduct" 102 | bottom: "ip3" 103 | top: "ip4" 104 | inner_product_param { 105 | num_output: 4 106 | weight_filler { 107 | type: "xavier" 108 | } 109 | } 110 | } 111 | layer { 112 | name: "accuracy" 113 | type: "Accuracy" 114 | bottom: "ip4" 115 | bottom: "label" 116 | top: "accuracy" 117 | top: "recall_per_class" 118 | top: "precision_per_class" 119 | } 120 | layer { 121 | name: "loss" 122 | type: "SoftmaxWithLoss" 123 | bottom: "ip4" 124 | bottom: "label" 125 | top: "loss" 126 | } 127 | -------------------------------------------------------------------------------- /net/xiaoyin/A2_4_train.prototxt: -------------------------------------------------------------------------------- 1 | layer { 2 | name: "data" 3 | type: "HDF5Data" 4 | top: "data" 5 | top: "label" 6 | hdf5_data_param { 7 | source: "/home/ms2015t3/sentence-boundary-detection-nn/net/train_hdf5s" 8 | batch_size: 256 9 | } 10 | } 11 | layer { 12 | name: "reshape" 13 | type: "Reshape" 14 | bottom: "data" 15 | top: "dataR" 16 | reshape_param { 17 | shape { 18 | dim: 0 # copy the dimension from below 19 | dim: 1 20 | dim: 5 21 | dim: -1 # infer it from the other dimensions 22 | } 23 | } 24 | } 25 | layer { 26 | name: "ip1" 27 | type: "InnerProduct" 28 | bottom: "dataR" 29 | top: "ip1" 30 | inner_product_param { 31 | num_output: 2048 32 | weight_filler { 33 | type: "xavier" 34 | } 35 | } 36 | } 37 | layer { 38 | name: "relu1" 39 | type: "ReLU" 40 | bottom: "ip1" 41 | top: 
"ip1" 42 | } 43 | layer { 44 | name: "dropout1" 45 | type: "Dropout" 46 | bottom: "ip1" 47 | top: "ip1" 48 | dropout_param { 49 | dropout_ratio: 0.5 50 | } 51 | } 52 | 53 | layer { 54 | name: "ip2" 55 | type: "InnerProduct" 56 | bottom: "ip1" 57 | top: "ip2" 58 | inner_product_param { 59 | num_output: 4096 60 | weight_filler { 61 | type: "xavier" 62 | } 63 | } 64 | } 65 | layer { 66 | name: "relu2" 67 | type: "ReLU" 68 | bottom: "ip2" 69 | top: "ip2" 70 | } 71 | layer { 72 | name: "dropout2" 73 | type: "Dropout" 74 | bottom: "ip2" 75 | top: "ip2" 76 | dropout_param { 77 | dropout_ratio: 0.5 78 | } 79 | } 80 | 81 | layer { 82 | name: "ip3" 83 | type: "InnerProduct" 84 | bottom: "ip2" 85 | top: "ip3" 86 | inner_product_param { 87 | num_output: 2048 88 | weight_filler { 89 | type: "xavier" 90 | } 91 | } 92 | } 93 | layer { 94 | name: "relu3" 95 | type: "ReLU" 96 | bottom: "ip3" 97 | top: "ip3" 98 | } 99 | layer { 100 | name: "ip4" 101 | type: "InnerProduct" 102 | bottom: "ip3" 103 | top: "ip4" 104 | inner_product_param { 105 | num_output: 4 106 | weight_filler { 107 | type: "xavier" 108 | } 109 | } 110 | } 111 | layer { 112 | name: "accuracy" 113 | type: "Accuracy" 114 | bottom: "ip4" 115 | bottom: "label" 116 | top: "accuracy" 117 | } 118 | layer { 119 | name: "loss" 120 | type: "SoftmaxWithLoss" 121 | bottom: "ip4" 122 | bottom: "label" 123 | top: "loss" 124 | } 125 | -------------------------------------------------------------------------------- /net/xiaoyin/solver_xiaoyin.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "A2_4_train.prototxt" 2 | test_net: "A2_4_test.prototxt" 3 | test_iter: 1 4 | test_interval: 5000 5 | base_lr: 0.01 6 | lr_policy: "step" 7 | gamma: 0.1 8 | stepsize: 300000 9 | display: 100 10 | max_iter: 1000000 11 | momentum: 0.9 12 | weight_decay: 0.0005 13 | snapshot: 100000 14 | snapshot_prefix: "snapshots/" 15 | solver_mode: GPU 16 | 17 | # ./build/tools/caffe train -solver xyche/solver.prototxt 18 | 19 | # ./build/tools/caffe test -model xyche/prototxt/C2/C2_test.prototxt -weights xyche/snapshots/C1_50d_iter_100000.caffemodel -iterations 400 20 | 21 | # ./build/tools/caffe test -model xyche/prototxt/C1/C1_test_output.prototxt -weights xyche/snapshots/C1_50d_iter_100000.caffemodel -iterations 1 22 | -------------------------------------------------------------------------------- /paper/Makefile: -------------------------------------------------------------------------------- 1 | PDFVIEWER=evince 2 | FILE=main 3 | 4 | show: build 5 | $(PDFVIEWER) $(FILE).pdf & 6 | 7 | plot: 8 | cd plots && build 9 | 10 | when-changed: 11 | @clear && when-changed chapters/ -c 'printf "\033c" && echo "Building" && make -s build && echo "Succeeded"' 12 | 13 | convert: 14 | cd plots && make convert 15 | 16 | build: 17 | @mkdir -p .output 18 | @pdflatex -interaction=nonstopmode -halt-on-error -output-directory .output -jobname=$(FILE) $(FILE).tex 1>&2 > .output/error 19 | @mv .output/$(FILE).pdf . 20 | 21 | error: 22 | @vim + .output/error 23 | 24 | 25 | bibtex: 26 | @cp $(FILE).bib .output/ && cd .output && bibtex $(FILE) && cd .. 27 | 28 | clean: 29 | rm .output/*.aux .output/*.log 30 | -------------------------------------------------------------------------------- /paper/chapters/acoustic_model.tex: -------------------------------------------------------------------------------- 1 | Besides the lexical model, we also use an acoustic model to predict the punctuation. 
2 | The acoustic model is based on prosodic features, such as pauses and pitch levels. 3 | In the following, the training and evaluation of the acoustic model are described in detail. 4 | 5 | \subsection{Training Instance Generation} 6 | 7 | Many researchers use pauses, pitch levels, and energy levels for predicting punctuation. 8 | The pitch level encodes the frequency of the speaker's voice, whereas the energy level describes the amount of power the speaker puts into it. 9 | To obtain those values from the \texttt{.sph} files in our data set, we first have to convert those files into \texttt{.wav} files. 10 | For that, we used a sound processing program called \emph{SoX}. 11 | Having the \texttt{.wav} files, we can extract the pitch and energy levels from them using different libraries. 12 | For extracting the pitch level, the library \emph{aubio}\footnote{\url{http://aubio.org/}} is used. 13 | The output is a file containing two columns: The first column is the time in seconds within the talk and the second column is the pitch level at that time. 14 | The library \emph{Yaafe}\footnote{\url{yaafe.sourceforge.net/}} is used for extracting the energy levels from the \texttt{.wav} files. 15 | The output from \emph{Yaafe} contains one column with energy values. 16 | One line in the output file represents the energy level in \texttt{1 / sample rate} intervals. 17 | 18 | Together with the \texttt{.ctm} files, we can now create the training instances. 19 | The process of generating the training instances is shown in Figure~\ref{fig:overview_acoustic}. 20 | \begin{figure}[ht] 21 | \centering 22 | \includegraphics[width=0.8\textwidth]{img/overview_accoustic.pdf} 23 | \caption{Creation of the training instances for the acoustic model: The pause feature is extracted from the \texttt{.ctm} files, the pitch level feature from the \texttt{.pitch} files, and the energy level feature from the \texttt{.energy} files. All features are normalized to a mean of 0 and a variance of 1. As in the lexical model, a sliding window is used to create the final training instances.} 24 | \label{fig:overview_acoustic} 25 | \end{figure} 26 | 27 | The \texttt{.ctm} files hold the information about sentence boundaries. 28 | Unfortunately, we do not have any information about other punctuation marks besides periods in those files. 29 | Therefore, the acoustic model is only able to predict periods. 30 | 31 | Given the \texttt{.ctm} files, the first step in the training instance generation is to extract the words with their corresponding start times and durations. 32 | Additionally, the sentence boundaries are stored to obtain the gold standard. 33 | When all words of a talk have been read, the pauses before and after each word are calculated. 34 | Afterwards, the \texttt{.energy} and \texttt{.pitch} files are parsed. 35 | Each energy and pitch level is mapped to the word that was spoken at the time mentioned in the files. 36 | It can happen that multiple energy and pitch levels are mapped to one word. 37 | In that case, the average over all energy/pitch levels belonging to one word is taken as the final energy/pitch level for that word. 38 | 39 | Furthermore, we filter the pitch values. 40 | The voice frequency of a typical adult male ranges from 85 to 180 Hz; a typical adult female has a range from 165 to 255 Hz. 41 | Many values in the pitch files lie far above those ranges because of the background noise recorded in the talk. 42 | Thus, we decided to filter out all pitch levels above 300 Hz.
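Formally, if $F_w$ denotes the set of remaining frame-level measurements mapped to a word $w$, its feature value is the mean $\bar{x}_w = \frac{1}{|F_w|} \sum_{x \in F_w} x$, computed separately for the energy and the (filtered) pitch levels.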
43 | 44 | In the end, we have the following four features for our acoustic model: 45 | \begin{itemize} 46 | \item the duration of the pause before a word, 47 | \item the duration of the pause after a word, 48 | \item the average energy level of a word, and 49 | \item the average pitch level of a word. 50 | \end{itemize} 51 | In the next step, the features are normalized to a mean of zero and a variance of one. 52 | 53 | As in the lexical model, we use a sliding window to create the training instances. 54 | The \texttt{config} file holds the information about the size of the window and the position of the punctuation. 55 | Using the gold standard we obtained from the \texttt{.ctm} files, the training instances with their corresponding class (\textsc{None} or \textsc{Period}) are created. 56 | The training instances are then written to a LevelDB in a final step. 57 | 58 | \subsection{Neural Network Layout} 59 | 60 | We used the same model as for the lexical model, except for the input and the output layer (see Figure~\ref{fig:net_acoustic}). 61 | For the input layer, we have only four features per word, compared to 314 features in the lexical model. 62 | So, for example, a window size of eight leads to 32 features. 63 | In the last layer, we use only two dimensions instead of three because we have only two classes: \textsc{None} and \textsc{Period}. 64 | 65 | \begin{figure}[ht] 66 | \centering 67 | \includegraphics[width=0.6\textwidth]{img/net_acoustic.pdf} 68 | \caption{Network architecture of the acoustic model consisting of four \texttt{inner product} layers.} 69 | \label{fig:net_acoustic} 70 | \end{figure} 71 | 72 | \subsection{Results and Evaluation} 73 | 74 | As mentioned before, we can evaluate the model only on \textsc{Period}s, as we do not have ground truth data for the commas. 75 | Again, we evaluated different window sizes and punctuation positions. 76 | Figure~\ref{audio_eval} shows the F-measure for all experiments. 77 | \begin{figure}[ht] 78 | \centering 79 | \includegraphics[width=0.7\textwidth]{img/audio_parameter_eval.png} 80 | \caption{Evaluation of the acoustic model: Window size eight and punctuation position four yield the best results.} 81 | \label{audio_eval} 82 | \end{figure} 83 | Note that the y-axis has been capped to better show the differences. 84 | Interestingly, the combination of window size eight and punctuation position four is again the best combination. 85 | It leads to an F-score of 78.36\%. -------------------------------------------------------------------------------- /paper/chapters/caffe.tex: -------------------------------------------------------------------------------- 1 | % a little bit about caffe 2 | -------------------------------------------------------------------------------- /paper/chapters/data.tex: -------------------------------------------------------------------------------- 1 | We use two different data sets for training and evaluating our SBD system. 2 | The first dataset is a set of TED talks\footnote{\url{ted.com}} from 2011 to 2014. 3 | The second dataset is plain text from Wikipedia\footnote{\url{en.wikipedia.org}}. 4 | It was extracted from the English Wikipedia as of February 2014. 5 | 6 | \paragraph{TED talks} 7 | Our TED talk data set consists of 57 talks. For each talk, we have the following data files: 8 | \begin{itemize} 9 | \item \texttt{.xml} file: This file contains a manually created transcript of the talk. 10 | The text is formatted and serves as training data for the lexical model (ground truth).
11 | \item \texttt{.ctm} file: This is a time-marked transcript. 12 | It contains one word per line with the time in seconds at which the word was said in the talk and its duration. 13 | This is a typical output from an ASR system. 14 | Additionally, each sentence is labeled in the file, so that the data can be used for training the acoustic model. 15 | \item \texttt{.sph} file: This file contains raw pulse code modulation (PCM) data. 16 | This data can be converted into .wav files. 17 | \end{itemize} 18 | 19 | \paragraph{Wikipedia} 20 | We extracted the plain text from English Wikipedia articles. 21 | We selected only those articles with more than 10,000 characters, assuming that these articles have gained a lot of attention and therefore provide good textual quality. 22 | When extracting the plain text, we discarded lists, headlines, tables, etc., focusing only on paragraphs, so we can make sure that only proper sentences are used for training. 23 | In total, we received around 3.5 million new training instances from the Wikipedia articles. -------------------------------------------------------------------------------- /paper/chapters/demo.tex: -------------------------------------------------------------------------------- 1 | We use a demo application, accessible with a web browser, to present the working prototype. 2 | It can be used to find sentence boundaries in unpunctuated text. 3 | The web page shows two main tabs, one labeled \emph{Lexical} and one \emph{Lexical + Audio}. 4 | A user can click these to switch between using only the lexical model and using the fusion of the lexical and the acoustic model. 5 | 6 | There are two ways to feed input to our model for the \emph{Lexical} SBD (see Figure~\ref{fig:demo_l}). 7 | \begin{figure}[ht] 8 | \centering 9 | \includegraphics[width=0.5\textwidth]{img/demo_l.png} 10 | \caption{The demo application for the lexical model. The results are presented below the options for input and model selection.} 11 | \label{fig:demo_l} 12 | \end{figure} 13 | The user can use a text input field to manually enter or paste any text they wish. 14 | Another possibility is to choose from a set of existing text files. 15 | A dropdown selection allows the user to choose a pretrained model, if multiple models are available in the system. 16 | If the model is changed, it is automatically loaded in the background. 17 | Once the user clicks the \emph{Punctuate!} button, the text that was entered or selected as a file is passed to our lexical model. 18 | While the server processes the request, a small loading icon is shown inside the button. 19 | After the predictions are returned from the server, the result is shown beneath. 20 | The input text and positions where no punctuation was predicted are shown as tokens with a light grey background. 21 | Any inserted commas or periods are shown in distinct colors. 22 | If a model that uses POS tags is selected, a user can hover their mouse over a token to see its POS category. 23 | For further use, the entire result is selectable and can be copied. 24 | 25 | For the \emph{Lexical + Audio} SBD, the possibilities for entering input are more limited (see Figure~\ref{fig:demo_la}). 26 | \begin{figure}[ht] 27 | \centering 28 | \includegraphics[width=0.5\textwidth]{img/demo_l_a.png} 29 | \caption{The demo application for the fusion of both models. The results of the individual models and the fusion are presented below the options for input and model selection.
Only one result section is visible in the screenshot; the other sections lie outside the captured region.} 30 | \label{fig:demo_la} 31 | \end{figure} 32 | Since we need an audio recording, we offer only examples that already exist in the system. 33 | At the moment, the system contains samples that were used in the testing phase, but not for training. 34 | The selection is therefore limited to a dropdown menu of all available choices. 35 | However, the choice of both the acoustic and the lexical model is independently available to the user. 36 | These can also be selected in a dropdown menu. 37 | The functionality of the \emph{Punctuate!} button is unchanged. 38 | It triggers the processing and shows a loading indicator until the result returns. 39 | The result area, however, is changed and contains three subareas, each containing a different result. 40 | Two of them contain the raw results of the acoustic model and the lexical model. 41 | The third shows the result after the fusion. 42 | Therefore, it is easy to compare the results of each individual model and the result after the fusion. -------------------------------------------------------------------------------- /paper/chapters/evaluation.tex: -------------------------------------------------------------------------------- 1 | % evaluation 2 | 3 | Problems with the audio data: 4 | \begin{itemize} \item Not tokenized in the same way as our data 5 | \item No commas 6 | \item No capitalization, which is important for POS tagging 7 | \end{itemize} -------------------------------------------------------------------------------- /paper/chapters/fusion.tex: -------------------------------------------------------------------------------- 1 | The individual predictions from the acoustic and the lexical model need to be combined to obtain a final, overall prediction. 2 | Therefore, we fuse the predictions of the two models. 3 | We implemented two different fusion approaches: 4 | The first approach is called \emph{Threshold Fusion}, the second one is called \emph{Balance Fusion}. 5 | The two fusion approaches and their evaluation are presented in the remainder of this section. 6 | 7 | \subsection{Threshold Fusion} 8 | The main idea of the threshold fusion is the following: If the probability for the class \textsc{Period} from the acoustic model is over a certain threshold and the probability for the class \textsc{None} from the lexical model is below a certain threshold, we want to predict a period or a comma. 9 | If the condition is satisfied, the probability of the class \textsc{Period} from the acoustic model is added to the probabilities of the classes \textsc{Period} and \textsc{Comma} from the lexical model. 10 | The idea of the threshold fusion is shown in Figure~\ref{fig:fusion_1}. 11 | \begin{figure}[ht] 12 | \centering 13 | \includegraphics[width=0.7\textwidth]{img/fusion_1.pdf} 14 | \caption{Threshold Fusion: The probability of the class \textsc{Period} from the acoustic model is added to the probabilities of the classes \textsc{Period} and \textsc{Comma} from the lexical model if the probability for the class \textsc{Period} from the acoustic model is over a certain threshold and the probability for the class \textsc{None} from the lexical model is below a certain threshold.} 15 | \label{fig:fusion_1} 16 | \end{figure} 17 | If the condition does not hold, we just take the prediction probabilities of the lexical model as the final predictions. 18 | Thus, the threshold fusion trusts the lexical model more than the acoustic model.
19 | The acoustic model is only taken into account if it is quite certain that there should be a period and the lexical model is not certain enough that there should be no punctuation at all. 20 | In the end, the class with the highest probability is chosen. 21 | For the example in Figure~\ref{fig:fusion_1}, we would predict a comma. 22 | 23 | \subsection{Balance Fusion} 24 | The balance fusion sums up the weighted probabilities of both models. 25 | Figure~\ref{fig:fusion_2} shows an example. 26 | \begin{figure}[ht] 27 | \centering 28 | \includegraphics[width=0.7\textwidth]{img/fusion_2.pdf} 29 | \caption{Balance Fusion: The weighted probabilities of both models are summed up.} 30 | \label{fig:fusion_2} 31 | \end{figure} 32 | Using weights, we can regulate which model we trust more. 33 | In the example shown in Figure~\ref{fig:fusion_2}, the lexical model is more important than the acoustic model. 34 | In the end, the class with the overall highest probability is chosen again. 35 | So in the example, the predicted class would be the class \textsc{None}. 36 | 37 | \subsection{Results and Evaluation} 38 | We evaluate both fusion approaches to determine which of them leads to better results. 39 | The evaluation was done on the TED talk data set because the acoustic model needs the energy and pitch levels and therefore audio files. 40 | We used the \texttt{.ctm} files along with their corresponding \texttt{.sph} files. 41 | Thus, we have only a gold standard for the class \textsc{Period}. 42 | Consequently, the evaluation was done only for this class. 43 | 44 | To evaluate the fusion approaches, we remove all sentence boundaries from the data and pass the data on to the lexical and the acoustic model. 45 | We chose the best lexical and acoustic model from the previous sections to predict the punctuation. 46 | The predictions from the lexical and acoustic model are then fused. 47 | We tested the threshold fusion with different threshold values and the balance fusion with multiple weights. 48 | We added a baseline fusion for the lexical and the acoustic model, which simply passes on the predictions of the corresponding model. 49 | The predictions returned by the different fusion approaches are then evaluated using the gold standard. 50 | The F-score was used as the evaluation metric. 51 | Figure~\ref{fig:eval_fusion} shows the results. 52 | \begin{figure}[ht] 53 | \centering 54 | \includegraphics[width=0.7\textwidth]{img/fusion_eval.pdf} 55 | \caption{Evaluation results for different fusion approaches: The best result is obtained by the threshold fusion with an acoustic threshold of 0.5 and a lexical threshold of 0.9.} 56 | \label{fig:eval_fusion} 57 | \end{figure} 58 | 59 | Because we have only a gold standard for the class \textsc{Period}, the baseline is the F-score of the acoustic model. 60 | Six fusion approaches outperform the F-score of the baseline. 61 | The best one is the threshold fusion with an acoustic threshold of 0.5 and a lexical threshold of 0.9. 62 | It obtained an F-score of 80.43\%, whereas the baseline has an F-score of 78.49\%. 63 | Consequently, fusing the results increases the overall performance. 64 | -------------------------------------------------------------------------------- /paper/chapters/future.tex: -------------------------------------------------------------------------------- 1 | We presented an approach to automatically detect sentence boundaries and predict the correct punctuation marks in unpunctuated ASR output.
2 | Two different models were trained independently, one using lexical input and the other using acoustic input. 3 | The results of both models were merged with a late fusion. 4 | The evaluation has shown that one has to be careful with the training data, which should stem only from actually spoken text. 5 | Just adding more written text data did not improve the performance. 6 | On the other hand, part-of-speech tags as additional features consistently increase the performance of the sentence boundary detection. 7 | 8 | There are many possibilities for improving the presented approach. 9 | Since we did not explore the large variety of different neural network layouts, further exploration in this area is likely to improve the results. 10 | Especially with more training data, a deeper network architecture can provide better results. 11 | Also, using Long Short-Term Memory (LSTM) neural networks appears promising, as they can process a stream of data while keeping temporal information. 12 | This maps easily to the stream of word tokens in a text. 13 | 14 | In the fusion step, we decided on a late fusion approach, which combines only the predictions. 15 | However, another direction to explore is an earlier fusion, in which both models and the fusion itself are trained together. 16 | Instead of fusing the predictions, the actual features can be fused. 17 | As for data preparation, a different representation of the features in the lexical model can be examined, such as a second or third data channel or a combination similar to the fusion of the acoustic and the lexical model. 18 | 19 | Another improvement could be achieved with better post-processing of the results. 20 | For example, one punctuation symbol immediately following another is unlikely to be correct. -------------------------------------------------------------------------------- /paper/chapters/introduction.tex: -------------------------------------------------------------------------------- 1 | Automatic Speech Recognition (ASR) systems have many practical applications nowadays, e.g., in dictation systems for medical documentation and journalism. 2 | Another application comes from the rapidly increasing amount of videos available online on video platforms for entertainment and learning, such as YouTube\footnote{\url{youtube.com}}, Vimeo\footnote{\url{vimeo.com}}, Coursera\footnote{\url{coursera.org}} or OpenHPI\footnote{\url{open.hpi.com}}. 3 | All of these benefit from automatically generated transcripts and subtitles. 4 | However, the result of many ASR systems is an unformatted text without any punctuation marks, such as periods and commas. 5 | These texts are hard to read and understand without manually inserting the missing punctuation marks. 6 | Inserting them manually, however, is a mundane and complicated task. 7 | Therefore, an automatic solution for formatting the ASR output and inserting punctuation marks is necessary. 8 | We call this task \emph{sentence boundary detection} (SBD). 9 | 10 | SBD is a mandatory preprocessing step for many further use cases. 11 | For example, most machine translation systems are trained on properly formatted text. 12 | Having an ASR output without punctuation marks decreases the performance of machine translation systems. 13 | Also, other natural language processing tasks, such as part-of-speech tagging or tokenization, work on sentence units. 14 | Thus, the ASR output needs to be formatted before it can be further processed.
15 | 16 | In this paper, we address this problem by automatically creating punctuated text from unpunctuated text. 17 | We use neural networks to process the unformatted transcripts. 18 | The use of neural networks has recently led to large improvements in areas such as image and video classification. 19 | 20 | Our SBD system contains two models: one based on the ASR text transcript (lexical model), and one based on the raw audio data (acoustic model). 21 | We train both models independently and retrieve their separate predictions. 22 | Afterwards, the results are combined in a fusion step. 23 | The final output can replace the original output of ASR systems and improve the readability and quality of transcripts. 24 | Additionally, the punctuation marks often represent suitable boundaries for subtitles, enhancing their overall quality. 25 | 26 | The rest of the paper is structured as follows: 27 | Related work is summarized in Section~\ref{sec:related_work}. 28 | Section~\ref{sec:training_data} describes the datasets we use for training and evaluation. 29 | The data preprocessing, training, and evaluation of our lexical and our acoustic model are described in Section~\ref{sec:lexical_model} and Section~\ref{sec:acoustic_model}, respectively. 30 | Details of the fusion step are explained in Section~\ref{sec:fusion}. 31 | We show our demo application in Section~\ref{sec:demo} and conclude our work in Section~\ref{sec:future}. -------------------------------------------------------------------------------- /paper/chapters/parameters.tex: -------------------------------------------------------------------------------- 1 | % how did we find the best parameters? 2 | 3 | Evaluation of results: 4 | \begin{itemize} 5 | \item F-measure for each class is calculated. 6 | \item Harmonic mean for all F-measures is total score (higher is better). 7 | \end{itemize} 8 | 9 | \begin{figure}[ht] 10 | \centering 11 | \includegraphics[width=\textwidth]{img/parameter_eval.png} 12 | \caption{Harmonic mean between all F1 scores for all classes. \emph{2/5} means a window size of five and punctuation is tested at position two. If \emph{wi} is in the label, it uses Wikipedia training data.} 13 | \label{fig2} 14 | \end{figure} 15 | 16 | Comparison between experiments with and without POS tagging (other than that, they have the same configurations): 17 | \begin{itemize} 18 | \item With POS tagging: 0.305 19 | \item Without POS tagging: 0.275 20 | \end{itemize} 21 | 22 | Comparison between experiments with and without Wikipedia data (other than that, they have the same configurations): 23 | \begin{itemize} 24 | \item Without Wikipedia: 0.385 25 | \item With Wikipedia data: 0.252 26 | \end{itemize} 27 | -------------------------------------------------------------------------------- /paper/chapters/related_work.tex: -------------------------------------------------------------------------------- 1 | As punctuation prediction is a mandatory preprocessing step for further work with automatic speech recognition output, a lot of research has been done in this field. 2 | Some approaches focus only on the lexical part~\cite{Gravano2009, Lu2010, Ueffing2013, Cho2012, Zhang2013}. 3 | Gravano et al.~\cite{Gravano2009} used a text-based n-gram language model to detect punctuation (comma, period, question mark). 4 | Dynamic conditional random fields are used by Lu and Ng~\cite{Lu2010} and Ueffing et al.~\cite{Ueffing2013}. 5 | Ueffing et al.
evaluate their method with different features, such as language model scores, parse trees, dynamic sentence length, and token n-grams. 6 | The usefulness of the individual features highly depends on the nature of the processed text. 7 | For example, if the text is well structured, the parse tree features improve the result. 8 | Similar to our approach, Cho et al.~\cite{Cho2012} use a sliding window over the input data to predict different punctuation marks. 9 | Zhang et al.~\cite{Zhang2013} predict the punctuation of an input stream. 10 | For each processed word in the input stream, syntactic features are used to predict the punctuation symbol after that word. 11 | The features used include, e.g., part-of-speech tags, tree-based features (the parse tree is built step by step), and bag-of-words features. 12 | 13 | Most researchers combine prosodic features, such as pitch, pauses, and duration, with lexical features, such as words, n-grams, and part-of-speech tags~\cite{Mark1999, Christensen2001, Liu2005, Matusov2007, Wang2012}. 14 | Chen~\cite{Mark1999} predicts punctuation on the basis of prosodic features in a first step using a Hidden Markov Model. 15 | In a second step, a language model is used to adapt the punctuation predicted in the first step. 16 | Christensen et al.~\cite{Christensen2001} focus on multi-layer perceptron methods to combine prosodic and lexical features, whereas Liu et al.~\cite{Liu2005} use conditional random fields. 17 | Matusov et al.~\cite{Matusov2007} optimize their approach to the needs of machine translation. 18 | They combine a language model and prosodic features in a log-linear model and add a phrase coverage feature, which is motivated by phrase-based machine translation systems. 19 | A comparison of different machine learning models for combining prosodic and lexical features for punctuation prediction was done by Wang et al.~\cite{Wang2012}. 20 | The dynamic conditional random fields achieve the best result on a broadcast news corpus (F1-measure of 42.8\%). 21 | 22 | In this paper, we present a new approach: predicting punctuation using deep learning. 23 | To the best of our knowledge, such an approach has not been tried before. 24 | We learn two individual models, one based on lexical features and the other one based on prosodic features. 25 | In the end, the predictions of both models are fused.
-------------------------------------------------------------------------------- /paper/img/audio_parameter_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/audio_parameter_eval.png -------------------------------------------------------------------------------- /paper/img/demo_l.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/demo_l.png -------------------------------------------------------------------------------- /paper/img/demo_l_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/demo_l_a.png -------------------------------------------------------------------------------- /paper/img/fusion_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/fusion_1.pdf -------------------------------------------------------------------------------- /paper/img/fusion_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/fusion_2.pdf -------------------------------------------------------------------------------- /paper/img/fusion_eval.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/fusion_eval.pdf -------------------------------------------------------------------------------- /paper/img/fusion_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/fusion_eval.png -------------------------------------------------------------------------------- /paper/img/hpi_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/hpi_logo.png -------------------------------------------------------------------------------- /paper/img/net_acoustic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/net_acoustic.pdf -------------------------------------------------------------------------------- /paper/img/net_lexical.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/net_lexical.pdf -------------------------------------------------------------------------------- /paper/img/overview_accoustic.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/overview_accoustic.pdf -------------------------------------------------------------------------------- /paper/img/overview_lexical.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/overview_lexical.pdf -------------------------------------------------------------------------------- /paper/img/sliding_window.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/sliding_window.pdf -------------------------------------------------------------------------------- /paper/img/window_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/window_eval.png -------------------------------------------------------------------------------- /paper/img/window_pos_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/window_pos_eval.png -------------------------------------------------------------------------------- /paper/img/window_wiki_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/window_wiki_eval.png -------------------------------------------------------------------------------- /paper/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper,12pt,pagesize,headsepline,bibliography=totoc,titlepage]{scrartcl} 2 | \usepackage[utf8]{inputenc} 3 | % \usepackage[T1]{fontenc} 4 | \usepackage{mathptmx} 5 | \usepackage[scaled=.90]{helvet} 6 | \usepackage{courier} 7 | \usepackage{amsmath,amsthm,amsfonts,graphicx,caption} 8 | \usepackage{hyperref} 9 | \usepackage{ae,aecompl} 10 | \usepackage{todonotes} 11 | \usepackage{subcaption} 12 | \usepackage{listings} 13 | 14 | \lstset { 15 | backgroundcolor=\color{white}, 16 | breakatwhitespace=false, 17 | breaklines=true, 18 | numbers=left, 19 | frame=single, 20 | title=\lstname, 21 | basicstyle=\footnotesize 22 | } 23 | 24 | % \pagestyle{headings} 25 | \headsep4mm % Abstand der Kopfzeile vom Text 26 | % \typearea[current]{current} 27 | 28 | \title{ 29 | \includegraphics*[width=0.4\textwidth]{img/hpi_logo.png}\\ 30 | \vspace{24pt} 31 | Sentence Boundary Detection 32 | } 33 | \subtitle{ 34 | Seminar\\ 35 | Practical Applications of Multimedia Retrieval\\ 36 | Fall Semester 2015/2016 37 | } 38 | \author{ 39 | Tanja Bergmann, Joseph Bethge, Stefan Bunk, Ricarda Schüler\\[12pt] 40 | Supervisor:\\ 41 | Xiaoyin Che\\ 42 | Dr. Haojin Yang\\ 43 | Prof. Dr. 
Christoph Meinel 44 | } 45 | \date{\today} 46 | 47 | \begin{document} 48 | \maketitle 49 | \tableofcontents 50 | \newpage 51 | 52 | \section{Introduction} 53 | \label{sec:introduction} 54 | \input{chapters/introduction} 55 | 56 | \section{Related Work} 57 | \label{sec:related_work} 58 | \input{chapters/related_work} 59 | 60 | \section{Training Data} 61 | \label{sec:training_data} 62 | \input{chapters/data} 63 | 64 | \section{Lexical Model} 65 | \label{sec:lexical_model} 66 | \input{chapters/lexical_model} 67 | 68 | \section{Acoustic Model} 69 | \label{sec:acoustic_model} 70 | \input{chapters/acoustic_model} 71 | 72 | \section{Fusion} 73 | \label{sec:fusion} 74 | \input{chapters/fusion} 75 | 76 | \section{Demo Tool} 77 | \label{sec:demo} 78 | \input{chapters/demo} 79 | 80 | \section{Conclusion and Future Work} 81 | \label{sec:future} 82 | \input{chapters/future} 83 | 84 | \bibliographystyle{plain} 85 | \bibliography{main} 86 | 87 | %\newpage 88 | %\appendix 89 | %\section{Appendix} 90 | %We appended the following files for reference: 91 | %\begin{itemize} 92 | % \item lexical-solver.prototxt, the configuration of the solver (lexical model) 93 | % \item lexical-net.prototxt, our net configuration (lexical model) 94 | % \item acoustic-solver.prototxt, the configuration of the solver (acoustic model) 95 | % \item acoustic-net.prototxt, our net configuration (acoustic model) 96 | %\end{itemize} 97 | % 98 | %\subsection{lexical-solver.prototxt} 99 | %\lstinputlisting[caption={lexical-solver.prototxt}, label={lst:lexical-solver.prototxt}]{../net/solver.prototxt} 100 | %\newpage 101 | % 102 | %\subsection{lexical-net.prototxt} 103 | %\lstinputlisting[caption={lexical-net.prototxt}, label={lst:lexical-net.prototxt}]{../net/net.prototxt} 104 | %\newpage 105 | % 106 | %\subsection{acoustic-solver.prototxt} 107 | %\lstinputlisting[caption={acoustic-solver.prototxt}, label={lst:acoustic-solver.prototxt}]{../net-audio/solver.prototxt} 108 | %\newpage 109 | % 110 | %\subsection{acoustic-net.prototxt} 111 | %\lstinputlisting[caption={acoustic-net.prototxt}, label={lst:acoustic-net.prototxt}]{../net-audio/net.prototxt} 112 | 113 | %\newpage %for more appended files 114 | 115 | \end{document} 116 | -------------------------------------------------------------------------------- /paper/notes/Makefile: -------------------------------------------------------------------------------- 1 | PDFVIEWER=evince 2 | FILE=results 3 | 4 | show: build 5 | $(PDFVIEWER) $(FILE).pdf & 6 | 7 | plot: 8 | cd plots && build 9 | 10 | when-changed: 11 | @clear && when-changed $(FILE).tex -c 'printf "\033c" && echo "Building" && make -s build && echo "Succeeded"' 12 | 13 | convert: 14 | cd plots && make convert 15 | 16 | build: 17 | @mkdir -p .output 18 | @pdflatex -interaction=nonstopmode -halt-on-error -output-directory .output -jobname=$(FILE) $(FILE).tex 1>&2 > .output/error 19 | @mv .output/$(FILE).pdf . 20 | 21 | error: 22 | @vim + .output/error 23 | 24 | 25 | bibtex: 26 | @cp $(FILE).bib .output/ && cd .output && bibtex $(FILE) && cd .. 
27 | 28 | clean: 29 | rm .output/*.aux .output/*.log 30 | -------------------------------------------------------------------------------- /paper/notes/auswertung.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/notes/auswertung.ods -------------------------------------------------------------------------------- /paper/notes/auswertung2.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/notes/auswertung2.ods -------------------------------------------------------------------------------- /paper/notes/auswertung_fusion.txt: -------------------------------------------------------------------------------- 1 | fusion,precision[NONE],precision[PERIOD],recall[NONE],recall[PERIOD],f1[NONE],f1[PERIOD],support[NONE],support[PERIOD] 2 | ('BaselineLexicalFusion',) 3 | 0.977,0.645,0.984,0.554,0.981,0.596,11611.000,599.000 4 | ('BaselineAudioFusion',) 5 | 0.981,0.604,0.968,0.722,0.974,0.658,12169.000,832.000 6 | ('ThresholdFusion[AudioThresh: 0.50, LexicalThresh: 0.80]',) 7 | 0.981,0.672,0.979,0.701,0.980,0.686,11647.000,722.000 8 | ('ThresholdFusion[AudioThresh: 0.50, LexicalThresh: 0.90]',) 9 | 0.983,0.678,0.978,0.733,0.981,0.704,11647.000,722.000 10 | ('ThresholdFusion[AudioThresh: 0.60, LexicalThresh: 0.80]',) 11 | 0.981,0.672,0.979,0.695,0.980,0.683,11646.000,721.000 12 | ('ThresholdFusion[AudioThresh: 0.60, LexicalThresh: 0.90]',) 13 | 0.983,0.678,0.979,0.727,0.981,0.701,11646.000,721.000 14 | ('ThresholdFusion[AudioThresh: 0.70, LexicalThresh: 0.80]',) 15 | 0.981,0.676,0.980,0.689,0.980,0.682,11644.000,717.000 16 | ('ThresholdFusion[AudioThresh: 0.70, LexicalThresh: 0.90]',) 17 | 0.983,0.682,0.979,0.721,0.981,0.701,11644.000,717.000 18 | ('BalanceFusion[BalanceValue: 0.10]',) 19 | 0.981,0.788,0.993,0.593,0.987,0.677,11893.000,553.000 20 | ('BalanceFusion[BalanceValue: 0.20]',) 21 | 0.982,0.792,0.993,0.599,0.987,0.682,11916.000,553.000 22 | ('BalanceFusion[BalanceValue: 0.30]',) 23 | 0.982,0.807,0.993,0.608,0.988,0.694,11944.000,551.000 24 | ('BalanceFusion[BalanceValue: 0.40]',) 25 | 0.982,0.820,0.994,0.611,0.988,0.701,11973.000,553.000 26 | ('BalanceFusion[BalanceValue: 0.50]',) 27 | 0.980,0.803,0.993,0.591,0.987,0.681,11916.000,580.000 28 | ('BalanceFusion[BalanceValue: 0.60]',) 29 | 0.979,0.717,0.989,0.569,0.984,0.634,11773.000,587.000 30 | ('BalanceFusion[BalanceValue: 0.70]',) 31 | 0.978,0.675,0.986,0.568,0.982,0.617,11698.000,595.000 32 | ('BalanceFusion[BalanceValue: 0.80]',) 33 | 0.978,0.669,0.986,0.568,0.982,0.614,11664.000,597.000 34 | ('BalanceFusion[BalanceValue: 0.90]',) 35 | 0.978,0.653,0.985,0.564,0.981,0.605,11635.000,598.000 36 | 37 | -------------------------------------------------------------------------------- /paper/notes/auswertung_onlypos.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/notes/auswertung_onlypos.ods -------------------------------------------------------------------------------- /paper/notes/auswertung_wiki.ods: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/notes/auswertung_wiki.ods -------------------------------------------------------------------------------- /paper/notes/fusion_eval.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/notes/fusion_eval.ods -------------------------------------------------------------------------------- /paper/notes/fusion_eval.txt: -------------------------------------------------------------------------------- 1 | fusion precision[NONE] precision[PERIOD] recall[NONE] recall[PERIOD] f1[NONE] f1[PERIOD] support[NONE] support[PERIOD] 2 | BaselineLexicalFusion 0.971 0.635 0.986 0.460 0.979 0.534 11225.000 602.000 3 | BaselineAudioFusion 0.980 0.606 0.967 0.722 0.973 0.659 11797.000 832.000 4 | ThresholdFusion[AudioThresh: 0.50, LexicalThresh: 0.80] 0.976 0.682 0.981 0.629 0.979 0.655 11254.000 731.000 5 | ThresholdFusion[AudioThresh: 0.50, LexicalThresh: 0.90] 0.979 0.693 0.981 0.672 0.980 0.682 11254.000 731.000 6 | ThresholdFusion[AudioThresh: 0.60, LexicalThresh: 0.80] 0.976 0.681 0.981 0.625 0.978 0.652 11254.000 728.000 7 | ThresholdFusion[AudioThresh: 0.60, LexicalThresh: 0.90] 0.979 0.691 0.981 0.668 0.980 0.679 11254.000 728.000 8 | ThresholdFusion[AudioThresh: 0.70, LexicalThresh: 0.80] 0.976 0.684 0.982 0.620 0.979 0.650 11252.000 726.000 9 | ThresholdFusion[AudioThresh: 0.70, LexicalThresh: 0.90] 0.978 0.694 0.981 0.663 0.980 0.678 11252.000 726.000 10 | BalanceFusion[BalanceValue: 0.10] 0.980 0.784 0.993 0.557 0.987 0.652 11521.000 522.000 11 | BalanceFusion[BalanceValue: 0.20] 0.981 0.796 0.993 0.570 0.987 0.664 11538.000 519.000 12 | BalanceFusion[BalanceValue: 0.30] 0.981 0.813 0.994 0.569 0.988 0.670 11568.000 518.000 13 | BalanceFusion[BalanceValue: 0.40] 0.981 0.826 0.995 0.569 0.988 0.674 11599.000 527.000 14 | BalanceFusion[BalanceValue: 0.50] 0.977 0.807 0.994 0.523 0.985 0.635 11551.000 566.000 15 | BalanceFusion[BalanceValue: 0.60] 0.974 0.727 0.991 0.480 0.982 0.578 11394.000 581.000 16 | BalanceFusion[BalanceValue: 0.70] 0.973 0.671 0.988 0.468 0.980 0.552 11314.000 594.000 17 | BalanceFusion[BalanceValue: 0.80] 0.972 0.660 0.987 0.465 0.980 0.546 11270.000 598.000 18 | BalanceFusion[BalanceValue: 0.90] 0.972 0.647 0.986 0.463 0.979 0.539 11247.000 601.000 19 | -------------------------------------------------------------------------------- /paper/notes/results.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{graphicx} 3 | 4 | \begin{document} 5 | 6 | Evaluation of results: 7 | \begin{itemize} 8 | \item F-measure for each class is calculated. 9 | \item Harmonic mean for all F-measures is total score (higher is better). 10 | \end{itemize} 11 | 12 | \begin{figure}[ht] 13 | \centering 14 | \includegraphics[width=\textwidth]{diagram.png} 15 | \caption{Harmonic mean between all f1 scores for all classes. \emph{2/5} means window size of five and punctuation is tested at position two. 
If \emph{wi} is in the label, it uses Wikipedia training data.} 16 | \label{fig2} 17 | \end{figure} 18 | 19 | Comparison between experiments with and without POS tagging (other than that, they have the same configurations): 20 | \begin{itemize} 21 | \item With POS tagging: 0.305 22 | \item Without POS tagging: 0.275 23 | \end{itemize} 24 | 25 | Comparison between experiments with and without Wikipedia data (other than that, they have the same configurations): 26 | \begin{itemize} 27 | \item Without Wikipedia: 0.385 28 | \item With Wikipedia data: 0.252 29 | \end{itemize} 30 | 31 | \end{document} 32 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | Before executing any scripts on the server, please execute `. ./use_python p2` in `/home/ms2015t3/sentence-boundary-detection-nn`: 2 | 3 | ``` 4 | cd /home/ms2015t3/sentence-boundary-detection-nn 5 | . ./use_python p2 6 | ``` 7 | 8 | Also, make sure that the directory `/home/ms2015t3/sentence-boundary-detection-nn/python` is added to the `PYTHONPATH` environment variable: 9 | 10 | ``` 11 | export PYTHONPATH="${PYTHONPATH}:/home/ms2015t3/sentence-boundary-detection-nn/python" 12 | ``` 13 | 14 | Also, you have to set the environment variable `SENTENCE_HOME`, because many scripts rely on it: 15 | 16 | ``` 17 | export SENTENCE_HOME="/home/ms2015t3/sentence-boundary-detection-nn" 18 | ``` 19 | 20 | To execute all python scripts in this folder, please use **this folder as the working directory**. 21 | 22 | ## Creating LevelDB for lexical model 23 | 24 | To build a LevelDB for the lexical model, please execute: 25 | ``` 26 | python sbd_leveldb/training_instance_generator.py config.ini 27 | ``` 28 | The `config.ini` file contains all parameters that are needed during the creation of the training instances. 29 | It also contains the training and test files that should be used. 30 | The data root directory is set to `/mnt/naruto/sentence/data`. 31 | All training and test files should be located in this folder. 32 | The `config.ini.default` file contains an example of a valid `config.ini` file. 33 | 34 | The created LevelDB can be found under `/mnt/naruto/sentence/leveldbs`. 35 | 36 | ## Creating LevelDB for acoustic model 37 | 38 | To build a LevelDB for the acoustic model, please execute: 39 | ``` 40 | python sbd_leveldb/audio_training_instance_generator.py config.ini 41 | ``` 42 | The `config.ini` file contains all parameters that are needed during the creation of the training instances. 43 | The parameter `lexical` needs to be set to `false`. 44 | 45 | It also contains the training and test files that should be used. 46 | The data root directory is set to `/mnt/naruto/sentence/data`. 47 | All training and test files should be located in this folder. 48 | The corresponding `.pitch` and `.energy` files should be in the same folder as the `.ctm` files. 49 | Also, the `.pitch` and `.energy` files should be named `<group_name>_talkid<talk_id>.[pitch|energy]`. 50 | The `<group_name>` and `<talk_id>` values are extracted from the `.ctm` files. 51 | To create the `.pitch` and `.energy` files, you can use the `pitch_and_energy.sh` script under `/mnt/naruto/sentence/data/audio`. 52 | 53 | The created LevelDB can be found under `/mnt/naruto/sentence/leveldbs`.
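For example, for a hypothetical `talks.ctm` file containing a talk with group name `ted` and talk id `1` (these names are only illustrative; the actual values come from the talks inside your `.ctm` file), the expected layout next to the `.ctm` file would be:

```
data/
    talks.ctm
    ted_talkid1.pitch
    ted_talkid1.energy
```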
54 | -------------------------------------------------------------------------------- /python/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/python/common/__init__.py -------------------------------------------------------------------------------- /python/common/argparse_util.py: -------------------------------------------------------------------------------- 1 | def is_valid_file(parser, arg, mode='r'): 2 | try: 3 | f = open(arg, mode) 4 | f.close() 5 | return arg 6 | except IOError: 7 | parser.error('The file %s can not be opened!' % arg) 8 | -------------------------------------------------------------------------------- /python/common/send_email.py: -------------------------------------------------------------------------------- 1 | import smtplib, ConfigParser, argparse 2 | 3 | config_path = "email.ini" 4 | 5 | config = ConfigParser.ConfigParser() 6 | print("Reading email config: %s" % config_path) 7 | config.read(config_path) 8 | 9 | from_address = config.get('credentials','username') 10 | to_address_list = config.get('adresses','to').split(",") 11 | username = config.get('credentials','username') 12 | password = config.get('credentials','password') 13 | 14 | class EmailNotification(object): 15 | def __init__(self, subject, message): 16 | self.subject = subject 17 | self.message = message 18 | 19 | def __format_message(self): 20 | message_with_header = "\r\n".join([ 21 | "From: %s" % from_address, 22 | "To: %s" % ",".join(to_address_list), 23 | "Subject: [PAMuR] %s" % self.subject, 24 | "", 25 | "%s" % self.message 26 | ]) 27 | return message_with_header 28 | 29 | def send(self): 30 | server = smtplib.SMTP('smtp.gmail.com:587') 31 | server.ehlo() 32 | server.starttls() 33 | server.login(username, password) 34 | msg = self.__format_message() 35 | server.sendmail(from_address, to_address_list, msg) 36 | server.quit() 37 | 38 | def attach_files(self, files): 39 | for filename in files: 40 | self.message += "\r\n\r\n========== %s ==========\r\n" % filename 41 | with open(filename, "r") as file_: 42 | for line in file_: 43 | self.message += "> %s" % line 44 | self.message += "\r\n\r\n========== %s ==========\r\n" % filename 45 | 46 | def main(args): 47 | e = EmailNotification(args.subject, args.message) 48 | e.attach_files(args.files) 49 | # print e.message 50 | e.send() 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='Send a notification email') 54 | parser.add_argument('subject', help='subject of the email') 55 | parser.add_argument('message', help='message of the email') 56 | parser.add_argument('files', help='files which are appended after the text', nargs='*') 57 | args = parser.parse_args() 58 | main(args) 59 | -------------------------------------------------------------------------------- /python/config.ini.default: -------------------------------------------------------------------------------- 1 | [data] 2 | normalize_class_distribution = false 3 | train_files = ted/2010-1.xml,ted/2010-2.xml,ted/2012.xml,ted/2013.xml 4 | test_files = ted/2011.xml 5 | 6 | [word_vector] 7 | # if set to 'avg' the average word vector is taken, 8 | # otherwise the given word is taken as key error vector. 
9 | key_error_vector = this 10 | vector_file = google 11 | 12 | [windowing] 13 | window_size = 5 14 | punctuation_position = 3 15 | 16 | [features] 17 | use_question_mark = true 18 | pos_tagging = false 19 | number_replacement = true 20 | 21 | [model] 22 | lexical = true 23 | -------------------------------------------------------------------------------- /python/console_demo/README.md: -------------------------------------------------------------------------------- 1 | ## Demo 2 | 3 | Please check the [main readme file](../README.md) for the proper demo execution. -------------------------------------------------------------------------------- /python/console_demo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/python/console_demo/__init__.py -------------------------------------------------------------------------------- /python/console_demo/demo.py: -------------------------------------------------------------------------------- 1 | import argparse, numpy, caffe 2 | 3 | from preprocessing.nlp_pipeline import NlpPipeline 4 | from preprocessing.sliding_window import SlidingWindow, PUNCTUATION_POS 5 | from preprocessing.text import Sentence 6 | from preprocessing.word2vec_file import Word2VecFile 7 | 8 | classes = ["NONE", "COMMA", "PERIOD", "QUESTION"] 9 | classes_as_string = ["", ",", ".", "?"] 10 | 11 | class InputText(object): 12 | 13 | def __init__(self, text): 14 | self.text = text 15 | 16 | self.nlp_pipeline = NlpPipeline() 17 | self.gold_tokens = self.nlp_pipeline.parse_text(self.text) 18 | 19 | def get_gold_tokens(self): 20 | return self.gold_tokens 21 | 22 | 23 | class Demo(object): 24 | """parses demo data, feeds to a trained model and returns predictions""" 25 | 26 | def __init__(self, net, word2vec): 27 | self.word2vec = word2vec 28 | self.net = net 29 | 30 | def get_not_covered_words(self): 31 | return self.word2vec.not_covered_words 32 | 33 | def predict_text(self, text): 34 | input_text = InputText(text) 35 | 36 | for token in input_text.gold_tokens: 37 | if not token.is_punctuation(): 38 | token.word_vec = self.word2vec.get_vector(token.word.lower()) 39 | 40 | slidingWindow = SlidingWindow() 41 | instances = slidingWindow.list_windows(input_text) 42 | 43 | punctuations = [] 44 | for instance in instances: 45 | probs = self.predict_caffe(instance) 46 | #print instance 47 | #self.show_probs(probs) 48 | punctuations.append(numpy.argmax(probs)) 49 | #print punctuations 50 | 51 | print(">>> Sentence with boundaries:") 52 | for i in range(len(punctuations) - 1, -1, -1): 53 | input_text.gold_tokens.insert(i + PUNCTUATION_POS, classes_as_string[punctuations[i]]) 54 | print "{", 55 | for t in input_text.gold_tokens: 56 | print t, 57 | print "}" 58 | 59 | def predict_caffe(self, instance): 60 | transformer = caffe.io.Transformer({'data': self.net.blobs['data'].data.shape}) 61 | 62 | batchsize = 1 63 | self.net.blobs['data'].reshape(batchsize,1,5,300) 64 | reshaped_array = numpy.expand_dims(instance.get_array(), axis=0) 65 | 66 | self.net.blobs['data'].data[...] 
= reshaped_array 67 | 68 | out = self.net.forward() 69 | return out['softmax'] 70 | 71 | def show_probs(self, probs): 72 | for i in range(0, len(classes)): 73 | print classes[i], ":", probs[0][i] 74 | 75 | 76 | def main_no_loading(net, vector, datafile=None, show=False): 77 | if show: 78 | classes_as_string[0] = "_" 79 | caffe.set_mode_cpu() 80 | d = Demo(net, vector) 81 | if datafile: 82 | f = open(datafile) 83 | text = f.read() 84 | f.close() 85 | d.predict_text(text) 86 | else: 87 | while (1): 88 | text = raw_input("Please enter some text without punctuation for prediction (enter q to quit):") 89 | if text == "q": 90 | return 91 | d.predict_text(text) 92 | 93 | def main(vectorfile, caffeproto, caffemodel, datafile=None, show=False): 94 | vector = Word2VecFile(vectorfile) 95 | net = caffe.Net(caffeproto, caffemodel, caffe.TEST) 96 | main_no_loading(net, vector, datafile, show) 97 | 98 | if __name__ == '__main__': 99 | parser = argparse.ArgumentParser(description='Predict sentence boundaries for a text using a trained caffe model.') 100 | parser.add_argument('-d','--datafile', help='path to file with text, text can be entered interactively if omitted', dest='datafile') 101 | parser.add_argument('vectorfile', help='path to word vector binary') 102 | parser.add_argument('caffeproto', help='path to caffe proto file') 103 | parser.add_argument('caffemodel', help='path to caffe model file') 104 | parser.add_argument('-s','--show', help='show the non-existing punctuation with an underscore', action='store_true', dest='show') 105 | args = parser.parse_args() 106 | main(show=args.show, vectorfile=args.vectorfile, caffeproto=args.caffeproto, caffemodel=args.caffemodel, datafile=args.datafile) 107 | -------------------------------------------------------------------------------- /python/console_demo/demo_preparation.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | 3 | import console_demo.demo as d 4 | from preprocessing.word2vec_file import Word2VecFile 5 | 6 | vector = Word2VecFile('models/GoogleNews-vectors-negative300.bin') 7 | net = caffe.Net('models/deploy.prototxt', 'models/model.caffemodel', caffe.TEST) 8 | 9 | -------------------------------------------------------------------------------- /python/demo_data/.gitignore: -------------------------------------------------------------------------------- 1 | *.* 2 | !*.gitignore 3 | !download_all.sh 4 | !download_models.sh 5 | !download_google_vector.sh 6 | !folders.txt 7 | !models.txt 8 | -------------------------------------------------------------------------------- /python/demo_data/audio_examples/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /python/demo_data/audio_models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /python/demo_data/download_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | user="ms2015t3" 4 | host="172.16.23.193" 5 | path="/home/ms2015t3/demo_data" 6 | 7 | while IFS='' read -r folder || [[ -n "$folder" ]]; do 8 | echo "downloading folder: $folder..." 9 | mkdir "$folder" -p 10 | sftp -r "$user@$host:$path/$folder" .
11 | echo -e "*\n!.gitignore" > "$folder/.gitignore" 12 | done < "folders.txt" 13 | -------------------------------------------------------------------------------- /python/demo_data/download_google_vector.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sftp ms2015t3@172.16.23.193:/home/ms2015t3/ms-2015-t3/GoogleNews-vectors-negative300.bin . 3 | -------------------------------------------------------------------------------- /python/demo_data/download_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | user="ms2015t3" 4 | host="172.16.23.193" 5 | path="/home/ms2015t3/sentence-boundary-detection-nn/net/experiments" 6 | 7 | while IFS='' read -r model || [[ -n "$model" ]]; do 8 | echo "downloading model: $model..." 9 | mkdir "lexical_models/$model" -p 10 | sftp -r "$user@$host:$path/$model/net.prototxt" "lexical_models/$model/" 11 | sftp -r "$user@$host:$path/$model/*.ini" "lexical_models/$model/" 12 | sftp -r "$user@$host:$path/$model/*.caffemodel" "lexical_models/$model/" 13 | done < "lexical_models.txt" 14 | -------------------------------------------------------------------------------- /python/demo_data/folders.txt: -------------------------------------------------------------------------------- 1 | audio_examples 2 | audio_models 3 | lexical_models 4 | text_data 5 | -------------------------------------------------------------------------------- /python/demo_data/lexical_models.txt: -------------------------------------------------------------------------------- 1 | 20160108-025006_google_ted_wiki_window-5-4_pos-false_qm-false_balanced-false_nr-rep-true_word-this 2 | 20160108-032648_google_ted_wiki_window-5-4_pos-true_qm-false_balanced-false_nr-rep-true_word-this 3 | 20160108-072841_google_ted_wiki_window-8-4_pos-false_qm-false_balanced-false_nr-rep-true_word-this 4 | 20160108-081712_google_ted_wiki_window-8-4_pos-true_qm-false_balanced-false_nr-rep-true_word-this 5 | -------------------------------------------------------------------------------- /python/demo_data/lexical_models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /python/demo_data/text_data/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | *.result 4 | -------------------------------------------------------------------------------- /python/email.ini.default: -------------------------------------------------------------------------------- 1 | [credentials] 2 | username=username@gmail.com 3 | password=password 4 | 5 | [adresses] 6 | to=other1@gmail.com,other2@gmail.com 7 | -------------------------------------------------------------------------------- /python/evaluation/evaluation.py: -------------------------------------------------------------------------------- 1 | import common.sbd_config as sbd 2 | import argparse, os 3 | from parsing.audio_parser import AudioParser 4 | from sbd_classification.util import * 5 | from sbd_classification.classification_input import InputText, InputAudio 6 | from sbd_classification.fusion import get_evaluation_fusion_list 7 | from preprocessing.word2vec_file import Word2VecFile 8 | from preprocessing.tokens import Punctuation 9 | 10 | from sklearn.metrics import precision_recall_fscore_support 11 | 12 | class Evaluation(object): 13 | 14 | def 
__init__(self, talks): 15 | self.talks = talks 16 | self.tokens = [token for talk in self.talks for token in talk.get_tokens()] 17 | 18 | def evaluate(self, lexical_model_folder, audio_model_folder, vector): 19 | print("Evaluating %s and %s ..." % (lexical_model_folder, audio_model_folder)) 20 | 21 | lexical_classifier = load_lexical_classifier(lexical_model_folder, vector) 22 | audio_classifier = load_audio_classifier(audio_model_folder) 23 | 24 | # get audio probabilities 25 | self._load_config(audio_model_folder) 26 | input_audio = InputAudio(self.talks) 27 | audio_probs = audio_classifier.predict(input_audio) 28 | 29 | # get lexical probabilities 30 | self._load_config(lexical_model_folder) 31 | input_text = InputText(self.talks) 32 | lexical_probs = lexical_classifier.predict(input_text) 33 | 34 | # get config parameter 35 | (lexical_window_size, lexical_punctuation_pos, pos_tagging) = lexical_classifier.get_lexical_parameter() 36 | (audio_window_size, audio_punctuation_pos) = audio_classifier.get_audio_parameter() 37 | 38 | fusions = get_evaluation_fusion_list(lexical_punctuation_pos, lexical_window_size, audio_punctuation_pos, audio_window_size) 39 | 40 | assert(len(input_audio.tokens) == len(input_text.tokens)) 41 | print("fusion,precision[NONE],precision[PERIOD],recall[NONE],recall[PERIOD],f1[NONE],f1[PERIOD],support[NONE],support[PERIOD]") 42 | for fusion in fusions: 43 | print(str(fusion),) 44 | fusion_probs = fusion.fuse(len(input_text.tokens), lexical_probs, audio_probs) 45 | 46 | exp_actual = self.get_expected_actual(fusion_probs, self.tokens) 47 | self.calculate_evaluation_metrics(exp_actual) 48 | 49 | def get_expected_actual(self, fusion_probs, tokens): 50 | expected_actual = [] 51 | word_tokens = [token for token in tokens if not token.is_punctuation()] 52 | 53 | assert(len(word_tokens) == len(fusion_probs)) 54 | tokens_idx = 1 55 | for i in range(len(fusion_probs)): 56 | actual = fusion_probs[i].index(max(fusion_probs[i])) 57 | is_punctuation = tokens[tokens_idx].is_punctuation() 58 | expected = tokens[tokens_idx].punctuation_type.value if is_punctuation else 0 59 | if is_punctuation: 60 | tokens_idx += 1 61 | tokens_idx += 1 62 | if actual == Punctuation.COMMA.value: 63 | continue 64 | expected_actual.append((expected, actual)) 65 | 66 | return expected_actual 67 | 68 | def calculate_evaluation_metrics(self, expected_actual): 69 | expected = map(lambda x: x[0], expected_actual) 70 | actual = map(lambda x: x[1], expected_actual) 71 | results = precision_recall_fscore_support(expected, actual) 72 | print("%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f" % ( 73 | results[0][0], 74 | results[0][1], 75 | results[1][0], 76 | results[1][1], 77 | results[2][0], 78 | results[2][1], 79 | results[3][0], 80 | results[3][1] 81 | ) 82 | ) 83 | 84 | def _load_config(self, model_folder): 85 | config_file, caffemodel_file, net_proto = get_filenames(model_folder) 86 | sbd.SbdConfig(config_file) 87 | 88 | 89 | if __name__ == '__main__': 90 | parser = argparse.ArgumentParser(description='evaluates the fusion.') 91 | parser.add_argument('ctm_file', help="path to ctm_file", default="evaluation_data/data/tst2011_0.ctm", nargs='?') 92 | parser.add_argument('vectorfile', help='the google news word vector', default='evaluation_data/GoogleNews-vectors-negative300.bin', nargs='?') 93 | parser.add_argument('lexical_model_folder', help="path to lexical models", default="evaluation_data/lexical_models", nargs='?') 94 | parser.add_argument('audio_model_folder', help="path to audio models", 
default="evaluation_data/audio_models", nargs='?') 95 | parser.add_argument('--release', help="whether to test in release mode", action='store_true') 96 | args = parser.parse_args() 97 | 98 | if args.release: 99 | vector = Word2VecFile(args.vectorfile) 100 | else: 101 | vector = None 102 | 103 | # get all talks 104 | print("Reading all talks ...") 105 | audio_parser = AudioParser() 106 | talks = audio_parser.parse(args.ctm_file) 107 | 108 | 109 | # get all lexical models 110 | lexical_models = [] 111 | for dirname, dirnames, filenames in os.walk(args.lexical_model_folder): 112 | for subdirname in dirnames: 113 | lexical_models.append(os.path.join(dirname, subdirname)) 114 | 115 | # get all audio models 116 | audio_models = [] 117 | for dirname, dirnames, filenames in os.walk(args.audio_model_folder): 118 | for subdirname in dirnames: 119 | audio_models.append(os.path.join(dirname, subdirname)) 120 | 121 | 122 | # evaluate all combination of models 123 | evaluation = Evaluation(talks) 124 | for lexical_model in lexical_models: 125 | for audio_model in audio_models: 126 | evaluation.evaluate(lexical_model, audio_model, vector) 127 | 128 | -------------------------------------------------------------------------------- /python/evaluation_data/.gitignore: -------------------------------------------------------------------------------- 1 | *.* 2 | !*.gitignore 3 | !download_all.sh 4 | !download_models.sh 5 | !folders.txt 6 | !models.txt 7 | -------------------------------------------------------------------------------- /python/evaluation_data/download_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | user="ms2015t3" 4 | host="172.16.23.193" 5 | 6 | mkdir -p data 7 | scp "$user@$host:/mnt/naruto/sentence/data/audio/tst2011_*.{ctm,energy,pitch}" data/ 8 | 9 | mkdir -p audio_models 10 | scp -r "$user@$host:/home/ms2015t3/sentence-boundary-detection-nn/net-audio/experiments/20160126-053506_audio_window-8-4" audio_models/ 11 | 12 | mkdir -p lexical_models 13 | scp -r "$user@$host:/home/ms2015t3/sentence-boundary-detection-nn/net/experiments/20160111-131832_google_ted_window-8-4_pos-true_qm-false_balanced-false_nr-rep-true_word-this" lexical_models/ 14 | -------------------------------------------------------------------------------- /python/evaluation_data/folders.txt: -------------------------------------------------------------------------------- 1 | audio_models 2 | lexical_models 3 | data 4 | -------------------------------------------------------------------------------- /python/experiments/README.md: -------------------------------------------------------------------------------- 1 | If you want to train multiple configurations on multiple databases you can use the convenience scripts in this folder. 2 | 3 | First you will need to create a root folder, where all databases and experiment data is going to be saved, e.g. `/some/path`. 4 | Then create a `1_open`, a `2_databased`, a `3_trained`, a `4_database_failed` and a `5_training_failed` folder inside that path (e.g. `/some/path/1_open`). 5 | Insert all config files you want to test into the `1_open` folder. 6 | Note that you should give these config files **meaningful filenames**, as their filenames are used for identification purposes later on. 7 | 8 | # Creating Multiple Databases 9 | 10 | For lexical data use `databases.sh` and for acoustic data use `audio_databases.sh`. 11 | 12 | * Pass the original root folder (e.g. `/some/path`) to either of these scripts. 
13 | * All config files for which a database was created successfully, will be moved to the subfolder `2_databased`. 14 | * If any fail, they will be moved to a subfolder `4_database_failed`. 15 | 16 | # Training on Multiple Databases 17 | 18 | For lexical data use `training.sh` and for acoustic data use `audio_training.sh`. 19 | 20 | * Pass the original root folder (e.g. `/some/path`) to either of these scripts. 21 | * This script will access all config files in the subfolder `2_databased`, and automatically move successful trained files to a subfolder `3_trained` 22 | * If any fail, they will be moved to a subfolder `5_training_failed`. 23 | 24 | The final experiment names will be taken from the basename of the config file. 25 | -------------------------------------------------------------------------------- /python/experiments/audio_databases.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ "$#" -ne 1 ]; then 4 | echo "Illegal number of parameters!" 5 | echo "./experiments.sh " 6 | exit 7 | fi 8 | 9 | CONFIG_FOLDER=$1 10 | 11 | if ! [[ -d $CONFIG_FOLDER ]]; then 12 | echo "$CONFIG_FOLDER is not a directory!" 13 | exit 14 | fi 15 | 16 | # Abort on first error 17 | set -e 18 | 19 | source $SENTENCE_HOME/use_python p2 20 | 21 | for CONFIG_FILE in "$CONFIG_FOLDER"/1_open/* 22 | do 23 | cd $SENTENCE_HOME/python/ 24 | CONFIG=$(basename ${CONFIG_FILE}) 25 | CONFIG="${CONFIG%.*}" 26 | echo "#################### Creating database with $CONFIG ####################" 27 | python sbd_leveldb/audio_training_instance_generator.py $CONFIG_FILE 28 | 29 | if [ $? -eq 0 ]; then 30 | echo "#################### Moving to 2_databased ####################" 31 | mv $CONFIG_FOLDER/1_open/$CONFIG.ini $CONFIG_FOLDER/2_databased/ 32 | else 33 | echo "#################### Moving to 4_database_failed ####################" 34 | mv $CONFIG_FOLDER/1_open/$CONFIG.ini $CONFIG_FOLDER/4_database_failed/ 35 | fi 36 | 37 | done 38 | -------------------------------------------------------------------------------- /python/experiments/audio_training.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ "$#" -ne 1 ]; then 4 | echo "Illegal number of parameters!" 5 | echo "./experiments.sh " 6 | exit 7 | fi 8 | 9 | CONFIG_FOLDER=$1 10 | 11 | if ! [[ -d $CONFIG_FOLDER ]]; then 12 | echo "$CONFIG_FOLDER is not a directory!" 13 | exit 14 | fi 15 | 16 | source $SENTENCE_HOME/use_python p2 17 | 18 | for CONFIG_FILE in "$CONFIG_FOLDER"/2_databased/* 19 | do 20 | cd $SENTENCE_HOME/python/ 21 | CONFIG=$(basename ${CONFIG_FILE}) 22 | CONFIG="${CONFIG%.*}" 23 | echo "#################### Training with $CONFIG ####################" 24 | echo "#################### Configuring net ####################" 25 | python tools/netconfig.py ../net-audio/net.prototxt -o ../net-audio/auto.prototxt -t $SENTENCE_HOME/leveldbs/$CONFIG 26 | echo "#################### Starting training ####################" 27 | date 28 | cd $SENTENCE_HOME/net-audio/ 29 | ./training.sh $CONFIG 30 | 31 | if [ $? 
-eq 0 ]; then 32 | echo "#################### Moving to 3_trained ####################" 33 | cd $SENTENCE_HOME/python/ 34 | mv $CONFIG_FOLDER/2_databased/$CONFIG.ini $CONFIG_FOLDER/3_trained 35 | else 36 | echo "#################### Moving to 5_training_failed ####################" 37 | cd $SENTENCE_HOME/python/ 38 | mv $CONFIG_FOLDER/2_databased/$CONFIG.ini $CONFIG_FOLDER/5_training_failed 39 | fi 40 | echo "#################### Removing net definition ####################" 41 | cd $SENTENCE_HOME/net-audio/ 42 | rm auto.prototxt 43 | done 44 | -------------------------------------------------------------------------------- /python/experiments/databases.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ "$#" -ne 1 ]; then 4 | echo "Illegal number of parameters!" 5 | echo "./experiments.sh " 6 | exit 7 | fi 8 | 9 | CONFIG_FOLDER=$1 10 | 11 | if ! [[ -d $CONFIG_FOLDER ]]; then 12 | echo "$CONFIG_FOLDER is not a directory!" 13 | exit 14 | fi 15 | 16 | # Abort on first error 17 | set -e 18 | 19 | source $SENTENCE_HOME/use_python p2 20 | 21 | for CONFIG_FILE in "$CONFIG_FOLDER"/1_open/* 22 | do 23 | cd $SENTENCE_HOME/python/ 24 | CONFIG=$(basename ${CONFIG_FILE}) 25 | CONFIG="${CONFIG%.*}" 26 | echo "#################### Creating database with $CONFIG ####################" 27 | python sbd_leveldb/training_instance_generator.py $CONFIG_FILE 28 | 29 | if [ $? -eq 0 ]; then 30 | echo "#################### Moving to 2_databased ####################" 31 | mv $CONFIG_FOLDER/1_open/$CONFIG.ini $CONFIG_FOLDER/2_databased/ 32 | else 33 | echo "#################### Moving to 4_database_failed ####################" 34 | mv $CONFIG_FOLDER/1_open/$CONFIG.ini $CONFIG_FOLDER/4_database_failed/ 35 | fi 36 | 37 | done 38 | -------------------------------------------------------------------------------- /python/experiments/training.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ "$#" -ne 1 ]; then 4 | echo "Illegal number of parameters!" 5 | echo "./experiments.sh " 6 | exit 7 | fi 8 | 9 | CONFIG_FOLDER=$1 10 | 11 | if ! [[ -d $CONFIG_FOLDER ]]; then 12 | echo "$CONFIG_FOLDER is not a directory!" 13 | exit 14 | fi 15 | 16 | source $SENTENCE_HOME/use_python p2 17 | 18 | for CONFIG_FILE in "$CONFIG_FOLDER"/2_databased/* 19 | do 20 | cd $SENTENCE_HOME/python/ 21 | CONFIG=$(basename ${CONFIG_FILE}) 22 | CONFIG="${CONFIG%.*}" 23 | echo "#################### Training with $CONFIG ####################" 24 | echo "#################### Configuring net ####################" 25 | python tools/netconfig.py ../net/net.prototxt -o ../net/auto.prototxt -t $SENTENCE_HOME/leveldbs/$CONFIG 26 | echo "#################### Starting training ####################" 27 | date 28 | cd $SENTENCE_HOME/net/ 29 | ./training.sh $CONFIG 30 | 31 | if [ $? 
-eq 0 ]; then 32 | echo "#################### Moving to 3_trained ####################" 33 | cd $SENTENCE_HOME/python/ 34 | mv $CONFIG_FOLDER/2_databased/$CONFIG.ini $CONFIG_FOLDER/3_trained 35 | else 36 | echo "#################### Moving to 5_training_failed ####################" 37 | cd $SENTENCE_HOME/python/ 38 | mv $CONFIG_FOLDER/2_databased/$CONFIG.ini $CONFIG_FOLDER/5_training_failed 39 | fi 40 | echo "#################### Removing net definition ####################" 41 | cd $SENTENCE_HOME/net/ 42 | rm auto.prototxt 43 | done 44 | -------------------------------------------------------------------------------- /python/parsing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/python/parsing/__init__.py -------------------------------------------------------------------------------- /python/parsing/abstract_parser.py: -------------------------------------------------------------------------------- 1 | import os, argparse 2 | 3 | from common.argparse_util import * 4 | 5 | class AbstractParser(object): 6 | """AbstractParser with standard filename methods, parse method has to be implemented by subclass""" 7 | def __init__(self, filename): 8 | self.filename = filename 9 | 10 | def _wanted_file_endings(self): 11 | """returns a list of file endings, that can be parsed by this parser""" 12 | raise NotImplementedError("to be implemented by subclass") 13 | 14 | def wants_this_file(self): 15 | basepath, extension = os.path.splitext(self.filename) 16 | return extension in self._wanted_file_endings() 17 | 18 | def get_file_name(self): 19 | return self.filename 20 | 21 | def parse(self): 22 | """returns a list of talks, it is recommended to use the python generator for less memory usage""" 23 | raise NotImplementedError("to be implemented by subclass") 24 | 25 | def progress(self): 26 | """progress of parsing, should be implemented for parsers with large file sizes""" 27 | raise NotImplementedError("to be implemented by subclass") 28 | 29 | def _no_progress_function(self): 30 | return 0. 
31 | 32 | def _line_count_progress(self): 33 | return float(self._progress) / self._linenumber 34 | 35 | def _init_line_count_progress(self): 36 | i = -1 37 | with open(self.filename) as f: 38 | for i, line in enumerate(f): 39 | pass 40 | self._linenumber = i + 1 41 | self._progress = 0 42 | 43 | 44 | def main(filename, class_): 45 | parser = class_(filename) 46 | texts = parser.parse() 47 | for i, text in enumerate(texts): 48 | print "progress %f, text %d:" % (parser.progress(), i) 49 | print text 50 | 51 | def parse_command_line_arguments(class_): 52 | parser = argparse.ArgumentParser(description='Test the file parsing') 53 | parser.add_argument('filename', help='the file you want to parse', type=lambda arg: is_valid_file(parser, arg)) 54 | args = parser.parse_args() 55 | 56 | main(args.filename, class_) 57 | -------------------------------------------------------------------------------- /python/parsing/audio_parser.py: -------------------------------------------------------------------------------- 1 | from parsing.get_parser import * 2 | from sbd_classification.classification_input import InputText 3 | from sbd_classification.classification_input import InputAudio 4 | from preprocessing.tokens import WordToken 5 | from preprocessing.nlp_pipeline import NlpPipeline 6 | 7 | class AudioParser(object): 8 | 9 | def parse(self, ctm_file): 10 | parser = get_parser(ctm_file) 11 | base_dir = os.path.dirname(parser.get_file_name()) 12 | raw_talks = parser.parse() 13 | 14 | talks = [] 15 | for i, talk in enumerate(raw_talks): 16 | # build range map from second intervals to tokens 17 | talk.build_interval_tree() 18 | 19 | # get pitch feature values 20 | pitch_file = base_dir + "/" + talk.group_name + "_talkid" + str(talk.talk_id) + ".pitch" 21 | talk.parse_pitch_feature(pitch_file) 22 | 23 | # get energy feature values 24 | energy_file = base_dir + "/" + talk.group_name + "_talkid" + str(talk.talk_id) + ".energy" 25 | talk.parse_energy_feature(energy_file) 26 | 27 | # get pitch feature values 28 | talk.parse_pitch_feature(pitch_file) 29 | # get energy feature values 30 | talk.parse_energy_feature(energy_file) 31 | # normalize features 32 | talk.normalize() 33 | 34 | talks.append(talk) 35 | 36 | return talks 37 | 38 | -------------------------------------------------------------------------------- /python/parsing/ctm_parser.py: -------------------------------------------------------------------------------- 1 | import sys, argparse, os 2 | from os.path import basename 3 | 4 | import re 5 | from common.argparse_util import * 6 | import common.sbd_config as sbd 7 | from preprocessing.nlp_pipeline import NlpPipeline, PosTag 8 | from preprocessing.audio import Audio, AudioSentence 9 | from preprocessing.tokens import AudioToken, PunctuationToken, Punctuation 10 | 11 | from abstract_parser import AbstractParser, main, parse_command_line_arguments 12 | 13 | reload(sys) 14 | sys.setdefaultencoding('utf8') 15 | 16 | 17 | class CtmParser(AbstractParser): 18 | 19 | def __init__(self, filename): 20 | super(CtmParser, self).__init__(filename) 21 | if not self.wants_this_file(): 22 | return 23 | 24 | self._init_line_count_progress() 25 | 26 | def _wanted_file_endings(self): 27 | return (".ctm",) 28 | 29 | def parse(self): 30 | current_talk_id = 0 31 | audio = Audio() 32 | sentence = AudioSentence() 33 | sentence.tokens = [] 34 | 35 | group_name = self._extract_group_name() 36 | 37 | with open(self.filename, "r") as file_: 38 | for line_unenc in file_: 39 | self._progress += 1 40 | 41 | # parse line 42 | line = 
unicode(line_unenc, errors='ignore') 43 | line = line.rstrip() 44 | 45 | if line.startswith("#"): 46 | talk_id = self._extract_talk_id(line) 47 | token_count = len(sentence.tokens) 48 | 49 | # end of sentence reached 50 | if token_count > 0: 51 | sentence.begin = sentence.tokens[0].begin 52 | sentence.end = sentence.tokens[-1].begin + sentence.tokens[-1].duration 53 | sentence.tokenize() 54 | sentence.prepare() 55 | audio.add_sentence(sentence) 56 | 57 | # end of talk reached 58 | if talk_id != current_talk_id: 59 | if token_count > 0: 60 | # save audio talk 61 | audio.talk_id = current_talk_id 62 | audio.group_name = group_name 63 | audio = self._prepare_audio(audio) 64 | yield audio 65 | audio = Audio() 66 | current_talk_id = talk_id 67 | continue 68 | else: 69 | current_talk_id = talk_id 70 | 71 | # begin a new sentence 72 | sentence = AudioSentence() 73 | sentence.tokens = [] 74 | 75 | else: 76 | # parse line 77 | line_parts = re.split(" +", line) 78 | begin = float(line_parts[2]) 79 | duration = float(line_parts[3]) 80 | word = line_parts[4] 81 | 82 | # add token to sentence 83 | token = AudioToken(word.lower()) 84 | token.begin = begin 85 | token.duration = duration 86 | 87 | sentence.append_token(token) 88 | 89 | if (len(sentence.tokens) > 0): 90 | sentence.begin = sentence.tokens[0].begin 91 | sentence.end = sentence.tokens[-1].begin + sentence.tokens[-1].duration 92 | sentence.tokenize() 93 | sentence.prepare() 94 | audio.add_sentence(sentence) 95 | 96 | if len(audio.sentences) > 0: 97 | audio.talk_id = current_talk_id 98 | audio.group_name = group_name 99 | audio = self._prepare_audio(audio) 100 | yield audio 101 | 102 | def _extract_group_name(self): 103 | return basename(self.filename).split("_")[0] 104 | 105 | def _prepare_audio(self, audio): 106 | # sort sentences by begin 107 | sorted_sentences = sorted(audio.sentences, key=lambda x: x.begin) 108 | audio.sentences = sorted_sentences 109 | 110 | # calculate pause before and pause after 111 | return self._calculate_pause(audio) 112 | 113 | def _calculate_pause(self, audio): 114 | last_end = 0.0 115 | last_token = None 116 | 117 | for token in audio.get_tokens(): 118 | if token.is_punctuation(): 119 | continue 120 | 121 | pause = float(format(token.begin - last_end, '.4f')) 122 | 123 | if pause < 0.0 or pause == -0.0: 124 | pause = 0.0 125 | 126 | token.set_pause_before(pause) 127 | if last_token is not None: 128 | last_token.set_pause_after(pause) 129 | 130 | last_end = token.begin + token.duration 131 | last_token = token 132 | 133 | return audio 134 | 135 | def _extract_talk_id(self, line): 136 | line = line[2:] 137 | line_parts = line.split("talkid") 138 | relevant = line_parts[1] 139 | 140 | talkid = "0" 141 | for i in range(0, len(relevant)): 142 | if relevant[i].isdigit(): 143 | talkid += relevant[i] 144 | else: 145 | break 146 | 147 | return int(talkid) 148 | 149 | def progress(self): 150 | return self._line_count_progress() 151 | 152 | 153 | ################ 154 | # Example call # 155 | ################ 156 | 157 | if __name__ == '__main__': 158 | parse_command_line_arguments(CtmParser) 159 | -------------------------------------------------------------------------------- /python/parsing/get_parser.py: -------------------------------------------------------------------------------- 1 | import sys, argparse, os 2 | 3 | from common.argparse_util import * 4 | from line_parser import LineParser 5 | from plaintext_parser import PlaintextParser 6 | from xml_parser import XMLParser 7 | from ctm_parser import CtmParser 8 | 9 | 
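# Chooses a parser for the given file: every known parser is instantiated and
# the first one whose wants_this_file() accepts the file extension is returned,
# or None if nothing matches. The LineParser constructor is wrapped in
# try/except because it may reject an unsupported configuration with a ValueError.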
10 | def get_parser(filename): 11 | parsers = [] 12 | parsers.append(PlaintextParser(filename)) 13 | try: 14 | parsers.append(LineParser(filename)) 15 | except ValueError: 16 | pass 17 | parsers.append(XMLParser(filename)) 18 | parsers.append(CtmParser(filename)) 19 | 20 | for parser in parsers: 21 | if parser.wants_this_file(): 22 | return parser 23 | 24 | return None 25 | 26 | def main(filename): 27 | parser = get_parser(filename) 28 | if parser: 29 | texts = parser.parse() 30 | for i, text in enumerate(texts): 31 | print "progress %f, text %d:" % (parser.progress(), i) 32 | print text 33 | else: 34 | print "#error: no suitable parser for %s found, sorry." % filename 35 | 36 | if __name__ == '__main__': 37 | parser = argparse.ArgumentParser(description='Test the file parsing') 38 | parser.add_argument('filename', help='the file you want to parse', type=lambda arg: is_valid_file(parser, arg)) 39 | args = parser.parse_args() 40 | 41 | main(args.filename) 42 | -------------------------------------------------------------------------------- /python/parsing/line_parser.py: -------------------------------------------------------------------------------- 1 | import sys, argparse, os 2 | 3 | from common.argparse_util import * 4 | import common.sbd_config as sbd 5 | from preprocessing.nlp_pipeline import NlpPipeline, PosTag 6 | from preprocessing.text import Text, Sentence, END_OF_TEXT_MARKER 7 | from preprocessing.tokens import WordToken, PunctuationToken, Punctuation 8 | 9 | from abstract_parser import AbstractParser, main, parse_command_line_arguments 10 | 11 | reload(sys) 12 | sys.setdefaultencoding('utf8') 13 | 14 | 15 | class LineParser(AbstractParser): 16 | 17 | def __init__(self, filename): 18 | super(LineParser, self).__init__(filename) 19 | if not self.wants_this_file(): 20 | return 21 | 22 | self._init_line_count_progress() 23 | # if sbd.config.getboolean('features', 'use_question_mark'): 24 | # raise ValueError("Question marks not supported by LineParser") 25 | 26 | self.POS_TAGGING = sbd.config.getboolean('features', 'pos_tagging') 27 | self.nlp_pipeline = NlpPipeline() 28 | 29 | def _wanted_file_endings(self): 30 | return (".line", ) 31 | 32 | def parse(self): 33 | with open(self.filename, "r") as file_: 34 | text = Text() 35 | sentence = Sentence() 36 | sentence.tokens = [] 37 | 38 | for line_unenc in file_: 39 | # end of a text reached 40 | if line_unenc.rstrip() == END_OF_TEXT_MARKER: 41 | yield text 42 | text = Text() 43 | continue 44 | 45 | self._progress += 1 46 | 47 | # parse line 48 | line = unicode(line_unenc, errors='ignore') 49 | line = line.rstrip() 50 | 51 | # split line into word, pos_tags and type 52 | line_parts = line.split('\t') 53 | word = self._get_word(line_parts) 54 | if word is None: 55 | continue 56 | pos_tags = self._get_pos_tags(line_parts) 57 | punctuation = self._get_punctuation(line_parts) 58 | 59 | sentence.tokens.extend(self._create_tokens(word, pos_tags, punctuation)) 60 | 61 | # we are at the end of a sentence 62 | if punctuation == 'PERIOD': 63 | if self.POS_TAGGING and not pos_tags: 64 | self.nlp_pipeline.pos_tag(sentence.tokens) 65 | text.add_sentence(sentence) 66 | sentence = Sentence() 67 | sentence.tokens = [] 68 | 69 | # if we do not have any end-of-text-marker 70 | # return everything as one text 71 | if len(text.sentences) > 0: 72 | yield text 73 | 74 | def _get_word(self, line_parts): 75 | word = unicode(line_parts[0]) 76 | word = self.nlp_pipeline.process_word(word) 77 | # check if needed 78 | # if "?" 
in word and len(word) > 0: 79 | # word = word.replace("?", "") 80 | return word 81 | 82 | def _get_punctuation(self, line_parts): 83 | if len(line_parts) == 2: 84 | return unicode(line_parts[1]) 85 | else: 86 | return unicode(line_parts[2]) 87 | 88 | def _get_pos_tags(self, line_parts): 89 | if len(line_parts) == 2: 90 | return set() 91 | else: 92 | pos_tag_str = line_parts[1].split(",") 93 | pos_tag_types = map(lambda x: x.split(".")[1], pos_tag_str) 94 | return set(map(lambda x: PosTag[x], pos_tag_types)) 95 | 96 | def progress(self): 97 | return self._line_count_progress() 98 | 99 | def _create_tokens(self, word, pos_tags, punctuation): 100 | word_token = WordToken(word) 101 | word_token.set_pos_tags(pos_tags) 102 | 103 | punctuation_token = None 104 | if punctuation == 'PERIOD': 105 | punctuation_token = PunctuationToken(punctuation, Punctuation.PERIOD) 106 | elif punctuation == 'COMMA': 107 | punctuation_token = PunctuationToken(punctuation, Punctuation.COMMA) 108 | 109 | if punctuation_token is not None: 110 | return [word_token, punctuation_token] 111 | return [word_token] 112 | 113 | 114 | 115 | ################ 116 | # Example call # 117 | ################ 118 | 119 | if __name__ == '__main__': 120 | parse_command_line_arguments(LineParser) 121 | -------------------------------------------------------------------------------- /python/parsing/plaintext_parser.py: -------------------------------------------------------------------------------- 1 | import argparse, sys, os 2 | 3 | from common.argparse_util import * 4 | from preprocessing.nlp_pipeline import NlpPipeline 5 | from preprocessing.text import Sentence, Text 6 | 7 | from abstract_parser import AbstractParser, main, parse_command_line_arguments 8 | 9 | TEXT_SEPARATOR = "################################################################################" 10 | 11 | reload(sys) 12 | sys.setdefaultencoding('utf8') 13 | 14 | class PlaintextParser(AbstractParser): 15 | def __init__(self, filename): 16 | super(PlaintextParser, self).__init__(filename) 17 | if not self.wants_this_file(): 18 | return 19 | self._init_line_count_progress() 20 | self.nlp_pipeline = NlpPipeline() 21 | 22 | def _wanted_file_endings(self): 23 | return (".txt",) 24 | 25 | def parse(self): 26 | text = Text() 27 | 28 | with open(self.filename, "r") as file_: 29 | for line_unenc in file_: 30 | self._progress += 1 31 | line = unicode(line_unenc.encode('utf8')) 32 | if line.startswith(TEXT_SEPARATOR): 33 | if (len(text.sentences) > 0): 34 | yield text 35 | text = Text() 36 | continue 37 | sentences = self.nlp_pipeline.sentence_segmentation(line) 38 | for sentence in sentences: 39 | s = Sentence() 40 | s.set_sentence_text(sentence) 41 | s.set_tokens(self.nlp_pipeline.parse_text(sentence)) 42 | text.add_sentence(s) 43 | if (len(text.sentences) > 0): 44 | yield text 45 | 46 | def progress(self): 47 | return self._line_count_progress() 48 | 49 | 50 | ################ 51 | # Example call # 52 | ################ 53 | 54 | if __name__ == '__main__': 55 | parse_command_line_arguments(PlaintextParser) 56 | -------------------------------------------------------------------------------- /python/parsing/xml_parser.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree, sys, os.path, re 2 | 3 | from common.argparse_util import * 4 | from preprocessing.nlp_pipeline import NlpPipeline 5 | from preprocessing.text import * 6 | 7 | from abstract_parser import AbstractParser, main, parse_command_line_arguments 8 | 9 | 
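# Reads mteval-style XML files: the root element is expected to contain a
# <srcset> with one <doc> element per talk, and each <doc> holds the talk's
# sentences as <seg> elements.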
class XMLParser(AbstractParser): 10 | def __init__(self, filename): 11 | super(XMLParser, self).__init__(filename) 12 | if not self.wants_this_file(): 13 | return 14 | self.nlp_pipeline = NlpPipeline() 15 | self._linenumber = self._count_docs() 16 | self._progress = 0 17 | 18 | def _wanted_file_endings(self): 19 | return (".xml",) 20 | 21 | def parse(self): 22 | mteval = xml.etree.ElementTree.parse(self.filename).getroot() 23 | srcset = mteval.find("srcset") 24 | for doc in srcset.findall('doc'): 25 | self._progress += 1 26 | talk = Text() 27 | 28 | for sentence in doc.findall("seg"): 29 | sentence_text = unicode(sentence.text) 30 | 31 | sentence = Sentence() 32 | sentence.set_sentence_text(sentence_text) 33 | sentence.set_tokens(self.nlp_pipeline.parse_text(sentence_text)) 34 | talk.add_sentence(sentence) 35 | 36 | yield talk 37 | 38 | def progress(self): 39 | return self._line_count_progress() 40 | 41 | def _count_docs(self): 42 | mteval = xml.etree.ElementTree.parse(self.filename).getroot() 43 | srcset = mteval.find("srcset") 44 | i = 0 45 | for doc in srcset.findall('doc'): 46 | i += 1 47 | return i 48 | 49 | ################ 50 | # Example call # 51 | ################ 52 | 53 | if __name__ == '__main__': 54 | parse_command_line_arguments(XMLParser) 55 | -------------------------------------------------------------------------------- /python/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/python/preprocessing/__init__.py -------------------------------------------------------------------------------- /python/preprocessing/glove_file.py: -------------------------------------------------------------------------------- 1 | import sys, argparse, struct, numpy 2 | import common.sbd_config as sbd 3 | 4 | class GloveFile(object): 5 | """reads a binary word vector file, returns vectors for single words""" 6 | 7 | def __init__(self, filename): 8 | self.ENCODING = 'UTF-8' 9 | self.KEY_ERROR_VECTOR = sbd.config.get('word_vector', 'key_error_vector') 10 | 11 | # the following variable counts word, that are not covered in the given vector 12 | # see get_vector for details 13 | self.not_covered_words = dict() 14 | # and some bare numbers 15 | self.nr_covered_words = 0 16 | self.nr_uncovered_words = 0 17 | # read vector file 18 | self.__filename = filename 19 | 20 | try: 21 | self.__file = open(filename, 'rb') 22 | except IOError: 23 | print ('The file %s can not be read!' % self.__filename) 24 | return 25 | 26 | self.words = 400000 27 | self.vector_size = 50 28 | 29 | self.vector_array = numpy.zeros((self.words, self.vector_size), float) 30 | self.word2index = {} 31 | self.average_vector = numpy.zeros((self.vector_size,), float) 32 | 33 | index = 0 34 | with open(filename) as f: 35 | for line in f: 36 | if index % 100000 == 0: 37 | print("Parsed %d/%d lines." 
% (index, self.words)) 38 | parts = line.split(" ") 39 | word = parts[0] 40 | vector = parts[1:] 41 | 42 | self.word2index[word] = index 43 | for i in range (len(vector)): 44 | self.vector_array[index][i] = float(vector[i]) 45 | 46 | index += 1 47 | 48 | self.__file.close() 49 | print('Parsing finished!') 50 | 51 | def __del__(self): 52 | self.vector_array = None 53 | self.word2index = None 54 | 55 | def get_vector(self, word): 56 | try: 57 | idx = self.word2index[word] 58 | self.nr_covered_words += 1 59 | return self.vector_array[idx] 60 | except KeyError: 61 | self.not_covered_words[word] = self.not_covered_words.get(word, 0) + 1 62 | self.nr_uncovered_words += 1 63 | if self.KEY_ERROR_VECTOR != 'avg': 64 | idx = self.word2index[self.KEY_ERROR_VECTOR] 65 | return self.vector_array[idx] 66 | else: 67 | return self.average_vector 68 | -------------------------------------------------------------------------------- /python/preprocessing/nlp_pipeline.py: -------------------------------------------------------------------------------- 1 | import nltk, nltk.data 2 | from enum import Enum 3 | import regex as re 4 | import common.sbd_config as sbd 5 | from tokens import Punctuation, PunctuationToken, WordToken 6 | 7 | 8 | class PosTag(Enum): 9 | OTHER = 0 10 | VERB = 1 11 | NOUN = 2 12 | DETERMINER = 3 13 | ADJECTIVE = 4 14 | ADVERB = 5 15 | NUMERAL = 6 16 | CONJUNCTION = 7 17 | PARTICLE = 8 18 | EXISTENTIAL_THERE = 9 19 | MARKER = 10 20 | PRONOUN = 11 21 | INTERJECTION = 12 22 | QUESTION_WORDS = 13 23 | 24 | 25 | class NlpPipeline(object): 26 | 27 | def __init__(self): 28 | self.POS_TAGGING = sbd.config.getboolean('features', 'pos_tagging') 29 | self.NUMBER_REPLACEMENT = sbd.config.getboolean('features', 'number_replacement') 30 | 31 | self.punkt = None 32 | self.punctuation_regex = re.compile("^\p{posix_punct}+$") 33 | self.punctuation_mapping = { 34 | ";": Punctuation.PERIOD, 35 | ".": Punctuation.PERIOD, 36 | "!": Punctuation.PERIOD, 37 | ",": Punctuation.COMMA, 38 | ":": Punctuation.COMMA, 39 | "-": Punctuation.COMMA, 40 | "--": Punctuation.COMMA, 41 | "?": Punctuation.QUESTION 42 | } 43 | self.inv_pos_tag_mapping = { 44 | PosTag.ADJECTIVE: { 45 | "JJ", "JJR", "JJS" 46 | }, 47 | PosTag.ADVERB: { 48 | "RB", "RBR", "RBS" 49 | }, 50 | PosTag.PARTICLE: { 51 | "RP" 52 | }, 53 | PosTag.CONJUNCTION: { 54 | "CC", "IN" 55 | }, 56 | PosTag.NUMERAL: { 57 | "CD", "LS" 58 | }, 59 | PosTag.DETERMINER: { 60 | "DT", "PDT" 61 | }, 62 | PosTag.EXISTENTIAL_THERE: { 63 | "EX" 64 | }, 65 | PosTag.NOUN: { 66 | "FW", "NN", "NNP", "NNPS", "NNS" 67 | }, 68 | PosTag.VERB: { 69 | "MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ" 70 | }, 71 | PosTag.MARKER: { 72 | "POS", "TO" 73 | }, 74 | PosTag.PRONOUN: { 75 | "PRP", "PRP$" 76 | }, 77 | PosTag.INTERJECTION: { 78 | "UH" 79 | }, 80 | PosTag.QUESTION_WORDS: { 81 | "WDT", "WP", "WP$", "WRB" 82 | } 83 | } 84 | self.pos_tag_mapping = { 85 | v2: k for k, v1 in self.inv_pos_tag_mapping.items() for v2 in v1 86 | } 87 | 88 | def parse_text(self, text): 89 | """ 90 | Parses a text and create tokens. 91 | 92 | Args: 93 | text (str): A string representing a sentence. 94 | 95 | Returns: 96 | [token]: List of word and punctuation tokens. 
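Example (with number replacement and POS tagging disabled):
    parse_text(u"Hello, world.") yields a WordToken for "Hello", a COMMA
    PunctuationToken, a WordToken for "world" and a PERIOD PunctuationToken.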
97 | """ 98 | 99 | raw_tokens = nltk.word_tokenize(text) 100 | tokens = [] 101 | 102 | for raw_token in raw_tokens: 103 | if raw_token in self.punctuation_mapping: 104 | punctuation_type = self.punctuation_mapping[raw_token] 105 | tokens.append(PunctuationToken(raw_token, punctuation_type)) 106 | else: 107 | word_token = self.process_word(raw_token) 108 | if word_token is None: 109 | continue 110 | tokens.append(WordToken(word_token)) 111 | 112 | if self.POS_TAGGING: 113 | self.pos_tag(tokens) 114 | 115 | return tokens 116 | 117 | def process_word(self, raw_token): 118 | if re.match(self.punctuation_regex, raw_token): 119 | return None 120 | if self.NUMBER_REPLACEMENT: 121 | return self._replace_number(raw_token) 122 | return raw_token 123 | 124 | 125 | def pos_tag(self, tokens): 126 | word_tokens = map(lambda x: x.word, tokens) 127 | pos_tags = nltk.pos_tag(word_tokens) 128 | 129 | for i, token in enumerate(tokens): 130 | if isinstance(token, WordToken): 131 | pos_tag_str = pos_tags[i][1] 132 | token.set_pos_tags(self._parse_pos_tag(pos_tag_str)) 133 | 134 | def _parse_pos_tag(self, pos_tag_str): 135 | pos_tags = pos_tag_str.split("/") 136 | pos_tag_set = set() 137 | 138 | for pos_tag in pos_tags: 139 | pos_tag_set.add(self.pos_tag_mapping.get(pos_tag, PosTag.OTHER)) 140 | 141 | return pos_tag_set 142 | 143 | def sentence_segmentation(self, text): 144 | if not self.punkt: 145 | self.punkt = nltk.data.load('tokenizers/punkt/english.pickle') 146 | return self.punkt.tokenize(text.strip()) 147 | 148 | def _replace_number(self, word): 149 | if word[:-2].isdigit() and (word.endswith("st") or word.endswith("nd") or word.endswith("rd") or word.endswith("th")): 150 | return "1st" 151 | try: 152 | float(word) 153 | return "1" 154 | except ValueError: 155 | return word 156 | 157 | -------------------------------------------------------------------------------- /python/preprocessing/sliding_window.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | import common.sbd_config as sbd 4 | from nlp_pipeline import Punctuation, NlpPipeline 5 | from text import Sentence, Text 6 | from tokens import PunctuationToken 7 | from training_instance import TrainingInstance 8 | 9 | 10 | class SlidingWindow(object): 11 | 12 | def __init__(self): 13 | self.WINDOW_SIZE = sbd.config.getint('windowing', 'window_size') 14 | self.PUNCTUATION_POS = sbd.config.getint('windowing', 'punctuation_position') 15 | 16 | def list_windows(self, talk): 17 | 18 | tokens = talk.get_tokens() 19 | 20 | index = 0 21 | training_instances = [] 22 | 23 | while index <= len(tokens) - self.WINDOW_SIZE: 24 | window_tokens = [] 25 | instance_label = Punctuation.NONE 26 | 27 | i = index 28 | word_count = 0 29 | while word_count < self.WINDOW_SIZE and i < len(tokens): 30 | current_token = tokens[i] 31 | is_punctuation = current_token.is_punctuation() 32 | 33 | # if there are two punctuations in a row, the last punctuation token is taken 34 | 35 | if not is_punctuation: 36 | word_count += 1 37 | window_tokens.append(current_token) 38 | elif i == index: 39 | index += 1 ##dont parse windows with punctuations at the beginning twice 40 | 41 | if word_count == self.PUNCTUATION_POS and is_punctuation: 42 | instance_label = current_token.punctuation_type 43 | 44 | i += 1 45 | 46 | # if punctuation pos is behind the last word, determine the instance label 47 | if word_count == self.PUNCTUATION_POS and i < len(tokens): 48 | current_token = tokens[i] 49 | is_punctuation = current_token.is_punctuation() 
50 | if is_punctuation: 51 | instance_label = current_token.punctuation_type 52 | 53 | if len(window_tokens) == self.WINDOW_SIZE: 54 | training_instances.append(TrainingInstance(window_tokens, instance_label)) 55 | index += 1 56 | 57 | return training_instances 58 | 59 | 60 | 61 | ################ 62 | # Example call # 63 | ################ 64 | 65 | def main(): 66 | nlp_pipeline = NlpPipeline() 67 | 68 | sentence = Sentence() 69 | sentence.set_sentence_text(unicode("I'm a savant, or more precisly, a high-functioning autisitic savant")) 70 | sentence.set_tokens(nlp_pipeline.parse_text(sentence.sentence_text)) 71 | 72 | text = Text() 73 | text.add_sentence(sentence) 74 | 75 | slidingWindow = SlidingWindow() 76 | windows = slidingWindow.list_windows(text) 77 | 78 | for window in windows: 79 | print(window) 80 | 81 | if __name__ == '__main__': 82 | main() 83 | -------------------------------------------------------------------------------- /python/preprocessing/text.py: -------------------------------------------------------------------------------- 1 | import common.sbd_config as sbd 2 | 3 | END_OF_TEXT_MARKER = "###END###" 4 | 5 | class Text(object): 6 | 7 | def __init__(self): 8 | self.sentences = [] 9 | self.POS_TAGGING = sbd.config.getboolean('features', 'pos_tagging') 10 | 11 | def add_sentence(self, sentence): 12 | self.sentences.append(sentence) 13 | 14 | def get_tokens(self): 15 | tokens = [] 16 | for sentence in self.sentences: 17 | tokens.extend(sentence.tokens) 18 | return tokens 19 | 20 | def append_to_file(self, filename): 21 | file = open(filename, "a") 22 | 23 | for sentence in self.sentences: 24 | tokens = sentence.get_tokens() 25 | # get the word vectors for all tokens in the sentence 26 | for i, token in enumerate(tokens): 27 | if not token.is_punctuation(): 28 | if i == len(tokens) - 1: 29 | punctuation_string = "PERIOD" 30 | else: 31 | next_token = tokens[i + 1] 32 | if next_token.is_punctuation(): 33 | punctuation_string = str(next_token.punctuation_type) 34 | punctuation_string = punctuation_string[12:] 35 | else: 36 | punctuation_string = "O" 37 | 38 | if self.POS_TAGGING: 39 | line_str = u"%s\t%s\t%s\n" % (token.word.lower(), " ".join(map(unicode, token.pos_tags)), punctuation_string) 40 | else: 41 | line_str = u"%s\t%s\n" % (token.word.lower(), punctuation_string) 42 | 43 | file.write(line_str) 44 | 45 | file.write("%s\n" % END_OF_TEXT_MARKER) 46 | file.close() 47 | 48 | def __str__(self): 49 | sentences_str = ''.join(map(str, self.sentences)) 50 | return sentences_str 51 | 52 | 53 | class Sentence(object): 54 | 55 | def __init__(self): 56 | self.tokens = None 57 | self.sentence_text = None 58 | 59 | def set_sentence_text(self, sentence_text): 60 | self.sentence_text = sentence_text 61 | 62 | def set_tokens(self, tokens): 63 | self.tokens = tokens 64 | 65 | def get_tokens(self): 66 | return self.tokens 67 | 68 | def __str__(self): 69 | tokens_str = ', '.join(map(str, self.tokens)) 70 | 71 | return "sentence: %s \n tokens: %s \n" % (self.sentence_text, tokens_str) 72 | -------------------------------------------------------------------------------- /python/preprocessing/tokens.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class Punctuation(Enum): 5 | NONE = 0 6 | COMMA = 1 7 | PERIOD = 2 8 | QUESTION = 3 9 | 10 | 11 | class AudioToken(object): 12 | def __init__(self, word): 13 | self.word = word 14 | self.begin = 0.0 15 | self.duration = 0.0 16 | self.pause_before = 0.0 17 | self.pause_after 
= 0.0 18 | self.energy = 0.0 19 | self.pitch = 0.0 20 | self.pitch_levels = [] 21 | self.energy_levels = [] 22 | 23 | def is_punctuation(self): 24 | return False 25 | 26 | def append_pitch_level(self, pitch_level): 27 | self.pitch_levels.append(pitch_level) 28 | 29 | def append_energy_level(self, energy_level): 30 | self.energy_levels.append(energy_level) 31 | 32 | def set_pause_before(self, pause_before): 33 | self.pause_before = pause_before 34 | 35 | def set_pause_after(self, pause_after): 36 | self.pause_after = pause_after 37 | 38 | def set_energy(self, energy): 39 | self.energy = energy 40 | 41 | def set_pitch(self, pitch): 42 | self.pitch = pitch 43 | 44 | def __str__(self): 45 | return "(pause: %s, pitch: %s, energy: %s) %s" % (str(self.pause_before), str(self.pitch), str(self.energy), self.word) 46 | 47 | def __repr__(self): 48 | return self.word 49 | 50 | def __eq__(self, other): 51 | if other.is_punctuation(): 52 | return False 53 | return self.word == other.word 54 | 55 | def __hash__(self): 56 | return hash(self.word) ^ hash(self.is_punctuation()) 57 | 58 | 59 | class WordToken(object): 60 | def __init__(self, word): 61 | self.word = word 62 | self.word_vec = None 63 | self.pos_tags = set() 64 | 65 | def is_punctuation(self): 66 | return False 67 | 68 | def set_word_vec(self, word_vec): 69 | self.word_vec = word_vec 70 | 71 | def set_pos_tags(self, pos_tag): 72 | self.pos_tags = pos_tag 73 | 74 | def __str__(self): 75 | pos_str = "" 76 | if len(self.pos_tags) > 0: 77 | pos_str = " (" + " ".join(map(unicode, self.pos_tags)) + ")" 78 | return self.word + pos_str 79 | 80 | def __repr__(self): 81 | return self.word 82 | 83 | def __eq__(self, other): 84 | if other.is_punctuation(): 85 | return False 86 | return self.word == other.word 87 | 88 | def __hash__(self): 89 | return hash(self.word) ^ hash(self.is_punctuation()) 90 | 91 | 92 | class PunctuationToken(object): 93 | def __init__(self, word, punctuation_type): 94 | self.word = word 95 | self.punctuation_type = punctuation_type 96 | 97 | def is_punctuation(self): 98 | return True 99 | 100 | def __str__(self): 101 | return str(self.punctuation_type) 102 | 103 | def __repr__(self): 104 | return str(self) 105 | 106 | def __eq__(self, other): 107 | if not other.is_punctuation(): 108 | return False 109 | return self.punctuation_type == other.punctuation_type 110 | 111 | def __hash__(self): 112 | return hash(self.punctuation_type) ^ hash(self.is_punctuation()) 113 | 114 | -------------------------------------------------------------------------------- /python/preprocessing/training_instance.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | import common.sbd_config as sbd 4 | from nlp_pipeline import PosTag 5 | from tokens import Punctuation, AudioToken 6 | 7 | 8 | class TrainingInstance(object): 9 | 10 | def __init__(self, tokens, label): 11 | self.WINDOW_SIZE = sbd.config.getint('windowing', 'window_size') 12 | self.POS_TAGGING = sbd.config.getboolean('features', 'pos_tagging') 13 | self.USE_QUESTION_MARK = sbd.config.getboolean('features', 'use_question_mark') 14 | self.LEXICAL = sbd.config.getboolean('model', 'lexical') 15 | 16 | self.tokens = tokens 17 | self.label = label 18 | 19 | def __repr__(self): 20 | return "TOKENS: %s \nLABEL: %s \n" % (" ".join(map(unicode, self.tokens)), str(self.label)) 21 | 22 | def get_array(self): 23 | if self.LEXICAL: 24 | return self.get_lexical_array() 25 | else: 26 | return self.get_audio_array() 27 | 28 | def get_lexical_array(self): 
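# Builds a (1, WINDOW_SIZE, feature_size) float array: the first word_vec_size
# columns of each row hold the token's word vector and, if POS tagging is
# enabled, one additional column per PosTag value is set to 1.0 for the
# token's tags.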
29 | word_vec_size = len(self.tokens[0].word_vec) 30 | feature_size = word_vec_size 31 | 32 | if self.POS_TAGGING: 33 | feature_size += len(PosTag) 34 | 35 | dimensions = (1, self.WINDOW_SIZE, feature_size) 36 | arr = numpy.zeros(dimensions, float) 37 | 38 | for i in range(0, self.WINDOW_SIZE): 39 | arr[0][i][0:word_vec_size] = self.tokens[i].word_vec 40 | 41 | if self.POS_TAGGING: 42 | for pos_tag in self.tokens[i].pos_tags: 43 | arr[0][i][word_vec_size + pos_tag.value] = 1.0 44 | 45 | return arr 46 | 47 | def get_audio_array(self): 48 | feature_size = 4 49 | 50 | dimensions = (1, self.WINDOW_SIZE, feature_size) 51 | arr = numpy.zeros(dimensions, float) 52 | 53 | for i in range(0, self.WINDOW_SIZE): 54 | arr[0][i][0] = self.tokens[i].pause_before 55 | arr[0][i][1] = self.tokens[i].pause_after 56 | arr[0][i][2] = self.tokens[i].energy 57 | arr[0][i][3] = self.tokens[i].pitch 58 | 59 | return arr 60 | 61 | def get_label(self): 62 | if self.LEXICAL: 63 | return self.get_lexical_label() 64 | else: 65 | return self.get_audio_label() 66 | 67 | def get_audio_label(self): 68 | if self.label == Punctuation.PERIOD: 69 | return 1 70 | else: 71 | return self.label.value 72 | 73 | def get_lexical_label(self): 74 | if not self.USE_QUESTION_MARK and self.label == Punctuation.QUESTION: 75 | return Punctuation.PERIOD.value 76 | return self.label.value 77 | 78 | def get_tokens(self): 79 | return self.tokens 80 | -------------------------------------------------------------------------------- /python/preprocessing/word2vec_file.py: -------------------------------------------------------------------------------- 1 | import sys, argparse, struct, numpy 2 | 3 | from common.argparse_util import * 4 | import common.sbd_config as sbd 5 | 6 | class Word2VecFile(object): 7 | """reads a binary word vector file, returns vectors for single words""" 8 | def __init__(self, filename): 9 | self.ENCODING = 'UTF-8' 10 | self.KEY_ERROR_VECTOR = "this" 11 | 12 | self.key_mapping = { 13 | "'s": "is", 14 | "a": "the", 15 | "of": "from", 16 | "to": "from", 17 | "and": "or" 18 | } 19 | 20 | # the following variable counts word, that are not covered in the given vector 21 | # see get_vector for details 22 | self.not_covered_words = dict() 23 | # and some bare numbers 24 | self.nr_covered_words = 0 25 | self.nr_uncovered_words = 0 26 | # read vector file 27 | self.__filename = filename 28 | 29 | try: 30 | self.__file = open(filename, 'rb') 31 | except IOError: 32 | print ('The file %s can not be read!' % self.__filename) 33 | return 34 | 35 | first_line = self.__file.readline().decode(self.ENCODING).split(' ') 36 | self.words = int(first_line[0]) 37 | self.vector_size = int(first_line[1]) 38 | print('File has %d words with vectors of size %d. Parsing ..' 
% (self.words, self.vector_size)) 39 | 40 | self.vector_array = numpy.zeros((self.words, self.vector_size), numpy.float32) 41 | self.word2index = {} 42 | 43 | progress_steps = self.words / 100 44 | 45 | chars = [] 46 | for w_index in range(0, self.words): 47 | if w_index % progress_steps == 0: 48 | progress = w_index * 100 / self.words 49 | sys.stdout.write(str(progress) + "% ") 50 | sys.stdout.flush() 51 | byte = self.__file.read(1) 52 | while byte: 53 | if byte == b" ": 54 | word = b"".join(chars) 55 | self.word2index[word.decode(self.ENCODING)] = w_index 56 | chars = [] 57 | break 58 | if byte != b"\n": 59 | chars.append(byte) 60 | byte = self.__file.read(1) 61 | for f_index in range(0, self.vector_size): 62 | f_bytes = self.__file.read(4) 63 | self.vector_array[w_index][f_index] = struct.unpack('f', f_bytes)[0] 64 | self.__file.close() 65 | 66 | print('Parsing finished!') 67 | 68 | def __del__(self): 69 | self.vector_array = None 70 | self.word2index = None 71 | 72 | def get_vector(self, word): 73 | try: 74 | if word in self.key_mapping: 75 | # TODO: This only works for google vector, which does not have the words 'and', 'of' etc. 76 | # If we use other word2vec vectors, this won't work 77 | word = self.key_mapping[word] 78 | idx = self.word2index[word] 79 | self.nr_covered_words += 1 80 | return self.vector_array[idx] 81 | except KeyError: 82 | self.not_covered_words[word] = self.not_covered_words.get(word, 0) + 1 83 | self.nr_uncovered_words += 1 84 | if self.KEY_ERROR_VECTOR != 'avg': 85 | idx = self.word2index[self.KEY_ERROR_VECTOR] 86 | return self.vector_array[idx] 87 | raise Exception 88 | 89 | 90 | ################ 91 | # Example call # 92 | ################ 93 | 94 | def main(args): 95 | word2VecFile = Word2VecFile(args.datafile) 96 | for word in args.word: 97 | try: 98 | print(word, word2VecFile.get_vector(word)) 99 | except KeyError: 100 | print(word, "not found!") 101 | 102 | def is_valid_file(parser, arg, mode): 103 | try: 104 | f = open(arg, mode) 105 | f.close() 106 | return arg 107 | except IOError: 108 | parser.error('The file %s can not be opened!' 
% arg) 109 | 110 | if __name__ == '__main__': 111 | parser = argparse.ArgumentParser(description='Get word vector from binary data.') 112 | parser.add_argument('datafile', help='path to binary data file', type=lambda arg: is_valid_file(parser, arg, 'rb')) 113 | parser.add_argument('word', help='word to find in data file', nargs='+') 114 | args = parser.parse_args() 115 | main(args) 116 | -------------------------------------------------------------------------------- /python/sbd_classification/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/python/sbd_classification/__init__.py -------------------------------------------------------------------------------- /python/sbd_classification/audio_classification.py: -------------------------------------------------------------------------------- 1 | import numpy, caffe, argparse 2 | import common.sbd_config as sbd 3 | from preprocessing.nlp_pipeline import NlpPipeline, PosTag 4 | from preprocessing.sliding_window import SlidingWindow 5 | from preprocessing.word2vec_file import Word2VecFile 6 | from parsing.audio_parser import AudioParser 7 | from classification_input import InputAudio 8 | 9 | 10 | class AudioClassifier(object): 11 | 12 | def __init__(self, net, debug = False): 13 | self.classes = ["NONE", "PERIOD"] 14 | 15 | self.WINDOW_SIZE = sbd.config.getint('windowing', 'window_size') 16 | self.PUNCTUATION_POS = sbd.config.getint('windowing', 'punctuation_position') 17 | 18 | self.net = net 19 | self.debug = debug 20 | 21 | def predict(self, input_audio): 22 | sliding_window = SlidingWindow() 23 | instances = sliding_window.list_windows(input_audio) 24 | 25 | # get caffe predictions 26 | punctuation_probs = [] 27 | for instance in instances: 28 | probs = self._predict_caffe(instance) 29 | punctuation_probs.extend(numpy.copy(probs)) 30 | 31 | return punctuation_probs 32 | 33 | def _predict_caffe(self, instance): 34 | caffe.io.Transformer({'data': self.net.blobs['data'].data.shape}) 35 | 36 | # batchsize = 1 37 | # self.net.blobs['data'].reshape(batchsize, 1, self.WINDOW_SIZE, self.FEATURE_LENGTH) 38 | reshaped_array = numpy.expand_dims(instance.get_array(), axis=0) 39 | self.net.blobs['data'].data[...] 
= reshaped_array 40 | 41 | out = self.net.forward() 42 | return out['softmax'] 43 | 44 | def get_audio_parameter(self): 45 | return (self.WINDOW_SIZE, self.PUNCTUATION_POS) 46 | 47 | ################ 48 | # Example call # 49 | ################ 50 | 51 | def main(model_folder, example_folder): 52 | config_file, caffemodel_file, net_proto = get_filenames(model_folder) 53 | sbd.SbdConfig(config_file) 54 | ctm_file, pitch_file, energy_file = get_audio_files(example_folder) 55 | 56 | # parse ctm_file, pitch_file and energy_file 57 | parser = AudioParser(ctm_file, pitch_file, energy_file) 58 | parser.parse() 59 | 60 | classifier = load_audio_classifier(model_folder) 61 | 62 | data = classifier.predict_audio(parser) 63 | print(data) 64 | 65 | if __name__ == '__main__': 66 | parser = argparse.ArgumentParser(description='run the web demo') 67 | parser.add_argument('model_folder', help='the trained caffemodel', default='demo_data/audio_models/audio_window-1-1/', nargs='?') 68 | parser.add_argument('example_folder', help='folder containing the ctm, pitch and energy files', default='demo_data/audio_examples/tst2011_talkid1169/', nargs='?') 69 | args = parser.parse_args() 70 | 71 | main(args.model_folder, args.example_folder) 72 | -------------------------------------------------------------------------------- /python/sbd_classification/classification_input.py: -------------------------------------------------------------------------------- 1 | import common.sbd_config as sbd 2 | from preprocessing.nlp_pipeline import NlpPipeline, PosTag 3 | from preprocessing.audio import Audio 4 | from preprocessing.tokens import WordToken 5 | 6 | class InputText(object): 7 | 8 | def __init__(self, obj): 9 | self.tokens = None 10 | 11 | if isinstance(obj, str) or isinstance(obj, unicode): 12 | self._initialize_with_text(obj) 13 | elif isinstance(obj, list): 14 | if obj: 15 | el = obj[0] 16 | if isinstance(el, Audio): 17 | self._initialize_with_talks(obj) 18 | elif isinstance(el, str): 19 | self._initialize_with_tokens(obj) 20 | else: 21 | print("ERROR: Could not initialize input text!") 22 | else: 23 | print("ERROR: Could not initialize input text!") 24 | 25 | 26 | def _initialize_with_text(self, text): 27 | nlp_pipeline = NlpPipeline() 28 | self.tokens = nlp_pipeline.parse_text(text) 29 | 30 | def _initialize_with_talks(self, talks): 31 | nlp_pipeline = NlpPipeline() 32 | word_tokens = [] 33 | 34 | for talk in talks: 35 | for sentence in talk.sentences: 36 | sentence_tokens = [] 37 | # get all word tokens 38 | for token in sentence.tokens: 39 | if not token.is_punctuation(): 40 | sentence_tokens.append(WordToken(token.word)) 41 | # do pos_tagging if needed on sentence level 42 | if sbd.config.getboolean('features', 'pos_tagging'): 43 | nlp_pipeline.pos_tag(sentence_tokens) 44 | for t in sentence_tokens: 45 | t.word = t.word.lower() 46 | word_tokens += sentence_tokens 47 | 48 | self.tokens = word_tokens 49 | 50 | def _initialize_with_tokens(self, tokens): 51 | # convert tokens to WordTokens 52 | word_tokens = [ WordToken(token) for token in tokens ] 53 | 54 | # do pos_tagging if needed 55 | if sbd.config.getboolean('features', 'pos_tagging'): 56 | nlp_pipeline = NlpPipeline() 57 | nlp_pipeline.pos_tag(wordTokens) 58 | 59 | self.tokens = word_tokens 60 | 61 | def get_tokens(self): 62 | return self.tokens 63 | 64 | 65 | class InputAudio(object): 66 | 67 | def __init__(self, talks): 68 | self.tokens = [] 69 | 70 | for talk in talks: 71 | for token in talk.get_tokens(): 72 | if not token.is_punctuation(): 73 | 
self.tokens.append(token) 74 | 75 | def get_tokens(self): 76 | return self.tokens 77 | -------------------------------------------------------------------------------- /python/sbd_classification/fusion.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from preprocessing.word2vec_file import Word2VecFile 3 | from sbd_classification.lexical_classification import LexicalClassifier 4 | from sbd_classification.util import * 5 | 6 | 7 | def norm_single(probs): 8 | s = 0.0 9 | for p in probs: 10 | s += p 11 | for i in range(0, len(probs)): 12 | probs[i] = probs[i] / s 13 | return probs 14 | 15 | def norm(probs_list): 16 | for probs in probs_list: 17 | norm_single(probs) 18 | return probs_list 19 | 20 | 21 | class Fusion(object): 22 | 23 | def __init__(self): 24 | # constants for index access into the probability vectors 25 | self.AUDIO_NONE_IDX = 0 26 | self.AUDIO_PERIOD_IDX = 1 27 | self.LEX_NONE_IDX = 0 28 | self.LEX_COMMA_IDX = 1 29 | self.LEX_PERIOD_IDX = 2 30 | 31 | self.__initialized = False 32 | 33 | def init_parameters(self, lexical_punctuation_pos, lexical_window_size, audio_punctuation_pos, audio_window_size): 34 | self.LEXICAL_PUNCTUATION_POS = lexical_punctuation_pos 35 | self.LEXICAL_WINDOW_SIZE = lexical_window_size 36 | self.AUDIO_PUNCTUATION_POS = audio_punctuation_pos 37 | self.AUDIO_WINDOW_SIZE = audio_window_size 38 | 39 | self.__initialized = True 40 | 41 | return self 42 | 43 | def fuse(self, nr_tokens, lexical_probs, audio_probs): 44 | assert(self.__initialized) 45 | assert(len(lexical_probs) + self.LEXICAL_WINDOW_SIZE == len(audio_probs) + self.AUDIO_WINDOW_SIZE) 46 | assert(nr_tokens == len(audio_probs) + self.AUDIO_WINDOW_SIZE - 1) 47 | assert(nr_tokens == len(lexical_probs) + self.LEXICAL_WINDOW_SIZE - 1) 48 | 49 | fusion_probs = [] 50 | for i in range(nr_tokens): 51 | lexical_pos = get_index(i, len(lexical_probs), self.LEXICAL_PUNCTUATION_POS) 52 | audio_pos = get_index(i, len(audio_probs), self.AUDIO_PUNCTUATION_POS) 53 | 54 | # if we have no predictions return NONE 55 | if lexical_pos < 0 and audio_pos < 0: 56 | fusion_probs.append([1.0, 0.0, 0.0]) 57 | continue 58 | 59 | # if we have no audio prediction return lexical prediction 60 | if audio_pos < 0: 61 | fusion_probs.append(lexical_probs[lexical_pos]) 62 | continue 63 | 64 | audio_none = audio_probs[audio_pos][self.AUDIO_NONE_IDX] 65 | audio_period = audio_probs[audio_pos][self.AUDIO_PERIOD_IDX] 66 | 67 | # if we have no lexical prediction return audio prediction 68 | if lexical_pos < 0: 69 | fusion_probs.append([audio_none, 0.0, audio_period]) 70 | continue 71 | 72 | fusion_result = self.sophisticated_fusion(lexical_probs[lexical_pos], audio_probs[audio_pos]) 73 | assert(len(fusion_result) == 3) 74 | fusion_probs.append(fusion_result) 75 | 76 | return fusion_probs 77 | 78 | def sophisticated_fusion(self, lexical_probs, audio_probs): 79 | raise Exception("Abstract base class") 80 | 81 | class ThresholdFusion(Fusion): 82 | 83 | def __init__(self, threshold_audio = 0.5, threshold_lexical = 0.9): 84 | super(ThresholdFusion, self).__init__() 85 | self.threshold_audio = threshold_audio 86 | self.threshold_lexical = threshold_lexical 87 | 88 | def sophisticated_fusion(self, lexical_probs, audio_probs): 89 | audio_none = audio_probs[self.AUDIO_NONE_IDX] 90 | audio_period = audio_probs[self.AUDIO_PERIOD_IDX] 91 | 92 | lexical_none = lexical_probs[self.LEX_NONE_IDX] 93 | lexical_comma = lexical_probs[self.LEX_COMMA_IDX] 94 | lexical_period = 
lexical_probs[self.LEX_PERIOD_IDX] 95 | 96 | # if audio model predicts a period, and lexical is not very confident, that there is no period, use audio prediction 97 | if audio_period > self.threshold_audio and lexical_none < self.threshold_lexical: 98 | return norm_single([lexical_none, lexical_comma, lexical_period + audio_period]) 99 | else: 100 | return [lexical_none, lexical_comma, lexical_period] 101 | 102 | def __str__(self): 103 | return "ThresholdFusion[AudioThresh: %.2f, LexicalThresh: %.2f]" % (self.threshold_audio, self.threshold_lexical) 104 | 105 | class BalanceFusion(Fusion): 106 | 107 | def __init__(self, lexical_audio_balance = 0.5): 108 | super(BalanceFusion, self).__init__() 109 | self.lexical_audio_balance = lexical_audio_balance 110 | 111 | def sophisticated_fusion(self, lexical_probs, audio_probs): 112 | audio_factor = (1 - self.lexical_audio_balance) 113 | lexical_factor = self.lexical_audio_balance 114 | 115 | audio_none = audio_probs[self.AUDIO_NONE_IDX] * audio_factor 116 | audio_period = audio_probs[self.AUDIO_PERIOD_IDX] * audio_factor 117 | 118 | lexical_none = lexical_probs[self.LEX_NONE_IDX] * lexical_factor 119 | lexical_comma = lexical_probs[self.LEX_COMMA_IDX] * lexical_factor 120 | lexical_period = lexical_probs[self.LEX_PERIOD_IDX] * lexical_factor 121 | 122 | return norm_single([audio_none + lexical_none, lexical_comma + audio_period, lexical_period + audio_period]) 123 | 124 | def __str__(self): 125 | return "BalanceFusion[BalanceValue: %.2f]" % (self.lexical_audio_balance) 126 | 127 | class BaselineLexicalFusion(Fusion): 128 | 129 | def sophisticated_fusion(self, lexical_probs, audio_probs): 130 | return [lexical_probs[self.LEX_NONE_IDX], lexical_probs[self.LEX_COMMA_IDX], lexical_probs[self.LEX_PERIOD_IDX]] 131 | 132 | def __str__(self): 133 | return "BaselineLexicalFusion" 134 | 135 | class BaselineAudioFusion(Fusion): 136 | 137 | def sophisticated_fusion(self, lexical_probs, audio_probs): 138 | return [audio_probs[self.AUDIO_NONE_IDX], 0.0, audio_probs[self.AUDIO_PERIOD_IDX]] 139 | 140 | def __str__(self): 141 | return "BaselineAudioFusion" 142 | 143 | ################ 144 | # Example call # 145 | ################ 146 | 147 | def get_evaluation_fusion_list(lexical_punctuation_pos, lexical_window_size, audio_punctuation_pos, audio_window_size): 148 | fusions = [] 149 | fusions.append(BaselineLexicalFusion()) 150 | fusions.append(BaselineAudioFusion()) 151 | fusions.append(ThresholdFusion(0.5, 0.8)) 152 | fusions.append(ThresholdFusion(0.5, 0.9)) 153 | fusions.append(ThresholdFusion(0.6, 0.8)) 154 | fusions.append(ThresholdFusion(0.6, 0.9)) 155 | fusions.append(ThresholdFusion(0.7, 0.8)) 156 | fusions.append(ThresholdFusion(0.7, 0.9)) 157 | fusions.append(BalanceFusion(0.1)) 158 | fusions.append(BalanceFusion(0.2)) 159 | fusions.append(BalanceFusion(0.3)) 160 | fusions.append(BalanceFusion(0.4)) 161 | fusions.append(BalanceFusion(0.5)) 162 | fusions.append(BalanceFusion(0.6)) 163 | fusions.append(BalanceFusion(0.7)) 164 | fusions.append(BalanceFusion(0.8)) 165 | fusions.append(BalanceFusion(0.9)) 166 | return [f.init_parameters(lexical_punctuation_pos, lexical_window_size, audio_punctuation_pos, audio_window_size) for f in fusions] 167 | 168 | def main(): 169 | import random 170 | 171 | lexical_punctuation_pos = 4 172 | lexical_window_size = 8 173 | audio_punctuation_pos = 2 174 | audio_window_size = 4 175 | 176 | fusions = get_evaluation_fusion_list(lexical_punctuation_pos, lexical_window_size, audio_punctuation_pos, audio_window_size) 177 | 
num_words = 9 178 | 179 | tokens = ["test" + str(i) for i in range(1, 1 + num_words)] 180 | probs_lexic = [[random.random(), random.random(), random.random()] for i in range(0, num_words - lexical_window_size + 1)] 181 | probs_audio = [[random.random(), random.random()] for i in range(0, num_words - audio_window_size + 1)] 182 | 183 | probs_lexic = norm(probs_lexic) 184 | probs_audio = norm(probs_audio) 185 | 186 | print tokens, len(probs_lexic), len(probs_audio) 187 | 188 | for fc in fusions: 189 | print fc 190 | print fc.fuse(len(tokens), probs_lexic, probs_audio) 191 | 192 | if __name__ == '__main__': 193 | main() 194 | -------------------------------------------------------------------------------- /python/sbd_classification/lexical_classification.py: -------------------------------------------------------------------------------- 1 | import numpy, caffe, argparse 2 | import common.sbd_config as sbd 3 | from preprocessing.nlp_pipeline import NlpPipeline, PosTag 4 | from preprocessing.sliding_window import SlidingWindow 5 | from preprocessing.word2vec_file import Word2VecFile 6 | from classification_input import InputText 7 | 8 | class LexicalClassifier(object): 9 | 10 | def __init__(self, net, word2vec): 11 | 12 | self.WINDOW_SIZE = sbd.config.getint('windowing', 'window_size') 13 | self.PUNCTUATION_POS = sbd.config.getint('windowing', 'punctuation_position') 14 | self.POS_TAGGING = sbd.config.getboolean('features', 'pos_tagging') 15 | 16 | self.FEATURE_LENGTH = 300 if not self.POS_TAGGING else 300 + len(PosTag) 17 | 18 | self.word2vec = word2vec 19 | self.net = net 20 | 21 | def predict(self, input_text): 22 | for token in input_text.tokens: 23 | if not token.is_punctuation(): 24 | if not self.word2vec: 25 | token.word_vec = numpy.random.rand(300) 26 | else: 27 | token.word_vec = self.word2vec.get_vector(token.word.lower()) 28 | 29 | sliding_window = SlidingWindow() 30 | instances = sliding_window.list_windows(input_text) 31 | 32 | # get caffe predictions 33 | punctuation_probs = [] 34 | for instance in instances: 35 | probs = self._predict_caffe(instance) 36 | punctuation_probs.extend(numpy.copy(probs)) 37 | 38 | return punctuation_probs 39 | 40 | def _predict_caffe(self, instance): 41 | caffe.io.Transformer({'data': self.net.blobs['data'].data.shape}) 42 | 43 | # batchsize = 1 44 | # self.net.blobs['data'].reshape(batchsize, 1, self.WINDOW_SIZE, self.FEATURE_LENGTH) 45 | reshaped_array = numpy.expand_dims(instance.get_array(), axis=0) 46 | 47 | self.net.blobs['data'].data[...] 
= reshaped_array 48 | 49 | out = self.net.forward() 50 | return out['softmax'] 51 | 52 | def get_lexical_parameter(self): 53 | return (self.WINDOW_SIZE, self.PUNCTUATION_POS, self.POS_TAGGING) 54 | 55 | ################ 56 | # Example call # 57 | ################ 58 | 59 | 60 | def main(caffeproto, caffemodel): 61 | net = caffe.Net(caffeproto, caffemodel, caffe.TEST) 62 | classifier = LexicalClassifier(net, None, True) 63 | 64 | text = "This is a very long text This text has two sentences" 65 | data = classifier.predict_text(text) 66 | print(data) 67 | 68 | if __name__ == '__main__': 69 | parser = argparse.ArgumentParser(description='run the web demo') 70 | parser.add_argument('caffeproto', help='the deploy prototxt of your trained model', default='models/deploy.prototxt', nargs='?') 71 | parser.add_argument('caffemodel', help='the trained caffemodel', default='models/model.caffemodel', nargs='?') 72 | args = parser.parse_args() 73 | 74 | main(args.caffeproto, args.caffemodel) 75 | -------------------------------------------------------------------------------- /python/sbd_classification/util.py: -------------------------------------------------------------------------------- 1 | import common.sbd_config as sbd 2 | import caffe, os 3 | from tools.netconfig import NetConfig 4 | from os import listdir 5 | from sbd_classification.lexical_classification import LexicalClassifier 6 | from sbd_classification.audio_classification import AudioClassifier 7 | from preprocessing.nlp_pipeline import PosTag 8 | 9 | def get_index(index, length, punctuation_pos): 10 | position = index - punctuation_pos + 1 11 | if 0 <= position < length: 12 | return position 13 | else: 14 | return -1 15 | 16 | def convert_probabilities(token_length, punctuation_pos, probabilities, classes = ["NONE", "COMMA", "PERIOD"]): 17 | new_probablities = [] 18 | for i in range(0, token_length): 19 | current_prediction_position = get_index(i, len(probabilities), punctuation_pos) 20 | if i == token_length - 1: 21 | new_probablities.append([(1.0 if current == "PERIOD" else 0.0) for current in classes]) 22 | elif current_prediction_position < 0: 23 | new_probablities.append([(1.0 if current == "NONE" else 0.0) for current in classes]) 24 | else: 25 | new_probablities.append(probabilities[current_prediction_position].tolist()) 26 | print probabilities, new_probablities 27 | return new_probablities 28 | 29 | def get_filenames(folder): 30 | for file_ in listdir(folder): 31 | if file_.endswith(".ini"): 32 | config_file = folder + "/" + file_ 33 | elif file_.endswith(".caffemodel"): 34 | caffemodel_file = folder + "/" + file_ 35 | elif file_ == "net.prototxt": 36 | net_proto = folder + "/" + file_ 37 | return config_file, caffemodel_file, net_proto 38 | 39 | def make_lexical_temp_deploy(folder, prototxt, temp_file_name = "temp_deploy.prototxt"): 40 | WINDOW_SIZE = sbd.config.getint('windowing', 'window_size') 41 | FEATURE_LENGTH = 300 if not sbd.config.getboolean('features', 'pos_tagging') else 300 + len(PosTag) 42 | 43 | with file(prototxt, "r") as input_: 44 | nc = NetConfig(input_) 45 | nc.transform_deploy([1, 1, WINDOW_SIZE, FEATURE_LENGTH]) 46 | temp_proto = "%s/%s" % (folder, temp_file_name) 47 | with file(temp_proto, "w") as output: 48 | nc.write_to(output) 49 | 50 | return temp_proto 51 | 52 | def make_audio_temp_deploy(folder, prototxt, temp_file_name = "temp_deploy.prototxt"): 53 | WINDOW_SIZE = sbd.config.getint('windowing', 'window_size') 54 | FEATURE_LENGTH = 4 55 | 56 | with file(prototxt, "r") as input_: 57 | nc = 
NetConfig(input_) 58 | nc.transform_deploy([1, 1, WINDOW_SIZE, FEATURE_LENGTH]) 59 | temp_proto = "%s/%s" % (folder, temp_file_name) 60 | with file(temp_proto, "w") as output: 61 | nc.write_to(output) 62 | 63 | return temp_proto 64 | 65 | def load_lexical_classifier(folder, vector): 66 | print('Loading config folder: ' + folder) 67 | 68 | config_file, caffemodel_file, net_proto = get_filenames(folder) 69 | 70 | sbd.SbdConfig(config_file) 71 | temp_proto = make_lexical_temp_deploy(folder, net_proto) 72 | 73 | net = caffe.Net(temp_proto, caffemodel_file, caffe.TEST) 74 | 75 | if vector: 76 | classifier = LexicalClassifier(net, vector) 77 | else: 78 | classifier = LexicalClassifier(net, vector) 79 | 80 | return classifier 81 | 82 | def load_audio_classifier(folder): 83 | print('Loading config folder: ' + folder) 84 | 85 | config_file, caffemodel_file, net_proto = get_filenames(folder) 86 | 87 | sbd.SbdConfig(config_file) 88 | temp_proto = make_audio_temp_deploy(folder, net_proto) 89 | 90 | net = caffe.Net(temp_proto, caffemodel_file, caffe.TEST) 91 | 92 | classifier = AudioClassifier(net) 93 | 94 | return classifier 95 | 96 | def get_audio_files(folder): 97 | ctm_file = None 98 | pitch_file = None 99 | energy_file = None 100 | 101 | for file_ in listdir(folder): 102 | if file_.endswith(".ctm"): 103 | ctm_file = os.path.join(folder, file_) 104 | elif file_.endswith(".pitch"): 105 | pitch_file = os.path.join(folder, file_) 106 | elif file_.endswith(".energy"): 107 | energy_file = os.path.join(folder, file_) 108 | 109 | return ctm_file, pitch_file, energy_file 110 | -------------------------------------------------------------------------------- /python/sbd_leveldb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/python/sbd_leveldb/__init__.py -------------------------------------------------------------------------------- /python/sbd_leveldb/audio_training_instance_generator.py: -------------------------------------------------------------------------------- 1 | import operator, os, shutil, sys, time, argparse 2 | 3 | from common.argparse_util import * 4 | import common.sbd_config as sbd 5 | from preprocessing.sliding_window import SlidingWindow 6 | from preprocessing.tokens import Punctuation 7 | from preprocessing.word2vec_file import Word2VecFile 8 | from preprocessing.glove_file import GloveFile 9 | from parsing.get_parser import * 10 | from level_db_creator import LevelDBCreator 11 | 12 | 13 | class TrainingInstanceGenerator(object): 14 | """reads the original data, process them and writes them to a level-db""" 15 | 16 | def __init__(self): 17 | self.test_talks = set() 18 | 19 | def generate(self, parsers, database, is_test): 20 | level_db = LevelDBCreator(database) 21 | window_slider = SlidingWindow() 22 | 23 | nr_instances = 0 24 | 25 | if is_test: 26 | plain_text_instances_file = open(database + "/../test_instances.txt", "w") 27 | else: 28 | plain_text_instances_file = open(database + "/../train_instances.txt", "w") 29 | 30 | for i, talk_parser in enumerate(parsers): 31 | talks = talk_parser.parse() 32 | 33 | prev_progress = 0 34 | print("") 35 | print("Processing file %s ..." 
% talk_parser.get_file_name()) 36 | 37 | for talk in talks: 38 | progress = int(talk_parser.progress() * 100) 39 | if progress > prev_progress: 40 | sys.stdout.write(str(progress) + "% ") 41 | sys.stdout.flush() 42 | prev_progress = progress 43 | 44 | talk.build_interval_tree() 45 | base_dir = os.path.dirname(talk_parser.get_file_name()) 46 | 47 | # get pitch feature values 48 | pitch_level_file = base_dir + "/" + talk.group_name + "_talkid" + str(talk.talk_id) + ".pitch" 49 | talk.parse_pitch_feature(pitch_level_file) 50 | 51 | # get energy feature values 52 | energy_level_file = base_dir + "/" + talk.group_name + "_talkid" + str(talk.talk_id) + ".energy" 53 | talk.parse_energy_feature(energy_level_file) 54 | 55 | # normalize features 56 | talk.normalize() 57 | 58 | # get the training instances 59 | training_instances = window_slider.list_windows(talk) 60 | 61 | # write training instances to level db 62 | for training_instance in training_instances: 63 | nr_instances += 1 64 | 65 | # write instance to file 66 | s = unicode(training_instance) + "\n" 67 | s += "\n" 68 | plain_text_instances_file.write(s.encode('utf8')) 69 | 70 | # write to level db 71 | level_db.write_training_instance(training_instance) 72 | 73 | plain_text_instances_file.close() 74 | 75 | if __name__ == '__main__': 76 | parser = argparse.ArgumentParser(description='create test and train datasets as a lmdb.') 77 | parser.add_argument('config_file', help="path to config file") 78 | args = parser.parse_args() 79 | 80 | # initialize config 81 | sbd.SbdConfig(args.config_file) 82 | 83 | # create proper name for the database 84 | SENTENCE_HOME = os.environ['SENTENCE_HOME'] 85 | data_folder = "/mnt/naruto/sentence/data/" 86 | LEVEL_DB_DIR = "leveldbs" 87 | 88 | database = SENTENCE_HOME + "/" + LEVEL_DB_DIR + "/" + sbd.SbdConfig.get_db_name_from_config(sbd.config) 89 | 90 | # check if database already exists 91 | if os.path.isdir(database): 92 | print("Deleting " + database + ". y/N?") 93 | sys.stdout.flush() 94 | s = raw_input() 95 | if s != "Y" and s != "y": 96 | print("Not deleting. Exiting ..") 97 | sys.exit(3) 98 | shutil.rmtree(database) 99 | 100 | # create database folder and copy config file 101 | os.mkdir(database) 102 | shutil.copy(args.config_file, database) 103 | 104 | # get training and test data 105 | training_data = sbd.config.get('data', 'train_files').split(",") 106 | test_data = sbd.config.get('data', 'test_files').split(",") 107 | 108 | # get training parsers 109 | training_parsers = [] 110 | for f in training_data: 111 | parser = get_parser(data_folder + f) 112 | if parser is None: 113 | print("WARNING: Could not find training parser for file %s!" % f) 114 | else: 115 | training_parsers.append(parser) 116 | 117 | # get test parsers 118 | test_parsers = [] 119 | for f in test_data: 120 | parser = get_parser(data_folder + f) 121 | if parser is None: 122 | print("WARNING: Could not find test parser for file %s!" % f) 123 | else: 124 | test_parsers.append(parser) 125 | 126 | # generate data 127 | generator = TrainingInstanceGenerator() 128 | 129 | print("Generating test data .. ") 130 | start = time.time() 131 | generator.generate(test_parsers, database + "/test", is_test = True) 132 | duration = int(time.time() - start) / 60 133 | print("Done in " + str(duration) + " min.") 134 | 135 | print("Generating training data .. 
") 136 | start = time.time() 137 | generator.generate(training_parsers, database + "/train", is_test = False) 138 | duration = int(time.time() - start) / 60 139 | print("Done in " + str(duration) + " min.") 140 | -------------------------------------------------------------------------------- /python/sbd_leveldb/level_db_creator.py: -------------------------------------------------------------------------------- 1 | import argparse, numpy, leveldb 2 | from caffe.proto import caffe_pb2 3 | 4 | 5 | class LevelDBCreator(object): 6 | """create a new level db, fill it with word vectors""" 7 | 8 | def __init__(self, filename, batchsize=1000): 9 | self.__filename = filename 10 | self.__db = leveldb.LevelDB(filename) 11 | self.__current_batch_size = 0 12 | self.__batch = None 13 | self.__index = 0 14 | self.batchsize = batchsize 15 | 16 | def write_training_instance_list(self, training_instance_list): 17 | for training_instance in training_instance_list: 18 | self.write_training_instance(training_instance) 19 | 20 | def write_training_instance(self, training_instance): 21 | if (self.__batch == None): 22 | self.__batch = leveldb.WriteBatch() 23 | 24 | vectors = training_instance.get_array() 25 | label = training_instance.get_label() 26 | 27 | 28 | 29 | datum = caffe_pb2.Datum() 30 | datum.channels, datum.height, datum.width = vectors.shape 31 | datum.label = label 32 | datum.float_data.extend(vectors.flat) 33 | 34 | self.__batch.Put(str(self.__index), datum.SerializeToString()) 35 | 36 | self.__index += 1 37 | self.__current_batch_size += 1 38 | 39 | if (self.__current_batch_size == self.batchsize): 40 | self.__db.Write(self.__batch, sync=True) 41 | self.__batch = None 42 | self.__current_batch_size = 0 43 | 44 | def close(self): 45 | if (self.__batch): 46 | self.__db.Write(self.__batch, sync=True) 47 | self.__batch = None 48 | self.__current_batch_size = 0 49 | self.__db = None 50 | 51 | def read(self, key): 52 | return self.__db.Get(key) 53 | 54 | 55 | 56 | 57 | ################ 58 | # Example call # 59 | ################ 60 | 61 | class DummyTrainingInstance(): 62 | """assumed interface of training instance""" 63 | 64 | def __init__(self): 65 | pass 66 | 67 | def get_array(self): 68 | channels = 1 69 | window_size = 5 70 | vector_size = 300 71 | dimensions = (channels, window_size, vector_size) 72 | 73 | array = numpy.zeros((dimensions)) 74 | 75 | return array 76 | 77 | def get_label(self): 78 | return 0 79 | 80 | def main(args): 81 | ### writing 82 | ldbCreation = LevelDBCreator(args.dbfile) 83 | 84 | # write single instance 85 | instance = DummyTrainingInstance() 86 | ldbCreation.write_training_instance(instance) 87 | 88 | # write list 89 | training_instance_list = [] 90 | for i in range(0, 1000): 91 | training_instance_list += DummyTrainingInstance(), 92 | ldbCreation.write_training_instance_list(training_instance_list) 93 | 94 | # close after you are done! 
95 | ldbCreation.close() 96 | 97 | ### reading (for debug) 98 | ldbCreation = LevelDBCreator(args.dbfile) 99 | datum = caffe_pb2.Datum() 100 | datum.ParseFromString(ldbCreation.read("1")) 101 | print(datum) 102 | print(datum.label) 103 | 104 | 105 | if __name__ == '__main__': 106 | parser = argparse.ArgumentParser(description='Write a test lmdb file.') 107 | parser.add_argument('dbfile', help='path to a level db test directory') 108 | args = parser.parse_args() 109 | main(args) 110 | -------------------------------------------------------------------------------- /python/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/python/tools/__init__.py -------------------------------------------------------------------------------- /python/tools/comparison.py: -------------------------------------------------------------------------------- 1 | import math, sys 2 | import numpy 3 | 4 | OUR = "/home/tanja/Desktop/output" 5 | XIAOYIN_DATA = "/home/tanja/Desktop/xiayin_data" 6 | XIAOYIN_LABEL = "/home/tanja/Desktop/xiayin_label" 7 | 8 | WINDOW = 80 9 | TAKE = 12000 10 | COUNT = 12000 11 | SKIP = 2 12 | DIFF = 0.001 13 | INSTANCE_SIZE = 250 14 | 15 | our_data = numpy.zeros((TAKE, INSTANCE_SIZE)) 16 | our_label = [] 17 | xiaoyin_data = numpy.zeros((TAKE, INSTANCE_SIZE)) 18 | xiaoyin_label = [] 19 | 20 | instance_count = 0 21 | with open(OUR, "r") as file_: 22 | for line in file_: 23 | if instance_count >= TAKE: 24 | continue 25 | 26 | line = line.rstrip() 27 | label = line[-1] 28 | our_label.append(float(label)) 29 | 30 | line = line[1:-4] 31 | parts = line.split(", ") 32 | for i, p in enumerate(parts): 33 | our_data[instance_count][i] = float(p) 34 | 35 | instance_count += 1 36 | 37 | instance_count = 0 38 | with open(XIAOYIN_LABEL, "r") as file_: 39 | for line in file_: 40 | if instance_count >= TAKE: 41 | continue 42 | if instance_count < SKIP: 43 | instance_count += 1 44 | continue 45 | line = line.rstrip() 46 | xiaoyin_label.append(float(line)) 47 | instance_count += 1 48 | 49 | instance_count = 0 50 | with open(XIAOYIN_DATA, "r") as file_: 51 | for line in file_: 52 | if instance_count >= TAKE: 53 | continue 54 | if instance_count < SKIP: 55 | instance_count += 1 56 | continue 57 | parts = line.split("\t") 58 | for i, p in enumerate(parts): 59 | xiaoyin_data[instance_count][i] = float(p) 60 | instance_count += 1 61 | 62 | assert(len(our_data) == len(xiaoyin_data)) 63 | assert(len(our_label) - SKIP == len(xiaoyin_label)) 64 | 65 | 66 | def check_instance(a, b): 67 | for i in range(INSTANCE_SIZE): 68 | if DIFF < math.fabs(a[i] - b[i]): 69 | return False 70 | return True 71 | 72 | 73 | count_label = 0 74 | count_data = 0 75 | 76 | 77 | for instance_nr in range(TAKE): 78 | equal = False 79 | for i in range(max(0, instance_nr - WINDOW), min(instance_nr + WINDOW, TAKE)): 80 | if check_instance(our_data[instance_nr], xiaoyin_data[i]): 81 | equal = True 82 | continue 83 | if not equal: 84 | count_data += 1 85 | 86 | print("DATA", float(count_data) / TAKE) 87 | 88 | 89 | # for i in range(len(xiaoyin_label)): 90 | # if our_label[i] != xiaoyin_label[i]: 91 | # for j in range(max(0, i-WINDOW), min(len(xiaoyin_label), i + WINDOW)): 92 | # if our_label[j] == xiaoyin_label[i] or our_label[i] == xiaoyin_label[j]: 93 | # continue 94 | # count_label += 1 95 | 96 | print("LABEL", float(count_label) / COUNT) 97 | 
-------------------------------------------------------------------------------- /python/tools/look_into_leveldb.py: -------------------------------------------------------------------------------- 1 | import leveldb, argparse 2 | from caffe.proto import caffe_pb2 3 | 4 | 5 | def main(leveldb_dir, limit): 6 | datum = caffe_pb2.Datum() 7 | db = leveldb.LevelDB(leveldb_dir) 8 | for i in range(0, limit): 9 | datum.ParseFromString(db.Get(str(i))) 10 | print datum.float_data, datum.label 11 | 12 | if __name__ == '__main__': 13 | parser = argparse.ArgumentParser(description='Print (beginning of) contents of a level db database.') 14 | parser.add_argument('leveldb', help='path to level db folder') 15 | parser.add_argument('-l','--limit', help='number of entries which should be displayed', type=int, default=10) 16 | args = parser.parse_args() 17 | main(args.leveldb, args.limit) 18 | -------------------------------------------------------------------------------- /python/tools/netconfig.py: -------------------------------------------------------------------------------- 1 | import argparse, sys 2 | from google.protobuf import text_format 3 | 4 | import caffe 5 | from caffe.proto import caffe_pb2 6 | 7 | def get_layer_by_name(net, name): 8 | for layer in net.layer: 9 | if layer.name == name: 10 | return layer 11 | 12 | def get_data_layer(net, phase): 13 | for layer in net.layer: 14 | if layer.name == "data": 15 | for value in layer.include: 16 | if value.phase == phase: 17 | return layer 18 | 19 | def get_test_data_layer(net): 20 | return get_data_layer(net, caffe_pb2.TEST) 21 | 22 | def get_train_data_layer(net): 23 | return get_data_layer(net, caffe_pb2.TRAIN) 24 | 25 | def replace_loss_with_softmax(net): 26 | losslayer = get_layer_by_name(net, "loss") 27 | if losslayer.type == "InfogainLoss": 28 | return losslayer 29 | losslayer.name = "softmax" 30 | losslayer.type = "Softmax" 31 | losslayer.bottom.remove("label") 32 | losslayer.top.remove("loss") 33 | losslayer.top.append("softmax") 34 | return losslayer 35 | 36 | class NetConfig(object): 37 | def __init__(self, prototxt): 38 | self.net = caffe_pb2.NetParameter() 39 | text_format.Merge(prototxt.read(), self.net) 40 | 41 | def transform_deploy(self, dimensions = [1, 1, 5, 300]): 42 | # make deploy version of net 43 | # remove data layers 44 | self.net.layer.remove(get_train_data_layer(self.net)) 45 | self.net.layer.remove(get_test_data_layer(self.net)) 46 | 47 | # remove accuracy layer 48 | self.net.layer.remove(get_layer_by_name(self.net, "accuracy")) 49 | 50 | # add input 51 | self.net.input.append("data") 52 | for d in dimensions: 53 | self.net.input_dim.append(d) 54 | 55 | uses_infogain = get_layer_by_name(self.net, "loss").type == "InfogainLoss" 56 | 57 | if not uses_infogain: 58 | # use softmax instead of loss layer 59 | replace_loss_with_softmax(self.net) 60 | else: 61 | # infogain already depends on softmax, we can just remove the weight and the infogain loss layer 62 | self.net.layer.remove(get_layer_by_name(self.net, "infogain_loss_matrix")) 63 | self.net.layer.remove(get_layer_by_name(self.net, "loss")) 64 | 65 | def transform_data_paths(self, db_pair_dir): 66 | # db_pair_dir points to the directory containing the test/ and train/ leveldbs 67 | 68 | # modify path to leveldb for test and train data layer 69 | test_data_layer = get_test_data_layer(self.net) 70 | test_data_layer.data_param.source = db_pair_dir + "/test" 71 | 72 | train_data_layer = get_train_data_layer(self.net) 73 | train_data_layer.data_param.source = db_pair_dir + "/train" 74 | 75 | def get_database(self): 76 |
test_data_layer = get_test_data_layer(self.net) 77 | return test_data_layer.data_param.source.replace("/test", "") 78 | 79 | def write_to(self, outstream): 80 | outstream.write(str(self.net)) 81 | 82 | def main(args): 83 | nc = NetConfig(args.prototxt) 84 | 85 | if args.deploy: 86 | nc.transform_deploy() 87 | 88 | if args.train: 89 | nc.transform_data_paths(args.train) 90 | 91 | if args.print_database: 92 | print nc.get_database() 93 | return 94 | 95 | nc.write_to(args.output) 96 | 97 | if __name__ == '__main__': 98 | parser = argparse.ArgumentParser(description='Configure your net') 99 | parser.add_argument('prototxt', help='the original net prototxt', type=argparse.FileType('r')) 100 | group = parser.add_mutually_exclusive_group() 101 | group.add_argument('-d','--deploy', help='preset: deploy; remove data layers, add softmax', action='store_true') 102 | group.add_argument('-p','--print_database', help='Whether to print the database folder or not', action='store_true') 103 | group.add_argument('-t','--train', help='preset: make training net on test/train leveldb in directory', metavar='directory') 104 | parser.add_argument('-o','--output', help='output of the modified net', type=argparse.FileType('w'), default=sys.stdout, metavar='output') 105 | # parser.add_argument('-v','--verbose', help='be verbose', action='store_true') 106 | args = parser.parse_args() 107 | 108 | main(args) 109 | -------------------------------------------------------------------------------- /python/tools/parse_result.py: -------------------------------------------------------------------------------- 1 | import sys, os, csv, re, argparse, ConfigParser 2 | 3 | 4 | 5 | def read_test_results(logPath): 6 | test_results = {} 7 | 8 | for line in file(logPath): 9 | if "Test net output" in line: 10 | search_terms = re.search('Test net output #([0-9]): (.*?) = (-?(0|1)\\.?[0-9]*)', line) 11 | if search_terms: 12 | key = str(search_terms.group(2)) + "_" + search_terms.group(1) 13 | test_results[key] = search_terms.group(3) 14 | return test_results 15 | 16 | ###config 17 | def read_config(config_path): 18 | sections = ['data', 'word_vector', 'windowing', 'features'] 19 | 20 | current_config = ConfigParser.ConfigParser() 21 | current_config.read(config_path) 22 | feature_map = {} 23 | 24 | for section in sections: 25 | for f in current_config.items(section): 26 | feature_map ["_" + f[0]] = f[1] 27 | return feature_map 28 | 29 | 30 | def main(experiments_path, result_file): 31 | all_values = [] 32 | 33 | for d in os.listdir(experiments_path): 34 | full_d_path = os.path.join(experiments_path,d) 35 | if os.path.isdir(full_d_path): 36 | print full_d_path 37 | logFile = None 38 | configFile = None 39 | 40 | files = os.listdir(full_d_path) 41 | for f in files: 42 | if f.endswith(".tlog"): 43 | print f 44 | logFile = os.path.join(full_d_path , f) 45 | elif f.endswith(".ini"): 46 | print f 47 | configFile = os.path.join(full_d_path, f) 48 | 49 | if logFile == None or configFile == None: 50 | print "#Warning: Skipped %s, log or config file was not found!" 
% full_d_path 51 | continue 52 | features = read_config(configFile) 53 | test_results = read_test_results(logFile) 54 | features.update(test_results) 55 | 56 | all_values.append(features) 57 | 58 | with open(result_file, 'w') as csvfile: 59 | fieldnames = [] 60 | for row in all_values: 61 | dict_keys = row.keys() 62 | dict_keys.sort() 63 | for key in dict_keys: 64 | if not key in fieldnames: 65 | fieldnames.append(key) 66 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 67 | writer.writeheader() 68 | 69 | for row in all_values: 70 | writer.writerow(row) 71 | 72 | if __name__ == '__main__': 73 | parser = argparse.ArgumentParser(description='Create overview csv file of training results.', formatter_class=argparse.ArgumentDefaultsHelpFormatter) 74 | parser.add_argument('experimentfolder', help='path to experiment folder', default='../net/experiments', nargs='?') 75 | parser.add_argument('output', help='path of result file', default='../net/experiments/experiments.csv', nargs='?') 76 | args = parser.parse_args() 77 | main(args.experimentfolder, args.output) 78 | -------------------------------------------------------------------------------- /python/tools/text_converter.py: -------------------------------------------------------------------------------- 1 | import operator, os, shutil, sys, time, argparse 2 | 3 | from common.argparse_util import * 4 | import common.sbd_config as sbd 5 | from parsing.get_parser import * 6 | 7 | 8 | class TextConverter(object): 9 | 10 | def convert(self, parsers): 11 | for i, text_parser in enumerate(parsers): 12 | texts = text_parser.parse() 13 | file_path = text_parser.get_file_name() + ".line" 14 | 15 | if os.path.isfile(file_path): 16 | print("Deleting " + file_path + ".") 17 | os.remove(file_path) 18 | print("Writing file %s ..." % file_path) 19 | 20 | prev_progress = 0 21 | 22 | for text in texts: 23 | progress = int(text_parser.progress() * 100) 24 | if progress > prev_progress: 25 | sys.stdout.write(str(progress) + "% ") 26 | sys.stdout.flush() 27 | prev_progress = progress 28 | 29 | text.append_to_file(file_path) 30 | 31 | if __name__ == '__main__': 32 | parser = argparse.ArgumentParser(description='converts files into line format.') 33 | parser.add_argument('config_file', help="path to config file") 34 | args = parser.parse_args() 35 | 36 | # initialize config 37 | sbd.SbdConfig(args.config_file) 38 | 39 | # get training and test data 40 | training_data = sbd.config.get('data', 'train_files').split(",") 41 | test_data = sbd.config.get('data', 'test_files').split(",") 42 | 43 | data_folder = "/mnt/naruto/sentence/data/" 44 | 45 | # get training parsers 46 | training_parsers = [] 47 | for f in training_data: 48 | parser = get_parser(data_folder + f) 49 | if parser is None: 50 | print("WARNING: Could not find training parser for file %s!" % f) 51 | else: 52 | training_parsers.append(parser) 53 | 54 | # get test parsers 55 | test_parsers = [] 56 | for f in test_data: 57 | parser = get_parser(data_folder + f) 58 | if parser is None: 59 | print("WARNING: Could not find test parser for file %s!" % f) 60 | else: 61 | test_parsers.append(parser) 62 | 63 | # convert data 64 | converter = TextConverter() 65 | print("Converting data .. 
") 66 | start = time.time() 67 | converter.convert(test_parsers) 68 | duration = int(time.time() - start) / 60 69 | print("Done in " + str(duration) + " min.") 70 | start = time.time() 71 | converter.convert(training_parsers) 72 | duration = int(time.time() - start) / 60 73 | print("Done in " + str(duration) + " min.") 74 | -------------------------------------------------------------------------------- /python/web_demo/README.md: -------------------------------------------------------------------------------- 1 | # Run Demo 2 | 3 | As described in the [general Python README](../README.md), before executing any scripts on the server, please execute the following command `. ./use_python p2` in `/home/ms2015t3/sentence-boundary-detection-nn`. 4 | 5 | ``` 6 | cd /home/ms2015t3/sentence-boundary-detection-nn 7 | . ./use_python p2 8 | ``` 9 | 10 | Then use the following command on the server to run the demo: 11 | 12 | `python web_demo/web.py /home/ms2015t3/demo_data /home/fb10dl01/workspace/ms-2015-t3/GoogleNews-vectors-negative300.bin -nd` 13 | 14 | Or the equivalent for your specific environment: 15 | 16 | `python web_demo/web.py [DemoDataFolder] [TrainedWord2VecModel] -nd` 17 | 18 | -------------------------------------------------------------------------------- /python/web_demo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/python/web_demo/__init__.py -------------------------------------------------------------------------------- /python/web_demo/file_io.py: -------------------------------------------------------------------------------- 1 | import common.sbd_config as sbd 2 | from sbd_classification.util import convert_probabilities 3 | 4 | class ResultWriter (object): 5 | 6 | def __init__(self, classes = ["NONE", "COMMA", "PERIOD"]): 7 | self.PUNCTUATION_POS = sbd.config.getint('windowing', 'punctuation_position') 8 | self.classes = classes 9 | self.separator = " " 10 | 11 | def writeToFile(self, file_name, tokens, punctuation_probs): 12 | with open(file_name, "w") as f: 13 | header = "%s\n" % (self.separator.join(["TOKEN"] + self.classes)) 14 | f.write(header) 15 | 16 | for i, token in enumerate(tokens): 17 | f.write("%s\n" % self.separator.join(str(prob) for prob in ([token] + punctuation_probs[i]))) 18 | 19 | 20 | class InputTextReader (object): 21 | 22 | def __init__(self): 23 | pass 24 | 25 | def readFile(self, file_name): 26 | text = "" 27 | with open(file_name, "r") as f: 28 | for line in f.readlines(): 29 | word = line.split("\t")[0] 30 | text += " " + word 31 | 32 | return text 33 | -------------------------------------------------------------------------------- /python/web_demo/json_converter.py: -------------------------------------------------------------------------------- 1 | import common.sbd_config as sbd 2 | import numpy 3 | from sbd_classification.util import get_index 4 | 5 | class JsonConverter(object): 6 | 7 | def __init__(self, lexical_punctuation_pos, lexical_window_size, audio_punctuation_pos, audio_window_size, pos_tagging): 8 | self.LEXICAL_PUNCTUATION_POS = lexical_punctuation_pos 9 | self.LEXICAL_WINDOW_SIZE = lexical_window_size 10 | self.AUDIO_PUNCTUATION_POS = audio_punctuation_pos 11 | self.AUDIO_WINDOW_SIZE = audio_window_size 12 | self.POS_TAGGING = pos_tagging 13 | self.classes_lexical_audio = ["NONE", "COMMA", "PERIOD"] 14 | self.classes_audio = ["NONE", "PERIOD"] 15 | 16 | def 
convert_fusion(self, tokens, fusion_probs, lexical_probs, audio_probs): 17 | json_data = [] 18 | 19 | # build json 20 | for i, token in enumerate(tokens): 21 | token_json = {'type': 'word', 'token': token.word} 22 | if self.POS_TAGGING: 23 | token_json['pos'] = [str(tag).replace("PosTag.", "") for tag in token.pos_tags] 24 | json_data.append(token_json) 25 | 26 | probs_json = {'type': 'punctuation'} 27 | 28 | # FUSION 29 | # we have probabilities for all tokens 30 | current_punctuation = self.classes_lexical_audio[numpy.argmax(fusion_probs[i])] 31 | class_distribution = self._get_class_distribution(fusion_probs[i], self.classes_lexical_audio) 32 | if i == len(tokens) - 1: 33 | probs_json['fusion'] = {'punctuation': 'PERIOD', 'probs': {'NONE': 0.0, 'PERIOD': 1.0}} 34 | else: 35 | probs_json['fusion'] = {'punctuation': current_punctuation, 'probs': class_distribution} 36 | 37 | # AUDIO 38 | current_prediction_position = get_index(i, len(audio_probs), self.AUDIO_PUNCTUATION_POS) 39 | if i == len(tokens) - 1: 40 | probs_json['audio'] = {'punctuation': 'PERIOD', 'probs': {'NONE': 0.0, 'PERIOD': 1.0}} 41 | elif current_prediction_position < 0: 42 | probs_json['audio'] = {'punctuation': 'NONE', 'probs': {'NONE': 1.0, 'PERIOD': 0.0}} 43 | else: 44 | current_punctuation = self.classes_audio[numpy.argmax(audio_probs[current_prediction_position])] 45 | class_distribution = self._get_class_distribution(audio_probs[current_prediction_position], self.classes_audio) 46 | probs_json['audio'] = { 'punctuation': current_punctuation, 'probs': class_distribution} 47 | 48 | # LEXICAL 49 | current_prediction_position = get_index(i, len(lexical_probs), self.LEXICAL_PUNCTUATION_POS) 50 | if i == len(tokens) - 1: 51 | probs_json['lexical'] = {'punctuation': 'PERIOD', 'probs': {'NONE': 0.0, 'COMMA': 0.0, 'PERIOD': 1.0}} 52 | elif current_prediction_position < 0: 53 | probs_json['lexical'] = {'punctuation': 'NONE', 'probs': {'NONE': 1.0, 'COMMA': 0.0, 'PERIOD': 0.0}} 54 | else: 55 | current_punctuation = self.classes_lexical_audio[numpy.argmax(lexical_probs[current_prediction_position])] 56 | class_distribution = self._get_class_distribution(lexical_probs[current_prediction_position], self.classes_lexical_audio) 57 | probs_json['lexical'] = {'punctuation': current_punctuation, 'probs': class_distribution} 58 | 59 | json_data.append(probs_json) 60 | 61 | return json_data 62 | 63 | def convert_lexical(self, tokens, punctuation_probs): 64 | json_data = [] 65 | # build json 66 | for index, token in enumerate(tokens): 67 | token_json = {'type': 'word', 'token': token.word} 68 | if self.POS_TAGGING: 69 | token_json['pos'] = [str(tag).replace("PosTag.", "") for tag in token.pos_tags] 70 | json_data.append(token_json) 71 | 72 | current_punctuation = self.classes_lexical_audio[numpy.argmax(punctuation_probs[index])] 73 | class_distribution = self._get_class_distribution(punctuation_probs[index], self.classes_lexical_audio) 74 | json_data.append({'type': 'punctuation', 'punctuation': current_punctuation, 'probs': class_distribution}) 75 | 76 | return json_data 77 | 78 | def _get_class_distribution(self, probs, classes): 79 | json_data = {} 80 | for i in range (0, len(classes)): 81 | json_data[classes[i]] = str(probs[i]) 82 | return json_data 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /python/web_demo/static/main.css: -------------------------------------------------------------------------------- 1 | #punctuation { 2 | padding: 15px; 3 | margin: 15px; 4 | max-width: 
100%; 5 | min-height: 100px; 6 | background: #EEE; 7 | } 8 | .punctuation_div { 9 | padding: 15px; 10 | margin: 15px; 11 | max-width: 100%; 12 | min-height: 100px; 13 | background: #EEE; 14 | } 15 | #textarea-input { 16 | border: 0.5px solid black; 17 | font-size: 1em; 18 | margin: 10px 2px; 19 | } 20 | .token { 21 | background: #CCC; 22 | border-radius: 4px; 23 | padding: 5px; 24 | margin: 10px; 25 | margin-right: 3px; 26 | margin-left: 3px; 27 | float: left; 28 | } 29 | .token-punctuation { 30 | background: #F2C38A; 31 | padding: 5px; 32 | padding-right: 10px; 33 | padding-left: 10px; 34 | } 35 | .token-NONE { 36 | background: #F2C38A; 37 | background: #DDD; 38 | white-space: pre-wrap; 39 | } 40 | .token-COMMA { 41 | background: #FFC107; 42 | white-space: pre-line; 43 | } 44 | .token-PERIOD { 45 | background: #03A9F4; 46 | white-space: pre-line; 47 | } 48 | 49 | .glyphicon.spinning { 50 | animation: spin 1s infinite linear; 51 | -webkit-animation: spin2 1s infinite linear; 52 | } 53 | 54 | @keyframes spin { 55 | from { transform: scale(1) rotate(0deg); } 56 | to { transform: scale(1) rotate(360deg); } 57 | } 58 | 59 | @-webkit-keyframes spin2 { 60 | from { -webkit-transform: rotate(0deg); } 61 | to { -webkit-transform: rotate(360deg); } 62 | } 63 | -------------------------------------------------------------------------------- /python/web_demo/static/main.js: -------------------------------------------------------------------------------- 1 | $(document).ready(function() { 2 | 3 | function stringRepresentation(token) { 4 | token.string = token.punctuation 5 | if (token.punctuation == "NONE") 6 | { 7 | token.string = " " 8 | } 9 | if (token.punctuation == "PERIOD") 10 | { 11 | token.string = "." 12 | } 13 | if (token.punctuation == "COMMA") 14 | { 15 | token.string = "," 16 | } 17 | }; 18 | 19 | function buildProbsString(probs) { 20 | var probs_str = ""; 21 | for (var key in probs) { 22 | probs_str += key + ": " + (probs[key] * 100 ).toFixed(2) + "% " 23 | }; 24 | return probs_str 25 | }; 26 | 27 | function processPunctuationToken(token, resultDiv) { 28 | stringRepresentation(token) 29 | var probs_str = buildProbsString(token.probs); 30 | resultDiv.append("" + token.string + ""); 31 | }; 32 | 33 | function displayLexicalAudioResult(tokens) { 34 | var $resultDivLexicalAudio = $("#punctuation_lexical_audio"); 35 | var $resultDivLexical = $("#punctuation_lexical"); 36 | var $resultDivAudio = $("#punctuation_audio"); 37 | 38 | $resultDivLexicalAudio.empty(""); 39 | $resultDivLexical.empty(""); 40 | $resultDivAudio.empty(""); 41 | 42 | tokens.forEach(function(token) { 43 | if (token.type == "word") { 44 | var tag_str = ""; 45 | for (var key in token.pos) { 46 | tag_str += token.pos[key] + " " 47 | }; 48 | var s = "" + token.token + ""; 49 | $resultDivLexicalAudio.append(s); 50 | $resultDivLexical.append(s); 51 | $resultDivAudio.append(s); 52 | } else if (token.type == "punctuation") { 53 | processPunctuationToken(token.fusion, $resultDivLexicalAudio); 54 | processPunctuationToken(token.lexical, $resultDivLexical); 55 | processPunctuationToken(token.audio, $resultDivAudio); 56 | } 57 | }); 58 | }; 59 | 60 | function displayLexicalResult(tokens) { 61 | var $resultDiv = $("#punctuation"); 62 | $resultDiv.empty(""); 63 | tokens.forEach(function(token) { 64 | if (token.type == "word") { 65 | var tag_str = ""; 66 | for (var key in token.pos) { 67 | tag_str += token.pos[key] + " " 68 | }; 69 | $resultDiv.append("" + token.token + ""); 70 | } else if (token.type == "punctuation") { 71 | 
processPunctuationToken(token, $resultDiv); 72 | } 73 | }); 74 | }; 75 | 76 | 77 | $("#collapse2").on('hidden.bs.collapse', function () { 78 | $('#selection-text-file').val(''); 79 | }); 80 | 81 | $("#punctuate-lexical").click(function() { 82 | var text = { 83 | text: $('#textarea-input').val(), 84 | textfile: $('#selection-text-file').val(), 85 | lexical_folder: $("#selection-lexical-models").val() 86 | }; 87 | $('#loading').show(); 88 | $('#punctuation').empty(); 89 | $.post("/classify_lexical", text, function(response, textStatus) { 90 | $('#loading').hide(); 91 | displayLexicalResult(response); 92 | }, "json") 93 | .fail(function(data) { 94 | console.error(data); 95 | }); 96 | }); 97 | 98 | $("#punctuate-audio-lexical").click(function() { 99 | var setting = { 100 | example: $('#selection-audio-examples').val(), 101 | lexical_folder: $("#selection-lexical-models").val(), 102 | audio_folder: $("#selection-audio-models").val() 103 | }; 104 | $('#loading').show(); 105 | $('#punctuation').empty(); 106 | $.post("/classify_audio_lexical", setting, function(response, textStatus) { 107 | $('#loading').hide(); 108 | displayLexicalAudioResult(response); 109 | }, "json") 110 | .fail(function(data) { 111 | console.error(data); 112 | }); 113 | }); 114 | 115 | $("#selection-lexical-models").on('change', function() { 116 | var setting = { 117 | folder: $("#selection-lexical-models").val() 118 | }; 119 | $.post("/lexical_models", setting, function(response) {}) 120 | .fail(function(data) { 121 | console.error(data); 122 | }); 123 | }); 124 | 125 | $("#selection-audio-models").on('change', function() { 126 | var setting = { 127 | folder: $("#selection-audio-models").val() 128 | }; 129 | 130 | $.post("/audio_models", setting, function(response) {}) 131 | .fail(function(data) { 132 | console.error(data); 133 | }); 134 | }); 135 | 136 | function loadLexicalModels() { 137 | $.get("/lexical_models", function(response) { 138 | response.options.forEach(function(option){ 139 | 140 | if (response.selected === option){ 141 | $('#selection-lexical-models').append($('