├── .editorconfig ├── .gitignore ├── README.md ├── infogain_loss_matrix ├── Makefile ├── config └── data ├── net-audio ├── README.md ├── experiments │ └── experiments.csv ├── net.prototxt ├── plot_log.gnuplot ├── solver.prototxt ├── testing.sh └── training.sh ├── net ├── README.md ├── create_net_lstm.py ├── net.prototxt ├── plot_log.gnuplot ├── solver.prototxt ├── testing.sh ├── training.sh └── xiaoyin │ ├── A2_4_test.prototxt │ ├── A2_4_train.prototxt │ └── solver_xiaoyin.prototxt ├── paper ├── Makefile ├── chapters │ ├── acoustic_model.tex │ ├── caffe.tex │ ├── data.tex │ ├── demo.tex │ ├── evaluation.tex │ ├── fusion.tex │ ├── future.tex │ ├── introduction.tex │ ├── lexical_model.tex │ ├── parameters.tex │ └── related_work.tex ├── img │ ├── audio_parameter_eval.png │ ├── audio_parameter_eval.svg │ ├── demo_l.png │ ├── demo_l_a.png │ ├── drawing.svg │ ├── fusion_1.pdf │ ├── fusion_1.svg │ ├── fusion_2.pdf │ ├── fusion_2.svg │ ├── fusion_eval.pdf │ ├── fusion_eval.png │ ├── fusion_eval.svg │ ├── fusion_eval2.svg │ ├── hpi_logo.png │ ├── net_acoustic.pdf │ ├── net_acoustic.svg │ ├── net_lexical.pdf │ ├── net_lexical.svg │ ├── overview_accoustic.pdf │ ├── overview_accoustic.svg │ ├── overview_lexical.pdf │ ├── overview_lexical.svg │ ├── sliding_window.pdf │ ├── sliding_window.svg │ ├── window_eval.png │ ├── window_eval.svg │ ├── window_eval_width.svg │ ├── window_pos_eval.png │ ├── window_pos_eval.svg │ ├── window_wiki_eval.png │ └── window_wiki_eval.svg ├── main.bib ├── main.tex └── notes │ ├── Makefile │ ├── auswertung.ods │ ├── auswertung2.ods │ ├── auswertung_fusion.txt │ ├── auswertung_onlypos.ods │ ├── auswertung_wiki.ods │ ├── experiments.csv │ ├── experiments2.csv │ ├── fusion_eval.ods │ ├── fusion_eval.txt │ └── results.tex ├── python ├── README.md ├── common │ ├── __init__.py │ ├── argparse_util.py │ ├── sbd_config.py │ └── send_email.py ├── config.ini.default ├── console_demo │ ├── README.md │ ├── __init__.py │ ├── demo.py │ └── demo_preparation.py ├── demo_data │ ├── .gitignore │ ├── audio_examples │ │ └── .gitignore │ ├── audio_models │ │ └── .gitignore │ ├── download_all.sh │ ├── download_google_vector.sh │ ├── download_models.sh │ ├── folders.txt │ ├── lexical_models.txt │ ├── lexical_models │ │ └── .gitignore │ └── text_data │ │ └── .gitignore ├── email.ini.default ├── evaluation │ └── evaluation.py ├── evaluation_data │ ├── .gitignore │ ├── download_all.sh │ └── folders.txt ├── experiments │ ├── README.md │ ├── audio_databases.sh │ ├── audio_training.sh │ ├── databases.sh │ └── training.sh ├── parsing │ ├── __init__.py │ ├── abstract_parser.py │ ├── audio_parser.py │ ├── ctm_parser.py │ ├── get_parser.py │ ├── line_parser.py │ ├── plaintext_parser.py │ └── xml_parser.py ├── preprocessing │ ├── __init__.py │ ├── audio.py │ ├── glove_file.py │ ├── nlp_pipeline.py │ ├── sliding_window.py │ ├── text.py │ ├── tokens.py │ ├── training_instance.py │ └── word2vec_file.py ├── sbd_classification │ ├── __init__.py │ ├── audio_classification.py │ ├── classification_input.py │ ├── fusion.py │ ├── lexical_classification.py │ └── util.py ├── sbd_leveldb │ ├── __init__.py │ ├── audio_training_instance_generator.py │ ├── level_db_creator.py │ └── training_instance_generator.py ├── tools │ ├── __init__.py │ ├── comparison.py │ ├── look_into_leveldb.py │ ├── netconfig.py │ ├── parse_result.py │ └── text_converter.py └── web_demo │ ├── README.md │ ├── __init__.py │ ├── file_io.py │ ├── json_converter.py │ ├── static │ ├── main.css │ └── main.js │ ├── templates │ ├── audio_lexical.html │ └── 
index.html │ └── web.py └── requirements.txt /.editorconfig: -------------------------------------------------------------------------------- 1 | # top-most EditorConfig file 2 | root = true 3 | 4 | # Unix-style newlines with a newline ending every file 5 | [*] 6 | end_of_line = lf 7 | insert_final_newline = true 8 | charset = utf-8 9 | indent_style = space 10 | trim_trailing_whitespace = true 11 | 12 | [Makefile] 13 | indent_style = tab 14 | 15 | [*.md] 16 | trim_trailing_whitespace = false 17 | 18 | [*.py] 19 | indent_size = 4 20 | 21 | [*.sh] 22 | indent_size = 4 23 | 24 | [*.html] 25 | indent_size = 4 26 | 27 | [*.js] 28 | indent_size = 4 29 | 30 | [*.css] 31 | indent_size = 4 32 | 33 | [*.prototxt] 34 | indent_size = 2 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Sentence files 2 | train_instances.txt 3 | test_instances.txt 4 | config.ini 5 | email.ini 6 | lineparsing 7 | configurations/ 8 | audio_configurations/ 9 | auto.prototxt 10 | 11 | # Experiment files 12 | *.run 13 | 14 | # Caffe files 15 | *.tlog 16 | *.tstlog 17 | leveldbs 18 | net/experiments 19 | net/snapshots/ 20 | net-audio/experiments 21 | net-audio/snapshots/ 22 | 23 | # IDE files 24 | .idea/ 25 | *.iml 26 | __pycache__/ 27 | 28 | # Python virtual environment 29 | .env 30 | p3/ 31 | p2/ 32 | *.pyc 33 | 34 | # Database file 35 | hdf5/ 36 | infogain_loss_matrix/*.h5 37 | 38 | # LaTeX 39 | .output/ 40 | main.pdf 41 | #*.svg 42 | *.txss 43 | 44 | # Lock files 45 | .~lock.*# 46 | 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Sentence Boundary Detection using Deep Neural Networks 2 | 3 | We try to detect sentence boundaries using deep learning. 4 | Created as part of the "Practical Applications of Multimedia Retrieval" seminar at the Hasso-Plattner-Institute, Potsdam, Germany. 5 | 6 | ### Setup Demo 7 | We built a Python-based demo using Caffe. 8 | 9 | ##### Prerequisites: 10 | 1. Clone this repository 11 | 2. Install Python 2.7 and the packages from requirements.txt: 12 | 13 | `pip install -r requirements.txt` 14 | 15 | 3. Use the nltk downloader to download the `averaged_perceptron_tagger` and `punkt` models: 16 | 17 | `python -m nltk.downloader` 18 | 19 | 4. Set up Caffe as described [here](http://caffe.berkeleyvision.org/installation.html) 20 | 5. Add the path to the repository to your Python path: 21 | 22 | `export PYTHONPATH=/path/to/sentence-boundary-detection-nn/python:$PYTHONPATH` 23 | 24 | 6. Download the Google word vectors (GoogleNews-vectors-negative300.bin.gz) from [here](https://code.google.com/p/word2vec/) or directly from this [url](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing) and extract the result into the `sentence-boundary-detection-nn/python/demo_data` directory 25 | 7. Place your trained models in a demo data folder, for example `sentence-boundary-detection-nn/python/demo_data`, with the following structure: 26 | * lexical_models: containing all pretrained models you want to use, each in a separate directory. Each model needs a 27 | * .ini 28 | * .caffemodel 29 | * net.prototxt file. 30 | * text_data: containing all text files that should be available as prediction input 31 | * audio_models: containing all pretrained audio models, each in a separate directory.
Each needs the same files as described for the lexical models. 32 | * audio_examples: containing all audio files that should be available during the demo, each in a separate directory containing the ctm, energy, and pitch files. 33 | 34 | ##### Start up 35 | 36 | Change into the repository directory and execute the following command; it should work right out of the box unless you are using a custom `demo_data` folder: 37 | ``` 38 | python web_demo/web.py 39 | ``` 40 | Optionally, you can specify the location of the word vector and the demo data. Otherwise, default values are used. 41 | For further information execute: 42 | ``` 43 | python web_demo/web.py -h 44 | ``` 45 | -------------------------------------------------------------------------------- /infogain_loss_matrix/Makefile: -------------------------------------------------------------------------------- 1 | 2 | hdf5: 3 | rm -f infogain_loss_matrix.h5 4 | h5import data -c config -o infogain_loss_matrix.h5 5 | 6 | scp: 7 | scp infogain_loss_matrix.h5 sentence:/mnt/naruto/sentence/hdf5s/ 8 | -------------------------------------------------------------------------------- /infogain_loss_matrix/config: -------------------------------------------------------------------------------- 1 | RANK 4 2 | DIMENSION-SIZES 1 1 2 2 3 | INPUT-CLASS TEXTFP 4 | INPUT-SIZE 32 5 | -------------------------------------------------------------------------------- /infogain_loss_matrix/data: -------------------------------------------------------------------------------- 1 | 1.0 0.0 2 | 0.0 1.0 3 | -------------------------------------------------------------------------------- /net-audio/README.md: -------------------------------------------------------------------------------- 1 | # Training 2 | 3 | For training the acoustic neural network, you have to execute the following steps: 4 | 5 | 1. Adapt the `net.prototxt`: Change the network layout and make sure you enter the correct path to the LevelDB. 6 | 2. Adapt the `solver.prototxt`. 7 | 3. Make sure an `experiments` and a `snapshots` folder exist in this folder. 8 | 4. Execute `training.sh <experiment_name>`. 9 | 10 | The `training.sh` script does several things: 11 | 12 | * Creates a folder in the `experiments` folder with the name you gave your experiment 13 | * The following files are copied to that folder: 14 | * `config.ini`, which is located in your database folder 15 | * `net.prototxt` 16 | * `solver.prototxt` 17 | * log files from the training 18 | * Starts the training of the neural network 19 | * The latest `.solverstate` and `.caffemodel` are copied to the `experiments` folder after the training is finished 20 | * After training, different graphs are created and put into the `experiments` folder.
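The `net.prototxt` of the acoustic net uses an `InfogainLoss` layer whose class weight matrix comes from the HDF5 file built in `infogain_loss_matrix/` via `h5import` (see its `Makefile` and `config`). As a rough sketch of what that step produces, assuming `h5py` and `numpy` are available, the same file could also be written directly from Python; the dataset name `dataset0` follows the `top` of the `HDF5Data` layer in `net.prototxt`:

```python
# Sketch only: recreate infogain_loss_matrix.h5 without h5import.
# Shape (1, 1, 2, 2) mirrors RANK 4 / DIMENSION-SIZES 1 1 2 2 from the config;
# the 2x2 identity weights both classes (None, Period) equally in the loss.
import h5py
import numpy as np

matrix = np.eye(2, dtype=np.float32).reshape(1, 1, 2, 2)

with h5py.File("infogain_loss_matrix.h5", "w") as f:
    f.create_dataset("dataset0", data=matrix)  # name matches the HDF5Data top
```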
21 | -------------------------------------------------------------------------------- /net-audio/experiments/experiments.csv: -------------------------------------------------------------------------------- 1 | window_size,punctuation_position,accuracy_0,loss_1,precision_per_class_2,precision_per_class_3,recall_per_class_4,recall_per_class_5 2 | 1,0,0.927,0.176455,0.983368,0.46225,0.937801,0.771208 3 | 1,1,0.931583,0.180177,0.983442,0.4856,0.942666,0.773248 4 | 3,1,0.935583,0.178618,0.983526,0.502092,0.946979,0.771208 5 | 3,2,0.938417,0.165755,0.981621,0.517889,0.951965,0.743261 6 | 5,1,0.945083,0.162986,0.982374,0.55619,0.958478,0.751609 7 | 5,2,0.947167,0.156467,0.981449,0.571286,0.961686,0.737452 8 | 5,3,0.942833,0.161973,0.981886,0.543499,0.95651,0.745828 9 | 5,4,0.94625,0.155126,0.98064,0.567134,0.961501,0.726573 10 | 5,5,0.948333,0.162826,0.978675,0.588424,0.96576,0.699363 11 | 8,2,0.950333,0.156283,0.98064,0.596195,0.96596,0.724936 12 | 8,3,0.95,0.159289,0.980893,0.593521,0.965333,0.72914 13 | 8,4,0.950833,0.153095,0.980477,0.600427,0.966673,0.722365 14 | 8,5,0.95025,0.153548,0.981163,0.593946,0.965339,0.732304 15 | 8,6,0.94975,0.157002,0.980279,0.593023,0.965689,0.720154 16 | -------------------------------------------------------------------------------- /net-audio/net.prototxt: -------------------------------------------------------------------------------- 1 | name: "sentence_boundary_detection" 2 | # 3 | # Data 4 | # 5 | layer { 6 | name: "data" 7 | type: "Data" 8 | top: "data" 9 | top: "label" 10 | include { 11 | phase: TRAIN 12 | } 13 | data_param { 14 | source: "/home/ms2015t3/sentence-boundary-detection-nn/leveldbs/audio_window-5-3/train" 15 | batch_size: 1024 16 | backend: LEVELDB 17 | } 18 | } 19 | layer { 20 | name: "data" 21 | type: "Data" 22 | top: "data" 23 | top: "label" 24 | include { 25 | phase: TEST 26 | } 27 | data_param { 28 | source: "/home/ms2015t3/sentence-boundary-detection-nn/leveldbs/audio_window-5-3/test" 29 | batch_size: 12000 30 | backend: LEVELDB 31 | } 32 | } 33 | layer { 34 | name: "infogain_loss_matrix" 35 | type: "HDF5Data" 36 | top: "dataset0" 37 | hdf5_data_param { 38 | source: "/mnt/naruto/sentence/hdf5s/infogain_loss_matrix.txt" 39 | batch_size: 1 40 | } 41 | } 42 | # 43 | # Fully Connected Layer 1 44 | # 45 | layer { 46 | name: "fc1" 47 | type: "InnerProduct" 48 | bottom: "data" 49 | top: "fc1" 50 | inner_product_param { 51 | num_output: 2048 52 | weight_filler { 53 | type: "xavier" 54 | } 55 | bias_filler { 56 | type: "constant" 57 | } 58 | } 59 | } 60 | layer { 61 | name: "relu1" 62 | type: "ReLU" 63 | bottom: "fc1" 64 | top: "fc1" 65 | } 66 | layer { 67 | name: "drop1" 68 | type: "Dropout" 69 | bottom: "fc1" 70 | top: "fc1" 71 | dropout_param { 72 | dropout_ratio: 0.5 73 | } 74 | } 75 | # 76 | # Fully Connected Layer 2 77 | # 78 | layer { 79 | name: "fc2" 80 | type: "InnerProduct" 81 | bottom: "fc1" 82 | top: "fc2" 83 | inner_product_param { 84 | num_output: 4096 85 | weight_filler { 86 | type: "xavier" 87 | } 88 | bias_filler { 89 | type: "constant" 90 | } 91 | } 92 | } 93 | layer { 94 | name: "relu2" 95 | type: "ReLU" 96 | bottom: "fc2" 97 | top: "fc2" 98 | } 99 | layer { 100 | name: "drop2" 101 | type: "Dropout" 102 | bottom: "fc2" 103 | top: "fc2" 104 | dropout_param { 105 | dropout_ratio: 0.5 106 | } 107 | } 108 | 109 | # 110 | # Fully Connected Layer 3 111 | # 112 | layer { 113 | name: "fc3" 114 | type: "InnerProduct" 115 | bottom: "fc2" 116 | top: "fc3" 117 | inner_product_param { 118 | num_output: 2048 119 | weight_filler { 120 | 
type: "xavier" 121 | } 122 | bias_filler { 123 | type: "constant" 124 | } 125 | } 126 | } 127 | layer { 128 | name: "relu3" 129 | type: "ReLU" 130 | bottom: "fc3" 131 | top: "fc3" 132 | } 133 | # layer { 134 | # name: "drop3" 135 | # type: "Dropout" 136 | # bottom: "fc3" 137 | # top: "fc3" 138 | # dropout_param { 139 | # dropout_ratio: 0.5 140 | # } 141 | # } 142 | 143 | # 144 | # Fully Connected Layer Final - Preparation for Output 145 | # 146 | layer { 147 | name: "fc_final" 148 | type: "InnerProduct" 149 | bottom: "fc3" 150 | top: "fc_final" 151 | inner_product_param { 152 | num_output: 2 153 | weight_filler { 154 | type: "xavier" 155 | } 156 | bias_filler { 157 | type: "constant" 158 | } 159 | } 160 | } 161 | 162 | # 163 | # Loss, Accuracy 164 | # 165 | layer { 166 | name: "softmax" 167 | type: "Softmax" 168 | bottom: "fc_final" 169 | top: "softmax" 170 | } 171 | layer { 172 | name: "loss" 173 | type: "InfogainLoss" 174 | bottom: "softmax" 175 | bottom: "label" 176 | bottom: "dataset0" 177 | top: "loss" 178 | } 179 | layer { 180 | name: "accuracy" 181 | type: "Accuracy" 182 | bottom: "fc_final" 183 | bottom: "label" 184 | top: "accuracy" 185 | top: "recall_per_class" 186 | top: "precision_per_class" 187 | include { 188 | phase: TEST 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /net-audio/plot_log.gnuplot: -------------------------------------------------------------------------------- 1 | # Please generate the neccessary data files with 2 | # /path/to/caffe/tools/extra/parse_log.sh before plotting. 3 | # Example usage: 4 | # ./parse_log.sh mnist.log 5 | # Now you have mnist.log.train and mnist.log.test. 6 | # gnuplot mnist.gnuplot 7 | 8 | # The fields present in the data files that are usually proper to plot along 9 | # the y axis are test accuracy, test loss, training loss, and learning rate. 10 | # Those should plot along the x axis are training iterations and seconds. 11 | # Possible combinations: 12 | # 1. Test accuracy (test score 0) vs. training iterations / time; 13 | # 2. Test loss (test score 1) time; 14 | # 3. Training loss vs. training iterations / time; 15 | # 4. Learning rate vs. training iterations / time; 16 | # A rarer one: Training time vs. iterations. 17 | 18 | reset 19 | #set terminal dumb 20 | set style data lines 21 | set key right center 22 | 23 | file(test_or_train) = sprintf("%s.%s", filename, test_or_train) 24 | ucf_101_title = "Learning on six classes of UCF 101" 25 | 26 | ###### Fields in the training data 27 | ###### Iters Seconds TrainingLoss LearningRate 28 | 29 | # Training loss vs. training iterations 30 | set terminal png 31 | set output "it_vs_train-loss.png" 32 | set title "Training loss vs. training iterations" 33 | set xlabel "Training iterations" 34 | set ylabel "Training loss" 35 | plot file("train") using 1:3 title "loss" 36 | 37 | # Training loss vs. training time 38 | #set terminal png 39 | #set output "time_vs_train-loss.png" 40 | #set title "Training time vs. training loss" 41 | #set xlabel "Training time" 42 | #set ylabel "Training loss" 43 | #plot file("train") using 2:3 title "loss" 44 | 45 | # Learning rate vs. training iterations; 46 | set terminal png 47 | set output "it_vs_lr.png" 48 | set xlabel "Training iterations" 49 | set ylabel "Learning rate" 50 | plot file("train") using 1:4 title "learning rate" 51 | 52 | ###### Fields in the test data 53 | ###### Iters Seconds TestAccuracy TestLoss 54 | 55 | # Test loss vs. 
training iterations 56 | set terminal png 57 | set output "it_vs_test-acc.png" 58 | set title "Training iterations vs. test accuracy" 59 | set xlabel "Training iterations" 60 | set ylabel "Test accuracy" 61 | plot file("test") using 1:3 title "accuracy" 62 | -------------------------------------------------------------------------------- /net-audio/solver.prototxt: -------------------------------------------------------------------------------- 1 | net: "auto.prototxt" 2 | 3 | # Test before training? 4 | test_initialization: true 5 | # Test every nth iteration 6 | test_interval: 10000 7 | # How many iterations per test 8 | test_iter: 1 9 | 10 | # Base learning rate 11 | base_lr: 0.00001 12 | # Policy for changing the learning rate - multiply by gamma every stepsize iterations 13 | lr_policy: "step" 14 | gamma: 0.1 15 | stepsize: 300000 16 | momentum: 0.9 17 | # Regularization parameter for the weights 18 | weight_decay: 0.0005 19 | 20 | # Display training loss every nth iteration 21 | display: 200 22 | # After how many iterations to stop 23 | max_iter: 100000 24 | 25 | # Snapshot every nth iteration in the specified directory 26 | snapshot: 100000 27 | snapshot_prefix: "snapshots/" 28 | 29 | random_seed: 1701 30 | # Display the loss averaged over the last average_loss iterations - this does not work for accuracy 31 | average_loss: 100 32 | #clip_gradients: 10 33 | 34 | # GPU for the win! 35 | solver_mode: GPU 36 | -------------------------------------------------------------------------------- /net-audio/testing.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | PROJECT="sentence" 4 | TESTING_LOG_NAME="${PROJECT}.tstlog" 5 | 6 | # Check if called with name 7 | if [ $# -ne 1 ]; then 8 | echo "Usage: $0 [experiment_name]" 9 | echo " experiment_name: Name of the subfolder in ./experiments/ for the current experiment." 10 | echo "Exiting." 11 | exit 1 12 | fi 13 | 14 | # We need the output/error redirection, because caffe outputs to standard error, and we want to pipe to grep's standard in 15 | # See http://stackoverflow.com/questions/1507816/with-bash-how-can-i-pipe-standard-error-into-another-process 16 | ($CAFFE_ROOT/build/tools/caffe test -model net.prototxt -weights experiments/$1/*.caffemodel -iterations 1 3>&1 1>&2- 2>&3-) | grep --invert-match "Waiting for data" > $TESTING_LOG_NAME 17 | 18 | -------------------------------------------------------------------------------- /net-audio/training.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Check if called with name 4 | if [ $# -ne 1 ]; then 5 | echo "Usage: $0 [experiment_name]" 6 | echo " experiment_name: Name of the subfolder in ./experiments/ for the current experiment." 7 | echo "Exiting." 
8 | exit 1 9 | fi 10 | 11 | PROJECT="audio" 12 | SOLVER="solver.prototxt" 13 | # Find out net from the solver 14 | NET=$(grep --only-matching "\w\+\.prototxt" solver.prototxt) 15 | DATABASE=$(python $SENTENCE_HOME/python/tools/netconfig.py -p $NET) 16 | 17 | echo "Using solver ${SOLVER} with net ${NET} and database ${DATABASE}" 18 | 19 | # Set Vars 20 | DATE=`date +%Y%m%d-%H%M%S` 21 | FOLDER_NAME="${DATE}_$1" 22 | TRAINING_LOG_NAME="${PROJECT}_${NET}.tlog" 23 | 24 | echo "Saving experiment in experiments/$FOLDER_NAME" 25 | mkdir experiments/$FOLDER_NAME 26 | 27 | # Function for saving results and making plots 28 | function cleanup() { 29 | echo $1 30 | 31 | echo "Copying snapshots" 32 | ls -v -1 snapshots/ | tail -n 2 | xargs -i mv snapshots/{} experiments/$FOLDER_NAME 33 | 34 | echo "Parsing logs" 35 | $CAFFE_ROOT/tools/extra/parse_log.sh $TRAINING_LOG_NAME 36 | 37 | echo "Copying logs" 38 | cp $TRAINING_LOG_NAME $TRAINING_LOG_NAME.train $TRAINING_LOG_NAME.test experiments/$FOLDER_NAME 39 | 40 | echo "Building plots" 41 | gnuplot -e "filename='$TRAINING_LOG_NAME'" -p plot_log.gnuplot 42 | mv *.png experiments/$FOLDER_NAME 43 | 44 | rm ${TRAINING_LOG_NAME}.test ${TRAINING_LOG_NAME}.train 45 | echo "Clean up finished" 46 | } 47 | 48 | # Clean snapshots 49 | rm snapshots/* 2> /dev/null 50 | 51 | # Saving setup 52 | cp *.prototxt $SOLVER training.sh experiments/$FOLDER_NAME 53 | # Copy database configuration 54 | cp $DATABASE/*.ini experiments/$FOLDER_NAME 55 | 56 | # Setting interrupt trap 57 | trap 'cleanup "Training interrupted"; exit 1' INT 58 | 59 | # Calling caffe 60 | # export CAFFE_ROOT="$HOME/caffe-tmbo" 61 | 62 | $CAFFE_ROOT/build/tools/caffe train \ 63 | -solver ./experiments/$FOLDER_NAME/$SOLVER 2> $TRAINING_LOG_NAME 64 | 65 | # Check if Training successful 66 | if [ $? -ne 0 ]; then 67 | # Send Email Notification 68 | cd "${SENTENCE_HOME}/python" 69 | python "common/send_email.py" "Training failed" "$FOLDER_NAME" "../net-audio/$TRAINING_LOG_NAME" 70 | cd - 71 | echo "Training not successful. Exiting." 72 | 73 | # Resetting interrupt handling 74 | trap - INT 75 | exit 2 76 | fi 77 | 78 | # Resetting interrupt handling 79 | trap - INT 80 | 81 | cleanup "Training finished" 82 | 83 | -------------------------------------------------------------------------------- /net/README.md: -------------------------------------------------------------------------------- 1 | # Training 2 | 3 | For training the lexical neural network, you have to execute the following steps: 4 | 5 | 1. Adapt the `net.prototxt`: Change the network layout and make sure you enter the correct path to the LevelDB. 6 | 2. Adapt the `solver.prototxt`. 7 | 3. Make sure an `experiments` and a `snapshots` folder exist in this folder. 8 | 4. Execute `training.sh <experiment_name>`. 9 | 10 | The `training.sh` script does several things: 11 | 12 | * Creates a folder in the `experiments` folder with the name you gave your experiment 13 | * The following files are copied to that folder: 14 | * `config.ini`, which is located in your database folder 15 | * `net.prototxt` 16 | * `solver.prototxt` 17 | * log files from the training 18 | * Starts the training of the neural network 19 | * The latest `.solverstate` and `.caffemodel` are copied to the `experiments` folder after the training is finished 20 | * After training, different graphs are created and put into the `experiments` folder.
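The most common pitfall is step 1: a stale LevelDB path in `net.prototxt`. As a quick, hand-rolled sanity check (a sketch, not part of the repository's tooling, which instead uses `tools/netconfig.py`), the referenced database paths can be listed before a run is started:

```python
# Sketch: print every LevelDB "source:" path referenced by net.prototxt and
# whether it exists, so a wrong or missing database is caught before training.
import os
import re

with open("net.prototxt") as f:
    for source in re.findall(r'source:\s*"([^"]+)"', f.read()):
        print(source, "exists" if os.path.exists(source) else "MISSING")
```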
21 | -------------------------------------------------------------------------------- /net/net.prototxt: -------------------------------------------------------------------------------- 1 | name: "sentence_boundary_detection" 2 | # 3 | # Data 4 | # 5 | layer { 6 | name: "data" 7 | type: "Data" 8 | top: "data" 9 | top: "label" 10 | include { 11 | phase: TRAIN 12 | } 13 | data_param { 14 | source: "/home/ms2015t3/sentence-boundary-detection-nn/leveldbs/google_ted_wiki_window-8-4_pos-true_qm-false_balanced-false_nr-rep-true_word-this_wiki-test/train" 15 | batch_size: 128 16 | backend: LEVELDB 17 | } 18 | } 19 | layer { 20 | name: "data" 21 | type: "Data" 22 | top: "data" 23 | top: "label" 24 | include { 25 | phase: TEST 26 | } 27 | data_param { 28 | source: "/home/ms2015t3/sentence-boundary-detection-nn/leveldbs/google_ted_wiki_window-8-4_pos-true_qm-false_balanced-false_nr-rep-true_word-this_wiki-test/test" 29 | batch_size: 12000 30 | backend: LEVELDB 31 | } 32 | } 33 | # 34 | # Fully Connected Layer 1 35 | # 36 | layer { 37 | name: "fc1" 38 | type: "InnerProduct" 39 | bottom: "data" 40 | top: "fc1" 41 | inner_product_param { 42 | num_output: 2048 43 | weight_filler { 44 | type: "xavier" 45 | } 46 | bias_filler { 47 | type: "constant" 48 | } 49 | } 50 | } 51 | layer { 52 | name: "relu1" 53 | type: "ReLU" 54 | bottom: "fc1" 55 | top: "fc1" 56 | } 57 | layer { 58 | name: "drop1" 59 | type: "Dropout" 60 | bottom: "fc1" 61 | top: "fc1" 62 | dropout_param { 63 | dropout_ratio: 0.5 64 | } 65 | } 66 | # 67 | # Fully Connected Layer 2 68 | # 69 | layer { 70 | name: "fc2" 71 | type: "InnerProduct" 72 | bottom: "fc1" 73 | top: "fc2" 74 | inner_product_param { 75 | num_output: 4096 76 | weight_filler { 77 | type: "xavier" 78 | } 79 | bias_filler { 80 | type: "constant" 81 | } 82 | } 83 | } 84 | layer { 85 | name: "relu2" 86 | type: "ReLU" 87 | bottom: "fc2" 88 | top: "fc2" 89 | } 90 | layer { 91 | name: "drop2" 92 | type: "Dropout" 93 | bottom: "fc2" 94 | top: "fc2" 95 | dropout_param { 96 | dropout_ratio: 0.5 97 | } 98 | } 99 | # 100 | # Fully Connected Layer 3 101 | # 102 | layer { 103 | name: "fc3" 104 | type: "InnerProduct" 105 | bottom: "fc2" 106 | top: "fc3" 107 | inner_product_param { 108 | num_output: 2048 109 | weight_filler { 110 | type: "xavier" 111 | } 112 | bias_filler { 113 | type: "constant" 114 | } 115 | } 116 | } 117 | layer { 118 | name: "relu3" 119 | type: "ReLU" 120 | bottom: "fc3" 121 | top: "fc3" 122 | } 123 | # layer { 124 | # name: "drop3" 125 | # type: "Dropout" 126 | # bottom: "fc3" 127 | # top: "fc3" 128 | # dropout_param { 129 | # dropout_ratio: 0.5 130 | # } 131 | # } 132 | 133 | # 134 | # Fully Connected Layer Final - Preparation for Output 135 | # 136 | layer { 137 | name: "fc_final" 138 | type: "InnerProduct" 139 | bottom: "fc3" 140 | top: "fc_final" 141 | inner_product_param { 142 | num_output: 3 143 | weight_filler { 144 | type: "xavier" 145 | } 146 | bias_filler { 147 | type: "constant" 148 | } 149 | } 150 | } 151 | 152 | # 153 | # Loss, Accuracy 154 | # 155 | layer { 156 | name: "loss" 157 | type: "SoftmaxWithLoss" 158 | bottom: "fc_final" 159 | bottom: "label" 160 | top: "loss" 161 | } 162 | layer { 163 | name: "accuracy" 164 | type: "Accuracy" 165 | bottom: "fc_final" 166 | bottom: "label" 167 | top: "accuracy" 168 | top: "recall_per_class" 169 | top: "precision_per_class" 170 | include { 171 | phase: TEST 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /net/plot_log.gnuplot: 
-------------------------------------------------------------------------------- 1 | # Please generate the neccessary data files with 2 | # /path/to/caffe/tools/extra/parse_log.sh before plotting. 3 | # Example usage: 4 | # ./parse_log.sh mnist.log 5 | # Now you have mnist.log.train and mnist.log.test. 6 | # gnuplot mnist.gnuplot 7 | 8 | # The fields present in the data files that are usually proper to plot along 9 | # the y axis are test accuracy, test loss, training loss, and learning rate. 10 | # Those should plot along the x axis are training iterations and seconds. 11 | # Possible combinations: 12 | # 1. Test accuracy (test score 0) vs. training iterations / time; 13 | # 2. Test loss (test score 1) time; 14 | # 3. Training loss vs. training iterations / time; 15 | # 4. Learning rate vs. training iterations / time; 16 | # A rarer one: Training time vs. iterations. 17 | 18 | reset 19 | #set terminal dumb 20 | set style data lines 21 | set key right center 22 | 23 | file(test_or_train) = sprintf("%s.%s", filename, test_or_train) 24 | ucf_101_title = "Learning on six classes of UCF 101" 25 | 26 | ###### Fields in the training data 27 | ###### Iters Seconds TrainingLoss LearningRate 28 | 29 | # Training loss vs. training iterations 30 | set terminal png 31 | set output "it_vs_train-loss.png" 32 | set title "Training loss vs. training iterations" 33 | set xlabel "Training iterations" 34 | set ylabel "Training loss" 35 | plot file("train") using 1:3 title "loss" 36 | 37 | # Training loss vs. training time 38 | #set terminal png 39 | #set output "time_vs_train-loss.png" 40 | #set title "Training time vs. training loss" 41 | #set xlabel "Training time" 42 | #set ylabel "Training loss" 43 | #plot file("train") using 2:3 title "loss" 44 | 45 | # Learning rate vs. training iterations; 46 | set terminal png 47 | set output "it_vs_lr.png" 48 | set xlabel "Training iterations" 49 | set ylabel "Learning rate" 50 | plot file("train") using 1:4 title "learning rate" 51 | 52 | ###### Fields in the test data 53 | ###### Iters Seconds TestAccuracy TestLoss 54 | 55 | # Test loss vs. training iterations 56 | set terminal png 57 | set output "it_vs_test-acc.png" 58 | set title "Training iterations vs. test accuracy" 59 | set xlabel "Training iterations" 60 | set ylabel "Test accuracy" 61 | plot file("test") using 1:3 title "accuracy" 62 | -------------------------------------------------------------------------------- /net/solver.prototxt: -------------------------------------------------------------------------------- 1 | net: "net.prototxt" 2 | 3 | # Test before training? 4 | test_initialization: true 5 | # Test every nth iteration 6 | test_interval: 5000 7 | # How many iterations per test 8 | test_iter: 1 9 | 10 | # Base learning rate 11 | base_lr: 0.01 12 | # Policy for changing the learning rate - multiply by gamma every stepsize iterations 13 | lr_policy: "step" 14 | gamma: 0.1 15 | stepsize: 300000 16 | momentum: 0.9 17 | # Regularization parameter for the weights 18 | weight_decay: 0.0005 19 | 20 | # Display training loss every nth iteration 21 | display: 200 22 | # After how many iterations to stop 23 | max_iter: 125000 24 | 25 | # Snapshot every nth iteration in the specified directory 26 | snapshot: 100000 27 | snapshot_prefix: "snapshots/" 28 | 29 | random_seed: 1701 30 | # Display the loss averaged over the last average_loss iterations - this does not work for accuracy 31 | average_loss: 100 32 | #clip_gradients: 10 33 | 34 | # GPU for the win! 
35 | solver_mode: GPU 36 | -------------------------------------------------------------------------------- /net/testing.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | PROJECT="sentence" 4 | TESTING_LOG_NAME="${PROJECT}.tstlog" 5 | 6 | # Check if called with name 7 | if [ $# -ne 1 ]; then 8 | echo "Usage: $0 [experiment_name]" 9 | echo " experiment_name: Name of the subfolder in ./experiments/ for the current experiment." 10 | echo "Exiting." 11 | exit 1 12 | fi 13 | 14 | # We need the output/error redirection, because caffe outputs to standard error, and we want to pipe to grep's standard in 15 | # See http://stackoverflow.com/questions/1507816/with-bash-how-can-i-pipe-standard-error-into-another-process 16 | ($CAFFE_ROOT/build/tools/caffe test -model net.prototxt -weights experiments/$1/*.caffemodel -iterations 1 3>&1 1>&2- 2>&3-) | grep --invert-match "Waiting for data" > $TESTING_LOG_NAME 17 | 18 | -------------------------------------------------------------------------------- /net/training.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Check if called with name 4 | if [ $# -ne 1 ]; then 5 | echo "Usage: $0 [experiment_name]" 6 | echo " experiment_name: Name of the subfolder in ./experiments/ for the current experiment." 7 | echo "Exiting." 8 | exit 1 9 | fi 10 | 11 | PROJECT="sentence" 12 | SOLVER="solver.prototxt" 13 | # Find out net from the solver 14 | NET=$(grep --only-matching "\w\+\.prototxt" solver.prototxt) 15 | #DATABASE=$(python $SENTENCE_HOME/python/tools/netconfig.py -p $NET) 16 | 17 | echo "Using solver ${SOLVER} with net ${NET} and database ${DATABASE}" 18 | 19 | # Set Vars 20 | DATE=`date +%Y%m%d-%H%M%S` 21 | FOLDER_NAME="${DATE}_$1" 22 | TRAINING_LOG_NAME="${PROJECT}_${NET}.tlog" 23 | 24 | echo "Saving experiment in experiments/$FOLDER_NAME" 25 | mkdir experiments/$FOLDER_NAME 26 | 27 | # Function for saving results and making plots 28 | function cleanup() { 29 | echo $1 30 | 31 | echo "Copying snapshots" 32 | ls -v -1 snapshots/ | tail -n 2 | xargs -i mv snapshots/{} experiments/$FOLDER_NAME 33 | 34 | echo "Parsing logs" 35 | $CAFFE_ROOT/tools/extra/parse_log.sh $TRAINING_LOG_NAME 36 | 37 | echo "Copying logs" 38 | cp $TRAINING_LOG_NAME $TRAINING_LOG_NAME.train $TRAINING_LOG_NAME.test experiments/$FOLDER_NAME 39 | 40 | echo "Building plots" 41 | gnuplot -e "filename='$TRAINING_LOG_NAME'" -p plot_log.gnuplot 42 | mv *.png experiments/$FOLDER_NAME 43 | 44 | rm ${TRAINING_LOG_NAME}.test ${TRAINING_LOG_NAME}.train 45 | echo "Clean up finished" 46 | } 47 | 48 | # Clean snapshots 49 | rm snapshots/* 2> /dev/null 50 | 51 | # Saving setup 52 | cp *.prototxt $SOLVER training.sh experiments/$FOLDER_NAME 53 | # Copy database configuration 54 | #cp $DATABASE/*.ini experiments/$FOLDER_NAME 55 | 56 | # Setting interrupt trap 57 | trap 'cleanup "Training interrupted"; exit 1' INT 58 | 59 | # Calling caffe 60 | # export CAFFE_ROOT="$HOME/caffe-tmbo" 61 | 62 | $CAFFE_ROOT/build/tools/caffe train \ 63 | -solver ./experiments/$FOLDER_NAME/$SOLVER 2> $TRAINING_LOG_NAME 64 | 65 | # Check if Training successful 66 | if [ $? -ne 0 ]; then 67 | # Send Email Notification 68 | cd "${SENTENCE_HOME}/python" 69 | python "common/send_email.py" "Training failed" "$FOLDER_NAME" "../net/$TRAINING_LOG_NAME" 70 | cd - 71 | echo "Training not successful. Exiting." 
72 | 73 | # Resetting interrupt handling 74 | trap - INT 75 | exit 2 76 | fi 77 | 78 | # Resetting interrupt handling 79 | trap - INT 80 | 81 | cleanup "Training finished" 82 | 83 | -------------------------------------------------------------------------------- /net/xiaoyin/A2_4_test.prototxt: -------------------------------------------------------------------------------- 1 | layer { 2 | name: "data" 3 | type: "HDF5Data" 4 | top: "data" 5 | top: "label" 6 | hdf5_data_param { 7 | source: "/home/ms2015t3/sentence-boundary-detection-nn/net/test_hdf5s" 8 | batch_size: 12000 9 | } 10 | } 11 | layer { 12 | name: "reshape" 13 | type: "Reshape" 14 | bottom: "data" 15 | top: "dataR" 16 | reshape_param { 17 | shape { 18 | dim: 0 # copy the dimension from below 19 | dim: 1 20 | dim: 5 21 | dim: -1 # infer it from the other dimensions 22 | } 23 | } 24 | } 25 | layer { 26 | name: "ip1" 27 | type: "InnerProduct" 28 | bottom: "dataR" 29 | top: "ip1" 30 | inner_product_param { 31 | num_output: 2048 32 | weight_filler { 33 | type: "xavier" 34 | } 35 | } 36 | } 37 | layer { 38 | name: "relu1" 39 | type: "ReLU" 40 | bottom: "ip1" 41 | top: "ip1" 42 | } 43 | layer { 44 | name: "dropout1" 45 | type: "Dropout" 46 | bottom: "ip1" 47 | top: "ip1" 48 | dropout_param { 49 | dropout_ratio: 0.5 50 | } 51 | } 52 | 53 | layer { 54 | name: "ip2" 55 | type: "InnerProduct" 56 | bottom: "ip1" 57 | top: "ip2" 58 | inner_product_param { 59 | num_output: 4096 60 | weight_filler { 61 | type: "xavier" 62 | } 63 | } 64 | } 65 | layer { 66 | name: "relu2" 67 | type: "ReLU" 68 | bottom: "ip2" 69 | top: "ip2" 70 | } 71 | layer { 72 | name: "dropout2" 73 | type: "Dropout" 74 | bottom: "ip2" 75 | top: "ip2" 76 | dropout_param { 77 | dropout_ratio: 0.5 78 | } 79 | } 80 | 81 | layer { 82 | name: "ip3" 83 | type: "InnerProduct" 84 | bottom: "ip2" 85 | top: "ip3" 86 | inner_product_param { 87 | num_output: 2048 88 | weight_filler { 89 | type: "xavier" 90 | } 91 | } 92 | } 93 | layer { 94 | name: "relu3" 95 | type: "ReLU" 96 | bottom: "ip3" 97 | top: "ip3" 98 | } 99 | layer { 100 | name: "ip4" 101 | type: "InnerProduct" 102 | bottom: "ip3" 103 | top: "ip4" 104 | inner_product_param { 105 | num_output: 4 106 | weight_filler { 107 | type: "xavier" 108 | } 109 | } 110 | } 111 | layer { 112 | name: "accuracy" 113 | type: "Accuracy" 114 | bottom: "ip4" 115 | bottom: "label" 116 | top: "accuracy" 117 | top: "recall_per_class" 118 | top: "precision_per_class" 119 | } 120 | layer { 121 | name: "loss" 122 | type: "SoftmaxWithLoss" 123 | bottom: "ip4" 124 | bottom: "label" 125 | top: "loss" 126 | } 127 | -------------------------------------------------------------------------------- /net/xiaoyin/A2_4_train.prototxt: -------------------------------------------------------------------------------- 1 | layer { 2 | name: "data" 3 | type: "HDF5Data" 4 | top: "data" 5 | top: "label" 6 | hdf5_data_param { 7 | source: "/home/ms2015t3/sentence-boundary-detection-nn/net/train_hdf5s" 8 | batch_size: 256 9 | } 10 | } 11 | layer { 12 | name: "reshape" 13 | type: "Reshape" 14 | bottom: "data" 15 | top: "dataR" 16 | reshape_param { 17 | shape { 18 | dim: 0 # copy the dimension from below 19 | dim: 1 20 | dim: 5 21 | dim: -1 # infer it from the other dimensions 22 | } 23 | } 24 | } 25 | layer { 26 | name: "ip1" 27 | type: "InnerProduct" 28 | bottom: "dataR" 29 | top: "ip1" 30 | inner_product_param { 31 | num_output: 2048 32 | weight_filler { 33 | type: "xavier" 34 | } 35 | } 36 | } 37 | layer { 38 | name: "relu1" 39 | type: "ReLU" 40 | bottom: "ip1" 41 | top: 
"ip1" 42 | } 43 | layer { 44 | name: "dropout1" 45 | type: "Dropout" 46 | bottom: "ip1" 47 | top: "ip1" 48 | dropout_param { 49 | dropout_ratio: 0.5 50 | } 51 | } 52 | 53 | layer { 54 | name: "ip2" 55 | type: "InnerProduct" 56 | bottom: "ip1" 57 | top: "ip2" 58 | inner_product_param { 59 | num_output: 4096 60 | weight_filler { 61 | type: "xavier" 62 | } 63 | } 64 | } 65 | layer { 66 | name: "relu2" 67 | type: "ReLU" 68 | bottom: "ip2" 69 | top: "ip2" 70 | } 71 | layer { 72 | name: "dropout2" 73 | type: "Dropout" 74 | bottom: "ip2" 75 | top: "ip2" 76 | dropout_param { 77 | dropout_ratio: 0.5 78 | } 79 | } 80 | 81 | layer { 82 | name: "ip3" 83 | type: "InnerProduct" 84 | bottom: "ip2" 85 | top: "ip3" 86 | inner_product_param { 87 | num_output: 2048 88 | weight_filler { 89 | type: "xavier" 90 | } 91 | } 92 | } 93 | layer { 94 | name: "relu3" 95 | type: "ReLU" 96 | bottom: "ip3" 97 | top: "ip3" 98 | } 99 | layer { 100 | name: "ip4" 101 | type: "InnerProduct" 102 | bottom: "ip3" 103 | top: "ip4" 104 | inner_product_param { 105 | num_output: 4 106 | weight_filler { 107 | type: "xavier" 108 | } 109 | } 110 | } 111 | layer { 112 | name: "accuracy" 113 | type: "Accuracy" 114 | bottom: "ip4" 115 | bottom: "label" 116 | top: "accuracy" 117 | } 118 | layer { 119 | name: "loss" 120 | type: "SoftmaxWithLoss" 121 | bottom: "ip4" 122 | bottom: "label" 123 | top: "loss" 124 | } 125 | -------------------------------------------------------------------------------- /net/xiaoyin/solver_xiaoyin.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "A2_4_train.prototxt" 2 | test_net: "A2_4_test.prototxt" 3 | test_iter: 1 4 | test_interval: 5000 5 | base_lr: 0.01 6 | lr_policy: "step" 7 | gamma: 0.1 8 | stepsize: 300000 9 | display: 100 10 | max_iter: 1000000 11 | momentum: 0.9 12 | weight_decay: 0.0005 13 | snapshot: 100000 14 | snapshot_prefix: "snapshots/" 15 | solver_mode: GPU 16 | 17 | # ./build/tools/caffe train -solver xyche/solver.prototxt 18 | 19 | # ./build/tools/caffe test -model xyche/prototxt/C2/C2_test.prototxt -weights xyche/snapshots/C1_50d_iter_100000.caffemodel -iterations 400 20 | 21 | # ./build/tools/caffe test -model xyche/prototxt/C1/C1_test_output.prototxt -weights xyche/snapshots/C1_50d_iter_100000.caffemodel -iterations 1 22 | -------------------------------------------------------------------------------- /paper/Makefile: -------------------------------------------------------------------------------- 1 | PDFVIEWER=evince 2 | FILE=main 3 | 4 | show: build 5 | $(PDFVIEWER) $(FILE).pdf & 6 | 7 | plot: 8 | cd plots && build 9 | 10 | when-changed: 11 | @clear && when-changed chapters/ -c 'printf "\033c" && echo "Building" && make -s build && echo "Succeeded"' 12 | 13 | convert: 14 | cd plots && make convert 15 | 16 | build: 17 | @mkdir -p .output 18 | @pdflatex -interaction=nonstopmode -halt-on-error -output-directory .output -jobname=$(FILE) $(FILE).tex 1>&2 > .output/error 19 | @mv .output/$(FILE).pdf . 20 | 21 | error: 22 | @vim + .output/error 23 | 24 | 25 | bibtex: 26 | @cp $(FILE).bib .output/ && cd .output && bibtex $(FILE) && cd .. 27 | 28 | clean: 29 | rm .output/*.aux .output/*.log 30 | -------------------------------------------------------------------------------- /paper/chapters/acoustic_model.tex: -------------------------------------------------------------------------------- 1 | Besides the lexical model, we also use an acoustic model to predict the punctuation. 
2 | The acoustic model is based on prosodic features, such as pauses and pitch levels. 3 | In the following, the training and evaluation of the acoustic model are described in detail. 4 | 5 | \subsection{Training Instance Generation} 6 | 7 | Many researchers use pauses, pitch levels, and energy levels for predicting punctuation. 8 | The pitch level encodes the frequency of the speaker's voice, whereas the energy level describes the amount of power the speaker puts into it. 9 | To obtain those values from the \texttt{.sph} files in our data set, we first have to convert those files into \texttt{.wav} files. 10 | For that, we used a sound processing program called \emph{SoX}. 11 | Having the \texttt{.wav} files, we can extract the pitch and energy levels from them using different libraries. 12 | For extracting the pitch level, the library \emph{aubio}\footnote{\url{http://aubio.org/}} is used. 13 | The output is a file containing two columns: The first column is the time in seconds within the talk and the second column is the pitch level at that time. 14 | The library \emph{Yaafe}\footnote{\url{yaafe.sourceforge.net/}} is used for extracting the energy levels from the \texttt{.wav} files. 15 | The output from \emph{Yaafe} contains one column with energy values. 16 | One line in the output file represents the energy level in \texttt{1 / sample rate} intervals. 17 | 18 | Together with the \texttt{.ctm} files, we can now create the training instances. 19 | The process of generating the training instances is shown in Figure~\ref{fig:overview_acoustic}. 20 | \begin{figure}[ht] 21 | \centering 22 | \includegraphics[width=0.8\textwidth]{img/overview_accoustic.pdf} 23 | \caption{Creation of the training instances for the acoustic model: The pause feature is extracted from the \texttt{.ctm} files, the pitch level feature from the \texttt{.pitch} files, and the energy level feature from the \texttt{.energy} files. All features are normalized to a mean of 0 and a variance of 1. As in the lexical model, a sliding window is used to create the final training instances.} 24 | \label{fig:overview_acoustic} 25 | \end{figure} 26 | 27 | The \texttt{.ctm} files hold the information about sentence boundaries. 28 | Unfortunately, we do not have any information about other punctuation marks besides periods in those files. 29 | Therefore, the acoustic model is only able to predict periods. 30 | 31 | Given the \texttt{.ctm} files, the first step in the training instance generation is to extract the words with their corresponding start times and durations. 32 | Additionally, the sentence boundaries are stored to obtain the gold standard. 33 | When all words of a talk have been read, the pauses before and after each word are calculated. 34 | Afterwards, the \texttt{.energy} and \texttt{.pitch} files are parsed. 35 | Each energy and pitch level is mapped to the word that was spoken at the time mentioned in the files. 36 | It can happen that multiple energy and pitch levels are mapped to one word. 37 | In that case, the average over all energy/pitch levels belonging to one word is taken as the final energy/pitch level for that word. 38 | 39 | Furthermore, we filter the pitch values. 40 | The voice frequency of a typical adult male ranges from 85 to 180 Hz; a typical adult female has a range from 165 to 255 Hz. 41 | Many values in the pitch files lie far above those ranges because of the background noise recorded in the talk. 42 | Thus, we decided to filter out all pitch levels above 300 Hz.
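Formally, if $F_w$ denotes the set of remaining frame-level measurements mapped to a word $w$, its feature value is the mean $\bar{x}_w = \frac{1}{|F_w|} \sum_{x \in F_w} x$, computed separately for the energy and the (filtered) pitch levels.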
43 | 44 | In the end, we have the following four features for our acoustic model: 45 | \begin{itemize} 46 | \item the duration of the pause before a word, 47 | \item the duration of the pause after a word, 48 | \item the average energy level of a word, and 49 | \item the average pitch level of a word. 50 | \end{itemize} 51 | In the next step, the features are normalized to a mean of zero and a variance of one. 52 | 53 | As in the lexical model, we use a sliding window to create the training instances. 54 | The \texttt{config} file holds the information about the size of the window and the position of the punctuation. 55 | Using the gold standard we obtained from the \texttt{.ctm} files, the training instances with their corresponding class (\textsc{None} or \textsc{Period}) are created. 56 | The training instances are then written to a LevelDB in a final step. 57 | 58 | \subsection{Neural Network Layout} 59 | 60 | We used the same model as for the lexical model, except for the input and the output layer (see Figure~\ref{fig:net_acoustic}). 61 | For the input layer, we have only four features per word, compared to 314 features in the lexical model. 62 | So, for example, a window size of eight leads to 32 features. 63 | In the last layer, we use only two dimensions instead of three because we have only two classes: \textsc{None} and \textsc{Period}. 64 | 65 | \begin{figure}[ht] 66 | \centering 67 | \includegraphics[width=0.6\textwidth]{img/net_acoustic.pdf} 68 | \caption{Network architecture of the acoustic model consisting of four \texttt{inner product} layers.} 69 | \label{fig:net_acoustic} 70 | \end{figure} 71 | 72 | \subsection{Results and Evaluation} 73 | 74 | As mentioned before, we can evaluate the model only on \textsc{Period}s, as we do not have ground truth data for the commas. 75 | Again, we evaluated different window sizes and punctuation positions. 76 | Figure~\ref{audio_eval} shows the F-measure for all experiments. 77 | \begin{figure}[ht] 78 | \centering 79 | \includegraphics[width=0.7\textwidth]{img/audio_parameter_eval.png} 80 | \caption{Evaluation of the acoustic model: Window size eight and punctuation position four yield the best results.} 81 | \label{audio_eval} 82 | \end{figure} 83 | Note that the y-axis has been capped to better show the differences. 84 | Interestingly, the combination of window size eight and punctuation position four is again the best combination. 85 | It leads to an F-score of 78.36\%. -------------------------------------------------------------------------------- /paper/chapters/caffe.tex: -------------------------------------------------------------------------------- 1 | % a little bit about caffe 2 | -------------------------------------------------------------------------------- /paper/chapters/data.tex: -------------------------------------------------------------------------------- 1 | We use two different data sets for training and evaluating our SBD system. 2 | The first dataset is a set of TED talks\footnote{\url{ted.com}} from 2011 to 2014. 3 | The second dataset is plain text from Wikipedia\footnote{\url{en.wikipedia.org}}. 4 | It was extracted from the English Wikipedia as of February 2014. 5 | 6 | \paragraph{TED talks} 7 | Our TED talk data set consists of 57 talks. For each talk, we have the following data files: 8 | \begin{itemize} 9 | \item \texttt{.xml} file: This file contains a manually created transcript of the talk. 10 | The text is formatted and serves as training data for the lexical model (ground truth).
11 | \item \texttt{.ctm} file: This is a time-marked transcript. 12 | It contains one word per line with the time in seconds at which the word was said in the talk and its duration. 13 | This is a typical output from an ASR system. 14 | Additionally, each sentence is labeled in the file, so that the data can be used for training the acoustic model. 15 | \item \texttt{.sph} file: This file contains raw pulse code modulation (PCM) data. 16 | This data can be converted into .wav files. 17 | \end{itemize} 18 | 19 | \paragraph{Wikipedia} 20 | We extracted the plain text from English Wikipedia articles. 21 | We selected only those articles with more than 10,000 characters, assuming that these articles have gained a lot of attention and therefore provide good textual quality. 22 | When extracting the plain text, we discarded lists, headlines, tables, etc., focusing only on paragraphs, so we can make sure that only proper sentences are used for training. 23 | In total, we received around 3.5 million new training instances from the Wikipedia articles. -------------------------------------------------------------------------------- /paper/chapters/demo.tex: -------------------------------------------------------------------------------- 1 | We use a demo application, accessible with a web browser, to present the working prototype. 2 | It can be used to find sentence boundaries in unpunctuated text. 3 | The web page shows two main tabs, one labeled \emph{Lexical} and one \emph{Lexical + Audio}. 4 | A user can click these to switch between using only the lexical model and using the fusion of the lexical and the acoustic model. 5 | 6 | There are two ways to feed input to our model for the \emph{Lexical} SBD (see Figure~\ref{fig:demo_l}). 7 | \begin{figure}[ht] 8 | \centering 9 | \includegraphics[width=0.5\textwidth]{img/demo_l.png} 10 | \caption{The demo application for the lexical model. The results are presented below the options for input and model selection.} 11 | \label{fig:demo_l} 12 | \end{figure} 13 | The user can use a text input field to manually enter or paste any text they wish. 14 | Another possibility is to choose from a set of existing text files. 15 | A dropdown selection allows the user to choose a pretrained model, if multiple models are available in the system. 16 | If the model is changed, it is automatically loaded in the background. 17 | Once the user clicks the \emph{Punctuate!} button, the text that was entered or selected as a file is passed to our lexical model. 18 | While the server processes the request, a small loading icon is shown inside the button. 19 | After the predictions are returned from the server, the result is shown beneath. 20 | The input text and positions where no punctuation was predicted are shown as tokens with a light grey background. 21 | Any inserted commas or periods are shown in distinct colors. 22 | If a model that uses POS tags is selected, a user can hover their mouse over a token to see its POS category. 23 | For further use, the entire result is selectable and can be copied. 24 | 25 | For the \emph{Lexical + Audio} SBD, the possibilities for entering input are more limited (see Figure~\ref{fig:demo_la}). 26 | \begin{figure}[ht] 27 | \centering 28 | \includegraphics[width=0.5\textwidth]{img/demo_l_a.png} 29 | \caption{The demo application for the fusion of both models. The results of the individual models and the fusion are presented below the options for input and model selection.
Only one result section is visible in the screenshot; the other sections lie outside the captured region.} 30 | \label{fig:demo_la} 31 | \end{figure} 32 | Since we need an audio recording, we offer only examples that already exist in the system. 33 | At the moment, the system contains samples that were used in the testing phase, but not for training. 34 | The selection is therefore limited to a dropdown menu of all available choices. 35 | However, the choice of both the acoustic and the lexical model is independently available to the user. 36 | These can also be selected in a dropdown menu. 37 | The functionality of the \emph{Punctuate!} button is unchanged. 38 | It triggers the processing and shows a loading indicator until the result returns. 39 | The result area, however, is changed and contains three subareas, each containing a different result. 40 | Two of them contain the raw results of the acoustic model and the lexical model. 41 | The third shows the result after the fusion. 42 | Therefore, it is easy to compare the results of each individual model and the result after the fusion. -------------------------------------------------------------------------------- /paper/chapters/evaluation.tex: -------------------------------------------------------------------------------- 1 | % evaluation 2 | 3 | Problems with the audio data: 4 | \begin{itemize} \item Not tokenized in the same way as our data 5 | \item No commas 6 | \item No capitalization, which is important for POS tagging 7 | \end{itemize} -------------------------------------------------------------------------------- /paper/chapters/fusion.tex: -------------------------------------------------------------------------------- 1 | The individual predictions from the acoustic and the lexical model need to be combined to obtain a final, overall prediction. 2 | Therefore, we fuse the predictions of the two models. 3 | We implemented two different fusion approaches: 4 | The first approach is called \emph{Threshold Fusion}, the second one is called \emph{Balance Fusion}. 5 | The two fusion approaches and their evaluation are presented in the remainder of this section. 6 | 7 | \subsection{Threshold Fusion} 8 | The main idea of the threshold fusion is the following: If the probability for the class \textsc{Period} from the acoustic model is over a certain threshold and the probability for the class \textsc{None} from the lexical model is below a certain threshold, we want to predict a period or a comma. 9 | If the condition is satisfied, the probability of the class \textsc{Period} from the acoustic model is added to the probabilities of the classes \textsc{Period} and \textsc{Comma} from the lexical model. 10 | The idea of the threshold fusion is shown in Figure~\ref{fig:fusion_1}. 11 | \begin{figure}[ht] 12 | \centering 13 | \includegraphics[width=0.7\textwidth]{img/fusion_1.pdf} 14 | \caption{Threshold Fusion: The probability of the class \textsc{Period} from the acoustic model is added to the probabilities of the classes \textsc{Period} and \textsc{Comma} from the lexical model if the probability for the class \textsc{Period} from the acoustic model is over a certain threshold and the probability for the class \textsc{None} from the lexical model is below a certain threshold.} 15 | \label{fig:fusion_1} 16 | \end{figure} 17 | If the condition does not hold, we just take the prediction probabilities of the lexical model as the final predictions. 18 | Thus, the threshold fusion trusts the lexical model more than the acoustic model.
19 | The acoustic model is only taken into account if it is quite certain that there should be a period and the lexical model is not certain enough that there should be no punctuation at all. 20 | In the end, the class with the highest probability is chosen. 21 | For the example in Figure~\ref{fig:fusion_1}, we would predict a comma. 22 | 23 | \subsection{Balance Fusion} 24 | The balance fusion sums up the weighted probabilities of both models. 25 | Figure~\ref{fig:fusion_2} shows an example. 26 | \begin{figure}[ht] 27 | \centering 28 | \includegraphics[width=0.7\textwidth]{img/fusion_2.pdf} 29 | \caption{Balance Fusion: The weighted probabilities of both models are summed up.} 30 | \label{fig:fusion_2} 31 | \end{figure} 32 | Using weights, we can regulate which model we trust more. 33 | In the example shown in Figure~\ref{fig:fusion_2}, the lexical model is more important than the acoustic model. 34 | In the end, the class with the overall highest probability is chosen again. 35 | So in the example, the predicted class would be the class \textsc{None}. 36 | 37 | \subsection{Results and Evaluation} 38 | We evaluate both fusion approaches to determine which of them leads to better results. 39 | The evaluation was done on the TED talk data set because the acoustic model needs the energy and pitch levels and therefore audio files. 40 | We used the \texttt{.ctm} files along with their corresponding \texttt{.sph} files. 41 | Thus, we have only a gold standard for the class \textsc{Period}. 42 | Consequently, the evaluation was done only for this class. 43 | 44 | To evaluate the fusion approaches, we remove all sentence boundaries from the data and pass the data on to the lexical and the acoustic model. 45 | We chose the best lexical and acoustic model from the previous sections to predict the punctuation. 46 | The predictions from the lexical and acoustic model are then fused. 47 | We tested the threshold fusion with different threshold values and the balance fusion with multiple weights. 48 | We added a baseline fusion for the lexical and the acoustic model, which simply passes on the predictions of the corresponding model. 49 | The predictions returned by the different fusion approaches are then evaluated using the gold standard. 50 | The F-score was used as the evaluation metric. 51 | Figure~\ref{fig:eval_fusion} shows the results. 52 | \begin{figure}[ht] 53 | \centering 54 | \includegraphics[width=0.7\textwidth]{img/fusion_eval.pdf} 55 | \caption{Evaluation results for different fusion approaches: The best result is obtained by the threshold fusion with an acoustic threshold of 0.5 and a lexical threshold of 0.9.} 56 | \label{fig:eval_fusion} 57 | \end{figure} 58 | 59 | Because we have only a gold standard for the class \textsc{Period}, the baseline is the F-score of the acoustic model. 60 | Six fusion approaches outperform the F-score of the baseline. 61 | The best one is the threshold fusion with an acoustic threshold of 0.5 and a lexical threshold of 0.9. 62 | It obtained an F-score of 80.43\%, whereas the baseline has an F-score of 78.49\%. 63 | Consequently, fusing the results increases the overall performance. 64 | -------------------------------------------------------------------------------- /paper/chapters/future.tex: -------------------------------------------------------------------------------- 1 | We presented an approach to automatically detect sentence boundaries and predict the correct punctuation marks in unpunctuated ASR output.
2 | Two different models were trained independently, one using lexical input and the other using acoustic input. 3 | The results of both models were merged with a late fusion. 4 | The evaluation has shown that one has to be careful with the training data, which should stem only from actually spoken text. 5 | Just adding more written text data did not improve the performance. 6 | On the other hand, part-of-speech tags as additional features consistently increase the performance of the sentence boundary detection. 7 | 8 | There are many possibilities for improving the presented approach. 9 | Since we did not explore the large variety of different neural network layouts, further exploration in this area is likely to improve the results. 10 | Especially with more training data, a deeper network architecture can provide better results. 11 | Also, using Long Short-Term Memory (LSTM) neural networks appears promising, as they can process a stream of data while keeping temporal information. 12 | This maps easily to the stream of word tokens in a text. 13 | 14 | In the fusion step, we decided on a late fusion approach, which combines only the predictions. 15 | However, another direction to explore is an earlier fusion, in which both models and the fusion itself are trained together. 16 | Instead of fusing the predictions, the actual features can be fused. 17 | As for data preparation, a different representation of the features in the lexical model can be examined, such as a second or third data channel or a combination similar to the fusion of the acoustic and the lexical model. 18 | 19 | Another improvement could be achieved with better post-processing of the results. 20 | For example, one punctuation symbol immediately following another is unlikely to be correct. -------------------------------------------------------------------------------- /paper/chapters/introduction.tex: -------------------------------------------------------------------------------- 1 | Automatic Speech Recognition (ASR) systems have many practical applications nowadays, e.g., in dictation systems for medical documentation and journalism. 2 | Another application comes from the rapidly increasing amount of videos available online on video platforms for entertainment and learning, such as YouTube\footnote{\url{youtube.com}}, Vimeo\footnote{\url{vimeo.com}}, Coursera\footnote{\url{coursera.org}} or OpenHPI\footnote{\url{open.hpi.com}}. 3 | All of these benefit from automatically generated transcripts and subtitles. 4 | However, the result of many ASR systems is an unformatted text without any punctuation marks, such as periods and commas. 5 | These texts are hard to read and understand without manually inserting the missing punctuation marks. 6 | Inserting them manually, however, is a mundane and complicated task. 7 | Therefore, an automatic solution for formatting the ASR output and inserting punctuation marks is necessary. 8 | We call this task \emph{sentence boundary detection} (SBD). 9 | 10 | SBD is a mandatory preprocessing step for many further use cases. 11 | For example, most machine translation systems are trained on properly formatted text. 12 | Having an ASR output without punctuation marks decreases the performance of machine translation systems. 13 | Also, other natural language processing tasks, such as part-of-speech tagging or tokenization, work on sentence units. 14 | Thus, the ASR output needs to be formatted before it can be further processed.
15 | 16 | In this paper, we address this problem by automatically creating punctuated text from unpunctuated text. 17 | We use neural networks to process the unformatted transcripts. 18 | The use of neural networks has recently led to large improvements in areas such as image and video classification. 19 | 20 | Our SBD system contains two models: one based on the ASR text transcript (lexical model), and one based on the raw audio data (acoustic model). 21 | We train both models independently and retrieve their separate predictions. 22 | Afterwards, the results are combined in a fusion step. 23 | The final output can replace the original output of ASR systems and improve the readability and quality of transcripts. 24 | Additionally, the punctuation marks often represent suitable boundaries for subtitles, enhancing their overall quality. 25 | 26 | The rest of the paper is structured as follows: 27 | Related work is summarized in Section~\ref{sec:related_work}. 28 | Section~\ref{sec:training_data} describes the datasets we use for training and evaluation. 29 | The data preprocessing, training, and evaluation of our lexical and our acoustic model are described in Section~\ref{sec:lexical_model} and Section~\ref{sec:acoustic_model}, respectively. 30 | Details of the fusion step are explained in Section~\ref{sec:fusion}. 31 | We show our demo application in Section~\ref{sec:demo} and conclude our work in Section~\ref{sec:future}. -------------------------------------------------------------------------------- /paper/chapters/parameters.tex: -------------------------------------------------------------------------------- 1 | % how did we find the best parameters? 2 | 3 | Evaluation of results: 4 | \begin{itemize} 5 | \item F-measure for each class is calculated. 6 | \item Harmonic mean for all F-measures is total score (higher is better). 7 | \end{itemize} 8 | 9 | \begin{figure}[ht] 10 | \centering 11 | \includegraphics[width=\textwidth]{img/parameter_eval.png} 12 | \caption{Harmonic mean between all F1 scores for all classes. \emph{2/5} means a window size of five and punctuation is tested at position two. If \emph{wi} is in the label, it uses Wikipedia training data.} 13 | \label{fig2} 14 | \end{figure} 15 | 16 | Comparison between experiments with and without POS tagging (other than that, they have the same configurations): 17 | \begin{itemize} 18 | \item With POS tagging: 0.305 19 | \item Without POS tagging: 0.275 20 | \end{itemize} 21 | 22 | Comparison between experiments with and without Wikipedia data (other than that, they have the same configurations): 23 | \begin{itemize} 24 | \item Without Wikipedia: 0.385 25 | \item With Wikipedia data: 0.252 26 | \end{itemize} 27 | -------------------------------------------------------------------------------- /paper/chapters/related_work.tex: -------------------------------------------------------------------------------- 1 | As punctuation prediction is a mandatory preprocessing step for further work with automatic speech recognition output, a lot of research has been done in this field. 2 | Some approaches focus only on the lexical part~\cite{Gravano2009, Lu2010, Ueffing2013, Cho2012, Zhang2013}. 3 | Gravano et al.~\cite{Gravano2009} used a text-based n-gram language model to detect punctuation (comma, period, question mark). 4 | Dynamic conditional random fields are used by Lu and Ng~\cite{Lu2010} and Ueffing et al.~\cite{Ueffing2013}. 5 | Ueffing et al.
evaluate their method with different features, such as language model scores, parse trees, dynamic sentence length, and token n-grams. 6 | The usefulness of the individual features highly depends on the nature of the processed text. 7 | For example, if the text is well structured, the parse tree features improve the result. 8 | Similar to our approach, Cho et al.~\cite{Cho2012} use a sliding window over the input data to predict different punctuation marks. 9 | Zhang et al.~\cite{Zhang2013} predict the punctuation of an input stream. 10 | For each processed word in the input stream, syntactic features are used to predict the punctuation symbol after that word. 11 | The features used include, e.g., part-of-speech tags, tree-based features (the parse tree is built step by step), and bag-of-words features. 12 | 13 | Most researchers combine prosodic features, such as pitch, pauses, and duration, with lexical features, such as words, n-grams, and part-of-speech tags~\cite{Mark1999, Christensen2001, Liu2005, Matusov2007, Wang2012}. 14 | Chen~\cite{Mark1999} predicts punctuation on the basis of prosodic features in a first step using a Hidden Markov Model. 15 | In a second step, a language model is used to adapt the punctuation predicted in the first step. 16 | Christensen et al.~\cite{Christensen2001} focus on multi-layer perceptron methods to combine prosodic and lexical features, whereas Liu et al.~\cite{Liu2005} use conditional random fields. 17 | Matusov et al.~\cite{Matusov2007} optimize their approach to the needs of machine translation. 18 | They combine a language model and prosodic features in a log-linear model and add a phrase coverage feature, which is motivated by phrase-based machine translation systems. 19 | A comparison of different machine learning models for combining prosodic and lexical features for punctuation prediction was done by Wang et al.~\cite{Wang2012}. 20 | The dynamic conditional random fields achieve the best result on a broadcast news corpus (F1-measure of 42.8\%). 21 | 22 | In this paper, we present a new approach: predicting punctuation using deep learning. 23 | To the best of our knowledge, such an approach has not been tried before. 24 | We learn two individual models, one based on lexical features and the other one based on prosodic features. 25 | In the end, the predictions of both models are fused.
-------------------------------------------------------------------------------- /paper/img/audio_parameter_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/audio_parameter_eval.png -------------------------------------------------------------------------------- /paper/img/demo_l.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/demo_l.png -------------------------------------------------------------------------------- /paper/img/demo_l_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/demo_l_a.png -------------------------------------------------------------------------------- /paper/img/fusion_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/fusion_1.pdf -------------------------------------------------------------------------------- /paper/img/fusion_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/fusion_2.pdf -------------------------------------------------------------------------------- /paper/img/fusion_eval.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/fusion_eval.pdf -------------------------------------------------------------------------------- /paper/img/fusion_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/fusion_eval.png -------------------------------------------------------------------------------- /paper/img/hpi_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/hpi_logo.png -------------------------------------------------------------------------------- /paper/img/net_acoustic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/net_acoustic.pdf -------------------------------------------------------------------------------- /paper/img/net_lexical.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/net_lexical.pdf -------------------------------------------------------------------------------- /paper/img/overview_accoustic.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/overview_accoustic.pdf -------------------------------------------------------------------------------- /paper/img/overview_lexical.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/overview_lexical.pdf -------------------------------------------------------------------------------- /paper/img/sliding_window.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/sliding_window.pdf -------------------------------------------------------------------------------- /paper/img/window_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/window_eval.png -------------------------------------------------------------------------------- /paper/img/window_pos_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/window_pos_eval.png -------------------------------------------------------------------------------- /paper/img/window_wiki_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/img/window_wiki_eval.png -------------------------------------------------------------------------------- /paper/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper,12pt,pagesize,headsepline,bibliography=totoc,titlepage]{scrartcl} 2 | \usepackage[utf8]{inputenc} 3 | % \usepackage[T1]{fontenc} 4 | \usepackage{mathptmx} 5 | \usepackage[scaled=.90]{helvet} 6 | \usepackage{courier} 7 | \usepackage{amsmath,amsthm,amsfonts,graphicx,caption} 8 | \usepackage{hyperref} 9 | \usepackage{ae,aecompl} 10 | \usepackage{todonotes} 11 | \usepackage{subcaption} 12 | \usepackage{listings} 13 | 14 | \lstset { 15 | backgroundcolor=\color{white}, 16 | breakatwhitespace=false, 17 | breaklines=true, 18 | numbers=left, 19 | frame=single, 20 | title=\lstname, 21 | basicstyle=\footnotesize 22 | } 23 | 24 | % \pagestyle{headings} 25 | \headsep4mm % Abstand der Kopfzeile vom Text 26 | % \typearea[current]{current} 27 | 28 | \title{ 29 | \includegraphics*[width=0.4\textwidth]{img/hpi_logo.png}\\ 30 | \vspace{24pt} 31 | Sentence Boundary Detection 32 | } 33 | \subtitle{ 34 | Seminar\\ 35 | Practical Applications of Multimedia Retrieval\\ 36 | Fall Semester 2015/2016 37 | } 38 | \author{ 39 | Tanja Bergmann, Joseph Bethge, Stefan Bunk, Ricarda Schüler\\[12pt] 40 | Supervisor:\\ 41 | Xiaoyin Che\\ 42 | Dr. Haojin Yang\\ 43 | Prof. Dr. 
Christoph Meinel 44 | } 45 | \date{\today} 46 | 47 | \begin{document} 48 | \maketitle 49 | \tableofcontents 50 | \newpage 51 | 52 | \section{Introduction} 53 | \label{sec:introduction} 54 | \input{chapters/introduction} 55 | 56 | \section{Related Work} 57 | \label{sec:related_work} 58 | \input{chapters/related_work} 59 | 60 | \section{Training Data} 61 | \label{sec:training_data} 62 | \input{chapters/data} 63 | 64 | \section{Lexical Model} 65 | \label{sec:lexical_model} 66 | \input{chapters/lexical_model} 67 | 68 | \section{Acoustic Model} 69 | \label{sec:acoustic_model} 70 | \input{chapters/acoustic_model} 71 | 72 | \section{Fusion} 73 | \label{sec:fusion} 74 | \input{chapters/fusion} 75 | 76 | \section{Demo Tool} 77 | \label{sec:demo} 78 | \input{chapters/demo} 79 | 80 | \section{Conclusion and Future Work} 81 | \label{sec:future} 82 | \input{chapters/future} 83 | 84 | \bibliographystyle{plain} 85 | \bibliography{main} 86 | 87 | %\newpage 88 | %\appendix 89 | %\section{Appendix} 90 | %We appended the following files for reference: 91 | %\begin{itemize} 92 | % \item lexical-solver.prototxt, the configuration of the solver (lexical model) 93 | % \item lexical-net.prototxt, our net configuration (lexical model) 94 | % \item acoustic-solver.prototxt, the configuration of the solver (acoustic model) 95 | % \item acoustic-net.prototxt, our net configuration (acoustic model) 96 | %\end{itemize} 97 | % 98 | %\subsection{lexical-solver.prototxt} 99 | %\lstinputlisting[caption={lexical-solver.prototxt}, label={lst:lexical-solver.prototxt}]{../net/solver.prototxt} 100 | %\newpage 101 | % 102 | %\subsection{lexical-net.prototxt} 103 | %\lstinputlisting[caption={lexical-net.prototxt}, label={lst:lexical-net.prototxt}]{../net/net.prototxt} 104 | %\newpage 105 | % 106 | %\subsection{acoustic-solver.prototxt} 107 | %\lstinputlisting[caption={acoustic-solver.prototxt}, label={lst:acoustic-solver.prototxt}]{../net-audio/solver.prototxt} 108 | %\newpage 109 | % 110 | %\subsection{acoustic-net.prototxt} 111 | %\lstinputlisting[caption={acoustic-net.prototxt}, label={lst:acoustic-net.prototxt}]{../net-audio/net.prototxt} 112 | 113 | %\newpage %for more appended files 114 | 115 | \end{document} 116 | -------------------------------------------------------------------------------- /paper/notes/Makefile: -------------------------------------------------------------------------------- 1 | PDFVIEWER=evince 2 | FILE=results 3 | 4 | show: build 5 | $(PDFVIEWER) $(FILE).pdf & 6 | 7 | plot: 8 | cd plots && build 9 | 10 | when-changed: 11 | @clear && when-changed $(FILE).tex -c 'printf "\033c" && echo "Building" && make -s build && echo "Succeeded"' 12 | 13 | convert: 14 | cd plots && make convert 15 | 16 | build: 17 | @mkdir -p .output 18 | @pdflatex -interaction=nonstopmode -halt-on-error -output-directory .output -jobname=$(FILE) $(FILE).tex 1>&2 > .output/error 19 | @mv .output/$(FILE).pdf . 20 | 21 | error: 22 | @vim + .output/error 23 | 24 | 25 | bibtex: 26 | @cp $(FILE).bib .output/ && cd .output && bibtex $(FILE) && cd .. 
27 | 28 | clean: 29 | rm .output/*.aux .output/*.log 30 | -------------------------------------------------------------------------------- /paper/notes/auswertung.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/notes/auswertung.ods -------------------------------------------------------------------------------- /paper/notes/auswertung2.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/notes/auswertung2.ods -------------------------------------------------------------------------------- /paper/notes/auswertung_fusion.txt: -------------------------------------------------------------------------------- 1 | fusion,precision[NONE],precision[PERIOD],recall[NONE],recall[PERIOD],f1[NONE],f1[PERIOD],support[NONE],support[PERIOD] 2 | ('BaselineLexicalFusion',) 3 | 0.977,0.645,0.984,0.554,0.981,0.596,11611.000,599.000 4 | ('BaselineAudioFusion',) 5 | 0.981,0.604,0.968,0.722,0.974,0.658,12169.000,832.000 6 | ('ThresholdFusion[AudioThresh: 0.50, LexicalThresh: 0.80]',) 7 | 0.981,0.672,0.979,0.701,0.980,0.686,11647.000,722.000 8 | ('ThresholdFusion[AudioThresh: 0.50, LexicalThresh: 0.90]',) 9 | 0.983,0.678,0.978,0.733,0.981,0.704,11647.000,722.000 10 | ('ThresholdFusion[AudioThresh: 0.60, LexicalThresh: 0.80]',) 11 | 0.981,0.672,0.979,0.695,0.980,0.683,11646.000,721.000 12 | ('ThresholdFusion[AudioThresh: 0.60, LexicalThresh: 0.90]',) 13 | 0.983,0.678,0.979,0.727,0.981,0.701,11646.000,721.000 14 | ('ThresholdFusion[AudioThresh: 0.70, LexicalThresh: 0.80]',) 15 | 0.981,0.676,0.980,0.689,0.980,0.682,11644.000,717.000 16 | ('ThresholdFusion[AudioThresh: 0.70, LexicalThresh: 0.90]',) 17 | 0.983,0.682,0.979,0.721,0.981,0.701,11644.000,717.000 18 | ('BalanceFusion[BalanceValue: 0.10]',) 19 | 0.981,0.788,0.993,0.593,0.987,0.677,11893.000,553.000 20 | ('BalanceFusion[BalanceValue: 0.20]',) 21 | 0.982,0.792,0.993,0.599,0.987,0.682,11916.000,553.000 22 | ('BalanceFusion[BalanceValue: 0.30]',) 23 | 0.982,0.807,0.993,0.608,0.988,0.694,11944.000,551.000 24 | ('BalanceFusion[BalanceValue: 0.40]',) 25 | 0.982,0.820,0.994,0.611,0.988,0.701,11973.000,553.000 26 | ('BalanceFusion[BalanceValue: 0.50]',) 27 | 0.980,0.803,0.993,0.591,0.987,0.681,11916.000,580.000 28 | ('BalanceFusion[BalanceValue: 0.60]',) 29 | 0.979,0.717,0.989,0.569,0.984,0.634,11773.000,587.000 30 | ('BalanceFusion[BalanceValue: 0.70]',) 31 | 0.978,0.675,0.986,0.568,0.982,0.617,11698.000,595.000 32 | ('BalanceFusion[BalanceValue: 0.80]',) 33 | 0.978,0.669,0.986,0.568,0.982,0.614,11664.000,597.000 34 | ('BalanceFusion[BalanceValue: 0.90]',) 35 | 0.978,0.653,0.985,0.564,0.981,0.605,11635.000,598.000 36 | 37 | -------------------------------------------------------------------------------- /paper/notes/auswertung_onlypos.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/notes/auswertung_onlypos.ods -------------------------------------------------------------------------------- /paper/notes/auswertung_wiki.ods: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/notes/auswertung_wiki.ods -------------------------------------------------------------------------------- /paper/notes/fusion_eval.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/paper/notes/fusion_eval.ods -------------------------------------------------------------------------------- /paper/notes/fusion_eval.txt: -------------------------------------------------------------------------------- 1 | fusion precision[NONE] precision[PERIOD] recall[NONE] recall[PERIOD] f1[NONE] f1[PERIOD] support[NONE] support[PERIOD] 2 | BaselineLexicalFusion 0.971 0.635 0.986 0.460 0.979 0.534 11225.000 602.000 3 | BaselineAudioFusion 0.980 0.606 0.967 0.722 0.973 0.659 11797.000 832.000 4 | ThresholdFusion[AudioThresh: 0.50, LexicalThresh: 0.80] 0.976 0.682 0.981 0.629 0.979 0.655 11254.000 731.000 5 | ThresholdFusion[AudioThresh: 0.50, LexicalThresh: 0.90] 0.979 0.693 0.981 0.672 0.980 0.682 11254.000 731.000 6 | ThresholdFusion[AudioThresh: 0.60, LexicalThresh: 0.80] 0.976 0.681 0.981 0.625 0.978 0.652 11254.000 728.000 7 | ThresholdFusion[AudioThresh: 0.60, LexicalThresh: 0.90] 0.979 0.691 0.981 0.668 0.980 0.679 11254.000 728.000 8 | ThresholdFusion[AudioThresh: 0.70, LexicalThresh: 0.80] 0.976 0.684 0.982 0.620 0.979 0.650 11252.000 726.000 9 | ThresholdFusion[AudioThresh: 0.70, LexicalThresh: 0.90] 0.978 0.694 0.981 0.663 0.980 0.678 11252.000 726.000 10 | BalanceFusion[BalanceValue: 0.10] 0.980 0.784 0.993 0.557 0.987 0.652 11521.000 522.000 11 | BalanceFusion[BalanceValue: 0.20] 0.981 0.796 0.993 0.570 0.987 0.664 11538.000 519.000 12 | BalanceFusion[BalanceValue: 0.30] 0.981 0.813 0.994 0.569 0.988 0.670 11568.000 518.000 13 | BalanceFusion[BalanceValue: 0.40] 0.981 0.826 0.995 0.569 0.988 0.674 11599.000 527.000 14 | BalanceFusion[BalanceValue: 0.50] 0.977 0.807 0.994 0.523 0.985 0.635 11551.000 566.000 15 | BalanceFusion[BalanceValue: 0.60] 0.974 0.727 0.991 0.480 0.982 0.578 11394.000 581.000 16 | BalanceFusion[BalanceValue: 0.70] 0.973 0.671 0.988 0.468 0.980 0.552 11314.000 594.000 17 | BalanceFusion[BalanceValue: 0.80] 0.972 0.660 0.987 0.465 0.980 0.546 11270.000 598.000 18 | BalanceFusion[BalanceValue: 0.90] 0.972 0.647 0.986 0.463 0.979 0.539 11247.000 601.000 19 | -------------------------------------------------------------------------------- /paper/notes/results.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{graphicx} 3 | 4 | \begin{document} 5 | 6 | Evaluation of results: 7 | \begin{itemize} 8 | \item F-measure for each class is calculated. 9 | \item Harmonic mean for all F-measures is total score (higher is better). 10 | \end{itemize} 11 | 12 | \begin{figure}[ht] 13 | \centering 14 | \includegraphics[width=\textwidth]{diagram.png} 15 | \caption{Harmonic mean between all f1 scores for all classes. \emph{2/5} means window size of five and punctuation is tested at position two. 
If \emph{wi} is in the label, it uses Wikipedia training data.} 16 | \label{fig2} 17 | \end{figure} 18 | 19 | Comparison between experiments with and without POS tagging (other than that, they have the same configurations): 20 | \begin{itemize} 21 | \item With POS tagging: 0.305 22 | \item Without POS tagging: 0.275 23 | \end{itemize} 24 | 25 | Comparison between experiments with and without Wikipedia data (other than that, they have the same configurations): 26 | \begin{itemize} 27 | \item Without Wikipedia: 0.385 28 | \item With Wikipedia data: 0.252 29 | \end{itemize} 30 | 31 | \end{document} 32 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | Before executing any scripts on the server, please execute `. ./use_python p2` in `/home/ms2015t3/sentence-boundary-detection-nn`: 2 | 3 | ``` 4 | cd /home/ms2015t3/sentence-boundary-detection-nn 5 | . ./use_python p2 6 | ``` 7 | 8 | Also, make sure that the directory `/home/ms2015t3/sentence-boundary-detection-nn/python` is added to the `PYTHONPATH` environment variable: 9 | 10 | ``` 11 | export PYTHONPATH="${PYTHONPATH}:/home/ms2015t3/sentence-boundary-detection-nn/python" 12 | ``` 13 | 14 | Also, you have to set the environment variable `SENTENCE_HOME`, because many scripts rely on it: 15 | 16 | ``` 17 | export SENTENCE_HOME="/home/ms2015t3/sentence-boundary-detection-nn" 18 | ``` 19 | 20 | To execute all python scripts in this folder, please use **this folder as the working directory**. 21 | 22 | ## Creating LevelDB for lexical model 23 | 24 | To build a LevelDB for the lexical model, please execute: 25 | ``` 26 | python sbd_leveldb/training_instance_generator.py config.ini 27 | ``` 28 | The `config.ini` file contains all parameters that are needed during the creation of the training instances. 29 | It also contains the training and test files that should be used. 30 | The data root directory is set to `/mnt/naruto/sentence/data`. 31 | All training and test files should be located in this folder. 32 | The `config.ini.default` file contains an example of a valid `config.ini` file. 33 | 34 | The created LevelDB can be found under `/mnt/naruto/sentence/leveldbs`. 35 | 36 | ## Creating LevelDB for acoustic model 37 | 38 | To build a LevelDB for the acoustic model, please execute: 39 | ``` 40 | python sbd_leveldb/audio_training_instance_generator.py config.ini 41 | ``` 42 | The `config.ini` file contains all parameters that are needed during the creation of the training instances. 43 | The parameter `lexical` needs to be set to `false`. 44 | 45 | It also contains the training and test files that should be used. 46 | The data root directory is set to `/mnt/naruto/sentence/data`. 47 | All training and test files should be located in this folder. 48 | The corresponding `.pitch` and `.energy` files should be in the same folder as the `.ctm` files. 49 | Also, the `.pitch` and `.energy` files should be named `<group_name>_talkid<talk_id>.[pitch|energy]`. 50 | The `<group_name>` and `<talk_id>` values are extracted from the `.ctm` files. 51 | To create the `.pitch` and `.energy` files, you can use the `pitch_and_energy.sh` script under `/mnt/naruto/sentence/data/audio`. 52 | 53 | The created LevelDB can be found under `/mnt/naruto/sentence/leveldbs`.
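For example, for a hypothetical `talks.ctm` file containing a talk with group name `ted` and talk id `1` (these names are only illustrative; the actual values come from the talks inside your `.ctm` file), the expected layout next to the `.ctm` file would be:

```
data/
    talks.ctm
    ted_talkid1.pitch
    ted_talkid1.energy
```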
54 | -------------------------------------------------------------------------------- /python/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/python/common/__init__.py -------------------------------------------------------------------------------- /python/common/argparse_util.py: -------------------------------------------------------------------------------- 1 | def is_valid_file(parser, arg, mode='r'): 2 | try: 3 | f = open(arg, mode) 4 | f.close() 5 | return arg 6 | except IOError: 7 | parser.error('The file %s can not be opened!' % arg) 8 | -------------------------------------------------------------------------------- /python/common/send_email.py: -------------------------------------------------------------------------------- 1 | import smtplib, ConfigParser, argparse 2 | 3 | config_path = "email.ini" 4 | 5 | config = ConfigParser.ConfigParser() 6 | print("Reading email config: %s" % config_path) 7 | config.read(config_path) 8 | 9 | from_address = config.get('credentials','username') 10 | to_address_list = config.get('adresses','to').split(",") 11 | username = config.get('credentials','username') 12 | password = config.get('credentials','password') 13 | 14 | class EmailNotification(object): 15 | def __init__(self, subject, message): 16 | self.subject = subject 17 | self.message = message 18 | 19 | def __format_message(self): 20 | message_with_header = "\r\n".join([ 21 | "From: %s" % from_address, 22 | "To: %s" % ",".join(to_address_list), 23 | "Subject: [PAMuR] %s" % self.subject, 24 | "", 25 | "%s" % self.message 26 | ]) 27 | return message_with_header 28 | 29 | def send(self): 30 | server = smtplib.SMTP('smtp.gmail.com:587') 31 | server.ehlo() 32 | server.starttls() 33 | server.login(username, password) 34 | msg = self.__format_message() 35 | server.sendmail(from_address, to_address_list, msg) 36 | server.quit() 37 | 38 | def attach_files(self, files): 39 | for filename in files: 40 | self.message += "\r\n\r\n========== %s ==========\r\n" % filename 41 | with open(filename, "r") as file_: 42 | for line in file_: 43 | self.message += "> %s" % line 44 | self.message += "\r\n\r\n========== %s ==========\r\n" % filename 45 | 46 | def main(args): 47 | e = EmailNotification(args.subject, args.message) 48 | e.attach_files(args.files) 49 | # print e.message 50 | e.send() 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='Send a notification email') 54 | parser.add_argument('subject', help='subject of the email') 55 | parser.add_argument('message', help='message of the email') 56 | parser.add_argument('files', help='files which are appended after the text', nargs='*') 57 | args = parser.parse_args() 58 | main(args) 59 | -------------------------------------------------------------------------------- /python/config.ini.default: -------------------------------------------------------------------------------- 1 | [data] 2 | normalize_class_distribution = false 3 | train_files = ted/2010-1.xml,ted/2010-2.xml,ted/2012.xml,ted/2013.xml 4 | test_files = ted/2011.xml 5 | 6 | [word_vector] 7 | # if set to 'avg' the average word vector is taken, 8 | # otherwise the given word is taken as key error vector. 
9 | key_error_vector = this 10 | vector_file = google 11 | 12 | [windowing] 13 | window_size = 5 14 | punctuation_position = 3 15 | 16 | [features] 17 | use_question_mark = true 18 | pos_tagging = false 19 | number_replacement = true 20 | 21 | [model] 22 | lexical = true 23 | -------------------------------------------------------------------------------- /python/console_demo/README.md: -------------------------------------------------------------------------------- 1 | ## Demo 2 | 3 | Please check the [main readme file](../README.md) for the proper demo execution. -------------------------------------------------------------------------------- /python/console_demo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/python/console_demo/__init__.py -------------------------------------------------------------------------------- /python/console_demo/demo.py: -------------------------------------------------------------------------------- 1 | import argparse, numpy, caffe 2 | 3 | from preprocessing.nlp_pipeline import NlpPipeline 4 | from preprocessing.sliding_window import SlidingWindow, PUNCTUATION_POS 5 | from preprocessing.text import Sentence 6 | from preprocessing.word2vec_file import Word2VecFile 7 | 8 | classes = ["NONE", "COMMA", "PERIOD", "QUESTION"] 9 | classes_as_string = ["", ",", ".", "?"] 10 | 11 | class InputText(object): 12 | 13 | def __init__(self, text): 14 | self.text = text 15 | 16 | self.nlp_pipeline = NlpPipeline() 17 | self.gold_tokens = self.nlp_pipeline.parse_text(self.text) 18 | 19 | def get_gold_tokens(self): 20 | return self.gold_tokens 21 | 22 | 23 | class Demo(object): 24 | """parses demo data, feeds to a trained model and returns predictions""" 25 | 26 | def __init__(self, net, word2vec): 27 | self.word2vec = word2vec 28 | self.net = net 29 | 30 | def get_not_covered_words(self): 31 | return self.word2vec.not_covered_words 32 | 33 | def predict_text(self, text): 34 | input_text = InputText(text) 35 | 36 | for token in input_text.gold_tokens: 37 | if not token.is_punctuation(): 38 | token.word_vec = self.word2vec.get_vector(token.word.lower()) 39 | 40 | slidingWindow = SlidingWindow() 41 | instances = slidingWindow.list_windows(input_text) 42 | 43 | punctuations = [] 44 | for instance in instances: 45 | probs = self.predict_caffe(instance) 46 | #print instance 47 | #self.show_probs(probs) 48 | punctuations.append(numpy.argmax(probs)) 49 | #print punctuations 50 | 51 | print(">>> Sentence with boundaries:") 52 | for i in range(len(punctuations) - 1, -1, -1): 53 | input_text.gold_tokens.insert(i + PUNCTUATION_POS, classes_as_string[punctuations[i]]) 54 | print "{", 55 | for t in input_text.gold_tokens: 56 | print t, 57 | print "}" 58 | 59 | def predict_caffe(self, instance): 60 | transformer = caffe.io.Transformer({'data': self.net.blobs['data'].data.shape}) 61 | 62 | batchsize = 1 63 | self.net.blobs['data'].reshape(batchsize,1,5,300) 64 | reshaped_array = numpy.expand_dims(instance.get_array(), axis=0) 65 | 66 | self.net.blobs['data'].data[...] 
= reshaped_array 67 | 68 | out = self.net.forward() 69 | return out['softmax'] 70 | 71 | def show_probs(self, probs): 72 | for i in range(0, len(classes)): 73 | print classes[i], ":", probs[0][i] 74 | 75 | 76 | def main_no_loading(net, vector, datafile=None, show=False): 77 | if show: 78 | classes_as_string[0] = "_" 79 | caffe.set_mode_cpu() 80 | d = Demo(net, vector) 81 | if datafile: 82 | f = open(datafile) 83 | text = f.read() 84 | f.close() 85 | d.predict_text(text) 86 | else: 87 | while (1): 88 | text = raw_input("Please enter some text without punctuation for prediction (enter q to quit):") 89 | if text == "q": 90 | return 91 | d.predict_text(text) 92 | 93 | def main(vectorfile, caffeproto, caffemodel, datafile=None, show=False): 94 | vector = Word2VecFile(vectorfile) 95 | net = caffe.Net(caffeproto, caffemodel, caffe.TEST) 96 | main_no_loading(net, vector, datafile, show) 97 | 98 | if __name__ == '__main__': 99 | parser = argparse.ArgumentParser(description='Predict sentence boundaries for a text using a trained caffe model.') 100 | parser.add_argument('-d','--datafile', help='path to file with text, text can be entered interactively if omitted', dest='datafile') 101 | parser.add_argument('vectorfile', help='path to word vector binary') 102 | parser.add_argument('caffeproto', help='path to caffe proto file') 103 | parser.add_argument('caffemodel', help='path to caffe model file') 104 | parser.add_argument('-s','--show', help='show the non-existing punctuation with an underscore', action='store_true', dest='show') 105 | args = parser.parse_args() 106 | main(show=args.show, vectorfile=args.vectorfile, caffeproto=args.caffeproto, caffemodel=args.caffemodel, datafile=args.datafile) 107 | -------------------------------------------------------------------------------- /python/console_demo/demo_preparation.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | 3 | import console_demo.demo as d 4 | from preprocessing.word2vec_file import Word2VecFile 5 | 6 | vector = Word2VecFile('models/GoogleNews-vectors-negative300.bin') 7 | net = caffe.Net('models/deploy.prototxt', 'models/model.caffemodel', caffe.TEST) 8 | 9 | -------------------------------------------------------------------------------- /python/demo_data/.gitignore: -------------------------------------------------------------------------------- 1 | *.* 2 | !*.gitignore 3 | !download_all.sh 4 | !download_models.sh 5 | !download_google_vector.sh 6 | !folders.txt 7 | !models.txt 8 | -------------------------------------------------------------------------------- /python/demo_data/audio_examples/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /python/demo_data/audio_models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /python/demo_data/download_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | user="ms2015t3" 4 | host="172.16.23.193" 5 | path="/home/ms2015t3/demo_data" 6 | 7 | while IFS='' read -r folder || [[ -n "$folder" ]]; do 8 | echo "downloading folder: $folder..." 9 | mkdir "$folder" -p 10 | sftp -r "$user@$host:$path/$folder" .
11 | echo -e "*\n!.gitignore" > "$folder/.gitignore" 12 | done < "folders.txt" 13 | -------------------------------------------------------------------------------- /python/demo_data/download_google_vector.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sftp ms2015t3@172.16.23.193:/home/ms2015t3/ms-2015-t3/GoogleNews-vectors-negative300.bin . 3 | -------------------------------------------------------------------------------- /python/demo_data/download_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | user="ms2015t3" 4 | host="172.16.23.193" 5 | path="/home/ms2015t3/sentence-boundary-detection-nn/net/experiments" 6 | 7 | while IFS='' read -r model || [[ -n "$model" ]]; do 8 | echo "downloading model: $model..." 9 | mkdir "lexical_models/$model" -p 10 | sftp -r "$user@$host:$path/$model/net.prototxt" "lexical_models/$model/" 11 | sftp -r "$user@$host:$path/$model/*.ini" "lexical_models/$model/" 12 | sftp -r "$user@$host:$path/$model/*.caffemodel" "lexical_models/$model/" 13 | done < "lexical_models.txt" 14 | -------------------------------------------------------------------------------- /python/demo_data/folders.txt: -------------------------------------------------------------------------------- 1 | audio_examples 2 | audio_models 3 | lexical_models 4 | text_data 5 | -------------------------------------------------------------------------------- /python/demo_data/lexical_models.txt: -------------------------------------------------------------------------------- 1 | 20160108-025006_google_ted_wiki_window-5-4_pos-false_qm-false_balanced-false_nr-rep-true_word-this 2 | 20160108-032648_google_ted_wiki_window-5-4_pos-true_qm-false_balanced-false_nr-rep-true_word-this 3 | 20160108-072841_google_ted_wiki_window-8-4_pos-false_qm-false_balanced-false_nr-rep-true_word-this 4 | 20160108-081712_google_ted_wiki_window-8-4_pos-true_qm-false_balanced-false_nr-rep-true_word-this 5 | -------------------------------------------------------------------------------- /python/demo_data/lexical_models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /python/demo_data/text_data/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | *.result 4 | -------------------------------------------------------------------------------- /python/email.ini.default: -------------------------------------------------------------------------------- 1 | [credentials] 2 | username=username@gmail.com 3 | password=password 4 | 5 | [adresses] 6 | to=other1@gmail.com,other2@gmail.com 7 | -------------------------------------------------------------------------------- /python/evaluation/evaluation.py: -------------------------------------------------------------------------------- 1 | import common.sbd_config as sbd 2 | import argparse, os 3 | from parsing.audio_parser import AudioParser 4 | from sbd_classification.util import * 5 | from sbd_classification.classification_input import InputText, InputAudio 6 | from sbd_classification.fusion import get_evaluation_fusion_list 7 | from preprocessing.word2vec_file import Word2VecFile 8 | from preprocessing.tokens import Punctuation 9 | 10 | from sklearn.metrics import precision_recall_fscore_support 11 | 12 | class Evaluation(object): 13 | 14 | def 
__init__(self, talks): 15 | self.talks = talks 16 | self.tokens = [token for talk in self.talks for token in talk.get_tokens()] 17 | 18 | def evaluate(self, lexical_model_folder, audio_model_folder, vector): 19 | print("Evaluating %s and %s ..." % (lexical_model_folder, audio_model_folder)) 20 | 21 | lexical_classifier = load_lexical_classifier(lexical_model_folder, vector) 22 | audio_classifier = load_audio_classifier(audio_model_folder) 23 | 24 | # get audio probabilities 25 | self._load_config(audio_model_folder) 26 | input_audio = InputAudio(self.talks) 27 | audio_probs = audio_classifier.predict(input_audio) 28 | 29 | # get lexical probabilities 30 | self._load_config(lexical_model_folder) 31 | input_text = InputText(self.talks) 32 | lexical_probs = lexical_classifier.predict(input_text) 33 | 34 | # get config parameter 35 | (lexical_window_size, lexical_punctuation_pos, pos_tagging) = lexical_classifier.get_lexical_parameter() 36 | (audio_window_size, audio_punctuation_pos) = audio_classifier.get_audio_parameter() 37 | 38 | fusions = get_evaluation_fusion_list(lexical_punctuation_pos, lexical_window_size, audio_punctuation_pos, audio_window_size) 39 | 40 | assert(len(input_audio.tokens) == len(input_text.tokens)) 41 | print("fusion,precision[NONE],precision[PERIOD],recall[NONE],recall[PERIOD],f1[NONE],f1[PERIOD],support[NONE],support[PERIOD]") 42 | for fusion in fusions: 43 | print(str(fusion),) 44 | fusion_probs = fusion.fuse(len(input_text.tokens), lexical_probs, audio_probs) 45 | 46 | exp_actual = self.get_expected_actual(fusion_probs, self.tokens) 47 | self.calculate_evaluation_metrics(exp_actual) 48 | 49 | def get_expected_actual(self, fusion_probs, tokens): 50 | expected_actual = [] 51 | word_tokens = [token for token in tokens if not token.is_punctuation()] 52 | 53 | assert(len(word_tokens) == len(fusion_probs)) 54 | tokens_idx = 1 55 | for i in range(len(fusion_probs)): 56 | actual = fusion_probs[i].index(max(fusion_probs[i])) 57 | is_punctuation = tokens[tokens_idx].is_punctuation() 58 | expected = tokens[tokens_idx].punctuation_type.value if is_punctuation else 0 59 | if is_punctuation: 60 | tokens_idx += 1 61 | tokens_idx += 1 62 | if actual == Punctuation.COMMA.value: 63 | continue 64 | expected_actual.append((expected, actual)) 65 | 66 | return expected_actual 67 | 68 | def calculate_evaluation_metrics(self, expected_actual): 69 | expected = map(lambda x: x[0], expected_actual) 70 | actual = map(lambda x: x[1], expected_actual) 71 | results = precision_recall_fscore_support(expected, actual) 72 | print("%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f" % ( 73 | results[0][0], 74 | results[0][1], 75 | results[1][0], 76 | results[1][1], 77 | results[2][0], 78 | results[2][1], 79 | results[3][0], 80 | results[3][1] 81 | ) 82 | ) 83 | 84 | def _load_config(self, model_folder): 85 | config_file, caffemodel_file, net_proto = get_filenames(model_folder) 86 | sbd.SbdConfig(config_file) 87 | 88 | 89 | if __name__ == '__main__': 90 | parser = argparse.ArgumentParser(description='evaluates the fusion.') 91 | parser.add_argument('ctm_file', help="path to ctm_file", default="evaluation_data/data/tst2011_0.ctm", nargs='?') 92 | parser.add_argument('vectorfile', help='the google news word vector', default='evaluation_data/GoogleNews-vectors-negative300.bin', nargs='?') 93 | parser.add_argument('lexical_model_folder', help="path to lexical models", default="evaluation_data/lexical_models", nargs='?') 94 | parser.add_argument('audio_model_folder', help="path to audio models", 
default="evaluation_data/audio_models", nargs='?') 95 | parser.add_argument('--release', help="whether to test in release mode", action='store_true') 96 | args = parser.parse_args() 97 | 98 | if args.release: 99 | vector = Word2VecFile(args.vectorfile) 100 | else: 101 | vector = None 102 | 103 | # get all talks 104 | print("Reading all talks ...") 105 | audio_parser = AudioParser() 106 | talks = audio_parser.parse(args.ctm_file) 107 | 108 | 109 | # get all lexical models 110 | lexical_models = [] 111 | for dirname, dirnames, filenames in os.walk(args.lexical_model_folder): 112 | for subdirname in dirnames: 113 | lexical_models.append(os.path.join(dirname, subdirname)) 114 | 115 | # get all audio models 116 | audio_models = [] 117 | for dirname, dirnames, filenames in os.walk(args.audio_model_folder): 118 | for subdirname in dirnames: 119 | audio_models.append(os.path.join(dirname, subdirname)) 120 | 121 | 122 | # evaluate all combination of models 123 | evaluation = Evaluation(talks) 124 | for lexical_model in lexical_models: 125 | for audio_model in audio_models: 126 | evaluation.evaluate(lexical_model, audio_model, vector) 127 | 128 | -------------------------------------------------------------------------------- /python/evaluation_data/.gitignore: -------------------------------------------------------------------------------- 1 | *.* 2 | !*.gitignore 3 | !download_all.sh 4 | !download_models.sh 5 | !folders.txt 6 | !models.txt 7 | -------------------------------------------------------------------------------- /python/evaluation_data/download_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | user="ms2015t3" 4 | host="172.16.23.193" 5 | 6 | mkdir -p data 7 | scp "$user@$host:/mnt/naruto/sentence/data/audio/tst2011_*.{ctm,energy,pitch}" data/ 8 | 9 | mkdir -p audio_models 10 | scp -r "$user@$host:/home/ms2015t3/sentence-boundary-detection-nn/net-audio/experiments/20160126-053506_audio_window-8-4" audio_models/ 11 | 12 | mkdir -p lexical_models 13 | scp -r "$user@$host:/home/ms2015t3/sentence-boundary-detection-nn/net/experiments/20160111-131832_google_ted_window-8-4_pos-true_qm-false_balanced-false_nr-rep-true_word-this" lexical_models/ 14 | -------------------------------------------------------------------------------- /python/evaluation_data/folders.txt: -------------------------------------------------------------------------------- 1 | audio_models 2 | lexical_models 3 | data 4 | -------------------------------------------------------------------------------- /python/experiments/README.md: -------------------------------------------------------------------------------- 1 | If you want to train multiple configurations on multiple databases you can use the convenience scripts in this folder. 2 | 3 | First you will need to create a root folder, where all databases and experiment data is going to be saved, e.g. `/some/path`. 4 | Then create a `1_open`, a `2_databased`, a `3_trained`, a `4_database_failed` and a `5_training_failed` folder inside that path (e.g. `/some/path/1_open`). 5 | Insert all config files you want to test into the `1_open` folder. 6 | Note that you should give these config files **meaningful filenames**, as their filenames are used for identification purposes later on. 7 | 8 | # Creating Multiple Databases 9 | 10 | For lexical data use `databases.sh` and for acoustic data use `audio_databases.sh`. 11 | 12 | * Pass the original root folder (e.g. `/some/path`) to either of these scripts. 
13 | * All config files for which a database was created successfully, will be moved to the subfolder `2_databased`. 14 | * If any fail, they will be moved to a subfolder `4_database_failed`. 15 | 16 | # Training on Multiple Databases 17 | 18 | For lexical data use `training.sh` and for acoustic data use `audio_training.sh`. 19 | 20 | * Pass the original root folder (e.g. `/some/path`) to either of these scripts. 21 | * This script will access all config files in the subfolder `2_databased`, and automatically move successful trained files to a subfolder `3_trained` 22 | * If any fail, they will be moved to a subfolder `5_training_failed`. 23 | 24 | The final experiment names will be taken from the basename of the config file. 25 | -------------------------------------------------------------------------------- /python/experiments/audio_databases.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ "$#" -ne 1 ]; then 4 | echo "Illegal number of parameters!" 5 | echo "./experiments.sh " 6 | exit 7 | fi 8 | 9 | CONFIG_FOLDER=$1 10 | 11 | if ! [[ -d $CONFIG_FOLDER ]]; then 12 | echo "$CONFIG_FOLDER is not a directory!" 13 | exit 14 | fi 15 | 16 | # Abort on first error 17 | set -e 18 | 19 | source $SENTENCE_HOME/use_python p2 20 | 21 | for CONFIG_FILE in "$CONFIG_FOLDER"/1_open/* 22 | do 23 | cd $SENTENCE_HOME/python/ 24 | CONFIG=$(basename ${CONFIG_FILE}) 25 | CONFIG="${CONFIG%.*}" 26 | echo "#################### Creating database with $CONFIG ####################" 27 | python sbd_leveldb/audio_training_instance_generator.py $CONFIG_FILE 28 | 29 | if [ $? -eq 0 ]; then 30 | echo "#################### Moving to 2_databased ####################" 31 | mv $CONFIG_FOLDER/1_open/$CONFIG.ini $CONFIG_FOLDER/2_databased/ 32 | else 33 | echo "#################### Moving to 4_database_failed ####################" 34 | mv $CONFIG_FOLDER/1_open/$CONFIG.ini $CONFIG_FOLDER/4_database_failed/ 35 | fi 36 | 37 | done 38 | -------------------------------------------------------------------------------- /python/experiments/audio_training.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ "$#" -ne 1 ]; then 4 | echo "Illegal number of parameters!" 5 | echo "./experiments.sh " 6 | exit 7 | fi 8 | 9 | CONFIG_FOLDER=$1 10 | 11 | if ! [[ -d $CONFIG_FOLDER ]]; then 12 | echo "$CONFIG_FOLDER is not a directory!" 13 | exit 14 | fi 15 | 16 | source $SENTENCE_HOME/use_python p2 17 | 18 | for CONFIG_FILE in "$CONFIG_FOLDER"/2_databased/* 19 | do 20 | cd $SENTENCE_HOME/python/ 21 | CONFIG=$(basename ${CONFIG_FILE}) 22 | CONFIG="${CONFIG%.*}" 23 | echo "#################### Training with $CONFIG ####################" 24 | echo "#################### Configuring net ####################" 25 | python tools/netconfig.py ../net-audio/net.prototxt -o ../net-audio/auto.prototxt -t $SENTENCE_HOME/leveldbs/$CONFIG 26 | echo "#################### Starting training ####################" 27 | date 28 | cd $SENTENCE_HOME/net-audio/ 29 | ./training.sh $CONFIG 30 | 31 | if [ $? 
-eq 0 ]; then 32 | echo "#################### Moving to 3_trained ####################" 33 | cd $SENTENCE_HOME/python/ 34 | mv $CONFIG_FOLDER/2_databased/$CONFIG.ini $CONFIG_FOLDER/3_trained 35 | else 36 | echo "#################### Moving to 5_training_failed ####################" 37 | cd $SENTENCE_HOME/python/ 38 | mv $CONFIG_FOLDER/2_databased/$CONFIG.ini $CONFIG_FOLDER/5_training_failed 39 | fi 40 | echo "#################### Removing net definition ####################" 41 | cd $SENTENCE_HOME/net-audio/ 42 | rm auto.prototxt 43 | done 44 | -------------------------------------------------------------------------------- /python/experiments/databases.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ "$#" -ne 1 ]; then 4 | echo "Illegal number of parameters!" 5 | echo "./experiments.sh " 6 | exit 7 | fi 8 | 9 | CONFIG_FOLDER=$1 10 | 11 | if ! [[ -d $CONFIG_FOLDER ]]; then 12 | echo "$CONFIG_FOLDER is not a directory!" 13 | exit 14 | fi 15 | 16 | # Abort on first error 17 | set -e 18 | 19 | source $SENTENCE_HOME/use_python p2 20 | 21 | for CONFIG_FILE in "$CONFIG_FOLDER"/1_open/* 22 | do 23 | cd $SENTENCE_HOME/python/ 24 | CONFIG=$(basename ${CONFIG_FILE}) 25 | CONFIG="${CONFIG%.*}" 26 | echo "#################### Creating database with $CONFIG ####################" 27 | python sbd_leveldb/training_instance_generator.py $CONFIG_FILE 28 | 29 | if [ $? -eq 0 ]; then 30 | echo "#################### Moving to 2_databased ####################" 31 | mv $CONFIG_FOLDER/1_open/$CONFIG.ini $CONFIG_FOLDER/2_databased/ 32 | else 33 | echo "#################### Moving to 4_database_failed ####################" 34 | mv $CONFIG_FOLDER/1_open/$CONFIG.ini $CONFIG_FOLDER/4_database_failed/ 35 | fi 36 | 37 | done 38 | -------------------------------------------------------------------------------- /python/experiments/training.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ "$#" -ne 1 ]; then 4 | echo "Illegal number of parameters!" 5 | echo "./experiments.sh " 6 | exit 7 | fi 8 | 9 | CONFIG_FOLDER=$1 10 | 11 | if ! [[ -d $CONFIG_FOLDER ]]; then 12 | echo "$CONFIG_FOLDER is not a directory!" 13 | exit 14 | fi 15 | 16 | source $SENTENCE_HOME/use_python p2 17 | 18 | for CONFIG_FILE in "$CONFIG_FOLDER"/2_databased/* 19 | do 20 | cd $SENTENCE_HOME/python/ 21 | CONFIG=$(basename ${CONFIG_FILE}) 22 | CONFIG="${CONFIG%.*}" 23 | echo "#################### Training with $CONFIG ####################" 24 | echo "#################### Configuring net ####################" 25 | python tools/netconfig.py ../net/net.prototxt -o ../net/auto.prototxt -t $SENTENCE_HOME/leveldbs/$CONFIG 26 | echo "#################### Starting training ####################" 27 | date 28 | cd $SENTENCE_HOME/net/ 29 | ./training.sh $CONFIG 30 | 31 | if [ $? 
-eq 0 ]; then 32 | echo "#################### Moving to 3_trained ####################" 33 | cd $SENTENCE_HOME/python/ 34 | mv $CONFIG_FOLDER/2_databased/$CONFIG.ini $CONFIG_FOLDER/3_trained 35 | else 36 | echo "#################### Moving to 5_training_failed ####################" 37 | cd $SENTENCE_HOME/python/ 38 | mv $CONFIG_FOLDER/2_databased/$CONFIG.ini $CONFIG_FOLDER/5_training_failed 39 | fi 40 | echo "#################### Removing net definition ####################" 41 | cd $SENTENCE_HOME/net/ 42 | rm auto.prototxt 43 | done 44 | -------------------------------------------------------------------------------- /python/parsing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/python/parsing/__init__.py -------------------------------------------------------------------------------- /python/parsing/abstract_parser.py: -------------------------------------------------------------------------------- 1 | import os, argparse 2 | 3 | from common.argparse_util import * 4 | 5 | class AbstractParser(object): 6 | """AbstractParser with standard filename methods, parse method has to be implemented by subclass""" 7 | def __init__(self, filename): 8 | self.filename = filename 9 | 10 | def _wanted_file_endings(self): 11 | """returns a list of file endings, that can be parsed by this parser""" 12 | raise NotImplementedError("to be implemented by subclass") 13 | 14 | def wants_this_file(self): 15 | basepath, extension = os.path.splitext(self.filename) 16 | return extension in self._wanted_file_endings() 17 | 18 | def get_file_name(self): 19 | return self.filename 20 | 21 | def parse(self): 22 | """returns a list of talks, it is recommended to use the python generator for less memory usage""" 23 | raise NotImplementedError("to be implemented by subclass") 24 | 25 | def progress(self): 26 | """progress of parsing, should be implemented for parsers with large file sizes""" 27 | raise NotImplementedError("to be implemented by subclass") 28 | 29 | def _no_progress_function(self): 30 | return 0. 
31 | 32 | def _line_count_progress(self): 33 | return float(self._progress) / self._linenumber 34 | 35 | def _init_line_count_progress(self): 36 | i = -1 37 | with open(self.filename) as f: 38 | for i, line in enumerate(f): 39 | pass 40 | self._linenumber = i + 1 41 | self._progress = 0 42 | 43 | 44 | def main(filename, class_): 45 | parser = class_(filename) 46 | texts = parser.parse() 47 | for i, text in enumerate(texts): 48 | print "progress %f, text %d:" % (parser.progress(), i) 49 | print text 50 | 51 | def parse_command_line_arguments(class_): 52 | parser = argparse.ArgumentParser(description='Test the file parsing') 53 | parser.add_argument('filename', help='the file you want to parse', type=lambda arg: is_valid_file(parser, arg)) 54 | args = parser.parse_args() 55 | 56 | main(args.filename, class_) 57 | -------------------------------------------------------------------------------- /python/parsing/audio_parser.py: -------------------------------------------------------------------------------- 1 | from parsing.get_parser import * 2 | from sbd_classification.classification_input import InputText 3 | from sbd_classification.classification_input import InputAudio 4 | from preprocessing.tokens import WordToken 5 | from preprocessing.nlp_pipeline import NlpPipeline 6 | 7 | class AudioParser(object): 8 | 9 | def parse(self, ctm_file): 10 | parser = get_parser(ctm_file) 11 | base_dir = os.path.dirname(parser.get_file_name()) 12 | raw_talks = parser.parse() 13 | 14 | talks = [] 15 | for i, talk in enumerate(raw_talks): 16 | # build range map from second intervals to tokens 17 | talk.build_interval_tree() 18 | 19 | # get pitch feature values 20 | pitch_file = base_dir + "/" + talk.group_name + "_talkid" + str(talk.talk_id) + ".pitch" 21 | talk.parse_pitch_feature(pitch_file) 22 | 23 | # get energy feature values 24 | energy_file = base_dir + "/" + talk.group_name + "_talkid" + str(talk.talk_id) + ".energy" 25 | talk.parse_energy_feature(energy_file) 26 | 27 | # get pitch feature values 28 | talk.parse_pitch_feature(pitch_file) 29 | # get energy feature values 30 | talk.parse_energy_feature(energy_file) 31 | # normalize features 32 | talk.normalize() 33 | 34 | talks.append(talk) 35 | 36 | return talks 37 | 38 | -------------------------------------------------------------------------------- /python/parsing/ctm_parser.py: -------------------------------------------------------------------------------- 1 | import sys, argparse, os 2 | from os.path import basename 3 | 4 | import re 5 | from common.argparse_util import * 6 | import common.sbd_config as sbd 7 | from preprocessing.nlp_pipeline import NlpPipeline, PosTag 8 | from preprocessing.audio import Audio, AudioSentence 9 | from preprocessing.tokens import AudioToken, PunctuationToken, Punctuation 10 | 11 | from abstract_parser import AbstractParser, main, parse_command_line_arguments 12 | 13 | reload(sys) 14 | sys.setdefaultencoding('utf8') 15 | 16 | 17 | class CtmParser(AbstractParser): 18 | 19 | def __init__(self, filename): 20 | super(CtmParser, self).__init__(filename) 21 | if not self.wants_this_file(): 22 | return 23 | 24 | self._init_line_count_progress() 25 | 26 | def _wanted_file_endings(self): 27 | return (".ctm",) 28 | 29 | def parse(self): 30 | current_talk_id = 0 31 | audio = Audio() 32 | sentence = AudioSentence() 33 | sentence.tokens = [] 34 | 35 | group_name = self._extract_group_name() 36 | 37 | with open(self.filename, "r") as file_: 38 | for line_unenc in file_: 39 | self._progress += 1 40 | 41 | # parse line 42 | line = 
unicode(line_unenc, errors='ignore') 43 | line = line.rstrip() 44 | 45 | if line.startswith("#"): 46 | talk_id = self._extract_talk_id(line) 47 | token_count = len(sentence.tokens) 48 | 49 | # end of sentence reached 50 | if token_count > 0: 51 | sentence.begin = sentence.tokens[0].begin 52 | sentence.end = sentence.tokens[-1].begin + sentence.tokens[-1].duration 53 | sentence.tokenize() 54 | sentence.prepare() 55 | audio.add_sentence(sentence) 56 | 57 | # end of talk reached 58 | if talk_id != current_talk_id: 59 | if token_count > 0: 60 | # save audio talk 61 | audio.talk_id = current_talk_id 62 | audio.group_name = group_name 63 | audio = self._prepare_audio(audio) 64 | yield audio 65 | audio = Audio() 66 | current_talk_id = talk_id 67 | continue 68 | else: 69 | current_talk_id = talk_id 70 | 71 | # begin a new sentence 72 | sentence = AudioSentence() 73 | sentence.tokens = [] 74 | 75 | else: 76 | # parse line 77 | line_parts = re.split(" +", line) 78 | begin = float(line_parts[2]) 79 | duration = float(line_parts[3]) 80 | word = line_parts[4] 81 | 82 | # add token to sentence 83 | token = AudioToken(word.lower()) 84 | token.begin = begin 85 | token.duration = duration 86 | 87 | sentence.append_token(token) 88 | 89 | if (len(sentence.tokens) > 0): 90 | sentence.begin = sentence.tokens[0].begin 91 | sentence.end = sentence.tokens[-1].begin + sentence.tokens[-1].duration 92 | sentence.tokenize() 93 | sentence.prepare() 94 | audio.add_sentence(sentence) 95 | 96 | if len(audio.sentences) > 0: 97 | audio.talk_id = current_talk_id 98 | audio.group_name = group_name 99 | audio = self._prepare_audio(audio) 100 | yield audio 101 | 102 | def _extract_group_name(self): 103 | return basename(self.filename).split("_")[0] 104 | 105 | def _prepare_audio(self, audio): 106 | # sort sentences by begin 107 | sorted_sentences = sorted(audio.sentences, key=lambda x: x.begin) 108 | audio.sentences = sorted_sentences 109 | 110 | # calculate pause before and pause after 111 | return self._calculate_pause(audio) 112 | 113 | def _calculate_pause(self, audio): 114 | last_end = 0.0 115 | last_token = None 116 | 117 | for token in audio.get_tokens(): 118 | if token.is_punctuation(): 119 | continue 120 | 121 | pause = float(format(token.begin - last_end, '.4f')) 122 | 123 | if pause < 0.0 or pause == -0.0: 124 | pause = 0.0 125 | 126 | token.set_pause_before(pause) 127 | if last_token is not None: 128 | last_token.set_pause_after(pause) 129 | 130 | last_end = token.begin + token.duration 131 | last_token = token 132 | 133 | return audio 134 | 135 | def _extract_talk_id(self, line): 136 | line = line[2:] 137 | line_parts = line.split("talkid") 138 | relevant = line_parts[1] 139 | 140 | talkid = "0" 141 | for i in range(0, len(relevant)): 142 | if relevant[i].isdigit(): 143 | talkid += relevant[i] 144 | else: 145 | break 146 | 147 | return int(talkid) 148 | 149 | def progress(self): 150 | return self._line_count_progress() 151 | 152 | 153 | ################ 154 | # Example call # 155 | ################ 156 | 157 | if __name__ == '__main__': 158 | parse_command_line_arguments(CtmParser) 159 | -------------------------------------------------------------------------------- /python/parsing/get_parser.py: -------------------------------------------------------------------------------- 1 | import sys, argparse, os 2 | 3 | from common.argparse_util import * 4 | from line_parser import LineParser 5 | from plaintext_parser import PlaintextParser 6 | from xml_parser import XMLParser 7 | from ctm_parser import CtmParser 8 | 9 | 
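# Chooses a parser for the given file: every known parser is instantiated and
# the first one whose wants_this_file() accepts the file extension is returned,
# or None if nothing matches. The LineParser constructor is wrapped in
# try/except because it may reject an unsupported configuration with a ValueError.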
10 | def get_parser(filename): 11 | parsers = [] 12 | parsers.append(PlaintextParser(filename)) 13 | try: 14 | parsers.append(LineParser(filename)) 15 | except ValueError: 16 | pass 17 | parsers.append(XMLParser(filename)) 18 | parsers.append(CtmParser(filename)) 19 | 20 | for parser in parsers: 21 | if parser.wants_this_file(): 22 | return parser 23 | 24 | return None 25 | 26 | def main(filename): 27 | parser = get_parser(filename) 28 | if parser: 29 | texts = parser.parse() 30 | for i, text in enumerate(texts): 31 | print "progress %f, text %d:" % (parser.progress(), i) 32 | print text 33 | else: 34 | print "#error: no suitable parser for %s found, sorry." % filename 35 | 36 | if __name__ == '__main__': 37 | parser = argparse.ArgumentParser(description='Test the file parsing') 38 | parser.add_argument('filename', help='the file you want to parse', type=lambda arg: is_valid_file(parser, arg)) 39 | args = parser.parse_args() 40 | 41 | main(args.filename) 42 | -------------------------------------------------------------------------------- /python/parsing/line_parser.py: -------------------------------------------------------------------------------- 1 | import sys, argparse, os 2 | 3 | from common.argparse_util import * 4 | import common.sbd_config as sbd 5 | from preprocessing.nlp_pipeline import NlpPipeline, PosTag 6 | from preprocessing.text import Text, Sentence, END_OF_TEXT_MARKER 7 | from preprocessing.tokens import WordToken, PunctuationToken, Punctuation 8 | 9 | from abstract_parser import AbstractParser, main, parse_command_line_arguments 10 | 11 | reload(sys) 12 | sys.setdefaultencoding('utf8') 13 | 14 | 15 | class LineParser(AbstractParser): 16 | 17 | def __init__(self, filename): 18 | super(LineParser, self).__init__(filename) 19 | if not self.wants_this_file(): 20 | return 21 | 22 | self._init_line_count_progress() 23 | # if sbd.config.getboolean('features', 'use_question_mark'): 24 | # raise ValueError("Question marks not supported by LineParser") 25 | 26 | self.POS_TAGGING = sbd.config.getboolean('features', 'pos_tagging') 27 | self.nlp_pipeline = NlpPipeline() 28 | 29 | def _wanted_file_endings(self): 30 | return (".line", ) 31 | 32 | def parse(self): 33 | with open(self.filename, "r") as file_: 34 | text = Text() 35 | sentence = Sentence() 36 | sentence.tokens = [] 37 | 38 | for line_unenc in file_: 39 | # end of a text reached 40 | if line_unenc.rstrip() == END_OF_TEXT_MARKER: 41 | yield text 42 | text = Text() 43 | continue 44 | 45 | self._progress += 1 46 | 47 | # parse line 48 | line = unicode(line_unenc, errors='ignore') 49 | line = line.rstrip() 50 | 51 | # split line into word, pos_tags and type 52 | line_parts = line.split('\t') 53 | word = self._get_word(line_parts) 54 | if word is None: 55 | continue 56 | pos_tags = self._get_pos_tags(line_parts) 57 | punctuation = self._get_punctuation(line_parts) 58 | 59 | sentence.tokens.extend(self._create_tokens(word, pos_tags, punctuation)) 60 | 61 | # we are at the end of a sentence 62 | if punctuation == 'PERIOD': 63 | if self.POS_TAGGING and not pos_tags: 64 | self.nlp_pipeline.pos_tag(sentence.tokens) 65 | text.add_sentence(sentence) 66 | sentence = Sentence() 67 | sentence.tokens = [] 68 | 69 | # if we do not have any end-of-text-marker 70 | # return everything as one text 71 | if len(text.sentences) > 0: 72 | yield text 73 | 74 | def _get_word(self, line_parts): 75 | word = unicode(line_parts[0]) 76 | word = self.nlp_pipeline.process_word(word) 77 | # check if needed 78 | # if "?" 
in word and len(word) > 0: 79 | # word = word.replace("?", "") 80 | return word 81 | 82 | def _get_punctuation(self, line_parts): 83 | if len(line_parts) == 2: 84 | return unicode(line_parts[1]) 85 | else: 86 | return unicode(line_parts[2]) 87 | 88 | def _get_pos_tags(self, line_parts): 89 | if len(line_parts) == 2: 90 | return set() 91 | else: 92 | pos_tag_str = line_parts[1].split(",") 93 | pos_tag_types = map(lambda x: x.split(".")[1], pos_tag_str) 94 | return set(map(lambda x: PosTag[x], pos_tag_types)) 95 | 96 | def progress(self): 97 | return self._line_count_progress() 98 | 99 | def _create_tokens(self, word, pos_tags, punctuation): 100 | word_token = WordToken(word) 101 | word_token.set_pos_tags(pos_tags) 102 | 103 | punctuation_token = None 104 | if punctuation == 'PERIOD': 105 | punctuation_token = PunctuationToken(punctuation, Punctuation.PERIOD) 106 | elif punctuation == 'COMMA': 107 | punctuation_token = PunctuationToken(punctuation, Punctuation.COMMA) 108 | 109 | if punctuation_token is not None: 110 | return [word_token, punctuation_token] 111 | return [word_token] 112 | 113 | 114 | 115 | ################ 116 | # Example call # 117 | ################ 118 | 119 | if __name__ == '__main__': 120 | parse_command_line_arguments(LineParser) 121 | -------------------------------------------------------------------------------- /python/parsing/plaintext_parser.py: -------------------------------------------------------------------------------- 1 | import argparse, sys, os 2 | 3 | from common.argparse_util import * 4 | from preprocessing.nlp_pipeline import NlpPipeline 5 | from preprocessing.text import Sentence, Text 6 | 7 | from abstract_parser import AbstractParser, main, parse_command_line_arguments 8 | 9 | TEXT_SEPARATOR = "################################################################################" 10 | 11 | reload(sys) 12 | sys.setdefaultencoding('utf8') 13 | 14 | class PlaintextParser(AbstractParser): 15 | def __init__(self, filename): 16 | super(PlaintextParser, self).__init__(filename) 17 | if not self.wants_this_file(): 18 | return 19 | self._init_line_count_progress() 20 | self.nlp_pipeline = NlpPipeline() 21 | 22 | def _wanted_file_endings(self): 23 | return (".txt",) 24 | 25 | def parse(self): 26 | text = Text() 27 | 28 | with open(self.filename, "r") as file_: 29 | for line_unenc in file_: 30 | self._progress += 1 31 | line = unicode(line_unenc.encode('utf8')) 32 | if line.startswith(TEXT_SEPARATOR): 33 | if (len(text.sentences) > 0): 34 | yield text 35 | text = Text() 36 | continue 37 | sentences = self.nlp_pipeline.sentence_segmentation(line) 38 | for sentence in sentences: 39 | s = Sentence() 40 | s.set_sentence_text(sentence) 41 | s.set_tokens(self.nlp_pipeline.parse_text(sentence)) 42 | text.add_sentence(s) 43 | if (len(text.sentences) > 0): 44 | yield text 45 | 46 | def progress(self): 47 | return self._line_count_progress() 48 | 49 | 50 | ################ 51 | # Example call # 52 | ################ 53 | 54 | if __name__ == '__main__': 55 | parse_command_line_arguments(PlaintextParser) 56 | -------------------------------------------------------------------------------- /python/parsing/xml_parser.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree, sys, os.path, re 2 | 3 | from common.argparse_util import * 4 | from preprocessing.nlp_pipeline import NlpPipeline 5 | from preprocessing.text import * 6 | 7 | from abstract_parser import AbstractParser, main, parse_command_line_arguments 8 | 9 | 
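# Reads mteval-style XML files: the root element is expected to contain a
# <srcset> with one <doc> element per talk, and each <doc> holds the talk's
# sentences as <seg> elements.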
class XMLParser(AbstractParser): 10 | def __init__(self, filename): 11 | super(XMLParser, self).__init__(filename) 12 | if not self.wants_this_file(): 13 | return 14 | self.nlp_pipeline = NlpPipeline() 15 | self._linenumber = self._count_docs() 16 | self._progress = 0 17 | 18 | def _wanted_file_endings(self): 19 | return (".xml",) 20 | 21 | def parse(self): 22 | mteval = xml.etree.ElementTree.parse(self.filename).getroot() 23 | srcset = mteval.find("srcset") 24 | for doc in srcset.findall('doc'): 25 | self._progress += 1 26 | talk = Text() 27 | 28 | for sentence in doc.findall("seg"): 29 | sentence_text = unicode(sentence.text) 30 | 31 | sentence = Sentence() 32 | sentence.set_sentence_text(sentence_text) 33 | sentence.set_tokens(self.nlp_pipeline.parse_text(sentence_text)) 34 | talk.add_sentence(sentence) 35 | 36 | yield talk 37 | 38 | def progress(self): 39 | return self._line_count_progress() 40 | 41 | def _count_docs(self): 42 | mteval = xml.etree.ElementTree.parse(self.filename).getroot() 43 | srcset = mteval.find("srcset") 44 | i = 0 45 | for doc in srcset.findall('doc'): 46 | i += 1 47 | return i 48 | 49 | ################ 50 | # Example call # 51 | ################ 52 | 53 | if __name__ == '__main__': 54 | parse_command_line_arguments(XMLParser) 55 | -------------------------------------------------------------------------------- /python/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/python/preprocessing/__init__.py -------------------------------------------------------------------------------- /python/preprocessing/glove_file.py: -------------------------------------------------------------------------------- 1 | import sys, argparse, struct, numpy 2 | import common.sbd_config as sbd 3 | 4 | class GloveFile(object): 5 | """reads a binary word vector file, returns vectors for single words""" 6 | 7 | def __init__(self, filename): 8 | self.ENCODING = 'UTF-8' 9 | self.KEY_ERROR_VECTOR = sbd.config.get('word_vector', 'key_error_vector') 10 | 11 | # the following variable counts word, that are not covered in the given vector 12 | # see get_vector for details 13 | self.not_covered_words = dict() 14 | # and some bare numbers 15 | self.nr_covered_words = 0 16 | self.nr_uncovered_words = 0 17 | # read vector file 18 | self.__filename = filename 19 | 20 | try: 21 | self.__file = open(filename, 'rb') 22 | except IOError: 23 | print ('The file %s can not be read!' % self.__filename) 24 | return 25 | 26 | self.words = 400000 27 | self.vector_size = 50 28 | 29 | self.vector_array = numpy.zeros((self.words, self.vector_size), float) 30 | self.word2index = {} 31 | self.average_vector = numpy.zeros((self.vector_size,), float) 32 | 33 | index = 0 34 | with open(filename) as f: 35 | for line in f: 36 | if index % 100000 == 0: 37 | print("Parsed %d/%d lines." 
% (index, self.words)) 38 | parts = line.split(" ") 39 | word = parts[0] 40 | vector = parts[1:] 41 | 42 | self.word2index[word] = index 43 | for i in range (len(vector)): 44 | self.vector_array[index][i] = float(vector[i]) 45 | 46 | index += 1 47 | 48 | self.__file.close() 49 | print('Parsing finished!') 50 | 51 | def __del__(self): 52 | self.vector_array = None 53 | self.word2index = None 54 | 55 | def get_vector(self, word): 56 | try: 57 | idx = self.word2index[word] 58 | self.nr_covered_words += 1 59 | return self.vector_array[idx] 60 | except KeyError: 61 | self.not_covered_words[word] = self.not_covered_words.get(word, 0) + 1 62 | self.nr_uncovered_words += 1 63 | if self.KEY_ERROR_VECTOR != 'avg': 64 | idx = self.word2index[self.KEY_ERROR_VECTOR] 65 | return self.vector_array[idx] 66 | else: 67 | return self.average_vector 68 | -------------------------------------------------------------------------------- /python/preprocessing/nlp_pipeline.py: -------------------------------------------------------------------------------- 1 | import nltk, nltk.data 2 | from enum import Enum 3 | import regex as re 4 | import common.sbd_config as sbd 5 | from tokens import Punctuation, PunctuationToken, WordToken 6 | 7 | 8 | class PosTag(Enum): 9 | OTHER = 0 10 | VERB = 1 11 | NOUN = 2 12 | DETERMINER = 3 13 | ADJECTIVE = 4 14 | ADVERB = 5 15 | NUMERAL = 6 16 | CONJUNCTION = 7 17 | PARTICLE = 8 18 | EXISTENTIAL_THERE = 9 19 | MARKER = 10 20 | PRONOUN = 11 21 | INTERJECTION = 12 22 | QUESTION_WORDS = 13 23 | 24 | 25 | class NlpPipeline(object): 26 | 27 | def __init__(self): 28 | self.POS_TAGGING = sbd.config.getboolean('features', 'pos_tagging') 29 | self.NUMBER_REPLACEMENT = sbd.config.getboolean('features', 'number_replacement') 30 | 31 | self.punkt = None 32 | self.punctuation_regex = re.compile("^\p{posix_punct}+$") 33 | self.punctuation_mapping = { 34 | ";": Punctuation.PERIOD, 35 | ".": Punctuation.PERIOD, 36 | "!": Punctuation.PERIOD, 37 | ",": Punctuation.COMMA, 38 | ":": Punctuation.COMMA, 39 | "-": Punctuation.COMMA, 40 | "--": Punctuation.COMMA, 41 | "?": Punctuation.QUESTION 42 | } 43 | self.inv_pos_tag_mapping = { 44 | PosTag.ADJECTIVE: { 45 | "JJ", "JJR", "JJS" 46 | }, 47 | PosTag.ADVERB: { 48 | "RB", "RBR", "RBS" 49 | }, 50 | PosTag.PARTICLE: { 51 | "RP" 52 | }, 53 | PosTag.CONJUNCTION: { 54 | "CC", "IN" 55 | }, 56 | PosTag.NUMERAL: { 57 | "CD", "LS" 58 | }, 59 | PosTag.DETERMINER: { 60 | "DT", "PDT" 61 | }, 62 | PosTag.EXISTENTIAL_THERE: { 63 | "EX" 64 | }, 65 | PosTag.NOUN: { 66 | "FW", "NN", "NNP", "NNPS", "NNS" 67 | }, 68 | PosTag.VERB: { 69 | "MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ" 70 | }, 71 | PosTag.MARKER: { 72 | "POS", "TO" 73 | }, 74 | PosTag.PRONOUN: { 75 | "PRP", "PRP$" 76 | }, 77 | PosTag.INTERJECTION: { 78 | "UH" 79 | }, 80 | PosTag.QUESTION_WORDS: { 81 | "WDT", "WP", "WP$", "WRB" 82 | } 83 | } 84 | self.pos_tag_mapping = { 85 | v2: k for k, v1 in self.inv_pos_tag_mapping.items() for v2 in v1 86 | } 87 | 88 | def parse_text(self, text): 89 | """ 90 | Parses a text and create tokens. 91 | 92 | Args: 93 | text (str): A string representing a sentence. 94 | 95 | Returns: 96 | [token]: List of word and punctuation tokens. 
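Example (with number replacement and POS tagging disabled):
    parse_text(u"Hello, world.") yields a WordToken for "Hello", a COMMA
    PunctuationToken, a WordToken for "world" and a PERIOD PunctuationToken.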
97 | """ 98 | 99 | raw_tokens = nltk.word_tokenize(text) 100 | tokens = [] 101 | 102 | for raw_token in raw_tokens: 103 | if raw_token in self.punctuation_mapping: 104 | punctuation_type = self.punctuation_mapping[raw_token] 105 | tokens.append(PunctuationToken(raw_token, punctuation_type)) 106 | else: 107 | word_token = self.process_word(raw_token) 108 | if word_token is None: 109 | continue 110 | tokens.append(WordToken(word_token)) 111 | 112 | if self.POS_TAGGING: 113 | self.pos_tag(tokens) 114 | 115 | return tokens 116 | 117 | def process_word(self, raw_token): 118 | if re.match(self.punctuation_regex, raw_token): 119 | return None 120 | if self.NUMBER_REPLACEMENT: 121 | return self._replace_number(raw_token) 122 | return raw_token 123 | 124 | 125 | def pos_tag(self, tokens): 126 | word_tokens = map(lambda x: x.word, tokens) 127 | pos_tags = nltk.pos_tag(word_tokens) 128 | 129 | for i, token in enumerate(tokens): 130 | if isinstance(token, WordToken): 131 | pos_tag_str = pos_tags[i][1] 132 | token.set_pos_tags(self._parse_pos_tag(pos_tag_str)) 133 | 134 | def _parse_pos_tag(self, pos_tag_str): 135 | pos_tags = pos_tag_str.split("/") 136 | pos_tag_set = set() 137 | 138 | for pos_tag in pos_tags: 139 | pos_tag_set.add(self.pos_tag_mapping.get(pos_tag, PosTag.OTHER)) 140 | 141 | return pos_tag_set 142 | 143 | def sentence_segmentation(self, text): 144 | if not self.punkt: 145 | self.punkt = nltk.data.load('tokenizers/punkt/english.pickle') 146 | return self.punkt.tokenize(text.strip()) 147 | 148 | def _replace_number(self, word): 149 | if word[:-2].isdigit() and (word.endswith("st") or word.endswith("nd") or word.endswith("rd") or word.endswith("th")): 150 | return "1st" 151 | try: 152 | float(word) 153 | return "1" 154 | except ValueError: 155 | return word 156 | 157 | -------------------------------------------------------------------------------- /python/preprocessing/sliding_window.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | import common.sbd_config as sbd 4 | from nlp_pipeline import Punctuation, NlpPipeline 5 | from text import Sentence, Text 6 | from tokens import PunctuationToken 7 | from training_instance import TrainingInstance 8 | 9 | 10 | class SlidingWindow(object): 11 | 12 | def __init__(self): 13 | self.WINDOW_SIZE = sbd.config.getint('windowing', 'window_size') 14 | self.PUNCTUATION_POS = sbd.config.getint('windowing', 'punctuation_position') 15 | 16 | def list_windows(self, talk): 17 | 18 | tokens = talk.get_tokens() 19 | 20 | index = 0 21 | training_instances = [] 22 | 23 | while index <= len(tokens) - self.WINDOW_SIZE: 24 | window_tokens = [] 25 | instance_label = Punctuation.NONE 26 | 27 | i = index 28 | word_count = 0 29 | while word_count < self.WINDOW_SIZE and i < len(tokens): 30 | current_token = tokens[i] 31 | is_punctuation = current_token.is_punctuation() 32 | 33 | # if there are two punctuations in a row, the last punctuation token is taken 34 | 35 | if not is_punctuation: 36 | word_count += 1 37 | window_tokens.append(current_token) 38 | elif i == index: 39 | index += 1 ##dont parse windows with punctuations at the beginning twice 40 | 41 | if word_count == self.PUNCTUATION_POS and is_punctuation: 42 | instance_label = current_token.punctuation_type 43 | 44 | i += 1 45 | 46 | # if punctuation pos is behind the last word, determine the instance label 47 | if word_count == self.PUNCTUATION_POS and i < len(tokens): 48 | current_token = tokens[i] 49 | is_punctuation = current_token.is_punctuation() 
50 | if is_punctuation: 51 | instance_label = current_token.punctuation_type 52 | 53 | if len(window_tokens) == self.WINDOW_SIZE: 54 | training_instances.append(TrainingInstance(window_tokens, instance_label)) 55 | index += 1 56 | 57 | return training_instances 58 | 59 | 60 | 61 | ################ 62 | # Example call # 63 | ################ 64 | 65 | def main(): 66 | nlp_pipeline = NlpPipeline() 67 | 68 | sentence = Sentence() 69 | sentence.set_sentence_text(unicode("I'm a savant, or more precisly, a high-functioning autisitic savant")) 70 | sentence.set_tokens(nlp_pipeline.parse_text(sentence.sentence_text)) 71 | 72 | text = Text() 73 | text.add_sentence(sentence) 74 | 75 | slidingWindow = SlidingWindow() 76 | windows = slidingWindow.list_windows(text) 77 | 78 | for window in windows: 79 | print(window) 80 | 81 | if __name__ == '__main__': 82 | main() 83 | -------------------------------------------------------------------------------- /python/preprocessing/text.py: -------------------------------------------------------------------------------- 1 | import common.sbd_config as sbd 2 | 3 | END_OF_TEXT_MARKER = "###END###" 4 | 5 | class Text(object): 6 | 7 | def __init__(self): 8 | self.sentences = [] 9 | self.POS_TAGGING = sbd.config.getboolean('features', 'pos_tagging') 10 | 11 | def add_sentence(self, sentence): 12 | self.sentences.append(sentence) 13 | 14 | def get_tokens(self): 15 | tokens = [] 16 | for sentence in self.sentences: 17 | tokens.extend(sentence.tokens) 18 | return tokens 19 | 20 | def append_to_file(self, filename): 21 | file = open(filename, "a") 22 | 23 | for sentence in self.sentences: 24 | tokens = sentence.get_tokens() 25 | # get the word vectors for all tokens in the sentence 26 | for i, token in enumerate(tokens): 27 | if not token.is_punctuation(): 28 | if i == len(tokens) - 1: 29 | punctuation_string = "PERIOD" 30 | else: 31 | next_token = tokens[i + 1] 32 | if next_token.is_punctuation(): 33 | punctuation_string = str(next_token.punctuation_type) 34 | punctuation_string = punctuation_string[12:] 35 | else: 36 | punctuation_string = "O" 37 | 38 | if self.POS_TAGGING: 39 | line_str = u"%s\t%s\t%s\n" % (token.word.lower(), " ".join(map(unicode, token.pos_tags)), punctuation_string) 40 | else: 41 | line_str = u"%s\t%s\n" % (token.word.lower(), punctuation_string) 42 | 43 | file.write(line_str) 44 | 45 | file.write("%s\n" % END_OF_TEXT_MARKER) 46 | file.close() 47 | 48 | def __str__(self): 49 | sentences_str = ''.join(map(str, self.sentences)) 50 | return sentences_str 51 | 52 | 53 | class Sentence(object): 54 | 55 | def __init__(self): 56 | self.tokens = None 57 | self.sentence_text = None 58 | 59 | def set_sentence_text(self, sentence_text): 60 | self.sentence_text = sentence_text 61 | 62 | def set_tokens(self, tokens): 63 | self.tokens = tokens 64 | 65 | def get_tokens(self): 66 | return self.tokens 67 | 68 | def __str__(self): 69 | tokens_str = ', '.join(map(str, self.tokens)) 70 | 71 | return "sentence: %s \n tokens: %s \n" % (self.sentence_text, tokens_str) 72 | -------------------------------------------------------------------------------- /python/preprocessing/tokens.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class Punctuation(Enum): 5 | NONE = 0 6 | COMMA = 1 7 | PERIOD = 2 8 | QUESTION = 3 9 | 10 | 11 | class AudioToken(object): 12 | def __init__(self, word): 13 | self.word = word 14 | self.begin = 0.0 15 | self.duration = 0.0 16 | self.pause_before = 0.0 17 | self.pause_after 
= 0.0 18 | self.energy = 0.0 19 | self.pitch = 0.0 20 | self.pitch_levels = [] 21 | self.energy_levels = [] 22 | 23 | def is_punctuation(self): 24 | return False 25 | 26 | def append_pitch_level(self, pitch_level): 27 | self.pitch_levels.append(pitch_level) 28 | 29 | def append_energy_level(self, energy_level): 30 | self.energy_levels.append(energy_level) 31 | 32 | def set_pause_before(self, pause_before): 33 | self.pause_before = pause_before 34 | 35 | def set_pause_after(self, pause_after): 36 | self.pause_after = pause_after 37 | 38 | def set_energy(self, energy): 39 | self.energy = energy 40 | 41 | def set_pitch(self, pitch): 42 | self.pitch = pitch 43 | 44 | def __str__(self): 45 | return "(pause: %s, pitch: %s, energy: %s) %s" % (str(self.pause_before), str(self.pitch), str(self.energy), self.word) 46 | 47 | def __repr__(self): 48 | return self.word 49 | 50 | def __eq__(self, other): 51 | if other.is_punctuation(): 52 | return False 53 | return self.word == other.word 54 | 55 | def __hash__(self): 56 | return hash(self.word) ^ hash(self.is_punctuation()) 57 | 58 | 59 | class WordToken(object): 60 | def __init__(self, word): 61 | self.word = word 62 | self.word_vec = None 63 | self.pos_tags = set() 64 | 65 | def is_punctuation(self): 66 | return False 67 | 68 | def set_word_vec(self, word_vec): 69 | self.word_vec = word_vec 70 | 71 | def set_pos_tags(self, pos_tag): 72 | self.pos_tags = pos_tag 73 | 74 | def __str__(self): 75 | pos_str = "" 76 | if len(self.pos_tags) > 0: 77 | pos_str = " (" + " ".join(map(unicode, self.pos_tags)) + ")" 78 | return self.word + pos_str 79 | 80 | def __repr__(self): 81 | return self.word 82 | 83 | def __eq__(self, other): 84 | if other.is_punctuation(): 85 | return False 86 | return self.word == other.word 87 | 88 | def __hash__(self): 89 | return hash(self.word) ^ hash(self.is_punctuation()) 90 | 91 | 92 | class PunctuationToken(object): 93 | def __init__(self, word, punctuation_type): 94 | self.word = word 95 | self.punctuation_type = punctuation_type 96 | 97 | def is_punctuation(self): 98 | return True 99 | 100 | def __str__(self): 101 | return str(self.punctuation_type) 102 | 103 | def __repr__(self): 104 | return str(self) 105 | 106 | def __eq__(self, other): 107 | if not other.is_punctuation(): 108 | return False 109 | return self.punctuation_type == other.punctuation_type 110 | 111 | def __hash__(self): 112 | return hash(self.punctuation_type) ^ hash(self.is_punctuation()) 113 | 114 | -------------------------------------------------------------------------------- /python/preprocessing/training_instance.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | import common.sbd_config as sbd 4 | from nlp_pipeline import PosTag 5 | from tokens import Punctuation, AudioToken 6 | 7 | 8 | class TrainingInstance(object): 9 | 10 | def __init__(self, tokens, label): 11 | self.WINDOW_SIZE = sbd.config.getint('windowing', 'window_size') 12 | self.POS_TAGGING = sbd.config.getboolean('features', 'pos_tagging') 13 | self.USE_QUESTION_MARK = sbd.config.getboolean('features', 'use_question_mark') 14 | self.LEXICAL = sbd.config.getboolean('model', 'lexical') 15 | 16 | self.tokens = tokens 17 | self.label = label 18 | 19 | def __repr__(self): 20 | return "TOKENS: %s \nLABEL: %s \n" % (" ".join(map(unicode, self.tokens)), str(self.label)) 21 | 22 | def get_array(self): 23 | if self.LEXICAL: 24 | return self.get_lexical_array() 25 | else: 26 | return self.get_audio_array() 27 | 28 | def get_lexical_array(self): 
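# Builds a (1, WINDOW_SIZE, feature_size) float array: the first word_vec_size
# columns of each row hold the token's word vector and, if POS tagging is
# enabled, one additional column per PosTag value is set to 1.0 for the
# token's tags.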
29 | word_vec_size = len(self.tokens[0].word_vec) 30 | feature_size = word_vec_size 31 | 32 | if self.POS_TAGGING: 33 | feature_size += len(PosTag) 34 | 35 | dimensions = (1, self.WINDOW_SIZE, feature_size) 36 | arr = numpy.zeros(dimensions, float) 37 | 38 | for i in range(0, self.WINDOW_SIZE): 39 | arr[0][i][0:word_vec_size] = self.tokens[i].word_vec 40 | 41 | if self.POS_TAGGING: 42 | for pos_tag in self.tokens[i].pos_tags: 43 | arr[0][i][word_vec_size + pos_tag.value] = 1.0 44 | 45 | return arr 46 | 47 | def get_audio_array(self): 48 | feature_size = 4 49 | 50 | dimensions = (1, self.WINDOW_SIZE, feature_size) 51 | arr = numpy.zeros(dimensions, float) 52 | 53 | for i in range(0, self.WINDOW_SIZE): 54 | arr[0][i][0] = self.tokens[i].pause_before 55 | arr[0][i][1] = self.tokens[i].pause_after 56 | arr[0][i][2] = self.tokens[i].energy 57 | arr[0][i][3] = self.tokens[i].pitch 58 | 59 | return arr 60 | 61 | def get_label(self): 62 | if self.LEXICAL: 63 | return self.get_lexical_label() 64 | else: 65 | return self.get_audio_label() 66 | 67 | def get_audio_label(self): 68 | if self.label == Punctuation.PERIOD: 69 | return 1 70 | else: 71 | return self.label.value 72 | 73 | def get_lexical_label(self): 74 | if not self.USE_QUESTION_MARK and self.label == Punctuation.QUESTION: 75 | return Punctuation.PERIOD.value 76 | return self.label.value 77 | 78 | def get_tokens(self): 79 | return self.tokens 80 | -------------------------------------------------------------------------------- /python/preprocessing/word2vec_file.py: -------------------------------------------------------------------------------- 1 | import sys, argparse, struct, numpy 2 | 3 | from common.argparse_util import * 4 | import common.sbd_config as sbd 5 | 6 | class Word2VecFile(object): 7 | """reads a binary word vector file, returns vectors for single words""" 8 | def __init__(self, filename): 9 | self.ENCODING = 'UTF-8' 10 | self.KEY_ERROR_VECTOR = "this" 11 | 12 | self.key_mapping = { 13 | "'s": "is", 14 | "a": "the", 15 | "of": "from", 16 | "to": "from", 17 | "and": "or" 18 | } 19 | 20 | # the following variable counts word, that are not covered in the given vector 21 | # see get_vector for details 22 | self.not_covered_words = dict() 23 | # and some bare numbers 24 | self.nr_covered_words = 0 25 | self.nr_uncovered_words = 0 26 | # read vector file 27 | self.__filename = filename 28 | 29 | try: 30 | self.__file = open(filename, 'rb') 31 | except IOError: 32 | print ('The file %s can not be read!' % self.__filename) 33 | return 34 | 35 | first_line = self.__file.readline().decode(self.ENCODING).split(' ') 36 | self.words = int(first_line[0]) 37 | self.vector_size = int(first_line[1]) 38 | print('File has %d words with vectors of size %d. Parsing ..' 
% (self.words, self.vector_size)) 39 | 40 | self.vector_array = numpy.zeros((self.words, self.vector_size), numpy.float32) 41 | self.word2index = {} 42 | 43 | progress_steps = self.words / 100 44 | 45 | chars = [] 46 | for w_index in range(0, self.words): 47 | if w_index % progress_steps == 0: 48 | progress = w_index * 100 / self.words 49 | sys.stdout.write(str(progress) + "% ") 50 | sys.stdout.flush() 51 | byte = self.__file.read(1) 52 | while byte: 53 | if byte == b" ": 54 | word = b"".join(chars) 55 | self.word2index[word.decode(self.ENCODING)] = w_index 56 | chars = [] 57 | break 58 | if byte != b"\n": 59 | chars.append(byte) 60 | byte = self.__file.read(1) 61 | for f_index in range(0, self.vector_size): 62 | f_bytes = self.__file.read(4) 63 | self.vector_array[w_index][f_index] = struct.unpack('f', f_bytes)[0] 64 | self.__file.close() 65 | 66 | print('Parsing finished!') 67 | 68 | def __del__(self): 69 | self.vector_array = None 70 | self.word2index = None 71 | 72 | def get_vector(self, word): 73 | try: 74 | if word in self.key_mapping: 75 | # TODO: This only works for google vector, which does not have the words 'and', 'of' etc. 76 | # If we use other word2vec vectors, this won't work 77 | word = self.key_mapping[word] 78 | idx = self.word2index[word] 79 | self.nr_covered_words += 1 80 | return self.vector_array[idx] 81 | except KeyError: 82 | self.not_covered_words[word] = self.not_covered_words.get(word, 0) + 1 83 | self.nr_uncovered_words += 1 84 | if self.KEY_ERROR_VECTOR != 'avg': 85 | idx = self.word2index[self.KEY_ERROR_VECTOR] 86 | return self.vector_array[idx] 87 | raise Exception 88 | 89 | 90 | ################ 91 | # Example call # 92 | ################ 93 | 94 | def main(args): 95 | word2VecFile = Word2VecFile(args.datafile) 96 | for word in args.word: 97 | try: 98 | print(word, word2VecFile.get_vector(word)) 99 | except KeyError: 100 | print(word, "not found!") 101 | 102 | def is_valid_file(parser, arg, mode): 103 | try: 104 | f = open(arg, mode) 105 | f.close() 106 | return arg 107 | except IOError: 108 | parser.error('The file %s can not be opened!' 
% arg) 109 | 110 | if __name__ == '__main__': 111 | parser = argparse.ArgumentParser(description='Get word vector from binary data.') 112 | parser.add_argument('datafile', help='path to binary data file', type=lambda arg: is_valid_file(parser, arg, 'rb')) 113 | parser.add_argument('word', help='word to find in data file', nargs='+') 114 | args = parser.parse_args() 115 | main(args) 116 | -------------------------------------------------------------------------------- /python/sbd_classification/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/python/sbd_classification/__init__.py -------------------------------------------------------------------------------- /python/sbd_classification/audio_classification.py: -------------------------------------------------------------------------------- 1 | import numpy, caffe, argparse 2 | import common.sbd_config as sbd 3 | from preprocessing.nlp_pipeline import NlpPipeline, PosTag 4 | from preprocessing.sliding_window import SlidingWindow 5 | from preprocessing.word2vec_file import Word2VecFile 6 | from parsing.audio_parser import AudioParser 7 | from classification_input import InputAudio 8 | 9 | 10 | class AudioClassifier(object): 11 | 12 | def __init__(self, net, debug = False): 13 | self.classes = ["NONE", "PERIOD"] 14 | 15 | self.WINDOW_SIZE = sbd.config.getint('windowing', 'window_size') 16 | self.PUNCTUATION_POS = sbd.config.getint('windowing', 'punctuation_position') 17 | 18 | self.net = net 19 | self.debug = debug 20 | 21 | def predict(self, input_audio): 22 | sliding_window = SlidingWindow() 23 | instances = sliding_window.list_windows(input_audio) 24 | 25 | # get caffe predictions 26 | punctuation_probs = [] 27 | for instance in instances: 28 | probs = self._predict_caffe(instance) 29 | punctuation_probs.extend(numpy.copy(probs)) 30 | 31 | return punctuation_probs 32 | 33 | def _predict_caffe(self, instance): 34 | caffe.io.Transformer({'data': self.net.blobs['data'].data.shape}) 35 | 36 | # batchsize = 1 37 | # self.net.blobs['data'].reshape(batchsize, 1, self.WINDOW_SIZE, self.FEATURE_LENGTH) 38 | reshaped_array = numpy.expand_dims(instance.get_array(), axis=0) 39 | self.net.blobs['data'].data[...] 
= reshaped_array 40 | 41 | out = self.net.forward() 42 | return out['softmax'] 43 | 44 | def get_audio_parameter(self): 45 | return (self.WINDOW_SIZE, self.PUNCTUATION_POS) 46 | 47 | ################ 48 | # Example call # 49 | ################ 50 | 51 | def main(model_folder, example_folder): 52 | config_file, caffemodel_file, net_proto = get_filenames(model_folder) 53 | sbd.SbdConfig(config_file) 54 | ctm_file, pitch_file, energy_file = get_audio_files(example_folder) 55 | 56 | # parse ctm_file, pitch_file and energy_file 57 | parser = AudioParser(ctm_file, pitch_file, energy_file) 58 | parser.parse() 59 | 60 | classifier = load_audio_classifier(model_folder) 61 | 62 | data = classifier.predict_audio(parser) 63 | print(data) 64 | 65 | if __name__ == '__main__': 66 | parser = argparse.ArgumentParser(description='run the web demo') 67 | parser.add_argument('model_folder', help='the trained caffemodel', default='demo_data/audio_models/audio_window-1-1/', nargs='?') 68 | parser.add_argument('example_folder', help='folder containing the ctm, pitch and energy files', default='demo_data/audio_examples/tst2011_talkid1169/', nargs='?') 69 | args = parser.parse_args() 70 | 71 | main(args.model_folder, args.example_folder) 72 | -------------------------------------------------------------------------------- /python/sbd_classification/classification_input.py: -------------------------------------------------------------------------------- 1 | import common.sbd_config as sbd 2 | from preprocessing.nlp_pipeline import NlpPipeline, PosTag 3 | from preprocessing.audio import Audio 4 | from preprocessing.tokens import WordToken 5 | 6 | class InputText(object): 7 | 8 | def __init__(self, obj): 9 | self.tokens = None 10 | 11 | if isinstance(obj, str) or isinstance(obj, unicode): 12 | self._initialize_with_text(obj) 13 | elif isinstance(obj, list): 14 | if obj: 15 | el = obj[0] 16 | if isinstance(el, Audio): 17 | self._initialize_with_talks(obj) 18 | elif isinstance(el, str): 19 | self._initialize_with_tokens(obj) 20 | else: 21 | print("ERROR: Could not initialize input text!") 22 | else: 23 | print("ERROR: Could not initialize input text!") 24 | 25 | 26 | def _initialize_with_text(self, text): 27 | nlp_pipeline = NlpPipeline() 28 | self.tokens = nlp_pipeline.parse_text(text) 29 | 30 | def _initialize_with_talks(self, talks): 31 | nlp_pipeline = NlpPipeline() 32 | word_tokens = [] 33 | 34 | for talk in talks: 35 | for sentence in talk.sentences: 36 | sentence_tokens = [] 37 | # get all word tokens 38 | for token in sentence.tokens: 39 | if not token.is_punctuation(): 40 | sentence_tokens.append(WordToken(token.word)) 41 | # do pos_tagging if needed on sentence level 42 | if sbd.config.getboolean('features', 'pos_tagging'): 43 | nlp_pipeline.pos_tag(sentence_tokens) 44 | for t in sentence_tokens: 45 | t.word = t.word.lower() 46 | word_tokens += sentence_tokens 47 | 48 | self.tokens = word_tokens 49 | 50 | def _initialize_with_tokens(self, tokens): 51 | # convert tokens to WordTokens 52 | word_tokens = [ WordToken(token) for token in tokens ] 53 | 54 | # do pos_tagging if needed 55 | if sbd.config.getboolean('features', 'pos_tagging'): 56 | nlp_pipeline = NlpPipeline() 57 | nlp_pipeline.pos_tag(wordTokens) 58 | 59 | self.tokens = word_tokens 60 | 61 | def get_tokens(self): 62 | return self.tokens 63 | 64 | 65 | class InputAudio(object): 66 | 67 | def __init__(self, talks): 68 | self.tokens = [] 69 | 70 | for talk in talks: 71 | for token in talk.get_tokens(): 72 | if not token.is_punctuation(): 73 | 
self.tokens.append(token) 74 | 75 | def get_tokens(self): 76 | return self.tokens 77 | -------------------------------------------------------------------------------- /python/sbd_classification/fusion.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from preprocessing.word2vec_file import Word2VecFile 3 | from sbd_classification.lexical_classification import LexicalClassifier 4 | from sbd_classification.util import * 5 | 6 | 7 | def norm_single(probs): 8 | s = 0.0 9 | for p in probs: 10 | s += p 11 | for i in range(0, len(probs)): 12 | probs[i] = probs[i] / s 13 | return probs 14 | 15 | def norm(probs_list): 16 | for probs in probs_list: 17 | norm_single(probs) 18 | return probs_list 19 | 20 | 21 | class Fusion(object): 22 | 23 | def __init__(self): 24 | # constants for index access into the probability vectors 25 | self.AUDIO_NONE_IDX = 0 26 | self.AUDIO_PERIOD_IDX = 1 27 | self.LEX_NONE_IDX = 0 28 | self.LEX_COMMA_IDX = 1 29 | self.LEX_PERIOD_IDX = 2 30 | 31 | self.__initialized = False 32 | 33 | def init_parameters(self, lexical_punctuation_pos, lexical_window_size, audio_punctuation_pos, audio_window_size): 34 | self.LEXICAL_PUNCTUATION_POS = lexical_punctuation_pos 35 | self.LEXICAL_WINDOW_SIZE = lexical_window_size 36 | self.AUDIO_PUNCTUATION_POS = audio_punctuation_pos 37 | self.AUDIO_WINDOW_SIZE = audio_window_size 38 | 39 | self.__initialized = True 40 | 41 | return self 42 | 43 | def fuse(self, nr_tokens, lexical_probs, audio_probs): 44 | assert(self.__initialized) 45 | assert(len(lexical_probs) + self.LEXICAL_WINDOW_SIZE == len(audio_probs) + self.AUDIO_WINDOW_SIZE) 46 | assert(nr_tokens == len(audio_probs) + self.AUDIO_WINDOW_SIZE - 1) 47 | assert(nr_tokens == len(lexical_probs) + self.LEXICAL_WINDOW_SIZE - 1) 48 | 49 | fusion_probs = [] 50 | for i in range(nr_tokens): 51 | lexical_pos = get_index(i, len(lexical_probs), self.LEXICAL_PUNCTUATION_POS) 52 | audio_pos = get_index(i, len(audio_probs), self.AUDIO_PUNCTUATION_POS) 53 | 54 | # if we have no predictions return NONE 55 | if lexical_pos < 0 and audio_pos < 0: 56 | fusion_probs.append([1.0, 0.0, 0.0]) 57 | continue 58 | 59 | # if we have no audio prediction return lexical prediction 60 | if audio_pos < 0: 61 | fusion_probs.append(lexical_probs[lexical_pos]) 62 | continue 63 | 64 | audio_none = audio_probs[audio_pos][self.AUDIO_NONE_IDX] 65 | audio_period = audio_probs[audio_pos][self.AUDIO_PERIOD_IDX] 66 | 67 | # if we have no lexical prediction return audio prediction 68 | if lexical_pos < 0: 69 | fusion_probs.append([audio_none, 0.0, audio_period]) 70 | continue 71 | 72 | fusion_result = self.sophisticated_fusion(lexical_probs[lexical_pos], audio_probs[audio_pos]) 73 | assert(len(fusion_result) == 3) 74 | fusion_probs.append(fusion_result) 75 | 76 | return fusion_probs 77 | 78 | def sophisticated_fusion(self, lexical_probs, audio_probs): 79 | raise Exception("Abstract base class") 80 | 81 | class ThresholdFusion(Fusion): 82 | 83 | def __init__(self, threshold_audio = 0.5, threshold_lexical = 0.9): 84 | super(ThresholdFusion, self).__init__() 85 | self.threshold_audio = threshold_audio 86 | self.threshold_lexical = threshold_lexical 87 | 88 | def sophisticated_fusion(self, lexical_probs, audio_probs): 89 | audio_none = audio_probs[self.AUDIO_NONE_IDX] 90 | audio_period = audio_probs[self.AUDIO_PERIOD_IDX] 91 | 92 | lexical_none = lexical_probs[self.LEX_NONE_IDX] 93 | lexical_comma = lexical_probs[self.LEX_COMMA_IDX] 94 | lexical_period = 
lexical_probs[self.LEX_PERIOD_IDX] 95 | 96 | # if audio model predicts a period, and lexical is not very confident, that there is no period, use audio prediction 97 | if audio_period > self.threshold_audio and lexical_none < self.threshold_lexical: 98 | return norm_single([lexical_none, lexical_comma, lexical_period + audio_period]) 99 | else: 100 | return [lexical_none, lexical_comma, lexical_period] 101 | 102 | def __str__(self): 103 | return "ThresholdFusion[AudioThresh: %.2f, LexicalThresh: %.2f]" % (self.threshold_audio, self.threshold_lexical) 104 | 105 | class BalanceFusion(Fusion): 106 | 107 | def __init__(self, lexical_audio_balance = 0.5): 108 | super(BalanceFusion, self).__init__() 109 | self.lexical_audio_balance = lexical_audio_balance 110 | 111 | def sophisticated_fusion(self, lexical_probs, audio_probs): 112 | audio_factor = (1 - self.lexical_audio_balance) 113 | lexical_factor = self.lexical_audio_balance 114 | 115 | audio_none = audio_probs[self.AUDIO_NONE_IDX] * audio_factor 116 | audio_period = audio_probs[self.AUDIO_PERIOD_IDX] * audio_factor 117 | 118 | lexical_none = lexical_probs[self.LEX_NONE_IDX] * lexical_factor 119 | lexical_comma = lexical_probs[self.LEX_COMMA_IDX] * lexical_factor 120 | lexical_period = lexical_probs[self.LEX_PERIOD_IDX] * lexical_factor 121 | 122 | return norm_single([audio_none + lexical_none, lexical_comma + audio_period, lexical_period + audio_period]) 123 | 124 | def __str__(self): 125 | return "BalanceFusion[BalanceValue: %.2f]" % (self.lexical_audio_balance) 126 | 127 | class BaselineLexicalFusion(Fusion): 128 | 129 | def sophisticated_fusion(self, lexical_probs, audio_probs): 130 | return [lexical_probs[self.LEX_NONE_IDX], lexical_probs[self.LEX_COMMA_IDX], lexical_probs[self.LEX_PERIOD_IDX]] 131 | 132 | def __str__(self): 133 | return "BaselineLexicalFusion" 134 | 135 | class BaselineAudioFusion(Fusion): 136 | 137 | def sophisticated_fusion(self, lexical_probs, audio_probs): 138 | return [audio_probs[self.AUDIO_NONE_IDX], 0.0, audio_probs[self.AUDIO_PERIOD_IDX]] 139 | 140 | def __str__(self): 141 | return "BaselineAudioFusion" 142 | 143 | ################ 144 | # Example call # 145 | ################ 146 | 147 | def get_evaluation_fusion_list(lexical_punctuation_pos, lexical_window_size, audio_punctuation_pos, audio_window_size): 148 | fusions = [] 149 | fusions.append(BaselineLexicalFusion()) 150 | fusions.append(BaselineAudioFusion()) 151 | fusions.append(ThresholdFusion(0.5, 0.8)) 152 | fusions.append(ThresholdFusion(0.5, 0.9)) 153 | fusions.append(ThresholdFusion(0.6, 0.8)) 154 | fusions.append(ThresholdFusion(0.6, 0.9)) 155 | fusions.append(ThresholdFusion(0.7, 0.8)) 156 | fusions.append(ThresholdFusion(0.7, 0.9)) 157 | fusions.append(BalanceFusion(0.1)) 158 | fusions.append(BalanceFusion(0.2)) 159 | fusions.append(BalanceFusion(0.3)) 160 | fusions.append(BalanceFusion(0.4)) 161 | fusions.append(BalanceFusion(0.5)) 162 | fusions.append(BalanceFusion(0.6)) 163 | fusions.append(BalanceFusion(0.7)) 164 | fusions.append(BalanceFusion(0.8)) 165 | fusions.append(BalanceFusion(0.9)) 166 | return [f.init_parameters(lexical_punctuation_pos, lexical_window_size, audio_punctuation_pos, audio_window_size) for f in fusions] 167 | 168 | def main(): 169 | import random 170 | 171 | lexical_punctuation_pos = 4 172 | lexical_window_size = 8 173 | audio_punctuation_pos = 2 174 | audio_window_size = 4 175 | 176 | fusions = get_evaluation_fusion_list(lexical_punctuation_pos, lexical_window_size, audio_punctuation_pos, audio_window_size) 177 | 
num_words = 9 178 | 179 | tokens = ["test" + str(i) for i in range(1, 1 + num_words)] 180 | probs_lexic = [[random.random(), random.random(), random.random()] for i in range(0, num_words - lexical_window_size + 1)] 181 | probs_audio = [[random.random(), random.random()] for i in range(0, num_words - audio_window_size + 1)] 182 | 183 | probs_lexic = norm(probs_lexic) 184 | probs_audio = norm(probs_audio) 185 | 186 | print tokens, len(probs_lexic), len(probs_audio) 187 | 188 | for fc in fusions: 189 | print fc 190 | print fc.fuse(len(tokens), probs_lexic, probs_audio) 191 | 192 | if __name__ == '__main__': 193 | main() 194 | -------------------------------------------------------------------------------- /python/sbd_classification/lexical_classification.py: -------------------------------------------------------------------------------- 1 | import numpy, caffe, argparse 2 | import common.sbd_config as sbd 3 | from preprocessing.nlp_pipeline import NlpPipeline, PosTag 4 | from preprocessing.sliding_window import SlidingWindow 5 | from preprocessing.word2vec_file import Word2VecFile 6 | from classification_input import InputText 7 | 8 | class LexicalClassifier(object): 9 | 10 | def __init__(self, net, word2vec): 11 | 12 | self.WINDOW_SIZE = sbd.config.getint('windowing', 'window_size') 13 | self.PUNCTUATION_POS = sbd.config.getint('windowing', 'punctuation_position') 14 | self.POS_TAGGING = sbd.config.getboolean('features', 'pos_tagging') 15 | 16 | self.FEATURE_LENGTH = 300 if not self.POS_TAGGING else 300 + len(PosTag) 17 | 18 | self.word2vec = word2vec 19 | self.net = net 20 | 21 | def predict(self, input_text): 22 | for token in input_text.tokens: 23 | if not token.is_punctuation(): 24 | if not self.word2vec: 25 | token.word_vec = numpy.random.rand(300) 26 | else: 27 | token.word_vec = self.word2vec.get_vector(token.word.lower()) 28 | 29 | sliding_window = SlidingWindow() 30 | instances = sliding_window.list_windows(input_text) 31 | 32 | # get caffe predictions 33 | punctuation_probs = [] 34 | for instance in instances: 35 | probs = self._predict_caffe(instance) 36 | punctuation_probs.extend(numpy.copy(probs)) 37 | 38 | return punctuation_probs 39 | 40 | def _predict_caffe(self, instance): 41 | caffe.io.Transformer({'data': self.net.blobs['data'].data.shape}) 42 | 43 | # batchsize = 1 44 | # self.net.blobs['data'].reshape(batchsize, 1, self.WINDOW_SIZE, self.FEATURE_LENGTH) 45 | reshaped_array = numpy.expand_dims(instance.get_array(), axis=0) 46 | 47 | self.net.blobs['data'].data[...] 
= reshaped_array 48 | 49 | out = self.net.forward() 50 | return out['softmax'] 51 | 52 | def get_lexical_parameter(self): 53 | return (self.WINDOW_SIZE, self.PUNCTUATION_POS, self.POS_TAGGING) 54 | 55 | ################ 56 | # Example call # 57 | ################ 58 | 59 | 60 | def main(caffeproto, caffemodel): 61 | net = caffe.Net(caffeproto, caffemodel, caffe.TEST) 62 | classifier = LexicalClassifier(net, None, True) 63 | 64 | text = "This is a very long text This text has two sentences" 65 | data = classifier.predict_text(text) 66 | print(data) 67 | 68 | if __name__ == '__main__': 69 | parser = argparse.ArgumentParser(description='run the web demo') 70 | parser.add_argument('caffeproto', help='the deploy prototxt of your trained model', default='models/deploy.prototxt', nargs='?') 71 | parser.add_argument('caffemodel', help='the trained caffemodel', default='models/model.caffemodel', nargs='?') 72 | args = parser.parse_args() 73 | 74 | main(args.caffeproto, args.caffemodel) 75 | -------------------------------------------------------------------------------- /python/sbd_classification/util.py: -------------------------------------------------------------------------------- 1 | import common.sbd_config as sbd 2 | import caffe, os 3 | from tools.netconfig import NetConfig 4 | from os import listdir 5 | from sbd_classification.lexical_classification import LexicalClassifier 6 | from sbd_classification.audio_classification import AudioClassifier 7 | from preprocessing.nlp_pipeline import PosTag 8 | 9 | def get_index(index, length, punctuation_pos): 10 | position = index - punctuation_pos + 1 11 | if 0 <= position < length: 12 | return position 13 | else: 14 | return -1 15 | 16 | def convert_probabilities(token_length, punctuation_pos, probabilities, classes = ["NONE", "COMMA", "PERIOD"]): 17 | new_probablities = [] 18 | for i in range(0, token_length): 19 | current_prediction_position = get_index(i, len(probabilities), punctuation_pos) 20 | if i == token_length - 1: 21 | new_probablities.append([(1.0 if current == "PERIOD" else 0.0) for current in classes]) 22 | elif current_prediction_position < 0: 23 | new_probablities.append([(1.0 if current == "NONE" else 0.0) for current in classes]) 24 | else: 25 | new_probablities.append(probabilities[current_prediction_position].tolist()) 26 | print probabilities, new_probablities 27 | return new_probablities 28 | 29 | def get_filenames(folder): 30 | for file_ in listdir(folder): 31 | if file_.endswith(".ini"): 32 | config_file = folder + "/" + file_ 33 | elif file_.endswith(".caffemodel"): 34 | caffemodel_file = folder + "/" + file_ 35 | elif file_ == "net.prototxt": 36 | net_proto = folder + "/" + file_ 37 | return config_file, caffemodel_file, net_proto 38 | 39 | def make_lexical_temp_deploy(folder, prototxt, temp_file_name = "temp_deploy.prototxt"): 40 | WINDOW_SIZE = sbd.config.getint('windowing', 'window_size') 41 | FEATURE_LENGTH = 300 if not sbd.config.getboolean('features', 'pos_tagging') else 300 + len(PosTag) 42 | 43 | with file(prototxt, "r") as input_: 44 | nc = NetConfig(input_) 45 | nc.transform_deploy([1, 1, WINDOW_SIZE, FEATURE_LENGTH]) 46 | temp_proto = "%s/%s" % (folder, temp_file_name) 47 | with file(temp_proto, "w") as output: 48 | nc.write_to(output) 49 | 50 | return temp_proto 51 | 52 | def make_audio_temp_deploy(folder, prototxt, temp_file_name = "temp_deploy.prototxt"): 53 | WINDOW_SIZE = sbd.config.getint('windowing', 'window_size') 54 | FEATURE_LENGTH = 4 55 | 56 | with file(prototxt, "r") as input_: 57 | nc = 
NetConfig(input_) 58 | nc.transform_deploy([1, 1, WINDOW_SIZE, FEATURE_LENGTH]) 59 | temp_proto = "%s/%s" % (folder, temp_file_name) 60 | with file(temp_proto, "w") as output: 61 | nc.write_to(output) 62 | 63 | return temp_proto 64 | 65 | def load_lexical_classifier(folder, vector): 66 | print('Loading config folder: ' + folder) 67 | 68 | config_file, caffemodel_file, net_proto = get_filenames(folder) 69 | 70 | sbd.SbdConfig(config_file) 71 | temp_proto = make_lexical_temp_deploy(folder, net_proto) 72 | 73 | net = caffe.Net(temp_proto, caffemodel_file, caffe.TEST) 74 | 75 | if vector: 76 | classifier = LexicalClassifier(net, vector) 77 | else: 78 | classifier = LexicalClassifier(net, vector) 79 | 80 | return classifier 81 | 82 | def load_audio_classifier(folder): 83 | print('Loading config folder: ' + folder) 84 | 85 | config_file, caffemodel_file, net_proto = get_filenames(folder) 86 | 87 | sbd.SbdConfig(config_file) 88 | temp_proto = make_audio_temp_deploy(folder, net_proto) 89 | 90 | net = caffe.Net(temp_proto, caffemodel_file, caffe.TEST) 91 | 92 | classifier = AudioClassifier(net) 93 | 94 | return classifier 95 | 96 | def get_audio_files(folder): 97 | ctm_file = None 98 | pitch_file = None 99 | energy_file = None 100 | 101 | for file_ in listdir(folder): 102 | if file_.endswith(".ctm"): 103 | ctm_file = os.path.join(folder, file_) 104 | elif file_.endswith(".pitch"): 105 | pitch_file = os.path.join(folder, file_) 106 | elif file_.endswith(".energy"): 107 | energy_file = os.path.join(folder, file_) 108 | 109 | return ctm_file, pitch_file, energy_file 110 | -------------------------------------------------------------------------------- /python/sbd_leveldb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/python/sbd_leveldb/__init__.py -------------------------------------------------------------------------------- /python/sbd_leveldb/audio_training_instance_generator.py: -------------------------------------------------------------------------------- 1 | import operator, os, shutil, sys, time, argparse 2 | 3 | from common.argparse_util import * 4 | import common.sbd_config as sbd 5 | from preprocessing.sliding_window import SlidingWindow 6 | from preprocessing.tokens import Punctuation 7 | from preprocessing.word2vec_file import Word2VecFile 8 | from preprocessing.glove_file import GloveFile 9 | from parsing.get_parser import * 10 | from level_db_creator import LevelDBCreator 11 | 12 | 13 | class TrainingInstanceGenerator(object): 14 | """reads the original data, process them and writes them to a level-db""" 15 | 16 | def __init__(self): 17 | self.test_talks = set() 18 | 19 | def generate(self, parsers, database, is_test): 20 | level_db = LevelDBCreator(database) 21 | window_slider = SlidingWindow() 22 | 23 | nr_instances = 0 24 | 25 | if is_test: 26 | plain_text_instances_file = open(database + "/../test_instances.txt", "w") 27 | else: 28 | plain_text_instances_file = open(database + "/../train_instances.txt", "w") 29 | 30 | for i, talk_parser in enumerate(parsers): 31 | talks = talk_parser.parse() 32 | 33 | prev_progress = 0 34 | print("") 35 | print("Processing file %s ..." 
% talk_parser.get_file_name()) 36 | 37 | for talk in talks: 38 | progress = int(talk_parser.progress() * 100) 39 | if progress > prev_progress: 40 | sys.stdout.write(str(progress) + "% ") 41 | sys.stdout.flush() 42 | prev_progress = progress 43 | 44 | talk.build_interval_tree() 45 | base_dir = os.path.dirname(talk_parser.get_file_name()) 46 | 47 | # get pitch feature values 48 | pitch_level_file = base_dir + "/" + talk.group_name + "_talkid" + str(talk.talk_id) + ".pitch" 49 | talk.parse_pitch_feature(pitch_level_file) 50 | 51 | # get energy feature values 52 | energy_level_file = base_dir + "/" + talk.group_name + "_talkid" + str(talk.talk_id) + ".energy" 53 | talk.parse_energy_feature(energy_level_file) 54 | 55 | # normalize features 56 | talk.normalize() 57 | 58 | # get the training instances 59 | training_instances = window_slider.list_windows(talk) 60 | 61 | # write training instances to level db 62 | for training_instance in training_instances: 63 | nr_instances += 1 64 | 65 | # write instance to file 66 | s = unicode(training_instance) + "\n" 67 | s += "\n" 68 | plain_text_instances_file.write(s.encode('utf8')) 69 | 70 | # write to level db 71 | level_db.write_training_instance(training_instance) 72 | 73 | plain_text_instances_file.close() 74 | 75 | if __name__ == '__main__': 76 | parser = argparse.ArgumentParser(description='create test and train datasets as a lmdb.') 77 | parser.add_argument('config_file', help="path to config file") 78 | args = parser.parse_args() 79 | 80 | # initialize config 81 | sbd.SbdConfig(args.config_file) 82 | 83 | # create proper name for the database 84 | SENTENCE_HOME = os.environ['SENTENCE_HOME'] 85 | data_folder = "/mnt/naruto/sentence/data/" 86 | LEVEL_DB_DIR = "leveldbs" 87 | 88 | database = SENTENCE_HOME + "/" + LEVEL_DB_DIR + "/" + sbd.SbdConfig.get_db_name_from_config(sbd.config) 89 | 90 | # check if database already exists 91 | if os.path.isdir(database): 92 | print("Deleting " + database + ". y/N?") 93 | sys.stdout.flush() 94 | s = raw_input() 95 | if s != "Y" and s != "y": 96 | print("Not deleting. Exiting ..") 97 | sys.exit(3) 98 | shutil.rmtree(database) 99 | 100 | # create database folder and copy config file 101 | os.mkdir(database) 102 | shutil.copy(args.config_file, database) 103 | 104 | # get training and test data 105 | training_data = sbd.config.get('data', 'train_files').split(",") 106 | test_data = sbd.config.get('data', 'test_files').split(",") 107 | 108 | # get training parsers 109 | training_parsers = [] 110 | for f in training_data: 111 | parser = get_parser(data_folder + f) 112 | if parser is None: 113 | print("WARNING: Could not find training parser for file %s!" % f) 114 | else: 115 | training_parsers.append(parser) 116 | 117 | # get test parsers 118 | test_parsers = [] 119 | for f in test_data: 120 | parser = get_parser(data_folder + f) 121 | if parser is None: 122 | print("WARNING: Could not find test parser for file %s!" % f) 123 | else: 124 | test_parsers.append(parser) 125 | 126 | # generate data 127 | generator = TrainingInstanceGenerator() 128 | 129 | print("Generating test data .. ") 130 | start = time.time() 131 | generator.generate(test_parsers, database + "/test", is_test = True) 132 | duration = int(time.time() - start) / 60 133 | print("Done in " + str(duration) + " min.") 134 | 135 | print("Generating training data .. 
") 136 | start = time.time() 137 | generator.generate(training_parsers, database + "/train", is_test = False) 138 | duration = int(time.time() - start) / 60 139 | print("Done in " + str(duration) + " min.") 140 | -------------------------------------------------------------------------------- /python/sbd_leveldb/level_db_creator.py: -------------------------------------------------------------------------------- 1 | import argparse, numpy, leveldb 2 | from caffe.proto import caffe_pb2 3 | 4 | 5 | class LevelDBCreator(object): 6 | """create a new level db, fill it with word vectors""" 7 | 8 | def __init__(self, filename, batchsize=1000): 9 | self.__filename = filename 10 | self.__db = leveldb.LevelDB(filename) 11 | self.__current_batch_size = 0 12 | self.__batch = None 13 | self.__index = 0 14 | self.batchsize = batchsize 15 | 16 | def write_training_instance_list(self, training_instance_list): 17 | for training_instance in training_instance_list: 18 | self.write_training_instance(training_instance) 19 | 20 | def write_training_instance(self, training_instance): 21 | if (self.__batch == None): 22 | self.__batch = leveldb.WriteBatch() 23 | 24 | vectors = training_instance.get_array() 25 | label = training_instance.get_label() 26 | 27 | 28 | 29 | datum = caffe_pb2.Datum() 30 | datum.channels, datum.height, datum.width = vectors.shape 31 | datum.label = label 32 | datum.float_data.extend(vectors.flat) 33 | 34 | self.__batch.Put(str(self.__index), datum.SerializeToString()) 35 | 36 | self.__index += 1 37 | self.__current_batch_size += 1 38 | 39 | if (self.__current_batch_size == self.batchsize): 40 | self.__db.Write(self.__batch, sync=True) 41 | self.__batch = None 42 | self.__current_batch_size = 0 43 | 44 | def close(self): 45 | if (self.__batch): 46 | self.__db.Write(self.__batch, sync=True) 47 | self.__batch = None 48 | self.__current_batch_size = 0 49 | self.__db = None 50 | 51 | def read(self, key): 52 | return self.__db.Get(key) 53 | 54 | 55 | 56 | 57 | ################ 58 | # Example call # 59 | ################ 60 | 61 | class DummyTrainingInstance(): 62 | """assumed interface of training instance""" 63 | 64 | def __init__(self): 65 | pass 66 | 67 | def get_array(self): 68 | channels = 1 69 | window_size = 5 70 | vector_size = 300 71 | dimensions = (channels, window_size, vector_size) 72 | 73 | array = numpy.zeros((dimensions)) 74 | 75 | return array 76 | 77 | def get_label(self): 78 | return 0 79 | 80 | def main(args): 81 | ### writing 82 | ldbCreation = LevelDBCreator(args.dbfile) 83 | 84 | # write single instance 85 | instance = DummyTrainingInstance() 86 | ldbCreation.write_training_instance(instance) 87 | 88 | # write list 89 | training_instance_list = [] 90 | for i in range(0, 1000): 91 | training_instance_list += DummyTrainingInstance(), 92 | ldbCreation.write_training_instance_list(training_instance_list) 93 | 94 | # close after you are done! 
95 | ldbCreation.close() 96 | 97 | ### reading (for debug) 98 | ldbCreation = LevelDBCreator(args.dbfile) 99 | datum = caffe_pb2.Datum() 100 | datum.ParseFromString(ldbCreation.read("1")) 101 | print(datum) 102 | print(datum.label) 103 | 104 | 105 | if __name__ == '__main__': 106 | parser = argparse.ArgumentParser(description='Write a test lmdb file.') 107 | parser.add_argument('dbfile', help='path to a level db test directory') 108 | args = parser.parse_args() 109 | main(args) 110 | -------------------------------------------------------------------------------- /python/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/python/tools/__init__.py -------------------------------------------------------------------------------- /python/tools/comparison.py: -------------------------------------------------------------------------------- 1 | import math, sys 2 | import numpy 3 | 4 | OUR = "/home/tanja/Desktop/output" 5 | XIAOYIN_DATA = "/home/tanja/Desktop/xiayin_data" 6 | XIAOYIN_LABEL = "/home/tanja/Desktop/xiayin_label" 7 | 8 | WINDOW = 80 9 | TAKE = 12000 10 | COUNT = 12000 11 | SKIP = 2 12 | DIFF = 0.001 13 | INSTANCE_SIZE = 250 14 | 15 | our_data = numpy.zeros((TAKE, INSTANCE_SIZE)) 16 | our_label = [] 17 | xiaoyin_data = numpy.zeros((TAKE, INSTANCE_SIZE)) 18 | xiaoyin_label = [] 19 | 20 | instance_count = 0 21 | with open(OUR, "r") as file_: 22 | for line in file_: 23 | if instance_count >= TAKE: 24 | continue 25 | 26 | line = line.rstrip() 27 | label = line[-1] 28 | our_label.append(float(label)) 29 | 30 | line = line[1:-4] 31 | parts = line.split(", ") 32 | for i, p in enumerate(parts): 33 | our_data[instance_count][i] = float(p) 34 | 35 | instance_count += 1 36 | 37 | instance_count = 0 38 | with open(XIAOYIN_LABEL, "r") as file_: 39 | for line in file_: 40 | if instance_count >= TAKE: 41 | continue 42 | if instance_count < SKIP: 43 | instance_count += 1 44 | continue 45 | line = line.rstrip() 46 | xiaoyin_label.append(float(line)) 47 | instance_count += 1 48 | 49 | instance_count = 0 50 | with open(XIAOYIN_DATA, "r") as file_: 51 | for line in file_: 52 | if instance_count >= TAKE: 53 | continue 54 | if instance_count < SKIP: 55 | instance_count += 1 56 | continue 57 | parts = line.split("\t") 58 | for i, p in enumerate(parts): 59 | xiaoyin_data[instance_count][i] = float(p) 60 | instance_count += 1 61 | 62 | assert(len(our_data) == len(xiaoyin_data)) 63 | assert(len(our_label) - SKIP == len(xiaoyin_label)) 64 | 65 | 66 | def check_instance(a, b): 67 | for i in range(INSTANCE_SIZE): 68 | if DIFF < math.fabs(a[i] - b[i]): 69 | return False 70 | return True 71 | 72 | 73 | count_label = 0 74 | count_data = 0 75 | 76 | 77 | for instance_nr in range(TAKE): 78 | equal = False 79 | for i in range(max(0, instance_nr - WINDOW), min(instance_nr + WINDOW, TAKE)): 80 | if check_instance(our_data[instance_nr], xiaoyin_data[i]): 81 | equal = True 82 | continue 83 | if not equal: 84 | count_data += 1 85 | 86 | print("DATA", float(count_data) / TAKE) 87 | 88 | 89 | # for i in range(len(xiaoyin_label)): 90 | # if our_label[i] != xiaoyin_label[i]: 91 | # for j in range(max(0, i-WINDOW), min(len(xiaoyin_label), i + WINDOW)): 92 | # if our_label[j] == xiaoyin_label[i] or our_label[i] == xiaoyin_label[j]: 93 | # continue 94 | # count_label += 1 95 | 96 | print("LABEL", float(count_label) / COUNT) 97 | 
-------------------------------------------------------------------------------- /python/tools/look_into_leveldb.py: -------------------------------------------------------------------------------- 1 | import leveldb, argparse 2 | from caffe.proto import caffe_pb2 3 | 4 | 5 | def main(leveldb_dir, limit): 6 | datum = caffe_pb2.Datum() 7 | db = leveldb.LevelDB(leveldb_dir) 8 | for i in range(0, limit): 9 | datum.ParseFromString(db.Get(str(i))) 10 | print datum.float_data, datum.label 11 | 12 | if __name__ == '__main__': 13 | parser = argparse.ArgumentParser(description='Print (beginning of) contents of a level db database.') 14 | parser.add_argument('leveldb', help='path to level db folder') 15 | parser.add_argument('-l','--limit', help='number of entries which should be displayed', type=int, default=10) 16 | args = parser.parse_args() 17 | main(args.leveldb, args.limit) 18 | -------------------------------------------------------------------------------- /python/tools/netconfig.py: -------------------------------------------------------------------------------- 1 | import argparse, sys 2 | from google.protobuf import text_format 3 | 4 | import caffe 5 | from caffe.proto import caffe_pb2 6 | 7 | def get_layer_by_name(net, name): 8 | for layer in net.layer: 9 | if layer.name == name: 10 | return layer 11 | 12 | def get_data_layer(net, phase): 13 | for layer in net.layer: 14 | if layer.name == "data": 15 | for value in layer.include: 16 | if value.phase == phase: 17 | return layer 18 | 19 | def get_test_data_layer(net): 20 | return get_data_layer(net, caffe_pb2.TEST) 21 | 22 | def get_train_data_layer(net): 23 | return get_data_layer(net, caffe_pb2.TRAIN) 24 | 25 | def replace_loss_with_softmax(net): 26 | losslayer = get_layer_by_name(net, "loss") 27 | if losslayer.type == "InfogainLoss": 28 | return losslayer 29 | losslayer.name = "softmax" 30 | losslayer.type = "Softmax" 31 | losslayer.bottom.remove("label") 32 | losslayer.top.remove("loss") 33 | losslayer.top.append("softmax") 34 | return losslayer 35 | 36 | class NetConfig(object): 37 | def __init__(self, prototxt): 38 | self.net = caffe_pb2.NetParameter() 39 | text_format.Merge(prototxt.read(), self.net) 40 | 41 | def transform_deploy(self, dimensions = [1, 1, 5, 300]): 42 | # make deploy version of net 43 | # remove data layers 44 | self.net.layer.remove(get_train_data_layer(self.net)) 45 | self.net.layer.remove(get_test_data_layer(self.net)) 46 | 47 | # remove accuracy layer 48 | self.net.layer.remove(get_layer_by_name(self.net, "accuracy")) 49 | 50 | # add input 51 | self.net.input.append("data") 52 | for d in dimensions: 53 | self.net.input_dim.append(d) 54 | 55 | uses_infogain = get_layer_by_name(self.net, "loss").type == "InfogainLoss" 56 | 57 | if not uses_infogain: 58 | # use softmax instead of loss layer 59 | replace_loss_with_softmax(self.net) 60 | else: 61 | # infogain already depends on softmax, we can just remove the weight and the infogain loss layer 62 | self.net.layer.remove(get_layer_by_name(self.net, "infogain_loss_matrix")) 63 | self.net.layer.remove(get_layer_by_name(self.net, "loss")) 64 | 65 | def transform_data_paths(self, db_pair_dir): 66 | # db_pair_dir points to the directory containing the test/ and train/ leveldbs 67 | 68 | # modify path to leveldb for test and train data layer 69 | test_data_layer = get_test_data_layer(self.net) 70 | test_data_layer.data_param.source = db_pair_dir + "/test" 71 | 72 | train_data_layer = get_train_data_layer(self.net) 73 | train_data_layer.data_param.source = db_pair_dir + "/train" 74 | 75 | def get_database(self): 76 |
test_data_layer = get_test_data_layer(self.net) 77 | return test_data_layer.data_param.source.replace("/test", "") 78 | 79 | def write_to(self, outstream): 80 | outstream.write(str(self.net)) 81 | 82 | def main(args): 83 | nc = NetConfig(args.prototxt) 84 | 85 | if args.deploy: 86 | nc.transform_deploy() 87 | 88 | if args.train: 89 | nc.transform_data_paths(args.train) 90 | 91 | if args.print_database: 92 | print nc.get_database() 93 | return 94 | 95 | nc.write_to(args.output) 96 | 97 | if __name__ == '__main__': 98 | parser = argparse.ArgumentParser(description='Configure your net') 99 | parser.add_argument('prototxt', help='the original net prototxt', type=argparse.FileType('r')) 100 | group = parser.add_mutually_exclusive_group() 101 | group.add_argument('-d','--deploy', help='preset: deploy; remove data layers, add softmax', action='store_true') 102 | group.add_argument('-p','--print_database', help='Whether to print the database folder or not', action='store_true') 103 | group.add_argument('-t','--train', help='preset: make training net on test/train leveldb in directory', metavar='directory') 104 | parser.add_argument('-o','--output', help='output of the modified net', type=argparse.FileType('w'), default=sys.stdout, metavar='output') 105 | # parser.add_argument('-v','--verbose', help='be verbose', action='store_true') 106 | args = parser.parse_args() 107 | 108 | main(args) 109 | -------------------------------------------------------------------------------- /python/tools/parse_result.py: -------------------------------------------------------------------------------- 1 | import sys, os, csv, re, argparse, ConfigParser 2 | 3 | 4 | 5 | def read_test_results(logPath): 6 | test_results = {} 7 | 8 | for line in file(logPath): 9 | if "Test net output" in line: 10 | search_terms = re.search('Test net output #([0-9]): (.*?) = (-?(0|1)\\.?[0-9]*)', line) 11 | if search_terms: 12 | key = str(search_terms.group(2)) + "_" + search_terms.group(1) 13 | test_results[key] = search_terms.group(3) 14 | return test_results 15 | 16 | ###config 17 | def read_config(config_path): 18 | sections = ['data', 'word_vector', 'windowing', 'features'] 19 | 20 | current_config = ConfigParser.ConfigParser() 21 | current_config.read(config_path) 22 | feature_map = {} 23 | 24 | for section in sections: 25 | for f in current_config.items(section): 26 | feature_map ["_" + f[0]] = f[1] 27 | return feature_map 28 | 29 | 30 | def main(experiments_path, result_file): 31 | all_values = [] 32 | 33 | for d in os.listdir(experiments_path): 34 | full_d_path = os.path.join(experiments_path,d) 35 | if os.path.isdir(full_d_path): 36 | print full_d_path 37 | logFile = None 38 | configFile = None 39 | 40 | files = os.listdir(full_d_path) 41 | for f in files: 42 | if f.endswith(".tlog"): 43 | print f 44 | logFile = os.path.join(full_d_path , f) 45 | elif f.endswith(".ini"): 46 | print f 47 | configFile = os.path.join(full_d_path, f) 48 | 49 | if logFile == None or configFile == None: 50 | print "#Warning: Skipped %s, log or config file was not found!" 
% full_d_path 51 | continue 52 | features = read_config(configFile) 53 | test_results = read_test_results(logFile) 54 | features.update(test_results) 55 | 56 | all_values.append(features) 57 | 58 | with open(result_file, 'w') as csvfile: 59 | fieldnames = [] 60 | for row in all_values: 61 | dict_keys = row.keys() 62 | dict_keys.sort() 63 | for key in dict_keys: 64 | if not key in fieldnames: 65 | fieldnames.append(key) 66 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 67 | writer.writeheader() 68 | 69 | for row in all_values: 70 | writer.writerow(row) 71 | 72 | if __name__ == '__main__': 73 | parser = argparse.ArgumentParser(description='Create overview csv file of training results.', formatter_class=argparse.ArgumentDefaultsHelpFormatter) 74 | parser.add_argument('experimentfolder', help='path to experiment folder', default='../net/experiments', nargs='?') 75 | parser.add_argument('output', help='path of result file', default='../net/experiments/experiments.csv', nargs='?') 76 | args = parser.parse_args() 77 | main(args.experimentfolder, args.output) 78 | -------------------------------------------------------------------------------- /python/tools/text_converter.py: -------------------------------------------------------------------------------- 1 | import operator, os, shutil, sys, time, argparse 2 | 3 | from common.argparse_util import * 4 | import common.sbd_config as sbd 5 | from parsing.get_parser import * 6 | 7 | 8 | class TextConverter(object): 9 | 10 | def convert(self, parsers): 11 | for i, text_parser in enumerate(parsers): 12 | texts = text_parser.parse() 13 | file_path = text_parser.get_file_name() + ".line" 14 | 15 | if os.path.isfile(file_path): 16 | print("Deleting " + file_path + ".") 17 | os.remove(file_path) 18 | print("Writing file %s ..." % file_path) 19 | 20 | prev_progress = 0 21 | 22 | for text in texts: 23 | progress = int(text_parser.progress() * 100) 24 | if progress > prev_progress: 25 | sys.stdout.write(str(progress) + "% ") 26 | sys.stdout.flush() 27 | prev_progress = progress 28 | 29 | text.append_to_file(file_path) 30 | 31 | if __name__ == '__main__': 32 | parser = argparse.ArgumentParser(description='converts files into line format.') 33 | parser.add_argument('config_file', help="path to config file") 34 | args = parser.parse_args() 35 | 36 | # initialize config 37 | sbd.SbdConfig(args.config_file) 38 | 39 | # get training and test data 40 | training_data = sbd.config.get('data', 'train_files').split(",") 41 | test_data = sbd.config.get('data', 'test_files').split(",") 42 | 43 | data_folder = "/mnt/naruto/sentence/data/" 44 | 45 | # get training parsers 46 | training_parsers = [] 47 | for f in training_data: 48 | parser = get_parser(data_folder + f) 49 | if parser is None: 50 | print("WARNING: Could not find training parser for file %s!" % f) 51 | else: 52 | training_parsers.append(parser) 53 | 54 | # get test parsers 55 | test_parsers = [] 56 | for f in test_data: 57 | parser = get_parser(data_folder + f) 58 | if parser is None: 59 | print("WARNING: Could not find test parser for file %s!" % f) 60 | else: 61 | test_parsers.append(parser) 62 | 63 | # convert data 64 | converter = TextConverter() 65 | print("Converting data .. 
") 66 | start = time.time() 67 | converter.convert(test_parsers) 68 | duration = int(time.time() - start) / 60 69 | print("Done in " + str(duration) + " min.") 70 | start = time.time() 71 | converter.convert(training_parsers) 72 | duration = int(time.time() - start) / 60 73 | print("Done in " + str(duration) + " min.") 74 | -------------------------------------------------------------------------------- /python/web_demo/README.md: -------------------------------------------------------------------------------- 1 | # Run Demo 2 | 3 | As described in the [general Python README](../README.md), before executing any scripts on the server, please execute the following command `. ./use_python p2` in `/home/ms2015t3/sentence-boundary-detection-nn`. 4 | 5 | ``` 6 | cd /home/ms2015t3/sentence-boundary-detection-nn 7 | . ./use_python p2 8 | ``` 9 | 10 | Then use the following command on the server to run the demo: 11 | 12 | `python web_demo/web.py /home/ms2015t3/demo_data /home/fb10dl01/workspace/ms-2015-t3/GoogleNews-vectors-negative300.bin -nd` 13 | 14 | Or the equivalent for your specific environment: 15 | 16 | `python web_demo/web.py [DemoDataFolder] [TrainedWord2VecModel] -nd` 17 | 18 | -------------------------------------------------------------------------------- /python/web_demo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/knub/sentence-boundary-detection-nn/685a5a43d9ed5b2f9bcb6a834bbe8ad3696413b9/python/web_demo/__init__.py -------------------------------------------------------------------------------- /python/web_demo/file_io.py: -------------------------------------------------------------------------------- 1 | import common.sbd_config as sbd 2 | from sbd_classification.util import convert_probabilities 3 | 4 | class ResultWriter (object): 5 | 6 | def __init__(self, classes = ["NONE", "COMMA", "PERIOD"]): 7 | self.PUNCTUATION_POS = sbd.config.getint('windowing', 'punctuation_position') 8 | self.classes = classes 9 | self.separator = " " 10 | 11 | def writeToFile(self, file_name, tokens, punctuation_probs): 12 | with open(file_name, "w") as f: 13 | header = "%s\n" % (self.separator.join(["TOKEN"] + self.classes)) 14 | f.write(header) 15 | 16 | for i, token in enumerate(tokens): 17 | f.write("%s\n" % self.separator.join(str(prob) for prob in ([token] + punctuation_probs[i]))) 18 | 19 | 20 | class InputTextReader (object): 21 | 22 | def __init__(self): 23 | pass 24 | 25 | def readFile(self, file_name): 26 | text = "" 27 | with open(file_name, "r") as f: 28 | for line in f.readlines(): 29 | word = line.split("\t")[0] 30 | text += " " + word 31 | 32 | return text 33 | -------------------------------------------------------------------------------- /python/web_demo/json_converter.py: -------------------------------------------------------------------------------- 1 | import common.sbd_config as sbd 2 | import numpy 3 | from sbd_classification.util import get_index 4 | 5 | class JsonConverter(object): 6 | 7 | def __init__(self, lexical_punctuation_pos, lexical_window_size, audio_punctuation_pos, audio_window_size, pos_tagging): 8 | self.LEXICAL_PUNCTUATION_POS = lexical_punctuation_pos 9 | self.LEXICAL_WINDOW_SIZE = lexical_window_size 10 | self.AUDIO_PUNCTUATION_POS = audio_punctuation_pos 11 | self.AUDIO_WINDOW_SIZE = audio_window_size 12 | self.POS_TAGGING = pos_tagging 13 | self.classes_lexical_audio = ["NONE", "COMMA", "PERIOD"] 14 | self.classes_audio = ["NONE", "PERIOD"] 15 | 16 | def 
convert_fusion(self, tokens, fusion_probs, lexical_probs, audio_probs): 17 | json_data = [] 18 | 19 | # build json 20 | for i, token in enumerate(tokens): 21 | token_json = {'type': 'word', 'token': token.word} 22 | if self.POS_TAGGING: 23 | token_json['pos'] = [str(tag).replace("PosTag.", "") for tag in token.pos_tags] 24 | json_data.append(token_json) 25 | 26 | probs_json = {'type': 'punctuation'} 27 | 28 | # FUSION 29 | # we have probabilities for all tokens 30 | current_punctuation = self.classes_lexical_audio[numpy.argmax(fusion_probs[i])] 31 | class_distribution = self._get_class_distribution(fusion_probs[i], self.classes_lexical_audio) 32 | if i == len(tokens) - 1: 33 | probs_json['fusion'] = {'punctuation': 'PERIOD', 'probs': {'NONE': 0.0, 'PERIOD': 1.0}} 34 | else: 35 | probs_json['fusion'] = {'punctuation': current_punctuation, 'probs': class_distribution} 36 | 37 | # AUDIO 38 | current_prediction_position = get_index(i, len(audio_probs), self.AUDIO_PUNCTUATION_POS) 39 | if i == len(tokens) - 1: 40 | probs_json['audio'] = {'punctuation': 'PERIOD', 'probs': {'NONE': 0.0, 'PERIOD': 1.0}} 41 | elif current_prediction_position < 0: 42 | probs_json['audio'] = {'punctuation': 'NONE', 'probs': {'NONE': 1.0, 'PERIOD': 0.0}} 43 | else: 44 | current_punctuation = self.classes_audio[numpy.argmax(audio_probs[current_prediction_position])] 45 | class_distribution = self._get_class_distribution(audio_probs[current_prediction_position], self.classes_audio) 46 | probs_json['audio'] = { 'punctuation': current_punctuation, 'probs': class_distribution} 47 | 48 | # LEXICAL 49 | current_prediction_position = get_index(i, len(lexical_probs), self.LEXICAL_PUNCTUATION_POS) 50 | if i == len(tokens) - 1: 51 | probs_json['lexical'] = {'punctuation': 'PERIOD', 'probs': {'NONE': 0.0, 'COMMA': 0.0, 'PERIOD': 1.0}} 52 | elif current_prediction_position < 0: 53 | probs_json['lexical'] = {'punctuation': 'NONE', 'probs': {'NONE': 1.0, 'COMMA': 0.0, 'PERIOD': 0.0}} 54 | else: 55 | current_punctuation = self.classes_lexical_audio[numpy.argmax(lexical_probs[current_prediction_position])] 56 | class_distribution = self._get_class_distribution(lexical_probs[current_prediction_position], self.classes_lexical_audio) 57 | probs_json['lexical'] = {'punctuation': current_punctuation, 'probs': class_distribution} 58 | 59 | json_data.append(probs_json) 60 | 61 | return json_data 62 | 63 | def convert_lexical(self, tokens, punctuation_probs): 64 | json_data = [] 65 | # build json 66 | for index, token in enumerate(tokens): 67 | token_json = {'type': 'word', 'token': token.word} 68 | if self.POS_TAGGING: 69 | token_json['pos'] = [str(tag).replace("PosTag.", "") for tag in token.pos_tags] 70 | json_data.append(token_json) 71 | 72 | current_punctuation = self.classes_lexical_audio[numpy.argmax(punctuation_probs[index])] 73 | class_distribution = self._get_class_distribution(punctuation_probs[index], self.classes_lexical_audio) 74 | json_data.append({'type': 'punctuation', 'punctuation': current_punctuation, 'probs': class_distribution}) 75 | 76 | return json_data 77 | 78 | def _get_class_distribution(self, probs, classes): 79 | json_data = {} 80 | for i in range (0, len(classes)): 81 | json_data[classes[i]] = str(probs[i]) 82 | return json_data 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /python/web_demo/static/main.css: -------------------------------------------------------------------------------- 1 | #punctuation { 2 | padding: 15px; 3 | margin: 15px; 4 | max-width: 
100%; 5 | min-height: 100px; 6 | background: #EEE; 7 | } 8 | .punctuation_div { 9 | padding: 15px; 10 | margin: 15px; 11 | max-width: 100%; 12 | min-height: 100px; 13 | background: #EEE; 14 | } 15 | #textarea-input { 16 | border: 0.5px solid black; 17 | font-size: 1em; 18 | margin: 10px 2px; 19 | } 20 | .token { 21 | background: #CCC; 22 | border-radius: 4px; 23 | padding: 5px; 24 | margin: 10px; 25 | margin-right: 3px; 26 | margin-left: 3px; 27 | float: left; 28 | } 29 | .token-punctuation { 30 | background: #F2C38A; 31 | padding: 5px; 32 | padding-right: 10px; 33 | padding-left: 10px; 34 | } 35 | .token-NONE { 36 | background: #F2C38A; 37 | background: #DDD; 38 | white-space: pre-wrap; 39 | } 40 | .token-COMMA { 41 | background: #FFC107; 42 | white-space: pre-line; 43 | } 44 | .token-PERIOD { 45 | background: #03A9F4; 46 | white-space: pre-line; 47 | } 48 | 49 | .glyphicon.spinning { 50 | animation: spin 1s infinite linear; 51 | -webkit-animation: spin2 1s infinite linear; 52 | } 53 | 54 | @keyframes spin { 55 | from { transform: scale(1) rotate(0deg); } 56 | to { transform: scale(1) rotate(360deg); } 57 | } 58 | 59 | @-webkit-keyframes spin2 { 60 | from { -webkit-transform: rotate(0deg); } 61 | to { -webkit-transform: rotate(360deg); } 62 | } 63 | -------------------------------------------------------------------------------- /python/web_demo/static/main.js: -------------------------------------------------------------------------------- 1 | $(document).ready(function() { 2 | 3 | function stringRepresentation(token) { 4 | token.string = token.punctuation 5 | if (token.punctuation == "NONE") 6 | { 7 | token.string = " " 8 | } 9 | if (token.punctuation == "PERIOD") 10 | { 11 | token.string = "." 12 | } 13 | if (token.punctuation == "COMMA") 14 | { 15 | token.string = "," 16 | } 17 | }; 18 | 19 | function buildProbsString(probs) { 20 | var probs_str = ""; 21 | for (var key in probs) { 22 | probs_str += key + ": " + (probs[key] * 100 ).toFixed(2) + "% " 23 | }; 24 | return probs_str 25 | }; 26 | 27 | function processPunctuationToken(token, resultDiv) { 28 | stringRepresentation(token) 29 | var probs_str = buildProbsString(token.probs); 30 | resultDiv.append("" + token.string + ""); 31 | }; 32 | 33 | function displayLexicalAudioResult(tokens) { 34 | var $resultDivLexicalAudio = $("#punctuation_lexical_audio"); 35 | var $resultDivLexical = $("#punctuation_lexical"); 36 | var $resultDivAudio = $("#punctuation_audio"); 37 | 38 | $resultDivLexicalAudio.empty(""); 39 | $resultDivLexical.empty(""); 40 | $resultDivAudio.empty(""); 41 | 42 | tokens.forEach(function(token) { 43 | if (token.type == "word") { 44 | var tag_str = ""; 45 | for (var key in token.pos) { 46 | tag_str += token.pos[key] + " " 47 | }; 48 | var s = "" + token.token + ""; 49 | $resultDivLexicalAudio.append(s); 50 | $resultDivLexical.append(s); 51 | $resultDivAudio.append(s); 52 | } else if (token.type == "punctuation") { 53 | processPunctuationToken(token.fusion, $resultDivLexicalAudio); 54 | processPunctuationToken(token.lexical, $resultDivLexical); 55 | processPunctuationToken(token.audio, $resultDivAudio); 56 | } 57 | }); 58 | }; 59 | 60 | function displayLexicalResult(tokens) { 61 | var $resultDiv = $("#punctuation"); 62 | $resultDiv.empty(""); 63 | tokens.forEach(function(token) { 64 | if (token.type == "word") { 65 | var tag_str = ""; 66 | for (var key in token.pos) { 67 | tag_str += token.pos[key] + " " 68 | }; 69 | $resultDiv.append("" + token.token + ""); 70 | } else if (token.type == "punctuation") { 71 | 
processPunctuationToken(token, $resultDiv); 72 | } 73 | }); 74 | }; 75 | 76 | 77 | $("#collapse2").on('hidden.bs.collapse', function () { 78 | $('#selection-text-file').val(''); 79 | }); 80 | 81 | $("#punctuate-lexical").click(function() { 82 | var text = { 83 | text: $('#textarea-input').val(), 84 | textfile: $('#selection-text-file').val(), 85 | lexical_folder: $("#selection-lexical-models").val() 86 | }; 87 | $('#loading').show(); 88 | $('#punctuation').empty(); 89 | $.post("/classify_lexical", text, function(response, textStatus) { 90 | $('#loading').hide(); 91 | displayLexicalResult(response); 92 | }, "json") 93 | .fail(function(data) { 94 | console.error(data); 95 | }); 96 | }); 97 | 98 | $("#punctuate-audio-lexical").click(function() { 99 | var setting = { 100 | example: $('#selection-audio-examples').val(), 101 | lexical_folder: $("#selection-lexical-models").val(), 102 | audio_folder: $("#selection-audio-models").val() 103 | }; 104 | $('#loading').show(); 105 | $('#punctuation').empty(); 106 | $.post("/classify_audio_lexical", setting, function(response, textStatus) { 107 | $('#loading').hide(); 108 | displayLexicalAudioResult(response); 109 | }, "json") 110 | .fail(function(data) { 111 | console.error(data); 112 | }); 113 | }); 114 | 115 | $("#selection-lexical-models").on('change', function() { 116 | var setting = { 117 | folder: $("#selection-lexical-models").val() 118 | }; 119 | $.post("/lexical_models", setting, function(response) {}) 120 | .fail(function(data) { 121 | console.error(data); 122 | }); 123 | }); 124 | 125 | $("#selection-audio-models").on('change', function() { 126 | var setting = { 127 | folder: $("#selection-audio-models").val() 128 | }; 129 | 130 | $.post("/audio_models", setting, function(response) {}) 131 | .fail(function(data) { 132 | console.error(data); 133 | }); 134 | }); 135 | 136 | function loadLexicalModels() { 137 | $.get("/lexical_models", function(response) { 138 | response.options.forEach(function(option){ 139 | 140 | if (response.selected === option){ 141 | $('#selection-lexical-models').append($('