├── LICENSE ├── README.md ├── header-annotation ├── 01_extract_headers_from_html.py ├── 02_generate_training_sequences.py ├── 03_train_model.py ├── 04_generate_probabilities.py ├── 05_annotate_headers.py ├── config.ini ├── header_util.py ├── missing_chapter_util.py └── regex_util.py └── segmentation ├── bert_full_window ├── 01_generate_training_sequences.py ├── 02_train_BERT_model.py └── 03_generate_BERT_probabilities.py ├── bert_single_para ├── 01_generate_training_data.py ├── 02_tokenize_sequences.py ├── 03_train_BERT_model.py ├── 04_generate_BERT_probabilities.py └── 05_generate_predictions_dp.py ├── bert_tokenize_test_books.py ├── generate_ground_truth.py ├── metrics └── generate_metrics.py ├── paragraph_to_sentence.py ├── tokenize_books.py └── weighted_overlap ├── compute_densities.py ├── compute_peaks_prominences.py └── get_predictions_dp.py /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/README.md -------------------------------------------------------------------------------- /header-annotation/01_extract_headers_from_html.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/header-annotation/01_extract_headers_from_html.py -------------------------------------------------------------------------------- /header-annotation/02_generate_training_sequences.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/header-annotation/02_generate_training_sequences.py -------------------------------------------------------------------------------- /header-annotation/03_train_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/header-annotation/03_train_model.py -------------------------------------------------------------------------------- /header-annotation/04_generate_probabilities.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/header-annotation/04_generate_probabilities.py -------------------------------------------------------------------------------- /header-annotation/05_annotate_headers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/header-annotation/05_annotate_headers.py -------------------------------------------------------------------------------- /header-annotation/config.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/header-annotation/config.ini -------------------------------------------------------------------------------- /header-annotation/header_util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/header-annotation/header_util.py -------------------------------------------------------------------------------- /header-annotation/missing_chapter_util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/header-annotation/missing_chapter_util.py -------------------------------------------------------------------------------- /header-annotation/regex_util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/header-annotation/regex_util.py -------------------------------------------------------------------------------- /segmentation/bert_full_window/01_generate_training_sequences.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/segmentation/bert_full_window/01_generate_training_sequences.py -------------------------------------------------------------------------------- /segmentation/bert_full_window/02_train_BERT_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/segmentation/bert_full_window/02_train_BERT_model.py -------------------------------------------------------------------------------- /segmentation/bert_full_window/03_generate_BERT_probabilities.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/segmentation/bert_full_window/03_generate_BERT_probabilities.py -------------------------------------------------------------------------------- /segmentation/bert_single_para/01_generate_training_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/segmentation/bert_single_para/01_generate_training_data.py -------------------------------------------------------------------------------- /segmentation/bert_single_para/02_tokenize_sequences.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/segmentation/bert_single_para/02_tokenize_sequences.py -------------------------------------------------------------------------------- /segmentation/bert_single_para/03_train_BERT_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/segmentation/bert_single_para/03_train_BERT_model.py -------------------------------------------------------------------------------- /segmentation/bert_single_para/04_generate_BERT_probabilities.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/segmentation/bert_single_para/04_generate_BERT_probabilities.py -------------------------------------------------------------------------------- /segmentation/bert_single_para/05_generate_predictions_dp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/segmentation/bert_single_para/05_generate_predictions_dp.py -------------------------------------------------------------------------------- /segmentation/bert_tokenize_test_books.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/segmentation/bert_tokenize_test_books.py -------------------------------------------------------------------------------- /segmentation/generate_ground_truth.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/segmentation/generate_ground_truth.py -------------------------------------------------------------------------------- /segmentation/metrics/generate_metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/segmentation/metrics/generate_metrics.py -------------------------------------------------------------------------------- /segmentation/paragraph_to_sentence.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/segmentation/paragraph_to_sentence.py -------------------------------------------------------------------------------- /segmentation/tokenize_books.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/segmentation/tokenize_books.py -------------------------------------------------------------------------------- /segmentation/weighted_overlap/compute_densities.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/segmentation/weighted_overlap/compute_densities.py -------------------------------------------------------------------------------- /segmentation/weighted_overlap/compute_peaks_prominences.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/segmentation/weighted_overlap/compute_peaks_prominences.py -------------------------------------------------------------------------------- /segmentation/weighted_overlap/get_predictions_dp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbu-dsl/chapter-captor/HEAD/segmentation/weighted_overlap/get_predictions_dp.py --------------------------------------------------------------------------------