├── .dockerignore
├── .gitattributes
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug-report.md
│   │   └── feature-request.md
│   └── pull_request_template.md
├── .gitignore
├── .readthedocs.yml
├── .travis.yml
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── README.md
├── benchmark
│   ├── BENCHMARK_SPECS.yml
│   ├── BENCHMARK_SPECS_DEBUG.yml
│   ├── README.md
│   ├── benchmark_output
│   │   ├── class_imbalance
│   │   │   ├── ALBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── BERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── DistilBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── FastText
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── GPT
│   │   │   │   ├── ERROR
│   │   │   │   ├── output.md
│   │   │   │   └── run-meta.json
│   │   │   ├── GPT2
│   │   │   │   ├── ERROR
│   │   │   │   ├── output.md
│   │   │   │   └── run-meta.json
│   │   │   ├── MTDNN
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── SKLearn
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM-RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLNet
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── class_imbalance.md
│   │   │   ├── spaCy
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   └── spacy-transformers
│   │   │       ├── output.md
│   │   │       ├── plot.png
│   │   │       └── run-meta.json
│   │   ├── data_augmentation
│   │   │   ├── BERTMaskedLM
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── MarianMT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── Word2Vec
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── WordNet
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   └── data_augmentation.md
│   │   ├── document_windowing
│   │   │   ├── BERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   └── document_windowing.md
│   │   ├── imdb
│   │   │   ├── ALBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── BERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── DistilBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── FastText
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── MTDNN
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── SKLearn
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM-RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLNet
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── imdb.md
│   │   │   ├── spaCy
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   └── spacy-transformers
│   │   │       ├── output.md
│   │   │       ├── plot.png
│   │   │       └── run-meta.json
│   │   ├── imdb_embed
│   │   │   ├── ALBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── BERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── DistilBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── ELECTRA
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── GPT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── GPT2
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── T5Model
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── TransformerXL
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── USE
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM-RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLNet
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── imdb_embed.md
│   │   │   ├── spaCy
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   └── spacy-transformers
│   │   │       ├── output.md
│   │   │       ├── plot.png
│   │   │       └── run-meta.json
│   │   ├── low_resource
│   │   │   ├── ALBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── BERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── DistilBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── FastText
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── MTDNN
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── SKLearn
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM-RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLNet
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── low_resource.md
│   │   │   └── spaCy
│   │   │       ├── output.md
│   │   │       ├── plot.png
│   │   │       └── run-meta.json
│   │   ├── moviesummary
│   │   │   ├── ALBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── DistilBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── FastText
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── SKLearn
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM-RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLNet
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── moviesummary.md
│   │   │   ├── spaCy
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   └── spacy-transformers
│   │   │       ├── output.md
│   │   │       ├── plot.png
│   │   │       └── run-meta.json
│   │   ├── newsgroups
│   │   │   ├── ALBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── BERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── DistilBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── FastText
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── MTDNN
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── SKLearn
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM-RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLNet
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── newsgroups.md
│   │   │   ├── spaCy
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   └── spacy-transformers
│   │   │       ├── output.md
│   │   │       ├── plot.png
│   │   │       └── run-meta.json
│   │   └── newsgroups_embed
│   │       ├── ALBERT
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── BERT
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── DistilBERT
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── ELECTRA
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── GPT
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── GPT2
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── RoBERTa
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── T5Model
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── TransformerXL
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── USE
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── XLM-RoBERTa
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── XLM
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── XLNet
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── newsgroups_embed.md
│   │       ├── sklearn_TF-IDF
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── spaCy
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       └── spacy-transformers
│   │           ├── output.md
│   │           ├── plot.png
│   │           └── run-meta.json
│   ├── benchmark_util.py
│   ├── docker-compose.yml
│   ├── docker
│   │   └── Dockerfile
│   ├── requirements.txt
│   ├── run_benchmarks.py
│   ├── run_benchmarks.sh
│   └── scenario.py
├── ci-gpu
│   └── docker-compose.yml
├── ci
│   └── docker-compose.yml
├── conftest.py
├── docker-compose.yml
├── docs
│   ├── Makefile
│   ├── _static
│   │   ├── .gitkeep
│   │   ├── gobbli_app.svg
│   │   ├── gobbli_favicon.ico
│   │   └── gobbli_lg.svg
│   ├── advanced_usage.rst
│   ├── api.rst
│   ├── conf.py
│   ├── img
│   │   └── interactive_apps
│   │       ├── evaluate
│   │       │   └── evaluate.png
│   │       ├── explain
│   │       │   ├── explain.png
│   │       │   └── explain_output.png
│   │       └── explore
│   │           ├── explore.png
│   │           ├── explore_embeddings.png
│   │           ├── explore_topic_model.png
│   │           └── explore_trained_embeddings.png
│   ├── index.rst
│   ├── interactive_apps.rst
│   ├── make.bat
│   ├── prerequisites.rst
│   ├── quickstart.rst
│   ├── requirements.txt
│   └── troubleshooting.rst
├── generate_docs.sh
├── gobbli
│   ├── __init__.py
│   ├── augment
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── bert
│   │   │   ├── Dockerfile
│   │   │   ├── __init__.py
│   │   │   ├── model.py
│   │   │   └── src
│   │   │       └── augment_text.py
│   │   ├── marian
│   │   │   ├── Dockerfile
│   │   │   ├── __init__.py
│   │   │   ├── model.py
│   │   │   └── src
│   │   │       └── backtranslate_text.py
│   │   ├── word2vec.py
│   │   └── wordnet.py
│   ├── cli.py
│   ├── dataset
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── cmu_movie_summary.py
│   │   ├── imdb.py
│   │   ├── nested_file.py
│   │   ├── newsgroups.py
│   │   └── trivial.py
│   ├── docker.py
│   ├── experiment
│   │   ├── __init__.py
│   │   ├── base.py
│   │   └── classification.py
│   ├── inspect
│   │   ├── __init__.py
│   │   └── evaluate.py
│   ├── interactive
│   │   ├── evaluate.py
│   │   ├── explain.py
│   │   ├── explore.py
│   │   └── util.py
│   ├── io.py
│   ├── model
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── bert
│   │   │   ├── Dockerfile
│   │   │   ├── __init__.py
│   │   │   ├── model.py
│   │   │   └── src
│   │   │       ├── .gitignore
│   │   │       ├── CONTRIBUTING.md
│   │   │       ├── LICENSE
│   │   │       ├── README.md
│   │   │       ├── __init__.py
│   │   │       ├── create_pretraining_data.py
│   │   │       ├── extract_features.py
│   │   │       ├── modeling.py
│   │   │       ├── modeling_test.py
│   │   │       ├── multilingual.md
│   │   │       ├── optimization.py
│   │   │       ├── optimization_test.py
│   │   │       ├── predicting_movie_reviews_with_bert_on_tf_hub.ipynb
│   │   │       ├── requirements.txt
│   │   │       ├── run_classifier.py
│   │   │       ├── run_classifier_with_tfhub.py
│   │   │       ├── run_pretraining.py
│   │   │       ├── run_squad.py
│   │   │       ├── sample_text.txt
│   │   │       ├── tokenization.py
│   │   │       └── tokenization_test.py
│   │   ├── context.py
│   │   ├── fasttext
│   │   │   ├── Dockerfile
│   │   │   ├── __init__.py
│   │   │   └── model.py
│   │   ├── majority.py
│   │   ├── mixin.py
│   │   ├── mtdnn
│   │   │   ├── Dockerfile
│   │   │   ├── __init__.py
│   │   │   ├── model.py
│   │   │   └── src
│   │   │       ├── .gitignore
│   │   │       ├── LICENSE
│   │   │       ├── README.md
│   │   │       ├── config
│   │   │       │   └── tasks_config.json
│   │   │       ├── data_utils
│   │   │       │   ├── __init__.py
│   │   │       │   ├── glue_utils.py
│   │   │       │   ├── label_map.py
│   │   │       │   ├── log_wrapper.py
│   │   │       │   ├── metrics.py
│   │   │       │   ├── utils.py
│   │   │       │   └── vocab.py
│   │   │       ├── docker
│   │   │       │   └── Dockerfile
│   │   │       ├── download.sh
│   │   │       ├── gobbli_train.py
│   │   │       ├── module
│   │   │       │   ├── __init__.py
│   │   │       │   ├── bert_optim.py
│   │   │       │   ├── common.py
│   │   │       │   ├── dropout_wrapper.py
│   │   │       │   ├── my_optim.py
│   │   │       │   ├── san.py
│   │   │       │   ├── similarity.py
│   │   │       │   └── sub_layers.py
│   │   │       ├── mt_dnn
│   │   │       │   ├── __init__.py
│   │   │       │   ├── batcher.py
│   │   │       │   ├── gobbli_batcher.py
│   │   │       │   ├── gobbli_model.py
│   │   │       │   ├── matcher.py
│   │   │       │   └── model.py
│   │   │       ├── prepro.py
│   │   │       ├── requirements.txt
│   │   │       ├── run_toy.sh
│   │   │       ├── scripts
│   │   │       │   ├── domain_adaptation_run.sh
│   │   │       │   ├── run_mt_dnn.sh
│   │   │       │   ├── run_rte.sh
│   │   │       │   ├── run_stsb.sh
│   │   │       │   ├── scitail_domain_adaptation_bash.sh
│   │   │       │   ├── snli_domain_adaptation_bash.sh
│   │   │       │   └── strip_model.py
│   │   │       └── train.py
│   │   ├── random.py
│   │   ├── sklearn
│   │   │   ├── __init__.py
│   │   │   └── model.py
│   │   ├── spacy
│   │   │   ├── Dockerfile
│   │   │   ├── __init__.py
│   │   │   ├── model.py
│   │   │   └── src
│   │   │       ├── requirements.txt
│   │   │       └── run_spacy.py
│   │   ├── transformer
│   │   │   ├── Dockerfile
│   │   │   ├── __init__.py
│   │   │   ├── model.py
│   │   │   └── src
│   │   │       ├── requirements.txt
│   │   │       └── run_model.py
│   │   └── use
│   │       ├── Dockerfile
│   │       ├── __init__.py
│   │       ├── model.py
│   │       └── src
│   │           ├── requirements.txt
│   │           └── use.py
│   ├── test
│   │   ├── __init__.py
│   │   ├── augment
│   │   │   ├── __init__.py
│   │   │   ├── test_bertmaskedlm.py
│   │   │   ├── test_marian.py
│   │   │   ├── test_word2vec.py
│   │   │   └── test_wordnet.py
│   │   ├── classification
│   │   │   ├── __init__.py
│   │   │   ├── test_classifiers.py
│   │   │   └── test_embeddings.py
│   │   ├── dataset
│   │   │   ├── __init__.py
│   │   │   ├── test_base_dataset.py
│   │   │   ├── test_cmu_movie_summary.py
│   │   │   ├── test_imdb.py
│   │   │   └── test_newsgroups.py
│   │   ├── experiment
│   │   │   ├── __init__.py
│   │   │   ├── test_base_experiment.py
│   │   │   └── test_classification_experiment.py
│   │   ├── inspect
│   │   │   └── test_evaluate.py
│   │   ├── interactive
│   │   │   ├── __init__.py
│   │   │   └── test_util.py
│   │   ├── model
│   │   │   ├── __init__.py
│   │   │   ├── test_base_model.py
│   │   │   ├── test_bert.py
│   │   │   ├── test_fasttext.py
│   │   │   ├── test_mtdnn.py
│   │   │   ├── test_sklearn.py
│   │   │   ├── test_spacy.py
│   │   │   ├── test_transformer.py
│   │   │   └── test_use.py
│   │   ├── test_io.py
│   │   ├── test_util.py
│   │   └── util.py
│   └── util.py
├── img
│   ├── gobbli_app.svg
│   └── gobbli_lg.svg
├── meta.json
├── paper
│   ├── README.md
│   ├── paper.bib
│   └── paper.md
├── pyproject.toml
├── pytest.ini
├── requirements.txt
├── run_ci.sh
├── run_dist.sh
├── setup.cfg
├── setup.py
└── test_remote_gpu.sh

--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
**/.ipynb_checkpoints
**/__pycache__
*.py[cod]
**/build
**/dist

**/.tox
**/.eggs
**/gobbli.egg-info
**/.hypothesis
**/.mypy_cache
**/__pycache__
*.py[cod]
**/.test_cache

benchmark/benchmark_data/
benchmark/benchmark_meta/
benchmark/benchmark_gobbli/

scratch/
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
*.ipynb linguist-documentation
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug-report.md:
--------------------------------------------------------------------------------
---
name: "Bug Report"
about: Submit a bug report
---

## System Information
- OS platform and distribution (e.g. Linux Ubuntu 16.04):
- gobbli version:
- Python version:
- Other information relevant to the problem (GPU model, Docker version, etc):

## Description


## Code for Minimal Reproducible Example


## Output


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.md:
--------------------------------------------------------------------------------
---
name: "Feature Request"
about: "Submit a request for a new feature."
---

## Feature


## Motivation


## Additional Details


--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
## Description of Changes


## Related Issue(s), if any


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.ipynb_checkpoints/
__pycache__/
*.py[cod]
build/
dist/
docs/_build/
docs/auto/

.tox/
.eggs/
gobbli.egg-info/
.hypothesis/
.coverage
.mypy_cache/
.pytest_cache/
pip-wheel-metadata/

.test_cache/

benchmark/benchmark_data/
benchmark/benchmark_meta/
benchmark/benchmark_gobbli/
benchmark/benchmark_output_debug/

scratch/
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
version: 2

python:
  version: 3.7
  install:
    - method: pip
      path: .
    - requirements: docs/requirements.txt

sphinx:
  configuration: docs/conf.py
  fail_on_warning: true
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: minimal

env:
  jobs:
    - PYTHON_VERSION=3.7
    - PYTHON_VERSION=3.8

services:
  - docker

install:
  - cd ci
  - docker-compose build gobbli-ci

script:
  - docker-compose run --rm gobbli-ci
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# Contributing

Thanks for your contribution! We're starting out with a few simple guidelines:

## Contributor License Agreement

You must [sign a Contributor License Agreement](https://www.clahub.com/agreements/RTIInternational/gobbli) (CLA) to contribute to this project.

## Code Style

We use a few linting tools to enforce consistency of style and formatting, which are run in CI. Make sure your code passes the pre-test checks in `run_ci.sh` before you push it. Additionally:

- Pretty much any code added under the main gobbli codebase should have type hints. Code run as part of model Docker containers is exempt from this guideline but should still be formatted.
- Add docstrings where appropriate (especially public interface functions), and use Sphinx references to link to other parts of the project where needed (see the sketch below).
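For illustration, here is a minimal sketch of the expected style; the function is hypothetical and not part of gobbli:

```python
from pathlib import Path
from typing import List


def read_labels(label_file: Path) -> List[str]:
    """
    Read a newline-delimited list of class labels from disk.

    :param label_file: Path to the label file.

    :returns: The labels, in file order.
    """
    return label_file.read_text().splitlines()
```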
## Tests

A lot of the functionality in gobbli is difficult to test (large models, long runtimes, complex functions). We don't test every edge case, but try to make sure there's at least a black box test verifying end-to-end success of any new code you add. White box testing is appreciated where feasible.
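As a sketch of the kind of black box test meant here -- reusing the hypothetical `read_labels` helper from above, not real gobbli code (gobbli's actual tests live under `gobbli/test/`):

```python
def test_read_labels_end_to_end(tmp_path):
    # Black box: exercise only the public interface -- write a file,
    # call the function, and check the end-to-end result.
    label_file = tmp_path / "labels.txt"
    label_file.write_text("positive\nnegative\n")

    assert read_labels(label_file) == ["positive", "negative"]
```

(`tmp_path` is the built-in pytest fixture providing a per-test temporary directory.)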
## Code Reviews

All submissions must come in the form of PRs against the master branch and will be reviewed. If possible, ensure your patch only implements/changes one thing.
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
ARG PYTHON_VERSION=3.7
FROM python:${PYTHON_VERSION}

COPY ./setup.py ./meta.json ./requirements.txt ./README.md /code/
COPY ./docs/requirements.txt /code/docs/requirements.txt

WORKDIR /code
RUN pip install --upgrade pip \
    && pip install -e '.[augment,tokenize,interactive]' \
    && pip install -r requirements.txt \
    && pip install -r docs/requirements.txt

COPY ./ /code
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include LICENSE
include README.md
include meta.json
# We have some .py files that are needed but aren't part of the package, so make sure they're included
recursive-include gobbli *.py
recursive-include gobbli Dockerfile
recursive-include gobbli requirements.txt
--------------------------------------------------------------------------------
/benchmark/README.md:
--------------------------------------------------------------------------------
# gobbli Benchmarks

This directory contains benchmarking code and output for various aspects of gobbli model performance.

To run the benchmarks (note -- this may take several days depending on available computing resources):

    ./run_benchmarks.sh

To run with GPU support enabled:

    export GOBBLI_USE_GPU=1
    ./run_benchmarks.sh

Use `--help` to see additional arguments in case you want to debug individual benchmarks, force re-running, etc.:

    ./run_benchmarks.sh --help
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/ALBERT/output.md:
--------------------------------------------------------------------------------
# Results: ALBERT
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 1 | 0.05 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 2 | 0.1 | 0.737548 | 0.807997 | 0.74992 | 0.74992 | 0.680564 | 0.794531 |
| 3 | 0.25 | 0.820206 | 0.837869 | 0.82228 | 0.82228 | 0.800896 | 0.839516 |
| 4 | 0.33 | 0.847492 | 0.851868 | 0.84792 | 0.84792 | 0.839415 | 0.855569 |
| 5 | 0.5 | 0.856016 | 0.856278 | 0.85604 | 0.85604 | 0.854156 | 0.857876 |
![Results](ALBERT/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/ALBERT/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/ALBERT/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/ALBERT/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "ALBERT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["albert-base-v1", "albert-base-v2"], "transformer_model": ["Albert"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/BERT/output.md:
--------------------------------------------------------------------------------
# Results: BERT
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.460844 | 0.759732 | 0.56296 | 0.56296 | 0.226204 | 0.695485 |
| 1 | 0.05 | 0.684874 | 0.802653 | 0.7092 | 0.7092 | 0.597319 | 0.772428 |
| 2 | 0.1 | 0.761841 | 0.829851 | 0.77224 | 0.77224 | 0.712075 | 0.811607 |
| 3 | 0.25 | 0.852626 | 0.865441 | 0.8538 | 0.8538 | 0.839475 | 0.865778 |
| 4 | 0.33 | 0.868724 | 0.874132 | 0.86916 | 0.86916 | 0.861157 | 0.876291 |
| 5 | 0.5 | 0.88172 | 0.881725 | 0.88172 | 0.88172 | 0.881933 | 0.881507 |
![Results](BERT/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/BERT/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/BERT/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/BERT/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "BERT", "model_name": "BERT", "param_grid": {"bert_model": ["bert-base-uncased", "bert-base-cased", "scibert-uncased"], "max_seq_length": [128]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/DistilBERT/output.md:
--------------------------------------------------------------------------------
# Results: DistilBERT
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.442551 | 0.754912 | 0.55296 | 0.55296 | 0.194464 | 0.690638 |
| 1 | 0.05 | 0.666977 | 0.795581 | 0.69524 | 0.69524 | 0.569961 | 0.763993 |
| 2 | 0.1 | 0.766211 | 0.825528 | 0.77524 | 0.77524 | 0.720267 | 0.812155 |
| 3 | 0.25 | 0.83463 | 0.852239 | 0.83648 | 0.83648 | 0.817141 | 0.85212 |
| 4 | 0.33 | 0.851158 | 0.85668 | 0.85168 | 0.85168 | 0.842347 | 0.85997 |
| 5 | 0.5 | 0.865635 | 0.865692 | 0.86564 | 0.86564 | 0.864834 | 0.866436 |
![Results](DistilBERT/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/DistilBERT/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/DistilBERT/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/DistilBERT/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "DistilBERT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["distilbert-base-uncased", "distilbert-base-uncased-distilled-squad"], "transformer_model": ["DistilBert"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/FastText/output.md:
--------------------------------------------------------------------------------
# Results: FastText
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 1 | 0.05 | 0.586303 | 0.783703 | 0.63904 | 0.63904 | 0.438596 | 0.734009 |
| 2 | 0.1 | 0.723779 | 0.817785 | 0.74056 | 0.74056 | 0.655696 | 0.791862 |
| 3 | 0.25 | 0.853318 | 0.868819 | 0.85472 | 0.85472 | 0.838979 | 0.867658 |
| 4 | 0.33 | 0.873252 | 0.879321 | 0.87372 | 0.87372 | 0.865551 | 0.880953 |
| 5 | 0.5 | 0.89008 | 0.89008 | 0.89008 | 0.89008 | 0.890054 | 0.890106 |
![Results](FastText/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/FastText/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/FastText/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/FastText/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "FastText", "model_name": "FastText", "param_grid": {"word_ngrams": [1, 2], "dim": [100, 300], "lr": [0.5, 1.0]}, "preprocess_func": "fasttext_preprocess", "run_kwargs": {}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/GPT/ERROR:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/GPT/ERROR
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/GPT/output.md:
--------------------------------------------------------------------------------
# ERROR: Exception during run 'GPT'.

```
  File "/data/users/jnance/gobbli/benchmark/scenario.py", line 153, in run
    output = self._do_run(run, run_output_dir)

  File "/data/users/jnance/gobbli/benchmark/scenario.py", line 430, in _do_run
    run_kwargs=run.run_kwargs,

  File "/data/users/jnance/gobbli/benchmark/benchmark_util.py", line 214, in run_benchmark_experiment
    return exp.run(**run_kwargs)

  File "/code/gobbli/experiment/classification.py", line 659, in run
    for params in grid

  File "/usr/local/lib/python3.7/site-packages/ray/worker.py", line 2247, in get
    raise value

RayTaskError: ray_worker:gobbli.experiment.classification.train() (pid=6724, host=0dd5b0b5e4f4)
  File "/code/gobbli/experiment/classification.py", line 550, in train
    train_output = clf.train(train_input)
  File "/code/gobbli/model/mixin.py", line 104, in train
    _run_task(self._train, train_input, self.train_dir(), train_dir_name),
  File "/code/gobbli/model/mixin.py", line 42, in _run_task
    task_output = cast(gobbli.io.TaskIO, task_func(task_input, context))
  File "/code/gobbli/model/transformer/model.py", line 254, in _train
    self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
  File "/code/gobbli/docker.py", line 113, in run_container
    raise RuntimeError(err_msg)
RuntimeError: Error running container (return code 1). Last 20 lines of logs:
Using device: cuda
Number of GPUs: 1
Initializing transformer...
ray_worker:gobbli.experiment.classification.train() (pid=6724, host=0dd5b0b5e4f4)
  File "run_model.py", line 461, in <module>
Model: GPTForSequenceClassification
Weights: openai-gpt
Tokenizer: GPTTokenizer
Config: GPTConfig
    model_cls = getattr(transformers, model_name)
AttributeError: module 'transformers' has no attribute 'GPTForSequenceClassification'


```
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/GPT/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "GPT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["openai-gpt"], "transformer_model": ["GPT"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/GPT2/ERROR:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/GPT2/ERROR
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/GPT2/output.md:
--------------------------------------------------------------------------------
# ERROR: Exception during run 'GPT2'.

```
  File "/data/users/jnance/gobbli/benchmark/scenario.py", line 153, in run
    output = self._do_run(run, run_output_dir)

  File "/data/users/jnance/gobbli/benchmark/scenario.py", line 430, in _do_run
    run_kwargs=run.run_kwargs,

  File "/data/users/jnance/gobbli/benchmark/benchmark_util.py", line 214, in run_benchmark_experiment
    return exp.run(**run_kwargs)

  File "/code/gobbli/experiment/classification.py", line 659, in run
    for params in grid

  File "/usr/local/lib/python3.7/site-packages/ray/worker.py", line 2247, in get
    raise value

RayTaskError: ray_worker:gobbli.experiment.classification.train() (pid=6870, host=0dd5b0b5e4f4)
  File "/code/gobbli/experiment/classification.py", line 550, in train
    train_output = clf.train(train_input)
  File "/code/gobbli/model/mixin.py", line 104, in train
    _run_task(self._train, train_input, self.train_dir(), train_dir_name),
  File "/code/gobbli/model/mixin.py", line 42, in _run_task
    task_output = cast(gobbli.io.TaskIO, task_func(task_input, context))
  File "/code/gobbli/model/transformer/model.py", line 254, in _train
    self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
  File "/code/gobbli/docker.py", line 113, in run_container
    raise RuntimeError(err_msg)
RuntimeError: Error running container (return code 1). Last 20 lines of logs:
Using device: cuda
Number of GPUs: 1
Initializing transformer...
Model: GPT2ForSequenceClassification
Weights: gpt2
Tokenizer: GPT2Tokenizer
Config: GPT2Config
ray_worker:gobbli.experiment.classification.train() (pid=6870, host=0dd5b0b5e4f4)
  File "run_model.py", line 461, in <module>
    model_cls = getattr(transformers, model_name)
AttributeError: module 'transformers' has no attribute 'GPT2ForSequenceClassification'


```
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/GPT2/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "GPT2", "model_name": "Transformer", "param_grid": {"transformer_weights": ["gpt2"], "transformer_model": ["GPT2"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/MTDNN/output.md:
--------------------------------------------------------------------------------
# Results: MTDNN
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.514917 | 0.772073 | 0.5942 | 0.5942 | 0.318807 | 0.711026 |
| 1 | 0.05 | 0.729578 | 0.82128 | 0.74552 | 0.74552 | 0.66392 | 0.795237 |
| 2 | 0.1 | 0.806281 | 0.849677 | 0.81156 | 0.81156 | 0.774302 | 0.83826 |
| 3 | 0.25 | 0.853252 | 0.866297 | 0.85444 | 0.85444 | 0.840051 | 0.866454 |
| 4 | 0.33 | 0.868913 | 0.873457 | 0.86928 | 0.86928 | 0.861982 | 0.875845 |
| 5 | 0.5 | 0.883435 | 0.883508 | 0.88344 | 0.88344 | 0.882661 | 0.884209 |
![Results](MTDNN/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/MTDNN/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/MTDNN/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/MTDNN/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "MTDNN", "model_name": "MTDNN", "param_grid": {"mtdnn_model": ["mt-dnn-base"], "max_seq_length": [128]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/RoBERTa/output.md:
--------------------------------------------------------------------------------
# Results: RoBERTa
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 1 | 0.05 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 2 | 0.1 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 3 | 0.25 | 0.865232 | 0.876027 | 0.86612 | 0.86612 | 0.854295 | 0.87617 |
| 4 | 0.33 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 5 | 0.5 | 0.8874 | 0.8874 | 0.8874 | 0.8874 | 0.887341 | 0.887459 |
![Results](RoBERTa/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/RoBERTa/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/RoBERTa/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/RoBERTa/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "RoBERTa", "model_name": "Transformer", "param_grid": {"transformer_weights": ["roberta-base"], "transformer_model": ["Roberta"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/SKLearn/output.md:
--------------------------------------------------------------------------------
# Results: SKLearn
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 1 | 0.05 | 0.35963 | 0.753057 | 0.51208 | 0.51208 | 0.0471801 | 0.672079 |
| 2 | 0.1 | 0.493072 | 0.77027 | 0.58156 | 0.58156 | 0.281278 | 0.704867 |
| 3 | 0.25 | 0.766716 | 0.832396 | 0.77652 | 0.77652 | 0.718893 | 0.814539 |
| 4 | 0.33 | 0.830675 | 0.855935 | 0.83336 | 0.83336 | 0.809354 | 0.851997 |
| 5 | 0.5 | 0.88048 | 0.880483 | 0.88048 | 0.88048 | 0.880317 | 0.880642 |
![Results](SKLearn/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/SKLearn/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/SKLearn/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/SKLearn/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "SKLearn", "model_name": "SKLearnClassifier", "param_grid": {}, "preprocess_func": "fasttext_preprocess", "run_kwargs": {}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/XLM-RoBERTa/output.md:
--------------------------------------------------------------------------------
# Results: XLM-RoBERTa
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 1 | 0.05 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 2 | 0.1 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 3 | 0.25 | 0.835274 | 0.854145 | 0.83724 | 0.83724 | 0.81728 | 0.853269 |
| 4 | 0.33 | 0.849965 | 0.85965 | 0.85088 | 0.85088 | 0.838251 | 0.86168 |
| 5 | 0.5 | 0.869491 | 0.870338 | 0.86956 | 0.86956 | 0.8665 | 0.872483 |
![Results](XLM-RoBERTa/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/XLM-RoBERTa/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/XLM-RoBERTa/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/XLM-RoBERTa/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "XLM-RoBERTa", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlm-roberta-base"], "transformer_model": ["XLMRoberta"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/XLM/output.md:
--------------------------------------------------------------------------------
# Results: XLM
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 1 | 0.05 | 0.502425 | 0.744373 | 0.58404 | 0.58404 | 0.300908 | 0.703943 |
| 2 | 0.1 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 3 | 0.25 | 0.583932 | 0.674197 | 0.61768 | 0.61768 | 0.465436 | 0.702428 |
| 4 | 0.33 | 0.81156 | 0.815876 | 0.81212 | 0.81212 | 0.801286 | 0.821834 |
| 5 | 0.5 | 0.333333 | 0.25 | 0.5 | 0.5 | 0.666667 | 0 |
![Results](XLM/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/XLM/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/XLM/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/XLM/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "XLM", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlm-mlm-tlm-xnli15-1024", "xlm-clm-ende-1024"], "transformer_model": ["XLM"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/XLNet/output.md:
--------------------------------------------------------------------------------
# Results: XLNet
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 1 | 0.05 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 2 | 0.1 | 0.785112 | 0.842044 | 0.79284 | 0.79284 | 0.744361 | 0.825863 |
| 3 | 0.25 | 0.868935 | 0.878797 | 0.86972 | 0.86972 | 0.85879 | 0.879079 |
| 4 | 0.33 | 0.878497 | 0.882614 | 0.8788 | 0.8788 | 0.872432 | 0.884563 |
| 5 | 0.5 | 0.891879 | 0.891891 | 0.89188 | 0.89188 | 0.89216 | 0.891598 |
![Results](XLNet/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/XLNet/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/XLNet/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/XLNet/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "XLNet", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlnet-base-cased"], "transformer_model": ["XLNet"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/spaCy/output.md:
--------------------------------------------------------------------------------
# Results: spaCy
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.340035 | 0.744259 | 0.503 | 0.503 | 0.0120856 | 0.667985 |
| 1 | 0.05 | 0.643363 | 0.788313 | 0.6776 | 0.6776 | 0.532862 | 0.753863 |
| 2 | 0.1 | 0.740206 | 0.82189 | 0.75392 | 0.75392 | 0.680515 | 0.799896 |
| 3 | 0.25 | 0.860444 | 0.86873 | 0.86116 | 0.86116 | 0.850446 | 0.870442 |
| 4 | 0.33 | 0.883307 | 0.888066 | 0.88364 | 0.88364 | 0.877076 | 0.889539 |
| 5 | 0.5 | 0.89692 | 0.896925 | 0.89692 | 0.89692 | 0.897105 | 0.896734 |
![Results](spaCy/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/spaCy/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/spaCy/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/spaCy/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "spaCy", "model_name": "SpaCyModel", "param_grid": {"model": ["en_core_web_sm", "en_core_web_lg"], "architecture": ["bow", "simple_cnn", "ensemble"]}, "preprocess_func": null, "run_kwargs": {}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/spacy-transformers/output.md:
--------------------------------------------------------------------------------
# Results: spacy-transformers
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 1 | 0.05 | 0.539247 | 0.763022 | 0.6074 | 0.6074 | 0.362041 | 0.716452 |
| 2 | 0.1 | 0.662839 | 0.792193 | 0.6918 | 0.6918 | 0.564024 | 0.761654 |
| 3 | 0.25 | 0.800328 | 0.823934 | 0.80348 | 0.80348 | 0.775241 | 0.825415 |
| 4 | 0.33 | 0.820368 | 0.830668 | 0.8216 | 0.8216 | 0.805495 | 0.835242 |
| 5 | 0.5 | 0.838622 | 0.838789 | 0.83864 | 0.83864 | 0.840314 | 0.836931 |
![Results](spacy-transformers/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/spacy-transformers/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/spacy-transformers/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/spacy-transformers/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "spacy-transformers", "model_name": "SpaCyModel", "param_grid": {"model": ["en_trf_bertbaseuncased_lg", "en_trf_xlnetbasecased_lg", "en_trf_robertabase_lg", "en_trf_distilbertbaseuncased_lg"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/data_augmentation/BERTMaskedLM/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/data_augmentation/BERTMaskedLM/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/data_augmentation/BERTMaskedLM/run-meta.json:
--------------------------------------------------------------------------------
{"percent_multipliers": [[0.005, 0], [0.005, 1], [0.005, 5], [0.005, 10], [0.05, 0], [0.05, 1], [0.05, 5], [0.05, 10], [0.33, 0], [0.33, 1], [0.33, 5], [0.75, 0], [0.75, 1], [0.75, 5]], "model_name": "FastText", "param_grid": {"word_ngrams": [1], "autotune_duration": [120]}, "preprocess_func": "fasttext_preprocess", "augment_probability": 0.15, "augment_name": "BERTMaskedLM", "params": {}}
"params": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/data_augmentation/MarianMT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/data_augmentation/MarianMT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/data_augmentation/MarianMT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"percent_multipliers": [[0.005, 0], [0.005, 1], [0.005, 5], [0.005, 10], [0.05, 0], [0.05, 1], [0.05, 5], [0.05, 10], [0.33, 0], [0.33, 1], [0.33, 5], [0.75, 0], [0.75, 1], [0.75, 5]], "model_name": "FastText", "param_grid": {"word_ngrams": [1], "autotune_duration": [120]}, "preprocess_func": "fasttext_preprocess", "augment_probability": 0.15, "augment_name": "MarianMT", "params": {"target_languages": ["french", "german", "japanese", "russian", "italian", "portugese", "dutch", "indonesian", "ukrainian", "swedish"]}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/data_augmentation/Word2Vec/output.md: -------------------------------------------------------------------------------- 1 | # Results: Word2Vec 2 | | | percent | multiplier | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|----------:|-------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 0 | 0.333333 | 0.25 | 0.5 | 0.5 | 5 | | 1 | 0.005 | 1 | 0.615251 | 0.669323 | 0.63484 | 0.63484 | 6 | | 2 | 0.005 | 5 | 0.711989 | 0.712191 | 0.71204 | 0.71204 | 7 | | 3 | 0.005 | 10 | 0.695156 | 0.695169 | 0.69516 | 0.69516 | 8 | | 4 | 0.05 | 0 | 0.801613 | 0.802089 | 0.80168 | 0.80168 | 9 | | 5 | 0.05 | 1 | 0.811721 | 0.813553 | 0.81196 | 0.81196 | 10 | | 6 | 0.05 | 5 | 0.814299 | 0.81446 | 0.81432 | 0.81432 | 11 | | 7 | 0.05 | 10 | 0.816025 | 0.816146 | 0.81604 | 0.81604 | 12 | | 8 | 0.33 | 0 | 0.857272 | 0.857361 | 0.85728 | 0.85728 | 13 | | 9 | 0.33 | 1 | 0.857917 | 0.857955 | 0.85792 | 0.85792 | 14 | | 10 | 0.33 | 5 | 0.85955 | 0.859666 | 0.85956 | 0.85956 | 15 | | 11 | 0.75 | 0 | 0.8724 | 0.8724 | 0.8724 | 0.8724 | 16 | | 12 | 0.75 | 1 | 0.874479 | 0.874492 | 0.87448 | 0.87448 | 17 | | 13 | 0.75 | 5 | 0.869273 | 0.869364 | 0.86928 | 0.86928 | 18 | ![Results](Word2Vec/plot.png) 19 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/data_augmentation/Word2Vec/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/data_augmentation/Word2Vec/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/data_augmentation/Word2Vec/run-meta.json: -------------------------------------------------------------------------------- 1 | {"percent_multipliers": [[0.005, 0], [0.005, 1], [0.005, 5], [0.005, 10], [0.05, 0], [0.05, 1], [0.05, 5], [0.05, 10], [0.33, 0], [0.33, 1], [0.33, 5], [0.75, 0], [0.75, 1], [0.75, 5]], "model_name": "FastText", "param_grid": {"word_ngrams": [1], "autotune_duration": [120]}, "preprocess_func": "fasttext_preprocess", "augment_probability": 0.15, 
"augment_name": "Word2Vec", "params": {"model": "glove.6B.300d", "tokenizer": "SPACY"}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/data_augmentation/WordNet/output.md: -------------------------------------------------------------------------------- 1 | # Results: WordNet 2 | | | percent | multiplier | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|----------:|-------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 0 | 0.333422 | 0.75001 | 0.50004 | 0.50004 | 5 | | 1 | 0.005 | 1 | 0.650811 | 0.660782 | 0.65432 | 0.65432 | 6 | | 2 | 0.005 | 5 | 0.707364 | 0.71045 | 0.70816 | 0.70816 | 7 | | 3 | 0.005 | 10 | 0.677011 | 0.679425 | 0.67776 | 0.67776 | 8 | | 4 | 0.05 | 0 | 0.778989 | 0.779782 | 0.77912 | 0.77912 | 9 | | 5 | 0.05 | 1 | 0.795194 | 0.796046 | 0.79532 | 0.79532 | 10 | | 6 | 0.05 | 5 | 0.806279 | 0.806284 | 0.80628 | 0.80628 | 11 | | 7 | 0.05 | 10 | 0.803407 | 0.803647 | 0.80344 | 0.80344 | 12 | | 8 | 0.33 | 0 | 0.859476 | 0.859526 | 0.85948 | 0.85948 | 13 | | 9 | 0.33 | 1 | 0.858 | 0.858004 | 0.858 | 0.858 | 14 | | 10 | 0.33 | 5 | 0.85328 | 0.85328 | 0.85328 | 0.85328 | 15 | | 11 | 0.75 | 0 | 0.873038 | 0.873058 | 0.87304 | 0.87304 | 16 | | 12 | 0.75 | 1 | 0.87552 | 0.875523 | 0.87552 | 0.87552 | 17 | | 13 | 0.75 | 5 | 0.865713 | 0.865798 | 0.86572 | 0.86572 | 18 | ![Results](WordNet/plot.png) 19 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/data_augmentation/WordNet/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/data_augmentation/WordNet/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/data_augmentation/WordNet/run-meta.json: -------------------------------------------------------------------------------- 1 | {"percent_multipliers": [[0.005, 0], [0.005, 1], [0.005, 5], [0.005, 10], [0.05, 0], [0.05, 1], [0.05, 5], [0.05, 10], [0.33, 0], [0.33, 1], [0.33, 5], [0.75, 0], [0.75, 1], [0.75, 5]], "model_name": "FastText", "param_grid": {"word_ngrams": [1], "autotune_duration": [120]}, "preprocess_func": "fasttext_preprocess", "augment_probability": 0.15, "augment_name": "WordNet", "params": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/document_windowing/BERT/output.md: -------------------------------------------------------------------------------- 1 | # Results: BERT 2 | | | Window Config | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|:------------------------|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | Length 250, pooling min | 0.880196 | 0.880247 | 0.8802 | 0.8802 | 5 | | 1 | Length 250, pooling min | 0.755273 | 0.755374 | 0.755293 | 0.755293 | 6 | | 2 | Length 250, pooling min | 0.791559 | 0.791561 | 0.79156 | 0.79156 | 7 | | 3 | Length 250, pooling min | 0.881205 | 0.881759 | 0.881245 | 0.881245 | 8 | | 4 | Length 250, pooling min | 0.754554 | 0.754558 | 0.754555 | 0.754555 | 9 | | 5 | Length 250, pooling min | 0.791008 | 0.791103 | 0.791017 | 0.791017 | 10 | | 6 | Length 250, pooling min | 0.880026 | 0.880286 | 0.880045 | 0.880045 | 
11 | | 7 | Length 250, pooling min | 0.757108 | 0.757175 | 0.75712 | 0.75712 | 12 | | 8 | Length 250, pooling min | 0.792644 | 0.792682 | 0.792647 | 0.792647 | 13 | | 9 | Length 250, pooling min | 0.883124 | 0.883129 | 0.883125 | 0.883125 | 14 | ![Results](BERT/plot.png) 15 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/document_windowing/BERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/document_windowing/BERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/document_windowing/BERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"vocab_size": 2000, "sample_size": 0.1, "window_len_poolings": [[null, null], [50, "mean"], [125, "mean"], [250, "mean"], [50, "max"], [125, "max"], [250, "max"], [50, "min"], [125, "min"], [250, "min"]], "name": "BERT", "model_name": "BERT", "param_grid": {}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/document_windowing/document_windowing.md: -------------------------------------------------------------------------------- 1 | # Results: BERT 2 | | | Window Config | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|:------------------------|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | Length 250, pooling min | 0.880196 | 0.880247 | 0.8802 | 0.8802 | 5 | | 1 | Length 250, pooling min | 0.755273 | 0.755374 | 0.755293 | 0.755293 | 6 | | 2 | Length 250, pooling min | 0.791559 | 0.791561 | 0.79156 | 0.79156 | 7 | | 3 | Length 250, pooling min | 0.881205 | 0.881759 | 0.881245 | 0.881245 | 8 | | 4 | Length 250, pooling min | 0.754554 | 0.754558 | 0.754555 | 0.754555 | 9 | | 5 | Length 250, pooling min | 0.791008 | 0.791103 | 0.791017 | 0.791017 | 10 | | 6 | Length 250, pooling min | 0.880026 | 0.880286 | 0.880045 | 0.880045 | 11 | | 7 | Length 250, pooling min | 0.757108 | 0.757175 | 0.75712 | 0.75712 | 12 | | 8 | Length 250, pooling min | 0.792644 | 0.792682 | 0.792647 | 0.792647 | 13 | | 9 | Length 250, pooling min | 0.883124 | 0.883129 | 0.883125 | 0.883125 | 14 | 15 | ![Results](BERT/plot.png) 16 | --- 17 | -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/ALBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/ALBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/ALBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "ALBERT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["albert-base-v1", "albert-base-v2"], "transformer_model": ["Albert"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/BERT/plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/BERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/BERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "BERT", "model_name": "BERT", "param_grid": {"bert_model": ["bert-base-uncased", "bert-base-cased", "scibert-uncased"], "max_seq_length": [128]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/DistilBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/DistilBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/DistilBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "DistilBERT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["distilbert-base-uncased", "distilbert-base-uncased-distilled-squad"], "transformer_model": ["DistilBert"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/FastText/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/FastText/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/FastText/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "FastText", "model_name": "FastText", "param_grid": {"word_ngrams": [1, 2], "dim": [100, 300], "lr": [0.5, 1.0]}, "preprocess_func": "fasttext_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/MTDNN/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/MTDNN/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/MTDNN/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "MTDNN", "model_name": "MTDNN", "param_grid": {"mtdnn_model": ["mt-dnn-base"], "max_seq_length": [128]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": 
"RoBERTa", "model_name": "Transformer", "param_grid": {"transformer_weights": ["roberta-base"], "transformer_model": ["Roberta"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/SKLearn/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/SKLearn/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/SKLearn/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "SKLearn", "model_name": "SKLearnClassifier", "param_grid": {}, "preprocess_func": "fasttext_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/XLM-RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/XLM-RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/XLM-RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLM-RoBERTa", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlm-roberta-base"], "transformer_model": ["XLMRoberta"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/XLM/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/XLM/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/XLM/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLM", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlm-mlm-tlm-xnli15-1024", "xlm-clm-ende-1024"], "transformer_model": ["XLM"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/XLNet/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/XLNet/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/XLNet/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLNet", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlnet-base-cased"], "transformer_model": ["XLNet"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/spaCy/plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/spaCy/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/spaCy/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "spaCy", "model_name": "SpaCyModel", "param_grid": {"model": ["en_core_web_sm", "en_core_web_lg"], "architecture": ["bow", "simple_cnn", "ensemble"]}, "preprocess_func": null, "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/spacy-transformers/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/spacy-transformers/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/spacy-transformers/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "spacy-transformers", "model_name": "SpaCyModel", "param_grid": {"model": ["en_trf_bertbaseuncased_lg", "en_trf_xlnetbasecased_lg", "en_trf_robertabase_lg", "en_trf_distilbertbaseuncased_lg"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/ALBERT/output.md: -------------------------------------------------------------------------------- 1 | # Results: ALBERT 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](ALBERT/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/ALBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/ALBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/ALBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "ALBERT", "model_name": "Transformer", "model_params": {"transformer_weights": "albert-base-v2", "transformer_model": "Albert"}, "preprocess_func": "bert_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/BERT/output.md: -------------------------------------------------------------------------------- 1 | # Results: BERT 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](BERT/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/BERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/BERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/BERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "BERT", "model_name": "BERT", "model_params": {"bert_model": "bert-base-uncased", 
"max_seq_length": 128}, "preprocess_func": "bert_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/DistilBERT/output.md: -------------------------------------------------------------------------------- 1 | # Results: DistilBERT 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](DistilBERT/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/DistilBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/DistilBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/DistilBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "DistilBERT", "model_name": "Transformer", "model_params": {"transformer_weights": "distilbert-base-uncased", "transformer_model": "DistilBert"}, "preprocess_func": "bert_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/ELECTRA/output.md: -------------------------------------------------------------------------------- 1 | # Results: ELECTRA 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](ELECTRA/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/ELECTRA/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/ELECTRA/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/ELECTRA/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "ELECTRA", "model_name": "Transformer", "model_params": {"transformer_weights": "google/electra-base-discriminator", "transformer_model": "Electra"}, "preprocess_func": "bert_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/GPT/output.md: -------------------------------------------------------------------------------- 1 | # Results: GPT 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](GPT/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/GPT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/GPT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/GPT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "GPT", "model_name": "Transformer", "model_params": {"transformer_weights": "openai-gpt", "transformer_model": "OpenAIGPT"}, "preprocess_func": null, "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/GPT2/output.md: 
-------------------------------------------------------------------------------- 1 | # Results: GPT2 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](GPT2/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/GPT2/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/GPT2/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/GPT2/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "GPT2", "model_name": "Transformer", "model_params": {"transformer_weights": "gpt2-medium", "transformer_model": "GPT2"}, "preprocess_func": null, "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/RoBERTa/output.md: -------------------------------------------------------------------------------- 1 | # Results: RoBERTa 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](RoBERTa/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "RoBERTa", "model_name": "Transformer", "model_params": {"transformer_weights": "roberta-base", "transformer_model": "Roberta"}, "preprocess_func": "bert_preprocess", "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/T5Model/output.md: -------------------------------------------------------------------------------- 1 | # Results: T5Model 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](T5Model/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/T5Model/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/T5Model/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/T5Model/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "T5Model", "model_name": "Transformer", "model_params": {"transformer_weights": "t5-base", "transformer_model": "T5"}, "preprocess_func": "bert_preprocess", "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/TransformerXL/output.md: -------------------------------------------------------------------------------- 1 | # Results: TransformerXL 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](TransformerXL/plot.png) 7 | --- -------------------------------------------------------------------------------- 
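Each run-meta.json under imdb_embed records the gobbli model class (`model_name`), the fixed constructor arguments (`model_params`), the preprocessing function, and the batch size used when generating embeddings. The sketch below shows how one of these recorded configs might be replayed; it is a minimal illustration assuming gobbli's documented `Transformer` / `EmbedInput` API (`build()`, `embed()`, `X_embedded`), and exact names may differ across gobbli versions.

```python
import json
from pathlib import Path

# Assumed gobbli imports; verify against the installed version's docs.
from gobbli.io import EmbedInput
from gobbli.model.transformer import Transformer

# Load one recorded embedding-benchmark config (RoBERTa uses the generic
# Transformer wrapper, so model_params maps directly to its constructor).
meta = json.loads(
    Path("benchmark/benchmark_output/imdb_embed/RoBERTa/run-meta.json").read_text()
)

model = Transformer(**meta["model_params"])  # transformer_weights, transformer_model
model.build()  # fetches the Docker image and pretrained weights on first use

# meta["batch_size"] is the batch size the benchmark used; how it is passed
# through to the embed call depends on the gobbli version.
embed_output = model.embed(EmbedInput(X=["a great movie", "a terrible movie"]))
print(len(embed_output.X_embedded))  # one embedding vector per input document
```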
/benchmark/benchmark_output/imdb_embed/TransformerXL/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/TransformerXL/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/TransformerXL/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "TransformerXL", "model_name": "Transformer", "model_params": {"transformer_weights": "transfo-xl-wt103", "transformer_model": "TransfoXL"}, "preprocess_func": null, "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/USE/output.md: -------------------------------------------------------------------------------- 1 | # Results: USE 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](USE/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/USE/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/USE/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/USE/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "USE", "model_name": "USE", "model_params": {"use_model": "universal-sentence-encoder"}, "preprocess_func": null, "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/XLM-RoBERTa/output.md: -------------------------------------------------------------------------------- 1 | # Results: XLM-RoBERTa 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](XLM-RoBERTa/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/XLM-RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/XLM-RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/XLM-RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLM-RoBERTa", "model_name": "Transformer", "model_params": {"transformer_weights": "xlm-roberta-base", "transformer_model": "XLMRoberta"}, "preprocess_func": "bert_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/XLM/output.md: -------------------------------------------------------------------------------- 1 | # Results: XLM 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](XLM/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/XLM/plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/XLM/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/XLM/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLM", "model_name": "Transformer", "model_params": {"transformer_weights": "xlm-clm-ende-1024", "transformer_model": "XLM"}, "preprocess_func": "bert_preprocess", "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/XLNet/output.md: -------------------------------------------------------------------------------- 1 | # Results: XLNet 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](XLNet/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/XLNet/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/XLNet/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/XLNet/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLNet", "model_name": "Transformer", "model_params": {"transformer_weights": "xlnet-base-cased", "transformer_model": "XLNet"}, "preprocess_func": "bert_preprocess", "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/imdb_embed.md: -------------------------------------------------------------------------------- 1 | # Results: BERT 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](BERT/plot.png) 7 | --- 8 | # Results: XLM 9 | ``` 10 | 11 | ``` 12 | 13 | ![Results](XLM/plot.png) 14 | --- 15 | # Results: XLNet 16 | ``` 17 | 18 | ``` 19 | 20 | ![Results](XLNet/plot.png) 21 | --- 22 | # Results: RoBERTa 23 | ``` 24 | 25 | ``` 26 | 27 | ![Results](RoBERTa/plot.png) 28 | --- 29 | # Results: DistilBERT 30 | ``` 31 | 32 | ``` 33 | 34 | ![Results](DistilBERT/plot.png) 35 | --- 36 | # Results: ALBERT 37 | ``` 38 | 39 | ``` 40 | 41 | ![Results](ALBERT/plot.png) 42 | --- 43 | # Results: XLM-RoBERTa 44 | ``` 45 | 46 | ``` 47 | 48 | ![Results](XLM-RoBERTa/plot.png) 49 | --- 50 | # Results: GPT 51 | ``` 52 | 53 | ``` 54 | 55 | ![Results](GPT/plot.png) 56 | --- 57 | # Results: GPT2 58 | ``` 59 | 60 | ``` 61 | 62 | ![Results](GPT2/plot.png) 63 | --- 64 | # Results: TransformerXL 65 | ``` 66 | 67 | ``` 68 | 69 | ![Results](TransformerXL/plot.png) 70 | --- 71 | # Results: T5Model 72 | ``` 73 | 74 | ``` 75 | 76 | ![Results](T5Model/plot.png) 77 | --- 78 | # Results: spaCy 79 | ``` 80 | 81 | ``` 82 | 83 | ![Results](spaCy/plot.png) 84 | --- 85 | # Results: spacy-transformers 86 | ``` 87 | 88 | ``` 89 | 90 | ![Results](spacy-transformers/plot.png) 91 | --- 92 | # Results: USE 93 | ``` 94 | 95 | ``` 96 | 97 | ![Results](USE/plot.png) 98 | --- 99 | # Results: ELECTRA 100 | ``` 101 | 102 | ``` 103 | 104 | ![Results](ELECTRA/plot.png) 105 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/spaCy/output.md: -------------------------------------------------------------------------------- 1 | # Results: spaCy 2 | ``` 3 | 4 | ``` 
5 | 6 | ![Results](spaCy/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/spaCy/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/spaCy/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/spaCy/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "spaCy", "model_name": "SpaCyModel", "model_params": {"model": "en_core_web_lg", "use_gpu": false}, "preprocess_func": null, "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/spacy-transformers/output.md: -------------------------------------------------------------------------------- 1 | # Results: spacy-transformers 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](spacy-transformers/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/spacy-transformers/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/spacy-transformers/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/spacy-transformers/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "spacy-transformers", "model_name": "SpaCyModel", "model_params": {"model": "en_trf_robertabase_lg"}, "preprocess_func": "bert_preprocess", "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/ALBERT/output.md: -------------------------------------------------------------------------------- 1 | # Results: ALBERT 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.559743 | 0.580711 | 0.57184 | 0.57184 | 5 | | 1 | 0.01 | 250 | 0.604522 | 0.616544 | 0.61004 | 0.61004 | 6 | | 2 | 0.1 | 2500 | 0.811767 | 0.813246 | 0.81196 | 0.81196 | 7 | | 3 | 0.25 | 6250 | 0.831873 | 0.831936 | 0.83188 | 0.83188 | 8 | | 4 | 0.33 | 8250 | 0.847549 | 0.847665 | 0.84756 | 0.84756 | 9 | | 5 | 0.5 | 12500 | 0.836695 | 0.836927 | 0.83672 | 0.83672 | 10 | | 6 | 0.75 | 18750 | 0.852635 | 0.853112 | 0.85268 | 0.85268 | 11 | ![Results](ALBERT/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/ALBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/ALBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/ALBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | 
{"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "ALBERT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["albert-base-v1", "albert-base-v2"], "transformer_model": ["Albert"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/BERT/output.md: -------------------------------------------------------------------------------- 1 | # Results: BERT 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.74607 | 0.750071 | 0.74688 | 0.74688 | 5 | | 1 | 0.01 | 250 | 0.781651 | 0.784314 | 0.78208 | 0.78208 | 6 | | 2 | 0.1 | 2500 | 0.838392 | 0.838467 | 0.8384 | 0.8384 | 7 | | 3 | 0.25 | 6250 | 0.85648 | 0.856917 | 0.85652 | 0.85652 | 8 | | 4 | 0.33 | 8250 | 0.862295 | 0.86305 | 0.86236 | 0.86236 | 9 | | 5 | 0.5 | 12500 | 0.871105 | 0.871296 | 0.87112 | 0.87112 | 10 | | 6 | 0.75 | 18750 | 0.879389 | 0.879541 | 0.8794 | 0.8794 | 11 | ![Results](BERT/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/BERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/BERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/BERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "BERT", "model_name": "BERT", "param_grid": {"bert_model": ["bert-base-uncased", "bert-base-cased", "scibert-uncased"], "max_seq_length": [128]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/DistilBERT/output.md: -------------------------------------------------------------------------------- 1 | # Results: DistilBERT 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.378616 | 0.740677 | 0.52064 | 0.52064 | 5 | | 1 | 0.01 | 250 | 0.754579 | 0.755721 | 0.7548 | 0.7548 | 6 | | 2 | 0.1 | 2500 | 0.828095 | 0.82831 | 0.82812 | 0.82812 | 7 | | 3 | 0.25 | 6250 | 0.843423 | 0.843985 | 0.84348 | 0.84348 | 8 | | 4 | 0.33 | 8250 | 0.847057 | 0.847292 | 0.84708 | 0.84708 | 9 | | 5 | 0.5 | 12500 | 0.854278 | 0.854728 | 0.85432 | 0.85432 | 10 | | 6 | 0.75 | 18750 | 0.862146 | 0.86231 | 0.86216 | 0.86216 | 11 | ![Results](DistilBERT/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/DistilBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/DistilBERT/plot.png 
-------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/DistilBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "DistilBERT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["distilbert-base-uncased", "distilbert-base-uncased-distilled-squad"], "transformer_model": ["DistilBert"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/FastText/output.md: -------------------------------------------------------------------------------- 1 | # Results: FastText 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.333422 | 0.75001 | 0.50004 | 0.50004 | 5 | | 1 | 0.01 | 250 | 0.589766 | 0.619454 | 0.60348 | 0.60348 | 6 | | 2 | 0.1 | 2500 | 0.800598 | 0.800614 | 0.8006 | 0.8006 | 7 | | 3 | 0.25 | 6250 | 0.854741 | 0.85495 | 0.85476 | 0.85476 | 8 | | 4 | 0.33 | 8250 | 0.8628 | 0.8628 | 0.8628 | 0.8628 | 9 | | 5 | 0.5 | 12500 | 0.872959 | 0.872967 | 0.87296 | 0.87296 | 10 | | 6 | 0.75 | 18750 | 0.88352 | 0.883524 | 0.88352 | 0.88352 | 11 | ![Results](FastText/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/FastText/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/FastText/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/FastText/run-meta.json: -------------------------------------------------------------------------------- 1 | {"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "FastText", "model_name": "FastText", "param_grid": {"word_ngrams": [1, 2], "dim": [100, 300], "lr": [0.5, 1.0]}, "preprocess_func": "fasttext_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/MTDNN/output.md: -------------------------------------------------------------------------------- 1 | # Results: MTDNN 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.823739 | 0.829687 | 0.82444 | 0.82444 | 5 | | 1 | 0.01 | 250 | 0.835166 | 0.836581 | 0.83532 | 0.83532 | 6 | | 2 | 0.1 | 2500 | 0.858437 | 0.858475 | 0.85844 | 0.85844 | 7 | | 3 | 0.25 | 6250 | 0.862921 | 0.863841 | 0.863 | 0.863 | 8 | | 4 | 0.33 | 8250 | 0.866312 | 0.866405 | 0.86632 | 0.86632 | 9 | | 5 | 0.5 | 12500 | 0.86768 | 0.867684 | 0.86768 | 0.86768 | 10 | | 6 | 0.75 | 18750 | 0.87688 | 0.876881 | 0.87688 | 0.87688 | 11 | ![Results](MTDNN/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/MTDNN/plot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/MTDNN/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/MTDNN/run-meta.json: -------------------------------------------------------------------------------- 1 | {"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "MTDNN", "model_name": "MTDNN", "param_grid": {"mtdnn_model": ["mt-dnn-base"], "max_seq_length": [128]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/RoBERTa/output.md: -------------------------------------------------------------------------------- 1 | # Results: RoBERTa 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.338979 | 0.742939 | 0.50252 | 0.50252 | 5 | | 1 | 0.01 | 250 | 0.833661 | 0.833831 | 0.83368 | 0.83368 | 6 | | 2 | 0.1 | 2500 | 0.865881 | 0.866342 | 0.86592 | 0.86592 | 7 | | 3 | 0.25 | 6250 | 0.875998 | 0.87603 | 0.876 | 0.876 | 8 | | 4 | 0.33 | 8250 | 0.882438 | 0.882467 | 0.88244 | 0.88244 | 9 | | 5 | 0.5 | 12500 | 0.333333 | 0.25 | 0.5 | 0.5 | 10 | | 6 | 0.75 | 18750 | 0.888439 | 0.88845 | 0.88844 | 0.88844 | 11 | ![Results](RoBERTa/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "RoBERTa", "model_name": "Transformer", "param_grid": {"transformer_weights": ["roberta-base"], "transformer_model": ["Roberta"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/SKLearn/output.md: -------------------------------------------------------------------------------- 1 | # Results: SKLearn 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.502502 | 0.700345 | 0.57828 | 0.57828 | 5 | | 1 | 0.01 | 250 | 0.682961 | 0.701672 | 0.68824 | 0.68824 | 6 | | 2 | 0.1 | 2500 | 0.825509 | 0.825598 | 0.82552 | 0.82552 | 7 | | 3 | 0.25 | 6250 | 0.853119 | 0.853132 | 0.85312 | 0.85312 | 8 | | 4 | 0.33 | 8250 | 0.857173 | 0.857471 | 0.8572 | 0.8572 | 9 | | 5 | 0.5 | 12500 | 0.866067 | 0.86622 | 0.86608 | 0.86608 | 10 | | 6 | 0.75 | 18750 | 0.874996 | 0.87505 | 0.875 | 
0.875 | 11 | ![Results](SKLearn/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/SKLearn/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/SKLearn/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/SKLearn/run-meta.json: -------------------------------------------------------------------------------- 1 | {"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "SKLearn", "model_name": "SKLearnClassifier", "param_grid": {}, "preprocess_func": "fasttext_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/XLM-RoBERTa/output.md: -------------------------------------------------------------------------------- 1 | # Results: XLM-RoBERTa 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.333333 | 0.25 | 0.5 | 0.5 | 5 | | 1 | 0.01 | 250 | 0.516406 | 0.689123 | 0.58376 | 0.58376 | 6 | | 2 | 0.1 | 2500 | 0.7987 | 0.799119 | 0.79876 | 0.79876 | 7 | | 3 | 0.25 | 6250 | 0.333333 | 0.25 | 0.5 | 0.5 | 8 | | 4 | 0.33 | 8250 | 0.852605 | 0.853402 | 0.85268 | 0.85268 | 9 | | 5 | 0.5 | 12500 | 0.855971 | 0.856728 | 0.85604 | 0.85604 | 10 | | 6 | 0.75 | 18750 | 0.8442 | 0.844201 | 0.8442 | 0.8442 | 11 | ![Results](XLM-RoBERTa/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/XLM-RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/XLM-RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/XLM-RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "XLM-RoBERTa", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlm-roberta-base"], "transformer_model": ["XLMRoberta"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/XLM/output.md: -------------------------------------------------------------------------------- 1 | # Results: XLM 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.333333 | 0.25 | 0.5 | 0.5 | 5 | | 1 | 0.01 | 250 | 0.453665 | 0.571183 | 0.53092 | 0.53092 | 6 | | 2 | 0.1 | 2500 | 0.771882 | 0.77233 | 0.77196 | 0.77196 | 7 | | 3 | 0.25 | 6250 | 0.333333 | 0.25 | 0.5 | 0.5 | 8 | | 4 | 0.33 | 8250 | 0.333333 | 0.25 | 0.5 | 0.5 | 9 | | 5 | 0.5 | 12500 | 
0.333333 | 0.25 | 0.5 | 0.5 | 10 | | 6 | 0.75 | 18750 | 0.333333 | 0.25 | 0.5 | 0.5 | 11 | ![Results](XLM/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/XLM/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/XLM/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/XLM/run-meta.json: -------------------------------------------------------------------------------- 1 | {"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "XLM", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlm-mlm-tlm-xnli15-1024", "xlm-clm-ende-1024"], "transformer_model": ["XLM"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/XLNet/output.md: -------------------------------------------------------------------------------- 1 | # Results: XLNet 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.805214 | 0.814967 | 0.80652 | 0.80652 | 5 | | 1 | 0.01 | 250 | 0.843484 | 0.844229 | 0.84356 | 0.84356 | 6 | | 2 | 0.1 | 2500 | 0.866762 | 0.867215 | 0.8668 | 0.8668 | 7 | | 3 | 0.25 | 6250 | 0.880065 | 0.880265 | 0.88008 | 0.88008 | 8 | | 4 | 0.33 | 8250 | 0.878197 | 0.87824 | 0.8782 | 0.8782 | 9 | | 5 | 0.5 | 12500 | 0.88088 | 0.88088 | 0.88088 | 0.88088 | 10 | | 6 | 0.75 | 18750 | 0.888433 | 0.888538 | 0.88844 | 0.88844 | 11 | ![Results](XLNet/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/XLNet/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/XLNet/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/XLNet/run-meta.json: -------------------------------------------------------------------------------- 1 | {"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "XLNet", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlnet-base-cased"], "transformer_model": ["XLNet"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/spaCy/output.md: -------------------------------------------------------------------------------- 1 | # Results: spaCy 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.651859 | 0.69133 | 0.6642 | 0.6642 | 5 | | 1 | 0.01 | 250 | 0.729198 | 0.729735 | 0.72932 | 0.72932 | 6 | | 2 | 0.1 | 
2500 | 0.823233 | 0.82329 | 0.82324 | 0.82324 | 7 | | 3 | 0.25 | 6250 | 0.852519 | 0.852534 | 0.85252 | 0.85252 | 8 | | 4 | 0.33 | 8250 | 0.864112 | 0.864205 | 0.86412 | 0.86412 | 9 | | 5 | 0.5 | 12500 | 0.87436 | 0.87436 | 0.87436 | 0.87436 | 10 | | 6 | 0.75 | 18750 | 0.887432 | 0.887551 | 0.88744 | 0.88744 | 11 | ![Results](spaCy/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/spaCy/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/spaCy/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/spaCy/run-meta.json: -------------------------------------------------------------------------------- 1 | {"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "spaCy", "model_name": "SpaCyModel", "param_grid": {"model": ["en_core_web_sm", "en_core_web_lg"], "architecture": ["bow", "simple_cnn", "ensemble"]}, "preprocess_func": null, "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/ALBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/moviesummary/ALBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/ALBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "ALBERT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["albert-base-v1", "albert-base-v2"], "transformer_model": ["Albert"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/DistilBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/moviesummary/DistilBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/DistilBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "DistilBERT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["distilbert-base-uncased", "distilbert-base-uncased-distilled-squad"], "transformer_model": ["DistilBert"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/FastText/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/moviesummary/FastText/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/FastText/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "FastText", "model_name": 
"FastText", "param_grid": {"word_ngrams": [1, 2], "dim": [100, 300], "lr": [0.5, 1.0]}, "preprocess_func": "fasttext_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/moviesummary/RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "RoBERTa", "model_name": "Transformer", "param_grid": {"transformer_weights": ["roberta-base"], "transformer_model": ["Roberta"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/SKLearn/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/moviesummary/SKLearn/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/SKLearn/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "SKLearn", "model_name": "SKLearnClassifier", "param_grid": {}, "preprocess_func": "fasttext_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/XLM-RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/moviesummary/XLM-RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/XLM-RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLM-RoBERTa", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlm-roberta-base"], "transformer_model": ["XLMRoberta"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/XLM/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/moviesummary/XLM/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/XLM/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLM", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlm-mlm-tlm-xnli15-1024", "xlm-clm-ende-1024"], "transformer_model": ["XLM"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/XLNet/plot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/moviesummary/XLNet/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/XLNet/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLNet", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlnet-base-cased"], "transformer_model": ["XLNet"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/spaCy/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/moviesummary/spaCy/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/spaCy/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "spaCy", "model_name": "SpaCyModel", "param_grid": {"model": ["en_core_web_sm", "en_core_web_lg"], "architecture": ["bow", "simple_cnn", "ensemble"]}, "preprocess_func": null, "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/spacy-transformers/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/moviesummary/spacy-transformers/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/spacy-transformers/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "spacy-transformers", "model_name": "SpaCyModel", "param_grid": {"model": ["en_trf_bertbaseuncased_lg", "en_trf_xlnetbasecased_lg", "en_trf_robertabase_lg", "en_trf_distilbertbaseuncased_lg"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/ALBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/ALBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/ALBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "ALBERT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["albert-base-v1", "albert-base-v2"], "transformer_model": ["Albert"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/BERT/plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/BERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/BERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "BERT", "model_name": "BERT", "param_grid": {"bert_model": ["bert-base-uncased", "bert-base-cased", "scibert-uncased"], "max_seq_length": [128]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/DistilBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/DistilBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/DistilBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "DistilBERT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["distilbert-base-uncased", "distilbert-base-uncased-distilled-squad"], "transformer_model": ["DistilBert"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/FastText/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/FastText/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/FastText/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "FastText", "model_name": "FastText", "param_grid": {"word_ngrams": [1, 2], "dim": [100, 300], "lr": [0.5, 1.0]}, "preprocess_func": "fasttext_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/MTDNN/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/MTDNN/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/MTDNN/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "MTDNN", "model_name": "MTDNN", "param_grid": {"mtdnn_model": ["mt-dnn-base"], "max_seq_length": [128]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/RoBERTa/run-meta.json: 
-------------------------------------------------------------------------------- 1 | {"name": "RoBERTa", "model_name": "Transformer", "param_grid": {"transformer_weights": ["roberta-base"], "transformer_model": ["Roberta"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/SKLearn/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/SKLearn/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/SKLearn/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "SKLearn", "model_name": "SKLearnClassifier", "param_grid": {}, "preprocess_func": "fasttext_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/XLM-RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/XLM-RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/XLM-RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLM-RoBERTa", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlm-roberta-base"], "transformer_model": ["XLMRoberta"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/XLM/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/XLM/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/XLM/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLM", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlm-mlm-tlm-xnli15-1024", "xlm-clm-ende-1024"], "transformer_model": ["XLM"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/XLNet/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/XLNet/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/XLNet/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLNet", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlnet-base-cased"], "transformer_model": ["XLNet"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} 
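A note on the `run-meta.json` layout used throughout these classification benchmark directories: each file records a display `name`, the gobbli model class to run (`model_name`), a `param_grid` mapping hyperparameter names to lists of candidate values, the `preprocess_func` applied to the input texts, and extra `run_kwargs` forwarded to training. The `param_grid` key follows the scikit-learn naming convention and appears to denote a cross-product search, one training run per combination of values. Below is a minimal sketch of expanding such a grid under that assumption; `expand_param_grid` is an illustrative helper, not an API defined in this repository.

```python
import itertools
from typing import Any, Dict, Iterator, List


def expand_param_grid(param_grid: Dict[str, List[Any]]) -> Iterator[Dict[str, Any]]:
    """Yield one hyperparameter dict per point in the grid's cross-product."""
    keys = sorted(param_grid)
    for values in itertools.product(*(param_grid[key] for key in keys)):
        yield dict(zip(keys, values))


# The FastText grid used by several benchmarks above:
grid = {"word_ngrams": [1, 2], "dim": [100, 300], "lr": [0.5, 1.0]}
for params in expand_param_grid(grid):
    print(params)  # e.g. {'dim': 100, 'lr': 0.5, 'word_ngrams': 1}
```

Under that reading, the FastText grid expands to 2 × 2 × 2 = 8 runs, while an empty grid like the one in the SKLearn entries collapses to a single run with default parameters.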
-------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/spaCy/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/spaCy/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/spaCy/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "spaCy", "model_name": "SpaCyModel", "param_grid": {"model": ["en_core_web_sm", "en_core_web_lg"], "architecture": ["bow", "simple_cnn", "ensemble"]}, "preprocess_func": null, "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/spacy-transformers/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/spacy-transformers/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/spacy-transformers/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "spacy-transformers", "model_name": "SpaCyModel", "param_grid": {"model": ["en_trf_bertbaseuncased_lg", "en_trf_xlnetbasecased_lg", "en_trf_robertabase_lg", "en_trf_distilbertbaseuncased_lg"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/ALBERT/output.md: -------------------------------------------------------------------------------- 1 | # Results: ALBERT 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](ALBERT/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/ALBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/ALBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/ALBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "ALBERT", "model_name": "Transformer", "model_params": {"transformer_weights": "albert-base-v2", "transformer_model": "Albert"}, "preprocess_func": "bert_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/BERT/output.md: -------------------------------------------------------------------------------- 1 | # Results: BERT 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](BERT/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/BERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/BERT/plot.png 
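Before the `run-meta.json` files in this `newsgroups_embed` section, note that the embedding benchmarks record a different schema from the classification benchmarks above: a `model_params` object holding one fixed value per parameter and a top-level `batch_size`, rather than `param_grid` lists and `run_kwargs`. A minimal sketch of a reader that tolerates both variants, offered purely as an illustration (`summarize_run_meta` is a hypothetical helper, not part of the repository):

```python
import json
from pathlib import Path


def summarize_run_meta(path: Path) -> str:
    """Summarize either run-meta.json variant found under benchmark_output."""
    meta = json.loads(path.read_text())
    if "param_grid" in meta:
        # Classification benchmark: lists of candidate hyperparameter values.
        n_runs = 1
        for candidates in meta["param_grid"].values():
            n_runs *= len(candidates)
        return f"{meta['name']}: {meta['model_name']} grid of {n_runs} run(s)"
    # Embedding benchmark: one fixed configuration plus an embedding batch size.
    return (
        f"{meta['name']}: single {meta['model_name']} config, "
        f"batch_size={meta.get('batch_size')}"
    )


# Hypothetical usage:
# print(summarize_run_meta(Path("benchmark_output/newsgroups_embed/BERT/run-meta.json")))
```

Dispatching on the presence of `param_grid` keeps the reader agnostic to which benchmark family produced the file.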
-------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/BERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "BERT", "model_name": "BERT", "model_params": {"bert_model": "bert-base-uncased", "max_seq_length": 128}, "preprocess_func": "bert_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/DistilBERT/output.md: -------------------------------------------------------------------------------- 1 | # Results: DistilBERT 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](DistilBERT/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/DistilBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/DistilBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/DistilBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "DistilBERT", "model_name": "Transformer", "model_params": {"transformer_weights": "distilbert-base-uncased", "transformer_model": "DistilBert"}, "preprocess_func": "bert_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/ELECTRA/output.md: -------------------------------------------------------------------------------- 1 | # Results: ELECTRA 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](ELECTRA/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/ELECTRA/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/ELECTRA/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/ELECTRA/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "ELECTRA", "model_name": "Transformer", "model_params": {"transformer_weights": "google/electra-base-discriminator", "transformer_model": "Electra"}, "preprocess_func": "bert_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/GPT/output.md: -------------------------------------------------------------------------------- 1 | # Results: GPT 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](GPT/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/GPT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/GPT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/GPT/run-meta.json: 
-------------------------------------------------------------------------------- 1 | {"name": "GPT", "model_name": "Transformer", "model_params": {"transformer_weights": "openai-gpt", "transformer_model": "OpenAIGPT"}, "preprocess_func": null, "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/GPT2/output.md: -------------------------------------------------------------------------------- 1 | # Results: GPT2 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](GPT2/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/GPT2/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/GPT2/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/GPT2/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "GPT2", "model_name": "Transformer", "model_params": {"transformer_weights": "gpt2-medium", "transformer_model": "GPT2"}, "preprocess_func": null, "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/RoBERTa/output.md: -------------------------------------------------------------------------------- 1 | # Results: RoBERTa 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](RoBERTa/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "RoBERTa", "model_name": "Transformer", "model_params": {"transformer_weights": "roberta-base", "transformer_model": "Roberta"}, "preprocess_func": "bert_preprocess", "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/T5Model/output.md: -------------------------------------------------------------------------------- 1 | # Results: T5Model 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](T5Model/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/T5Model/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/T5Model/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/T5Model/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "T5Model", "model_name": "Transformer", "model_params": {"transformer_weights": "t5-base", "transformer_model": "T5"}, 
"preprocess_func": "bert_preprocess", "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/TransformerXL/output.md: -------------------------------------------------------------------------------- 1 | # Results: TransformerXL 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](TransformerXL/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/TransformerXL/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/TransformerXL/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/TransformerXL/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "TransformerXL", "model_name": "Transformer", "model_params": {"transformer_weights": "transfo-xl-wt103", "transformer_model": "TransfoXL"}, "preprocess_func": null, "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/USE/output.md: -------------------------------------------------------------------------------- 1 | # Results: USE 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](USE/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/USE/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/USE/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/USE/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "USE", "model_name": "USE", "model_params": {"use_model": "universal-sentence-encoder"}, "preprocess_func": null, "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/XLM-RoBERTa/output.md: -------------------------------------------------------------------------------- 1 | # Results: XLM-RoBERTa 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](XLM-RoBERTa/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/XLM-RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/XLM-RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/XLM-RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLM-RoBERTa", "model_name": "Transformer", "model_params": {"transformer_weights": "xlm-roberta-base", "transformer_model": "XLMRoberta"}, "preprocess_func": "bert_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- 
/benchmark/benchmark_output/newsgroups_embed/XLM/output.md: -------------------------------------------------------------------------------- 1 | # Results: XLM 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](XLM/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/XLM/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/XLM/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/XLM/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLM", "model_name": "Transformer", "model_params": {"transformer_weights": "xlm-clm-ende-1024", "transformer_model": "XLM"}, "preprocess_func": "bert_preprocess", "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/XLNet/output.md: -------------------------------------------------------------------------------- 1 | # Results: XLNet 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](XLNet/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/XLNet/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/XLNet/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/XLNet/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLNet", "model_name": "Transformer", "model_params": {"transformer_weights": "xlnet-base-cased", "transformer_model": "XLNet"}, "preprocess_func": "bert_preprocess", "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/newsgroups_embed.md: -------------------------------------------------------------------------------- 1 | # Results: BERT 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](BERT/plot.png) 7 | --- 8 | # Results: XLM 9 | ``` 10 | 11 | ``` 12 | 13 | ![Results](XLM/plot.png) 14 | --- 15 | # Results: XLNet 16 | ``` 17 | 18 | ``` 19 | 20 | ![Results](XLNet/plot.png) 21 | --- 22 | # Results: RoBERTa 23 | ``` 24 | 25 | ``` 26 | 27 | ![Results](RoBERTa/plot.png) 28 | --- 29 | # Results: DistilBERT 30 | ``` 31 | 32 | ``` 33 | 34 | ![Results](DistilBERT/plot.png) 35 | --- 36 | # Results: ALBERT 37 | ``` 38 | 39 | ``` 40 | 41 | ![Results](ALBERT/plot.png) 42 | --- 43 | # Results: XLM-RoBERTa 44 | ``` 45 | 46 | ``` 47 | 48 | ![Results](XLM-RoBERTa/plot.png) 49 | --- 50 | # Results: GPT 51 | ``` 52 | 53 | ``` 54 | 55 | ![Results](GPT/plot.png) 56 | --- 57 | # Results: GPT2 58 | ``` 59 | 60 | ``` 61 | 62 | ![Results](GPT2/plot.png) 63 | --- 64 | # Results: TransformerXL 65 | ``` 66 | 67 | ``` 68 | 69 | ![Results](TransformerXL/plot.png) 70 | --- 71 | # Results: T5Model 72 | ``` 73 | 74 | ``` 75 | 76 | ![Results](T5Model/plot.png) 77 | --- 78 | # Results: sklearn_TF-IDF 79 | ``` 80 | 81 | ``` 82 | 83 | ![Results](sklearn_TF-IDF/plot.png) 84 | --- 85 | # Results: spaCy 86 | ``` 87 | 88 | ``` 89 | 90 | 
![Results](spaCy/plot.png) 91 | --- 92 | # Results: spacy-transformers 93 | ``` 94 | 95 | ``` 96 | 97 | ![Results](spacy-transformers/plot.png) 98 | --- 99 | # Results: USE 100 | ``` 101 | 102 | ``` 103 | 104 | ![Results](USE/plot.png) 105 | --- 106 | # Results: ELECTRA 107 | ``` 108 | 109 | ``` 110 | 111 | ![Results](ELECTRA/plot.png) 112 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/sklearn_TF-IDF/output.md: -------------------------------------------------------------------------------- 1 | # Results: sklearn_TF-IDF 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](sklearn_TF-IDF/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/sklearn_TF-IDF/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/sklearn_TF-IDF/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/sklearn_TF-IDF/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "sklearn_TF-IDF", "model_name": "TfidfEmbedder", "model_params": {}, "preprocess_func": "fasttext_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/spaCy/output.md: -------------------------------------------------------------------------------- 1 | # Results: spaCy 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](spaCy/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/spaCy/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/spaCy/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/spaCy/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "spaCy", "model_name": "SpaCyModel", "model_params": {"model": "en_core_web_lg", "use_gpu": false}, "preprocess_func": null, "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/spacy-transformers/output.md: -------------------------------------------------------------------------------- 1 | # Results: spacy-transformers 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](spacy-transformers/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/spacy-transformers/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/spacy-transformers/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/spacy-transformers/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": 
"spacy-transformers", "model_name": "SpaCyModel", "model_params": {"model": "en_trf_robertabase_lg"}, "preprocess_func": "bert_preprocess", "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "2.3" 2 | 3 | services: 4 | gobbli-benchmark-gpu: 5 | runtime: nvidia 6 | ipc: host 7 | build: 8 | context: ../ 9 | dockerfile: ./benchmark/docker/Dockerfile 10 | image: gobbli-benchmark:latest 11 | environment: 12 | NVIDIA_VISIBLE_DEVICES: ${NVIDIA_VISIBLE_DEVICES:-all} 13 | GOBBLI_USE_GPU: "1" 14 | shm_size: 4G 15 | volumes: 16 | # Needed to spawn containers 17 | - /var/run/docker.sock:/var/run/docker.sock 18 | # Needed to perform bind mounts as we would on the host 19 | - $PWD:$PWD 20 | working_dir: $PWD 21 | 22 | gobbli-benchmark: 23 | build: 24 | context: ../ 25 | dockerfile: ./benchmark/docker/Dockerfile 26 | image: gobbli-benchmark:latest 27 | shm_size: 4G 28 | volumes: 29 | # Needed to spawn containers 30 | - /var/run/docker.sock:/var/run/docker.sock 31 | # Needed to perform bind mounts as we would on the host 32 | - $PWD:$PWD 33 | working_dir: $PWD 34 | -------------------------------------------------------------------------------- /benchmark/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | # Install Chromium for rendering Altair charts to PNG 4 | # Fixed version to preserve compatibility with chromedriver 5 | # in case the two releases get out of sync 6 | RUN apt-get update && apt-get install -y \ 7 | chromium=80.0.3987.149-1~deb10u1 \ 8 | chromium-driver=80.0.3987.149-1~deb10u1 \ 9 | && rm -rf /var/lib/apt/lists/* 10 | 11 | # Copy essentials in to install requirements 12 | COPY ./setup.py ./meta.json ./requirements.txt ./README.md /code/ 13 | COPY ./benchmark/requirements.txt /code/benchmark/requirements.txt 14 | 15 | # Install dependencies 16 | WORKDIR /code 17 | RUN pip install -e '.[augment,tokenize]' \ 18 | && pip install -r requirements.txt \ 19 | && pip install -r benchmark/requirements.txt 20 | 21 | # Copy the rest of the repository in 22 | COPY ./ /code 23 | 24 | ENTRYPOINT ["python", "run_benchmarks.py"] 25 | -------------------------------------------------------------------------------- /benchmark/requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML==5.3 2 | tabulate==0.8.6 3 | umap-learn==0.3.10 4 | matplotlib==3.1.3 5 | # Needed to save Altair plots to PNG 6 | selenium==3.141.0 7 | -------------------------------------------------------------------------------- /benchmark/run_benchmarks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run all benchmarks that haven't been run. 4 | # NOTE: This may take several days depending on your available resources. 5 | 6 | image_name="gobbli-benchmark" 7 | 8 | if [[ -n "$GOBBLI_USE_GPU" ]]; then 9 | image_name="${image_name}-gpu" 10 | echo "GPU enabled." 11 | else 12 | echo "GPU disabled; running on CPU." 
13 | fi 14 | 15 | # Set working directory so the container starts in our working directory 16 | # Otherwise it starts in the repository root 17 | docker-compose run --rm "$image_name" "$@" 18 | -------------------------------------------------------------------------------- /ci-gpu/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "2.3" 2 | 3 | services: 4 | gobbli-ci-gpu: 5 | runtime: nvidia 6 | ipc: host 7 | build: 8 | context: ../ 9 | args: 10 | PYTHON_VERSION: "${PYTHON_VERSION:?Must specify Python version.}" 11 | image: gobbli-ci:latest 12 | environment: 13 | NVIDIA_VISIBLE_DEVICES: ${NVIDIA_VISIBLE_DEVICES:-all} 14 | # Change permissions after running to allow temp file cleanup by non-root user 15 | command: bash -c 'py.test -x --use-gpu --nvidia-visible-devices $NVIDIA_VISIBLE_DEVICES; chmod -R a+w ./' 16 | working_dir: $PWD/.. 17 | volumes: 18 | # Needed for CI to be able to spawn containers 19 | - /var/run/docker.sock:/var/run/docker.sock 20 | # Needed for CI to perform bind mounts as we would on the host 21 | - $PWD/..:$PWD/.. 22 | -------------------------------------------------------------------------------- /ci/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.3" 2 | 3 | services: 4 | gobbli-ci: 5 | build: 6 | context: ../ 7 | args: 8 | PYTHON_VERSION: "${PYTHON_VERSION:?Must specify Python version.}" 9 | image: gobbli-ci:latest 10 | ipc: host 11 | # Travis only gives us ~7.5GB of memory, so we need to run tests in 12 | # low resource mode 13 | # The test environment won't be shared across runs, but we'd like to reuse 14 | # artifacts between tests where possible to reduce runtime, so add the 15 | # switch to persist data as well 16 | command: ./run_ci.sh --low-resource --persist-data 17 | working_dir: $PWD/.. 18 | volumes: 19 | # Needed for CI to be able to spawn containers 20 | - /var/run/docker.sock:/var/run/docker.sock 21 | # Needed for CI to perform bind mounts as we would on the host 22 | - $PWD/..:$PWD/.. 23 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Compose services for testing the various model containers 2 | # GPU not enabled to prevent dependency on the NVIDIA docker runtime 3 | 4 | version: "3.7" 5 | 6 | services: 7 | bert: 8 | build: 9 | context: ./gobbli/model/bert 10 | 11 | fasttext: 12 | build: 13 | context: ./gobbli/model/fasttext 14 | 15 | mt-dnn: 16 | build: 17 | context: ./gobbli/model/mtdnn 18 | 19 | use: 20 | build: 21 | context: ./gobbli/model/use 22 | 23 | bert-maskedlm: 24 | build: 25 | context: ./gobbli/augment/bert 26 | 27 | marian: 28 | build: 29 | context: ./gobbli/augment/marian 30 | 31 | transformer: 32 | build: 33 | context: ./gobbli/model/transformer 34 | 35 | spacy: 36 | build: 37 | context: ./gobbli/model/spacy 38 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help".
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/docs/_static/.gitkeep -------------------------------------------------------------------------------- /docs/_static/gobbli_favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/docs/_static/gobbli_favicon.ico -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | Detailed reference for all code in the library. 5 | 6 | .. toctree:: 7 | :maxdepth: 4 8 | 9 | auto/gobbli 10 | -------------------------------------------------------------------------------- /docs/img/interactive_apps/evaluate/evaluate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/docs/img/interactive_apps/evaluate/evaluate.png -------------------------------------------------------------------------------- /docs/img/interactive_apps/explain/explain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/docs/img/interactive_apps/explain/explain.png -------------------------------------------------------------------------------- /docs/img/interactive_apps/explain/explain_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/docs/img/interactive_apps/explain/explain_output.png -------------------------------------------------------------------------------- /docs/img/interactive_apps/explore/explore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/docs/img/interactive_apps/explore/explore.png -------------------------------------------------------------------------------- /docs/img/interactive_apps/explore/explore_embeddings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/docs/img/interactive_apps/explore/explore_embeddings.png -------------------------------------------------------------------------------- /docs/img/interactive_apps/explore/explore_topic_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/docs/img/interactive_apps/explore/explore_topic_model.png 
-------------------------------------------------------------------------------- /docs/img/interactive_apps/explore/explore_trained_embeddings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/docs/img/interactive_apps/explore/explore_trained_embeddings.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. gobbli documentation master file, created by 2 | sphinx-quickstart on Tue Jun 4 14:50:18 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to gobbli's documentation 7 | ================================= 8 | 9 | gobbli is a library designed to make experimentation and analysis using deep learning easier. It provides a simple, uniform interface to deep learning models that abstracts away most of the complexity in terms of different input/output formats, library versions, etc. It attempts to implement a set of common use cases with an emphasis on usability rather than performance. 10 | 11 | gobbli is *not* designed to provide deep learning models in a production context. Each task generally involves running a Docker container in the background and transferring a large amount of data to and from disk, which creates significant overhead. Additionally, gobbli does not support fine-grained model-specific tuning, such as custom loss functions. Our goal is to take the user 80% of the way to their deep learning solution as quickly as possible so they can decide whether it's worth the effort to resolve the remaining 20%. 12 | 13 | .. toctree:: 14 | prerequisites 15 | quickstart 16 | interactive_apps 17 | troubleshooting 18 | advanced_usage 19 | api 20 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/prerequisites.rst: -------------------------------------------------------------------------------- 1 | Prerequisites 2 | ============= 3 | 4 | gobbli requires Python 3.7+. 5 | 6 | First, ensure `Docker `__ is installed and your user has permissions to run docker commands. Next, install the ``gobbli`` package and dependencies into your environment: 7 | 8 | .. 
code-block:: bash 9 | 10 | pip install gobbli 11 | 12 | Some of the :ref:`data-augmentation` methods require extra packages. You can install them all using the following steps: 13 | 14 | .. code-block:: bash 15 | 16 | pip install gobbli[augment] 17 | python -m spacy download en_core_web_sm 18 | 19 | Additionally, :ref:`document-windowing` with the `SentencePiece `__ tokenizer requires extra packages. Install them like so: 20 | 21 | .. code-block:: bash 22 | 23 | pip install gobbli[tokenize] 24 | 25 | .. _interactive-app-prereqs: 26 | 27 | The `Streamlit `__-based :ref:`interactive-apps` require their own set of dependencies: 28 | 29 | .. code-block:: bash 30 | 31 | pip install gobbli[interactive] 32 | 33 | If you want to train models using a GPU, you will additionally need an NVIDIA graphics card and `nvidia-docker `__. 34 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # Needed for readthedocs to install everything required to build the docs 2 | -r ../requirements.txt 3 | sphinx==2.1.0 4 | sphinx-autobuild==0.7.1 5 | sphinx-autodoc-typehints==1.6.0 6 | sphinx-paramlinks==0.3.7 7 | 8 | mock==3.0.5 9 | autodoc==0.5.0 10 | 11 | gobbli[augment,tokenize,interactive] 12 | -------------------------------------------------------------------------------- /generate_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | rm -f docs/auto/* 6 | cd docs 7 | make html 8 | -------------------------------------------------------------------------------- /gobbli/__init__.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import pkg_resources 4 | 5 | import gobbli.augment as augment 6 | import gobbli.dataset as dataset 7 | import gobbli.experiment as experiment 8 | import gobbli.io as io 9 | import gobbli.model as model 10 | from gobbli.util import TokenizeMethod 11 | 12 | # Warn the user of potential conflicts using the old third-party typing/dataclasses 13 | # modules 14 | for conflicting_pkg in ("typing", "dataclasses"): 15 | req = pkg_resources.Requirement.parse(conflicting_pkg) 16 | if pkg_resources.working_set.find(req) is not None: 17 | warnings.warn( 18 | f"You've installed a third-party module named '{conflicting_pkg}' which " 19 | "conflicts with a standard library module of the same name. This can cause " 20 | "errors when unpickling code, e.g. when running experiments using Ray. 
Consider " 21 | f"uninstalling the module:\n\npip uninstall {conflicting_pkg}" 22 | ) 23 | 24 | __all__ = [ 25 | # Modules 26 | "augment", 27 | "dataset", 28 | "experiment", 29 | "model", 30 | "io", 31 | # Misc top level imports 32 | "TokenizeMethod", 33 | ] 34 | -------------------------------------------------------------------------------- /gobbli/augment/__init__.py: -------------------------------------------------------------------------------- 1 | from gobbli.augment.bert import BERTMaskedLM 2 | from gobbli.augment.marian import MarianMT 3 | from gobbli.augment.word2vec import Word2Vec 4 | from gobbli.augment.wordnet import WordNet 5 | 6 | __all__ = ["BERTMaskedLM", "Word2Vec", "WordNet", "MarianMT"] 7 | -------------------------------------------------------------------------------- /gobbli/augment/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from pathlib import Path 3 | from typing import List 4 | 5 | from gobbli.util import gobbli_dir 6 | 7 | 8 | def augment_dir() -> Path: 9 | return gobbli_dir() / "augment" 10 | 11 | 12 | class BaseAugment(ABC): 13 | """ 14 | Base class for data augmentation methods. 15 | """ 16 | 17 | @abstractmethod 18 | def augment(self, X: List[str], times: int = 5, p: float = 0.1) -> List[str]: 19 | """ 20 | Return additional texts for each text in the passed array. 21 | 22 | Args: 23 | X: Input texts. 24 | times: How many texts to generate per text in the input. 25 | p: Probability of considering each token in the input for replacement. 26 | Note that some tokens aren't able to be replaced by a given augmentation 27 | method and will be ignored, so the actual proportion of replaced tokens 28 | in your input may be much lower than this number. 29 | Returns: 30 | Generated texts (length = ``times * len(X)``). 31 | """ 32 | raise NotImplementedError 33 | 34 | @classmethod 35 | def data_dir(cls) -> Path: 36 | """ 37 | Returns: 38 | The data directory used for this class of augmentation model. 
39 | """ 40 | return augment_dir() / cls.__name__ 41 | -------------------------------------------------------------------------------- /gobbli/augment/bert/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.3-cuda10.1-cudnn7-runtime 2 | 3 | RUN pip install transformers==2.3.0 sentencepiece==0.1.86 4 | 5 | COPY ./src /code/bert 6 | WORKDIR /code/bert 7 | -------------------------------------------------------------------------------- /gobbli/augment/bert/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import BERTMaskedLM 2 | 3 | __all__ = ["BERTMaskedLM"] 4 | -------------------------------------------------------------------------------- /gobbli/augment/marian/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.3-cuda10.1-cudnn7-runtime 2 | 3 | RUN pip install transformers==2.9.1 sentencepiece==0.1.86 4 | 5 | COPY ./src /code/marian 6 | WORKDIR /code/marian 7 | -------------------------------------------------------------------------------- /gobbli/augment/marian/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import MarianMT 2 | 3 | __all__ = ["MarianMT"] 4 | -------------------------------------------------------------------------------- /gobbli/cli.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from pathlib import Path 3 | 4 | import click 5 | 6 | INTERACTIVE_DIR = Path(__file__).parent / "interactive" 7 | 8 | 9 | def _streamlit_run(app_name: str, *args): 10 | return subprocess.check_call( 11 | ["streamlit", "run", str(INTERACTIVE_DIR / f"{app_name}.py"), "--", *args] 12 | ) 13 | 14 | 15 | @click.group() 16 | def main(): 17 | pass 18 | 19 | 20 | @main.command( 21 | # Forward the --help argument to the streamlit apps 22 | "explore", 23 | context_settings=dict(ignore_unknown_options=True), 24 | add_help_option=False, 25 | ) 26 | @click.argument("args", nargs=-1, type=click.UNPROCESSED) 27 | def main_explore(args): 28 | _streamlit_run("explore", *args) 29 | 30 | 31 | @main.command( 32 | "evaluate", 33 | context_settings=dict(ignore_unknown_options=True), 34 | add_help_option=False, 35 | ) 36 | @click.argument("args", nargs=-1, type=click.UNPROCESSED) 37 | def main_evaluate(args): 38 | _streamlit_run("evaluate", *args) 39 | 40 | 41 | @main.command( 42 | "explain", context_settings=dict(ignore_unknown_options=True), add_help_option=False 43 | ) 44 | @click.argument("args", nargs=-1, type=click.UNPROCESSED) 45 | def main_explain(args): 46 | _streamlit_run("explain", *args) 47 | 48 | 49 | if __name__ == "__main__": 50 | main() 51 | -------------------------------------------------------------------------------- /gobbli/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from gobbli.dataset.cmu_movie_summary import MovieSummaryDataset 2 | from gobbli.dataset.imdb import IMDBDataset 3 | from gobbli.dataset.newsgroups import NewsgroupsDataset 4 | from gobbli.dataset.trivial import TrivialDataset 5 | 6 | __all__ = ["TrivialDataset", "NewsgroupsDataset", "IMDBDataset", "MovieSummaryDataset"] 7 | -------------------------------------------------------------------------------- /gobbli/dataset/imdb.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing 
import Set, Tuple 3 | 4 | from gobbli.dataset.nested_file import NestedFileDataset 5 | from gobbli.util import download_archive 6 | 7 | 8 | class IMDBDataset(NestedFileDataset): 9 | """ 10 | gobbli Dataset for the IMDB sentiment analysis problem. 11 | 12 | https://ai.stanford.edu/~amaas/data/sentiment/ 13 | """ 14 | 15 | def labels(self) -> Set[str]: 16 | return {"pos", "neg"} 17 | 18 | def download(self, data_dir: Path) -> Path: 19 | return download_archive( 20 | "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", data_dir 21 | ) 22 | 23 | def folders(self) -> Tuple[Path, Path]: 24 | return Path("aclImdb/train"), Path("aclImdb/test") 25 | 26 | def read_source_file(self, file_path: Path) -> str: 27 | return file_path.read_text() 28 | -------------------------------------------------------------------------------- /gobbli/dataset/newsgroups.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Set, Tuple 3 | 4 | from gobbli.dataset.nested_file import NestedFileDataset 5 | from gobbli.util import download_archive 6 | 7 | 8 | class NewsgroupsDataset(NestedFileDataset): 9 | """ 10 | gobbli Dataset for the 20 Newsgroups problem. 11 | 12 | http://qwone.com/~jason/20Newsgroups/ 13 | """ 14 | 15 | def labels(self) -> Set[str]: 16 | return { 17 | "alt.atheism", 18 | "comp.graphics", 19 | "comp.os.ms-windows.misc", 20 | "comp.sys.ibm.pc.hardware", 21 | "comp.sys.mac.hardware", 22 | "comp.windows.x", 23 | "misc.forsale", 24 | "rec.autos", 25 | "rec.motorcycles", 26 | "rec.sport.baseball", 27 | "rec.sport.hockey", 28 | "sci.crypt", 29 | "sci.electronics", 30 | "sci.med", 31 | "sci.space", 32 | "soc.religion.christian", 33 | "talk.politics.guns", 34 | "talk.politics.mideast", 35 | "talk.politics.misc", 36 | "talk.religion.misc", 37 | } 38 | 39 | def download(self, data_dir: Path) -> Path: 40 | return download_archive( 41 | "https://ndownloader.figshare.com/files/5975967", 42 | data_dir, 43 | filename="20news-bydate.tar.gz", 44 | ) 45 | 46 | def folders(self) -> Tuple[Path, Path]: 47 | return Path("20news-bydate-train"), Path("20news-bydate-test") 48 | 49 | def read_source_file(self, file_path: Path) -> str: 50 | return file_path.read_text(encoding="latin-1") 51 | -------------------------------------------------------------------------------- /gobbli/dataset/trivial.py: -------------------------------------------------------------------------------- 1 | from gobbli.dataset.base import BaseDataset 2 | 3 | 4 | class TrivialDataset(BaseDataset): 5 | """ 6 | gobbli Dataset containing only a few observations. 7 | Useful for verifying a model runs without waiting for an 8 | actual dataset to process. 
9 | """ 10 | 11 | DATASET = ["This is positive.", "This, although, is negative."] 12 | LABELS = ["1", "0"] 13 | 14 | def _is_built(self) -> bool: 15 | return True 16 | 17 | def _build(self): 18 | pass 19 | 20 | def X_train(self): 21 | return TrivialDataset.DATASET 22 | 23 | def y_train(self): 24 | return TrivialDataset.LABELS 25 | 26 | def X_test(self): 27 | return TrivialDataset.DATASET 28 | 29 | def y_test(self): 30 | return TrivialDataset.LABELS 31 | -------------------------------------------------------------------------------- /gobbli/experiment/__init__.py: -------------------------------------------------------------------------------- 1 | from gobbli.experiment.classification import ClassificationExperiment 2 | 3 | __all__ = ["ClassificationExperiment"] 4 | -------------------------------------------------------------------------------- /gobbli/inspect/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/inspect/__init__.py -------------------------------------------------------------------------------- /gobbli/model/__init__.py: -------------------------------------------------------------------------------- 1 | from gobbli.model.bert import BERT 2 | from gobbli.model.fasttext import FastText 3 | from gobbli.model.majority import MajorityClassifier 4 | from gobbli.model.mtdnn import MTDNN 5 | from gobbli.model.random import RandomEmbedder 6 | from gobbli.model.sklearn import SKLearnClassifier, TfidfEmbedder 7 | from gobbli.model.spacy import SpaCyModel 8 | from gobbli.model.transformer import Transformer 9 | from gobbli.model.use import USE 10 | 11 | __all__ = [ 12 | "BERT", 13 | "FastText", 14 | "MajorityClassifier", 15 | "MTDNN", 16 | "RandomEmbedder", 17 | "Transformer", 18 | "SKLearnClassifier", 19 | "SpaCyModel", 20 | "USE", 21 | "TfidfEmbedder", 22 | ] 23 | -------------------------------------------------------------------------------- /gobbli/model/bert/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG GPU 2 | FROM tensorflow/tensorflow:1.11.0${GPU:+-gpu}-py3 3 | 4 | RUN apt-get update && apt-get install -y --no-install-recommends \ 5 | git \ 6 | && apt-get clean \ 7 | && rm -rf /var/lib/apt/lists/* 8 | 9 | # Copy modified source code in 10 | # Base commit: d66a146741588fb208450bde15aa7db143baaa69 11 | COPY ./src /code/bert 12 | WORKDIR /code/bert 13 | -------------------------------------------------------------------------------- /gobbli/model/bert/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import BERT 2 | 3 | __all__ = ["BERT"] 4 | -------------------------------------------------------------------------------- /gobbli/model/bert/src/.gitignore: -------------------------------------------------------------------------------- 1 | # Initially taken from Github's Python gitignore file 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to 
inject date/other info into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | .dmypy.json 113 | dmypy.json 114 | 115 | # Pyre type checker 116 | .pyre/ 117 | -------------------------------------------------------------------------------- /gobbli/model/bert/src/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | BERT needs to maintain permanent compatibility with the pre-trained model files, 4 | so we do not plan to make any major changes to this library (other than what was 5 | promised in the README). However, we can accept small patches related to 6 | refactoring and documentation. To submit contributions, there are just a few 7 | small guidelines you need to follow. 8 | 9 | ## Contributor License Agreement 10 | 11 | Contributions to this project must be accompanied by a Contributor License 12 | Agreement. You (or your employer) retain the copyright to your contribution; 13 | this simply gives us permission to use and redistribute your contributions as 14 | part of the project. Head over to https://cla.developers.google.com/ to see 15 | your current agreements on file or to sign a new one. 16 | 17 | You generally only need to submit a CLA once, so if you've already submitted one 18 | (even if it was for a different project), you probably don't need to do it 19 | again. 20 | 21 | ## Code reviews 22 | 23 | All submissions, including submissions by project members, require review. We 24 | use GitHub pull requests for this purpose. Consult 25 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 26 | information on using pull requests. 27 | 28 | ## Community Guidelines 29 | 30 | This project follows 31 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/). 32 | -------------------------------------------------------------------------------- /gobbli/model/bert/src/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /gobbli/model/bert/src/optimization_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import optimization 20 | import tensorflow as tf 21 | 22 | 23 | class OptimizationTest(tf.test.TestCase): 24 | 25 | def test_adam(self): 26 | with self.test_session() as sess: 27 | w = tf.get_variable( 28 | "w", 29 | shape=[3], 30 | initializer=tf.constant_initializer([0.1, -0.2, -0.1])) 31 | x = tf.constant([0.4, 0.2, -0.5]) 32 | loss = tf.reduce_mean(tf.square(x - w)) 33 | tvars = tf.trainable_variables() 34 | grads = tf.gradients(loss, tvars) 35 | global_step = tf.train.get_or_create_global_step() 36 | optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2) 37 | train_op = optimizer.apply_gradients(zip(grads, tvars), global_step) 38 | init_op = tf.group(tf.global_variables_initializer(), 39 | tf.local_variables_initializer()) 40 | sess.run(init_op) 41 | for _ in range(100): 42 | sess.run(train_op) 43 | w_np = sess.run(w) 44 | self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2) 45 | 46 | 47 | if __name__ == "__main__": 48 | tf.test.main() 49 | -------------------------------------------------------------------------------- /gobbli/model/bert/src/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow >= 1.11.0 # CPU Version of TensorFlow. 2 | # tensorflow-gpu >= 1.11.0 # GPU version of TensorFlow. 
3 | -------------------------------------------------------------------------------- /gobbli/model/fasttext/Dockerfile: -------------------------------------------------------------------------------- 1 | # Build stage to compile the binary 2 | FROM ubuntu:18.04 3 | 4 | RUN apt-get update && apt-get install -y \ 5 | build-essential \ 6 | wget \ 7 | git \ 8 | python-dev \ 9 | unzip \ 10 | python-numpy \ 11 | python-scipy \ 12 | && rm -rf /var/cache/apk/* 13 | 14 | WORKDIR /code 15 | 16 | RUN git clone https://github.com/facebookresearch/fastText.git /code \ 17 | && cd /code \ 18 | && git checkout 5e1320a1594a026a081f8b1e5caa3085a711a625 \ 19 | && rm -rf .git* \ 20 | && make 21 | 22 | # Final slim image containing just the binary 23 | FROM ubuntu:18.04 24 | 25 | WORKDIR /code 26 | COPY --from=0 /code/fasttext . 27 | ENTRYPOINT ["./fasttext"] 28 | CMD ["help"] 29 | -------------------------------------------------------------------------------- /gobbli/model/fasttext/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import FastText, FastTextCheckpoint 2 | 3 | __all__ = ["FastText", "FastTextCheckpoint"] 4 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM allenlao/pytorch-mt-dnn:v0.1 2 | 3 | # Copy modified source code in 4 | # Base commit: a7f74e0afcffd17ab68fb752fa1cc06eabaacda3 5 | COPY ./src /code/mt-dnn 6 | WORKDIR /code/mt-dnn 7 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import MTDNN 2 | 3 | __all__ = ["MTDNN"] 4 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Microsoft 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
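gobbli/model/bert/src/optimization_test.py above verifies BERT's custom AdamWeightDecayOptimizer by fitting a three-element weight vector to a constant target and asserting convergence within a loose tolerance. A minimal sketch of the same smoke test written against torch.optim.AdamW (an illustrative port under the assumption that PyTorch is available; this code is not part of gobbli or BERT, which run the TensorFlow 1.x version above):

# Mirror OptimizationTest.test_adam: drive w toward the constant target x.
import torch

w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
x = torch.tensor([0.4, 0.2, -0.5])
# weight_decay=0.0 makes AdamW behave like plain Adam, matching the original test's intent.
optimizer = torch.optim.AdamW([w], lr=0.2, weight_decay=0.0)
for _ in range(100):
    optimizer.zero_grad()
    loss = torch.mean((x - w) ** 2)
    loss.backward()
    optimizer.step()
# Same tolerance as the TF1 test above.
assert torch.allclose(w, x, rtol=1e-2, atol=1e-2)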
-------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/config/tasks_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "mnli": 0.3, 3 | "cola": 0.05 4 | } 5 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/data_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/model/mtdnn/src/data_utils/__init__.py -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/data_utils/log_wrapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 2 | import logging 3 | from time import gmtime, strftime 4 | import sys 5 | 6 | def create_logger(name, silent=False, to_disk=False, log_file=None): 7 | """Logger wrapper 8 | """ 9 | # setup logger 10 | log = logging.getLogger(name) 11 | log.setLevel(logging.DEBUG) 12 | log.propagate = False 13 | formatter = logging.Formatter(fmt='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S') 14 | if not silent: 15 | ch = logging.StreamHandler(sys.stdout) 16 | ch.setLevel(logging.INFO) 17 | ch.setFormatter(formatter) 18 | log.addHandler(ch) 19 | if to_disk: 20 | log_file = log_file if log_file is not None else strftime("%Y-%m-%d-%H-%M-%S.log", gmtime()) 21 | fh = logging.FileHandler(log_file) 22 | fh.setLevel(logging.DEBUG) 23 | fh.setFormatter(formatter) 24 | log.addHandler(fh) 25 | return log 26 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/data_utils/metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 2 | from sklearn.metrics import matthews_corrcoef 3 | from sklearn.metrics import accuracy_score, f1_score 4 | from scipy.stats import pearsonr, spearmanr 5 | from torch.nn.functional import cross_entropy 6 | 7 | def compute_acc(predicts, labels): 8 | return 100.0 * accuracy_score(labels, predicts) 9 | 10 | def compute_f1(predicts, labels): 11 | return 100.0 * f1_score(labels, predicts) 12 | 13 | def compute_mcc(predicts, labels): 14 | return 100.0 * matthews_corrcoef(labels, predicts) 15 | 16 | def compute_pearson(predicts, labels): 17 | pcof = pearsonr(labels, predicts)[0] 18 | return 100.0 * pcof 19 | 20 | def compute_spearman(predicts, labels): 21 | scof = spearmanr(labels, predicts)[0] 22 | return 100.0 * scof 23 | 24 | def compute_cross_entropy(predicts, labels): 25 | return cross_entropy(predicts, labels) 26 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/data_utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 
2 | import random 3 | import torch 4 | import numpy 5 | from torch.autograd import Variable 6 | import subprocess 7 | 8 | class AverageMeter(object): 9 | """Computes and stores the average and current value.""" 10 | def __init__(self): 11 | self.reset() 12 | 13 | def reset(self): 14 | self.val = 0 15 | self.avg = 0 16 | self.sum = 0 17 | self.count = 0 18 | 19 | def update(self, val, n=1): 20 | self.val = val 21 | self.sum += val * n 22 | self.count += n 23 | self.avg = self.sum / self.count 24 | 25 | def set_environment(seed, set_cuda=False): 26 | random.seed(seed) 27 | numpy.random.seed(seed) 28 | torch.manual_seed(seed) 29 | if torch.cuda.is_available() and set_cuda: 30 | torch.cuda.manual_seed_all(seed) 31 | 32 | def patch_var(v, cuda=True): 33 | if cuda: 34 | v = Variable(v.cuda(async=True)) 35 | else: 36 | v = Variable(v) 37 | return v 38 | 39 | def get_gpu_memory_map(): 40 | result = subprocess.check_output( 41 | [ 42 | 'nvidia-smi', '--query-gpu=memory.used', 43 | '--format=csv,nounits,noheader' 44 | ], encoding='utf-8') 45 | gpu_memory = [int(x) for x in result.strip().split('\n')] 46 | gpu_memory_map = dict(zip(range(len(gpu_memory)), gpu_memory)) 47 | return gpu_memory_map 48 | 49 | def get_pip_env(): 50 | result = subprocess.call(["pip", "freeze"]) 51 | return result 52 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04 2 | RUN apt-get clean && apt-get update && apt-get install -y locales 3 | ENV LANG="en_US.UTF-8" LC_ALL="en_US.UTF-8" LANGUAGE="en_US.UTF-8" LC_TYPE="en_US.UTF-8" TERM=xterm-256color 4 | RUN locale-gen en_US en_US.UTF-8 5 | RUN apt-get update && apt-get install -y --no-install-recommends \ 6 | build-essential \ 7 | cmake \ 8 | git \ 9 | curl \ 10 | vim \ 11 | zip \ 12 | wget \ 13 | unzip \ 14 | ca-certificates \ 15 | libjpeg-dev \ 16 | libpng-dev &&\ 17 | rm -rf /var/lib/apt/lists/* 18 | 19 | 20 | ENV PYTHON_VERSION=3.6 21 | 22 | RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 23 | chmod +x ~/miniconda.sh && \ 24 | ~/miniconda.sh -b -p /opt/conda && \ 25 | rm ~/miniconda.sh && \ 26 | /opt/conda/bin/conda create -y --name pytorch-py$PYTHON_VERSION python=$PYTHON_VERSION numpy=1.14.5 scipy ipython mkl&& \ 27 | /opt/conda/bin/conda clean -ya 28 | 29 | ENV PATH /opt/conda/envs/pytorch-py$PYTHON_VERSION/bin:$PATH 30 | 31 | RUN /opt/conda/bin/conda install --name pytorch-py$PYTHON_VERSION cuda90 pytorch=0.4.1 torchvision -c pytorch && \ 32 | /opt/conda/bin/conda clean -ya 33 | RUN pip install --upgrade pip 34 | RUN pip install tensorboard_logger 35 | RUN pip install tqdm 36 | RUN pip install h5py==2.7.1 37 | RUN pip install boto3 38 | RUN pip install -U scikit-learn 39 | # install pytorch bert 40 | RUN pip install pytorch-pretrained-bert==v0.6.0 41 | 42 | # GLUE baseline dependencies 43 | RUN pip install nltk 44 | RUN pip install allennlp==0.4 45 | RUN pip install ipdb 46 | RUN pip install tensorboardX 47 | 48 | WORKDIR /root 49 | #COPY requirements.txt /root/ 50 | #RUN pip install -r requirements.txt 51 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ############################################################## 3 | # This script 
is used to download resources for MT-DNN experiments 4 | ############################################################## 5 | 6 | DATA_DIR=$(pwd)/data 7 | echo "Create a folder $DATA_DIR" 8 | mkdir ${DATA_DIR} 9 | 10 | BERT_DIR=$(pwd)/mt_dnn_models 11 | echo "Create a folder $BERT_DIR" 12 | mkdir ${BERT_DIR} 13 | 14 | ## DOWNLOAD GLUE DATA 15 | ## Please refer to the glue-baseline install requirements if you run into issues. 16 | git clone https://github.com/jsalt18-sentence-repl/jiant.git 17 | cd jiant 18 | python scripts/download_glue_data.py --data_dir $DATA_DIR --tasks all 19 | 20 | cd .. 21 | rm -rf jiant 22 | ######################### 23 | 24 | ## DOWNLOAD SciTail 25 | cd $DATA_DIR 26 | wget http://data.allenai.org.s3.amazonaws.com/downloads/SciTailV1.1.zip 27 | unzip SciTailV1.1.zip 28 | mv SciTailV1.1 SciTail 29 | # remove zip files 30 | rm *.zip 31 | 32 | cd ${BERT_DIR} 33 | ## DOWNLOAD BERT 34 | wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip -O "uncased_bert_base.zip" 35 | unzip uncased_bert_base.zip 36 | mv uncased_L-12_H-768_A-12/vocab.txt "${BERT_DIR}/" 37 | rm *.zip 38 | rm -rf uncased_L-12_H-768_A-12 39 | 40 | ## Download bert models 41 | wget https://mrc.blob.core.windows.net/mt-dnn-model/bert_model_base_v2.pt -O "${BERT_DIR}/bert_model_base.pt" 42 | wget https://mrc.blob.core.windows.net/mt-dnn-model/bert_model_large_v2.pt -O "${BERT_DIR}/bert_model_large.pt" 43 | wget https://mrc.blob.core.windows.net/mt-dnn-model/mt_dnn_base.pt -O "${BERT_DIR}/mt_dnn_base.pt" 44 | wget https://mrc.blob.core.windows.net/mt-dnn-model/mt_dnn_large.pt -O "${BERT_DIR}/mt_dnn_large.pt" 45 | 46 | ## Download preprocessed SciTail/SNLI data for domain adaptation 47 | cd $DATA_DIR 48 | DOMAIN_ADP="domain_adaptation" 49 | echo "Create a folder $DOMAIN_ADP" 50 | mkdir ${DOMAIN_ADP} 51 | 52 | wget https://mrc.blob.core.windows.net/mt-dnn-model/data.zip 53 | unzip data.zip 54 | mv data/* ${DOMAIN_ADP} 55 | rm -rf data.zip 56 | rm -rf data 57 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/model/mtdnn/src/module/__init__.py -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/module/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 2 | import torch 3 | import math 4 | from torch.nn.functional import tanh, relu, prelu, leaky_relu, sigmoid, elu, selu 5 | 6 | def linear(x): 7 | return x 8 | 9 | def swish(x): 10 | return x * sigmoid(x) 11 | 12 | def gelu(x): 13 | """ref:https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L113 14 | """ 15 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 16 | 17 | def activation(func_a): 18 | """Activation function wrapper 19 | """ 20 | try: 21 | f = eval(func_a) 22 | except: 23 | f = linear 24 | return f 25 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/module/dropout_wrapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved.
2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | class DropoutWrapper(nn.Module): 8 | """ 9 | This is a dropout wrapper which supports the fix mask dropout 10 | """ 11 | def __init__(self, dropout_p=0, enable_vbp=True): 12 | super(DropoutWrapper, self).__init__() 13 | """variational dropout means fix dropout mask 14 | ref: https://discuss.pytorch.org/t/dropout-for-rnns/633/11 15 | """ 16 | self.enable_variational_dropout = enable_vbp 17 | self.dropout_p = dropout_p 18 | 19 | def forward(self, x): 20 | """ 21 | :param x: batch * len * input_size 22 | """ 23 | if self.training == False or self.dropout_p == 0: 24 | return x 25 | 26 | if len(x.size()) == 3: 27 | mask = Variable(1.0 / (1-self.dropout_p) * torch.bernoulli((1-self.dropout_p) * (x.data.new(x.size(0), x.size(2)).zero_() + 1)), requires_grad=False) 28 | return mask.unsqueeze(1).expand_as(x) * x 29 | else: 30 | return F.dropout(x, p=self.dropout_p, training=self.training) 31 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/module/sub_layers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.nn.parameter import Parameter 6 | 7 | class LayerNorm(nn.Module): 8 | #ref: https://github.com/pytorch/pytorch/issues/1959 9 | # :https://arxiv.org/pdf/1607.06450.pdf 10 | def __init__(self, hidden_size, eps=1e-4): 11 | super(LayerNorm, self).__init__() 12 | self.alpha = Parameter(torch.ones(1,1,hidden_size)) # gain g 13 | self.beta = Parameter(torch.zeros(1,1,hidden_size)) # bias b 14 | self.eps = eps 15 | 16 | def forward(self, x): 17 | """ 18 | Args: 19 | :param x: batch * len * input_size 20 | 21 | Returns: 22 | normalized x 23 | """ 24 | mu = torch.mean(x, 2, keepdim=True).expand_as(x) 25 | sigma = torch.std(x, 2, keepdim=True).expand_as(x) 26 | return (x - mu) / (sigma + self.eps) * self.alpha.expand_as(x) + self.beta.expand_as(x) 27 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/mt_dnn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/model/mtdnn/src/mt_dnn/__init__.py -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/mt_dnn/gobbli_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | from .model import MTDNNModel 8 | 9 | 10 | class GobbliMTDNNModel(MTDNNModel): 11 | def update(self, input_ids, token_type_ids, attention_mask, labels): 12 | self.network.train() 13 | if self.config['cuda']: 14 | labels = labels.cuda(async=True) 15 | 16 | y = Variable(labels, requires_grad=False) 17 | logits = self.mnetwork(input_ids, token_type_ids, attention_mask, task_id=0) 18 | loss = F.cross_entropy(logits, y) 19 | 20 | self.train_loss.update(loss.item(), logits.size(0)) 21 | self.optimizer.zero_grad() 22 | 23 | loss.backward() 24 | if self.config['global_grad_clipping'] > 0: 25 | torch.nn.utils.clip_grad_norm_(self.network.parameters(), 26 | self.config['global_grad_clipping']) 27 | 28 | self.optimizer.step() 29 | 
self.updates += 1 30 | self.update_ema() 31 | 32 | def predict(self, input_ids, token_type_ids, attention_mask): 33 | self.network.eval() 34 | score = self.mnetwork(input_ids, token_type_ids, attention_mask, task_id=0) 35 | score = F.softmax(score, dim=1).data.cpu() 36 | predict = np.argmax(score.numpy(), axis=1).tolist() 37 | return score, predict 38 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | torch==0.4.1 3 | tqdm 4 | colorlog 5 | boto3 6 | pytorch-pretrained-bert==v0.6.0 7 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/run_toy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ############################### 4 | # Training a mt-dnn model 5 | # Note that this is a toy setting; please refer to our paper for detailed hyper-parameters. 6 | ############################### 7 | 8 | python prepro.py 9 | python train.py -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/scripts/domain_adaptation_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [[ $# -ne 8 ]]; then 3 | echo "train.sh <prefix> <bert_path> <train_datasets> <test_datasets> <data_dir> <model_root> <batch_size> <gpu>" 4 | exit 1 5 | fi 6 | prefix=$1 7 | BERT_PATH=$2 8 | train_datasets=$3 9 | test_datasets=$4 10 | DATA_DIR=$5 11 | MODEL_ROOT=$6 12 | BATCH_SIZE=$7 13 | gpu=$8 14 | echo "export CUDA_VISIBLE_DEVICES=${gpu}" 15 | export CUDA_VISIBLE_DEVICES=${gpu} 16 | tstr=$(date +"%FT%H%M") 17 | 18 | answer_opt=0 19 | optim="adamax" 20 | grad_clipping=0 21 | global_grad_clipping=1 22 | 23 | model_dir="checkpoints/${prefix}_${optim}_answer_opt${answer_opt}_gc${grad_clipping}_ggc${global_grad_clipping}_${tstr}" 24 | log_file="${model_dir}/log.log" 25 | python ../train.py --data_dir ${DATA_DIR} --init_checkpoint ${BERT_PATH} --batch_size ${BATCH_SIZE} --output_dir ${model_dir} --log_file ${log_file} --answer_opt ${answer_opt} --optimizer ${optim} --train_datasets ${train_datasets} --test_datasets ${test_datasets} --grad_clipping ${grad_clipping} --global_grad_clipping ${global_grad_clipping} --multi_gpu_on 26 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/scripts/run_mt_dnn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [[ $# -ne 2 ]]; then 3 | echo "train.sh <batch_size> <gpu>" 4 | exit 1 5 | fi 6 | prefix="mt-dnn-rte" 7 | BATCH_SIZE=$1 8 | gpu=$2 9 | echo "export CUDA_VISIBLE_DEVICES=${gpu}" 10 | export CUDA_VISIBLE_DEVICES=${gpu} 11 | tstr=$(date +"%FT%H%M") 12 | 13 | train_datasets="mnli,rte,qqp,qnli,mrpc,sst,cola,stsb" 14 | test_datasets="mnli_matched,mnli_mismatched,rte" 15 | MODEL_ROOT="checkpoints" 16 | BERT_PATH="../mt_dnn_models/bert_model_large.pt" 17 | DATA_DIR="../data/mt_dnn" 18 | 19 | answer_opt=1 20 | optim="adamax" 21 | grad_clipping=0 22 | global_grad_clipping=1 23 | lr="5e-5" 24 | 25 | model_dir="checkpoints/${prefix}_${optim}_answer_opt${answer_opt}_gc${grad_clipping}_ggc${global_grad_clipping}_${tstr}" 26 | log_file="${model_dir}/log.log" 27 | python ../train.py --data_dir ${DATA_DIR} --init_checkpoint ${BERT_PATH} --batch_size ${BATCH_SIZE} --output_dir ${model_dir} --log_file ${log_file} --answer_opt ${answer_opt} --optimizer ${optim} --train_datasets ${train_datasets} --test_datasets
${test_datasets} --grad_clipping ${grad_clipping} --global_grad_clipping ${global_grad_clipping} --learning_rate ${lr} --multi_gpu_on 28 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/scripts/run_rte.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [[ $# -ne 2 ]]; then 3 | echo "train.sh <batch_size> <gpu>" 4 | exit 1 5 | fi 6 | prefix="mt-dnn-rte" 7 | BATCH_SIZE=$1 8 | gpu=$2 9 | echo "export CUDA_VISIBLE_DEVICES=${gpu}" 10 | export CUDA_VISIBLE_DEVICES=${gpu} 11 | tstr=$(date +"%FT%H%M") 12 | 13 | train_datasets="rte" 14 | test_datasets="rte" 15 | MODEL_ROOT="checkpoints" 16 | BERT_PATH="../mt_dnn_models/mt_dnn_large.pt" 17 | DATA_DIR="../data/mt_dnn" 18 | 19 | answer_opt=0 20 | optim="adamax" 21 | grad_clipping=0 22 | global_grad_clipping=1 23 | lr="2e-5" 24 | 25 | model_dir="checkpoints/${prefix}_${optim}_answer_opt${answer_opt}_gc${grad_clipping}_ggc${global_grad_clipping}_${tstr}" 26 | log_file="${model_dir}/log.log" 27 | python ../train.py --data_dir ${DATA_DIR} --init_checkpoint ${BERT_PATH} --batch_size ${BATCH_SIZE} --output_dir ${model_dir} --log_file ${log_file} --answer_opt ${answer_opt} --optimizer ${optim} --train_datasets ${train_datasets} --test_datasets ${test_datasets} --grad_clipping ${grad_clipping} --global_grad_clipping ${global_grad_clipping} --learning_rate ${lr} 28 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/scripts/run_stsb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [[ $# -ne 2 ]]; then 3 | echo "train.sh <batch_size> <gpu>" 4 | exit 1 5 | fi 6 | prefix="mt-dnn-stsb" 7 | BATCH_SIZE=$1 8 | gpu=$2 9 | echo "export CUDA_VISIBLE_DEVICES=${gpu}" 10 | export CUDA_VISIBLE_DEVICES=${gpu} 11 | tstr=$(date +"%FT%H%M") 12 | 13 | train_datasets="stsb" 14 | test_datasets="stsb" 15 | MODEL_ROOT="checkpoints" 16 | BERT_PATH="../mt_dnn_models/mt_dnn_large.pt" 17 | DATA_DIR="../data/mt_dnn" 18 | 19 | answer_opt=0 20 | optim="adamax" 21 | grad_clipping=0 22 | global_grad_clipping=1 23 | 24 | model_dir="checkpoints/${prefix}_${optim}_answer_opt${answer_opt}_gc${grad_clipping}_ggc${global_grad_clipping}_${tstr}" 25 | log_file="${model_dir}/log.log" 26 | python ../train.py --data_dir ${DATA_DIR} --init_checkpoint ${BERT_PATH} --batch_size ${BATCH_SIZE} --output_dir ${model_dir} --log_file ${log_file} --answer_opt ${answer_opt} --optimizer ${optim} --train_datasets ${train_datasets} --test_datasets ${test_datasets} --grad_clipping ${grad_clipping} --global_grad_clipping ${global_grad_clipping} 27 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/scripts/scitail_domain_adaptation_bash.sh: -------------------------------------------------------------------------------- 1 | # 2 v100 2 | ./domain_adaptation_run.sh scitail_001_tl ../mt_dnn_models/mt_dnn_base.pt scitail_001 scitail ../data/domain_adaptation ../checkpoints 32 0,1 |tee scitail_001_tl.log 3 | ./domain_adaptation_run.sh scitail_01_tl ../mt_dnn_models/mt_dnn_base.pt scitail_01 scitail ../data/domain_adaptation ../checkpoints 32 0,1 |tee scitail_01_tl.log 4 | ./domain_adaptation_run.sh scitail_1_tl ../mt_dnn_models/mt_dnn_base.pt scitail_1 scitail ../data/domain_adaptation ../checkpoints 32 0,1 |tee scitail_1_tl.log 5 | ./domain_adaptation_run.sh scitail_full_tl ../mt_dnn_models/mt_dnn_base.pt scitail scitail ../data/domain_adaptation ../checkpoints 32 0,1 |tee
scitail_full_tl.log 6 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/scripts/snli_domain_adaptation_bash.sh: -------------------------------------------------------------------------------- 1 | # 2 v100 2 | ./domain_adaptation_run.sh snli_001_tl ../mt_dnn_models/mt_dnn_base.pt snli_001 snli ../data/domain_adaptation ../checkpoints 32 0,1 |tee snli_001_tl.log 3 | ./domain_adaptation_run.sh snli_01_tl ../mt_dnn_models/mt_dnn_base.pt snli_01 snli ../data/domain_adaptation ../checkpoints 32 0,1 |tee snli_01_tl.log 4 | ./domain_adaptation_run.sh snli_1_tl ../mt_dnn_models/mt_dnn_base.pt snli_1 snli ../data/domain_adaptation ../checkpoints 32 0,1 |tee snli_1_tl.log 5 | ./domain_adaptation_run.sh snli_full_tl ../mt_dnn_models/mt_dnn_base.pt snli snli ../data/domain_adaptation ../checkpoints 32 0,1 |tee snli_full_tl.log 6 | -------------------------------------------------------------------------------- /gobbli/model/sklearn/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import ( 2 | SKLearnClassifier, 3 | TfidfEmbedder, 4 | make_cv_tfidf_logistic_regression, 5 | make_default_tfidf_logistic_regression, 6 | persist_estimator, 7 | ) 8 | 9 | __all__ = [ 10 | "SKLearnClassifier", 11 | "TfidfEmbedder", 12 | "persist_estimator", 13 | "make_cv_tfidf_logistic_regression", 14 | "make_default_tfidf_logistic_regression", 15 | ] 16 | -------------------------------------------------------------------------------- /gobbli/model/spacy/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.3-cuda10.1-cudnn7-devel 2 | 3 | COPY ./src/requirements.txt /tmp/requirements.txt 4 | RUN pip install -r /tmp/requirements.txt 5 | 6 | COPY ./src /code/spacy 7 | WORKDIR /code/spacy 8 | 9 | ARG model 10 | RUN python -m spacy download ${model} 11 | -------------------------------------------------------------------------------- /gobbli/model/spacy/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import SpaCyModel 2 | 3 | __all__ = ["SpaCyModel"] 4 | -------------------------------------------------------------------------------- /gobbli/model/spacy/src/requirements.txt: -------------------------------------------------------------------------------- 1 | # These are additional requirements needed on top of the PyTorch image 2 | pandas==0.25.0 3 | # Use 2.2.1 to work around this issue: 4 | # https://github.com/explosion/spacy-transformers/issues/105 5 | # Can upgrade when this PR is merged: 6 | # https://github.com/explosion/spacy-transformers/pull/120 7 | spacy==2.2.1 8 | spacy-transformers==0.5.1 9 | # Resolve nested package version conflicts 10 | sentencepiece==0.1.86 11 | urllib3>=1.25.4,<1.27 12 | requests==2.25.1 13 | 14 | # We're using the PyTorch image with CUDA 10.1, but spaCy doesn't have an extra 15 | # requirements specifier for CUDA 10.1 at the time of this writing (it only has 10.0). 16 | # We could use the "cuda" extra requirements specifier, but it results in spaCy 17 | # requiring the source distribution of cupy, which can't be compiled in a container 18 | # without the NVIDIA runtime (which would require us to have separate images for GPU 19 | # and no-GPU). So, we manually install the spaCy GPU dependencies so we get 20 | # wheels compatible with CUDA 10.1. 
21 | cupy-cuda101==7.0.0 22 | thinc_gpu_ops==0.0.4 23 | -------------------------------------------------------------------------------- /gobbli/model/transformer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.3-cuda10.1-cudnn7-runtime 2 | 3 | COPY ./src/requirements.txt /tmp/requirements.txt 4 | RUN pip install -r /tmp/requirements.txt 5 | 6 | COPY ./src /code/transformer 7 | WORKDIR /code/transformer 8 | -------------------------------------------------------------------------------- /gobbli/model/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import Transformer 2 | 3 | __all__ = ["Transformer"] 4 | -------------------------------------------------------------------------------- /gobbli/model/transformer/src/requirements.txt: -------------------------------------------------------------------------------- 1 | # These are additional requirements needed on top of the pytorch image 2 | pandas==0.25.0 3 | transformers==2.8.0 4 | sentencepiece==0.1.86 5 | -------------------------------------------------------------------------------- /gobbli/model/use/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG GPU 2 | FROM tensorflow/tensorflow:2.0.1${GPU:+-gpu}-py3 3 | 4 | WORKDIR /code/use 5 | COPY ./src/requirements.txt ./ 6 | RUN pip install -r requirements.txt 7 | 8 | COPY ./src/ ./ 9 | -------------------------------------------------------------------------------- /gobbli/model/use/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import USE 2 | 3 | __all__ = ["USE"] 4 | -------------------------------------------------------------------------------- /gobbli/model/use/src/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow-hub==0.7.0 2 | -------------------------------------------------------------------------------- /gobbli/model/use/src/use.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import tensorflow_hub as hub 5 | 6 | 7 | def read_texts(input_file): 8 | with open(input_file, "r", encoding="utf-8") as f: 9 | return f.readlines() 10 | 11 | 12 | def make_batches(l, batch_size): 13 | for i in range(0, len(l), batch_size): 14 | yield l[i : i + batch_size] 15 | 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser() 19 | 20 | parser.add_argument( 21 | "--input-file", 22 | required=True, 23 | help="Path to the file containing input texts, one per line.", 24 | ) 25 | parser.add_argument( 26 | "--output-file", 27 | required=True, 28 | help="Path to write computed embeddings to (JSON format).", 29 | ) 30 | parser.add_argument( 31 | "--module-dir", 32 | required=True, 33 | help="Path to the downloaded/extracted TFHub Module for USE.", 34 | ) 35 | parser.add_argument( 36 | "--batch-size", 37 | default=32, 38 | type=int, 39 | help="Number of texts to embed at once. 
Default: %(default)s", 40 | ) 41 | 42 | args = parser.parse_args() 43 | 44 | embed = hub.load(args.module_dir) 45 | texts = read_texts(args.input_file) 46 | 47 | with open(args.output_file, "w") as f: 48 | for batch in make_batches(texts, args.batch_size): 49 | embeddings = embed(batch).numpy() 50 | for embedding in embeddings.tolist(): 51 | json.dump(embedding, f) 52 | f.write("\n") 53 | -------------------------------------------------------------------------------- /gobbli/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/test/__init__.py -------------------------------------------------------------------------------- /gobbli/test/augment/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/test/augment/__init__.py -------------------------------------------------------------------------------- /gobbli/test/augment/test_bertmaskedlm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gobbli.augment.bert import BERTMaskedLM 4 | from gobbli.test.util import model_test_dir 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "params,exception", 9 | [ 10 | # Unknown param 11 | ({"unknown": None}, ValueError), 12 | # Bad type (diversity) 13 | ({"diversity": 2}, TypeError), 14 | # Bad type (batch size) 15 | ({"batch_size": 2.5}, TypeError), 16 | # Bad type (n_probable) 17 | ({"n_probable": 2.5}, TypeError), 18 | # Bad value (diversity) 19 | ({"diversity": 0.0}, ValueError), 20 | # Bad value (batch_size) 21 | ({"batch_size": 0}, ValueError), 22 | # Bad value (n_probable) 23 | ({"n_probable": 0}, ValueError), 24 | # OK values 25 | ({"diversity": 0.5, "n_probable": 3, "batch_size": 16}, None), 26 | ], 27 | ) 28 | def test_init(params, exception): 29 | if exception is None: 30 | BERTMaskedLM(**params) 31 | else: 32 | with pytest.raises(exception): 33 | BERTMaskedLM(**params) 34 | 35 | 36 | def test_bertmaskedlm_augment(model_gpu_config, gobbli_dir): 37 | model = BERTMaskedLM( 38 | data_dir=model_test_dir(BERTMaskedLM), load_existing=True, **model_gpu_config 39 | ) 40 | model.build() 41 | 42 | times = 5 43 | new_texts = model.augment(["This is a test."], times=times) 44 | assert len(new_texts) == times 45 | -------------------------------------------------------------------------------- /gobbli/test/augment/test_marian.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gobbli.augment.marian import MarianMT 4 | from gobbli.test.util import model_test_dir 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "params,exception", 9 | [ 10 | # Unknown param 11 | ({"unknown": None}, ValueError), 12 | # Bad type (batch_size) 13 | ({"batch_size": 2.5}, TypeError), 14 | # Bad type (target_languages) 15 | ({"target_languages": "english"}, TypeError), 16 | # Bad value (batch_size) 17 | ({"batch_size": 0}, ValueError), 18 | # Bad value (target_languages) 19 | ({"target_languages": ["not a language"]}, ValueError), 20 | # Bad value, one OK value (target_languages) 21 | ({"target_languages": ["french", "not a language"]}, ValueError), 22 | # OK values 23 | ({"batch_size": 16, "target_languages": ["russian", "french"]}, None), 24 | ], 25 | ) 26 | def test_init(params, exception): 27 | if exception is None: 28 | 
MarianMT(**params) 29 | else: 30 | with pytest.raises(exception): 31 | MarianMT(**params) 32 | 33 | 34 | def test_marianmt_augment(model_gpu_config, gobbli_dir): 35 | # Don't go overboard with the languages here, since each 36 | # one requires a separate model (few hundred MB) to be downloaded 37 | target_languages = ["russian", "french"] 38 | model = MarianMT( 39 | data_dir=model_test_dir(MarianMT), 40 | load_existing=True, 41 | target_languages=target_languages, 42 | **model_gpu_config, 43 | ) 44 | model.build() 45 | 46 | # Can't augment more times than target languages 47 | invalid_num_times = len(target_languages) + 1 48 | with pytest.raises(ValueError): 49 | model.augment(["This is a test."], times=invalid_num_times) 50 | 51 | valid_num_times = len(target_languages) 52 | new_texts = model.augment(["This is a test."], times=valid_num_times) 53 | assert len(new_texts) == valid_num_times 54 | -------------------------------------------------------------------------------- /gobbli/test/augment/test_wordnet.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from spacy.lang.en import English 3 | 4 | from gobbli.augment.wordnet import WordNet, _detokenize_doc 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "text", 9 | [ 10 | "This is a test.", 11 | "Test with double space.", 12 | "Test-with hyphen.", 13 | "Testing some 1 2 3 numbers.", 14 | ], 15 | ) 16 | def test_detokenize_doc(text): 17 | # Initialize the spaCy extension needed to detokenize text 18 | WordNet() 19 | 20 | nlp = English() 21 | doc = nlp(text) 22 | 23 | # Fill out the replacement attribute as WordNet would. 24 | for tok in doc: 25 | tok._.replacement = tok.text 26 | assert _detokenize_doc(doc) == text 27 | 28 | 29 | def test_wordnet_augment(): 30 | wn = WordNet() 31 | times = 5 32 | new_texts = wn.augment(["This is a test."], times=times) 33 | assert len(new_texts) == times 34 | -------------------------------------------------------------------------------- /gobbli/test/classification/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/test/classification/__init__.py -------------------------------------------------------------------------------- /gobbli/test/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/test/dataset/__init__.py -------------------------------------------------------------------------------- /gobbli/test/dataset/test_base_dataset.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gobbli.test.util import MockDataset 4 | 5 | 6 | def test_base_dataset_load(): 7 | ds = MockDataset() 8 | 9 | # Dataset should be unbuilt after default initialization 10 | assert ds._build_count == 0 11 | 12 | ds = MockDataset.load() 13 | 14 | # Dataset should now be built 15 | assert ds._build_count == 1 16 | 17 | ds.load() 18 | 19 | # Dataset shouldn't have been built again 20 | assert ds._build_count == 1 21 | 22 | 23 | def test_base_dataset_train_input(): 24 | # Need to build first 25 | with pytest.raises(ValueError): 26 | MockDataset().train_input() 27 | 28 | ds = MockDataset.load() 29 | 30 | # No limit 31 | train_input = ds.train_input(valid_proportion=0.5) 32 | 33 | X_len = len(MockDataset.X_TRAIN_VALID) 34 | 
35 | assert len(train_input.X_train) == X_len / 2 36 | assert len(train_input.y_train) == X_len / 2 37 | assert len(train_input.X_valid) == X_len / 2 38 | assert len(train_input.y_valid) == X_len / 2 39 | 40 | # Limit 41 | train_input = ds.train_input(valid_proportion=0.5, limit=2) 42 | 43 | assert len(train_input.X_train) == 1 44 | assert len(train_input.y_train) == 1 45 | assert len(train_input.X_valid) == 1 46 | assert len(train_input.y_valid) == 1 47 | 48 | 49 | def test_base_dataset_predict_input(): 50 | # Need to build first 51 | with pytest.raises(ValueError): 52 | MockDataset().train_input() 53 | 54 | ds = MockDataset.load() 55 | 56 | # No limit 57 | predict_input = ds.predict_input() 58 | 59 | X_len = len(MockDataset.X_TEST) 60 | 61 | assert len(predict_input.X) == X_len 62 | assert set(predict_input.labels) == set(MockDataset.Y_TEST) 63 | 64 | # Limit applied 65 | predict_input = ds.predict_input(limit=1) 66 | 67 | assert len(predict_input.X) == 1 68 | 69 | # Make sure we only have the labels from the limited subset 70 | assert set(predict_input.labels) < set(MockDataset.Y_TEST) 71 | -------------------------------------------------------------------------------- /gobbli/test/dataset/test_cmu_movie_summary.py: -------------------------------------------------------------------------------- 1 | from gobbli.dataset.cmu_movie_summary import MovieSummaryDataset 2 | 3 | 4 | def test_load_cmu_movie_summary(tmp_gobbli_dir): 5 | ds = MovieSummaryDataset.load() 6 | 7 | X_train = ds.X_train() 8 | X_test = ds.X_test() 9 | 10 | y_train = ds.y_train() 11 | y_test = ds.y_test() 12 | 13 | assert len(X_train) == 33763 14 | assert len(y_train) == 33763 15 | assert len(X_test) == 8441 16 | assert len(y_test) == 8441 17 | 18 | # Ensure these objects pass validation 19 | train_input = ds.train_input() 20 | ds.predict_input() 21 | 22 | assert len(train_input.labels()) == 357 23 | -------------------------------------------------------------------------------- /gobbli/test/dataset/test_imdb.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from gobbli.dataset.imdb import IMDBDataset 4 | 5 | 6 | def test_load_imdb(tmp_gobbli_dir): 7 | ds = IMDBDataset.load() 8 | 9 | X_train = ds.X_train() 10 | X_test = ds.X_test() 11 | 12 | y_train = ds.y_train() 13 | y_test = ds.y_test() 14 | 15 | assert len(X_train) == 25000 16 | assert len(y_train) == 25000 17 | assert len(X_test) == 25000 18 | assert len(y_test) == 25000 19 | 20 | assert len(pd.unique(y_train)) == 2 21 | assert len(pd.unique(y_test)) == 2 22 | 23 | # Ensure these objects pass validation 24 | ds.train_input() 25 | ds.predict_input() 26 | -------------------------------------------------------------------------------- /gobbli/test/dataset/test_newsgroups.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from gobbli.dataset.newsgroups import NewsgroupsDataset 4 | 5 | 6 | def test_load_newsgroups(tmp_gobbli_dir): 7 | ds = NewsgroupsDataset.load() 8 | 9 | X_train = ds.X_train() 10 | X_test = ds.X_test() 11 | 12 | y_train = ds.y_train() 13 | y_test = ds.y_test() 14 | 15 | assert len(X_train) == 11314 16 | assert len(y_train) == 11314 17 | assert len(X_test) == 7532 18 | assert len(y_test) == 7532 19 | 20 | assert len(pd.unique(y_train)) == 20 21 | assert len(pd.unique(y_test)) == 20 22 | 23 | # Ensure these objects pass validation 24 | ds.train_input() 25 | ds.predict_input() 26 | 
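The dataset tests above pin down the contract of train_input(): an optional limit is applied to the rows first, and what remains is divided between training and validation sets according to valid_proportion. A minimal sketch of split logic satisfying those assertions (illustrative only; split_train_valid is a hypothetical helper, not gobbli's API, and the real implementation may also shuffle or stratify):

from typing import List, Optional, Tuple

def split_train_valid(
    X: List[str],
    y: List[str],
    valid_proportion: float,
    limit: Optional[int] = None,
) -> Tuple[List[str], List[str], List[str], List[str]]:
    # Apply the row limit first, as the limit=2 assertions above imply.
    if limit is not None:
        X, y = X[:limit], y[:limit]
    # Hold out the trailing valid_proportion of rows for validation.
    n_train = int(len(X) * (1 - valid_proportion))
    return X[:n_train], y[:n_train], X[n_train:], y[n_train:]

# With limit=2 and valid_proportion=0.5, one row lands in each split,
# matching test_base_dataset_train_input above.
X_tr, y_tr, X_va, y_va = split_train_valid(
    ["a", "b", "c", "d"], ["1", "0", "1", "0"], valid_proportion=0.5, limit=2
)
assert len(X_tr) == len(X_va) == 1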
-------------------------------------------------------------------------------- /gobbli/test/experiment/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/test/experiment/__init__.py -------------------------------------------------------------------------------- /gobbli/test/interactive/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/test/interactive/__init__.py -------------------------------------------------------------------------------- /gobbli/test/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/test/model/__init__.py -------------------------------------------------------------------------------- /gobbli/test/model/test_bert.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gobbli.model.bert import BERT 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "params,exception", 8 | [ 9 | # Unknown param 10 | ({"unknown": None}, ValueError), 11 | # Bad type (max_seq_length) 12 | ({"max_seq_length": "100"}, TypeError), 13 | # Bad value (bert_model) 14 | ({"bert_model": "ernie"}, ValueError), 15 | # OK type (max_seq_length) 16 | ({"max_seq_length": 100}, None), 17 | # OK value (bert_model) 18 | ({"bert_model": "bert-base-uncased"}, None), 19 | # OK values (both params) 20 | ({"max_seq_length": 100, "bert_model": "bert-base-uncased"}, None), 21 | ], 22 | ) 23 | def test_init(params, exception): 24 | if exception is None: 25 | BERT(**params) 26 | else: 27 | with pytest.raises(exception): 28 | BERT(**params) 29 | -------------------------------------------------------------------------------- /gobbli/test/model/test_fasttext.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gobbli.model.fasttext import FastText 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "params,exception", 8 | [ 9 | # Unknown param 10 | ({"unknown": None}, ValueError), 11 | # Bad type (word_ngrams) 12 | ({"word_ngrams": 1.0}, TypeError), 13 | # Bad type (lr) 14 | ({"lr": 1}, TypeError), 15 | # Bad type (dim) 16 | ({"dim": 100.0}, TypeError), 17 | # Bad type (ws) 18 | ({"ws": 3.0}, TypeError), 19 | # Bad value (fasttext_model) 20 | ({"fasttext_model": "bert"}, ValueError), 21 | # OK value (fasttext_model) 22 | ({"fasttext_model": "crawl-300d"}, None), 23 | # Dim mismatch (pretrained vectors vs user-passed dim) 24 | ({"fasttext_model": "crawl-300d", "dim": 100}, ValueError), 25 | # OK values (all) 26 | ( 27 | { 28 | "word_ngrams": 2, 29 | "lr": 0.01, 30 | "dim": 300, 31 | "ws": 3, 32 | "fasttext_model": "crawl-300d", 33 | }, 34 | None, 35 | ), 36 | ], 37 | ) 38 | def test_init(params, exception): 39 | if exception is None: 40 | FastText(**params) 41 | else: 42 | with pytest.raises(exception): 43 | FastText(**params) 44 | -------------------------------------------------------------------------------- /gobbli/test/model/test_mtdnn.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gobbli.model.mtdnn import MTDNN 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "params,exception", 8 | [ 9 | # Unknown 
param 10 | ({"unknown": None}, ValueError), 11 | # Bad type (max_seq_length) 12 | ({"max_seq_length": "100"}, TypeError), 13 | # Bad value (mtdnn_model) 14 | ({"mtdnn_model": "bert"}, ValueError), 15 | # OK type (max_seq_length) 16 | ({"max_seq_length": 100}, None), 17 | # OK value (mtdnn_model) 18 | ({"mtdnn_model": "mt-dnn-base"}, None), 19 | # OK values (both params) 20 | ({"max_seq_length": 100, "mtdnn_model": "mt-dnn-base"}, None), 21 | ], 22 | ) 23 | def test_init(params, exception): 24 | if exception is None: 25 | MTDNN(**params) 26 | else: 27 | with pytest.raises(exception): 28 | MTDNN(**params) 29 | -------------------------------------------------------------------------------- /gobbli/test/model/test_spacy.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gobbli.model.spacy import SpaCyModel 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "params,exception", 8 | [ 9 | # Unknown param 10 | ({"unknown": None}, ValueError), 11 | # Bad type (dropout) 12 | ({"dropout": "100"}, TypeError), 13 | # OK type (dropout) 14 | ({"dropout": 0.3}, None), 15 | # Bad type (full_pipeline) 16 | ({"full_pipeline": 1}, TypeError), 17 | # OK type (full_pipeline) 18 | ({"full_pipeline": True}, None), 19 | # OK types (all params) 20 | ({"full_pipeline": True, "dropout": 0.3}, None), 21 | ], 22 | ) 23 | def test_init(params, exception): 24 | if exception is None: 25 | SpaCyModel(**params) 26 | else: 27 | with pytest.raises(exception): 28 | SpaCyModel(**params) 29 | -------------------------------------------------------------------------------- /gobbli/test/model/test_transformer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gobbli.model.transformer import Transformer 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "params,exception", 8 | [ 9 | # Unknown param 10 | ({"unknown": None}, ValueError), 11 | # Bad type (max_seq_length) 12 | ({"max_seq_length": "100"}, TypeError), 13 | # OK type (max_seq_length) 14 | ({"max_seq_length": 100}, None), 15 | # Bad type (config_overrides) 16 | ({"config_overrides": 1}, TypeError), 17 | # OK type (config_overrides) 18 | ({"config_overrides": {}}, None), 19 | # Bad type (lr) 20 | ({"lr": 1}, TypeError), 21 | # OK type (lr) 22 | ({"lr": 1e-3}, None), 23 | # Bad type (adam_eps) 24 | ({"adam_eps": 1}, TypeError), 25 | # OK type (adam_eps) 26 | ({"adam_eps": 1e-5}, None), 27 | # Bad type (gradient_accumulation_steps) 28 | ({"gradient_accumulation_steps": 1.0}, TypeError), 29 | # OK type (gradient_accumulation_steps) 30 | ({"gradient_accumulation_steps": 2}, None), 31 | # OK values (all params), 32 | ( 33 | { 34 | "max_seq_length": 100, 35 | "config_overrides": {}, 36 | "lr": 1e-3, 37 | "adam_eps": 1e-5, 38 | "gradient_accumulation_steps": 2, 39 | }, 40 | None, 41 | ), 42 | ], 43 | ) 44 | def test_init(params, exception): 45 | if exception is None: 46 | Transformer(**params) 47 | else: 48 | with pytest.raises(exception): 49 | Transformer(**params) 50 | -------------------------------------------------------------------------------- /gobbli/test/model/test_use.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gobbli.model.use import USE 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "params,exception", 8 | [ 9 | # Unknown param 10 | ({"unknown": None}, ValueError), 11 | # Bad value (use_model) 12 | ({"use_model": "bert"}, ValueError), 13 | # OK value (use_model) 14 | ({"use_model": 
"universal-sentence-encoder"}, None), 15 | ], 16 | ) 17 | def test_init(params, exception): 18 | if exception is None: 19 | USE(**params) 20 | else: 21 | with pytest.raises(exception): 22 | USE(**params) 23 | -------------------------------------------------------------------------------- /meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "gobbli", 3 | "url": "https://github.com/RTIInternational/gobbli/", 4 | "download_url": "", 5 | "author": "RTI International", 6 | "maintainer": "Jason Nance", 7 | "version": "0.2.4", 8 | "description": "Uniform interface to deep learning approaches via Docker containers." 9 | } -------------------------------------------------------------------------------- /paper/README.md: -------------------------------------------------------------------------------- 1 | # Journal of Open Source Software Paper 2 | 3 | This section of the repository contains materials for a paper submitted to [JOSS](https://joss.theoj.org). 4 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 88 3 | target-version = ['py37'] 4 | exclude = ''' 5 | ( 6 | /( 7 | \.eggs 8 | | \.git 9 | | \.hg 10 | | \.mypy_cache 11 | | \.tox 12 | | \.venv 13 | | _build 14 | | buck-out 15 | | build 16 | | dist 17 | )/ 18 | | gobbli/model/bert/src 19 | | gobbli/model/mtdnn/src 20 | ) 21 | ''' -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = gobbli/test 3 | norecursedirs = gobbli/model/bert/src gobbli/model/mtdnn/src build/ 4 | 5 | [pytest.ini] 6 | log_cli = 1 7 | log_cli_format = %(asctime)s [%(levelname)8s] %(message)s 8 | log_cli_date_format=%Y-%m-%d %H:%M:%S -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | flake8==3.7.7 2 | mypy==0.720 3 | pytest==4.5.0 4 | ray[debug]==0.8.5 5 | aiohttp==3.5.4 6 | importmagic==0.1.7 7 | epc==0.0.5 8 | isort==4.3.20 9 | isort[requirements]==4.3.20 10 | black==19.3b0 11 | setuptools==41.0.1 12 | wheel==0.33.6 13 | twine==1.13.0 14 | https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz 15 | -------------------------------------------------------------------------------- /run_ci.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run the various processes needed for CI. 4 | # Pass additional script arguments to py.test. 
5 | 6 | set -e 7 | 8 | isort -rc --check-only ./gobbli 9 | black --check ./gobbli 10 | mypy ./gobbli --ignore-missing-imports 11 | flake8 ./gobbli --config setup.cfg 12 | py.test -vs $@ ./gobbli 13 | -------------------------------------------------------------------------------- /run_dist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function usage() { 4 | echo "Usage: $0 [test|live]" 5 | } 6 | 7 | if [[ $# -ne 1 ]]; then 8 | usage 9 | exit 1 10 | fi 11 | 12 | mode="$1" 13 | 14 | if [[ "$mode" != "test" && "$mode" != "live" ]]; then 15 | usage 16 | exit 1 17 | fi 18 | 19 | rm -r ./dist/ 20 | 21 | python setup.py sdist bdist_wheel 22 | 23 | if [[ "$mode" == "test" ]]; then 24 | python -m twine upload --repository-url https://test.pypi.org/legacy/ dist/* 25 | elif [[ "$mode" == "live" ]]; then 26 | python -m twine upload dist/* 27 | fi 28 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # https://github.com/python/black/issues/429 3 | ignore=E101,E111,E114,E115,E116,E117,E121,E122,E123,E124,E125,E126,E127,E128,E129,E131,E133,E2,E3,E5,E701,E702,E703,E704,W1,W2,W3,W503,W504 4 | exclude=gobbli/model/bert/src gobbli/model/mtdnn/src 5 | 6 | [isort] 7 | multi_line_output=3 8 | include_trailing_comma=True 9 | force_grid_wrap=0 10 | use_parentheses=True 11 | line_length=88 12 | skip=gobbli/model/bert/src,gobbli/model/mtdnn/src,.test_cache 13 | -------------------------------------------------------------------------------- /test_remote_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run tests on a GPU machine over SSH. Assumes the remote user is a member of the 4 | # Docker group (i.e. no sudo required for Docker commands), and that 5 | # Docker/docker-compose/nvidia-docker are already installed on the remote server. 6 | 7 | if [[ $# -ne 3 ]]; then 8 | echo "Usage: $0 <ssh_string> <remote_repo_dir> <visible_devices>" 9 | echo 10 | echo " ssh_string: SSH connection string for the remote server." 11 | echo 12 | echo " remote_repo_dir: Path to use as the repository root on the " 13 | echo " remote server. Files will be copied here." 14 | echo 15 | echo " visible_devices: Value to use for the NVIDIA_VISIBLE_DEVICES environment " 16 | echo " variable controlling which GPUs are made available to the container " 17 | echo " for testing." 18 | exit 1 19 | fi 20 | 21 | ssh_string="$1" 22 | remote_repo_dir="$2" 23 | visible_gpus="$3" 24 | 25 | if ssh "$ssh_string" "[[ -e $remote_repo_dir ]]"; then 26 | echo "Directory '$remote_repo_dir' already exists on the remote server;" 27 | echo "can't run tests pointing at an existing directory." 28 | exit 1 29 | fi 30 | 31 | rsync -raz \ 32 | --exclude .git \ 33 | --filter=':- .gitignore' \ 34 | ./ "$ssh_string:$remote_repo_dir" 35 | 36 | ssh "$ssh_string" "cd $remote_repo_dir/ci-gpu \ 37 | && export NVIDIA_VISIBLE_DEVICES=$visible_gpus \ 38 | && export PYTHON_VERSION=3.7 \ 39 | && docker-compose build gobbli-ci-gpu \ 40 | && docker-compose run --rm gobbli-ci-gpu" 41 | --------------------------------------------------------------------------------
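The model and augmenter tests above all enforce the same parametrized contract on __init__: an unknown parameter name raises ValueError, a parameter of the wrong type raises TypeError, and an unrecognized or out-of-range value raises ValueError. A minimal sketch of a validator satisfying the first two rules (hypothetical and illustrative only; validate_params is not gobbli's actual validation code):

from typing import Any, Dict, Type

def validate_params(params: Dict[str, Any], schema: Dict[str, Type]) -> None:
    for name, value in params.items():
        # Unknown parameter names are rejected with ValueError...
        if name not in schema:
            raise ValueError(f"unknown param: {name}")
        # ...while known names bound to a value of the wrong type get TypeError.
        if not isinstance(value, schema[name]):
            raise TypeError(
                f"param {name} must be {schema[name].__name__}, got {type(value).__name__}"
            )

# Mirrors the BERT test cases above: 100 passes, while "100" would raise TypeError.
validate_params({"max_seq_length": 100}, {"max_seq_length": int})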