├── .dockerignore
├── .gitattributes
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug-report.md
│   │   └── feature-request.md
│   └── pull_request_template.md
├── .gitignore
├── .readthedocs.yml
├── .travis.yml
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── README.md
├── benchmark
│   ├── BENCHMARK_SPECS.yml
│   ├── BENCHMARK_SPECS_DEBUG.yml
│   ├── README.md
│   ├── benchmark_output
│   │   ├── class_imbalance
│   │   │   ├── ALBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── BERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── DistilBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── FastText
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── GPT
│   │   │   │   ├── ERROR
│   │   │   │   ├── output.md
│   │   │   │   └── run-meta.json
│   │   │   ├── GPT2
│   │   │   │   ├── ERROR
│   │   │   │   ├── output.md
│   │   │   │   └── run-meta.json
│   │   │   ├── MTDNN
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── SKLearn
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM-RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLNet
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── class_imbalance.md
│   │   │   ├── spaCy
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   └── spacy-transformers
│   │   │       ├── output.md
│   │   │       ├── plot.png
│   │   │       └── run-meta.json
│   │   ├── data_augmentation
│   │   │   ├── BERTMaskedLM
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── MarianMT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── Word2Vec
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── WordNet
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   └── data_augmentation.md
│   │   ├── document_windowing
│   │   │   ├── BERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   └── document_windowing.md
│   │   ├── imdb
│   │   │   ├── ALBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── BERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── DistilBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── FastText
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── MTDNN
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── SKLearn
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM-RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLNet
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── imdb.md
│   │   │   ├── spaCy
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   └── spacy-transformers
│   │   │       ├── output.md
│   │   │       ├── plot.png
│   │   │       └── run-meta.json
│   │   ├── imdb_embed
│   │   │   ├── ALBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── BERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── DistilBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── ELECTRA
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── GPT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── GPT2
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── T5Model
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── TransformerXL
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── USE
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM-RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLNet
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── imdb_embed.md
│   │   │   ├── spaCy
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   └── spacy-transformers
│   │   │       ├── output.md
│   │   │       ├── plot.png
│   │   │       └── run-meta.json
│   │   ├── low_resource
│   │   │   ├── ALBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── BERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── DistilBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── FastText
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── MTDNN
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── SKLearn
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM-RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLNet
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── low_resource.md
│   │   │   └── spaCy
│   │   │       ├── output.md
│   │   │       ├── plot.png
│   │   │       └── run-meta.json
│   │   ├── moviesummary
│   │   │   ├── ALBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── DistilBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── FastText
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── SKLearn
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM-RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLNet
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── moviesummary.md
│   │   │   ├── spaCy
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   └── spacy-transformers
│   │   │       ├── output.md
│   │   │       ├── plot.png
│   │   │       └── run-meta.json
│   │   ├── newsgroups
│   │   │   ├── ALBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── BERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── DistilBERT
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── FastText
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── MTDNN
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── SKLearn
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM-RoBERTa
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLM
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── XLNet
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   ├── newsgroups.md
│   │   │   ├── spaCy
│   │   │   │   ├── output.md
│   │   │   │   ├── plot.png
│   │   │   │   └── run-meta.json
│   │   │   └── spacy-transformers
│   │   │       ├── output.md
│   │   │       ├── plot.png
│   │   │       └── run-meta.json
│   │   └── newsgroups_embed
│   │       ├── ALBERT
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── BERT
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── DistilBERT
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── ELECTRA
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── GPT
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── GPT2
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── RoBERTa
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── T5Model
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── TransformerXL
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── USE
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── XLM-RoBERTa
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── XLM
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── XLNet
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── newsgroups_embed.md
│   │       ├── sklearn_TF-IDF
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       ├── spaCy
│   │       │   ├── output.md
│   │       │   ├── plot.png
│   │       │   └── run-meta.json
│   │       └── spacy-transformers
│   │           ├── output.md
│   │           ├── plot.png
│   │           └── run-meta.json
│   ├── benchmark_util.py
│   ├── docker-compose.yml
│   ├── docker
│   │   └── Dockerfile
│   ├── requirements.txt
│   ├── run_benchmarks.py
│   ├── run_benchmarks.sh
│   └── scenario.py
├── ci-gpu
│   └── docker-compose.yml
├── ci
│   └── docker-compose.yml
├── conftest.py
├── docker-compose.yml
├── docs
│   ├── Makefile
│   ├── _static
│   │   ├── .gitkeep
│   │   ├── gobbli_app.svg
│   │   ├── gobbli_favicon.ico
│   │   └── gobbli_lg.svg
│   ├── advanced_usage.rst
│   ├── api.rst
│   ├── conf.py
│   ├── img
│   │   └── interactive_apps
│   │       ├── evaluate
│   │       │   └── evaluate.png
│   │       ├── explain
│   │       │   ├── explain.png
│   │       │   └── explain_output.png
│   │       └── explore
│   │           ├── explore.png
│   │           ├── explore_embeddings.png
│   │           ├── explore_topic_model.png
│   │           └── explore_trained_embeddings.png
│   ├── index.rst
│   ├── interactive_apps.rst
│   ├── make.bat
│   ├── prerequisites.rst
│   ├── quickstart.rst
│   ├── requirements.txt
│   └── troubleshooting.rst
├── generate_docs.sh
├── gobbli
│   ├── __init__.py
│   ├── augment
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── bert
│   │   │   ├── Dockerfile
│   │   │   ├── __init__.py
│   │   │   ├── model.py
│   │   │   └── src
│   │   │       └── augment_text.py
│   │   ├── marian
│   │   │   ├── Dockerfile
│   │   │   ├── __init__.py
│   │   │   ├── model.py
│   │   │   └── src
│   │   │       └── backtranslate_text.py
│   │   ├── word2vec.py
│   │   └── wordnet.py
│   ├── cli.py
│   ├── dataset
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── cmu_movie_summary.py
│   │   ├── imdb.py
│   │   ├── nested_file.py
│   │   ├── newsgroups.py
│   │   └── trivial.py
│   ├── docker.py
│   ├── experiment
│   │   ├── __init__.py
│   │   ├── base.py
│   │   └── classification.py
│   ├── inspect
│   │   ├── __init__.py
│   │   └── evaluate.py
│   ├── interactive
│   │   ├── evaluate.py
│   │   ├── explain.py
│   │   ├── explore.py
│   │   └── util.py
│   ├── io.py
│   ├── model
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── bert
│   │   │   ├── Dockerfile
│   │   │   ├── __init__.py
│   │   │   ├── model.py
│   │   │   └── src
│   │   │       ├── .gitignore
│   │   │       ├── CONTRIBUTING.md
│   │   │       ├── LICENSE
│   │   │       ├── README.md
│   │   │       ├── __init__.py
│   │   │       ├── create_pretraining_data.py
│   │   │       ├── extract_features.py
│   │   │       ├── modeling.py
│   │   │       ├── modeling_test.py
│   │   │       ├── multilingual.md
│   │   │       ├── optimization.py
│   │   │       ├── optimization_test.py
│   │   │       ├── predicting_movie_reviews_with_bert_on_tf_hub.ipynb
│   │   │       ├── requirements.txt
│   │   │       ├── run_classifier.py
│   │   │       ├── run_classifier_with_tfhub.py
│   │   │       ├── run_pretraining.py
│   │   │       ├── run_squad.py
│   │   │       ├── sample_text.txt
│   │   │       ├── tokenization.py
│   │   │       └── tokenization_test.py
│   │   ├── context.py
│   │   ├── fasttext
│   │   │   ├── Dockerfile
│   │   │   ├── __init__.py
│   │   │   └── model.py
│   │   ├── majority.py
│   │   ├── mixin.py
│   │   ├── mtdnn
│   │   │   ├── Dockerfile
│   │   │   ├── __init__.py
│   │   │   ├── model.py
│   │   │   └── src
│   │   │       ├── .gitignore
│   │   │       ├── LICENSE
│   │   │       ├── README.md
│   │   │       ├── config
│   │   │       │   └── tasks_config.json
│   │   │       ├── data_utils
│   │   │       │   ├── __init__.py
│   │   │       │   ├── glue_utils.py
│   │   │       │   ├── label_map.py
│   │   │       │   ├── log_wrapper.py
│   │   │       │   ├── metrics.py
│   │   │       │   ├── utils.py
│   │   │       │   └── vocab.py
│   │   │       ├── docker
│   │   │       │   └── Dockerfile
│   │   │       ├── download.sh
│   │   │       ├── gobbli_train.py
│   │   │       ├── module
│   │   │       │   ├── __init__.py
│   │   │       │   ├── bert_optim.py
│   │   │       │   ├── common.py
│   │   │       │   ├── dropout_wrapper.py
│   │   │       │   ├── my_optim.py
│   │   │       │   ├── san.py
│   │   │       │   ├── similarity.py
│   │   │       │   └── sub_layers.py
│   │   │       ├── mt_dnn
│   │   │       │   ├── __init__.py
│   │   │       │   ├── batcher.py
│   │   │       │   ├── gobbli_batcher.py
│   │   │       │   ├── gobbli_model.py
│   │   │       │   ├── matcher.py
│   │   │       │   └── model.py
│   │   │       ├── prepro.py
│   │   │       ├── requirements.txt
│   │   │       ├── run_toy.sh
│   │   │       ├── scripts
│   │   │       │   ├── domain_adaptation_run.sh
│   │   │       │   ├── run_mt_dnn.sh
│   │   │       │   ├── run_rte.sh
│   │   │       │   ├── run_stsb.sh
│   │   │       │   ├── scitail_domain_adaptation_bash.sh
│   │   │       │   ├── snli_domain_adaptation_bash.sh
│   │   │       │   └── strip_model.py
│   │   │       └── train.py
│   │   ├── random.py
│   │   ├── sklearn
│   │   │   ├── __init__.py
│   │   │   └── model.py
│   │   ├── spacy
│   │   │   ├── Dockerfile
│   │   │   ├── __init__.py
│   │   │   ├── model.py
│   │   │   └── src
│   │   │       ├── requirements.txt
│   │   │       └── run_spacy.py
│   │   ├── transformer
│   │   │   ├── Dockerfile
│   │   │   ├── __init__.py
│   │   │   ├── model.py
│   │   │   └── src
│   │   │       ├── requirements.txt
│   │   │       └── run_model.py
│   │   └── use
│   │       ├── Dockerfile
│   │       ├── __init__.py
│   │       ├── model.py
│   │       └── src
│   │           ├── requirements.txt
│   │           └── use.py
│   ├── test
│   │   ├── __init__.py
│   │   ├── augment
│   │   │   ├── __init__.py
│   │   │   ├── test_bertmaskedlm.py
│   │   │   ├── test_marian.py
│   │   │   ├── test_word2vec.py
│   │   │   └── test_wordnet.py
│   │   ├── classification
│   │   │   ├── __init__.py
│   │   │   ├── test_classifiers.py
│   │   │   └── test_embeddings.py
│   │   ├── dataset
│   │   │   ├── __init__.py
│   │   │   ├── test_base_dataset.py
│   │   │   ├── test_cmu_movie_summary.py
│   │   │   ├── test_imdb.py
│   │   │   └── test_newsgroups.py
│   │   ├── experiment
│   │   │   ├── __init__.py
│   │   │   ├── test_base_experiment.py
│   │   │   └── test_classification_experiment.py
│   │   ├── inspect
│   │   │   └── test_evaluate.py
│   │   ├── interactive
│   │   │   ├── __init__.py
│   │   │   └── test_util.py
│   │   ├── model
│   │   │   ├── __init__.py
│   │   │   ├── test_base_model.py
│   │   │   ├── test_bert.py
│   │   │   ├── test_fasttext.py
│   │   │   ├── test_mtdnn.py
│   │   │   ├── test_sklearn.py
│   │   │   ├── test_spacy.py
│   │   │   ├── test_transformer.py
│   │   │   └── test_use.py
│   │   ├── test_io.py
│   │   ├── test_util.py
│   │   └── util.py
│   └── util.py
├── img
│   ├── gobbli_app.svg
│   └── gobbli_lg.svg
├── meta.json
├── paper
│   ├── README.md
│   ├── paper.bib
│   └── paper.md
├── pyproject.toml
├── pytest.ini
├── requirements.txt
├── run_ci.sh
├── run_dist.sh
├── setup.cfg
├── setup.py
└── test_remote_gpu.sh

--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
**/.ipynb_checkpoints
**/__pycache__
*.py[cod]
**/build
**/dist

**/.tox
**/.eggs
**/gobbli.egg-info
**/.hypothesis
**/.mypy_cache
**/__pycache__
*.py[cod]
**/.test_cache

benchmark/benchmark_data/
benchmark/benchmark_meta/
benchmark/benchmark_gobbli/

scratch/
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
*.ipynb linguist-documentation
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug-report.md:
--------------------------------------------------------------------------------
---
name: "Bug Report"
about: Submit a bug report
---

## System Information
- OS platform and distribution (e.g. Linux Ubuntu 16.04):
- gobbli version:
- Python version:
- Other information relevant to the problem (GPU model, Docker version, etc):

## Description


## Code for Minimal Reproducible Example


## Output


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.md:
--------------------------------------------------------------------------------
---
name: "Feature Request"
about: "Submit a request for a new feature."
---

## Feature


## Motivation


## Additional Details


--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
## Description of Changes


## Related Issue(s), if any


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.ipynb_checkpoints/
__pycache__/
*.py[cod]
build/
dist/
docs/_build/
docs/auto/

.tox/
.eggs/
gobbli.egg-info/
.hypothesis/
.coverage
.mypy_cache/
.pytest_cache/
pip-wheel-metadata/

.test_cache/

benchmark/benchmark_data/
benchmark/benchmark_meta/
benchmark/benchmark_gobbli/
benchmark/benchmark_output_debug/

scratch/
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
version: 2

python:
  version: 3.7
  install:
    - method: pip
      path: .
    - requirements: docs/requirements.txt

sphinx:
  configuration: docs/conf.py
  fail_on_warning: true
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: minimal

env:
  jobs:
    - PYTHON_VERSION=3.7
    - PYTHON_VERSION=3.8

services:
  - docker

install:
  - cd ci
  - docker-compose build gobbli-ci

script:
  - docker-compose run --rm gobbli-ci
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# Contributing

Thanks for your contribution! We're starting out with a few simple guidelines:

## Contributor License Agreement

You must [sign a Contributor License Agreement](https://www.clahub.com/agreements/RTIInternational/gobbli) (CLA) to contribute to this project.

## Code Style

We use a few linting tools to enforce consistency of style and formatting, which are run in CI. Make sure your code passes the pre-test checks in `run_ci.sh` before you push it. Additionally:

- Pretty much any code added under the main gobbli codebase should have type hints. Code run as part of model Docker containers is exempt from this guideline but should still be formatted.
- Add docstrings where appropriate (especially public interface functions), and use Sphinx references to link to other parts of the project where needed (see the sketch below).
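For illustration, here is a minimal sketch of the expected style; the function is hypothetical and not part of gobbli:

```python
from pathlib import Path
from typing import List


def read_labels(label_file: Path) -> List[str]:
    """
    Read a newline-delimited list of class labels from disk.

    :param label_file: Path to the label file.

    :returns: The labels, in file order.
    """
    return label_file.read_text().splitlines()
```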
## Tests

A lot of the functionality in gobbli is difficult to test (large models, long runtimes, complex functions). We don't test every edge case, but try to make sure there's at least a black box test verifying end-to-end success of any new code you add. White box testing is appreciated where feasible.
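As a sketch of the kind of black box test meant here -- reusing the hypothetical `read_labels` helper from above, not real gobbli code (gobbli's actual tests live under `gobbli/test/`):

```python
def test_read_labels_end_to_end(tmp_path):
    # Black box: exercise only the public interface -- write a file,
    # call the function, and check the end-to-end result.
    label_file = tmp_path / "labels.txt"
    label_file.write_text("positive\nnegative\n")

    assert read_labels(label_file) == ["positive", "negative"]
```

(`tmp_path` is the built-in pytest fixture providing a per-test temporary directory.)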
## Code Reviews

All submissions must come in the form of PRs against the master branch and will be reviewed. If possible, ensure your patch only implements/changes one thing.
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
ARG PYTHON_VERSION=3.7
FROM python:${PYTHON_VERSION}

COPY ./setup.py ./meta.json ./requirements.txt ./README.md /code/
COPY ./docs/requirements.txt /code/docs/requirements.txt

WORKDIR /code
RUN pip install --upgrade pip \
    && pip install -e '.[augment,tokenize,interactive]' \
    && pip install -r requirements.txt \
    && pip install -r docs/requirements.txt

COPY ./ /code
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include LICENSE
include README.md
include meta.json
# We have some .py files that are needed but aren't part of the package, so make sure they're included
recursive-include gobbli *.py
recursive-include gobbli Dockerfile
recursive-include gobbli requirements.txt
--------------------------------------------------------------------------------
/benchmark/README.md:
--------------------------------------------------------------------------------
# gobbli Benchmarks

This directory contains benchmarking code and output for various aspects of gobbli model performance.

To run the benchmarks (note -- this may take several days depending on available computing resources):

    ./run_benchmarks.sh

To run with GPU support enabled:

    export GOBBLI_USE_GPU=1
    ./run_benchmarks.sh

Use `--help` to see additional arguments in case you want to debug individual benchmarks, force re-running, etc.:

    ./run_benchmarks.sh --help
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/ALBERT/output.md:
--------------------------------------------------------------------------------
# Results: ALBERT
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 1 | 0.05 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 2 | 0.1 | 0.737548 | 0.807997 | 0.74992 | 0.74992 | 0.680564 | 0.794531 |
| 3 | 0.25 | 0.820206 | 0.837869 | 0.82228 | 0.82228 | 0.800896 | 0.839516 |
| 4 | 0.33 | 0.847492 | 0.851868 | 0.84792 | 0.84792 | 0.839415 | 0.855569 |
| 5 | 0.5 | 0.856016 | 0.856278 | 0.85604 | 0.85604 | 0.854156 | 0.857876 |
![Results](ALBERT/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/ALBERT/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/ALBERT/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/ALBERT/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "ALBERT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["albert-base-v1", "albert-base-v2"], "transformer_model": ["Albert"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/BERT/output.md:
--------------------------------------------------------------------------------
# Results: BERT
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.460844 | 0.759732 | 0.56296 | 0.56296 | 0.226204 | 0.695485 |
| 1 | 0.05 | 0.684874 | 0.802653 | 0.7092 | 0.7092 | 0.597319 | 0.772428 |
| 2 | 0.1 | 0.761841 | 0.829851 | 0.77224 | 0.77224 | 0.712075 | 0.811607 |
| 3 | 0.25 | 0.852626 | 0.865441 | 0.8538 | 0.8538 | 0.839475 | 0.865778 |
| 4 | 0.33 | 0.868724 | 0.874132 | 0.86916 | 0.86916 | 0.861157 | 0.876291 |
| 5 | 0.5 | 0.88172 | 0.881725 | 0.88172 | 0.88172 | 0.881933 | 0.881507 |
![Results](BERT/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/BERT/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/BERT/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/BERT/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "BERT", "model_name": "BERT", "param_grid": {"bert_model": ["bert-base-uncased", "bert-base-cased", "scibert-uncased"], "max_seq_length": [128]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/DistilBERT/output.md:
--------------------------------------------------------------------------------
# Results: DistilBERT
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.442551 | 0.754912 | 0.55296 | 0.55296 | 0.194464 | 0.690638 |
| 1 | 0.05 | 0.666977 | 0.795581 | 0.69524 | 0.69524 | 0.569961 | 0.763993 |
| 2 | 0.1 | 0.766211 | 0.825528 | 0.77524 | 0.77524 | 0.720267 | 0.812155 |
| 3 | 0.25 | 0.83463 | 0.852239 | 0.83648 | 0.83648 | 0.817141 | 0.85212 |
| 4 | 0.33 | 0.851158 | 0.85668 | 0.85168 | 0.85168 | 0.842347 | 0.85997 |
| 5 | 0.5 | 0.865635 | 0.865692 | 0.86564 | 0.86564 | 0.864834 | 0.866436 |
![Results](DistilBERT/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/DistilBERT/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/DistilBERT/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/DistilBERT/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "DistilBERT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["distilbert-base-uncased", "distilbert-base-uncased-distilled-squad"], "transformer_model": ["DistilBert"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/FastText/output.md:
--------------------------------------------------------------------------------
# Results: FastText
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 1 | 0.05 | 0.586303 | 0.783703 | 0.63904 | 0.63904 | 0.438596 | 0.734009 |
| 2 | 0.1 | 0.723779 | 0.817785 | 0.74056 | 0.74056 | 0.655696 | 0.791862 |
| 3 | 0.25 | 0.853318 | 0.868819 | 0.85472 | 0.85472 | 0.838979 | 0.867658 |
| 4 | 0.33 | 0.873252 | 0.879321 | 0.87372 | 0.87372 | 0.865551 | 0.880953 |
| 5 | 0.5 | 0.89008 | 0.89008 | 0.89008 | 0.89008 | 0.890054 | 0.890106 |
![Results](FastText/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/FastText/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/FastText/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/FastText/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "FastText", "model_name": "FastText", "param_grid": {"word_ngrams": [1, 2], "dim": [100, 300], "lr": [0.5, 1.0]}, "preprocess_func": "fasttext_preprocess", "run_kwargs": {}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/GPT/ERROR:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/GPT/ERROR
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/GPT/output.md:
--------------------------------------------------------------------------------
# ERROR: Exception during run 'GPT'.

```
  File "/data/users/jnance/gobbli/benchmark/scenario.py", line 153, in run
    output = self._do_run(run, run_output_dir)

  File "/data/users/jnance/gobbli/benchmark/scenario.py", line 430, in _do_run
    run_kwargs=run.run_kwargs,

  File "/data/users/jnance/gobbli/benchmark/benchmark_util.py", line 214, in run_benchmark_experiment
    return exp.run(**run_kwargs)

  File "/code/gobbli/experiment/classification.py", line 659, in run
    for params in grid

  File "/usr/local/lib/python3.7/site-packages/ray/worker.py", line 2247, in get
    raise value

RayTaskError: ray_worker:gobbli.experiment.classification.train() (pid=6724, host=0dd5b0b5e4f4)
  File "/code/gobbli/experiment/classification.py", line 550, in train
    train_output = clf.train(train_input)
  File "/code/gobbli/model/mixin.py", line 104, in train
    _run_task(self._train, train_input, self.train_dir(), train_dir_name),
  File "/code/gobbli/model/mixin.py", line 42, in _run_task
    task_output = cast(gobbli.io.TaskIO, task_func(task_input, context))
  File "/code/gobbli/model/transformer/model.py", line 254, in _train
    self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
  File "/code/gobbli/docker.py", line 113, in run_container
    raise RuntimeError(err_msg)
RuntimeError: Error running container (return code 1). Last 20 lines of logs:
Using device: cuda
Number of GPUs: 1
Initializing transformer...
ray_worker:gobbli.experiment.classification.train() (pid=6724, host=0dd5b0b5e4f4)
  File "run_model.py", line 461, in <module>
Model: GPTForSequenceClassification
Weights: openai-gpt
Tokenizer: GPTTokenizer
Config: GPTConfig
    model_cls = getattr(transformers, model_name)
AttributeError: module 'transformers' has no attribute 'GPTForSequenceClassification'


```
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/GPT/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "GPT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["openai-gpt"], "transformer_model": ["GPT"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/GPT2/ERROR:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/GPT2/ERROR
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/GPT2/output.md:
--------------------------------------------------------------------------------
# ERROR: Exception during run 'GPT2'.

```
  File "/data/users/jnance/gobbli/benchmark/scenario.py", line 153, in run
    output = self._do_run(run, run_output_dir)

  File "/data/users/jnance/gobbli/benchmark/scenario.py", line 430, in _do_run
    run_kwargs=run.run_kwargs,

  File "/data/users/jnance/gobbli/benchmark/benchmark_util.py", line 214, in run_benchmark_experiment
    return exp.run(**run_kwargs)

  File "/code/gobbli/experiment/classification.py", line 659, in run
    for params in grid

  File "/usr/local/lib/python3.7/site-packages/ray/worker.py", line 2247, in get
    raise value

RayTaskError: ray_worker:gobbli.experiment.classification.train() (pid=6870, host=0dd5b0b5e4f4)
  File "/code/gobbli/experiment/classification.py", line 550, in train
    train_output = clf.train(train_input)
  File "/code/gobbli/model/mixin.py", line 104, in train
    _run_task(self._train, train_input, self.train_dir(), train_dir_name),
  File "/code/gobbli/model/mixin.py", line 42, in _run_task
    task_output = cast(gobbli.io.TaskIO, task_func(task_input, context))
  File "/code/gobbli/model/transformer/model.py", line 254, in _train
    self.docker_client, self.image_tag, cmd, self.logger, **run_kwargs
  File "/code/gobbli/docker.py", line 113, in run_container
    raise RuntimeError(err_msg)
RuntimeError: Error running container (return code 1). Last 20 lines of logs:
Using device: cuda
Number of GPUs: 1
Initializing transformer...
Model: GPT2ForSequenceClassification
Weights: gpt2
Tokenizer: GPT2Tokenizer
Config: GPT2Config
ray_worker:gobbli.experiment.classification.train() (pid=6870, host=0dd5b0b5e4f4)
  File "run_model.py", line 461, in <module>
    model_cls = getattr(transformers, model_name)
AttributeError: module 'transformers' has no attribute 'GPT2ForSequenceClassification'


```
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/GPT2/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "GPT2", "model_name": "Transformer", "param_grid": {"transformer_weights": ["gpt2"], "transformer_model": ["GPT2"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/MTDNN/output.md:
--------------------------------------------------------------------------------
# Results: MTDNN
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.514917 | 0.772073 | 0.5942 | 0.5942 | 0.318807 | 0.711026 |
| 1 | 0.05 | 0.729578 | 0.82128 | 0.74552 | 0.74552 | 0.66392 | 0.795237 |
| 2 | 0.1 | 0.806281 | 0.849677 | 0.81156 | 0.81156 | 0.774302 | 0.83826 |
| 3 | 0.25 | 0.853252 | 0.866297 | 0.85444 | 0.85444 | 0.840051 | 0.866454 |
| 4 | 0.33 | 0.868913 | 0.873457 | 0.86928 | 0.86928 | 0.861982 | 0.875845 |
| 5 | 0.5 | 0.883435 | 0.883508 | 0.88344 | 0.88344 | 0.882661 | 0.884209 |
![Results](MTDNN/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/MTDNN/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/MTDNN/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/MTDNN/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "MTDNN", "model_name": "MTDNN", "param_grid": {"mtdnn_model": ["mt-dnn-base"], "max_seq_length": [128]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/RoBERTa/output.md:
--------------------------------------------------------------------------------
# Results: RoBERTa
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 1 | 0.05 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 2 | 0.1 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 3 | 0.25 | 0.865232 | 0.876027 | 0.86612 | 0.86612 | 0.854295 | 0.87617 |
| 4 | 0.33 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 5 | 0.5 | 0.8874 | 0.8874 | 0.8874 | 0.8874 | 0.887341 | 0.887459 |
![Results](RoBERTa/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/RoBERTa/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/RoBERTa/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/RoBERTa/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "RoBERTa", "model_name": "Transformer", "param_grid": {"transformer_weights": ["roberta-base"], "transformer_model": ["Roberta"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/SKLearn/output.md:
--------------------------------------------------------------------------------
# Results: SKLearn
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 1 | 0.05 | 0.35963 | 0.753057 | 0.51208 | 0.51208 | 0.0471801 | 0.672079 |
| 2 | 0.1 | 0.493072 | 0.77027 | 0.58156 | 0.58156 | 0.281278 | 0.704867 |
| 3 | 0.25 | 0.766716 | 0.832396 | 0.77652 | 0.77652 | 0.718893 | 0.814539 |
| 4 | 0.33 | 0.830675 | 0.855935 | 0.83336 | 0.83336 | 0.809354 | 0.851997 |
| 5 | 0.5 | 0.88048 | 0.880483 | 0.88048 | 0.88048 | 0.880317 | 0.880642 |
![Results](SKLearn/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/SKLearn/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/SKLearn/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/SKLearn/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "SKLearn", "model_name": "SKLearnClassifier", "param_grid": {}, "preprocess_func": "fasttext_preprocess", "run_kwargs": {}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/XLM-RoBERTa/output.md:
--------------------------------------------------------------------------------
# Results: XLM-RoBERTa
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 1 | 0.05 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 2 | 0.1 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 3 | 0.25 | 0.835274 | 0.854145 | 0.83724 | 0.83724 | 0.81728 | 0.853269 |
| 4 | 0.33 | 0.849965 | 0.85965 | 0.85088 | 0.85088 | 0.838251 | 0.86168 |
| 5 | 0.5 | 0.869491 | 0.870338 | 0.86956 | 0.86956 | 0.8665 | 0.872483 |
![Results](XLM-RoBERTa/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/XLM-RoBERTa/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/XLM-RoBERTa/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/XLM-RoBERTa/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "XLM-RoBERTa", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlm-roberta-base"], "transformer_model": ["XLMRoberta"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/XLM/output.md:
--------------------------------------------------------------------------------
# Results: XLM
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 1 | 0.05 | 0.502425 | 0.744373 | 0.58404 | 0.58404 | 0.300908 | 0.703943 |
| 2 | 0.1 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 3 | 0.25 | 0.583932 | 0.674197 | 0.61768 | 0.61768 | 0.465436 | 0.702428 |
| 4 | 0.33 | 0.81156 | 0.815876 | 0.81212 | 0.81212 | 0.801286 | 0.821834 |
| 5 | 0.5 | 0.333333 | 0.25 | 0.5 | 0.5 | 0.666667 | 0 |
![Results](XLM/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/XLM/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/XLM/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/XLM/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "XLM", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlm-mlm-tlm-xnli15-1024", "xlm-clm-ende-1024"], "transformer_model": ["XLM"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/XLNet/output.md:
--------------------------------------------------------------------------------
# Results: XLNet
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 1 | 0.05 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 2 | 0.1 | 0.785112 | 0.842044 | 0.79284 | 0.79284 | 0.744361 | 0.825863 |
| 3 | 0.25 | 0.868935 | 0.878797 | 0.86972 | 0.86972 | 0.85879 | 0.879079 |
| 4 | 0.33 | 0.878497 | 0.882614 | 0.8788 | 0.8788 | 0.872432 | 0.884563 |
| 5 | 0.5 | 0.891879 | 0.891891 | 0.89188 | 0.89188 | 0.89216 | 0.891598 |
![Results](XLNet/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/XLNet/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/XLNet/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/XLNet/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "XLNet", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlnet-base-cased"], "transformer_model": ["XLNet"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/spaCy/output.md:
--------------------------------------------------------------------------------
# Results: spaCy
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.340035 | 0.744259 | 0.503 | 0.503 | 0.0120856 | 0.667985 |
| 1 | 0.05 | 0.643363 | 0.788313 | 0.6776 | 0.6776 | 0.532862 | 0.753863 |
| 2 | 0.1 | 0.740206 | 0.82189 | 0.75392 | 0.75392 | 0.680515 | 0.799896 |
| 3 | 0.25 | 0.860444 | 0.86873 | 0.86116 | 0.86116 | 0.850446 | 0.870442 |
| 4 | 0.33 | 0.883307 | 0.888066 | 0.88364 | 0.88364 | 0.877076 | 0.889539 |
| 5 | 0.5 | 0.89692 | 0.896925 | 0.89692 | 0.89692 | 0.897105 | 0.896734 |
![Results](spaCy/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/spaCy/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/spaCy/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/spaCy/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "spaCy", "model_name": "SpaCyModel", "param_grid": {"model": ["en_core_web_sm", "en_core_web_lg"], "architecture": ["bow", "simple_cnn", "ensemble"]}, "preprocess_func": null, "run_kwargs": {}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/spacy-transformers/output.md:
--------------------------------------------------------------------------------
# Results: spacy-transformers
| | imbalance_proportion | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | Minority Class F1 Score | Majority Class F1 Score |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 0 | 0.01 | 0.333333 | 0.25 | 0.5 | 0.5 | 0 | 0.666667 |
| 1 | 0.05 | 0.539247 | 0.763022 | 0.6074 | 0.6074 | 0.362041 | 0.716452 |
| 2 | 0.1 | 0.662839 | 0.792193 | 0.6918 | 0.6918 | 0.564024 | 0.761654 |
| 3 | 0.25 | 0.800328 | 0.823934 | 0.80348 | 0.80348 | 0.775241 | 0.825415 |
| 4 | 0.33 | 0.820368 | 0.830668 | 0.8216 | 0.8216 | 0.805495 | 0.835242 |
| 5 | 0.5 | 0.838622 | 0.838789 | 0.83864 | 0.83864 | 0.840314 | 0.836931 |
![Results](spacy-transformers/plot.png)
---
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/spacy-transformers/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/class_imbalance/spacy-transformers/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/class_imbalance/spacy-transformers/run-meta.json:
--------------------------------------------------------------------------------
{"imbalance_proportions": [0.01, 0.05, 0.1, 0.25, 0.33, 0.5], "name": "spacy-transformers", "model_name": "SpaCyModel", "param_grid": {"model": ["en_trf_bertbaseuncased_lg", "en_trf_xlnetbasecased_lg", "en_trf_robertabase_lg", "en_trf_distilbertbaseuncased_lg"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}}
--------------------------------------------------------------------------------
/benchmark/benchmark_output/data_augmentation/BERTMaskedLM/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/data_augmentation/BERTMaskedLM/plot.png
--------------------------------------------------------------------------------
/benchmark/benchmark_output/data_augmentation/BERTMaskedLM/run-meta.json:
--------------------------------------------------------------------------------
{"percent_multipliers": [[0.005, 0], [0.005, 1], [0.005, 5], [0.005, 10], [0.05, 0], [0.05, 1], [0.05, 5], [0.05, 10], [0.33, 0], [0.33, 1], [0.33, 5], [0.75, 0], [0.75, 1], [0.75, 5]], "model_name": "FastText", "param_grid": {"word_ngrams": [1], "autotune_duration": [120]}, "preprocess_func": "fasttext_preprocess", "augment_probability": 0.15, "augment_name": "BERTMaskedLM", "params": {}}
"params": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/data_augmentation/MarianMT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/data_augmentation/MarianMT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/data_augmentation/MarianMT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"percent_multipliers": [[0.005, 0], [0.005, 1], [0.005, 5], [0.005, 10], [0.05, 0], [0.05, 1], [0.05, 5], [0.05, 10], [0.33, 0], [0.33, 1], [0.33, 5], [0.75, 0], [0.75, 1], [0.75, 5]], "model_name": "FastText", "param_grid": {"word_ngrams": [1], "autotune_duration": [120]}, "preprocess_func": "fasttext_preprocess", "augment_probability": 0.15, "augment_name": "MarianMT", "params": {"target_languages": ["french", "german", "japanese", "russian", "italian", "portugese", "dutch", "indonesian", "ukrainian", "swedish"]}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/data_augmentation/Word2Vec/output.md: -------------------------------------------------------------------------------- 1 | # Results: Word2Vec 2 | | | percent | multiplier | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|----------:|-------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 0 | 0.333333 | 0.25 | 0.5 | 0.5 | 5 | | 1 | 0.005 | 1 | 0.615251 | 0.669323 | 0.63484 | 0.63484 | 6 | | 2 | 0.005 | 5 | 0.711989 | 0.712191 | 0.71204 | 0.71204 | 7 | | 3 | 0.005 | 10 | 0.695156 | 0.695169 | 0.69516 | 0.69516 | 8 | | 4 | 0.05 | 0 | 0.801613 | 0.802089 | 0.80168 | 0.80168 | 9 | | 5 | 0.05 | 1 | 0.811721 | 0.813553 | 0.81196 | 0.81196 | 10 | | 6 | 0.05 | 5 | 0.814299 | 0.81446 | 0.81432 | 0.81432 | 11 | | 7 | 0.05 | 10 | 0.816025 | 0.816146 | 0.81604 | 0.81604 | 12 | | 8 | 0.33 | 0 | 0.857272 | 0.857361 | 0.85728 | 0.85728 | 13 | | 9 | 0.33 | 1 | 0.857917 | 0.857955 | 0.85792 | 0.85792 | 14 | | 10 | 0.33 | 5 | 0.85955 | 0.859666 | 0.85956 | 0.85956 | 15 | | 11 | 0.75 | 0 | 0.8724 | 0.8724 | 0.8724 | 0.8724 | 16 | | 12 | 0.75 | 1 | 0.874479 | 0.874492 | 0.87448 | 0.87448 | 17 | | 13 | 0.75 | 5 | 0.869273 | 0.869364 | 0.86928 | 0.86928 | 18 | ![Results](Word2Vec/plot.png) 19 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/data_augmentation/Word2Vec/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/data_augmentation/Word2Vec/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/data_augmentation/Word2Vec/run-meta.json: -------------------------------------------------------------------------------- 1 | {"percent_multipliers": [[0.005, 0], [0.005, 1], [0.005, 5], [0.005, 10], [0.05, 0], [0.05, 1], [0.05, 5], [0.05, 10], [0.33, 0], [0.33, 1], [0.33, 5], [0.75, 0], [0.75, 1], [0.75, 5]], "model_name": "FastText", "param_grid": {"word_ngrams": [1], "autotune_duration": [120]}, "preprocess_func": "fasttext_preprocess", "augment_probability": 0.15, 
"augment_name": "Word2Vec", "params": {"model": "glove.6B.300d", "tokenizer": "SPACY"}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/data_augmentation/WordNet/output.md: -------------------------------------------------------------------------------- 1 | # Results: WordNet 2 | | | percent | multiplier | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|----------:|-------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 0 | 0.333422 | 0.75001 | 0.50004 | 0.50004 | 5 | | 1 | 0.005 | 1 | 0.650811 | 0.660782 | 0.65432 | 0.65432 | 6 | | 2 | 0.005 | 5 | 0.707364 | 0.71045 | 0.70816 | 0.70816 | 7 | | 3 | 0.005 | 10 | 0.677011 | 0.679425 | 0.67776 | 0.67776 | 8 | | 4 | 0.05 | 0 | 0.778989 | 0.779782 | 0.77912 | 0.77912 | 9 | | 5 | 0.05 | 1 | 0.795194 | 0.796046 | 0.79532 | 0.79532 | 10 | | 6 | 0.05 | 5 | 0.806279 | 0.806284 | 0.80628 | 0.80628 | 11 | | 7 | 0.05 | 10 | 0.803407 | 0.803647 | 0.80344 | 0.80344 | 12 | | 8 | 0.33 | 0 | 0.859476 | 0.859526 | 0.85948 | 0.85948 | 13 | | 9 | 0.33 | 1 | 0.858 | 0.858004 | 0.858 | 0.858 | 14 | | 10 | 0.33 | 5 | 0.85328 | 0.85328 | 0.85328 | 0.85328 | 15 | | 11 | 0.75 | 0 | 0.873038 | 0.873058 | 0.87304 | 0.87304 | 16 | | 12 | 0.75 | 1 | 0.87552 | 0.875523 | 0.87552 | 0.87552 | 17 | | 13 | 0.75 | 5 | 0.865713 | 0.865798 | 0.86572 | 0.86572 | 18 | ![Results](WordNet/plot.png) 19 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/data_augmentation/WordNet/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/data_augmentation/WordNet/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/data_augmentation/WordNet/run-meta.json: -------------------------------------------------------------------------------- 1 | {"percent_multipliers": [[0.005, 0], [0.005, 1], [0.005, 5], [0.005, 10], [0.05, 0], [0.05, 1], [0.05, 5], [0.05, 10], [0.33, 0], [0.33, 1], [0.33, 5], [0.75, 0], [0.75, 1], [0.75, 5]], "model_name": "FastText", "param_grid": {"word_ngrams": [1], "autotune_duration": [120]}, "preprocess_func": "fasttext_preprocess", "augment_probability": 0.15, "augment_name": "WordNet", "params": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/document_windowing/BERT/output.md: -------------------------------------------------------------------------------- 1 | # Results: BERT 2 | | | Window Config | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|:------------------------|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | Length 250, pooling min | 0.880196 | 0.880247 | 0.8802 | 0.8802 | 5 | | 1 | Length 250, pooling min | 0.755273 | 0.755374 | 0.755293 | 0.755293 | 6 | | 2 | Length 250, pooling min | 0.791559 | 0.791561 | 0.79156 | 0.79156 | 7 | | 3 | Length 250, pooling min | 0.881205 | 0.881759 | 0.881245 | 0.881245 | 8 | | 4 | Length 250, pooling min | 0.754554 | 0.754558 | 0.754555 | 0.754555 | 9 | | 5 | Length 250, pooling min | 0.791008 | 0.791103 | 0.791017 | 0.791017 | 10 | | 6 | Length 250, pooling min | 0.880026 | 0.880286 | 0.880045 | 0.880045 | 
11 | | 7 | Length 250, pooling min | 0.757108 | 0.757175 | 0.75712 | 0.75712 | 12 | | 8 | Length 250, pooling min | 0.792644 | 0.792682 | 0.792647 | 0.792647 | 13 | | 9 | Length 250, pooling min | 0.883124 | 0.883129 | 0.883125 | 0.883125 | 14 | ![Results](BERT/plot.png) 15 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/document_windowing/BERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/document_windowing/BERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/document_windowing/BERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"vocab_size": 2000, "sample_size": 0.1, "window_len_poolings": [[null, null], [50, "mean"], [125, "mean"], [250, "mean"], [50, "max"], [125, "max"], [250, "max"], [50, "min"], [125, "min"], [250, "min"]], "name": "BERT", "model_name": "BERT", "param_grid": {}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/document_windowing/document_windowing.md: -------------------------------------------------------------------------------- 1 | # Results: BERT 2 | | | Window Config | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|:------------------------|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | Length 250, pooling min | 0.880196 | 0.880247 | 0.8802 | 0.8802 | 5 | | 1 | Length 250, pooling min | 0.755273 | 0.755374 | 0.755293 | 0.755293 | 6 | | 2 | Length 250, pooling min | 0.791559 | 0.791561 | 0.79156 | 0.79156 | 7 | | 3 | Length 250, pooling min | 0.881205 | 0.881759 | 0.881245 | 0.881245 | 8 | | 4 | Length 250, pooling min | 0.754554 | 0.754558 | 0.754555 | 0.754555 | 9 | | 5 | Length 250, pooling min | 0.791008 | 0.791103 | 0.791017 | 0.791017 | 10 | | 6 | Length 250, pooling min | 0.880026 | 0.880286 | 0.880045 | 0.880045 | 11 | | 7 | Length 250, pooling min | 0.757108 | 0.757175 | 0.75712 | 0.75712 | 12 | | 8 | Length 250, pooling min | 0.792644 | 0.792682 | 0.792647 | 0.792647 | 13 | | 9 | Length 250, pooling min | 0.883124 | 0.883129 | 0.883125 | 0.883125 | 14 | 15 | ![Results](BERT/plot.png) 16 | --- 17 | -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/ALBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/ALBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/ALBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "ALBERT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["albert-base-v1", "albert-base-v2"], "transformer_model": ["Albert"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/BERT/plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/BERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/BERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "BERT", "model_name": "BERT", "param_grid": {"bert_model": ["bert-base-uncased", "bert-base-cased", "scibert-uncased"], "max_seq_length": [128]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/DistilBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/DistilBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/DistilBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "DistilBERT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["distilbert-base-uncased", "distilbert-base-uncased-distilled-squad"], "transformer_model": ["DistilBert"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/FastText/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/FastText/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/FastText/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "FastText", "model_name": "FastText", "param_grid": {"word_ngrams": [1, 2], "dim": [100, 300], "lr": [0.5, 1.0]}, "preprocess_func": "fasttext_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/MTDNN/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/MTDNN/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/MTDNN/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "MTDNN", "model_name": "MTDNN", "param_grid": {"mtdnn_model": ["mt-dnn-base"], "max_seq_length": [128]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": 
"RoBERTa", "model_name": "Transformer", "param_grid": {"transformer_weights": ["roberta-base"], "transformer_model": ["Roberta"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/SKLearn/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/SKLearn/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/SKLearn/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "SKLearn", "model_name": "SKLearnClassifier", "param_grid": {}, "preprocess_func": "fasttext_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/XLM-RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/XLM-RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/XLM-RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLM-RoBERTa", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlm-roberta-base"], "transformer_model": ["XLMRoberta"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/XLM/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/XLM/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/XLM/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLM", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlm-mlm-tlm-xnli15-1024", "xlm-clm-ende-1024"], "transformer_model": ["XLM"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/XLNet/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/XLNet/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/XLNet/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLNet", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlnet-base-cased"], "transformer_model": ["XLNet"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/spaCy/plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/spaCy/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/spaCy/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "spaCy", "model_name": "SpaCyModel", "param_grid": {"model": ["en_core_web_sm", "en_core_web_lg"], "architecture": ["bow", "simple_cnn", "ensemble"]}, "preprocess_func": null, "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/spacy-transformers/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb/spacy-transformers/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb/spacy-transformers/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "spacy-transformers", "model_name": "SpaCyModel", "param_grid": {"model": ["en_trf_bertbaseuncased_lg", "en_trf_xlnetbasecased_lg", "en_trf_robertabase_lg", "en_trf_distilbertbaseuncased_lg"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/ALBERT/output.md: -------------------------------------------------------------------------------- 1 | # Results: ALBERT 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](ALBERT/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/ALBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/ALBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/ALBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "ALBERT", "model_name": "Transformer", "model_params": {"transformer_weights": "albert-base-v2", "transformer_model": "Albert"}, "preprocess_func": "bert_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/BERT/output.md: -------------------------------------------------------------------------------- 1 | # Results: BERT 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](BERT/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/BERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/BERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/BERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "BERT", "model_name": "BERT", "model_params": {"bert_model": "bert-base-uncased", 
"max_seq_length": 128}, "preprocess_func": "bert_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/DistilBERT/output.md: -------------------------------------------------------------------------------- 1 | # Results: DistilBERT 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](DistilBERT/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/DistilBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/DistilBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/DistilBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "DistilBERT", "model_name": "Transformer", "model_params": {"transformer_weights": "distilbert-base-uncased", "transformer_model": "DistilBert"}, "preprocess_func": "bert_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/ELECTRA/output.md: -------------------------------------------------------------------------------- 1 | # Results: ELECTRA 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](ELECTRA/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/ELECTRA/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/ELECTRA/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/ELECTRA/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "ELECTRA", "model_name": "Transformer", "model_params": {"transformer_weights": "google/electra-base-discriminator", "transformer_model": "Electra"}, "preprocess_func": "bert_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/GPT/output.md: -------------------------------------------------------------------------------- 1 | # Results: GPT 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](GPT/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/GPT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/GPT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/GPT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "GPT", "model_name": "Transformer", "model_params": {"transformer_weights": "openai-gpt", "transformer_model": "OpenAIGPT"}, "preprocess_func": null, "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/GPT2/output.md: 
-------------------------------------------------------------------------------- 1 | # Results: GPT2 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](GPT2/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/GPT2/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/GPT2/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/GPT2/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "GPT2", "model_name": "Transformer", "model_params": {"transformer_weights": "gpt2-medium", "transformer_model": "GPT2"}, "preprocess_func": null, "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/RoBERTa/output.md: -------------------------------------------------------------------------------- 1 | # Results: RoBERTa 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](RoBERTa/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "RoBERTa", "model_name": "Transformer", "model_params": {"transformer_weights": "roberta-base", "transformer_model": "Roberta"}, "preprocess_func": "bert_preprocess", "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/T5Model/output.md: -------------------------------------------------------------------------------- 1 | # Results: T5Model 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](T5Model/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/T5Model/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/T5Model/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/T5Model/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "T5Model", "model_name": "Transformer", "model_params": {"transformer_weights": "t5-base", "transformer_model": "T5"}, "preprocess_func": "bert_preprocess", "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/TransformerXL/output.md: -------------------------------------------------------------------------------- 1 | # Results: TransformerXL 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](TransformerXL/plot.png) 7 | --- -------------------------------------------------------------------------------- 
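Each run-meta.json under imdb_embed records the gobbli model class (`model_name`), the fixed constructor arguments (`model_params`), the preprocessing function, and the batch size used when generating embeddings. The sketch below shows how one of these recorded configs might be replayed; it is a minimal illustration assuming gobbli's documented `Transformer` / `EmbedInput` API (`build()`, `embed()`, `X_embedded`), and exact names may differ across gobbli versions.

```python
import json
from pathlib import Path

# Assumed gobbli imports; verify against the installed version's docs.
from gobbli.io import EmbedInput
from gobbli.model.transformer import Transformer

# Load one recorded embedding-benchmark config (RoBERTa uses the generic
# Transformer wrapper, so model_params maps directly to its constructor).
meta = json.loads(
    Path("benchmark/benchmark_output/imdb_embed/RoBERTa/run-meta.json").read_text()
)

model = Transformer(**meta["model_params"])  # transformer_weights, transformer_model
model.build()  # fetches the Docker image and pretrained weights on first use

# meta["batch_size"] is the batch size the benchmark used; how it is passed
# through to the embed call depends on the gobbli version.
embed_output = model.embed(EmbedInput(X=["a great movie", "a terrible movie"]))
print(len(embed_output.X_embedded))  # one embedding vector per input document
```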
/benchmark/benchmark_output/imdb_embed/TransformerXL/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/TransformerXL/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/TransformerXL/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "TransformerXL", "model_name": "Transformer", "model_params": {"transformer_weights": "transfo-xl-wt103", "transformer_model": "TransfoXL"}, "preprocess_func": null, "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/USE/output.md: -------------------------------------------------------------------------------- 1 | # Results: USE 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](USE/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/USE/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/USE/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/USE/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "USE", "model_name": "USE", "model_params": {"use_model": "universal-sentence-encoder"}, "preprocess_func": null, "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/XLM-RoBERTa/output.md: -------------------------------------------------------------------------------- 1 | # Results: XLM-RoBERTa 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](XLM-RoBERTa/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/XLM-RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/XLM-RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/XLM-RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLM-RoBERTa", "model_name": "Transformer", "model_params": {"transformer_weights": "xlm-roberta-base", "transformer_model": "XLMRoberta"}, "preprocess_func": "bert_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/XLM/output.md: -------------------------------------------------------------------------------- 1 | # Results: XLM 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](XLM/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/XLM/plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/XLM/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/XLM/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLM", "model_name": "Transformer", "model_params": {"transformer_weights": "xlm-clm-ende-1024", "transformer_model": "XLM"}, "preprocess_func": "bert_preprocess", "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/XLNet/output.md: -------------------------------------------------------------------------------- 1 | # Results: XLNet 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](XLNet/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/XLNet/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/XLNet/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/XLNet/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLNet", "model_name": "Transformer", "model_params": {"transformer_weights": "xlnet-base-cased", "transformer_model": "XLNet"}, "preprocess_func": "bert_preprocess", "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/imdb_embed.md: -------------------------------------------------------------------------------- 1 | # Results: BERT 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](BERT/plot.png) 7 | --- 8 | # Results: XLM 9 | ``` 10 | 11 | ``` 12 | 13 | ![Results](XLM/plot.png) 14 | --- 15 | # Results: XLNet 16 | ``` 17 | 18 | ``` 19 | 20 | ![Results](XLNet/plot.png) 21 | --- 22 | # Results: RoBERTa 23 | ``` 24 | 25 | ``` 26 | 27 | ![Results](RoBERTa/plot.png) 28 | --- 29 | # Results: DistilBERT 30 | ``` 31 | 32 | ``` 33 | 34 | ![Results](DistilBERT/plot.png) 35 | --- 36 | # Results: ALBERT 37 | ``` 38 | 39 | ``` 40 | 41 | ![Results](ALBERT/plot.png) 42 | --- 43 | # Results: XLM-RoBERTa 44 | ``` 45 | 46 | ``` 47 | 48 | ![Results](XLM-RoBERTa/plot.png) 49 | --- 50 | # Results: GPT 51 | ``` 52 | 53 | ``` 54 | 55 | ![Results](GPT/plot.png) 56 | --- 57 | # Results: GPT2 58 | ``` 59 | 60 | ``` 61 | 62 | ![Results](GPT2/plot.png) 63 | --- 64 | # Results: TransformerXL 65 | ``` 66 | 67 | ``` 68 | 69 | ![Results](TransformerXL/plot.png) 70 | --- 71 | # Results: T5Model 72 | ``` 73 | 74 | ``` 75 | 76 | ![Results](T5Model/plot.png) 77 | --- 78 | # Results: spaCy 79 | ``` 80 | 81 | ``` 82 | 83 | ![Results](spaCy/plot.png) 84 | --- 85 | # Results: spacy-transformers 86 | ``` 87 | 88 | ``` 89 | 90 | ![Results](spacy-transformers/plot.png) 91 | --- 92 | # Results: USE 93 | ``` 94 | 95 | ``` 96 | 97 | ![Results](USE/plot.png) 98 | --- 99 | # Results: ELECTRA 100 | ``` 101 | 102 | ``` 103 | 104 | ![Results](ELECTRA/plot.png) 105 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/spaCy/output.md: -------------------------------------------------------------------------------- 1 | # Results: spaCy 2 | ``` 3 | 4 | ``` 
5 | 6 | ![Results](spaCy/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/spaCy/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/spaCy/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/spaCy/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "spaCy", "model_name": "SpaCyModel", "model_params": {"model": "en_core_web_lg", "use_gpu": false}, "preprocess_func": null, "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/spacy-transformers/output.md: -------------------------------------------------------------------------------- 1 | # Results: spacy-transformers 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](spacy-transformers/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/spacy-transformers/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/imdb_embed/spacy-transformers/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/imdb_embed/spacy-transformers/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "spacy-transformers", "model_name": "SpaCyModel", "model_params": {"model": "en_trf_robertabase_lg"}, "preprocess_func": "bert_preprocess", "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/ALBERT/output.md: -------------------------------------------------------------------------------- 1 | # Results: ALBERT 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.559743 | 0.580711 | 0.57184 | 0.57184 | 5 | | 1 | 0.01 | 250 | 0.604522 | 0.616544 | 0.61004 | 0.61004 | 6 | | 2 | 0.1 | 2500 | 0.811767 | 0.813246 | 0.81196 | 0.81196 | 7 | | 3 | 0.25 | 6250 | 0.831873 | 0.831936 | 0.83188 | 0.83188 | 8 | | 4 | 0.33 | 8250 | 0.847549 | 0.847665 | 0.84756 | 0.84756 | 9 | | 5 | 0.5 | 12500 | 0.836695 | 0.836927 | 0.83672 | 0.83672 | 10 | | 6 | 0.75 | 18750 | 0.852635 | 0.853112 | 0.85268 | 0.85268 | 11 | ![Results](ALBERT/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/ALBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/ALBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/ALBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | 
{"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "ALBERT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["albert-base-v1", "albert-base-v2"], "transformer_model": ["Albert"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/BERT/output.md: -------------------------------------------------------------------------------- 1 | # Results: BERT 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.74607 | 0.750071 | 0.74688 | 0.74688 | 5 | | 1 | 0.01 | 250 | 0.781651 | 0.784314 | 0.78208 | 0.78208 | 6 | | 2 | 0.1 | 2500 | 0.838392 | 0.838467 | 0.8384 | 0.8384 | 7 | | 3 | 0.25 | 6250 | 0.85648 | 0.856917 | 0.85652 | 0.85652 | 8 | | 4 | 0.33 | 8250 | 0.862295 | 0.86305 | 0.86236 | 0.86236 | 9 | | 5 | 0.5 | 12500 | 0.871105 | 0.871296 | 0.87112 | 0.87112 | 10 | | 6 | 0.75 | 18750 | 0.879389 | 0.879541 | 0.8794 | 0.8794 | 11 | ![Results](BERT/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/BERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/BERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/BERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "BERT", "model_name": "BERT", "param_grid": {"bert_model": ["bert-base-uncased", "bert-base-cased", "scibert-uncased"], "max_seq_length": [128]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/DistilBERT/output.md: -------------------------------------------------------------------------------- 1 | # Results: DistilBERT 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.378616 | 0.740677 | 0.52064 | 0.52064 | 5 | | 1 | 0.01 | 250 | 0.754579 | 0.755721 | 0.7548 | 0.7548 | 6 | | 2 | 0.1 | 2500 | 0.828095 | 0.82831 | 0.82812 | 0.82812 | 7 | | 3 | 0.25 | 6250 | 0.843423 | 0.843985 | 0.84348 | 0.84348 | 8 | | 4 | 0.33 | 8250 | 0.847057 | 0.847292 | 0.84708 | 0.84708 | 9 | | 5 | 0.5 | 12500 | 0.854278 | 0.854728 | 0.85432 | 0.85432 | 10 | | 6 | 0.75 | 18750 | 0.862146 | 0.86231 | 0.86216 | 0.86216 | 11 | ![Results](DistilBERT/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/DistilBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/DistilBERT/plot.png 
-------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/DistilBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "DistilBERT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["distilbert-base-uncased", "distilbert-base-uncased-distilled-squad"], "transformer_model": ["DistilBert"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/FastText/output.md: -------------------------------------------------------------------------------- 1 | # Results: FastText 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.333422 | 0.75001 | 0.50004 | 0.50004 | 5 | | 1 | 0.01 | 250 | 0.589766 | 0.619454 | 0.60348 | 0.60348 | 6 | | 2 | 0.1 | 2500 | 0.800598 | 0.800614 | 0.8006 | 0.8006 | 7 | | 3 | 0.25 | 6250 | 0.854741 | 0.85495 | 0.85476 | 0.85476 | 8 | | 4 | 0.33 | 8250 | 0.8628 | 0.8628 | 0.8628 | 0.8628 | 9 | | 5 | 0.5 | 12500 | 0.872959 | 0.872967 | 0.87296 | 0.87296 | 10 | | 6 | 0.75 | 18750 | 0.88352 | 0.883524 | 0.88352 | 0.88352 | 11 | ![Results](FastText/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/FastText/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/FastText/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/FastText/run-meta.json: -------------------------------------------------------------------------------- 1 | {"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "FastText", "model_name": "FastText", "param_grid": {"word_ngrams": [1, 2], "dim": [100, 300], "lr": [0.5, 1.0]}, "preprocess_func": "fasttext_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/MTDNN/output.md: -------------------------------------------------------------------------------- 1 | # Results: MTDNN 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.823739 | 0.829687 | 0.82444 | 0.82444 | 5 | | 1 | 0.01 | 250 | 0.835166 | 0.836581 | 0.83532 | 0.83532 | 6 | | 2 | 0.1 | 2500 | 0.858437 | 0.858475 | 0.85844 | 0.85844 | 7 | | 3 | 0.25 | 6250 | 0.862921 | 0.863841 | 0.863 | 0.863 | 8 | | 4 | 0.33 | 8250 | 0.866312 | 0.866405 | 0.86632 | 0.86632 | 9 | | 5 | 0.5 | 12500 | 0.86768 | 0.867684 | 0.86768 | 0.86768 | 10 | | 6 | 0.75 | 18750 | 0.87688 | 0.876881 | 0.87688 | 0.87688 | 11 | ![Results](MTDNN/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/MTDNN/plot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/MTDNN/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/MTDNN/run-meta.json: -------------------------------------------------------------------------------- 1 | {"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "MTDNN", "model_name": "MTDNN", "param_grid": {"mtdnn_model": ["mt-dnn-base"], "max_seq_length": [128]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/RoBERTa/output.md: -------------------------------------------------------------------------------- 1 | # Results: RoBERTa 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.338979 | 0.742939 | 0.50252 | 0.50252 | 5 | | 1 | 0.01 | 250 | 0.833661 | 0.833831 | 0.83368 | 0.83368 | 6 | | 2 | 0.1 | 2500 | 0.865881 | 0.866342 | 0.86592 | 0.86592 | 7 | | 3 | 0.25 | 6250 | 0.875998 | 0.87603 | 0.876 | 0.876 | 8 | | 4 | 0.33 | 8250 | 0.882438 | 0.882467 | 0.88244 | 0.88244 | 9 | | 5 | 0.5 | 12500 | 0.333333 | 0.25 | 0.5 | 0.5 | 10 | | 6 | 0.75 | 18750 | 0.888439 | 0.88845 | 0.88844 | 0.88844 | 11 | ![Results](RoBERTa/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "RoBERTa", "model_name": "Transformer", "param_grid": {"transformer_weights": ["roberta-base"], "transformer_model": ["Roberta"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/SKLearn/output.md: -------------------------------------------------------------------------------- 1 | # Results: SKLearn 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.502502 | 0.700345 | 0.57828 | 0.57828 | 5 | | 1 | 0.01 | 250 | 0.682961 | 0.701672 | 0.68824 | 0.68824 | 6 | | 2 | 0.1 | 2500 | 0.825509 | 0.825598 | 0.82552 | 0.82552 | 7 | | 3 | 0.25 | 6250 | 0.853119 | 0.853132 | 0.85312 | 0.85312 | 8 | | 4 | 0.33 | 8250 | 0.857173 | 0.857471 | 0.8572 | 0.8572 | 9 | | 5 | 0.5 | 12500 | 0.866067 | 0.86622 | 0.86608 | 0.86608 | 10 | | 6 | 0.75 | 18750 | 0.874996 | 0.87505 | 0.875 | 
0.875 | 11 | ![Results](SKLearn/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/SKLearn/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/SKLearn/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/SKLearn/run-meta.json: -------------------------------------------------------------------------------- 1 | {"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "SKLearn", "model_name": "SKLearnClassifier", "param_grid": {}, "preprocess_func": "fasttext_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/XLM-RoBERTa/output.md: -------------------------------------------------------------------------------- 1 | # Results: XLM-RoBERTa 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.333333 | 0.25 | 0.5 | 0.5 | 5 | | 1 | 0.01 | 250 | 0.516406 | 0.689123 | 0.58376 | 0.58376 | 6 | | 2 | 0.1 | 2500 | 0.7987 | 0.799119 | 0.79876 | 0.79876 | 7 | | 3 | 0.25 | 6250 | 0.333333 | 0.25 | 0.5 | 0.5 | 8 | | 4 | 0.33 | 8250 | 0.852605 | 0.853402 | 0.85268 | 0.85268 | 9 | | 5 | 0.5 | 12500 | 0.855971 | 0.856728 | 0.85604 | 0.85604 | 10 | | 6 | 0.75 | 18750 | 0.8442 | 0.844201 | 0.8442 | 0.8442 | 11 | ![Results](XLM-RoBERTa/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/XLM-RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/XLM-RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/XLM-RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "XLM-RoBERTa", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlm-roberta-base"], "transformer_model": ["XLMRoberta"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/XLM/output.md: -------------------------------------------------------------------------------- 1 | # Results: XLM 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.333333 | 0.25 | 0.5 | 0.5 | 5 | | 1 | 0.01 | 250 | 0.453665 | 0.571183 | 0.53092 | 0.53092 | 6 | | 2 | 0.1 | 2500 | 0.771882 | 0.77233 | 0.77196 | 0.77196 | 7 | | 3 | 0.25 | 6250 | 0.333333 | 0.25 | 0.5 | 0.5 | 8 | | 4 | 0.33 | 8250 | 0.333333 | 0.25 | 0.5 | 0.5 | 9 | | 5 | 0.5 | 12500 | 
0.333333 | 0.25 | 0.5 | 0.5 | 10 | | 6 | 0.75 | 18750 | 0.333333 | 0.25 | 0.5 | 0.5 | 11 | ![Results](XLM/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/XLM/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/XLM/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/XLM/run-meta.json: -------------------------------------------------------------------------------- 1 | {"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "XLM", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlm-mlm-tlm-xnli15-1024", "xlm-clm-ende-1024"], "transformer_model": ["XLM"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/XLNet/output.md: -------------------------------------------------------------------------------- 1 | # Results: XLNet 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.805214 | 0.814967 | 0.80652 | 0.80652 | 5 | | 1 | 0.01 | 250 | 0.843484 | 0.844229 | 0.84356 | 0.84356 | 6 | | 2 | 0.1 | 2500 | 0.866762 | 0.867215 | 0.8668 | 0.8668 | 7 | | 3 | 0.25 | 6250 | 0.880065 | 0.880265 | 0.88008 | 0.88008 | 8 | | 4 | 0.33 | 8250 | 0.878197 | 0.87824 | 0.8782 | 0.8782 | 9 | | 5 | 0.5 | 12500 | 0.88088 | 0.88088 | 0.88088 | 0.88088 | 10 | | 6 | 0.75 | 18750 | 0.888433 | 0.888538 | 0.88844 | 0.88844 | 11 | ![Results](XLNet/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/XLNet/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/XLNet/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/XLNet/run-meta.json: -------------------------------------------------------------------------------- 1 | {"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "XLNet", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlnet-base-cased"], "transformer_model": ["XLNet"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/spaCy/output.md: -------------------------------------------------------------------------------- 1 | # Results: spaCy 2 | | | data_proportion | num_documents | Weighted F1 Score | Weighted Precision Score | Weighted Recall Score | Accuracy | 3 | |---:|------------------:|----------------:|--------------------:|---------------------------:|------------------------:|-----------:| 4 | | 0 | 0.005 | 125 | 0.651859 | 0.69133 | 0.6642 | 0.6642 | 5 | | 1 | 0.01 | 250 | 0.729198 | 0.729735 | 0.72932 | 0.72932 | 6 | | 2 | 0.1 | 
2500 | 0.823233 | 0.82329 | 0.82324 | 0.82324 | 7 | | 3 | 0.25 | 6250 | 0.852519 | 0.852534 | 0.85252 | 0.85252 | 8 | | 4 | 0.33 | 8250 | 0.864112 | 0.864205 | 0.86412 | 0.86412 | 9 | | 5 | 0.5 | 12500 | 0.87436 | 0.87436 | 0.87436 | 0.87436 | 10 | | 6 | 0.75 | 18750 | 0.887432 | 0.887551 | 0.88744 | 0.88744 | 11 | ![Results](spaCy/plot.png) 12 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/spaCy/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/low_resource/spaCy/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/low_resource/spaCy/run-meta.json: -------------------------------------------------------------------------------- 1 | {"data_proportions": [0.005, 0.01, 0.1, 0.25, 0.33, 0.5, 0.75], "name": "spaCy", "model_name": "SpaCyModel", "param_grid": {"model": ["en_core_web_sm", "en_core_web_lg"], "architecture": ["bow", "simple_cnn", "ensemble"]}, "preprocess_func": null, "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/ALBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/moviesummary/ALBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/ALBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "ALBERT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["albert-base-v1", "albert-base-v2"], "transformer_model": ["Albert"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/DistilBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/moviesummary/DistilBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/DistilBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "DistilBERT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["distilbert-base-uncased", "distilbert-base-uncased-distilled-squad"], "transformer_model": ["DistilBert"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/FastText/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/moviesummary/FastText/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/FastText/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "FastText", "model_name": 
"FastText", "param_grid": {"word_ngrams": [1, 2], "dim": [100, 300], "lr": [0.5, 1.0]}, "preprocess_func": "fasttext_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/moviesummary/RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "RoBERTa", "model_name": "Transformer", "param_grid": {"transformer_weights": ["roberta-base"], "transformer_model": ["Roberta"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/SKLearn/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/moviesummary/SKLearn/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/SKLearn/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "SKLearn", "model_name": "SKLearnClassifier", "param_grid": {}, "preprocess_func": "fasttext_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/XLM-RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/moviesummary/XLM-RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/XLM-RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLM-RoBERTa", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlm-roberta-base"], "transformer_model": ["XLMRoberta"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/XLM/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/moviesummary/XLM/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/XLM/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLM", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlm-mlm-tlm-xnli15-1024", "xlm-clm-ende-1024"], "transformer_model": ["XLM"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/XLNet/plot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/moviesummary/XLNet/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/XLNet/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLNet", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlnet-base-cased"], "transformer_model": ["XLNet"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/spaCy/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/moviesummary/spaCy/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/spaCy/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "spaCy", "model_name": "SpaCyModel", "param_grid": {"model": ["en_core_web_sm", "en_core_web_lg"], "architecture": ["bow", "simple_cnn", "ensemble"]}, "preprocess_func": null, "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/spacy-transformers/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/moviesummary/spacy-transformers/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/moviesummary/spacy-transformers/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "spacy-transformers", "model_name": "SpaCyModel", "param_grid": {"model": ["en_trf_bertbaseuncased_lg", "en_trf_xlnetbasecased_lg", "en_trf_robertabase_lg", "en_trf_distilbertbaseuncased_lg"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/ALBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/ALBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/ALBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "ALBERT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["albert-base-v1", "albert-base-v2"], "transformer_model": ["Albert"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/BERT/plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/BERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/BERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "BERT", "model_name": "BERT", "param_grid": {"bert_model": ["bert-base-uncased", "bert-base-cased", "scibert-uncased"], "max_seq_length": [128]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/DistilBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/DistilBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/DistilBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "DistilBERT", "model_name": "Transformer", "param_grid": {"transformer_weights": ["distilbert-base-uncased", "distilbert-base-uncased-distilled-squad"], "transformer_model": ["DistilBert"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/FastText/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/FastText/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/FastText/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "FastText", "model_name": "FastText", "param_grid": {"word_ngrams": [1, 2], "dim": [100, 300], "lr": [0.5, 1.0]}, "preprocess_func": "fasttext_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/MTDNN/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/MTDNN/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/MTDNN/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "MTDNN", "model_name": "MTDNN", "param_grid": {"mtdnn_model": ["mt-dnn-base"], "max_seq_length": [128]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/RoBERTa/run-meta.json: 
-------------------------------------------------------------------------------- 1 | {"name": "RoBERTa", "model_name": "Transformer", "param_grid": {"transformer_weights": ["roberta-base"], "transformer_model": ["Roberta"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/SKLearn/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/SKLearn/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/SKLearn/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "SKLearn", "model_name": "SKLearnClassifier", "param_grid": {}, "preprocess_func": "fasttext_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/XLM-RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/XLM-RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/XLM-RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLM-RoBERTa", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlm-roberta-base"], "transformer_model": ["XLMRoberta"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/XLM/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/XLM/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/XLM/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLM", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlm-mlm-tlm-xnli15-1024", "xlm-clm-ende-1024"], "transformer_model": ["XLM"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/XLNet/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/XLNet/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/XLNet/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLNet", "model_name": "Transformer", "param_grid": {"transformer_weights": ["xlnet-base-cased"], "transformer_model": ["XLNet"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} 
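A note on the `run-meta.json` layout used throughout these classification benchmark directories: each file records a display `name`, the gobbli model class to run (`model_name`), a `param_grid` mapping hyperparameter names to lists of candidate values, the `preprocess_func` applied to the input texts, and extra `run_kwargs` forwarded to training. The `param_grid` key follows the scikit-learn naming convention and appears to denote a cross-product search, one training run per combination of values. Below is a minimal sketch of expanding such a grid under that assumption; `expand_param_grid` is an illustrative helper, not an API defined in this repository.

```python
import itertools
from typing import Any, Dict, Iterator, List


def expand_param_grid(param_grid: Dict[str, List[Any]]) -> Iterator[Dict[str, Any]]:
    """Yield one hyperparameter dict per point in the grid's cross-product."""
    keys = sorted(param_grid)
    for values in itertools.product(*(param_grid[key] for key in keys)):
        yield dict(zip(keys, values))


# The FastText grid used by several benchmarks above:
grid = {"word_ngrams": [1, 2], "dim": [100, 300], "lr": [0.5, 1.0]}
for params in expand_param_grid(grid):
    print(params)  # e.g. {'dim': 100, 'lr': 0.5, 'word_ngrams': 1}
```

Under that reading, the FastText grid expands to 2 × 2 × 2 = 8 runs, while an empty grid like the one in the SKLearn entries collapses to a single run with default parameters.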
-------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/spaCy/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/spaCy/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/spaCy/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "spaCy", "model_name": "SpaCyModel", "param_grid": {"model": ["en_core_web_sm", "en_core_web_lg"], "architecture": ["bow", "simple_cnn", "ensemble"]}, "preprocess_func": null, "run_kwargs": {}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/spacy-transformers/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups/spacy-transformers/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups/spacy-transformers/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "spacy-transformers", "model_name": "SpaCyModel", "param_grid": {"model": ["en_trf_bertbaseuncased_lg", "en_trf_xlnetbasecased_lg", "en_trf_robertabase_lg", "en_trf_distilbertbaseuncased_lg"]}, "preprocess_func": "bert_preprocess", "run_kwargs": {"train_batch_size": 16}} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/ALBERT/output.md: -------------------------------------------------------------------------------- 1 | # Results: ALBERT 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](ALBERT/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/ALBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/ALBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/ALBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "ALBERT", "model_name": "Transformer", "model_params": {"transformer_weights": "albert-base-v2", "transformer_model": "Albert"}, "preprocess_func": "bert_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/BERT/output.md: -------------------------------------------------------------------------------- 1 | # Results: BERT 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](BERT/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/BERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/BERT/plot.png 
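Before the `run-meta.json` files in this `newsgroups_embed` section, note that the embedding benchmarks record a different schema from the classification benchmarks above: a `model_params` object holding one fixed value per parameter and a top-level `batch_size`, rather than `param_grid` lists and `run_kwargs`. A minimal sketch of a reader that tolerates both variants, offered purely as an illustration (`summarize_run_meta` is a hypothetical helper, not part of the repository):

```python
import json
from pathlib import Path


def summarize_run_meta(path: Path) -> str:
    """Summarize either run-meta.json variant found under benchmark_output."""
    meta = json.loads(path.read_text())
    if "param_grid" in meta:
        # Classification benchmark: lists of candidate hyperparameter values.
        n_runs = 1
        for candidates in meta["param_grid"].values():
            n_runs *= len(candidates)
        return f"{meta['name']}: {meta['model_name']} grid of {n_runs} run(s)"
    # Embedding benchmark: one fixed configuration plus an embedding batch size.
    return (
        f"{meta['name']}: single {meta['model_name']} config, "
        f"batch_size={meta.get('batch_size')}"
    )


# Hypothetical usage:
# print(summarize_run_meta(Path("benchmark_output/newsgroups_embed/BERT/run-meta.json")))
```

Dispatching on the presence of `param_grid` keeps the reader agnostic to which benchmark family produced the file.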
-------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/BERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "BERT", "model_name": "BERT", "model_params": {"bert_model": "bert-base-uncased", "max_seq_length": 128}, "preprocess_func": "bert_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/DistilBERT/output.md: -------------------------------------------------------------------------------- 1 | # Results: DistilBERT 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](DistilBERT/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/DistilBERT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/DistilBERT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/DistilBERT/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "DistilBERT", "model_name": "Transformer", "model_params": {"transformer_weights": "distilbert-base-uncased", "transformer_model": "DistilBert"}, "preprocess_func": "bert_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/ELECTRA/output.md: -------------------------------------------------------------------------------- 1 | # Results: ELECTRA 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](ELECTRA/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/ELECTRA/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/ELECTRA/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/ELECTRA/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "ELECTRA", "model_name": "Transformer", "model_params": {"transformer_weights": "google/electra-base-discriminator", "transformer_model": "Electra"}, "preprocess_func": "bert_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/GPT/output.md: -------------------------------------------------------------------------------- 1 | # Results: GPT 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](GPT/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/GPT/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/GPT/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/GPT/run-meta.json: 
-------------------------------------------------------------------------------- 1 | {"name": "GPT", "model_name": "Transformer", "model_params": {"transformer_weights": "openai-gpt", "transformer_model": "OpenAIGPT"}, "preprocess_func": null, "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/GPT2/output.md: -------------------------------------------------------------------------------- 1 | # Results: GPT2 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](GPT2/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/GPT2/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/GPT2/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/GPT2/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "GPT2", "model_name": "Transformer", "model_params": {"transformer_weights": "gpt2-medium", "transformer_model": "GPT2"}, "preprocess_func": null, "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/RoBERTa/output.md: -------------------------------------------------------------------------------- 1 | # Results: RoBERTa 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](RoBERTa/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "RoBERTa", "model_name": "Transformer", "model_params": {"transformer_weights": "roberta-base", "transformer_model": "Roberta"}, "preprocess_func": "bert_preprocess", "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/T5Model/output.md: -------------------------------------------------------------------------------- 1 | # Results: T5Model 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](T5Model/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/T5Model/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/T5Model/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/T5Model/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "T5Model", "model_name": "Transformer", "model_params": {"transformer_weights": "t5-base", "transformer_model": "T5"}, 
"preprocess_func": "bert_preprocess", "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/TransformerXL/output.md: -------------------------------------------------------------------------------- 1 | # Results: TransformerXL 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](TransformerXL/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/TransformerXL/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/TransformerXL/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/TransformerXL/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "TransformerXL", "model_name": "Transformer", "model_params": {"transformer_weights": "transfo-xl-wt103", "transformer_model": "TransfoXL"}, "preprocess_func": null, "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/USE/output.md: -------------------------------------------------------------------------------- 1 | # Results: USE 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](USE/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/USE/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/USE/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/USE/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "USE", "model_name": "USE", "model_params": {"use_model": "universal-sentence-encoder"}, "preprocess_func": null, "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/XLM-RoBERTa/output.md: -------------------------------------------------------------------------------- 1 | # Results: XLM-RoBERTa 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](XLM-RoBERTa/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/XLM-RoBERTa/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/XLM-RoBERTa/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/XLM-RoBERTa/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLM-RoBERTa", "model_name": "Transformer", "model_params": {"transformer_weights": "xlm-roberta-base", "transformer_model": "XLMRoberta"}, "preprocess_func": "bert_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- 
/benchmark/benchmark_output/newsgroups_embed/XLM/output.md: -------------------------------------------------------------------------------- 1 | # Results: XLM 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](XLM/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/XLM/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/XLM/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/XLM/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLM", "model_name": "Transformer", "model_params": {"transformer_weights": "xlm-clm-ende-1024", "transformer_model": "XLM"}, "preprocess_func": "bert_preprocess", "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/XLNet/output.md: -------------------------------------------------------------------------------- 1 | # Results: XLNet 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](XLNet/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/XLNet/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/XLNet/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/XLNet/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "XLNet", "model_name": "Transformer", "model_params": {"transformer_weights": "xlnet-base-cased", "transformer_model": "XLNet"}, "preprocess_func": "bert_preprocess", "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/newsgroups_embed.md: -------------------------------------------------------------------------------- 1 | # Results: BERT 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](BERT/plot.png) 7 | --- 8 | # Results: XLM 9 | ``` 10 | 11 | ``` 12 | 13 | ![Results](XLM/plot.png) 14 | --- 15 | # Results: XLNet 16 | ``` 17 | 18 | ``` 19 | 20 | ![Results](XLNet/plot.png) 21 | --- 22 | # Results: RoBERTa 23 | ``` 24 | 25 | ``` 26 | 27 | ![Results](RoBERTa/plot.png) 28 | --- 29 | # Results: DistilBERT 30 | ``` 31 | 32 | ``` 33 | 34 | ![Results](DistilBERT/plot.png) 35 | --- 36 | # Results: ALBERT 37 | ``` 38 | 39 | ``` 40 | 41 | ![Results](ALBERT/plot.png) 42 | --- 43 | # Results: XLM-RoBERTa 44 | ``` 45 | 46 | ``` 47 | 48 | ![Results](XLM-RoBERTa/plot.png) 49 | --- 50 | # Results: GPT 51 | ``` 52 | 53 | ``` 54 | 55 | ![Results](GPT/plot.png) 56 | --- 57 | # Results: GPT2 58 | ``` 59 | 60 | ``` 61 | 62 | ![Results](GPT2/plot.png) 63 | --- 64 | # Results: TransformerXL 65 | ``` 66 | 67 | ``` 68 | 69 | ![Results](TransformerXL/plot.png) 70 | --- 71 | # Results: T5Model 72 | ``` 73 | 74 | ``` 75 | 76 | ![Results](T5Model/plot.png) 77 | --- 78 | # Results: sklearn_TF-IDF 79 | ``` 80 | 81 | ``` 82 | 83 | ![Results](sklearn_TF-IDF/plot.png) 84 | --- 85 | # Results: spaCy 86 | ``` 87 | 88 | ``` 89 | 90 | 
![Results](spaCy/plot.png) 91 | --- 92 | # Results: spacy-transformers 93 | ``` 94 | 95 | ``` 96 | 97 | ![Results](spacy-transformers/plot.png) 98 | --- 99 | # Results: USE 100 | ``` 101 | 102 | ``` 103 | 104 | ![Results](USE/plot.png) 105 | --- 106 | # Results: ELECTRA 107 | ``` 108 | 109 | ``` 110 | 111 | ![Results](ELECTRA/plot.png) 112 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/sklearn_TF-IDF/output.md: -------------------------------------------------------------------------------- 1 | # Results: sklearn_TF-IDF 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](sklearn_TF-IDF/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/sklearn_TF-IDF/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/sklearn_TF-IDF/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/sklearn_TF-IDF/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "sklearn_TF-IDF", "model_name": "TfidfEmbedder", "model_params": {}, "preprocess_func": "fasttext_preprocess", "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/spaCy/output.md: -------------------------------------------------------------------------------- 1 | # Results: spaCy 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](spaCy/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/spaCy/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/spaCy/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/spaCy/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": "spaCy", "model_name": "SpaCyModel", "model_params": {"model": "en_core_web_lg", "use_gpu": false}, "preprocess_func": null, "batch_size": 32} -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/spacy-transformers/output.md: -------------------------------------------------------------------------------- 1 | # Results: spacy-transformers 2 | ``` 3 | 4 | ``` 5 | 6 | ![Results](spacy-transformers/plot.png) 7 | --- -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/spacy-transformers/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/benchmark/benchmark_output/newsgroups_embed/spacy-transformers/plot.png -------------------------------------------------------------------------------- /benchmark/benchmark_output/newsgroups_embed/spacy-transformers/run-meta.json: -------------------------------------------------------------------------------- 1 | {"name": 
"spacy-transformers", "model_name": "SpaCyModel", "model_params": {"model": "en_trf_robertabase_lg"}, "preprocess_func": "bert_preprocess", "batch_size": 16} -------------------------------------------------------------------------------- /benchmark/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "2.3" 2 | 3 | services: 4 | gobbli-benchmark-gpu: 5 | runtime: nvidia 6 | ipc: host 7 | build: 8 | context: ../ 9 | dockerfile: ./benchmark/docker/Dockerfile 10 | image: gobbli-benchmark:latest 11 | environment: 12 | NVIDIA_VISIBLE_DEVICES: ${NVIDIA_VISIBLE_DEVICES:-all} 13 | GOBBLI_USE_GPU: "1" 14 | shm_size: 4G 15 | volumes: 16 | # Needed to spawn containers 17 | - /var/run/docker.sock:/var/run/docker.sock 18 | # Needed to perform bind mounts as we would on the host 19 | - $PWD:$PWD 20 | working_dir: $PWD 21 | 22 | gobbli-benchmark: 23 | build: 24 | context: ../ 25 | dockerfile: ./benchmark/docker/Dockerfile 26 | image: gobbli-benchmark:latest 27 | shm_size: 4G 28 | volumes: 29 | # Needed to spawn containers 30 | - /var/run/docker.sock:/var/run/docker.sock 31 | # Needed to perform bind mounts as we would on the host 32 | - $PWD:$PWD 33 | working_dir: $PWD 34 | -------------------------------------------------------------------------------- /benchmark/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | # Install Chromium for rendering Altair charts to PNG 4 | # Fixed version to preserve compatibility with chromedriver 5 | # in case the two releases get out of sync 6 | RUN apt-get update && apt-get install -y \ 7 | chromium=80.0.3987.149-1~deb10u1 \ 8 | chromium-driver=80.0.3987.149-1~deb10u1 \ 9 | && rm -rf /var/lib/apt/lists/* 10 | 11 | # Copy essentials in to install requirements 12 | COPY ./setup.py ./meta.json ./requirements.txt ./README.md /code/ 13 | COPY ./benchmark/requirements.txt /code/benchmark/requirements.txt 14 | 15 | # Install dependencies 16 | WORKDIR /code 17 | RUN pip install -e '.[augment,tokenize]' \ 18 | && pip install -r requirements.txt \ 19 | && pip install -r benchmark/requirements.txt 20 | 21 | # Copy the rest of the repository in 22 | COPY ./ /code 23 | 24 | ENTRYPOINT ["python", "run_benchmarks.py"] 25 | -------------------------------------------------------------------------------- /benchmark/requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML==5.3 2 | tabulate==0.8.6 3 | umap-learn==0.3.10 4 | matplotlib==3.1.3 5 | # Needed to save Altair plots to PNG 6 | selenium==3.141.0 7 | -------------------------------------------------------------------------------- /benchmark/run_benchmarks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run all benchmarks that haven't been run. 4 | # NOTE: This may take several days depending on your available resources. 5 | 6 | image_name="gobbli-benchmark" 7 | 8 | if [[ -n "$GOBBLI_USE_GPU" ]]; then 9 | image_name="${image_name}-gpu" 10 | echo "GPU enabled." 11 | else 12 | echo "GPU disabled; running on CPU." 
13 | fi 14 | 15 | # Set working directory so the container starts in our working directory 16 | # Otherwise it starts in the repository root 17 | docker-compose run --rm "$image_name" "$@" 18 | -------------------------------------------------------------------------------- /ci-gpu/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "2.3" 2 | 3 | services: 4 | gobbli-ci-gpu: 5 | runtime: nvidia 6 | ipc: host 7 | build: 8 | context: ../ 9 | args: 10 | PYTHON_VERSION: "${PYTHON_VERSION:?Must specify Python version.}" 11 | image: gobbli-ci:latest 12 | environment: 13 | NVIDIA_VISIBLE_DEVICES: ${NVIDIA_VISIBLE_DEVICES:-all} 14 | # Change permissions after running to allow temp file cleanup by non-root user 15 | command: bash -c 'py.test -x --use-gpu --nvidia-visible-devices $NVIDIA_VISIBLE_DEVICES; chmod -R a+w ./' 16 | working_dir: $PWD/.. 17 | volumes: 18 | # Needed for CI to be able to spawn containers 19 | - /var/run/docker.sock:/var/run/docker.sock 20 | # Needed for CI to perform bind mounts as we would on the host 21 | - $PWD/..:$PWD/.. 22 | -------------------------------------------------------------------------------- /ci/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.3" 2 | 3 | services: 4 | gobbli-ci: 5 | build: 6 | context: ../ 7 | args: 8 | PYTHON_VERSION: "${PYTHON_VERSION:?Must specify Python version.}" 9 | image: gobbli-ci:latest 10 | ipc: host 11 | # Travis only gives us ~7.5GB of memory, so we need to run tests in 12 | # low resource mode 13 | # The test environment won't be shared across runs, but we'd like to reuse 14 | # artifacts between tests where possible to reduce runtime, so add the 15 | # switch to persist data as well 16 | command: ./run_ci.sh --low-resource --persist-data 17 | working_dir: $PWD/.. 18 | volumes: 19 | # Needed for CI to be able to spawn containers 20 | - /var/run/docker.sock:/var/run/docker.sock 21 | # Needed for CI to perform bind mounts as we would on the host 22 | - $PWD/..:$PWD/.. 23 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Compose services for testing the various model containers 2 | # GPU not enabled to prevent dependency on the NVIDIA docker runtime 3 | 4 | version: "3.7" 5 | 6 | services: 7 | bert: 8 | build: 9 | context: ./gobbli/model/bert 10 | 11 | fasttext: 12 | build: 13 | context: ./gobbli/model/fasttext 14 | 15 | mt-dnn: 16 | build: 17 | context: ./gobbli/model/mtdnn 18 | 19 | use: 20 | build: 21 | context: ./gobbli/model/use 22 | 23 | bert-maskedlm: 24 | build: 25 | context: ./gobbli/augment/bert 26 | 27 | marian: 28 | build: 29 | context: ./gobbli/augment/marian 30 | 31 | transformer: 32 | build: 33 | context: ./gobbli/model/transformer 34 | 35 | spacy: 36 | build: 37 | context: ./gobbli/model/spacy 38 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help".
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/docs/_static/.gitkeep -------------------------------------------------------------------------------- /docs/_static/gobbli_favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/docs/_static/gobbli_favicon.ico -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | Detailed reference for all code in the library. 5 | 6 | .. toctree:: 7 | :maxdepth: 4 8 | 9 | auto/gobbli 10 | -------------------------------------------------------------------------------- /docs/img/interactive_apps/evaluate/evaluate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/docs/img/interactive_apps/evaluate/evaluate.png -------------------------------------------------------------------------------- /docs/img/interactive_apps/explain/explain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/docs/img/interactive_apps/explain/explain.png -------------------------------------------------------------------------------- /docs/img/interactive_apps/explain/explain_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/docs/img/interactive_apps/explain/explain_output.png -------------------------------------------------------------------------------- /docs/img/interactive_apps/explore/explore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/docs/img/interactive_apps/explore/explore.png -------------------------------------------------------------------------------- /docs/img/interactive_apps/explore/explore_embeddings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/docs/img/interactive_apps/explore/explore_embeddings.png -------------------------------------------------------------------------------- /docs/img/interactive_apps/explore/explore_topic_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/docs/img/interactive_apps/explore/explore_topic_model.png 
-------------------------------------------------------------------------------- /docs/img/interactive_apps/explore/explore_trained_embeddings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/docs/img/interactive_apps/explore/explore_trained_embeddings.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. gobbli documentation master file, created by 2 | sphinx-quickstart on Tue Jun 4 14:50:18 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to gobbli's documentation 7 | ================================= 8 | 9 | gobbli is a library designed to make experimentation and analysis using deep learning easier. It provides a simple, uniform interface to deep learning models that abstracts away most of the complexity in terms of different input/output formats, library versions, etc. It attempts to implement a set of common use cases with an emphasis on usability rather than performance. 10 | 11 | gobbli is *not* designed to provide deep learning models in a production context. Each task generally involves running a Docker container in the background and transferring a large amount of data to and from disk, which creates significant overhead. Additionally, gobbli does not support fine-grained model-specific tuning, such as custom loss functions. Our goal is to take the user 80% of the way to their deep learning solution as quickly as possible so they can decide whether it's worth the effort to resolve the remaining 20%. 12 | 13 | .. toctree:: 14 | prerequisites 15 | quickstart 16 | interactive_apps 17 | troubleshooting 18 | advanced_usage 19 | api 20 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/prerequisites.rst: -------------------------------------------------------------------------------- 1 | Prerequisites 2 | ============= 3 | 4 | gobbli requires Python 3.7+. 5 | 6 | First, ensure `Docker `__ is installed and your user has permissions to run docker commands. Next, install the ``gobbli`` package and dependencies into your environment: 7 | 8 | .. 
code-block:: bash 9 | 10 | pip install gobbli 11 | 12 | Some of the :ref:`data-augmentation` methods require extra packages. You can install them all using the following steps: 13 | 14 | .. code-block:: bash 15 | 16 | pip install gobbli[augment] 17 | python -m spacy download en_core_web_sm 18 | 19 | Additionally, :ref:`document-windowing` with the `SentencePiece `__ tokenizer requires extra packages. Install them like so: 20 | 21 | .. code-block:: bash 22 | 23 | pip install gobbli[tokenize] 24 | 25 | .. _interactive-app-prereqs: 26 | 27 | The `Streamlit `__-based :ref:`interactive-apps` require their own set of dependencies: 28 | 29 | .. code-block:: bash 30 | 31 | pip install gobbli[interactive] 32 | 33 | If you want to train models using a GPU, you will additionally need an NVIDIA graphics card and `nvidia-docker `__. 34 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # Needed for readthedocs to install everything required to build the docs 2 | -r ../requirements.txt 3 | sphinx==2.1.0 4 | sphinx-autobuild==0.7.1 5 | sphinx-autodoc-typehints==1.6.0 6 | sphinx-paramlinks==0.3.7 7 | 8 | mock==3.0.5 9 | autodoc==0.5.0 10 | 11 | gobbli[augment,tokenize,interactive] 12 | -------------------------------------------------------------------------------- /generate_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | rm -f docs/auto/* 6 | cd docs 7 | make html 8 | -------------------------------------------------------------------------------- /gobbli/__init__.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import pkg_resources 4 | 5 | import gobbli.augment as augment 6 | import gobbli.dataset as dataset 7 | import gobbli.experiment as experiment 8 | import gobbli.io as io 9 | import gobbli.model as model 10 | from gobbli.util import TokenizeMethod 11 | 12 | # Warn the user of potential conflicts using the old third-party typing/dataclasses 13 | # modules 14 | for conflicting_pkg in ("typing", "dataclasses"): 15 | req = pkg_resources.Requirement.parse(conflicting_pkg) 16 | if pkg_resources.working_set.find(req) is not None: 17 | warnings.warn( 18 | f"You've installed a third-party module named '{conflicting_pkg}' which " 19 | "conflicts with a standard library module of the same name. This can cause " 20 | "errors when unpickling code, e.g. when running experiments using Ray. 
Consider " 21 | f"uninstalling the module:\n\npip uninstall {conflicting_pkg}" 22 | ) 23 | 24 | __all__ = [ 25 | # Modules 26 | "augment", 27 | "dataset", 28 | "experiment", 29 | "model", 30 | "io", 31 | # Misc top level imports 32 | "TokenizeMethod", 33 | ] 34 | -------------------------------------------------------------------------------- /gobbli/augment/__init__.py: -------------------------------------------------------------------------------- 1 | from gobbli.augment.bert import BERTMaskedLM 2 | from gobbli.augment.marian import MarianMT 3 | from gobbli.augment.word2vec import Word2Vec 4 | from gobbli.augment.wordnet import WordNet 5 | 6 | __all__ = ["BERTMaskedLM", "Word2Vec", "WordNet", "MarianMT"] 7 | -------------------------------------------------------------------------------- /gobbli/augment/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from pathlib import Path 3 | from typing import List 4 | 5 | from gobbli.util import gobbli_dir 6 | 7 | 8 | def augment_dir() -> Path: 9 | return gobbli_dir() / "augment" 10 | 11 | 12 | class BaseAugment(ABC): 13 | """ 14 | Base class for data augmentation methods. 15 | """ 16 | 17 | @abstractmethod 18 | def augment(self, X: List[str], times: int = 5, p: float = 0.1) -> List[str]: 19 | """ 20 | Return additional texts for each text in the passed array. 21 | 22 | Args: 23 | X: Input texts. 24 | times: How many texts to generate per text in the input. 25 | p: Probability of considering each token in the input for replacement. 26 | Note that some tokens aren't able to be replaced by a given augmentation 27 | method and will be ignored, so the actual proportion of replaced tokens 28 | in your input may be much lower than this number. 29 | Returns: 30 | Generated texts (length = ``times * len(X)``). 31 | """ 32 | raise NotImplementedError 33 | 34 | @classmethod 35 | def data_dir(cls) -> Path: 36 | """ 37 | Returns: 38 | The data directory used for this class of augmentation model. 
39 | """ 40 | return augment_dir() / cls.__name__ 41 | -------------------------------------------------------------------------------- /gobbli/augment/bert/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.3-cuda10.1-cudnn7-runtime 2 | 3 | RUN pip install transformers==2.3.0 sentencepiece==0.1.86 4 | 5 | COPY ./src /code/bert 6 | WORKDIR /code/bert 7 | -------------------------------------------------------------------------------- /gobbli/augment/bert/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import BERTMaskedLM 2 | 3 | __all__ = ["BERTMaskedLM"] 4 | -------------------------------------------------------------------------------- /gobbli/augment/marian/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.3-cuda10.1-cudnn7-runtime 2 | 3 | RUN pip install transformers==2.9.1 sentencepiece==0.1.86 4 | 5 | COPY ./src /code/marian 6 | WORKDIR /code/marian 7 | -------------------------------------------------------------------------------- /gobbli/augment/marian/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import MarianMT 2 | 3 | __all__ = ["MarianMT"] 4 | -------------------------------------------------------------------------------- /gobbli/cli.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from pathlib import Path 3 | 4 | import click 5 | 6 | INTERACTIVE_DIR = Path(__file__).parent / "interactive" 7 | 8 | 9 | def _streamlit_run(app_name: str, *args): 10 | return subprocess.check_call( 11 | ["streamlit", "run", str(INTERACTIVE_DIR / f"{app_name}.py"), "--", *args] 12 | ) 13 | 14 | 15 | @click.group() 16 | def main(): 17 | pass 18 | 19 | 20 | @main.command( 21 | # Forward the --help argument to the streamlit apps 22 | "explore", 23 | context_settings=dict(ignore_unknown_options=True), 24 | add_help_option=False, 25 | ) 26 | @click.argument("args", nargs=-1, type=click.UNPROCESSED) 27 | def main_explore(args): 28 | _streamlit_run("explore", *args) 29 | 30 | 31 | @main.command( 32 | "evaluate", 33 | context_settings=dict(ignore_unknown_options=True), 34 | add_help_option=False, 35 | ) 36 | @click.argument("args", nargs=-1, type=click.UNPROCESSED) 37 | def main_evaluate(args): 38 | _streamlit_run("evaluate", *args) 39 | 40 | 41 | @main.command( 42 | "explain", context_settings=dict(ignore_unknown_options=True), add_help_option=False 43 | ) 44 | @click.argument("args", nargs=-1, type=click.UNPROCESSED) 45 | def main_explain(args): 46 | _streamlit_run("explain", *args) 47 | 48 | 49 | if __name__ == "__main__": 50 | main() 51 | -------------------------------------------------------------------------------- /gobbli/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from gobbli.dataset.cmu_movie_summary import MovieSummaryDataset 2 | from gobbli.dataset.imdb import IMDBDataset 3 | from gobbli.dataset.newsgroups import NewsgroupsDataset 4 | from gobbli.dataset.trivial import TrivialDataset 5 | 6 | __all__ = ["TrivialDataset", "NewsgroupsDataset", "IMDBDataset", "MovieSummaryDataset"] 7 | -------------------------------------------------------------------------------- /gobbli/dataset/imdb.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing 
import Set, Tuple 3 | 4 | from gobbli.dataset.nested_file import NestedFileDataset 5 | from gobbli.util import download_archive 6 | 7 | 8 | class IMDBDataset(NestedFileDataset): 9 | """ 10 | gobbli Dataset for the IMDB sentiment analysis problem. 11 | 12 | https://ai.stanford.edu/~amaas/data/sentiment/ 13 | """ 14 | 15 | def labels(self) -> Set[str]: 16 | return {"pos", "neg"} 17 | 18 | def download(self, data_dir: Path) -> Path: 19 | return download_archive( 20 | "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", data_dir 21 | ) 22 | 23 | def folders(self) -> Tuple[Path, Path]: 24 | return Path("aclImdb/train"), Path("aclImdb/test") 25 | 26 | def read_source_file(self, file_path: Path) -> str: 27 | return file_path.read_text() 28 | -------------------------------------------------------------------------------- /gobbli/dataset/newsgroups.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Set, Tuple 3 | 4 | from gobbli.dataset.nested_file import NestedFileDataset 5 | from gobbli.util import download_archive 6 | 7 | 8 | class NewsgroupsDataset(NestedFileDataset): 9 | """ 10 | gobbli Dataset for the 20 Newsgroups problem. 11 | 12 | http://qwone.com/~jason/20Newsgroups/ 13 | """ 14 | 15 | def labels(self) -> Set[str]: 16 | return { 17 | "alt.atheism", 18 | "comp.graphics", 19 | "comp.os.ms-windows.misc", 20 | "comp.sys.ibm.pc.hardware", 21 | "comp.sys.mac.hardware", 22 | "comp.windows.x", 23 | "misc.forsale", 24 | "rec.autos", 25 | "rec.motorcycles", 26 | "rec.sport.baseball", 27 | "rec.sport.hockey", 28 | "sci.crypt", 29 | "sci.electronics", 30 | "sci.med", 31 | "sci.space", 32 | "soc.religion.christian", 33 | "talk.politics.guns", 34 | "talk.politics.mideast", 35 | "talk.politics.misc", 36 | "talk.religion.misc", 37 | } 38 | 39 | def download(self, data_dir: Path) -> Path: 40 | return download_archive( 41 | "https://ndownloader.figshare.com/files/5975967", 42 | data_dir, 43 | filename="20news-bydate.tar.gz", 44 | ) 45 | 46 | def folders(self) -> Tuple[Path, Path]: 47 | return Path("20news-bydate-train"), Path("20news-bydate-test") 48 | 49 | def read_source_file(self, file_path: Path) -> str: 50 | return file_path.read_text(encoding="latin-1") 51 | -------------------------------------------------------------------------------- /gobbli/dataset/trivial.py: -------------------------------------------------------------------------------- 1 | from gobbli.dataset.base import BaseDataset 2 | 3 | 4 | class TrivialDataset(BaseDataset): 5 | """ 6 | gobbli Dataset containing only a few observations. 7 | Useful for verifying a model runs without waiting for an 8 | actual dataset to process. 
9 | """ 10 | 11 | DATASET = ["This is positive.", "This, although, is negative."] 12 | LABELS = ["1", "0"] 13 | 14 | def _is_built(self) -> bool: 15 | return True 16 | 17 | def _build(self): 18 | pass 19 | 20 | def X_train(self): 21 | return TrivialDataset.DATASET 22 | 23 | def y_train(self): 24 | return TrivialDataset.LABELS 25 | 26 | def X_test(self): 27 | return TrivialDataset.DATASET 28 | 29 | def y_test(self): 30 | return TrivialDataset.LABELS 31 | -------------------------------------------------------------------------------- /gobbli/experiment/__init__.py: -------------------------------------------------------------------------------- 1 | from gobbli.experiment.classification import ClassificationExperiment 2 | 3 | __all__ = ["ClassificationExperiment"] 4 | -------------------------------------------------------------------------------- /gobbli/inspect/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/inspect/__init__.py -------------------------------------------------------------------------------- /gobbli/model/__init__.py: -------------------------------------------------------------------------------- 1 | from gobbli.model.bert import BERT 2 | from gobbli.model.fasttext import FastText 3 | from gobbli.model.majority import MajorityClassifier 4 | from gobbli.model.mtdnn import MTDNN 5 | from gobbli.model.random import RandomEmbedder 6 | from gobbli.model.sklearn import SKLearnClassifier, TfidfEmbedder 7 | from gobbli.model.spacy import SpaCyModel 8 | from gobbli.model.transformer import Transformer 9 | from gobbli.model.use import USE 10 | 11 | __all__ = [ 12 | "BERT", 13 | "FastText", 14 | "MajorityClassifier", 15 | "MTDNN", 16 | "RandomEmbedder", 17 | "Transformer", 18 | "SKLearnClassifier", 19 | "SpaCyModel", 20 | "USE", 21 | "TfidfEmbedder", 22 | ] 23 | -------------------------------------------------------------------------------- /gobbli/model/bert/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG GPU 2 | FROM tensorflow/tensorflow:1.11.0${GPU:+-gpu}-py3 3 | 4 | RUN apt-get update && apt-get install -y --no-install-recommends \ 5 | git \ 6 | && apt-get clean \ 7 | && rm -rf /var/lib/apt/lists/* 8 | 9 | # Copy modified source code in 10 | # Base commit: d66a146741588fb208450bde15aa7db143baaa69 11 | COPY ./src /code/bert 12 | WORKDIR /code/bert 13 | -------------------------------------------------------------------------------- /gobbli/model/bert/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import BERT 2 | 3 | __all__ = ["BERT"] 4 | -------------------------------------------------------------------------------- /gobbli/model/bert/src/.gitignore: -------------------------------------------------------------------------------- 1 | # Initially taken from Github's Python gitignore file 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to 
inject date/other info into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | .dmypy.json 113 | dmypy.json 114 | 115 | # Pyre type checker 116 | .pyre/ 117 | -------------------------------------------------------------------------------- /gobbli/model/bert/src/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | BERT needs to maintain permanent compatibility with the pre-trained model files, 4 | so we do not plan to make any major changes to this library (other than what was 5 | promised in the README). However, we can accept small patches related to 6 | refactoring and documentation. To submit contributions, there are just a few 7 | small guidelines you need to follow. 8 | 9 | ## Contributor License Agreement 10 | 11 | Contributions to this project must be accompanied by a Contributor License 12 | Agreement. You (or your employer) retain the copyright to your contribution; 13 | this simply gives us permission to use and redistribute your contributions as 14 | part of the project. Head over to https://cla.developers.google.com/ to see 15 | your current agreements on file or to sign a new one. 16 | 17 | You generally only need to submit a CLA once, so if you've already submitted one 18 | (even if it was for a different project), you probably don't need to do it 19 | again. 20 | 21 | ## Code reviews 22 | 23 | All submissions, including submissions by project members, require review. We 24 | use GitHub pull requests for this purpose. Consult 25 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 26 | information on using pull requests. 27 | 28 | ## Community Guidelines 29 | 30 | This project follows 31 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/). 32 | -------------------------------------------------------------------------------- /gobbli/model/bert/src/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /gobbli/model/bert/src/optimization_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import optimization 20 | import tensorflow as tf 21 | 22 | 23 | class OptimizationTest(tf.test.TestCase): 24 | 25 | def test_adam(self): 26 | with self.test_session() as sess: 27 | w = tf.get_variable( 28 | "w", 29 | shape=[3], 30 | initializer=tf.constant_initializer([0.1, -0.2, -0.1])) 31 | x = tf.constant([0.4, 0.2, -0.5]) 32 | loss = tf.reduce_mean(tf.square(x - w)) 33 | tvars = tf.trainable_variables() 34 | grads = tf.gradients(loss, tvars) 35 | global_step = tf.train.get_or_create_global_step() 36 | optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2) 37 | train_op = optimizer.apply_gradients(zip(grads, tvars), global_step) 38 | init_op = tf.group(tf.global_variables_initializer(), 39 | tf.local_variables_initializer()) 40 | sess.run(init_op) 41 | for _ in range(100): 42 | sess.run(train_op) 43 | w_np = sess.run(w) 44 | self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2) 45 | 46 | 47 | if __name__ == "__main__": 48 | tf.test.main() 49 | -------------------------------------------------------------------------------- /gobbli/model/bert/src/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow >= 1.11.0 # CPU Version of TensorFlow. 2 | # tensorflow-gpu >= 1.11.0 # GPU version of TensorFlow. 
3 | -------------------------------------------------------------------------------- /gobbli/model/fasttext/Dockerfile: -------------------------------------------------------------------------------- 1 | # Build stage to compile the binary 2 | FROM ubuntu:18.04 3 | 4 | RUN apt-get update && apt-get install -y \ 5 | build-essential \ 6 | wget \ 7 | git \ 8 | python-dev \ 9 | unzip \ 10 | python-numpy \ 11 | python-scipy \ 12 | && rm -rf /var/cache/apk/* 13 | 14 | WORKDIR /code 15 | 16 | RUN git clone https://github.com/facebookresearch/fastText.git /code \ 17 | && cd /code \ 18 | && git checkout 5e1320a1594a026a081f8b1e5caa3085a711a625 \ 19 | && rm -rf .git* \ 20 | && make 21 | 22 | # Final slim image containing just the binary 23 | FROM ubuntu:18.04 24 | 25 | WORKDIR /code 26 | COPY --from=0 /code/fasttext . 27 | ENTRYPOINT ["./fasttext"] 28 | CMD ["help"] 29 | -------------------------------------------------------------------------------- /gobbli/model/fasttext/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import FastText, FastTextCheckpoint 2 | 3 | __all__ = ["FastText", "FastTextCheckpoint"] 4 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM allenlao/pytorch-mt-dnn:v0.1 2 | 3 | # Copy modified source code in 4 | # Base commit: a7f74e0afcffd17ab68fb752fa1cc06eabaacda3 5 | COPY ./src /code/mt-dnn 6 | WORKDIR /code/mt-dnn 7 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import MTDNN 2 | 3 | __all__ = ["MTDNN"] 4 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Microsoft 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
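gobbli/model/bert/src/optimization_test.py above verifies BERT's custom AdamWeightDecayOptimizer by fitting a three-element weight vector to a constant target and asserting convergence within a loose tolerance. A minimal sketch of the same smoke test written against torch.optim.AdamW (an illustrative port under the assumption that PyTorch is available; this code is not part of gobbli or BERT, which run the TensorFlow 1.x version above):

# Mirror OptimizationTest.test_adam: drive w toward the constant target x.
import torch

w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
x = torch.tensor([0.4, 0.2, -0.5])
# weight_decay=0.0 makes AdamW behave like plain Adam, matching the original test's intent.
optimizer = torch.optim.AdamW([w], lr=0.2, weight_decay=0.0)
for _ in range(100):
    optimizer.zero_grad()
    loss = torch.mean((x - w) ** 2)
    loss.backward()
    optimizer.step()
# Same tolerance as the TF1 test above.
assert torch.allclose(w, x, rtol=1e-2, atol=1e-2)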
-------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/config/tasks_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "mnli": 0.3, 3 | "cola": 0.05 4 | } 5 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/data_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/model/mtdnn/src/data_utils/__init__.py -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/data_utils/log_wrapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 2 | import logging 3 | from time import gmtime, strftime 4 | import sys 5 | 6 | def create_logger(name, silent=False, to_disk=False, log_file=None): 7 | """Logger wrapper 8 | """ 9 | # setup logger 10 | log = logging.getLogger(name) 11 | log.setLevel(logging.DEBUG) 12 | log.propagate = False 13 | formatter = logging.Formatter(fmt='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S') 14 | if not silent: 15 | ch = logging.StreamHandler(sys.stdout) 16 | ch.setLevel(logging.INFO) 17 | ch.setFormatter(formatter) 18 | log.addHandler(ch) 19 | if to_disk: 20 | log_file = log_file if log_file is not None else strftime("%Y-%m-%d-%H-%M-%S.log", gmtime()) 21 | fh = logging.FileHandler(log_file) 22 | fh.setLevel(logging.DEBUG) 23 | fh.setFormatter(formatter) 24 | log.addHandler(fh) 25 | return log 26 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/data_utils/metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 2 | from sklearn.metrics import matthews_corrcoef 3 | from sklearn.metrics import accuracy_score, f1_score 4 | from scipy.stats import pearsonr, spearmanr 5 | from torch.nn.functional import cross_entropy 6 | 7 | def compute_acc(predicts, labels): 8 | return 100.0 * accuracy_score(labels, predicts) 9 | 10 | def compute_f1(predicts, labels): 11 | return 100.0 * f1_score(labels, predicts) 12 | 13 | def compute_mcc(predicts, labels): 14 | return 100.0 * matthews_corrcoef(labels, predicts) 15 | 16 | def compute_pearson(predicts, labels): 17 | pcof = pearsonr(labels, predicts)[0] 18 | return 100.0 * pcof 19 | 20 | def compute_spearman(predicts, labels): 21 | scof = spearmanr(labels, predicts)[0] 22 | return 100.0 * scof 23 | 24 | def compute_cross_entropy(predicts, labels): 25 | return cross_entropy(predicts, labels) 26 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/data_utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 
2 | import random 3 | import torch 4 | import numpy 5 | from torch.autograd import Variable 6 | import subprocess 7 | 8 | class AverageMeter(object): 9 | """Computes and stores the average and current value.""" 10 | def __init__(self): 11 | self.reset() 12 | 13 | def reset(self): 14 | self.val = 0 15 | self.avg = 0 16 | self.sum = 0 17 | self.count = 0 18 | 19 | def update(self, val, n=1): 20 | self.val = val 21 | self.sum += val * n 22 | self.count += n 23 | self.avg = self.sum / self.count 24 | 25 | def set_environment(seed, set_cuda=False): 26 | random.seed(seed) 27 | numpy.random.seed(seed) 28 | torch.manual_seed(seed) 29 | if torch.cuda.is_available() and set_cuda: 30 | torch.cuda.manual_seed_all(seed) 31 | 32 | def patch_var(v, cuda=True): 33 | if cuda: 34 | v = Variable(v.cuda(async=True)) 35 | else: 36 | v = Variable(v) 37 | return v 38 | 39 | def get_gpu_memory_map(): 40 | result = subprocess.check_output( 41 | [ 42 | 'nvidia-smi', '--query-gpu=memory.used', 43 | '--format=csv,nounits,noheader' 44 | ], encoding='utf-8') 45 | gpu_memory = [int(x) for x in result.strip().split('\n')] 46 | gpu_memory_map = dict(zip(range(len(gpu_memory)), gpu_memory)) 47 | return gpu_memory_map 48 | 49 | def get_pip_env(): 50 | result = subprocess.call(["pip", "freeze"]) 51 | return result 52 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04 2 | RUN apt-get clean && apt-get update && apt-get install -y locales 3 | ENV LANG="en_US.UTF-8" LC_ALL="en_US.UTF-8" LANGUAGE="en_US.UTF-8" LC_TYPE="en_US.UTF-8" TERM=xterm-256color 4 | RUN locale-gen en_US en_US.UTF-8 5 | RUN apt-get update && apt-get install -y --no-install-recommends \ 6 | build-essential \ 7 | cmake \ 8 | git \ 9 | curl \ 10 | vim \ 11 | zip \ 12 | wget \ 13 | unzip \ 14 | ca-certificates \ 15 | libjpeg-dev \ 16 | libpng-dev &&\ 17 | rm -rf /var/lib/apt/lists/* 18 | 19 | 20 | ENV PYTHON_VERSION=3.6 21 | 22 | RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 23 | chmod +x ~/miniconda.sh && \ 24 | ~/miniconda.sh -b -p /opt/conda && \ 25 | rm ~/miniconda.sh && \ 26 | /opt/conda/bin/conda create -y --name pytorch-py$PYTHON_VERSION python=$PYTHON_VERSION numpy=1.14.5 scipy ipython mkl&& \ 27 | /opt/conda/bin/conda clean -ya 28 | 29 | ENV PATH /opt/conda/envs/pytorch-py$PYTHON_VERSION/bin:$PATH 30 | 31 | RUN /opt/conda/bin/conda install --name pytorch-py$PYTHON_VERSION cuda90 pytorch=0.4.1 torchvision -c pytorch && \ 32 | /opt/conda/bin/conda clean -ya 33 | RUN pip install --upgrade pip 34 | RUN pip install tensorboard_logger 35 | RUN pip install tqdm 36 | RUN pip install h5py==2.7.1 37 | RUN pip install boto3 38 | RUN pip install -U scikit-learn 39 | # install pytorch bert 40 | RUN pip install pytorch-pretrained-bert==v0.6.0 41 | 42 | # GLUE baseline dependencies 43 | RUN pip install nltk 44 | RUN pip install allennlp==0.4 45 | RUN pip install ipdb 46 | RUN pip install tensorboardX 47 | 48 | WORKDIR /root 49 | #COPY requirements.txt /root/ 50 | #RUN pip install -r requirements.txt 51 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ############################################################## 3 | # This script 
is used to download resources for MT-DNN experiments 4 | ############################################################## 5 | 6 | DATA_DIR=$(pwd)/data 7 | echo "Create a folder $DATA_DIR" 8 | mkdir ${DATA_DIR} 9 | 10 | BERT_DIR=$(pwd)/mt_dnn_models 11 | echo "Create a folder $BERT_DIR" 12 | mkdir ${BERT_DIR} 13 | 14 | ## DOWNLOAD GLUE DATA 15 | ## Please refer to the glue-baseline install requirements if you run into issues. 16 | git clone https://github.com/jsalt18-sentence-repl/jiant.git 17 | cd jiant 18 | python scripts/download_glue_data.py --data_dir $DATA_DIR --tasks all 19 | 20 | cd .. 21 | rm -rf jiant 22 | ######################### 23 | 24 | ## DOWNLOAD SciTail 25 | cd $DATA_DIR 26 | wget http://data.allenai.org.s3.amazonaws.com/downloads/SciTailV1.1.zip 27 | unzip SciTailV1.1.zip 28 | mv SciTailV1.1 SciTail 29 | # remove zip files 30 | rm *.zip 31 | 32 | cd ${BERT_DIR} 33 | ## DOWNLOAD BERT 34 | wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip -O "uncased_bert_base.zip" 35 | unzip uncased_bert_base.zip 36 | mv uncased_L-12_H-768_A-12/vocab.txt "${BERT_DIR}/" 37 | rm *.zip 38 | rm -rf uncased_L-12_H-768_A-12 39 | 40 | ## Download bert models 41 | wget https://mrc.blob.core.windows.net/mt-dnn-model/bert_model_base_v2.pt -O "${BERT_DIR}/bert_model_base.pt" 42 | wget https://mrc.blob.core.windows.net/mt-dnn-model/bert_model_large_v2.pt -O "${BERT_DIR}/bert_model_large.pt" 43 | wget https://mrc.blob.core.windows.net/mt-dnn-model/mt_dnn_base.pt -O "${BERT_DIR}/mt_dnn_base.pt" 44 | wget https://mrc.blob.core.windows.net/mt-dnn-model/mt_dnn_large.pt -O "${BERT_DIR}/mt_dnn_large.pt" 45 | 46 | ## Download preprocessed SciTail/SNLI data for domain adaptation 47 | cd $DATA_DIR 48 | DOMAIN_ADP="domain_adaptation" 49 | echo "Create a folder $DOMAIN_ADP" 50 | mkdir ${DOMAIN_ADP} 51 | 52 | wget https://mrc.blob.core.windows.net/mt-dnn-model/data.zip 53 | unzip data.zip 54 | mv data/* ${DOMAIN_ADP} 55 | rm -rf data.zip 56 | rm -rf data 57 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/model/mtdnn/src/module/__init__.py -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/module/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 2 | import torch 3 | import math 4 | from torch.nn.functional import tanh, relu, prelu, leaky_relu, sigmoid, elu, selu 5 | 6 | def linear(x): 7 | return x 8 | 9 | def swish(x): 10 | return x * sigmoid(x) 11 | 12 | def gelu(x): 13 | """ref:https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L113 14 | """ 15 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 16 | 17 | def activation(func_a): 18 | """Activation function wrapper 19 | """ 20 | try: 21 | f = eval(func_a) 22 | except: 23 | f = linear 24 | return f 25 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/module/dropout_wrapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved.
2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | class DropoutWrapper(nn.Module): 8 | """ 9 | This is a dropout wrapper which supports the fix mask dropout 10 | """ 11 | def __init__(self, dropout_p=0, enable_vbp=True): 12 | super(DropoutWrapper, self).__init__() 13 | """variational dropout means fix dropout mask 14 | ref: https://discuss.pytorch.org/t/dropout-for-rnns/633/11 15 | """ 16 | self.enable_variational_dropout = enable_vbp 17 | self.dropout_p = dropout_p 18 | 19 | def forward(self, x): 20 | """ 21 | :param x: batch * len * input_size 22 | """ 23 | if self.training == False or self.dropout_p == 0: 24 | return x 25 | 26 | if len(x.size()) == 3: 27 | mask = Variable(1.0 / (1-self.dropout_p) * torch.bernoulli((1-self.dropout_p) * (x.data.new(x.size(0), x.size(2)).zero_() + 1)), requires_grad=False) 28 | return mask.unsqueeze(1).expand_as(x) * x 29 | else: 30 | return F.dropout(x, p=self.dropout_p, training=self.training) 31 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/module/sub_layers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft. All rights reserved. 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.nn.parameter import Parameter 6 | 7 | class LayerNorm(nn.Module): 8 | #ref: https://github.com/pytorch/pytorch/issues/1959 9 | # :https://arxiv.org/pdf/1607.06450.pdf 10 | def __init__(self, hidden_size, eps=1e-4): 11 | super(LayerNorm, self).__init__() 12 | self.alpha = Parameter(torch.ones(1,1,hidden_size)) # gain g 13 | self.beta = Parameter(torch.zeros(1,1,hidden_size)) # bias b 14 | self.eps = eps 15 | 16 | def forward(self, x): 17 | """ 18 | Args: 19 | :param x: batch * len * input_size 20 | 21 | Returns: 22 | normalized x 23 | """ 24 | mu = torch.mean(x, 2, keepdim=True).expand_as(x) 25 | sigma = torch.std(x, 2, keepdim=True).expand_as(x) 26 | return (x - mu) / (sigma + self.eps) * self.alpha.expand_as(x) + self.beta.expand_as(x) 27 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/mt_dnn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/model/mtdnn/src/mt_dnn/__init__.py -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/mt_dnn/gobbli_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | from .model import MTDNNModel 8 | 9 | 10 | class GobbliMTDNNModel(MTDNNModel): 11 | def update(self, input_ids, token_type_ids, attention_mask, labels): 12 | self.network.train() 13 | if self.config['cuda']: 14 | labels = labels.cuda(async=True) 15 | 16 | y = Variable(labels, requires_grad=False) 17 | logits = self.mnetwork(input_ids, token_type_ids, attention_mask, task_id=0) 18 | loss = F.cross_entropy(logits, y) 19 | 20 | self.train_loss.update(loss.item(), logits.size(0)) 21 | self.optimizer.zero_grad() 22 | 23 | loss.backward() 24 | if self.config['global_grad_clipping'] > 0: 25 | torch.nn.utils.clip_grad_norm_(self.network.parameters(), 26 | self.config['global_grad_clipping']) 27 | 28 | self.optimizer.step() 29 | 
self.updates += 1 30 | self.update_ema() 31 | 32 | def predict(self, input_ids, token_type_ids, attention_mask): 33 | self.network.eval() 34 | score = self.mnetwork(input_ids, token_type_ids, attention_mask, task_id=0) 35 | score = F.softmax(score, dim=1).data.cpu() 36 | predict = np.argmax(score.numpy(), axis=1).tolist() 37 | return score, predict 38 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | torch==0.4.1 3 | tqdm 4 | colorlog 5 | boto3 6 | pytorch-pretrained-bert==v0.6.0 7 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/run_toy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ############################### 4 | # Training a mt-dnn model 5 | # Note that this is a toy setting; please refer to our paper for detailed hyper-parameters. 6 | ############################### 7 | 8 | python prepro.py 9 | python train.py -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/scripts/domain_adaptation_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [[ $# -ne 8 ]]; then 3 | echo "train.sh <prefix> <bert_path> <train_datasets> <test_datasets> <data_dir> <model_root> <batch_size> <gpu>" 4 | exit 1 5 | fi 6 | prefix=$1 7 | BERT_PATH=$2 8 | train_datasets=$3 9 | test_datasets=$4 10 | DATA_DIR=$5 11 | MODEL_ROOT=$6 12 | BATCH_SIZE=$7 13 | gpu=$8 14 | echo "export CUDA_VISIBLE_DEVICES=${gpu}" 15 | export CUDA_VISIBLE_DEVICES=${gpu} 16 | tstr=$(date +"%FT%H%M") 17 | 18 | answer_opt=0 19 | optim="adamax" 20 | grad_clipping=0 21 | global_grad_clipping=1 22 | 23 | model_dir="checkpoints/${prefix}_${optim}_answer_opt${answer_opt}_gc${grad_clipping}_ggc${global_grad_clipping}_${tstr}" 24 | log_file="${model_dir}/log.log" 25 | python ../train.py --data_dir ${DATA_DIR} --init_checkpoint ${BERT_PATH} --batch_size ${BATCH_SIZE} --output_dir ${model_dir} --log_file ${log_file} --answer_opt ${answer_opt} --optimizer ${optim} --train_datasets ${train_datasets} --test_datasets ${test_datasets} --grad_clipping ${grad_clipping} --global_grad_clipping ${global_grad_clipping} --multi_gpu_on 26 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/scripts/run_mt_dnn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [[ $# -ne 2 ]]; then 3 | echo "train.sh <batch_size> <gpu>" 4 | exit 1 5 | fi 6 | prefix="mt-dnn-rte" 7 | BATCH_SIZE=$1 8 | gpu=$2 9 | echo "export CUDA_VISIBLE_DEVICES=${gpu}" 10 | export CUDA_VISIBLE_DEVICES=${gpu} 11 | tstr=$(date +"%FT%H%M") 12 | 13 | train_datasets="mnli,rte,qqp,qnli,mrpc,sst,cola,stsb" 14 | test_datasets="mnli_matched,mnli_mismatched,rte" 15 | MODEL_ROOT="checkpoints" 16 | BERT_PATH="../mt_dnn_models/bert_model_large.pt" 17 | DATA_DIR="../data/mt_dnn" 18 | 19 | answer_opt=1 20 | optim="adamax" 21 | grad_clipping=0 22 | global_grad_clipping=1 23 | lr="5e-5" 24 | 25 | model_dir="checkpoints/${prefix}_${optim}_answer_opt${answer_opt}_gc${grad_clipping}_ggc${global_grad_clipping}_${tstr}" 26 | log_file="${model_dir}/log.log" 27 | python ../train.py --data_dir ${DATA_DIR} --init_checkpoint ${BERT_PATH} --batch_size ${BATCH_SIZE} --output_dir ${model_dir} --log_file ${log_file} --answer_opt ${answer_opt} --optimizer ${optim} --train_datasets ${train_datasets} --test_datasets
${test_datasets} --grad_clipping ${grad_clipping} --global_grad_clipping ${global_grad_clipping} --learning_rate ${lr} --multi_gpu_on 28 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/scripts/run_rte.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [[ $# -ne 2 ]]; then 3 | echo "train.sh <batch_size> <gpu>" 4 | exit 1 5 | fi 6 | prefix="mt-dnn-rte" 7 | BATCH_SIZE=$1 8 | gpu=$2 9 | echo "export CUDA_VISIBLE_DEVICES=${gpu}" 10 | export CUDA_VISIBLE_DEVICES=${gpu} 11 | tstr=$(date +"%FT%H%M") 12 | 13 | train_datasets="rte" 14 | test_datasets="rte" 15 | MODEL_ROOT="checkpoints" 16 | BERT_PATH="../mt_dnn_models/mt_dnn_large.pt" 17 | DATA_DIR="../data/mt_dnn" 18 | 19 | answer_opt=0 20 | optim="adamax" 21 | grad_clipping=0 22 | global_grad_clipping=1 23 | lr="2e-5" 24 | 25 | model_dir="checkpoints/${prefix}_${optim}_answer_opt${answer_opt}_gc${grad_clipping}_ggc${global_grad_clipping}_${tstr}" 26 | log_file="${model_dir}/log.log" 27 | python ../train.py --data_dir ${DATA_DIR} --init_checkpoint ${BERT_PATH} --batch_size ${BATCH_SIZE} --output_dir ${model_dir} --log_file ${log_file} --answer_opt ${answer_opt} --optimizer ${optim} --train_datasets ${train_datasets} --test_datasets ${test_datasets} --grad_clipping ${grad_clipping} --global_grad_clipping ${global_grad_clipping} --learning_rate ${lr} 28 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/scripts/run_stsb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [[ $# -ne 2 ]]; then 3 | echo "train.sh <batch_size> <gpu>" 4 | exit 1 5 | fi 6 | prefix="mt-dnn-stsb" 7 | BATCH_SIZE=$1 8 | gpu=$2 9 | echo "export CUDA_VISIBLE_DEVICES=${gpu}" 10 | export CUDA_VISIBLE_DEVICES=${gpu} 11 | tstr=$(date +"%FT%H%M") 12 | 13 | train_datasets="stsb" 14 | test_datasets="stsb" 15 | MODEL_ROOT="checkpoints" 16 | BERT_PATH="../mt_dnn_models/mt_dnn_large.pt" 17 | DATA_DIR="../data/mt_dnn" 18 | 19 | answer_opt=0 20 | optim="adamax" 21 | grad_clipping=0 22 | global_grad_clipping=1 23 | 24 | model_dir="checkpoints/${prefix}_${optim}_answer_opt${answer_opt}_gc${grad_clipping}_ggc${global_grad_clipping}_${tstr}" 25 | log_file="${model_dir}/log.log" 26 | python ../train.py --data_dir ${DATA_DIR} --init_checkpoint ${BERT_PATH} --batch_size ${BATCH_SIZE} --output_dir ${model_dir} --log_file ${log_file} --answer_opt ${answer_opt} --optimizer ${optim} --train_datasets ${train_datasets} --test_datasets ${test_datasets} --grad_clipping ${grad_clipping} --global_grad_clipping ${global_grad_clipping} 27 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/scripts/scitail_domain_adaptation_bash.sh: -------------------------------------------------------------------------------- 1 | # 2 v100 2 | ./domain_adaptation_run.sh scitail_001_tl ../mt_dnn_models/mt_dnn_base.pt scitail_001 scitail ../data/domain_adaptation ../checkpoints 32 0,1 |tee scitail_001_tl.log 3 | ./domain_adaptation_run.sh scitail_01_tl ../mt_dnn_models/mt_dnn_base.pt scitail_01 scitail ../data/domain_adaptation ../checkpoints 32 0,1 |tee scitail_01_tl.log 4 | ./domain_adaptation_run.sh scitail_1_tl ../mt_dnn_models/mt_dnn_base.pt scitail_1 scitail ../data/domain_adaptation ../checkpoints 32 0,1 |tee scitail_1_tl.log 5 | ./domain_adaptation_run.sh scitail_full_tl ../mt_dnn_models/mt_dnn_base.pt scitail scitail ../data/domain_adaptation ../checkpoints 32 0,1 |tee
scitail_full_tl.log 6 | -------------------------------------------------------------------------------- /gobbli/model/mtdnn/src/scripts/snli_domain_adaptation_bash.sh: -------------------------------------------------------------------------------- 1 | # 2 v100 2 | ./domain_adaptation_run.sh snli_001_tl ../mt_dnn_models/mt_dnn_base.pt snli_001 snli ../data/domain_adaptation ../checkpoints 32 0,1 |tee snli_001_tl.log 3 | ./domain_adaptation_run.sh snli_01_tl ../mt_dnn_models/mt_dnn_base.pt snli_01 snli ../data/domain_adaptation ../checkpoints 32 0,1 |tee snli_01_tl.log 4 | ./domain_adaptation_run.sh snli_1_tl ../mt_dnn_models/mt_dnn_base.pt snli_1 snli ../data/domain_adaptation ../checkpoints 32 0,1 |tee snli_1_tl.log 5 | ./domain_adaptation_run.sh snli_full_tl ../mt_dnn_models/mt_dnn_base.pt snli snli ../data/domain_adaptation ../checkpoints 32 0,1 |tee snli_full_tl.log 6 | -------------------------------------------------------------------------------- /gobbli/model/sklearn/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import ( 2 | SKLearnClassifier, 3 | TfidfEmbedder, 4 | make_cv_tfidf_logistic_regression, 5 | make_default_tfidf_logistic_regression, 6 | persist_estimator, 7 | ) 8 | 9 | __all__ = [ 10 | "SKLearnClassifier", 11 | "TfidfEmbedder", 12 | "persist_estimator", 13 | "make_cv_tfidf_logistic_regression", 14 | "make_default_tfidf_logistic_regression", 15 | ] 16 | -------------------------------------------------------------------------------- /gobbli/model/spacy/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.3-cuda10.1-cudnn7-devel 2 | 3 | COPY ./src/requirements.txt /tmp/requirements.txt 4 | RUN pip install -r /tmp/requirements.txt 5 | 6 | COPY ./src /code/spacy 7 | WORKDIR /code/spacy 8 | 9 | ARG model 10 | RUN python -m spacy download ${model} 11 | -------------------------------------------------------------------------------- /gobbli/model/spacy/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import SpaCyModel 2 | 3 | __all__ = ["SpaCyModel"] 4 | -------------------------------------------------------------------------------- /gobbli/model/spacy/src/requirements.txt: -------------------------------------------------------------------------------- 1 | # These are additional requirements needed on top of the PyTorch image 2 | pandas==0.25.0 3 | # Use 2.2.1 to work around this issue: 4 | # https://github.com/explosion/spacy-transformers/issues/105 5 | # Can upgrade when this PR is merged: 6 | # https://github.com/explosion/spacy-transformers/pull/120 7 | spacy==2.2.1 8 | spacy-transformers==0.5.1 9 | # Resolve nested package version conflicts 10 | sentencepiece==0.1.86 11 | urllib3>=1.25.4,<1.27 12 | requests==2.25.1 13 | 14 | # We're using the PyTorch image with CUDA 10.1, but spaCy doesn't have an extra 15 | # requirements specifier for CUDA 10.1 at the time of this writing (it only has 10.0). 16 | # We could use the "cuda" extra requirements specifier, but it results in spaCy 17 | # requiring the source distribution of cupy, which can't be compiled in a container 18 | # without the NVIDIA runtime (which would require us to have separate images for GPU 19 | # and no-GPU). So, we manually install the spaCy GPU dependencies so we get 20 | # wheels compatible with CUDA 10.1. 
21 | cupy-cuda101==7.0.0 22 | thinc_gpu_ops==0.0.4 23 | -------------------------------------------------------------------------------- /gobbli/model/transformer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.3-cuda10.1-cudnn7-runtime 2 | 3 | COPY ./src/requirements.txt /tmp/requirements.txt 4 | RUN pip install -r /tmp/requirements.txt 5 | 6 | COPY ./src /code/transformer 7 | WORKDIR /code/transformer 8 | -------------------------------------------------------------------------------- /gobbli/model/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import Transformer 2 | 3 | __all__ = ["Transformer"] 4 | -------------------------------------------------------------------------------- /gobbli/model/transformer/src/requirements.txt: -------------------------------------------------------------------------------- 1 | # These are additional requirements needed on top of the pytorch image 2 | pandas==0.25.0 3 | transformers==2.8.0 4 | sentencepiece==0.1.86 5 | -------------------------------------------------------------------------------- /gobbli/model/use/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG GPU 2 | FROM tensorflow/tensorflow:2.0.1${GPU:+-gpu}-py3 3 | 4 | WORKDIR /code/use 5 | COPY ./src/requirements.txt ./ 6 | RUN pip install -r requirements.txt 7 | 8 | COPY ./src/ ./ 9 | -------------------------------------------------------------------------------- /gobbli/model/use/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import USE 2 | 3 | __all__ = ["USE"] 4 | -------------------------------------------------------------------------------- /gobbli/model/use/src/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow-hub==0.7.0 2 | -------------------------------------------------------------------------------- /gobbli/model/use/src/use.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import tensorflow_hub as hub 5 | 6 | 7 | def read_texts(input_file): 8 | with open(input_file, "r", encoding="utf-8") as f: 9 | return f.readlines() 10 | 11 | 12 | def make_batches(l, batch_size): 13 | for i in range(0, len(l), batch_size): 14 | yield l[i : i + batch_size] 15 | 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser() 19 | 20 | parser.add_argument( 21 | "--input-file", 22 | required=True, 23 | help="Path to the file containing input texts, one per line.", 24 | ) 25 | parser.add_argument( 26 | "--output-file", 27 | required=True, 28 | help="Path to write computed embeddings to (JSON format).", 29 | ) 30 | parser.add_argument( 31 | "--module-dir", 32 | required=True, 33 | help="Path to the downloaded/extracted TFHub Module for USE.", 34 | ) 35 | parser.add_argument( 36 | "--batch-size", 37 | default=32, 38 | type=int, 39 | help="Number of texts to embed at once. 
Default: %(default)s", 40 | ) 41 | 42 | args = parser.parse_args() 43 | 44 | embed = hub.load(args.module_dir) 45 | texts = read_texts(args.input_file) 46 | 47 | with open(args.output_file, "w") as f: 48 | for batch in make_batches(texts, args.batch_size): 49 | embeddings = embed(batch).numpy() 50 | for embedding in embeddings.tolist(): 51 | json.dump(embedding, f) 52 | f.write("\n") 53 | -------------------------------------------------------------------------------- /gobbli/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/test/__init__.py -------------------------------------------------------------------------------- /gobbli/test/augment/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/test/augment/__init__.py -------------------------------------------------------------------------------- /gobbli/test/augment/test_bertmaskedlm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gobbli.augment.bert import BERTMaskedLM 4 | from gobbli.test.util import model_test_dir 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "params,exception", 9 | [ 10 | # Unknown param 11 | ({"unknown": None}, ValueError), 12 | # Bad type (diversity) 13 | ({"diversity": 2}, TypeError), 14 | # Bad type (batch size) 15 | ({"batch_size": 2.5}, TypeError), 16 | # Bad type (n_probable) 17 | ({"n_probable": 2.5}, TypeError), 18 | # Bad value (diversity) 19 | ({"diversity": 0.0}, ValueError), 20 | # Bad value (batch_size) 21 | ({"batch_size": 0}, ValueError), 22 | # Bad value (n_probable) 23 | ({"n_probable": 0}, ValueError), 24 | # OK values 25 | ({"diversity": 0.5, "n_probable": 3, "batch_size": 16}, None), 26 | ], 27 | ) 28 | def test_init(params, exception): 29 | if exception is None: 30 | BERTMaskedLM(**params) 31 | else: 32 | with pytest.raises(exception): 33 | BERTMaskedLM(**params) 34 | 35 | 36 | def test_bertmaskedlm_augment(model_gpu_config, gobbli_dir): 37 | model = BERTMaskedLM( 38 | data_dir=model_test_dir(BERTMaskedLM), load_existing=True, **model_gpu_config 39 | ) 40 | model.build() 41 | 42 | times = 5 43 | new_texts = model.augment(["This is a test."], times=times) 44 | assert len(new_texts) == times 45 | -------------------------------------------------------------------------------- /gobbli/test/augment/test_marian.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gobbli.augment.marian import MarianMT 4 | from gobbli.test.util import model_test_dir 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "params,exception", 9 | [ 10 | # Unknown param 11 | ({"unknown": None}, ValueError), 12 | # Bad type (batch_size) 13 | ({"batch_size": 2.5}, TypeError), 14 | # Bad type (target_languages) 15 | ({"target_languages": "english"}, TypeError), 16 | # Bad value (batch_size) 17 | ({"batch_size": 0}, ValueError), 18 | # Bad value (target_languages) 19 | ({"target_languages": ["not a language"]}, ValueError), 20 | # Bad value, one OK value (target_languages) 21 | ({"target_languages": ["french", "not a language"]}, ValueError), 22 | # OK values 23 | ({"batch_size": 16, "target_languages": ["russian", "french"]}, None), 24 | ], 25 | ) 26 | def test_init(params, exception): 27 | if exception is None: 28 | 
MarianMT(**params) 29 | else: 30 | with pytest.raises(exception): 31 | MarianMT(**params) 32 | 33 | 34 | def test_marianmt_augment(model_gpu_config, gobbli_dir): 35 | # Don't go overboard with the languages here, since each 36 | # one requires a separate model (few hundred MB) to be downloaded 37 | target_languages = ["russian", "french"] 38 | model = MarianMT( 39 | data_dir=model_test_dir(MarianMT), 40 | load_existing=True, 41 | target_languages=target_languages, 42 | **model_gpu_config, 43 | ) 44 | model.build() 45 | 46 | # Can't augment more times than target languages 47 | invalid_num_times = len(target_languages) + 1 48 | with pytest.raises(ValueError): 49 | model.augment(["This is a test."], times=invalid_num_times) 50 | 51 | valid_num_times = len(target_languages) 52 | new_texts = model.augment(["This is a test."], times=valid_num_times) 53 | assert len(new_texts) == valid_num_times 54 | -------------------------------------------------------------------------------- /gobbli/test/augment/test_wordnet.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from spacy.lang.en import English 3 | 4 | from gobbli.augment.wordnet import WordNet, _detokenize_doc 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "text", 9 | [ 10 | "This is a test.", 11 | "Test with double space.", 12 | "Test-with hyphen.", 13 | "Testing some 1 2 3 numbers.", 14 | ], 15 | ) 16 | def test_detokenize_doc(text): 17 | # Initialize the spaCy extension needed to detokenize text 18 | WordNet() 19 | 20 | nlp = English() 21 | doc = nlp(text) 22 | 23 | # Fill out the replacement attribute as WordNet would. 24 | for tok in doc: 25 | tok._.replacement = tok.text 26 | assert _detokenize_doc(doc) == text 27 | 28 | 29 | def test_wordnet_augment(): 30 | wn = WordNet() 31 | times = 5 32 | new_texts = wn.augment(["This is a test."], times=times) 33 | assert len(new_texts) == times 34 | -------------------------------------------------------------------------------- /gobbli/test/classification/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/test/classification/__init__.py -------------------------------------------------------------------------------- /gobbli/test/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/test/dataset/__init__.py -------------------------------------------------------------------------------- /gobbli/test/dataset/test_base_dataset.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gobbli.test.util import MockDataset 4 | 5 | 6 | def test_base_dataset_load(): 7 | ds = MockDataset() 8 | 9 | # Dataset should be unbuilt after default initialization 10 | assert ds._build_count == 0 11 | 12 | ds = MockDataset.load() 13 | 14 | # Dataset should now be built 15 | assert ds._build_count == 1 16 | 17 | ds.load() 18 | 19 | # Dataset shouldn't have been built again 20 | assert ds._build_count == 1 21 | 22 | 23 | def test_base_dataset_train_input(): 24 | # Need to build first 25 | with pytest.raises(ValueError): 26 | MockDataset().train_input() 27 | 28 | ds = MockDataset.load() 29 | 30 | # No limit 31 | train_input = ds.train_input(valid_proportion=0.5) 32 | 33 | X_len = len(MockDataset.X_TRAIN_VALID) 34 | 
35 | assert len(train_input.X_train) == X_len / 2 36 | assert len(train_input.y_train) == X_len / 2 37 | assert len(train_input.X_valid) == X_len / 2 38 | assert len(train_input.y_valid) == X_len / 2 39 | 40 | # Limit 41 | train_input = ds.train_input(valid_proportion=0.5, limit=2) 42 | 43 | assert len(train_input.X_train) == 1 44 | assert len(train_input.y_train) == 1 45 | assert len(train_input.X_valid) == 1 46 | assert len(train_input.y_valid) == 1 47 | 48 | 49 | def test_base_dataset_predict_input(): 50 | # Need to build first 51 | with pytest.raises(ValueError): 52 | MockDataset().train_input() 53 | 54 | ds = MockDataset.load() 55 | 56 | # No limit 57 | predict_input = ds.predict_input() 58 | 59 | X_len = len(MockDataset.X_TEST) 60 | 61 | assert len(predict_input.X) == X_len 62 | assert set(predict_input.labels) == set(MockDataset.Y_TEST) 63 | 64 | # Limit applied 65 | predict_input = ds.predict_input(limit=1) 66 | 67 | assert len(predict_input.X) == 1 68 | 69 | # Make sure we only have the labels from the limited subset 70 | assert set(predict_input.labels) < set(MockDataset.Y_TEST) 71 | -------------------------------------------------------------------------------- /gobbli/test/dataset/test_cmu_movie_summary.py: -------------------------------------------------------------------------------- 1 | from gobbli.dataset.cmu_movie_summary import MovieSummaryDataset 2 | 3 | 4 | def test_load_cmu_movie_summary(tmp_gobbli_dir): 5 | ds = MovieSummaryDataset.load() 6 | 7 | X_train = ds.X_train() 8 | X_test = ds.X_test() 9 | 10 | y_train = ds.y_train() 11 | y_test = ds.y_test() 12 | 13 | assert len(X_train) == 33763 14 | assert len(y_train) == 33763 15 | assert len(X_test) == 8441 16 | assert len(y_test) == 8441 17 | 18 | # Ensure these objects pass validation 19 | train_input = ds.train_input() 20 | ds.predict_input() 21 | 22 | assert len(train_input.labels()) == 357 23 | -------------------------------------------------------------------------------- /gobbli/test/dataset/test_imdb.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from gobbli.dataset.imdb import IMDBDataset 4 | 5 | 6 | def test_load_imdb(tmp_gobbli_dir): 7 | ds = IMDBDataset.load() 8 | 9 | X_train = ds.X_train() 10 | X_test = ds.X_test() 11 | 12 | y_train = ds.y_train() 13 | y_test = ds.y_test() 14 | 15 | assert len(X_train) == 25000 16 | assert len(y_train) == 25000 17 | assert len(X_test) == 25000 18 | assert len(y_test) == 25000 19 | 20 | assert len(pd.unique(y_train)) == 2 21 | assert len(pd.unique(y_test)) == 2 22 | 23 | # Ensure these objects pass validation 24 | ds.train_input() 25 | ds.predict_input() 26 | -------------------------------------------------------------------------------- /gobbli/test/dataset/test_newsgroups.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from gobbli.dataset.newsgroups import NewsgroupsDataset 4 | 5 | 6 | def test_load_newsgroups(tmp_gobbli_dir): 7 | ds = NewsgroupsDataset.load() 8 | 9 | X_train = ds.X_train() 10 | X_test = ds.X_test() 11 | 12 | y_train = ds.y_train() 13 | y_test = ds.y_test() 14 | 15 | assert len(X_train) == 11314 16 | assert len(y_train) == 11314 17 | assert len(X_test) == 7532 18 | assert len(y_test) == 7532 19 | 20 | assert len(pd.unique(y_train)) == 20 21 | assert len(pd.unique(y_test)) == 20 22 | 23 | # Ensure these objects pass validation 24 | ds.train_input() 25 | ds.predict_input() 26 | 
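The dataset tests above pin down the contract of train_input(): an optional limit is applied to the rows first, and what remains is divided between training and validation sets according to valid_proportion. A minimal sketch of split logic satisfying those assertions (illustrative only; split_train_valid is a hypothetical helper, not gobbli's API, and the real implementation may also shuffle or stratify):

from typing import List, Optional, Tuple

def split_train_valid(
    X: List[str],
    y: List[str],
    valid_proportion: float,
    limit: Optional[int] = None,
) -> Tuple[List[str], List[str], List[str], List[str]]:
    # Apply the row limit first, as the limit=2 assertions above imply.
    if limit is not None:
        X, y = X[:limit], y[:limit]
    # Hold out the trailing valid_proportion of rows for validation.
    n_train = int(len(X) * (1 - valid_proportion))
    return X[:n_train], y[:n_train], X[n_train:], y[n_train:]

# With limit=2 and valid_proportion=0.5, one row lands in each split,
# matching test_base_dataset_train_input above.
X_tr, y_tr, X_va, y_va = split_train_valid(
    ["a", "b", "c", "d"], ["1", "0", "1", "0"], valid_proportion=0.5, limit=2
)
assert len(X_tr) == len(X_va) == 1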
-------------------------------------------------------------------------------- /gobbli/test/experiment/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/test/experiment/__init__.py -------------------------------------------------------------------------------- /gobbli/test/interactive/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/test/interactive/__init__.py -------------------------------------------------------------------------------- /gobbli/test/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RTIInternational/gobbli/d9ec8132f74ce49dc4bead2fad25b661bcef6e76/gobbli/test/model/__init__.py -------------------------------------------------------------------------------- /gobbli/test/model/test_bert.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gobbli.model.bert import BERT 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "params,exception", 8 | [ 9 | # Unknown param 10 | ({"unknown": None}, ValueError), 11 | # Bad type (max_seq_length) 12 | ({"max_seq_length": "100"}, TypeError), 13 | # Bad value (bert_model) 14 | ({"bert_model": "ernie"}, ValueError), 15 | # OK type (max_seq_length) 16 | ({"max_seq_length": 100}, None), 17 | # OK value (bert_model) 18 | ({"bert_model": "bert-base-uncased"}, None), 19 | # OK values (both params) 20 | ({"max_seq_length": 100, "bert_model": "bert-base-uncased"}, None), 21 | ], 22 | ) 23 | def test_init(params, exception): 24 | if exception is None: 25 | BERT(**params) 26 | else: 27 | with pytest.raises(exception): 28 | BERT(**params) 29 | -------------------------------------------------------------------------------- /gobbli/test/model/test_fasttext.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gobbli.model.fasttext import FastText 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "params,exception", 8 | [ 9 | # Unknown param 10 | ({"unknown": None}, ValueError), 11 | # Bad type (word_ngrams) 12 | ({"word_ngrams": 1.0}, TypeError), 13 | # Bad type (lr) 14 | ({"lr": 1}, TypeError), 15 | # Bad type (dim) 16 | ({"dim": 100.0}, TypeError), 17 | # Bad type (ws) 18 | ({"ws": 3.0}, TypeError), 19 | # Bad value (fasttext_model) 20 | ({"fasttext_model": "bert"}, ValueError), 21 | # OK value (fasttext_model) 22 | ({"fasttext_model": "crawl-300d"}, None), 23 | # Dim mismatch (pretrained vectors vs user-passed dim) 24 | ({"fasttext_model": "crawl-300d", "dim": 100}, ValueError), 25 | # OK values (all) 26 | ( 27 | { 28 | "word_ngrams": 2, 29 | "lr": 0.01, 30 | "dim": 300, 31 | "ws": 3, 32 | "fasttext_model": "crawl-300d", 33 | }, 34 | None, 35 | ), 36 | ], 37 | ) 38 | def test_init(params, exception): 39 | if exception is None: 40 | FastText(**params) 41 | else: 42 | with pytest.raises(exception): 43 | FastText(**params) 44 | -------------------------------------------------------------------------------- /gobbli/test/model/test_mtdnn.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gobbli.model.mtdnn import MTDNN 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "params,exception", 8 | [ 9 | # Unknown 
param 10 | ({"unknown": None}, ValueError), 11 | # Bad type (max_seq_length) 12 | ({"max_seq_length": "100"}, TypeError), 13 | # Bad value (mtdnn_model) 14 | ({"mtdnn_model": "bert"}, ValueError), 15 | # OK type (max_seq_length) 16 | ({"max_seq_length": 100}, None), 17 | # OK value (mtdnn_model) 18 | ({"mtdnn_model": "mt-dnn-base"}, None), 19 | # OK values (both params) 20 | ({"max_seq_length": 100, "mtdnn_model": "mt-dnn-base"}, None), 21 | ], 22 | ) 23 | def test_init(params, exception): 24 | if exception is None: 25 | MTDNN(**params) 26 | else: 27 | with pytest.raises(exception): 28 | MTDNN(**params) 29 | -------------------------------------------------------------------------------- /gobbli/test/model/test_spacy.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gobbli.model.spacy import SpaCyModel 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "params,exception", 8 | [ 9 | # Unknown param 10 | ({"unknown": None}, ValueError), 11 | # Bad type (dropout) 12 | ({"dropout": "100"}, TypeError), 13 | # OK type (dropout) 14 | ({"dropout": 0.3}, None), 15 | # Bad type (full_pipeline) 16 | ({"full_pipeline": 1}, TypeError), 17 | # OK type (full_pipeline) 18 | ({"full_pipeline": True}, None), 19 | # OK types (all params) 20 | ({"full_pipeline": True, "dropout": 0.3}, None), 21 | ], 22 | ) 23 | def test_init(params, exception): 24 | if exception is None: 25 | SpaCyModel(**params) 26 | else: 27 | with pytest.raises(exception): 28 | SpaCyModel(**params) 29 | -------------------------------------------------------------------------------- /gobbli/test/model/test_transformer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gobbli.model.transformer import Transformer 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "params,exception", 8 | [ 9 | # Unknown param 10 | ({"unknown": None}, ValueError), 11 | # Bad type (max_seq_length) 12 | ({"max_seq_length": "100"}, TypeError), 13 | # OK type (max_seq_length) 14 | ({"max_seq_length": 100}, None), 15 | # Bad type (config_overrides) 16 | ({"config_overrides": 1}, TypeError), 17 | # OK type (config_overrides) 18 | ({"config_overrides": {}}, None), 19 | # Bad type (lr) 20 | ({"lr": 1}, TypeError), 21 | # OK type (lr) 22 | ({"lr": 1e-3}, None), 23 | # Bad type (adam_eps) 24 | ({"adam_eps": 1}, TypeError), 25 | # OK type (adam_eps) 26 | ({"adam_eps": 1e-5}, None), 27 | # Bad type (gradient_accumulation_steps) 28 | ({"gradient_accumulation_steps": 1.0}, TypeError), 29 | # OK type (gradient_accumulation_steps) 30 | ({"gradient_accumulation_steps": 2}, None), 31 | # OK values (all params), 32 | ( 33 | { 34 | "max_seq_length": 100, 35 | "config_overrides": {}, 36 | "lr": 1e-3, 37 | "adam_eps": 1e-5, 38 | "gradient_accumulation_steps": 2, 39 | }, 40 | None, 41 | ), 42 | ], 43 | ) 44 | def test_init(params, exception): 45 | if exception is None: 46 | Transformer(**params) 47 | else: 48 | with pytest.raises(exception): 49 | Transformer(**params) 50 | -------------------------------------------------------------------------------- /gobbli/test/model/test_use.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gobbli.model.use import USE 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "params,exception", 8 | [ 9 | # Unknown param 10 | ({"unknown": None}, ValueError), 11 | # Bad value (use_model) 12 | ({"use_model": "bert"}, ValueError), 13 | # OK value (use_model) 14 | ({"use_model": 
"universal-sentence-encoder"}, None), 15 | ], 16 | ) 17 | def test_init(params, exception): 18 | if exception is None: 19 | USE(**params) 20 | else: 21 | with pytest.raises(exception): 22 | USE(**params) 23 | -------------------------------------------------------------------------------- /meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "gobbli", 3 | "url": "https://github.com/RTIInternational/gobbli/", 4 | "download_url": "", 5 | "author": "RTI International", 6 | "maintainer": "Jason Nance", 7 | "version": "0.2.4", 8 | "description": "Uniform interface to deep learning approaches via Docker containers." 9 | } -------------------------------------------------------------------------------- /paper/README.md: -------------------------------------------------------------------------------- 1 | # Journal of Open Source Software Paper 2 | 3 | This section of the repository contains materials for a paper submitted to [JOSS](https://joss.theoj.org). 4 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 88 3 | target-version = ['py37'] 4 | exclude = ''' 5 | ( 6 | /( 7 | \.eggs 8 | | \.git 9 | | \.hg 10 | | \.mypy_cache 11 | | \.tox 12 | | \.venv 13 | | _build 14 | | buck-out 15 | | build 16 | | dist 17 | )/ 18 | | gobbli/model/bert/src 19 | | gobbli/model/mtdnn/src 20 | ) 21 | ''' -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = gobbli/test 3 | norecursedirs = gobbli/model/bert/src gobbli/model/mtdnn/src build/ 4 | 5 | [pytest.ini] 6 | log_cli = 1 7 | log_cli_format = %(asctime)s [%(levelname)8s] %(message)s 8 | log_cli_date_format=%Y-%m-%d %H:%M:%S -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | flake8==3.7.7 2 | mypy==0.720 3 | pytest==4.5.0 4 | ray[debug]==0.8.5 5 | aiohttp==3.5.4 6 | importmagic==0.1.7 7 | epc==0.0.5 8 | isort==4.3.20 9 | isort[requirements]==4.3.20 10 | black==19.3b0 11 | setuptools==41.0.1 12 | wheel==0.33.6 13 | twine==1.13.0 14 | https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz 15 | -------------------------------------------------------------------------------- /run_ci.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run the various processes needed for CI. 4 | # Pass additional script arguments to py.test. 
5 | 6 | set -e 7 | 8 | isort -rc --check-only ./gobbli 9 | black --check ./gobbli 10 | mypy ./gobbli --ignore-missing-imports 11 | flake8 ./gobbli --config setup.cfg 12 | py.test -vs $@ ./gobbli 13 | -------------------------------------------------------------------------------- /run_dist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function usage() { 4 | echo "Usage: $0 [test|live]" 5 | } 6 | 7 | if [[ $# -ne 1 ]]; then 8 | usage 9 | exit 1 10 | fi 11 | 12 | mode="$1" 13 | 14 | if [[ "$mode" != "test" && "$mode" != "live" ]]; then 15 | usage 16 | exit 1 17 | fi 18 | 19 | rm -r ./dist/ 20 | 21 | python setup.py sdist bdist_wheel 22 | 23 | if [[ "$mode" == "test" ]]; then 24 | python -m twine upload --repository-url https://test.pypi.org/legacy/ dist/* 25 | elif [[ "$mode" == "live" ]]; then 26 | python -m twine upload dist/* 27 | fi 28 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # https://github.com/python/black/issues/429 3 | ignore=E101,E111,E114,E115,E116,E117,E121,E122,E123,E124,E125,E126,E127,E128,E129,E131,E133,E2,E3,E5,E701,E702,E703,E704,W1,W2,W3,W503,W504 4 | exclude=gobbli/model/bert/src gobbli/model/mtdnn/src 5 | 6 | [isort] 7 | multi_line_output=3 8 | include_trailing_comma=True 9 | force_grid_wrap=0 10 | use_parentheses=True 11 | line_length=88 12 | skip=gobbli/model/bert/src,gobbli/model/mtdnn/src,.test_cache 13 | -------------------------------------------------------------------------------- /test_remote_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run tests on a GPU machine over SSH. Assumes the remote user is a member of the 4 | # Docker group (i.e. no sudo required for Docker commands), and that 5 | # Docker/docker-compose/nvidia-docker are already installed on the remote server. 6 | 7 | if [[ $# -ne 3 ]]; then 8 | echo "Usage: $0 <ssh_string> <remote_repo_dir> <visible_devices>" 9 | echo 10 | echo " ssh_string: SSH connection string for the remote server." 11 | echo 12 | echo " remote_repo_dir: Path to use as the repository root on the " 13 | echo " remote server. Files will be copied here." 14 | echo 15 | echo " visible_devices: Value to use for the NVIDIA_VISIBLE_DEVICES environment " 16 | echo " variable controlling which GPUs are made available to the container " 17 | echo " for testing." 18 | exit 1 19 | fi 20 | 21 | ssh_string="$1" 22 | remote_repo_dir="$2" 23 | visible_gpus="$3" 24 | 25 | if ssh "$ssh_string" "[[ -e $remote_repo_dir ]]"; then 26 | echo "Directory '$remote_repo_dir' already exists on the remote server;" 27 | echo "can't run tests pointing at an existing directory." 28 | exit 1 29 | fi 30 | 31 | rsync -raz \ 32 | --exclude .git \ 33 | --filter=':- .gitignore' \ 34 | ./ "$ssh_string:$remote_repo_dir" 35 | 36 | ssh "$ssh_string" "cd $remote_repo_dir/ci-gpu \ 37 | && export NVIDIA_VISIBLE_DEVICES=$visible_gpus \ 38 | && export PYTHON_VERSION=3.7 \ 39 | && docker-compose build gobbli-ci-gpu \ 40 | && docker-compose run --rm gobbli-ci-gpu" 41 | --------------------------------------------------------------------------------
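The model and augmenter tests above all enforce the same parametrized contract on __init__: an unknown parameter name raises ValueError, a parameter of the wrong type raises TypeError, and an unrecognized or out-of-range value raises ValueError. A minimal sketch of a validator satisfying the first two rules (hypothetical and illustrative only; validate_params is not gobbli's actual validation code):

from typing import Any, Dict, Type

def validate_params(params: Dict[str, Any], schema: Dict[str, Type]) -> None:
    for name, value in params.items():
        # Unknown parameter names are rejected with ValueError...
        if name not in schema:
            raise ValueError(f"unknown param: {name}")
        # ...while known names bound to a value of the wrong type get TypeError.
        if not isinstance(value, schema[name]):
            raise TypeError(
                f"param {name} must be {schema[name].__name__}, got {type(value).__name__}"
            )

# Mirrors the BERT test cases above: 100 passes, while "100" would raise TypeError.
validate_params({"max_seq_length": 100}, {"max_seq_length": int})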