├── .gitignore ├── README.md ├── boundary_creator.py ├── cleaners ├── alphabet_numerals.py ├── clean.py ├── homoglyphs.py ├── test.py └── utils.py ├── configs ├── baseline.yaml ├── entropy.yaml ├── fixed.yaml ├── gumbel.yaml ├── unigram.yaml ├── whitespaces.yaml └── wiki.yaml ├── data_utils.py ├── hourglass.py ├── media └── dynamic_pooling.gif ├── requirements.txt ├── scripts ├── download_wiki40b.py ├── get_text8.sh ├── get_wiki40b.sh ├── prep_text8.py └── run_exp.sh ├── shortening.py ├── test.py ├── tokenizer_data ├── spm │ ├── cc-100 │ │ └── text8 │ │ │ ├── spmunigram-10000.model │ │ │ └── spmunigram-200.model │ ├── text8 │ │ ├── spmunigram-1000.model │ │ ├── spmunigram-10000.model │ │ ├── spmunigram-200.model │ │ ├── spmunigram-3000.model │ │ ├── spmunigram-500.model │ │ └── spmunigram-5000.model │ └── wiki40b │ │ ├── en │ │ └── text8 │ │ │ ├── spmunigram-10000.model │ │ │ └── spmunigram-200.model │ │ ├── fi │ │ └── text8 │ │ │ ├── spmunigram-1000.model │ │ │ ├── spmunigram-10000.model │ │ │ ├── spmunigram-200.model │ │ │ ├── spmunigram-3000.model │ │ │ ├── spmunigram-500.model │ │ │ └── spmunigram-5000.model │ │ ├── he │ │ └── text8 │ │ │ ├── spmunigram-1000.model │ │ │ ├── spmunigram-10000.model │ │ │ ├── spmunigram-200.model │ │ │ ├── spmunigram-3000.model │ │ │ ├── spmunigram-500.model │ │ │ └── spmunigram-5000.model │ │ └── vi │ │ └── text8 │ │ ├── spmunigram-1000.model │ │ ├── spmunigram-10000.model │ │ ├── spmunigram-200.model │ │ ├── spmunigram-3000.model │ │ ├── spmunigram-500.model │ │ └── spmunigram-5000.model └── train_tokenizer.py ├── train.py └── utils ├── __init__.py ├── distributed.py ├── exp_utils.py ├── init.py └── vocabulary.py /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | .neptune 3 | experiments/ 4 | LM-TFM/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/README.md -------------------------------------------------------------------------------- /boundary_creator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/boundary_creator.py -------------------------------------------------------------------------------- /cleaners/alphabet_numerals.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/cleaners/alphabet_numerals.py -------------------------------------------------------------------------------- /cleaners/clean.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/cleaners/clean.py -------------------------------------------------------------------------------- /cleaners/homoglyphs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/cleaners/homoglyphs.py -------------------------------------------------------------------------------- /cleaners/test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/cleaners/test.py -------------------------------------------------------------------------------- /cleaners/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/cleaners/utils.py -------------------------------------------------------------------------------- /configs/baseline.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/configs/baseline.yaml -------------------------------------------------------------------------------- /configs/entropy.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/configs/entropy.yaml -------------------------------------------------------------------------------- /configs/fixed.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/configs/fixed.yaml -------------------------------------------------------------------------------- /configs/gumbel.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/configs/gumbel.yaml -------------------------------------------------------------------------------- /configs/unigram.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/configs/unigram.yaml -------------------------------------------------------------------------------- /configs/whitespaces.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/configs/whitespaces.yaml -------------------------------------------------------------------------------- /configs/wiki.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/configs/wiki.yaml -------------------------------------------------------------------------------- /data_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/data_utils.py -------------------------------------------------------------------------------- /hourglass.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/hourglass.py -------------------------------------------------------------------------------- /media/dynamic_pooling.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/media/dynamic_pooling.gif -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/requirements.txt -------------------------------------------------------------------------------- /scripts/download_wiki40b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/scripts/download_wiki40b.py -------------------------------------------------------------------------------- /scripts/get_text8.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/scripts/get_text8.sh -------------------------------------------------------------------------------- /scripts/get_wiki40b.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/scripts/get_wiki40b.sh -------------------------------------------------------------------------------- /scripts/prep_text8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/scripts/prep_text8.py -------------------------------------------------------------------------------- /scripts/run_exp.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/scripts/run_exp.sh -------------------------------------------------------------------------------- /shortening.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/shortening.py -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/test.py -------------------------------------------------------------------------------- /tokenizer_data/spm/cc-100/text8/spmunigram-10000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/cc-100/text8/spmunigram-10000.model -------------------------------------------------------------------------------- /tokenizer_data/spm/cc-100/text8/spmunigram-200.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/cc-100/text8/spmunigram-200.model -------------------------------------------------------------------------------- /tokenizer_data/spm/text8/spmunigram-1000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/text8/spmunigram-1000.model -------------------------------------------------------------------------------- /tokenizer_data/spm/text8/spmunigram-10000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/text8/spmunigram-10000.model -------------------------------------------------------------------------------- /tokenizer_data/spm/text8/spmunigram-200.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/text8/spmunigram-200.model -------------------------------------------------------------------------------- /tokenizer_data/spm/text8/spmunigram-3000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/text8/spmunigram-3000.model -------------------------------------------------------------------------------- /tokenizer_data/spm/text8/spmunigram-500.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/text8/spmunigram-500.model -------------------------------------------------------------------------------- /tokenizer_data/spm/text8/spmunigram-5000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/text8/spmunigram-5000.model -------------------------------------------------------------------------------- /tokenizer_data/spm/wiki40b/en/text8/spmunigram-10000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/wiki40b/en/text8/spmunigram-10000.model -------------------------------------------------------------------------------- /tokenizer_data/spm/wiki40b/en/text8/spmunigram-200.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/wiki40b/en/text8/spmunigram-200.model -------------------------------------------------------------------------------- /tokenizer_data/spm/wiki40b/fi/text8/spmunigram-1000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/wiki40b/fi/text8/spmunigram-1000.model -------------------------------------------------------------------------------- /tokenizer_data/spm/wiki40b/fi/text8/spmunigram-10000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/wiki40b/fi/text8/spmunigram-10000.model -------------------------------------------------------------------------------- /tokenizer_data/spm/wiki40b/fi/text8/spmunigram-200.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/wiki40b/fi/text8/spmunigram-200.model -------------------------------------------------------------------------------- /tokenizer_data/spm/wiki40b/fi/text8/spmunigram-3000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/wiki40b/fi/text8/spmunigram-3000.model -------------------------------------------------------------------------------- /tokenizer_data/spm/wiki40b/fi/text8/spmunigram-500.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/wiki40b/fi/text8/spmunigram-500.model -------------------------------------------------------------------------------- /tokenizer_data/spm/wiki40b/fi/text8/spmunigram-5000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/wiki40b/fi/text8/spmunigram-5000.model -------------------------------------------------------------------------------- /tokenizer_data/spm/wiki40b/he/text8/spmunigram-1000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/wiki40b/he/text8/spmunigram-1000.model -------------------------------------------------------------------------------- /tokenizer_data/spm/wiki40b/he/text8/spmunigram-10000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/wiki40b/he/text8/spmunigram-10000.model -------------------------------------------------------------------------------- /tokenizer_data/spm/wiki40b/he/text8/spmunigram-200.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/wiki40b/he/text8/spmunigram-200.model -------------------------------------------------------------------------------- /tokenizer_data/spm/wiki40b/he/text8/spmunigram-3000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/wiki40b/he/text8/spmunigram-3000.model -------------------------------------------------------------------------------- /tokenizer_data/spm/wiki40b/he/text8/spmunigram-500.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/wiki40b/he/text8/spmunigram-500.model -------------------------------------------------------------------------------- /tokenizer_data/spm/wiki40b/he/text8/spmunigram-5000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/wiki40b/he/text8/spmunigram-5000.model -------------------------------------------------------------------------------- /tokenizer_data/spm/wiki40b/vi/text8/spmunigram-1000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/wiki40b/vi/text8/spmunigram-1000.model -------------------------------------------------------------------------------- /tokenizer_data/spm/wiki40b/vi/text8/spmunigram-10000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/wiki40b/vi/text8/spmunigram-10000.model -------------------------------------------------------------------------------- /tokenizer_data/spm/wiki40b/vi/text8/spmunigram-200.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/wiki40b/vi/text8/spmunigram-200.model -------------------------------------------------------------------------------- /tokenizer_data/spm/wiki40b/vi/text8/spmunigram-3000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/wiki40b/vi/text8/spmunigram-3000.model -------------------------------------------------------------------------------- /tokenizer_data/spm/wiki40b/vi/text8/spmunigram-500.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/wiki40b/vi/text8/spmunigram-500.model -------------------------------------------------------------------------------- /tokenizer_data/spm/wiki40b/vi/text8/spmunigram-5000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/spm/wiki40b/vi/text8/spmunigram-5000.model -------------------------------------------------------------------------------- /tokenizer_data/train_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/tokenizer_data/train_tokenizer.py -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/train.py -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/utils/__init__.py -------------------------------------------------------------------------------- /utils/distributed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/utils/distributed.py -------------------------------------------------------------------------------- /utils/exp_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/utils/exp_utils.py -------------------------------------------------------------------------------- /utils/init.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/utils/init.py -------------------------------------------------------------------------------- /utils/vocabulary.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PiotrNawrot/dynamic-pooling/HEAD/utils/vocabulary.py --------------------------------------------------------------------------------