├── README.md
├── demos
│   └── patchscopes_app.py
├── requirements.txt
├── scripts
│   ├── run_expansion_estimation.sh
│   └── run_vocab_expansion.sh
├── src
│   └── tokens2words
│       ├── __init__.py
│       ├── analysis
│       │   └── identified_in_patchscopes.py
│       ├── experiments
│       │   ├── __init__.py
│       │   └── detokenization
│       │       ├── __init__.py
│       │       ├── tokens_aggregation.py
│       │       ├── utils.py
│       │       ├── word_retrieval_multi_tokens.py
│       │       ├── word_retrieval_seperations.py
│       │       └── word_retrieval_typos.py
│       ├── processor.py
│       ├── representation_translator.py
│       ├── run_new_vocab_success_estimate.py
│       ├── run_patchscopes.py
│       ├── run_vocab_expansion_eval.py
│       ├── utils
│       │   ├── __init__.py
│       │   ├── calibration_utils.py
│       │   ├── core_vocab_utils.py
│       │   ├── data_utils.py
│       │   ├── downstream_utils.py
│       │   ├── enums.py
│       │   ├── eval_utils.py
│       │   ├── file_utils.py
│       │   ├── hebrew_utils.py
│       │   ├── logit_lens.py
│       │   ├── logits_utils.py
│       │   ├── model_utils.py
│       │   └── procrustes
│       │       ├── __init__.py
│       │       ├── orthogonal.py
│       │       └── utils.py
│       ├── vocab_modifier.py
│       └── word_retriever.py
└── word_lists
    ├── top_5k_arabic_words.txt
    └── top_5k_hebrew_words_without_nikud.txt

/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/README.md
--------------------------------------------------------------------------------
/demos/patchscopes_app.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/demos/patchscopes_app.py
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/requirements.txt
--------------------------------------------------------------------------------
/scripts/run_expansion_estimation.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/scripts/run_expansion_estimation.sh
--------------------------------------------------------------------------------
/scripts/run_vocab_expansion.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/scripts/run_vocab_expansion.sh
--------------------------------------------------------------------------------
/src/tokens2words/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/tokens2words/analysis/identified_in_patchscopes.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/analysis/identified_in_patchscopes.py
--------------------------------------------------------------------------------
/src/tokens2words/experiments/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/tokens2words/experiments/detokenization/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/tokens2words/experiments/detokenization/tokens_aggregation.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/experiments/detokenization/tokens_aggregation.py
--------------------------------------------------------------------------------
/src/tokens2words/experiments/detokenization/utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/experiments/detokenization/utils.py
--------------------------------------------------------------------------------
/src/tokens2words/experiments/detokenization/word_retrieval_multi_tokens.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/experiments/detokenization/word_retrieval_multi_tokens.py
--------------------------------------------------------------------------------
/src/tokens2words/experiments/detokenization/word_retrieval_seperations.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/experiments/detokenization/word_retrieval_seperations.py
--------------------------------------------------------------------------------
/src/tokens2words/experiments/detokenization/word_retrieval_typos.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/experiments/detokenization/word_retrieval_typos.py
--------------------------------------------------------------------------------
/src/tokens2words/processor.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/processor.py
--------------------------------------------------------------------------------
/src/tokens2words/representation_translator.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/representation_translator.py
--------------------------------------------------------------------------------
/src/tokens2words/run_new_vocab_success_estimate.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/run_new_vocab_success_estimate.py
--------------------------------------------------------------------------------
/src/tokens2words/run_patchscopes.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/run_patchscopes.py
--------------------------------------------------------------------------------
/src/tokens2words/run_vocab_expansion_eval.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/run_vocab_expansion_eval.py
--------------------------------------------------------------------------------
/src/tokens2words/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/tokens2words/utils/calibration_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/utils/calibration_utils.py
--------------------------------------------------------------------------------
/src/tokens2words/utils/core_vocab_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/utils/core_vocab_utils.py
--------------------------------------------------------------------------------
/src/tokens2words/utils/data_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/utils/data_utils.py
--------------------------------------------------------------------------------
/src/tokens2words/utils/downstream_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/utils/downstream_utils.py
--------------------------------------------------------------------------------
/src/tokens2words/utils/enums.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/utils/enums.py
--------------------------------------------------------------------------------
/src/tokens2words/utils/eval_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/utils/eval_utils.py
--------------------------------------------------------------------------------
/src/tokens2words/utils/file_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/utils/file_utils.py
--------------------------------------------------------------------------------
/src/tokens2words/utils/hebrew_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/utils/hebrew_utils.py
--------------------------------------------------------------------------------
/src/tokens2words/utils/logit_lens.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/utils/logit_lens.py
--------------------------------------------------------------------------------
/src/tokens2words/utils/logits_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/utils/logits_utils.py
--------------------------------------------------------------------------------
/src/tokens2words/utils/model_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/utils/model_utils.py
--------------------------------------------------------------------------------
/src/tokens2words/utils/procrustes/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/tokens2words/utils/procrustes/orthogonal.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/utils/procrustes/orthogonal.py
--------------------------------------------------------------------------------
/src/tokens2words/utils/procrustes/utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/utils/procrustes/utils.py
--------------------------------------------------------------------------------
/src/tokens2words/vocab_modifier.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/vocab_modifier.py
--------------------------------------------------------------------------------
/src/tokens2words/word_retriever.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/src/tokens2words/word_retriever.py
--------------------------------------------------------------------------------
/word_lists/top_5k_arabic_words.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/word_lists/top_5k_arabic_words.txt
--------------------------------------------------------------------------------
/word_lists/top_5k_hebrew_words_without_nikud.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schwartz-lab-NLP/Tokens2Words/HEAD/word_lists/top_5k_hebrew_words_without_nikud.txt
--------------------------------------------------------------------------------
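A getting-started sketch, inferred only from the layout above: requirements.txt and the two shell scripts under scripts/ suggest the usual install-then-run flow. The listing does not show what arguments or environment each script expects, so check the scripts' contents (via the raw URLs above) before running them.

    # Sketch, assuming the scripts are run from the repository root
    # and that requirements.txt covers all Python dependencies.
    pip install -r requirements.txt

    # Experiment entry points; any required arguments or environment
    # variables are defined inside the scripts themselves.
    bash scripts/run_expansion_estimation.sh
    bash scripts/run_vocab_expansion.sh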