├── .github ├── CODEOWNERS └── workflows │ ├── publish.yml │ └── test.yml ├── .gitignore ├── .python-version ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── dalm ├── __init__.py ├── cli.py ├── datasets │ ├── __init__.py │ ├── docs_to_passage │ │ ├── __init__.py │ │ ├── json_pre_process.py │ │ ├── main.py │ │ └── utils.py │ ├── qa_gen │ │ ├── __init__.py │ │ ├── check_dataset.py │ │ ├── knowledge_dataset.csv │ │ └── question_answer_generation.py │ ├── reading_comprehension_generation │ │ ├── README.md │ │ ├── regex_based.py │ │ ├── synthetic_based.py │ │ └── utils.py │ └── toy_data_train.csv ├── eval │ ├── README.md │ ├── __init__.py │ ├── eval_rag.py │ ├── eval_results.py │ ├── eval_retriever_only.py │ ├── triplets.csv │ └── utils.py ├── models │ ├── __init__.py │ ├── rag_e2e_base_model.py │ └── retriever_only_base_model.py ├── pipelines │ ├── README.md │ └── reading_comprehension_pipeline.py ├── py.typed ├── training │ ├── __init__.py │ ├── generator_only │ │ └── trainer.py │ ├── rag_e2e │ │ ├── README.md │ │ ├── __init__.py │ │ └── train_rage2e.py │ ├── retriever_only │ │ ├── README.md │ │ ├── __init__.py │ │ ├── dataset │ │ │ ├── abstracts.csv │ │ │ ├── dataset_with_question.csv │ │ │ ├── knowledge_dataset.csv │ │ │ ├── knowledge_dataset │ │ │ │ ├── abstracts │ │ │ │ │ ├── cache-c6862fbba42bb48d.arrow │ │ │ │ │ ├── data-00000-of-00001.arrow │ │ │ │ │ ├── dataset_info.json │ │ │ │ │ └── state.json │ │ │ │ ├── cache-7d82ce38b8a589b1.arrow │ │ │ │ ├── data-00000-of-00001.arrow │ │ │ │ ├── dataset_dict.json │ │ │ │ ├── dataset_info.json │ │ │ │ └── state.json │ │ │ ├── train.csv │ │ │ └── valid.csv │ │ └── train_retriever_only.py │ └── utils │ │ ├── __init__.py │ │ ├── rag_e2e_dataloader_utils.py │ │ ├── retriever_only_dataloader_utils.py │ │ └── train_utils.py └── utils.py ├── experiments ├── README.md ├── llama-index-10k │ ├── README.md │ ├── data_gen.py │ ├── lyft_2021.pdf │ ├── requirements.txt │ └── uber_2021.pdf └── llama-index-synthetic-data │ ├── README.md │ ├── base_model_results_llama.json │ ├── eval_utils.py │ ├── evaluate_llama_model.py │ ├── ft_results.json │ ├── model_output_test │ ├── config.json │ └── pytorch_model.bin │ ├── prepare_data_for_llama.py │ └── train_dataset_llama.py ├── pyproject.toml ├── resources └── general.spm ├── tasks.py └── tests ├── __init__.py ├── conftest.py ├── datasets └── reading_comprehension_generation │ └── test_utils.py ├── test_cli.py └── training ├── __init__.py ├── rag_e2e ├── __init__.py ├── test_base_model.py └── test_train_rage2e.py ├── retriever_only └── __init__.py └── utils └── __init__.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | @shamanez @Jacobsolawetz @ben-epstein @SachiraKuruppu @metric-space -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/.github/workflows/publish.yml -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/.github/workflows/test.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/.gitignore -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.11 -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/CONTRIBUTING.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/README.md -------------------------------------------------------------------------------- /dalm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/__init__.py -------------------------------------------------------------------------------- /dalm/cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/cli.py -------------------------------------------------------------------------------- /dalm/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dalm/datasets/docs_to_passage/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dalm/datasets/docs_to_passage/json_pre_process.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/docs_to_passage/json_pre_process.py -------------------------------------------------------------------------------- /dalm/datasets/docs_to_passage/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/docs_to_passage/main.py -------------------------------------------------------------------------------- /dalm/datasets/docs_to_passage/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/docs_to_passage/utils.py -------------------------------------------------------------------------------- /dalm/datasets/qa_gen/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dalm/datasets/qa_gen/check_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/qa_gen/check_dataset.py -------------------------------------------------------------------------------- /dalm/datasets/qa_gen/knowledge_dataset.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/qa_gen/knowledge_dataset.csv -------------------------------------------------------------------------------- /dalm/datasets/qa_gen/question_answer_generation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/qa_gen/question_answer_generation.py -------------------------------------------------------------------------------- /dalm/datasets/reading_comprehension_generation/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/reading_comprehension_generation/README.md -------------------------------------------------------------------------------- /dalm/datasets/reading_comprehension_generation/regex_based.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/reading_comprehension_generation/regex_based.py -------------------------------------------------------------------------------- /dalm/datasets/reading_comprehension_generation/synthetic_based.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/reading_comprehension_generation/synthetic_based.py -------------------------------------------------------------------------------- /dalm/datasets/reading_comprehension_generation/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/reading_comprehension_generation/utils.py -------------------------------------------------------------------------------- /dalm/datasets/toy_data_train.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/toy_data_train.csv -------------------------------------------------------------------------------- /dalm/eval/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/eval/README.md -------------------------------------------------------------------------------- /dalm/eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dalm/eval/eval_rag.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/eval/eval_rag.py -------------------------------------------------------------------------------- /dalm/eval/eval_results.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/eval/eval_results.py -------------------------------------------------------------------------------- /dalm/eval/eval_retriever_only.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/eval/eval_retriever_only.py -------------------------------------------------------------------------------- /dalm/eval/triplets.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/eval/triplets.csv -------------------------------------------------------------------------------- /dalm/eval/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/eval/utils.py -------------------------------------------------------------------------------- /dalm/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dalm/models/rag_e2e_base_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/models/rag_e2e_base_model.py -------------------------------------------------------------------------------- /dalm/models/retriever_only_base_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/models/retriever_only_base_model.py -------------------------------------------------------------------------------- /dalm/pipelines/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/pipelines/README.md -------------------------------------------------------------------------------- /dalm/pipelines/reading_comprehension_pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/pipelines/reading_comprehension_pipeline.py -------------------------------------------------------------------------------- /dalm/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dalm/training/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dalm/training/generator_only/trainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/generator_only/trainer.py -------------------------------------------------------------------------------- /dalm/training/rag_e2e/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/rag_e2e/README.md -------------------------------------------------------------------------------- /dalm/training/rag_e2e/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dalm/training/rag_e2e/train_rage2e.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/rag_e2e/train_rage2e.py -------------------------------------------------------------------------------- /dalm/training/retriever_only/README.md: -------------------------------------------------------------------------------- 1 | # arcee-retriever 2 | 3 | -------------------------------------------------------------------------------- /dalm/training/retriever_only/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dalm/training/retriever_only/dataset/abstracts.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/abstracts.csv -------------------------------------------------------------------------------- /dalm/training/retriever_only/dataset/dataset_with_question.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/dataset_with_question.csv -------------------------------------------------------------------------------- /dalm/training/retriever_only/dataset/knowledge_dataset.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/knowledge_dataset.csv -------------------------------------------------------------------------------- /dalm/training/retriever_only/dataset/knowledge_dataset/abstracts/cache-c6862fbba42bb48d.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/knowledge_dataset/abstracts/cache-c6862fbba42bb48d.arrow -------------------------------------------------------------------------------- /dalm/training/retriever_only/dataset/knowledge_dataset/abstracts/data-00000-of-00001.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/knowledge_dataset/abstracts/data-00000-of-00001.arrow -------------------------------------------------------------------------------- /dalm/training/retriever_only/dataset/knowledge_dataset/abstracts/dataset_info.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/knowledge_dataset/abstracts/dataset_info.json -------------------------------------------------------------------------------- /dalm/training/retriever_only/dataset/knowledge_dataset/abstracts/state.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/knowledge_dataset/abstracts/state.json -------------------------------------------------------------------------------- /dalm/training/retriever_only/dataset/knowledge_dataset/cache-7d82ce38b8a589b1.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/knowledge_dataset/cache-7d82ce38b8a589b1.arrow -------------------------------------------------------------------------------- /dalm/training/retriever_only/dataset/knowledge_dataset/data-00000-of-00001.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/knowledge_dataset/data-00000-of-00001.arrow -------------------------------------------------------------------------------- /dalm/training/retriever_only/dataset/knowledge_dataset/dataset_dict.json: -------------------------------------------------------------------------------- 1 | {"splits": ["abstracts"]} -------------------------------------------------------------------------------- /dalm/training/retriever_only/dataset/knowledge_dataset/dataset_info.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/knowledge_dataset/dataset_info.json -------------------------------------------------------------------------------- /dalm/training/retriever_only/dataset/knowledge_dataset/state.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/knowledge_dataset/state.json -------------------------------------------------------------------------------- /dalm/training/retriever_only/dataset/train.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/train.csv -------------------------------------------------------------------------------- /dalm/training/retriever_only/dataset/valid.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/valid.csv -------------------------------------------------------------------------------- /dalm/training/retriever_only/train_retriever_only.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/train_retriever_only.py -------------------------------------------------------------------------------- /dalm/training/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dalm/training/utils/rag_e2e_dataloader_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/utils/rag_e2e_dataloader_utils.py -------------------------------------------------------------------------------- /dalm/training/utils/retriever_only_dataloader_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/utils/retriever_only_dataloader_utils.py -------------------------------------------------------------------------------- /dalm/training/utils/train_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/utils/train_utils.py -------------------------------------------------------------------------------- /dalm/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/utils.py -------------------------------------------------------------------------------- /experiments/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/README.md -------------------------------------------------------------------------------- /experiments/llama-index-10k/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-10k/README.md -------------------------------------------------------------------------------- /experiments/llama-index-10k/data_gen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-10k/data_gen.py -------------------------------------------------------------------------------- /experiments/llama-index-10k/lyft_2021.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-10k/lyft_2021.pdf -------------------------------------------------------------------------------- /experiments/llama-index-10k/requirements.txt: -------------------------------------------------------------------------------- 1 | llama-index 2 | pypdf 3 | -------------------------------------------------------------------------------- /experiments/llama-index-10k/uber_2021.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-10k/uber_2021.pdf -------------------------------------------------------------------------------- /experiments/llama-index-synthetic-data/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-synthetic-data/README.md -------------------------------------------------------------------------------- /experiments/llama-index-synthetic-data/base_model_results_llama.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-synthetic-data/base_model_results_llama.json -------------------------------------------------------------------------------- /experiments/llama-index-synthetic-data/eval_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-synthetic-data/eval_utils.py -------------------------------------------------------------------------------- /experiments/llama-index-synthetic-data/evaluate_llama_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-synthetic-data/evaluate_llama_model.py -------------------------------------------------------------------------------- /experiments/llama-index-synthetic-data/ft_results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-synthetic-data/ft_results.json -------------------------------------------------------------------------------- /experiments/llama-index-synthetic-data/model_output_test/config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-synthetic-data/model_output_test/config.json -------------------------------------------------------------------------------- /experiments/llama-index-synthetic-data/model_output_test/pytorch_model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-synthetic-data/model_output_test/pytorch_model.bin -------------------------------------------------------------------------------- /experiments/llama-index-synthetic-data/prepare_data_for_llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-synthetic-data/prepare_data_for_llama.py -------------------------------------------------------------------------------- /experiments/llama-index-synthetic-data/train_dataset_llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-synthetic-data/train_dataset_llama.py -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/pyproject.toml -------------------------------------------------------------------------------- /resources/general.spm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/resources/general.spm -------------------------------------------------------------------------------- /tasks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/tasks.py -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/datasets/reading_comprehension_generation/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/tests/datasets/reading_comprehension_generation/test_utils.py -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/tests/test_cli.py -------------------------------------------------------------------------------- /tests/training/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/training/rag_e2e/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/training/rag_e2e/test_base_model.py: -------------------------------------------------------------------------------- 1 | def test_base_model() -> None: 2 | assert True 3 | -------------------------------------------------------------------------------- /tests/training/rag_e2e/test_train_rage2e.py: -------------------------------------------------------------------------------- 1 | def test_train_e2e() -> None: 2 | assert True 3 | -------------------------------------------------------------------------------- /tests/training/retriever_only/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/training/utils/__init__.py: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------