├── .github
    ├── CODEOWNERS
    └── workflows
    │   ├── publish.yml
    │   └── test.yml
├── .gitignore
├── .python-version
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── dalm
    ├── __init__.py
    ├── cli.py
    ├── datasets
    │   ├── __init__.py
    │   ├── docs_to_passage
    │   │   ├── __init__.py
    │   │   ├── json_pre_process.py
    │   │   ├── main.py
    │   │   └── utils.py
    │   ├── qa_gen
    │   │   ├── __init__.py
    │   │   ├── check_dataset.py
    │   │   ├── knowledge_dataset.csv
    │   │   └── question_answer_generation.py
    │   ├── reading_comprehension_generation
    │   │   ├── README.md
    │   │   ├── regex_based.py
    │   │   ├── synthetic_based.py
    │   │   └── utils.py
    │   └── toy_data_train.csv
    ├── eval
    │   ├── README.md
    │   ├── __init__.py
    │   ├── eval_rag.py
    │   ├── eval_results.py
    │   ├── eval_retriever_only.py
    │   ├── triplets.csv
    │   └── utils.py
    ├── models
    │   ├── __init__.py
    │   ├── rag_e2e_base_model.py
    │   └── retriever_only_base_model.py
    ├── pipelines
    │   ├── README.md
    │   └── reading_comprehension_pipeline.py
    ├── py.typed
    ├── training
    │   ├── __init__.py
    │   ├── generator_only
    │   │   └── trainer.py
    │   ├── rag_e2e
    │   │   ├── README.md
    │   │   ├── __init__.py
    │   │   └── train_rage2e.py
    │   ├── retriever_only
    │   │   ├── README.md
    │   │   ├── __init__.py
    │   │   ├── dataset
    │   │   │   ├── abstracts.csv
    │   │   │   ├── dataset_with_question.csv
    │   │   │   ├── knowledge_dataset.csv
    │   │   │   ├── knowledge_dataset
    │   │   │   │   ├── abstracts
    │   │   │   │   │   ├── cache-c6862fbba42bb48d.arrow
    │   │   │   │   │   ├── data-00000-of-00001.arrow
    │   │   │   │   │   ├── dataset_info.json
    │   │   │   │   │   └── state.json
    │   │   │   │   ├── cache-7d82ce38b8a589b1.arrow
    │   │   │   │   ├── data-00000-of-00001.arrow
    │   │   │   │   ├── dataset_dict.json
    │   │   │   │   ├── dataset_info.json
    │   │   │   │   └── state.json
    │   │   │   ├── train.csv
    │   │   │   └── valid.csv
    │   │   └── train_retriever_only.py
    │   └── utils
    │   │   ├── __init__.py
    │   │   ├── rag_e2e_dataloader_utils.py
    │   │   ├── retriever_only_dataloader_utils.py
    │   │   └── train_utils.py
    └── utils.py
├── experiments
    ├── README.md
    ├── llama-index-10k
    │   ├── README.md
    │   ├── data_gen.py
    │   ├── lyft_2021.pdf
    │   ├── requirements.txt
    │   └── uber_2021.pdf
    └── llama-index-synthetic-data
    │   ├── README.md
    │   ├── base_model_results_llama.json
    │   ├── eval_utils.py
    │   ├── evaluate_llama_model.py
    │   ├── ft_results.json
    │   ├── model_output_test
    │       ├── config.json
    │       └── pytorch_model.bin
    │   ├── prepare_data_for_llama.py
    │   └── train_dataset_llama.py
├── pyproject.toml
├── resources
    └── general.spm
├── tasks.py
└── tests
    ├── __init__.py
    ├── conftest.py
    ├── datasets
        └── reading_comprehension_generation
        │   └── test_utils.py
    ├── test_cli.py
    └── training
        ├── __init__.py
        ├── rag_e2e
            ├── __init__.py
            ├── test_base_model.py
            └── test_train_rage2e.py
        ├── retriever_only
            └── __init__.py
        └── utils
            └── __init__.py


/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | @shamanez @Jacobsolawetz @ben-epstein @SachiraKuruppu @metric-space


--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/.github/workflows/publish.yml


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/.github/workflows/test.yml


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/.gitignore


--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.11


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/CONTRIBUTING.md


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/LICENSE


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/README.md


--------------------------------------------------------------------------------
/dalm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/__init__.py


--------------------------------------------------------------------------------
/dalm/cli.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/cli.py


--------------------------------------------------------------------------------
/dalm/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dalm/datasets/docs_to_passage/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dalm/datasets/docs_to_passage/json_pre_process.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/docs_to_passage/json_pre_process.py


--------------------------------------------------------------------------------
/dalm/datasets/docs_to_passage/main.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/docs_to_passage/main.py


--------------------------------------------------------------------------------
/dalm/datasets/docs_to_passage/utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/docs_to_passage/utils.py


--------------------------------------------------------------------------------
/dalm/datasets/qa_gen/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dalm/datasets/qa_gen/check_dataset.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/qa_gen/check_dataset.py


--------------------------------------------------------------------------------
/dalm/datasets/qa_gen/knowledge_dataset.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/qa_gen/knowledge_dataset.csv


--------------------------------------------------------------------------------
/dalm/datasets/qa_gen/question_answer_generation.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/qa_gen/question_answer_generation.py


--------------------------------------------------------------------------------
/dalm/datasets/reading_comprehension_generation/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/reading_comprehension_generation/README.md


--------------------------------------------------------------------------------
/dalm/datasets/reading_comprehension_generation/regex_based.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/reading_comprehension_generation/regex_based.py


--------------------------------------------------------------------------------
/dalm/datasets/reading_comprehension_generation/synthetic_based.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/reading_comprehension_generation/synthetic_based.py


--------------------------------------------------------------------------------
/dalm/datasets/reading_comprehension_generation/utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/reading_comprehension_generation/utils.py


--------------------------------------------------------------------------------
/dalm/datasets/toy_data_train.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/datasets/toy_data_train.csv


--------------------------------------------------------------------------------
/dalm/eval/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/eval/README.md


--------------------------------------------------------------------------------
/dalm/eval/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dalm/eval/eval_rag.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/eval/eval_rag.py


--------------------------------------------------------------------------------
/dalm/eval/eval_results.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/eval/eval_results.py


--------------------------------------------------------------------------------
/dalm/eval/eval_retriever_only.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/eval/eval_retriever_only.py


--------------------------------------------------------------------------------
/dalm/eval/triplets.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/eval/triplets.csv


--------------------------------------------------------------------------------
/dalm/eval/utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/eval/utils.py


--------------------------------------------------------------------------------
/dalm/models/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dalm/models/rag_e2e_base_model.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/models/rag_e2e_base_model.py


--------------------------------------------------------------------------------
/dalm/models/retriever_only_base_model.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/models/retriever_only_base_model.py


--------------------------------------------------------------------------------
/dalm/pipelines/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/pipelines/README.md


--------------------------------------------------------------------------------
/dalm/pipelines/reading_comprehension_pipeline.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/pipelines/reading_comprehension_pipeline.py


--------------------------------------------------------------------------------
/dalm/py.typed:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dalm/training/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dalm/training/generator_only/trainer.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/generator_only/trainer.py


--------------------------------------------------------------------------------
/dalm/training/rag_e2e/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/rag_e2e/README.md


--------------------------------------------------------------------------------
/dalm/training/rag_e2e/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dalm/training/rag_e2e/train_rage2e.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/rag_e2e/train_rage2e.py


--------------------------------------------------------------------------------
/dalm/training/retriever_only/README.md:
--------------------------------------------------------------------------------
1 | # arcee-retriever
2 | 
3 | 


--------------------------------------------------------------------------------
/dalm/training/retriever_only/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dalm/training/retriever_only/dataset/abstracts.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/abstracts.csv


--------------------------------------------------------------------------------
/dalm/training/retriever_only/dataset/dataset_with_question.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/dataset_with_question.csv


--------------------------------------------------------------------------------
/dalm/training/retriever_only/dataset/knowledge_dataset.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/knowledge_dataset.csv


--------------------------------------------------------------------------------
/dalm/training/retriever_only/dataset/knowledge_dataset/abstracts/cache-c6862fbba42bb48d.arrow:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/knowledge_dataset/abstracts/cache-c6862fbba42bb48d.arrow


--------------------------------------------------------------------------------
/dalm/training/retriever_only/dataset/knowledge_dataset/abstracts/data-00000-of-00001.arrow:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/knowledge_dataset/abstracts/data-00000-of-00001.arrow


--------------------------------------------------------------------------------
/dalm/training/retriever_only/dataset/knowledge_dataset/abstracts/dataset_info.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/knowledge_dataset/abstracts/dataset_info.json


--------------------------------------------------------------------------------
/dalm/training/retriever_only/dataset/knowledge_dataset/abstracts/state.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/knowledge_dataset/abstracts/state.json


--------------------------------------------------------------------------------
/dalm/training/retriever_only/dataset/knowledge_dataset/cache-7d82ce38b8a589b1.arrow:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/knowledge_dataset/cache-7d82ce38b8a589b1.arrow


--------------------------------------------------------------------------------
/dalm/training/retriever_only/dataset/knowledge_dataset/data-00000-of-00001.arrow:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/knowledge_dataset/data-00000-of-00001.arrow


--------------------------------------------------------------------------------
/dalm/training/retriever_only/dataset/knowledge_dataset/dataset_dict.json:
--------------------------------------------------------------------------------
1 | {"splits": ["abstracts"]}


--------------------------------------------------------------------------------
/dalm/training/retriever_only/dataset/knowledge_dataset/dataset_info.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/knowledge_dataset/dataset_info.json


--------------------------------------------------------------------------------
/dalm/training/retriever_only/dataset/knowledge_dataset/state.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/knowledge_dataset/state.json


--------------------------------------------------------------------------------
/dalm/training/retriever_only/dataset/train.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/train.csv


--------------------------------------------------------------------------------
/dalm/training/retriever_only/dataset/valid.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/dataset/valid.csv


--------------------------------------------------------------------------------
/dalm/training/retriever_only/train_retriever_only.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/retriever_only/train_retriever_only.py


--------------------------------------------------------------------------------
/dalm/training/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/dalm/training/utils/rag_e2e_dataloader_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/utils/rag_e2e_dataloader_utils.py


--------------------------------------------------------------------------------
/dalm/training/utils/retriever_only_dataloader_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/utils/retriever_only_dataloader_utils.py


--------------------------------------------------------------------------------
/dalm/training/utils/train_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/training/utils/train_utils.py


--------------------------------------------------------------------------------
/dalm/utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/dalm/utils.py


--------------------------------------------------------------------------------
/experiments/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/README.md


--------------------------------------------------------------------------------
/experiments/llama-index-10k/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-10k/README.md


--------------------------------------------------------------------------------
/experiments/llama-index-10k/data_gen.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-10k/data_gen.py


--------------------------------------------------------------------------------
/experiments/llama-index-10k/lyft_2021.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-10k/lyft_2021.pdf


--------------------------------------------------------------------------------
/experiments/llama-index-10k/requirements.txt:
--------------------------------------------------------------------------------
1 | llama-index
2 | pypdf
3 | 


--------------------------------------------------------------------------------
/experiments/llama-index-10k/uber_2021.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-10k/uber_2021.pdf


--------------------------------------------------------------------------------
/experiments/llama-index-synthetic-data/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-synthetic-data/README.md


--------------------------------------------------------------------------------
/experiments/llama-index-synthetic-data/base_model_results_llama.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-synthetic-data/base_model_results_llama.json


--------------------------------------------------------------------------------
/experiments/llama-index-synthetic-data/eval_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-synthetic-data/eval_utils.py


--------------------------------------------------------------------------------
/experiments/llama-index-synthetic-data/evaluate_llama_model.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-synthetic-data/evaluate_llama_model.py


--------------------------------------------------------------------------------
/experiments/llama-index-synthetic-data/ft_results.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-synthetic-data/ft_results.json


--------------------------------------------------------------------------------
/experiments/llama-index-synthetic-data/model_output_test/config.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-synthetic-data/model_output_test/config.json


--------------------------------------------------------------------------------
/experiments/llama-index-synthetic-data/model_output_test/pytorch_model.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-synthetic-data/model_output_test/pytorch_model.bin


--------------------------------------------------------------------------------
/experiments/llama-index-synthetic-data/prepare_data_for_llama.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-synthetic-data/prepare_data_for_llama.py


--------------------------------------------------------------------------------
/experiments/llama-index-synthetic-data/train_dataset_llama.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/experiments/llama-index-synthetic-data/train_dataset_llama.py


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/pyproject.toml


--------------------------------------------------------------------------------
/resources/general.spm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/resources/general.spm


--------------------------------------------------------------------------------
/tasks.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/tasks.py


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/datasets/reading_comprehension_generation/test_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/tests/datasets/reading_comprehension_generation/test_utils.py


--------------------------------------------------------------------------------
/tests/test_cli.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arcee-ai/DALM/HEAD/tests/test_cli.py


--------------------------------------------------------------------------------
/tests/training/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/training/rag_e2e/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/training/rag_e2e/test_base_model.py:
--------------------------------------------------------------------------------
1 | def test_base_model() -> None:
2 |     assert True
3 | 


--------------------------------------------------------------------------------
/tests/training/rag_e2e/test_train_rage2e.py:
--------------------------------------------------------------------------------
1 | def test_train_e2e() -> None:
2 |     assert True
3 | 


--------------------------------------------------------------------------------
/tests/training/retriever_only/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/training/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------