├── .gitattributes ├── .gitignore ├── Data Collection ├── LargeDataCollector.ipynb ├── britannica │ ├── URLFetcher.py │ ├── __init__.py │ ├── main.py │ ├── queries.py │ ├── requirements.txt │ └── search_queries.json ├── javascript │ ├── customLinkFinder.js │ ├── customSearch.js │ ├── customWebScrapper.js │ ├── googleCustomSearch.js │ ├── sample.js │ └── webDataScrapping.js ├── run_britannica.py ├── run_transcripts.py ├── run_wiki.py ├── wikipedia │ ├── __init__.py │ ├── fetch_urls.py │ ├── main.py │ ├── queries.py │ ├── requirements.txt │ └── search_queries.json └── youtube_transcripts │ ├── __init__.py │ ├── basic.py │ ├── channel_ids.json │ ├── channel_ids_snippet.json │ ├── main.py │ ├── requirements.txt │ ├── snippets.py │ └── version2.py ├── Data Processing ├── append_files.py ├── split_files.py ├── un-parquet.py └── unzip.py ├── Data ├── captions.txt └── training_data.txt ├── LICENSE ├── Models ├── TrainModel.ipynb ├── config.json ├── decoder.py ├── encoder.py ├── lora.py ├── qlora.py ├── run.py ├── sequence.py └── tokenizer.py ├── README.md ├── null.png ├── requirements.txt └── training.md /.gitattributes: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/.gitattributes -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/.gitignore -------------------------------------------------------------------------------- /Data Collection/LargeDataCollector.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/LargeDataCollector.ipynb -------------------------------------------------------------------------------- /Data Collection/britannica/URLFetcher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/britannica/URLFetcher.py -------------------------------------------------------------------------------- /Data Collection/britannica/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/britannica/__init__.py -------------------------------------------------------------------------------- /Data Collection/britannica/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/britannica/main.py -------------------------------------------------------------------------------- /Data Collection/britannica/queries.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/britannica/queries.py -------------------------------------------------------------------------------- /Data Collection/britannica/requirements.txt: -------------------------------------------------------------------------------- 1 | bs4 2 | requests 3 | tqdm 4 | timeit 5 | json 6 | re -------------------------------------------------------------------------------- /Data Collection/britannica/search_queries.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/britannica/search_queries.json -------------------------------------------------------------------------------- /Data Collection/javascript/customLinkFinder.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/javascript/customLinkFinder.js -------------------------------------------------------------------------------- /Data Collection/javascript/customSearch.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/javascript/customSearch.js -------------------------------------------------------------------------------- /Data Collection/javascript/customWebScrapper.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/javascript/customWebScrapper.js -------------------------------------------------------------------------------- /Data Collection/javascript/googleCustomSearch.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/javascript/googleCustomSearch.js -------------------------------------------------------------------------------- /Data Collection/javascript/sample.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/javascript/sample.js -------------------------------------------------------------------------------- /Data Collection/javascript/webDataScrapping.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/javascript/webDataScrapping.js -------------------------------------------------------------------------------- /Data Collection/run_britannica.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/run_britannica.py -------------------------------------------------------------------------------- /Data Collection/run_transcripts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/run_transcripts.py -------------------------------------------------------------------------------- /Data Collection/run_wiki.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/run_wiki.py -------------------------------------------------------------------------------- /Data Collection/wikipedia/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/wikipedia/__init__.py -------------------------------------------------------------------------------- /Data Collection/wikipedia/fetch_urls.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/wikipedia/fetch_urls.py -------------------------------------------------------------------------------- /Data Collection/wikipedia/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/wikipedia/main.py -------------------------------------------------------------------------------- /Data Collection/wikipedia/queries.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/wikipedia/queries.py -------------------------------------------------------------------------------- /Data Collection/wikipedia/requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | bs4 3 | requests -------------------------------------------------------------------------------- /Data Collection/wikipedia/search_queries.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/wikipedia/search_queries.json -------------------------------------------------------------------------------- /Data Collection/youtube_transcripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/youtube_transcripts/__init__.py -------------------------------------------------------------------------------- /Data Collection/youtube_transcripts/basic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/youtube_transcripts/basic.py -------------------------------------------------------------------------------- /Data Collection/youtube_transcripts/channel_ids.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/youtube_transcripts/channel_ids.json -------------------------------------------------------------------------------- /Data Collection/youtube_transcripts/channel_ids_snippet.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/youtube_transcripts/channel_ids_snippet.json -------------------------------------------------------------------------------- /Data Collection/youtube_transcripts/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/youtube_transcripts/main.py -------------------------------------------------------------------------------- /Data Collection/youtube_transcripts/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/youtube_transcripts/requirements.txt -------------------------------------------------------------------------------- /Data Collection/youtube_transcripts/snippets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/youtube_transcripts/snippets.py -------------------------------------------------------------------------------- /Data Collection/youtube_transcripts/version2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Collection/youtube_transcripts/version2.py -------------------------------------------------------------------------------- /Data Processing/append_files.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Processing/append_files.py -------------------------------------------------------------------------------- /Data Processing/split_files.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Processing/split_files.py -------------------------------------------------------------------------------- /Data Processing/un-parquet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Processing/un-parquet.py -------------------------------------------------------------------------------- /Data Processing/unzip.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data Processing/unzip.py -------------------------------------------------------------------------------- /Data/captions.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data/captions.txt -------------------------------------------------------------------------------- /Data/training_data.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Data/training_data.txt -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/LICENSE -------------------------------------------------------------------------------- /Models/TrainModel.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Models/TrainModel.ipynb -------------------------------------------------------------------------------- /Models/config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Models/config.json -------------------------------------------------------------------------------- /Models/decoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Models/decoder.py -------------------------------------------------------------------------------- /Models/encoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Models/encoder.py -------------------------------------------------------------------------------- /Models/lora.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Models/lora.py -------------------------------------------------------------------------------- /Models/qlora.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Models/qlora.py -------------------------------------------------------------------------------- /Models/run.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Models/run.py -------------------------------------------------------------------------------- /Models/sequence.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Models/sequence.py -------------------------------------------------------------------------------- /Models/tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/Models/tokenizer.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/README.md -------------------------------------------------------------------------------- /null.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/null.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/requirements.txt -------------------------------------------------------------------------------- /training.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shivendrra/SmallLanguageModel/HEAD/training.md --------------------------------------------------------------------------------