├── README.md ├── basic-lora-merge.ipynb ├── exllama-fast-inference.ipynb ├── quantize-ggml.ipynb ├── quantize-gguf.ipynb ├── re-shard.ipynb └── weighted-lora-merge.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # LLM-notebooks 2 | Jupyter notebooks for cloud-based usage 3 | -------------------------------------------------------------------------------- /basic-lora-merge.ipynb: -------------------------------------------------------------------------------- 1 | {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"### Basic Lora Merge\nBased on TheBloke's script for lora adapter merge","metadata":{}},{"cell_type":"code","source":"# Install reqs\n%cd /kaggle/\n!pip install -U transformers peft accelerate","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Login to hub\nfrom huggingface_hub import notebook_login\nnotebook_login()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Full repo download model\n\n# Select model\nrepo_id = \"TheBloke/Llama-2-13B-fp16\"\n\n# Select branch\nrevision=\"main\"\n\n# Download model\nfrom huggingface_hub import snapshot_download\nsnapshot_download(repo_id=repo_id, revision=revision, local_dir=f\"./{repo_id.replace('/', '_')}\")\n\nprint(f\"Model dir: './{repo_id.replace('/', '_')}'\")","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Full repo download lora\n\n# Select model\nrepo_id = \"lemonilia/limarp-llama2\"\n\n# Select branch\nrevision=\"main\"\n\n# Download model\nfrom huggingface_hub import 
snapshot_download\nsnapshot_download(repo_id=repo_id, revision=revision, local_dir=f\"./{repo_id.replace('/', '_')}\")\n\nprint(f\"Lora dir: './{repo_id.replace('/', '_')}'\")","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Set variables\nmodel_dir = \"./TheBloke_Llama-2-13B-fp16\"\nlora_dir = \"./lemonilia_limarp-llama2/LIMARP-Llama2-LoRA-adapter-13B\"\n\n# Push to hub vs save files\nrepo_name = \"Limarp-Merged-L2-13b\"\npush_to_hub = False\n\noutput_dir = \"merge\"\n\n# Run merge\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\nfrom peft import PeftModel\nimport torch\nimport shutil\n\nimport os\n\ndef merge_lora(base_model_path, lora_path, do_push):\n offload_model_path = \"./offload\"\n offload_peft_path = \"./offload_peft\"\n shutil.rmtree(offload_model_path, ignore_errors=True)\n shutil.rmtree(offload_peft_path, ignore_errors=True)\n os.makedirs(offload_model_path, exist_ok=True)\n os.makedirs(offload_peft_path, exist_ok=True)\n \n device_map = \"cpu\"\n float_type = torch.float16\n \n base_model = AutoModelForCausalLM.from_pretrained(\n base_model_path,\n return_dict=True,\n torch_dtype=float_type,\n device_map = device_map,\n offload_folder=offload_model_path,\n low_cpu_mem_usage=True\n )\n\n print(f\"Loading PEFT: {lora_path}\")\n model = PeftModel.from_pretrained(base_model, lora_path, torch_dtype=float_type, device_map = device_map, offload_folder=offload_peft_path, low_cpu_mem_usage=True)\n print(f\"Running merge_and_unload\")\n model = model.merge_and_unload()\n\n tokenizer = AutoTokenizer.from_pretrained(base_model_path)\n \n if do_push:\n model.push_to_hub(repo_name, private=True)\n tokenizer.push_to_hub(repo_name, private=True)\n else:\n os.makedirs(output_dir, exist_ok=True)\n model.save_pretrained(output_dir)\n tokenizer.save_pretrained(output_dir)\n print(f\"Model saved to {output_dir}\")\n \nmerge_lora(model_dir, lora_dir, 
push_to_hub)","metadata":{"trusted":true},"execution_count":null,"outputs":[]}]} -------------------------------------------------------------------------------- /exllama-fast-inference.ipynb: -------------------------------------------------------------------------------- 1 | {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.10","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"## ExLlama Fast Inference\nSupports ExLlama WebUI as well as [Oobabooga WebUI API imitation](https://gist.github.com/BlankParenthesis/4f490630b6307ec441364ab64f3ce900)\n\nUp to 34B 4-bit on 2x T4, and 13B 4-bit on 1x P100 or 1x T4\n### Installation","metadata":{}},{"cell_type":"code","source":"# Kaggle\n%cd /kaggle/\n\n# Colab\n# %cd /content/\n\n# Install ExLlama and deps\n!pip install -q --pre torch --index-url https://download.pytorch.org/whl/nightly/cu118\n!pip install -q safetensors sentencepiece ninja\n!pip install -q huggingface_hub\n\n!git clone https://github.com/turboderp/exllama\n%cd exllama\n\n# Install WebUI deps\n!pip install -q flask waitress\n\n# Install deps for Oobabooga WebUI API imitation\n!wget \"https://gist.githubusercontent.com/BlankParenthesis/4f490630b6307ec441364ab64f3ce900/raw/38f4feb8ea2c023907eaacf4a98c645bca2dfe3a/api.py\"\n!pip install -q flask_sock\n\n# Install localtunnel to access Flask/API\n!npm install localtunnel","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Model download\nDownload using HuggingFace repo ID","metadata":{}},{"cell_type":"code","source":"# Login to hub (to access private models)\nfrom huggingface_hub import 
notebook_login\nnotebook_login()","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Full repo download method\n\n# Select model\nrepo_id = \"TheBloke/Chronoboros-33B-GPTQ\"\n#repo_id = \"TheBloke/chronos-33b-GPTQ\"\n#repo_id = \"ausboss/llama-30b-supercot-4bit\"\n#repo_id = \"CalderaAI/30B-Lazarus-GPTQ4bit\"\n\n#repo_id = \"TheBloke/Llama-2-13B-GPTQ\"\n#repo_id = \"TheBloke/chronos-hermes-13B-GPTQ\"\n#repo_id = \"TehVenom/Metharme-13b-4bit-GPTQ\"\n#repo_id = \"TheBloke/Nous-Hermes-13B-GPTQ\"\n\n# Select branch\n#revision=\"main\"\nrevision=\"gptq-4bit-128g-actorder_True\"\n#revision=\"gptq-8bit-128g-actorder_True\"\n\n# Download model\nfrom huggingface_hub import snapshot_download\nsnapshot_download(repo_id=repo_id, revision=revision, local_dir=f\"./{repo_id.replace('/', '_')}\")\n\nimport os\nos.environ[\"MODEL_DIR\"] = f\"{repo_id.replace('/', '_')}\"\n\nprint(f\"Model dir: './{repo_id.replace('/', '_')}'\")","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Old download method - for repos where multiple versions are in the same branch\n\n# Select model\nrepo_id = \"reeducator/bluemoonrp-30b\"\nmodel_filename = \"bluemoonrp-30b-4bit-128g.safetensors\" # From the model repo\n\n# Select branch\nrevision=\"main\"\n\n# Download model\nfrom huggingface_hub import hf_hub_download\nhf_hub_download(repo_id=repo_id, revision=revision, filename=\"config.json\", local_dir=f\"./{repo_id.replace('/', '_')}\")\nhf_hub_download(repo_id=repo_id, revision=revision, filename=\"tokenizer.model\", local_dir=f\"./{repo_id.replace('/', '_')}\")\nhf_hub_download(repo_id=repo_id, revision=revision, filename=model_filename, local_dir=f\"./{repo_id.replace('/', '_')}\")\n\nimport os\nos.environ[\"MODEL_DIR\"] = f\"{repo_id.replace('/', '_')}\"\n\nprint(f\"Model dir: './{repo_id.replace('/', '_')}'\")","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Full repo download 
lora\n\n# Select model\nrepo_id = \"Ruaif/Kimiko_13B\"\n\n# Select branch\nrevision=\"main\"\n\n# Download model\nfrom huggingface_hub import snapshot_download\nsnapshot_download(repo_id=repo_id, revision=revision, local_dir=f\"./{repo_id.replace('/', '_')}\")\n\nimport os\nos.environ[\"LORA_DIR\"] = f\"{repo_id.replace('/', '_')}\"\n\nprint(f\"Lora dir: './{repo_id.replace('/', '_')}'\")","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Delete downloaded model\n!rm -r $MODEL_DIR\n!dir","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Delete downloaded lora\n!rm -r $LORA_DIR\n!dir","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Run inference\nSelect either ExLlama WebUI or Oobabooga WebUI API imitation\n\nArguments:\n\n-d : Path to directory containing config.json, tokenizer.model and * .safetensors **(use $MODEL_DIR)**\n\n--lora : Path to directory containing adapter_config.json and adapter_model.bin **(use $LORA_DIR)**\n\n-gs : Comma-separated list of VRAM (in GB) to use per GPU device for model layers **(recommend 8,11 for 33/34B models on 2x T4, disable on 1x T4 or 1x P100)**\n\n-l : Maximum sequence length **(2048 for llama 1 models and 4096 for llama 2 models, higher for extended context models)**\n\n-a : alpha for context size extension via embedding extension **(leave at 1 for trained context, ~2.5 for 4k context on llama 1 models, 93 for codellama)**\n\n-cpe : Compression factor for positional embeddings **(set to trained value for linear rope scaling models such as superhot or llongma)**","metadata":{}},{"cell_type":"code","source":"# ExLlama WebUI\n# Access localtunnel page and input IP as password\n!curl ipv4.icanhazip.com\n!python ./webui/app.py -d $MODEL_DIR --host \"127.0.0.1:5000\" -gs 8,11 -l 4096 & npx localtunnel --port 5000","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# 
Oobabooga WebUI API imitation\n# Access localtunnel page and input IP as password\n# Standard API: https://X/api\n# Streaming API: ws://X/api/v1/stream\n!curl ipv4.icanhazip.com\n!python api.py -d $MODEL_DIR -gs 8,11 -l 4096 & npx localtunnel --port 5000","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Misc tests","metadata":{}},{"cell_type":"code","source":"# Benchmarking speeds\n!python test_benchmark_inference.py -d $MODEL_DIR -p -gs 8,11 -l 4096","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Benchmarking perplexity\n!python test_benchmark_inference.py -d $MODEL_DIR -ppl -ppl_ds \"./datasets/wikitext2_val_sample.jsonl\" -gs 8,11 -l 4096","metadata":{"trusted":true},"execution_count":null,"outputs":[]}]} -------------------------------------------------------------------------------- /quantize-ggml.ipynb: -------------------------------------------------------------------------------- 1 | {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"### Quantize GGML\nBased on TheBloke's script for ggml conversion and quantization\n\nPinned to final llama.cpp commit for ggmlv3","metadata":{}},{"cell_type":"code","source":"# Install llama.cpp\n%cd /kaggle/\n!git clone https://github.com/ggerganov/llama.cpp\n%cd /kaggle/llama.cpp\n!git checkout dadbed99e65252d79f81101a392d0d6497b86caa\n!make\n%cd /kaggle/","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Login to hub\nfrom huggingface_hub import 
notebook_login\nnotebook_login()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Full repo download model\n\n# Select model\nrepo_id = \"TheBloke/Llama-2-13B-fp16\"\n\n# Select branch\nrevision=\"main\"\n\n# Download model\nfrom huggingface_hub import snapshot_download\nsnapshot_download(repo_id=repo_id, revision=revision, local_dir=f\"./{repo_id.replace('/', '_')}\")\n\nprint(f\"Model dir: './{repo_id.replace('/', '_')}'\")","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# This step is necessary only if your base model is a standard 32000 vocab model AND the uploader accidentally kept added_tokens.json in the repo\n\n# Remove added_tokens.json\n%cd /kaggle/TheBloke_Llama-2-13B-fp16\n%rm added_tokens.json\n%cd /kaggle/","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Set variables\ninput_dir = \"./TheBloke_Llama-2-13B-fp16\"\nbase_model_name = \"Llama-2-13B\"\nremove_fp16 = True\n\n# Run quantize\nimport os\nimport subprocess\n\ndef quantize(model, outbase, outdir):\n llamabase = \"/kaggle/llama.cpp\"\n ggml_version = \"ggmlv3\"\n\n if not os.path.isdir(model):\n raise Exception(f\"Could not find model dir at {model}\")\n\n if not os.path.isfile(f\"{model}/config.json\"):\n raise Exception(f\"Could not find config.json in {model}\")\n\n os.makedirs(outdir, exist_ok=True)\n fp16 = f\"{outdir}/{outbase}.{ggml_version}.fp16.bin\"\n\n print(f\"Making unquantised GGML at {fp16}\")\n if not os.path.isfile(fp16):\n subprocess.run(f\"python {llamabase}/convert.py {model} --outtype f16 --outfile {fp16}\", shell=True, check=True)\n else:\n print(f\"Unquantised GGML already exists at: {fp16}\")\n\n print(\"Making quants\")\n for type in [\"q4_0\", \"q4_K_M\"]:\n outfile = f\"{outdir}/{outbase}.{ggml_version}.{type}.bin\"\n print(f\"Making {type} : {outfile}\")\n subprocess.run(f\"{llamabase}/quantize {fp16} {outfile} {type}\", shell=True, 
check=True)\n \n if remove_fp16:\n os.remove(fp16)\n\nquantize(input_dir, base_model_name, \"quantized\")","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Set variables\nusername = \"username\"\nbase_model_name = \"Llama-2-13B\"\n\n# Push to hub\nfrom huggingface_hub import create_repo, HfApi\napi = HfApi()\n\ncreate_repo(repo_id = f\"{username}/{base_model_name}-GGML\", private = True, repo_type = \"model\", exist_ok = True)\napi.upload_folder(\n folder_path=\"/kaggle/quantized\",\n repo_id=f\"{username}/{base_model_name}-GGML\",\n allow_patterns=f\"{base_model_name}*.bin\"\n)","metadata":{"trusted":true},"execution_count":null,"outputs":[]}]} -------------------------------------------------------------------------------- /quantize-gguf.ipynb: -------------------------------------------------------------------------------- 1 | {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"### Quantize GGUF\nBased on TheBloke's script for ggml conversion and quantization","metadata":{}},{"cell_type":"code","source":"# Install llama.cpp\n%cd /kaggle/\n!git clone https://github.com/ggerganov/llama.cpp\n%cd /kaggle/llama.cpp\n!pip install -r requirements.txt\n!make\n%cd /kaggle/","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Login to hub\nfrom huggingface_hub import notebook_login\nnotebook_login()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Full repo download model\n\n# Select model\nrepo_id = \"TheBloke/Llama-2-13B-fp16\"\n\n# Select branch\nrevision=\"main\"\n\n# Download model\nfrom huggingface_hub 
import snapshot_download\nsnapshot_download(repo_id=repo_id, revision=revision, local_dir=f\"./{repo_id.replace('/', '_')}\")\n\nprint(f\"Model dir: './{repo_id.replace('/', '_')}'\")","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# This step is necessary only if your base model is a standard 32000 vocab model AND the uploader accidentally kept added_tokens.json in the repo\n\n# Remove added_tokens.json\n%cd /kaggle/TheBloke_Llama-2-13B-fp16\n%rm added_tokens.json\n%cd /kaggle/","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Set variables\ninput_dir = \"./TheBloke_Llama-2-13B-fp16\"\nbase_model_name = \"Llama-2-13B\"\nremove_fp16 = True\n\n# Run quantize\nimport os\nimport subprocess\n\ndef quantize(model, outbase, outdir):\n llamabase = \"/kaggle/llama.cpp\"\n ggml_version = \"ggufv1\"\n\n if not os.path.isdir(model):\n raise Exception(f\"Could not find model dir at {model}\")\n\n if not os.path.isfile(f\"{model}/config.json\"):\n raise Exception(f\"Could not find config.json in {model}\")\n\n os.makedirs(outdir, exist_ok=True)\n fp16 = f\"{outdir}/{outbase}.fp16.gguf\"\n\n print(f\"Making unquantised GGUF at {fp16}\")\n if not os.path.isfile(fp16):\n subprocess.run(f\"python {llamabase}/convert.py {model} --outtype f16 --outfile {fp16}\", shell=True, check=True)\n else:\n print(f\"Unquantised GGUF already exists at: {fp16}\")\n\n print(\"Making quants\")\n for type in [\"q4_K_S\", \"q5_K_M\"]:\n outfile = f\"{outdir}/{outbase}.{type}.gguf\"\n print(f\"Making {type} : {outfile}\")\n subprocess.run(f\"{llamabase}/quantize {fp16} {outfile} {type}\", shell=True, check=True)\n \n if remove_fp16:\n os.remove(fp16)\n\nquantize(input_dir, base_model_name, \"quantized\")","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Set variables\nusername = \"username\"\nbase_model_name = \"Llama-2-13B\"\n\n# Push to hub\nfrom 
huggingface_hub import create_repo, HfApi\napi = HfApi()\n\ncreate_repo(repo_id = f\"{username}/{base_model_name}-GGUF\", private = True, repo_type = \"model\", exist_ok = True)\napi.upload_folder(\n folder_path=\"/kaggle/quantized\",\n repo_id=f\"{username}/{base_model_name}-GGUF\",\n allow_patterns=f\"{base_model_name}*.gguf\"\n)","metadata":{"trusted":true},"execution_count":null,"outputs":[]}]} -------------------------------------------------------------------------------- /re-shard.ipynb: -------------------------------------------------------------------------------- 1 | {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"### Re-Shard\nLoad a model and re-upload it with a different shard size and/or float type","metadata":{}},{"cell_type":"code","source":"# Install reqs\n%cd /kaggle/\n!pip install -U transformers","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Login to hub\nfrom huggingface_hub import notebook_login\nnotebook_login()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Full repo download model\n\n# Select model\nrepo_id = \"TheBloke/Llama-2-13B-fp16\"\n\n# Select branch\nrevision=\"main\"\n\n# Download model\nfrom huggingface_hub import snapshot_download\nsnapshot_download(repo_id=repo_id, revision=revision, local_dir=f\"./{repo_id.replace('/', '_')}\")\n\nprint(f\"Model dir: './{repo_id.replace('/', '_')}'\")","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Set variables\nmodel_path = \"./TheBloke_Llama-2-13B-fp16\"\nrepo_name = \"Llama-2-13B\"\nfloat_type = 
None  # placeholder: torch is not imported yet; the real dtype is assigned below (the original raised NameError on a fresh kernel)\nshard_size = \"2GB\"\n\n# Re-shard and push to hub\nimport torch\nfloat_type = torch.float16  # assigned here, after the torch import, so this cell survives Restart & Run All\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\ndevice_map = \"cpu\"\nmodel = AutoModelForCausalLM.from_pretrained(\n    model_path,\n    return_dict=True,\n    torch_dtype=float_type,\n    device_map=device_map\n    )\ntokenizer = AutoTokenizer.from_pretrained(model_path)\nmodel.push_to_hub(repo_name, private=True, max_shard_size=shard_size)\ntokenizer.push_to_hub(repo_name, private=True)","metadata":{"trusted":true},"execution_count":null,"outputs":[]}]} -------------------------------------------------------------------------------- /weighted-lora-merge.ipynb: -------------------------------------------------------------------------------- 1 | {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"## Weighted Lora Merge\nBased on implementation from https://github.com/CoffeeVampir3/ez-trainer and CLI adaption from https://github.com/zarakiquemparte/zaraki-tools","metadata":{}},{"cell_type":"code","source":"# Install reqs\n%cd /kaggle/\n!pip install -U transformers peft accelerate","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Login to hub\nfrom huggingface_hub import notebook_login\nnotebook_login()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Full repo download model\n\n# Select model\nrepo_id = \"TheBloke/Llama-2-13B-fp16\"\n\n# Select branch\nrevision=\"main\"\n\n# Download model\nfrom huggingface_hub import snapshot_download\nsnapshot_download(repo_id=repo_id, revision=revision, local_dir=f\"./{repo_id.replace('/', '_')}\")\n\nprint(f\"Model dir: 
'./{repo_id.replace('/', '_')}'\")","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Full repo download lora\n\n# Select model\nrepo_id = \"lemonilia/limarp-llama2\"\n\n# Select branch\nrevision=\"main\"\n\n# Download model\nfrom huggingface_hub import snapshot_download\nsnapshot_download(repo_id=repo_id, revision=revision, local_dir=f\"./{repo_id.replace('/', '_')}\")\n\nprint(f\"Lora dir: './{repo_id.replace('/', '_')}'\")","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Set variables\nmodel_dir = \"./TheBloke_Llama-2-13B-fp16\"\nlora_dir = \"./lemonilia_limarp-llama2/LIMARP-Llama2-LoRA-adapter-13B\"\nweight = 0.66\n\n# Push to hub vs save files\nrepo_name = \"Limarp-Merged-L2-13b\"\npush_to_hub = False\n\noutput_dir = \"merge\"\n\n# Run merge\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\nfrom peft import PeftModel\nimport torch\nimport shutil\n\nimport os\n\ndef merge(base_model, lora_model, scaling, merge_weight=1.0):\n weights_list = []\n\n # Loop over all parameters\n for name, param in lora_model.named_parameters():\n # If the parameter name ends with '.weight', it's an original weight\n if name.endswith('.weight'):\n # Make sure it's not a lora_A or lora_B weight\n if not any(substring in name for substring in ['lora_A', 'lora_B']):\n # Construct the names of the corresponding lora_A and lora_B weights\n layers = name.split('.')\n try:\n layer = lora_model\n for item in layers[:-1]: # We go until the penultimate item (excluding the 'weight' part)\n if 'lora' in item: # Split further if lora_A or lora_B\n item, lora_item = item.split('_')\n layer = getattr(layer, item)\n layer = getattr(layer, lora_item)\n else:\n layer = getattr(layer, item)\n \n # Try to get lora_A and lora_B weights\n lora_A = getattr(layer, 'lora_A').default.weight\n lora_B = getattr(layer, 'lora_B').default.weight\n\n # Add a tuple to the list with the parameter name as the 
first item\n weights_list.append((name, param.data, lora_A, lora_B))\n\n except AttributeError:\n pass\n #print(f\"Unable to find lora_A or lora_B weights for {name}\")\n\n for (name,weight,a,b) in weights_list:\n ab = b @ a\n weight += ab * scaling * merge_weight\n print(f\"Did thing for layer named {name}\")\n \n #clean lora loading trash\n for name, module in base_model.named_modules():\n if 'lora_A' in dir(module):\n delattr(module, 'lora_A')\n if 'lora_B' in dir(module):\n delattr(module, 'lora_B')\n\ndef get_lora_scaling(lora_model):\n r = lora_model.peft_config[\"default\"].r\n alpha = lora_model.peft_config[\"default\"].lora_alpha\n\n scaling = alpha/r\n return scaling\n\ndef load_model(model_path, lora_path):\n offload_model_path = \"./offload\"\n offload_peft_path = \"./offload_peft\"\n shutil.rmtree(offload_model_path, ignore_errors=True)\n shutil.rmtree(offload_peft_path, ignore_errors=True)\n os.makedirs(offload_model_path, exist_ok=True)\n os.makedirs(offload_peft_path, exist_ok=True)\n\n device_map = \"cpu\"\n float_type = torch.float16\n\n base_model = AutoModelForCausalLM.from_pretrained(\n model_path,\n return_dict=True,\n torch_dtype=float_type,\n device_map = device_map,\n offload_folder=offload_model_path,\n low_cpu_mem_usage=True\n )\n\n print(f\"Loading PEFT: {lora_path}\")\n lora_model = PeftModel.from_pretrained(base_model, lora_path, torch_dtype=float_type, device_map = device_map, offload_folder=offload_peft_path, low_cpu_mem_usage=True)\n \n return base_model, lora_model\n\ndef initiate_model_lora_merge(model_path, lora_path, merge_weight, do_push = False):\n print(model_path)\n print(lora_path)\n\n base_model, lora_model = load_model(model_path, lora_path)\n scaling = get_lora_scaling(lora_model)\n \n print(f\"Lora Scaling: {scaling}\")\n \n merge(base_model, lora_model, scaling, merge_weight=merge_weight)\n \n tokenizer = AutoTokenizer.from_pretrained(model_path)\n \n if do_push:\n base_model.push_to_hub(repo_name, private=True)\n 
tokenizer.push_to_hub(repo_name, private=True)\n else:\n os.makedirs(output_dir, exist_ok=True)\n base_model.save_pretrained(output_dir)\n tokenizer.save_pretrained(output_dir)\n \n print(\"Done merging.\")\n return\n\ninitiate_model_lora_merge(model_dir, lora_dir, weight, push_to_hub)","metadata":{"trusted":true},"execution_count":null,"outputs":[]}]} --------------------------------------------------------------------------------