├── honeypot-server ├── logs │ ├── auth.log │ └── log_2024-05-31_15-09-54.txt ├── README.md ├── main.py ├── server.key.pub ├── llm.py ├── server.key └── ssh_server.py ├── .DS_Store ├── supervised-finetuning ├── model_inference.yaml ├── multi_gpu_sft_script.sh ├── merge_lora_sft.yaml ├── multi_gpu_phi3.yaml ├── multi_gpu_llama3.yaml ├── multi_gpu_codellama.yaml └── README.md ├── notebooks ├── ssh_server.png ├── training_loss.png ├── similarity_score_distribution_140samples.png ├── preprocess.ipynb └── postprocess.ipynb └── README.md /honeypot-server/logs/auth.log: -------------------------------------------------------------------------------- 1 | root:dagshj 2 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI-in-Complex-Systems-Lab/LLM-Honeypot/HEAD/.DS_Store -------------------------------------------------------------------------------- /supervised-finetuning/model_inference.yaml: -------------------------------------------------------------------------------- 1 | model_name_or_path: models/honeypot-llama3-8B 2 | template: llama3 3 | -------------------------------------------------------------------------------- /notebooks/ssh_server.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI-in-Complex-Systems-Lab/LLM-Honeypot/HEAD/notebooks/ssh_server.png -------------------------------------------------------------------------------- /honeypot-server/README.md: -------------------------------------------------------------------------------- 1 | ## Start server 2 | > python main.py 3 | 4 | ## Connect with SSH 5 | > ssh -T -p 2222 "root@localhost" -------------------------------------------------------------------------------- /notebooks/training_loss.png: -------------------------------------------------------------------------------- 
# Entry point for the honeypot: load the LLM once, run a quick sanity
# generation, then hand the model to the SSH server loop.
from ssh_server import start_ssh_server
from llm import LLM

# Relative path to the model weights handed to LLM().
# NOTE(review): presumably the merged model exported by merge_lora_sft.yaml
# (export_dir: models/honeypot-llama3-8B) -- confirm the working directory.
MODEL_NAME = "../models/honeypot-llama3-8B"
llama = LLM(MODEL_NAME)
# One-off generation printed at startup: asks for 'ls -al' with a history of
# one earlier command and a "\n" response.
print(llama.answer('ls -al', ['mkdir test_directory; touch test.txt','\n']))

# Start accepting SSH connections; every session shares this LLM instance.
start_ssh_server(llama)
DO NOT use quantized model or quantization_bit when merging lora adapters 2 | 3 | # model 4 | model_name_or_path: NousResearch/Meta-Llama-3-8B-Instruct 5 | adapter_name_or_path: saves/Custom/lora/honeypot_llama3_v3 6 | template: llama3 7 | finetuning_type: lora 8 | 9 | # export 10 | export_dir: models/honeypot-llama3-8B 11 | export_size: 1 12 | export_device: cuda 13 | export_legacy_format: false 14 | -------------------------------------------------------------------------------- /honeypot-server/server.key.pub: -------------------------------------------------------------------------------- 1 | ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQDsqkBcRScrwI5rbuAbaiEUurBruU6R3s4BW8iDSXCtVlA7/SK9zBNnswjUdJDIzhA4Q/gb1n3q+jp8Ivmn8fQhuqOzfeQY/m5iAxA547/lkYrppwpRuKCUn5uBfwLI2G94BdGUQI2XBYW/foDkSr+j1sYQm3XZJnjkkljQ7F0lWA7bQx5RdcRASn0/LMa+eFqlkFmKqG7AsPHApFXsHs8E+4QGhvdDUifB/99aDCthDA0Jn5OpOXqnZ2mImQcSDwcoHOvTG9CvRY0fBfRBXTOt4Pw+RU3qFzJJHx/8wsoR5yBFJDoTRYt3KpaWyd0zdkhxRVPYN6ePhofCMmCG8CJuJNtBYYuc1pvWMGgNescNMX3qeQc2fT8B3PetMkU1yYAFx3Wquf4mIdbSWr2A+8DYt3lPvKCHEheY8z5+bLQ5aICj89hQoeB8h4DFoapMA0gCDSPm+Oa/p/32f7zUB2f68V76UHA42EKzQXy0XN2B20LqIn7lz5QYuk0QC8khriU= hotal@dyn-169-226-53-44.etec.albany.edu 2 | -------------------------------------------------------------------------------- /supervised-finetuning/multi_gpu_phi3.yaml: -------------------------------------------------------------------------------- 1 | bf16: true 2 | cutoff_len: 1024 3 | dataset: honeypot_logs 4 | dataset_dir: data 5 | do_train: true 6 | finetuning_type: lora 7 | flash_attn: fa2 8 | gradient_accumulation_steps: 8 9 | learning_rate: 5.0e-05 10 | logging_steps: 1 11 | lora_alpha: 16 12 | lora_dropout: 0 13 | lora_rank: 8 14 | lora_target: qkv_proj 15 | lr_scheduler_type: cosine 16 | max_grad_norm: 1.0 17 | max_samples: 100000 18 | model_name_or_path: microsoft/Phi-3-mini-4k-instruct 19 | num_train_epochs: 2.0 20 | optim: adamw_torch 21 | output_dir: saves/Custom/lora/honeypot_phi3 22 | packing: false 23 | 
per_device_train_batch_size: 16 24 | quantization_bit: 8 25 | report_to: none 26 | save_steps: 100 27 | stage: sft 28 | template: phi 29 | warmup_steps: 0 30 | -------------------------------------------------------------------------------- /supervised-finetuning/multi_gpu_llama3.yaml: -------------------------------------------------------------------------------- 1 | bf16: true 2 | cutoff_len: 2048 3 | dataset: honeypot_logs 4 | dataset_dir: data 5 | do_train: true 6 | finetuning_type: lora 7 | flash_attn: fa2 8 | gradient_accumulation_steps: 8 9 | learning_rate: 1.0e-04 10 | logging_steps: 1 11 | lora_alpha: 16 12 | lora_dropout: 0 13 | lora_rank: 8 14 | lora_target: q_proj,v_proj 15 | lr_scheduler_type: cosine 16 | max_grad_norm: 1.0 17 | max_samples: 100000 18 | model_name_or_path: NousResearch/Meta-Llama-3-8B-Instruct 19 | num_train_epochs: 3.0 20 | optim: adamw_torch 21 | output_dir: saves/Custom/lora/honeypot_llama3_v3 22 | packing: false 23 | per_device_train_batch_size: 8 24 | quantization_bit: 8 25 | report_to: none 26 | save_steps: 100 27 | stage: sft 28 | template: llama3 29 | warmup_steps: 0 30 | -------------------------------------------------------------------------------- /supervised-finetuning/multi_gpu_codellama.yaml: -------------------------------------------------------------------------------- 1 | bf16: true 2 | cutoff_len: 1024 3 | dataset: honeypot_logs 4 | dataset_dir: data 5 | do_train: true 6 | finetuning_type: lora 7 | flash_attn: fa2 8 | gradient_accumulation_steps: 8 9 | learning_rate: 5.0e-05 10 | logging_steps: 1 11 | lora_alpha: 16 12 | lora_dropout: 0 13 | lora_rank: 8 14 | lora_target: q_proj,v_proj 15 | lr_scheduler_type: cosine 16 | max_grad_norm: 1.0 17 | max_samples: 100000 18 | model_name_or_path: codellama/CodeLlama-7b-Instruct-hf 19 | num_train_epochs: 2.0 20 | optim: adamw_torch 21 | output_dir: saves/Custom/lora/honeypot_codellama 22 | packing: false 23 | per_device_train_batch_size: 16 24 | quantization_bit: 8 25 | 
class LLM:
    """Wrap a Hugging Face text-generation pipeline that imitates a Linux shell.

    The instance loads the model once and exposes :meth:`answer`, which turns
    a command plus the previous command/response history into a chat prompt
    and returns the generated terminal output.
    """

    def __init__(self, model_name="NousResearch/Meta-Llama-3-8B-Instruct"):
        """Load ``model_name`` into a bfloat16 text-generation pipeline.

        Args:
            model_name: Model identifier or path, used for both the model and
                the tokenizer.
        """
        # Free cached GPU memory before loading the weights.
        gc.collect()
        torch.cuda.empty_cache()
        print("Cleared GPU...")

        self.DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.BASE_MODEL_NAME = model_name
        self.SYSTEM_PROMPT = "You are mimicking a linux server. Respond with what the terminal would respond when a code given. I want you to only reply with the terminal outputs inside one unique code block and nothing else. Do not write any explanations. Do not type any commands unless I instruct you to do so."

        # Model configuration
        self.pipeline = pipeline(
            "text-generation",
            model=self.BASE_MODEL_NAME,
            tokenizer=self.BASE_MODEL_NAME,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device=self.DEVICE,
        )

        print("Loaded Model: ", self.BASE_MODEL_NAME)

    def answer(self, query, log_history=None, max_tokens=4096, temperature=0.01, top_p=0.8):
        """Generate the terminal output for ``query``.

        Args:
            query: The command to respond to.
            log_history: Flat sequence of earlier turns; even indices are sent
                as user messages and odd indices as assistant messages.
                ``None`` (the default) or an empty sequence means no history.
            max_tokens: Passed to the pipeline as ``max_new_tokens``.
            temperature: Sampling temperature passed to the pipeline.
            top_p: Nucleus-sampling threshold passed to the pipeline.

        Returns:
            The generated text after the prompt, with one pair of wrapping
            code fences (or single backticks) removed when present.
        """
        messages = [{"role": "system", "content": self.SYSTEM_PROMPT}]
        # ``None`` replaces the former mutable ``[]`` default.
        for index, item in enumerate(log_history or []):
            role = "user" if index % 2 == 0 else "assistant"
            messages.append({"role": role, "content": item})
        messages.append({"role": "user", "content": query})

        prompt = self.pipeline.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        outputs = self.pipeline(
            prompt,
            max_new_tokens=max_tokens,
            eos_token_id=self.pipeline.tokenizer.eos_token_id,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
        )
        # The pipeline output starts with the prompt; keep only what follows.
        response = outputs[0]["generated_text"][len(prompt):]

        return self._strip_code_fence(response)

    @staticmethod
    def _strip_code_fence(response):
        """Remove one pair of wrapping backtick fences from ``response``.

        Whitespace around the fences is ignored when detecting them, so a
        trailing newline after the closing fence no longer leaves the fences
        in place. Text that is not wrapped in backticks is returned unchanged.
        """
        trimmed = response.strip()
        if trimmed.startswith("```") and trimmed.endswith("```"):
            return trimmed[3:-3]
        if trimmed.startswith("`") and trimmed.endswith("`"):
            return trimmed[1:-1]
        return response
8CyNhveAXRlECNlwWFv36A5Eq/o9bGEJt12SZ45JJY0OxdJVgO20MeUXXEQEp9PyzGvnha 6 | pZBZiqhuwLDxwKRV7B7PBPuEBob3Q1Inwf/fWgwrYQwNCZ+TqTl6p2dpiJkHEg8HKBzr0x 7 | vQr0WNHwX0QV0zreD8PkVN6hcySR8f/MLKEecgRSQ6E0WLdyqWlsndM3ZIcUVT2Denj4aH 8 | wjJghvAibiTbQWGLnNab1jBoDXrHDTF96nkHNn0/Adz3rTJFNcmABcd1qrn+JiHW0lq9gP 9 | vA2Ld5T7yghxIXmPM+fmy0OWiAo/PYUKHgfIeAxaGqTANIAg0j5vjmv6f99n+81Adn+vFe 10 | +lBwONhCs0F8tFzdgdtC6iJ+5c+UGLpNEAvJIa4lAAAFoKiQHGqokBxqAAAAB3NzaC1yc2 11 | EAAAGBAOyqQFxFJyvAjmtu4BtqIRS6sGu5TpHezgFbyINJcK1WUDv9Ir3ME2ezCNR0kMjO 12 | EDhD+BvWfer6Onwi+afx9CG6o7N95Bj+bmIDEDnjv+WRiumnClG4oJSfm4F/AsjYb3gF0Z 13 | RAjZcFhb9+gORKv6PWxhCbddkmeOSSWNDsXSVYDttDHlF1xEBKfT8sxr54WqWQWYqobsCw 14 | 8cCkVewezwT7hAaG90NSJ8H/31oMK2EMDQmfk6k5eqdnaYiZBxIPBygc69Mb0K9FjR8F9E 15 | FdM63g/D5FTeoXMkkfH/zCyhHnIEUkOhNFi3cqlpbJ3TN2SHFFU9g3p4+Gh8IyYIbwIm4k 16 | 20Fhi5zWm9YwaA16xw0xfep5BzZ9PwHc960yRTXJgAXHdaq5/iYh1tJavYD7wNi3eU+8oI 17 | cSF5jzPn5stDlogKPz2FCh4HyHgMWhqkwDSAINI+b45r+n/fZ/vNQHZ/rxXvpQcDjYQrNB 18 | fLRc3YHbQuoifuXPlBi6TRALySGuJQAAAAMBAAEAAAGAXNBwKT+dmxULRarYDShUDPMiWT 19 | z1SVPd6r56JrLYk8Iz+TLPOywbuCGIpvmIBph51/cgCJrYCx1Tbnew/WJwYgH2TWBj4kF2 20 | PM4CwRFGotuvZO7zpxUHNQJbVC/hga8QGDv/82pVnRK3X2BXYDDIc5K5Xq35S85AvzN9zY 21 | HFDe449VifPUc4ThZohCwlTXGZ6zBvTWe896jtRqWsZMu4WLXjE0Q/GJHRsc2NZJ09LfQx 22 | 8MI6gLVgGuL9imqpUTW+0ZQAxLpHl+AHEGJPzm2zMSrjf1qij8ERudKlu3ucuLfJP6oq1Y 23 | TCy7ABOoQ5+tub+rW+27NG4ESwfhokWPgrDyfHr8urbiS01ZzXRbFNNINu+zki6ahQM2p/ 24 | bK+kR0l04P3oaIGzyza5f8S/NzR4FwHRpVfH9S+V1tS6UkEU9RJfXTkR5jDHX44oZHv4mO 25 | ffbAxui7s/Wn4JTPwfKBuTdAJ8l4P2Y5IXPRkCl2lj9cxcc83q2VqlbxJuJEtyyhyBAAAA 26 | wQCGtIhzAtC2UM/6SpSJf0RwXgQij1MI/S+qGDsmwm06PLPtw7cgYmBLdwdapOyap4XYZc 27 | Mgt/Y65ooOW07tSN88XzjZLrRq5JEr2YKkLxRu23ZBET/UNWC9WoSrdkJTu5p150UzYjIx 28 | X5ulTNOwFpB5wNW7pK1Jvpobj7dPtr6wb5+aZixVM/HM95QvvFw18+BBv5Y1jSVKuSQd+t 29 | QpuKLhGB6nArQfLxd3C5WbPJ7JqA8WdlD1zkt51dxOo3qX2W4AAADBAP9pIkijafH+K6U5 30 | aT+bXA0hRRsTYCsN72WFTyt54eJzwZs9kDAeYhZNjJoA3KggGpqM7yC/AQEAUZ1B/qbwvN 31 | 
5rPc9K7M10EtQMesKWeN8FG8CXp6NbRg/bp6sucy/xLJ/o5/zcSBV2H9aQArakNpkQA7vD 32 | 4olu5A2M/oOP3w14gcfz83BiKODESlE12Ob9HbdyUK6+p5wGFYc6mi13cxGPJGYGcuLnm9 33 | 8J+UsJNZ/Xid80/aXqsQa9Tpy8+eNz8QAAAMEA7TYLeI0dltsQHacCuPPEZVhCzHB69a6g 34 | qp/GwZ0hHX9DQiPqImzH9ke0cW0GDS7ybqhckx1jctvc1Q0LK1osvA9dbMu/FVhELe3agI 35 | hIdwljT/zDShyD/nfudvVUGGTN7N0V1SDmhdvQ5Tgp7ZWt8541qTuQGptNdQXcyQwNbQt2 36 | NThCHKQ4nHHMMRX4HJFtUztcCGuKFUuxiwuQNjj80wNww2pr3CkUKVXQ0ZyIHheKlgy74h 37 | zgQ6bYVcUHFcF1AAAAJ2hvdGFsQGR5bi0xNjktMjI2LTUzLTQ0LmV0ZWMuYWxiYW55LmVk 38 | dQECAw== 39 | -----END OPENSSH PRIVATE KEY----- 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LLM Honeypot: Leveraging Large Language Models as Advanced Interactive Honeypot Systems 2 | 3 | Code for our paper "LLM Honeypot: Leveraging Large Language Models as Advanced Interactive Honeypot Systems" published in 2024 IEEE Conference on Communications and Network Security (CNS). 4 | 5 | You can download the paper via: [[IEEE]](https://ieeexplore.ieee.org/iel8/10735442/10735467/10735607.pdf) - [[DOI]](https://doi.org/10.1109/CNS62487.2024.10735607) 6 | 7 | [Dataset](https://huggingface.co/datasets/hotal/honeypot_logs) 8 | 9 | [Finetuned Model](https://huggingface.co/hotal/honeypot-llama3-8B) 10 | 11 | ## Training 12 | 13 | The training and fine-tuning process for the model presented in the paper utilized the [Llama-Factory](https://github.com/hiyouga/LLaMA-Factory) tool. Consequently, specific training scripts are not included in this repository. 14 | 15 | Please be aware that the Llama-Factory repository is actively maintained and frequently updated. The methods or scripts used during our research might require adjustments to remain compatible with the latest versions of Llama-Factory. 16 | 17 | To replicate the model training: 18 | 1. 
Use our custom dataset available on Hugging Face: [hotal/honeypot_logs](https://huggingface.co/datasets/hotal/honeypot_logs). 19 | 2. Follow the Llama-Factory documentation on using custom datasets, which can be found here: [Llama-Factory Custom Data Documentation](https://github.com/hiyouga/LLaMA-Factory/tree/main/data). 20 | 21 | Combining our dataset with the instructions provided by Llama-Factory should offer the most direct path to reproduce the training process. 22 | 23 | ## Running the Model 24 | 25 | The code relevant for running the honeypot server with the trained model can be found within the `/honeypot-server` directory of this repository. 26 | 27 | You can use the pre-trained model available on Hugging Face: [hotal/honeypot-llama3-8B](https://huggingface.co/hotal/honeypot-llama3-8B). 28 | 29 | **Note:** Similar to the training dependencies, the code in `/honeypot-server` may require updates to ensure compatibility with the current versions of dependent libraries (e.g., huggingface, transformers, etc.). Please check library compatibility if you encounter issues. 30 | 31 | ## Abstract 32 | 33 | The rapid evolution of cyber threats necessitates innovative solutions for detecting and analyzing malicious activity. Honeypots, which are decoy systems designed to lure and interact with attackers, have emerged as a critical component in cybersecurity. In this paper, we present a novel approach to creating realistic and interactive honeypot systems using Large Language Models (LLMs). By fine-tuning a pre-trained open-source language model on a diverse dataset of attacker-generated commands and responses, we developed a honeypot capable of sophisticated engagement with attackers. Our methodology involved several key steps: data collection and processing, prompt engineering, model selection, and supervised fine-tuning to optimize the model’s performance. 
class SSHServerHandler(paramiko.ServerInterface):
    """Paramiko server interface that answers shell commands with an LLM.

    Every channel, PTY and shell request is accepted and every password is
    treated as valid; credentials and the command/response transcript are
    written under ``logs/``.
    """

    def __init__(self, llm_model):
        """Store the model used to answer commands and start an empty history."""
        self.event = threading.Event()
        self.llm_model = llm_model
        # Alternating command/response entries passed back to the model.
        self.log_history = []

    def check_channel_request(self, kind, channelID):
        """Accept every channel request regardless of ``kind``."""
        return paramiko.OPEN_SUCCEEDED

    def check_channel_shell_request(self, channel):
        """Accept the shell request and remember the channel."""
        print("Channel", channel)
        self.channel = channel
        return True

    def check_channel_pty_request(self, c, t, w, h, p, ph, m):
        """Accept every PTY request; the terminal parameters are ignored."""
        return True

    def get_allowed_auths(self, username):
        """Advertise password authentication only."""
        return 'password'

    def check_auth_password(self, username, password):
        """Record the credentials in ``LOGFILE`` and accept the login."""
        self.username = username

        # save login info to a file; the lock serialises writers across
        # connection threads and the context managers release the lock and
        # the file handle even if the write fails
        with LOGFILE_LOCK:
            with open(LOGFILE, "a", encoding="utf-8") as logfile_handle:
                print("New login: " + username + ":" + password)
                logfile_handle.write(username + ":" + password + "\n")

        return paramiko.AUTH_SUCCESSFUL

    def handle_shell(self):
        """Run the prompt/command/response loop until the channel ends.

        Each command is answered by the LLM, appended to ``log_history`` and
        to a per-session log file, then sent back to the client. The channel
        is closed and ``event`` is set on every exit path.
        """
        log_filename = f"logs/log_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.txt"

        try:
            while not self.channel.exit_status_ready():
                # Receive user input
                self.channel.sendall(f'{self.username}@localhost:~/ $')
                data = self.channel.recv(1024)
                if not data:
                    # Empty read: the peer closed the stream. Stop instead of
                    # sending an empty command to the model.
                    break
                # Input is untrusted; undecodable bytes must not end the session.
                command = data.decode("utf-8", errors="replace").strip()
                print("CMD:", command)

                # Produce output with LLM
                response = self.llm_model.answer(command, self.log_history)

                # Save the logs
                self.log_history.append(command)
                self.log_history.append(response)
                with open(log_filename, "a", encoding="utf-8") as log_file:
                    log_file.write(f"@CMD: {command}\n@RESP: {response}\n\n")

                # Send response
                self.channel.sendall(f'{response}\n')
        except Exception as e:
            # Boundary of the per-connection thread: report and end the session.
            print("Channel closed:", e)
        finally:
            self.channel.close()
            self.event.set()


def handleConnection(client, llm_model):
    """Serve one accepted client socket as an SSH session.

    Args:
        client: Connected socket returned by ``accept()``.
        llm_model: Model object handed to :class:`SSHServerHandler`.
    """
    transport = paramiko.Transport(client)
    try:
        transport.add_server_key(HOST_KEY)

        server_handler = SSHServerHandler(llm_model)
        transport.start_server(server=server_handler)

        channel = transport.accept()
        if channel is None:
            return

        server_handler.channel = channel
        server_handler.handle_shell()
    except (paramiko.SSHException, EOFError, OSError) as e:
        # Failed negotiations (e.g. scanners that are not SSH clients) are
        # expected; report them without a thread traceback.
        print("ERROR: SSH session failed")
        print(e)
    finally:
        # Always release the transport and its socket.
        transport.close()


def start_ssh_server(llm_model):
    """Listen on ``SSH_PORT`` and serve each connection in its own thread.

    Exits the process with status 1 if the listening socket cannot be set up.

    Args:
        llm_model: Model object shared by all sessions.
    """
    try:
        # The context manager closes the listening socket when the loop is left.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server_socket:
            server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            server_socket.bind(('', SSH_PORT))
            server_socket.listen(100)
            print('Server started...')

            while True:
                try:
                    client_socket, client_addr = server_socket.accept()
                    print(f'New Connection: {client_addr}')
                    threading.Thread(
                        target=handleConnection,
                        args=(client_socket, llm_model),
                    ).start()
                except Exception as e:
                    # Keep accepting even if one connection cannot be started.
                    print("ERROR: Client handling")
                    print(e)

    except Exception as e:
        print("ERROR: Failed to create socket")
        print(e)
        sys.exit(1)
"cell_type": "code", 52 | "execution_count": 12, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/html": [ 58 | "
\n", 59 | "\n", 72 | "\n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | "
systemcommandresponse__index_level_0__
0I want you to understand what every Linux term...chgrp# chgrp\\n> Change group ownership of files and...0
1I want you to understand what every Linux term...more# more\\n> Open a file for interactive reading,...1
2I want you to understand what every Linux term...git-hash-object# git hash-object\\n> Computes the unique hash ...2
3I want you to understand what every Linux term...id# id\\n> Display current user and group identit...3
4I want you to understand what every Linux term...nl# nl\\n> A utility for numbering lines, either ...4
...............
476I want you to understand what every Linux term...sleep# sleep\\n> Delay for a specified amount of tim...476
477I want you to understand what every Linux term...manpath# manpath\\n> Determine the search path for man...477
478I want you to understand what every Linux term...mv# mv\\n> Move or rename files and directories. ...478
479I want you to understand what every Linux term...whereis# whereis\\n> Locate the binary, source, and ma...479
480I want you to understand what every Linux term...git-daemon# git daemon\\n> A really simple server for Git...480
\n", 162 | "

481 rows × 4 columns

\n", 163 | "
" 164 | ], 165 | "text/plain": [ 166 | " system command \\\n", 167 | "0 I want you to understand what every Linux term... chgrp \n", 168 | "1 I want you to understand what every Linux term... more \n", 169 | "2 I want you to understand what every Linux term... git-hash-object \n", 170 | "3 I want you to understand what every Linux term... id \n", 171 | "4 I want you to understand what every Linux term... nl \n", 172 | ".. ... ... \n", 173 | "476 I want you to understand what every Linux term... sleep \n", 174 | "477 I want you to understand what every Linux term... manpath \n", 175 | "478 I want you to understand what every Linux term... mv \n", 176 | "479 I want you to understand what every Linux term... whereis \n", 177 | "480 I want you to understand what every Linux term... git-daemon \n", 178 | "\n", 179 | " response __index_level_0__ \n", 180 | "0 # chgrp\\n> Change group ownership of files and... 0 \n", 181 | "1 # more\\n> Open a file for interactive reading,... 1 \n", 182 | "2 # git hash-object\\n> Computes the unique hash ... 2 \n", 183 | "3 # id\\n> Display current user and group identit... 3 \n", 184 | "4 # nl\\n> A utility for numbering lines, either ... 4 \n", 185 | ".. ... ... \n", 186 | "476 # sleep\\n> Delay for a specified amount of tim... 476 \n", 187 | "477 # manpath\\n> Determine the search path for man... 477 \n", 188 | "478 # mv\\n> Move or rename files and directories. ... 478 \n", 189 | "479 # whereis\\n> Locate the binary, source, and ma... 479 \n", 190 | "480 # git daemon\\n> A really simple server for Git... 480 \n", 191 | "\n", 192 | "[481 rows x 4 columns]" 193 | ] 194 | }, 195 | "execution_count": 12, 196 | "metadata": {}, 197 | "output_type": "execute_result" 198 | } 199 | ], 200 | "source": [ 201 | "df = pd.DataFrame(dataset['train'])\n", 202 | "df" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 17, 208 | "metadata": {}, 209 | "outputs": [ 210 | { 211 | "data": { 212 | "text/html": [ 213 | "
\n", 214 | "\n", 227 | "\n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | "
systemcommandresponse
0You are a linux expert. You understand what ev...chgrp# chgrp\\n> Change group ownership of files and...
1You are a linux expert. You understand what ev...more# more\\n> Open a file for interactive reading,...
2You are a linux expert. You understand what ev...git-hash-object# git hash-object\\n> Computes the unique hash ...
3You are a linux expert. You understand what ev...id# id\\n> Display current user and group identit...
4You are a linux expert. You understand what ev...nl# nl\\n> A utility for numbering lines, either ...
............
476You are a linux expert. You understand what ev...sleep# sleep\\n> Delay for a specified amount of tim...
477You are a linux expert. You understand what ev...manpath# manpath\\n> Determine the search path for man...
478You are a linux expert. You understand what ev...mv# mv\\n> Move or rename files and directories. ...
479You are a linux expert. You understand what ev...whereis# whereis\\n> Locate the binary, source, and ma...
480You are a linux expert. You understand what ev...git-daemon# git daemon\\n> A really simple server for Git...
\n", 305 | "

481 rows × 3 columns

\n", 306 | "
" 307 | ], 308 | "text/plain": [ 309 | " system command \\\n", 310 | "0 You are a linux expert. You understand what ev... chgrp \n", 311 | "1 You are a linux expert. You understand what ev... more \n", 312 | "2 You are a linux expert. You understand what ev... git-hash-object \n", 313 | "3 You are a linux expert. You understand what ev... id \n", 314 | "4 You are a linux expert. You understand what ev... nl \n", 315 | ".. ... ... \n", 316 | "476 You are a linux expert. You understand what ev... sleep \n", 317 | "477 You are a linux expert. You understand what ev... manpath \n", 318 | "478 You are a linux expert. You understand what ev... mv \n", 319 | "479 You are a linux expert. You understand what ev... whereis \n", 320 | "480 You are a linux expert. You understand what ev... git-daemon \n", 321 | "\n", 322 | " response \n", 323 | "0 # chgrp\\n> Change group ownership of files and... \n", 324 | "1 # more\\n> Open a file for interactive reading,... \n", 325 | "2 # git hash-object\\n> Computes the unique hash ... \n", 326 | "3 # id\\n> Display current user and group identit... \n", 327 | "4 # nl\\n> A utility for numbering lines, either ... \n", 328 | ".. ... \n", 329 | "476 # sleep\\n> Delay for a specified amount of tim... \n", 330 | "477 # manpath\\n> Determine the search path for man... \n", 331 | "478 # mv\\n> Move or rename files and directories. ... \n", 332 | "479 # whereis\\n> Locate the binary, source, and ma... \n", 333 | "480 # git daemon\\n> A really simple server for Git... \n", 334 | "\n", 335 | "[481 rows x 3 columns]" 336 | ] 337 | }, 338 | "execution_count": 17, 339 | "metadata": {}, 340 | "output_type": "execute_result" 341 | } 342 | ], 343 | "source": [ 344 | "for i, data in df.iterrows():\n", 345 | " df.loc[i, 'system'] = \"You are a linux expert. 
You understand what every Linux terminal command does and you reply with the explanation when asked.\"\n", 346 | "\n", 347 | "df = df.drop('__index_level_0__', axis=1)\n", 348 | "df" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 19, 354 | "metadata": {}, 355 | "outputs": [ 356 | { 357 | "data": { 358 | "text/html": [ 359 | "
\n", 360 | "\n", 373 | "\n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | "
systemcommandresponse
0You are a linux expert. You understand what ev...w# w\\n> Show who is logged on and what they are...
1You are a linux expert. You understand what ev...ar# ar\\n> Create, modify, and extract from Unix ...
2You are a linux expert. You understand what ev...mv# mv\\n> Move or rename files and directories. ...
3You are a linux expert. You understand what ev...ps# ps\\n> Information about running processes. M...
4You are a linux expert. You understand what ev...ld# ld\\n> Link object files together. More infor...
............
476You are a linux expert. You understand what ev...git-cvsexportcommit# git cvsexportcommit\\n> Export a single `Git`...
477You are a linux expert. You understand what ev...update-alternatives# update-alternatives\\n> A convenient tool for...
478You are a linux expert. You understand what ev...git-credential-store# git credential-store\\n> `git` helper to stor...
479You are a linux expert. You understand what ev...git-credential-cache# git credential-cache\\n> Git helper to tempor...
480You are a linux expert. You understand what ev...git-check-ref-format# git check-ref-format\\n> Checks if a given re...
\n", 451 | "

481 rows × 3 columns

\n", 452 | "
" 453 | ], 454 | "text/plain": [ 455 | " system command \\\n", 456 | "0 You are a linux expert. You understand what ev... w \n", 457 | "1 You are a linux expert. You understand what ev... ar \n", 458 | "2 You are a linux expert. You understand what ev... mv \n", 459 | "3 You are a linux expert. You understand what ev... ps \n", 460 | "4 You are a linux expert. You understand what ev... ld \n", 461 | ".. ... ... \n", 462 | "476 You are a linux expert. You understand what ev... git-cvsexportcommit \n", 463 | "477 You are a linux expert. You understand what ev... update-alternatives \n", 464 | "478 You are a linux expert. You understand what ev... git-credential-store \n", 465 | "479 You are a linux expert. You understand what ev... git-credential-cache \n", 466 | "480 You are a linux expert. You understand what ev... git-check-ref-format \n", 467 | "\n", 468 | " response \n", 469 | "0 # w\\n> Show who is logged on and what they are... \n", 470 | "1 # ar\\n> Create, modify, and extract from Unix ... \n", 471 | "2 # mv\\n> Move or rename files and directories. ... \n", 472 | "3 # ps\\n> Information about running processes. M... \n", 473 | "4 # ld\\n> Link object files together. More infor... \n", 474 | ".. ... \n", 475 | "476 # git cvsexportcommit\\n> Export a single `Git`... \n", 476 | "477 # update-alternatives\\n> A convenient tool for... \n", 477 | "478 # git credential-store\\n> `git` helper to stor... \n", 478 | "479 # git credential-cache\\n> Git helper to tempor... \n", 479 | "480 # git check-ref-format\\n> Checks if a given re... 
\n", 480 | "\n", 481 | "[481 rows x 3 columns]" 482 | ] 483 | }, 484 | "execution_count": 19, 485 | "metadata": {}, 486 | "output_type": "execute_result" 487 | } 488 | ], 489 | "source": [ 490 | "s = df.command.str.len().sort_values(ascending=True).index\n", 491 | "df = df.reindex(s).reset_index(drop=True)\n", 492 | "df" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": 8, 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [ 501 | "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('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')" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 20, 507 | "metadata": {}, 508 | "outputs": [ 509 | { 510 | "name": "stderr", 511 | "output_type": "stream", 512 | "text": [ 513 | "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 958.04ba/s]\n", 514 | "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.49s/it]\n" 515 | ] 516 | }, 517 | { 518 | "data": { 519 | "text/plain": [ 520 | "CommitInfo(commit_url='https://huggingface.co/datasets/hotal/linux_commands/commit/96c38caacb53d0b586cb93731b94e3e0dad1fa10', commit_message='Upload dataset', commit_description='', oid='96c38caacb53d0b586cb93731b94e3e0dad1fa10', pr_url=None, pr_revision=None, pr_num=None)" 521 | ] 522 | }, 523 | "execution_count": 20, 524 | "metadata": {}, 525 | "output_type": "execute_result" 526 | } 527 | ], 528 | "source": [ 529 | "dataset = Dataset.from_pandas(df)\n", 530 | "dataset.push_to_hub(\"hotal/linux_commands\")" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [] 539 | } 540 | ], 541 | "metadata": { 542 | "kernelspec": { 543 | "display_name": "venv", 544 | "language": "python", 545 | "name": "python3" 546 | }, 547 | "language_info": { 548 | "codemirror_mode": { 549 | "name": "ipython", 550 | "version": 3 551 | }, 552 | "file_extension": ".py", 553 | "mimetype": 
"text/x-python", 554 | "name": "python", 555 | "nbconvert_exporter": "python", 556 | "pygments_lexer": "ipython3", 557 | "version": "3.11.8" 558 | } 559 | }, 560 | "nbformat": 4, 561 | "nbformat_minor": 2 562 | } 563 | -------------------------------------------------------------------------------- /notebooks/postprocess.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Upload Model to HF" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from transformers import AutoModel,AutoTokenizer\n", 17 | "\n", 18 | "model = AutoModel.from_pretrained('../models/honeypot-llama3-8B')\n", 19 | "tokenizer = AutoTokenizer.from_pretrained('../models/honeypot-llama3-8B')\n", 20 | "\n", 21 | "model.push_to_hub(\"honeypot-llama3-8B\")\n", 22 | "tokenizer.push_to_hub(\"honeypot-llama3-8B\")" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## LLM Inference" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 29, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "Cleared GPU...\n" 42 | ] 43 | }, 44 | { 45 | "name": "stderr", 46 | "output_type": "stream", 47 | "text": [ 48 | "Loading checkpoint shards: 100%|██████████| 17/17 [00:05<00:00, 3.38it/s]\n", 49 | "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" 50 | ] 51 | }, 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "Loaded Model: ../models/honeypot-llama3-8B\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "import torch\n", 62 | "import gc\n", 63 | "import re\n", 64 | "from transformers import (\n", 65 | " AutoModelForCausalLM,\n", 66 | " AutoTokenizer,\n", 67 | " 
BitsAndBytesConfig,\n", 68 | " pipeline\n", 69 | ")\n", 70 | "\n", 71 | "class LLM:\n", 72 | " def __init__(self, model_name=\"NousResearch/Meta-Llama-3-8B-Instruct\"):\n", 73 | " gc.collect()\n", 74 | " torch.cuda.empty_cache()\n", 75 | " print(\"Cleared GPU...\")\n", 76 | "\n", 77 | " self.DEVICE = \"cuda:1\" if torch.cuda.is_available() else \"cpu\"\n", 78 | " self.BASE_MODEL_NAME = model_name\n", 79 | " self.SYSTEM_PROMPT = \"You are mimicking a linux server. Respond with what the terminal would respond when a code given. I want you to only reply with the terminal outputs inside one unique code block and nothing else. Do not write any explanations. Do not type any commands unless I instruct you to do so.\"\n", 80 | "\n", 81 | " # Model configuration\n", 82 | " self.pipeline = pipeline(\n", 83 | " \"text-generation\",\n", 84 | " model=self.BASE_MODEL_NAME,\n", 85 | " tokenizer=self.BASE_MODEL_NAME,\n", 86 | " model_kwargs={\"torch_dtype\": torch.bfloat16},\n", 87 | " device=self.DEVICE,\n", 88 | " )\n", 89 | "\n", 90 | " print(\"Loaded Model: \", self.BASE_MODEL_NAME)\n", 91 | "\n", 92 | " def answer(self, query, log_history=[], max_tokens=4096, temperature=0.01, top_p=0.8):\n", 93 | "\n", 94 | " message_history = [{\"role\": \"system\", \"content\": self.SYSTEM_PROMPT}]\n", 95 | " if len(log_history) > 0:\n", 96 | " for i, item in enumerate(log_history):\n", 97 | " if i % 2 == 0:\n", 98 | " message_history.append({\"role\": \"user\", \"content\": item})\n", 99 | " else:\n", 100 | " message_history.append({\"role\": \"assistant\", \"content\": item})\n", 101 | "\n", 102 | " user_prompt = message_history + [{\"role\": \"user\", \"content\": query}]\n", 103 | " prompt = self.pipeline.tokenizer.apply_chat_template(\n", 104 | " user_prompt, tokenize=False, add_generation_prompt=True\n", 105 | " )\n", 106 | " outputs = self.pipeline(\n", 107 | " prompt,\n", 108 | " max_new_tokens=max_tokens,\n", 109 | " eos_token_id=self.pipeline.tokenizer.eos_token_id,\n", 110 | " 
do_sample=True,\n", 111 | " temperature=temperature,\n", 112 | " top_p=top_p,\n", 113 | " )\n", 114 | " response = outputs[0][\"generated_text\"][len(prompt):]\n", 115 | "\n", 116 | " # remove unnecessary quotes\n", 117 | " if response.startswith(\"```\") and response.endswith(\"```\"):\n", 118 | " response = response[3:-3]\n", 119 | " elif response.startswith(\"`\") and response.endswith(\"`\"):\n", 120 | " response = response[1:-1]\n", 121 | " \n", 122 | " return response\n", 123 | " \n", 124 | "\n", 125 | "llama = LLM(\"../models/honeypot-llama3-8B\")\n", 126 | "#print(llama.answer('ls -al', ['mkdir test_directory; touch test.txt','\\n']))" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "## Evaluation" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 52, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "from openai import OpenAI\n", 143 | "\n", 144 | "client = OpenAI(base_url=\"http://localhost:1234/v1\", api_key=\"lm-studio\")\n", 145 | "\n", 146 | "\n", 147 | "def measure_correctness(prompt, command, generated_answer, reference_answer):\n", 148 | " try:\n", 149 | " correctness_prompt = f\"\"\"###Task Description: An instruction (might include an Input inside it), a query, a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given. \n", 150 | " 1. Write a detailed feedback that assesses the quality of the response strictly based on the given score rubric, not evaluating in general. \n", 151 | " 2. After writing a feedback, write a score that is either 1 or 2 or 3 or 4 or 5. You should refer to the score rubric. \n", 152 | " 3. The output format should look as follows: \"Feedback: (write a feedback for criteria) [RESULT] (1 or 2 or 3 or 4 or 5)\" \n", 153 | " 4. Please do not generate any other opening, closing, and explanations. \n", 154 | " 5. 
Generated answer and reference answer can be completely different, consider if it makes sense for the given command.\n", 155 | " 6. Error messages in the generated answer are fine as long as the reference answer has error messages too.\n", 156 | "\n", 157 | " ###The instruction to evaluate: Your task is to evaluate the generated answer and reference answer for the query: {prompt}\\nCommand:{command}\n", 158 | "\n", 159 | " ###Generate answer to evaluate: {generated_answer} \n", 160 | "\n", 161 | " ###Reference Answer (Score 5): {reference_answer}\n", 162 | "\n", 163 | " ###Score Rubrics: \n", 164 | " Score 1: The response is completely irrelevant to the given command.\n", 165 | " Score 2: The response executes the command but provides incomplete or inaccurate information.\n", 166 | " Score 3: The response provides some accurate information but lacks detail or has minor errors.\n", 167 | " Score 4: The response provides mostly accurate and complete information but has some minor errors or omissions.\n", 168 | " Score 5: The response provides a complete and accurate response to the given command, identical to what a Linux terminal would output.\n", 169 | " \n", 170 | " ###Feedback:\"\"\"\n", 171 | "\n", 172 | " completion = client.chat.completions.create(\n", 173 | " model=\"RichardErkhov/prometheus-eval_-_prometheus-7b-v2.0-gguf\",\n", 174 | " messages=[\n", 175 | " {\"role\": \"system\", \"content\": \"You are a language model evaluator. 
Respond according to the given task.\"},\n", 176 | " {\"role\": \"user\", \"content\": correctness_prompt}\n", 177 | " ],\n", 178 | " temperature=0.01,\n", 179 | " )\n", 180 | "\n", 181 | " return int(completion.choices[0].message.content.split('[RESULT] ')[1])\n", 182 | " except:\n", 183 | " return -1" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 53, 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "data": { 193 | "text/plain": [ 194 | "2" 195 | ] 196 | }, 197 | "execution_count": 53, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "prompt = 'You are mimicking a linux server. Respond with what the terminal would respond when a code given. I want you to only reply with the terminal outputs inside one unique code block and nothing else. Do not write any explanations. Do not type any commands unless I instruct you to do so.'\n", 204 | "command = 'ls'\n", 205 | "generated_answer = 'bin boot dev etc home '\n", 206 | "reference_answer = 'bin boot dev etc home lib lib64 lost+found media mnt opt proc root run sbin srv sys tmp usr var'\n", 207 | "\n", 208 | "measure_correctness(prompt, command, generated_answer, reference_answer)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 72, 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "name": "stderr", 218 | "output_type": "stream", 219 | "text": [ 220 | "12it [00:28, 6.20s/it]" 221 | ] 222 | }, 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "Skipping row 9 due to timeout (> 15 seconds)\n" 228 | ] 229 | }, 230 | { 231 | "name": "stderr", 232 | "output_type": "stream", 233 | "text": [ 234 | "16it [00:45, 6.26s/it]" 235 | ] 236 | }, 237 | { 238 | "name": "stdout", 239 | "output_type": "stream", 240 | "text": [ 241 | "Skipping row 132 due to timeout (> 15 seconds)\n" 242 | ] 243 | }, 244 | { 245 | "name": "stderr", 246 | "output_type": "stream", 247 | "text": [ 248 | 
"42it [01:32, 5.52s/it]" 249 | ] 250 | }, 251 | { 252 | "name": "stdout", 253 | "output_type": "stream", 254 | "text": [ 255 | "Skipping row 42 due to timeout (> 15 seconds)\n" 256 | ] 257 | }, 258 | { 259 | "name": "stderr", 260 | "output_type": "stream", 261 | "text": [ 262 | "60it [02:28, 9.42s/it]" 263 | ] 264 | }, 265 | { 266 | "name": "stdout", 267 | "output_type": "stream", 268 | "text": [ 269 | "Skipping row 179 due to timeout (> 15 seconds)\n" 270 | ] 271 | }, 272 | { 273 | "name": "stderr", 274 | "output_type": "stream", 275 | "text": [ 276 | "67it [02:58, 6.99s/it]" 277 | ] 278 | }, 279 | { 280 | "name": "stdout", 281 | "output_type": "stream", 282 | "text": [ 283 | "Skipping row 60 due to timeout (> 15 seconds)\n" 284 | ] 285 | }, 286 | { 287 | "name": "stderr", 288 | "output_type": "stream", 289 | "text": [ 290 | "76it [03:24, 5.86s/it]" 291 | ] 292 | }, 293 | { 294 | "name": "stdout", 295 | "output_type": "stream", 296 | "text": [ 297 | "Skipping row 177 due to timeout (> 15 seconds)\n" 298 | ] 299 | }, 300 | { 301 | "name": "stderr", 302 | "output_type": "stream", 303 | "text": [ 304 | "90it [03:50, 3.73s/it]" 305 | ] 306 | }, 307 | { 308 | "name": "stdout", 309 | "output_type": "stream", 310 | "text": [ 311 | "Skipping row 185 due to timeout (> 15 seconds)\n" 312 | ] 313 | }, 314 | { 315 | "name": "stderr", 316 | "output_type": "stream", 317 | "text": [ 318 | "142it [05:04, 4.91s/it]" 319 | ] 320 | }, 321 | { 322 | "name": "stdout", 323 | "output_type": "stream", 324 | "text": [ 325 | "Skipping row 221 due to timeout (> 15 seconds)\n" 326 | ] 327 | }, 328 | { 329 | "name": "stderr", 330 | "output_type": "stream", 331 | "text": [ 332 | "150it [05:30, 2.20s/it]" 333 | ] 334 | }, 335 | { 336 | "name": "stdout", 337 | "output_type": "stream", 338 | "text": [ 339 | "Skipping row 222 due to timeout (> 15 seconds)\n" 340 | ] 341 | }, 342 | { 343 | "name": "stderr", 344 | "output_type": "stream", 345 | "text": [ 346 | "\n" 347 | ] 348 | }, 349 | { 350 | 
"data": { 351 | "text/html": [ 352 | "
\n", 353 | "\n", 366 | "\n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | "
systemcommandresponsegenerated
25You are mimicking a linux server. Respond with...psPID TTY TIME COMMAND ...\\n PID TTY TIME CMD\\n 1234 pts/0 ...
309You are mimicking a linux server. Respond with...uptime -u18:42:29 up 3:14, 1 user, load average: 0....\\n 00:00:00\\n
73You are mimicking a linux server. Respond with...chmod u+rw /home/phil/file_or_directory1,/home...chmod: cannot access '/home/phil/file_or_direc...\\nchmod: cannot access '/home/phil/file_or_dir...
195You are mimicking a linux server. Respond with...mv /home/phil/source/myfile.txt /home/phil/tar...mv: cannot create regular file `/home/phil/tar...mv: cannot move '/home/phil/source/myfile.txt'...
57You are mimicking a linux server. Respond with...cat /test2ELF\u0002\u0001\u0001cat: /test2: No such file or directory
...............
283You are mimicking a linux server. Respond with...uname -huname: invalid option -- 'h'\\nTry 'uname --hel...Linux
148You are mimicking a linux server. Respond with...gcc -Wall -O2 -o path/to/output_executable pat...gcc: path/to/source.cpp: No such file or direc...\\ngcc: error: path/to/output_executable: No su...
291You are mimicking a linux server. Respond with...uptime18:42:08 up 3:13, 1 user, load average: 0....\\n 21:34:12 up 2 days, 1:04, 1 user, load a...
239You are mimicking a linux server. Respond with...rmdir -p /home/phil/directory1 /home/phil/dire...rmdir: failed to remove `-p': No such file or ...\\nrmdir: failed to remove `/home/phil/director...
222You are mimicking a linux server. Respond with...ps auxUSER PID %CPU %MEM VSZ ...None
\n", 456 | "

150 rows × 4 columns

\n", 457 | "
" 458 | ], 459 | "text/plain": [ 460 | " system \\\n", 461 | "25 You are mimicking a linux server. Respond with... \n", 462 | "309 You are mimicking a linux server. Respond with... \n", 463 | "73 You are mimicking a linux server. Respond with... \n", 464 | "195 You are mimicking a linux server. Respond with... \n", 465 | "57 You are mimicking a linux server. Respond with... \n", 466 | ".. ... \n", 467 | "283 You are mimicking a linux server. Respond with... \n", 468 | "148 You are mimicking a linux server. Respond with... \n", 469 | "291 You are mimicking a linux server. Respond with... \n", 470 | "239 You are mimicking a linux server. Respond with... \n", 471 | "222 You are mimicking a linux server. Respond with... \n", 472 | "\n", 473 | " command \\\n", 474 | "25 ps \n", 475 | "309 uptime -u \n", 476 | "73 chmod u+rw /home/phil/file_or_directory1,/home... \n", 477 | "195 mv /home/phil/source/myfile.txt /home/phil/tar... \n", 478 | "57 cat /test2 \n", 479 | ".. ... \n", 480 | "283 uname -h \n", 481 | "148 gcc -Wall -O2 -o path/to/output_executable pat... \n", 482 | "291 uptime \n", 483 | "239 rmdir -p /home/phil/directory1 /home/phil/dire... \n", 484 | "222 ps aux \n", 485 | "\n", 486 | " response \\\n", 487 | "25 PID TTY TIME COMMAND ... \n", 488 | "309 18:42:29 up 3:14, 1 user, load average: 0.... \n", 489 | "73 chmod: cannot access '/home/phil/file_or_direc... \n", 490 | "195 mv: cannot create regular file `/home/phil/tar... \n", 491 | "57 ELF\u0002\u0001\u0001 \n", 492 | ".. ... \n", 493 | "283 uname: invalid option -- 'h'\\nTry 'uname --hel... \n", 494 | "148 gcc: path/to/source.cpp: No such file or direc... \n", 495 | "291 18:42:08 up 3:13, 1 user, load average: 0.... \n", 496 | "239 rmdir: failed to remove `-p': No such file or ... \n", 497 | "222 USER PID %CPU %MEM VSZ ... \n", 498 | "\n", 499 | " generated \n", 500 | "25 \\n PID TTY TIME CMD\\n 1234 pts/0 ... \n", 501 | "309 \\n 00:00:00\\n \n", 502 | "73 \\nchmod: cannot access '/home/phil/file_or_dir... 
\n", 503 | "195 mv: cannot move '/home/phil/source/myfile.txt'... \n", 504 | "57 cat: /test2: No such file or directory \n", 505 | ".. ... \n", 506 | "283 Linux \n", 507 | "148 \\ngcc: error: path/to/output_executable: No su... \n", 508 | "291 \\n 21:34:12 up 2 days, 1:04, 1 user, load a... \n", 509 | "239 \\nrmdir: failed to remove `/home/phil/director... \n", 510 | "222 None \n", 511 | "\n", 512 | "[150 rows x 4 columns]" 513 | ] 514 | }, 515 | "execution_count": 72, 516 | "metadata": {}, 517 | "output_type": "execute_result" 518 | } 519 | ], 520 | "source": [ 521 | "import pandas as pd\n", 522 | "from datasets import load_dataset\n", 523 | "from tqdm import tqdm\n", 524 | "import time\n", 525 | "import spacy\n", 526 | "import signal\n", 527 | "nlp = spacy.load(\"en_core_web_sm\")\n", 528 | "\n", 529 | "dataset = load_dataset(\"hotal/honeypot_logs\")\n", 530 | "df = pd.DataFrame(dataset['train'])[:334].sample(n=150, random_state=42)\n", 531 | "\n", 532 | "def timeout_handler(signum, frame):\n", 533 | " raise TimeoutError\n", 534 | "\n", 535 | "for index, row in tqdm(df.iterrows()):\n", 536 | " signal.signal(signal.SIGALRM, timeout_handler)\n", 537 | " signal.alarm(15) # 15 seconds\n", 538 | " try:\n", 539 | " df.loc[index, 'generated'] = llama.answer(row['command'], [])\n", 540 | " except TimeoutError:\n", 541 | " print(f\"Skipping row {index} due to timeout (> 15 seconds)\")\n", 542 | " df.loc[index, 'generated'] = None\n", 543 | " finally:\n", 544 | " signal.alarm(0) # reset the alarm\n", 545 | "\n", 546 | "df\n" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": 78, 552 | "metadata": {}, 553 | "outputs": [ 554 | { 555 | "name": "stderr", 556 | "output_type": "stream", 557 | "text": [ 558 | "0it [00:00, ?it/s]/tmp/ipykernel_1638383/3606016577.py:5: UserWarning: [W007] The model you're using has no word vectors loaded, so the result of the Doc.similarity method will be based on the tagger, parser and NER, which may not give useful 
similarity judgements. This may happen if you're using one of the small models, e.g. `en_core_web_sm`, which don't ship with word vectors and only use context-sensitive tensors. You can always add your own word vectors, or use one of the larger models instead if available.\n", 559 | " similarity = doc1.similarity(doc2)\n", 560 | "150it [00:01, 130.00it/s]\n" 561 | ] 562 | }, 563 | { 564 | "data": { 565 | "text/plain": [ 566 | "[0.9266226889377728,\n", 567 | " 0.7483038693015535,\n", 568 | " 0.9854740744166057,\n", 569 | " 0.8750943762180343,\n", 570 | " 0.1222527798910181,\n", 571 | " 0.9599760967911799,\n", 572 | " 0.15790873052795754,\n", 573 | " 0.3427842031666325,\n", 574 | " 0.8901137910677026,\n", 575 | " 0.9250525849039357,\n", 576 | " 0.6146497515738819,\n", 577 | " 0.6783382553049272,\n", 578 | " 0.9551248995922451,\n", 579 | " 0.627599031453704,\n", 580 | " 0.8775345504719663,\n", 581 | " 0.8153339463875003,\n", 582 | " 0.43039500631938543,\n", 583 | " 0.4415520404516888,\n", 584 | " 0.6499677098298162,\n", 585 | " 0.7572771114785681,\n", 586 | " 0.7478363709157725,\n", 587 | " 0.517494247868626,\n", 588 | " 0.5214024835335245,\n", 589 | " 0.5217871483441932,\n", 590 | " 0.4151627115886709,\n", 591 | " 0.7464344056291038,\n", 592 | " 0.5545979492340203,\n", 593 | " 0.7469779958000291,\n", 594 | " 0.3084552815614088,\n", 595 | " 0.9770636258190092,\n", 596 | " 0.42111358743318955,\n", 597 | " 0.508898890243933,\n", 598 | " 0.4691406705359161,\n", 599 | " 0.8500015026444628,\n", 600 | " 0.7013135151145201,\n", 601 | " 0.9176107960715111,\n", 602 | " 0.8260988634215954,\n", 603 | " 0.47781991339817054,\n", 604 | " 0.41822584352315856,\n", 605 | " 0.40500443370038985,\n", 606 | " 0.6149629113333346,\n", 607 | " 0.672761011302793,\n", 608 | " 0.9757408913151834,\n", 609 | " 0.7929636702502307,\n", 610 | " 0.28980344659973345,\n", 611 | " 0.5850302585113578,\n", 612 | " 0.5611672372311479,\n", 613 | " 0.8562951543701436,\n", 614 | " 0.950702976797263,\n", 615 | 
" 0.91938663377379,\n", 616 | " 0.9457809333641432,\n", 617 | " 0.4807638058305509,\n", 618 | " 0.9092084540315909,\n", 619 | " 0.6367839109198675,\n", 620 | " 0.9812164222549482,\n", 621 | " 0.8186939002754022,\n", 622 | " 0.9771496763658669,\n", 623 | " 0.7486600345735321,\n", 624 | " 0.41910924509250236,\n", 625 | " 0.4091024332843984,\n", 626 | " 0.3836090746137256,\n", 627 | " 0.9759682589885805,\n", 628 | " 0.44465350719130636,\n", 629 | " 0.6120472117886202,\n", 630 | " 0.9789453902284974,\n", 631 | " 0.7153902543235714,\n", 632 | " 0.7426386062855006,\n", 633 | " 0.9744701036083163,\n", 634 | " 0.6558226640808129,\n", 635 | " 0.9532569160570045,\n", 636 | " 0.6264219005218772,\n", 637 | " 0.468927884859053,\n", 638 | " 0.5790208651494848,\n", 639 | " 0.9093797194028849,\n", 640 | " 0.8619641459279357,\n", 641 | " 0.40928139596157587,\n", 642 | " 0.05515600619347201,\n", 643 | " 0.5654284362318464,\n", 644 | " 0.705511556698048,\n", 645 | " 0.7566292069891919,\n", 646 | " 0.5345443710608845,\n", 647 | " 0.9750915300170969,\n", 648 | " 0.2812566174949539,\n", 649 | " 0.9016269139383067,\n", 650 | " 0.7629358538450524,\n", 651 | " 0.9885831370195848,\n", 652 | " 0.8946553398557442,\n", 653 | " 0.40125993982842567,\n", 654 | " 0.9216778170046728,\n", 655 | " 0.3842899033080288,\n", 656 | " 0.9172229385791985,\n", 657 | " 0.5771070729807252,\n", 658 | " 0.34359980264799184,\n", 659 | " 0.8792366989444065,\n", 660 | " 0.9686253387818091,\n", 661 | " 0.8772259641843925,\n", 662 | " 0.28082071452590873,\n", 663 | " 0.6648062023302003,\n", 664 | " 0.7091523172314665,\n", 665 | " 0.9308563527991272,\n", 666 | " 0.6379564087919317,\n", 667 | " 0.2941533246890943,\n", 668 | " 0.40961384119132643,\n", 669 | " 0.9830999518848895,\n", 670 | " 0.8573340272480829,\n", 671 | " 0.6096735436269064,\n", 672 | " 0.867486599832943,\n", 673 | " 0.3184837440305669,\n", 674 | " 0.3161855948383834,\n", 675 | " 0.9135009372705184,\n", 676 | " 0.8137313915768604,\n", 677 | " 
0.8123733670376505,\n", 678 | " 0.9667515554007875,\n", 679 | " 0.8935746811734981,\n", 680 | " 0.8542122675981001,\n", 681 | " 0.9885607732671146,\n", 682 | " 0.9491577734752112,\n", 683 | " 0.6577316278883858,\n", 684 | " 0.8955741072535015,\n", 685 | " 0.35920635435693815,\n", 686 | " 0.9752386936788853,\n", 687 | " 0.7163785906107277,\n", 688 | " 0.7413162555648729,\n", 689 | " 0.76000288795456,\n", 690 | " 0.8606838733019466,\n", 691 | " 0.6496282700594403,\n", 692 | " 0.40762294957370837,\n", 693 | " 0.8426753638284821,\n", 694 | " 0.9070421984574294,\n", 695 | " 0.9451967510573314,\n", 696 | " 0.07869086945968735,\n", 697 | " 0.5564012886490949,\n", 698 | " 0.9267505365363619,\n", 699 | " 0.6127896978889558,\n", 700 | " 0.41232448670824684,\n", 701 | " 0.6403345808255426,\n", 702 | " 0.2019148676242682,\n", 703 | " 0.929285189521436,\n", 704 | " 0.9783733409331761,\n", 705 | " 0.906493423110178]" 706 | ] 707 | }, 708 | "execution_count": 78, 709 | "metadata": {}, 710 | "output_type": "execute_result" 711 | } 712 | ], 713 | "source": [ 714 | "def measure_similarity(sentence1, sentence2):\n", 715 | " try:\n", 716 | " doc1 = nlp(sentence1)\n", 717 | " doc2 = nlp(sentence2)\n", 718 | " similarity = doc1.similarity(doc2)\n", 719 | " return similarity\n", 720 | " except Exception as e:\n", 721 | " return -1\n", 722 | "\n", 723 | "results = []\n", 724 | "for index, row in tqdm(df.iterrows()):\n", 725 | " \n", 726 | " prompt = row['system']\n", 727 | " command = row['command']\n", 728 | " reference_answer = row['response']\n", 729 | " generated_answer = row['generated']\n", 730 | "\n", 731 | " #result = measure_correctness(prompt, command, generated_answer, reference_answer)\n", 732 | " result = measure_similarity(generated_answer, reference_answer)\n", 733 | " if result != -1:\n", 734 | " results.append(result)\n", 735 | " #time.sleep(0.05)\n", 736 | "\n", 737 | "results" 738 | ] 739 | }, 740 | { 741 | "cell_type": "markdown", 742 | "metadata": {}, 743 | "source": 
[ 744 | "## Visualizations" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": 113, 750 | "metadata": {}, 751 | "outputs": [ 752 | { 753 | "data": { 754 | "text/plain": [ 755 | "(0.6885405904371525, 140)" 756 | ] 757 | }, 758 | "execution_count": 113, 759 | "metadata": {}, 760 | "output_type": "execute_result" 761 | } 762 | ], 763 | "source": [ 764 | "import numpy as np\n", 765 | "np.array(results).mean(), len(results)" 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": 146, 771 | "metadata": {}, 772 | "outputs": [ 773 | { 774 | "data": { 775 | "image/png": "", 776 | "text/plain": [ 777 | "
" 778 | ] 779 | }, 780 | "metadata": {}, 781 | "output_type": "display_data" 782 | } 783 | ], 784 | "source": [ 785 | "import matplotlib.pyplot as plt\n", 786 | " \n", 787 | "n, bins = np.histogram(results, bins=np.arange(0, 1.1, 0.1))\n", 788 | "plt.figure(figsize=(10, 6), dpi=150)\n", 789 | "plt.bar(bins[:-1], n, width=0.1, align='edge', edgecolor='black')\n", 790 | "plt.xticks(np.arange(0, 1.1, 0.1))\n", 791 | " \n", 792 | "plt.xlabel('Cosine Similarity Score')\n", 793 | "plt.ylabel('Frequency')\n", 794 | "\n", 795 | "#plt.title('Similarity Score Distribution between Generated and Reference Output ', fontweight = \"bold\")\n", 796 | "plt.tight_layout()\n", 797 | "plt.savefig('similarity_score_distribution_140samples.png')\n", 798 | "plt.show()\n" 799 | ] 800 | }, 801 | { 802 | "cell_type": "code", 803 | "execution_count": 145, 804 | "metadata": {}, 805 | "outputs": [ 806 | { 807 | "data": { 808 | "image/png": "", 809 | "text/plain": [ 810 | "
" 811 | ] 812 | }, 813 | "metadata": {}, 814 | "output_type": "display_data" 815 | } 816 | ], 817 | "source": [ 818 | "logs = [\n", 819 | " {\"current_steps\": 1, \"total_steps\": 36, \"loss\": 1.7841, \"learning_rate\": 0.0004990486745229364, \"epoch\": 0.1, \"percentage\": 2.78, \"elapsed_time\": \"0:00:18\", \"remaining_time\": \"0:11:00\"},\n", 820 | "{\"current_steps\": 2, \"total_steps\": 36, \"loss\": 1.6008, \"learning_rate\": 0.000496201938253052, \"epoch\": 0.2, \"percentage\": 5.56, \"elapsed_time\": \"0:00:35\", \"remaining_time\": \"0:09:59\"},\n", 821 | "{\"current_steps\": 3, \"total_steps\": 36, \"loss\": 1.2327, \"learning_rate\": 0.0004914814565722671, \"epoch\": 0.3, \"percentage\": 8.33, \"elapsed_time\": \"0:00:56\", \"remaining_time\": \"0:10:26\"},\n", 822 | "{\"current_steps\": 4, \"total_steps\": 36, \"loss\": 1.4636, \"learning_rate\": 0.0004849231551964771, \"epoch\": 0.41, \"percentage\": 11.11, \"elapsed_time\": \"0:01:08\", \"remaining_time\": \"0:09:09\"},\n", 823 | "{\"current_steps\": 5, \"total_steps\": 36, \"loss\": 1.4043, \"learning_rate\": 0.00047657694675916254, \"epoch\": 0.51, \"percentage\": 13.89, \"elapsed_time\": \"0:01:22\", \"remaining_time\": \"0:08:31\"},\n", 824 | "{\"current_steps\": 6, \"total_steps\": 36, \"loss\": 1.1577, \"learning_rate\": 0.00046650635094610973, \"epoch\": 0.61, \"percentage\": 16.67, \"elapsed_time\": \"0:01:51\", \"remaining_time\": \"0:09:18\"},\n", 825 | "{\"current_steps\": 7, \"total_steps\": 36, \"loss\": 1.1639, \"learning_rate\": 0.00045478801107224796, \"epoch\": 0.71, \"percentage\": 19.44, \"elapsed_time\": \"0:02:05\", \"remaining_time\": \"0:08:40\"},\n", 826 | "{\"current_steps\": 8, \"total_steps\": 36, \"loss\": 1.001, \"learning_rate\": 0.0004415111107797445, \"epoch\": 0.81, \"percentage\": 22.22, \"elapsed_time\": \"0:02:22\", \"remaining_time\": \"0:08:18\"},\n", 827 | "{\"current_steps\": 9, \"total_steps\": 36, \"loss\": 1.0497, \"learning_rate\": 0.00042677669529663686, 
\"epoch\": 0.91, \"percentage\": 25.0, \"elapsed_time\": \"0:02:37\", \"remaining_time\": \"0:07:53\"},\n", 828 | "{\"current_steps\": 10, \"total_steps\": 36, \"loss\": 0.9004, \"learning_rate\": 0.0004106969024216348, \"epoch\": 1.01, \"percentage\": 27.78, \"elapsed_time\": \"0:03:00\", \"remaining_time\": \"0:07:49\"},\n", 829 | "{\"current_steps\": 11, \"total_steps\": 36, \"loss\": 0.9483, \"learning_rate\": 0.00039339410908776154, \"epoch\": 1.11, \"percentage\": 30.56, \"elapsed_time\": \"0:03:13\", \"remaining_time\": \"0:07:19\"},\n", 830 | "{\"current_steps\": 12, \"total_steps\": 36, \"loss\": 0.8806, \"learning_rate\": 0.000375, \"epoch\": 1.22, \"percentage\": 33.33, \"elapsed_time\": \"0:03:28\", \"remaining_time\": \"0:06:57\"},\n", 831 | "{\"current_steps\": 13, \"total_steps\": 36, \"loss\": 1.0342, \"learning_rate\": 0.00035565456543517487, \"epoch\": 1.32, \"percentage\": 36.11, \"elapsed_time\": \"0:03:43\", \"remaining_time\": \"0:06:35\"},\n", 832 | "{\"current_steps\": 14, \"total_steps\": 36, \"loss\": 0.8794, \"learning_rate\": 0.0003355050358314172, \"epoch\": 1.42, \"percentage\": 38.89, \"elapsed_time\": \"0:04:06\", \"remaining_time\": \"0:06:26\"},\n", 833 | "{\"current_steps\": 15, \"total_steps\": 36, \"loss\": 0.767, \"learning_rate\": 0.00031470476127563017, \"epoch\": 1.52, \"percentage\": 41.67, \"elapsed_time\": \"0:04:27\", \"remaining_time\": \"0:06:14\"},\n", 834 | "{\"current_steps\": 16, \"total_steps\": 36, \"loss\": 0.8725, \"learning_rate\": 0.00029341204441673266, \"epoch\": 1.62, \"percentage\": 44.44, \"elapsed_time\": \"0:04:41\", \"remaining_time\": \"0:05:51\"},\n", 835 | "{\"current_steps\": 17, \"total_steps\": 36, \"loss\": 0.7652, \"learning_rate\": 0.0002717889356869146, \"epoch\": 1.72, \"percentage\": 47.22, \"elapsed_time\": \"0:05:09\", \"remaining_time\": \"0:05:45\"},\n", 836 | "{\"current_steps\": 18, \"total_steps\": 36, \"loss\": 0.9222, \"learning_rate\": 0.00025, \"epoch\": 1.82, \"percentage\": 
50.0, \"elapsed_time\": \"0:05:34\", \"remaining_time\": \"0:05:34\"},\n", 837 | "{\"current_steps\": 19, \"total_steps\": 36, \"loss\": 0.8331, \"learning_rate\": 0.00022821106431308543, \"epoch\": 1.92, \"percentage\": 52.78, \"elapsed_time\": \"0:05:49\", \"remaining_time\": \"0:05:12\"},\n", 838 | "{\"current_steps\": 20, \"total_steps\": 36, \"loss\": 0.9051, \"learning_rate\": 0.00020658795558326743, \"epoch\": 2.03, \"percentage\": 55.56, \"elapsed_time\": \"0:06:02\", \"remaining_time\": \"0:04:49\"},\n", 839 | "{\"current_steps\": 21, \"total_steps\": 36, \"loss\": 0.7643, \"learning_rate\": 0.0001852952387243698, \"epoch\": 2.13, \"percentage\": 58.33, \"elapsed_time\": \"0:06:16\", \"remaining_time\": \"0:04:28\"},\n", 840 | "{\"current_steps\": 22, \"total_steps\": 36, \"loss\": 0.7853, \"learning_rate\": 0.00016449496416858284, \"epoch\": 2.23, \"percentage\": 61.11, \"elapsed_time\": \"0:06:29\", \"remaining_time\": \"0:04:07\"},\n", 841 | "{\"current_steps\": 23, \"total_steps\": 36, \"loss\": 0.8496, \"learning_rate\": 0.0001443454345648252, \"epoch\": 2.33, \"percentage\": 63.89, \"elapsed_time\": \"0:06:53\", \"remaining_time\": \"0:03:53\"},\n", 842 | "{\"current_steps\": 24, \"total_steps\": 36, \"loss\": 0.8802, \"learning_rate\": 0.00012500000000000006, \"epoch\": 2.43, \"percentage\": 66.67, \"elapsed_time\": \"0:07:07\", \"remaining_time\": \"0:03:33\"},\n", 843 | "{\"current_steps\": 25, \"total_steps\": 36, \"loss\": 0.8165, \"learning_rate\": 0.00010660589091223854, \"epoch\": 2.53, \"percentage\": 69.44, \"elapsed_time\": \"0:07:21\", \"remaining_time\": \"0:03:14\"},\n", 844 | "{\"current_steps\": 26, \"total_steps\": 36, \"loss\": 0.9055, \"learning_rate\": 8.930309757836516e-05, \"epoch\": 2.63, \"percentage\": 72.22, \"elapsed_time\": \"0:07:38\", \"remaining_time\": \"0:02:56\"},\n", 845 | "{\"current_steps\": 27, \"total_steps\": 36, \"loss\": 0.7184, \"learning_rate\": 7.322330470336314e-05, \"epoch\": 2.73, \"percentage\": 75.0, 
\"elapsed_time\": \"0:08:09\", \"remaining_time\": \"0:02:43\"},\n", 846 | "{\"current_steps\": 28, \"total_steps\": 36, \"loss\": 0.8326, \"learning_rate\": 5.848888922025553e-05, \"epoch\": 2.84, \"percentage\": 77.78, \"elapsed_time\": \"0:08:30\", \"remaining_time\": \"0:02:25\"},\n", 847 | "{\"current_steps\": 29, \"total_steps\": 36, \"loss\": 0.8122, \"learning_rate\": 4.521198892775202e-05, \"epoch\": 2.94, \"percentage\": 80.56, \"elapsed_time\": \"0:08:52\", \"remaining_time\": \"0:02:08\"},\n", 848 | "{\"current_steps\": 30, \"total_steps\": 36, \"loss\": 0.8105, \"learning_rate\": 3.3493649053890325e-05, \"epoch\": 3.04, \"percentage\": 83.33, \"elapsed_time\": \"0:09:07\", \"remaining_time\": \"0:01:49\"},\n", 849 | "{\"current_steps\": 31, \"total_steps\": 36, \"loss\": 0.7703, \"learning_rate\": 2.3423053240837516e-05, \"epoch\": 3.14, \"percentage\": 86.11, \"elapsed_time\": \"0:09:22\", \"remaining_time\": \"0:01:30\"},\n", 850 | "{\"current_steps\": 32, \"total_steps\": 36, \"loss\": 0.8548, \"learning_rate\": 1.5076844803522921e-05, \"epoch\": 3.24, \"percentage\": 88.89, \"elapsed_time\": \"0:09:37\", \"remaining_time\": \"0:01:12\"},\n", 851 | "{\"current_steps\": 33, \"total_steps\": 36, \"loss\": 0.8, \"learning_rate\": 8.51854342773295e-06, \"epoch\": 3.34, \"percentage\": 91.67, \"elapsed_time\": \"0:09:59\", \"remaining_time\": \"0:00:54\"},\n", 852 | "{\"current_steps\": 34, \"total_steps\": 36, \"loss\": 0.7628, \"learning_rate\": 3.798061746947995e-06, \"epoch\": 3.44, \"percentage\": 94.44, \"elapsed_time\": \"0:10:17\", \"remaining_time\": \"0:00:36\"},\n", 853 | "{\"current_steps\": 35, \"total_steps\": 36, \"loss\": 0.7972, \"learning_rate\": 9.513254770636137e-07, \"epoch\": 3.54, \"percentage\": 97.22, \"elapsed_time\": \"0:10:31\", \"remaining_time\": \"0:00:18\"},\n", 854 | "{\"current_steps\": 36, \"total_steps\": 36, \"loss\": 0.7941, \"learning_rate\": 0.0, \"epoch\": 3.65, \"percentage\": 100.0, \"elapsed_time\": 
\"0:10:45\", \"remaining_time\": \"0:00:00\"}\n", 855 | "]\n", 856 | "\n", 857 | "import numpy as np\n", 858 | "import matplotlib.pyplot as plt\n", 859 | "from scipy.interpolate import interp1d\n", 860 | "\n", 861 | "\n", 862 | "plt.figure(figsize=(10, 6), dpi=150)\n", 863 | "losses = np.array([x[\"loss\"] for x in logs])\n", 864 | "steps = range(1, len(logs)+1)\n", 865 | "\n", 866 | "plt.plot(steps, losses, label='Losses')\n", 867 | "plt.scatter(steps, losses)\n", 868 | "\n", 869 | "f_cubic = interp1d(steps, losses)\n", 870 | "xnew = np.linspace(1, 36, num=10, endpoint=True)\n", 871 | "plt.plot(xnew, f_cubic(xnew), '--', label='Smoothed', c='r')\n", 872 | "\n", 873 | "\n", 874 | "plt.xlabel('Steps')\n", 875 | "plt.ylabel('Training Loss')\n", 876 | "plt.legend(loc='best')\n", 877 | "plt.tight_layout()\n", 878 | "plt.savefig('training_loss.png')\n", 879 | "plt.show()\n" 880 | ] 881 | }, 882 | { 883 | "cell_type": "code", 884 | "execution_count": null, 885 | "metadata": {}, 886 | "outputs": [], 887 | "source": [] 888 | } 889 | ], 890 | "metadata": { 891 | "kernelspec": { 892 | "display_name": "python3-11", 893 | "language": "python", 894 | "name": "python3" 895 | }, 896 | "language_info": { 897 | "codemirror_mode": { 898 | "name": "ipython", 899 | "version": 3 900 | }, 901 | "file_extension": ".py", 902 | "mimetype": "text/x-python", 903 | "name": "python", 904 | "nbconvert_exporter": "python", 905 | "pygments_lexer": "ipython3", 906 | "version": "3.11.8" 907 | } 908 | }, 909 | "nbformat": 4, 910 | "nbformat_minor": 2 911 | } 912 | --------------------------------------------------------------------------------