├── honeypot-server
├── logs
│ ├── auth.log
│ └── log_2024-05-31_15-09-54.txt
├── README.md
├── main.py
├── server.key.pub
├── llm.py
├── server.key
└── ssh_server.py
├── .DS_Store
├── supervised-finetuning
├── model_inference.yaml
├── multi_gpu_sft_script.sh
├── merge_lora_sft.yaml
├── multi_gpu_phi3.yaml
├── multi_gpu_llama3.yaml
├── multi_gpu_codellama.yaml
└── README.md
├── notebooks
├── ssh_server.png
├── training_loss.png
├── similarity_score_distribution_140samples.png
├── preprocess.ipynb
└── postprocess.ipynb
└── README.md
/honeypot-server/logs/auth.log:
--------------------------------------------------------------------------------
1 | root:dagshj
2 |
--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI-in-Complex-Systems-Lab/LLM-Honeypot/HEAD/.DS_Store
--------------------------------------------------------------------------------
/supervised-finetuning/model_inference.yaml:
--------------------------------------------------------------------------------
1 | model_name_or_path: models/honeypot-llama3-8B
2 | template: llama3
3 |
--------------------------------------------------------------------------------
/notebooks/ssh_server.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI-in-Complex-Systems-Lab/LLM-Honeypot/HEAD/notebooks/ssh_server.png
--------------------------------------------------------------------------------
/honeypot-server/README.md:
--------------------------------------------------------------------------------
1 | ## Start server
2 | > python main.py
3 |
4 | ## Connect with SSH
5 | > ssh -T -p 2222 "root@localhost"
--------------------------------------------------------------------------------
/notebooks/training_loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI-in-Complex-Systems-Lab/LLM-Honeypot/HEAD/notebooks/training_loss.png
--------------------------------------------------------------------------------
/notebooks/similarity_score_distribution_140samples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI-in-Complex-Systems-Lab/LLM-Honeypot/HEAD/notebooks/similarity_score_distribution_140samples.png
--------------------------------------------------------------------------------
/honeypot-server/logs/log_2024-05-31_15-09-54.txt:
--------------------------------------------------------------------------------
1 | @CMD: ls
2 | @RESP: ```
3 | bin desktop documents downloads music pictures public videos
4 | ```
5 |
6 | @CMD:
7 | @RESP: ```
8 | bash: command not found
9 | ```
10 |
11 |
--------------------------------------------------------------------------------
/honeypot-server/main.py:
--------------------------------------------------------------------------------
# Entry script: build the LLM wrapper, run one sample query, then start the
# SSH server with that same wrapper instance.
from ssh_server import start_ssh_server
from llm import LLM

# Model location handed to LLM(); a relative filesystem path.
MODEL_NAME = "../models/honeypot-llama3-8B"
llama = LLM(MODEL_NAME)

# One-off sample query (with a short two-entry history) whose reply is printed
# before the server begins accepting connections.
print(
    llama.answer(
        'ls -al',
        ['mkdir test_directory; touch test.txt', '\n'],
    )
)

# Hand control to the accept loop in ssh_server.
start_ssh_server(llama)
--------------------------------------------------------------------------------
/supervised-finetuning/multi_gpu_sft_script.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Launch one training run per config, in order, on GPUs 0 and 1.
# A failing run does not stop the following ones.
for config in multi_gpu_phi3.yaml multi_gpu_llama3.yaml multi_gpu_codellama.yaml; do
    CUDA_VISIBLE_DEVICES=0,1 accelerate launch src/train.py "$config"
done
--------------------------------------------------------------------------------
/supervised-finetuning/merge_lora_sft.yaml:
--------------------------------------------------------------------------------
1 | # Note: DO NOT use quantized model or quantization_bit when merging lora adapters
2 |
3 | # model
4 | model_name_or_path: NousResearch/Meta-Llama-3-8B-Instruct
5 | adapter_name_or_path: saves/Custom/lora/honeypot_llama3_v3
6 | template: llama3
7 | finetuning_type: lora
8 |
9 | # export
10 | export_dir: models/honeypot-llama3-8B
11 | export_size: 1
12 | export_device: cuda
13 | export_legacy_format: false
14 |
--------------------------------------------------------------------------------
/honeypot-server/server.key.pub:
--------------------------------------------------------------------------------
1 | ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQDsqkBcRScrwI5rbuAbaiEUurBruU6R3s4BW8iDSXCtVlA7/SK9zBNnswjUdJDIzhA4Q/gb1n3q+jp8Ivmn8fQhuqOzfeQY/m5iAxA547/lkYrppwpRuKCUn5uBfwLI2G94BdGUQI2XBYW/foDkSr+j1sYQm3XZJnjkkljQ7F0lWA7bQx5RdcRASn0/LMa+eFqlkFmKqG7AsPHApFXsHs8E+4QGhvdDUifB/99aDCthDA0Jn5OpOXqnZ2mImQcSDwcoHOvTG9CvRY0fBfRBXTOt4Pw+RU3qFzJJHx/8wsoR5yBFJDoTRYt3KpaWyd0zdkhxRVPYN6ePhofCMmCG8CJuJNtBYYuc1pvWMGgNescNMX3qeQc2fT8B3PetMkU1yYAFx3Wquf4mIdbSWr2A+8DYt3lPvKCHEheY8z5+bLQ5aICj89hQoeB8h4DFoapMA0gCDSPm+Oa/p/32f7zUB2f68V76UHA42EKzQXy0XN2B20LqIn7lz5QYuk0QC8khriU= hotal@dyn-169-226-53-44.etec.albany.edu
2 |
--------------------------------------------------------------------------------
/supervised-finetuning/multi_gpu_phi3.yaml:
--------------------------------------------------------------------------------
1 | bf16: true
2 | cutoff_len: 1024
3 | dataset: honeypot_logs
4 | dataset_dir: data
5 | do_train: true
6 | finetuning_type: lora
7 | flash_attn: fa2
8 | gradient_accumulation_steps: 8
9 | learning_rate: 5.0e-05
10 | logging_steps: 1
11 | lora_alpha: 16
12 | lora_dropout: 0
13 | lora_rank: 8
14 | lora_target: qkv_proj
15 | lr_scheduler_type: cosine
16 | max_grad_norm: 1.0
17 | max_samples: 100000
18 | model_name_or_path: microsoft/Phi-3-mini-4k-instruct
19 | num_train_epochs: 2.0
20 | optim: adamw_torch
21 | output_dir: saves/Custom/lora/honeypot_phi3
22 | packing: false
23 | per_device_train_batch_size: 16
24 | quantization_bit: 8
25 | report_to: none
26 | save_steps: 100
27 | stage: sft
28 | template: phi
29 | warmup_steps: 0
30 |
--------------------------------------------------------------------------------
/supervised-finetuning/multi_gpu_llama3.yaml:
--------------------------------------------------------------------------------
1 | bf16: true
2 | cutoff_len: 2048
3 | dataset: honeypot_logs
4 | dataset_dir: data
5 | do_train: true
6 | finetuning_type: lora
7 | flash_attn: fa2
8 | gradient_accumulation_steps: 8
9 | learning_rate: 1.0e-04
10 | logging_steps: 1
11 | lora_alpha: 16
12 | lora_dropout: 0
13 | lora_rank: 8
14 | lora_target: q_proj,v_proj
15 | lr_scheduler_type: cosine
16 | max_grad_norm: 1.0
17 | max_samples: 100000
18 | model_name_or_path: NousResearch/Meta-Llama-3-8B-Instruct
19 | num_train_epochs: 3.0
20 | optim: adamw_torch
21 | output_dir: saves/Custom/lora/honeypot_llama3_v3
22 | packing: false
23 | per_device_train_batch_size: 8
24 | quantization_bit: 8
25 | report_to: none
26 | save_steps: 100
27 | stage: sft
28 | template: llama3
29 | warmup_steps: 0
30 |
--------------------------------------------------------------------------------
/supervised-finetuning/multi_gpu_codellama.yaml:
--------------------------------------------------------------------------------
1 | bf16: true
2 | cutoff_len: 1024
3 | dataset: honeypot_logs
4 | dataset_dir: data
5 | do_train: true
6 | finetuning_type: lora
7 | flash_attn: fa2
8 | gradient_accumulation_steps: 8
9 | learning_rate: 5.0e-05
10 | logging_steps: 1
11 | lora_alpha: 16
12 | lora_dropout: 0
13 | lora_rank: 8
14 | lora_target: q_proj,v_proj
15 | lr_scheduler_type: cosine
16 | max_grad_norm: 1.0
17 | max_samples: 100000
18 | model_name_or_path: codellama/CodeLlama-7b-Instruct-hf
19 | num_train_epochs: 2.0
20 | optim: adamw_torch
21 | output_dir: saves/Custom/lora/honeypot_codellama
22 | packing: false
23 | per_device_train_batch_size: 16
24 | quantization_bit: 8
25 | report_to: none
26 | save_steps: 100
27 | stage: sft
28 | template: llama2
29 | warmup_steps: 0
30 |
--------------------------------------------------------------------------------
/supervised-finetuning/README.md:
--------------------------------------------------------------------------------
1 | > conda activate python3-11
2 |
3 | ### Single-GPU:
4 |
5 | > CUDA_VISIBLE_DEVICES=0 GRADIO_SHARE=0 llamafactory-cli webui
6 |
7 | - Update data/dataset_info.json
8 |
9 |
10 | ### Multi-GPU:
11 |
12 | > sh multi_gpu_sft_script.sh
13 |
14 | > CUDA_VISIBLE_DEVICES=0,1 accelerate launch src/train.py multi_gpu_llama3.yaml
15 |
16 |
17 | ### Merge:
18 |
19 | > CUDA_VISIBLE_DEVICES=0 llamafactory-cli export merge_lora_sft.yaml
20 |
21 |
22 |
23 | #### Prompt:
24 |
25 | You are mimicking a linux server. Respond with what the terminal would respond when a code given. I want you to only reply with the terminal outputs inside one unique code block and nothing else. Do not write any explanations. Do not type any commands unless I instruct you to do so.
--------------------------------------------------------------------------------
/honeypot-server/llm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import gc
3 | import re
4 | from transformers import (
5 | AutoModelForCausalLM,
6 | AutoTokenizer,
7 | BitsAndBytesConfig,
8 | pipeline
9 | )
10 |
class LLM:
    """Chat wrapper around a Hugging Face text-generation pipeline.

    The wrapper prepends a fixed system prompt asking the model to behave like
    a Linux terminal, replays the previous command/response pairs as chat
    history, and returns the model's reply with any surrounding Markdown code
    fence removed.
    """

    def __init__(self, model_name="NousResearch/Meta-Llama-3-8B-Instruct"):
        """Load the model and tokenizer named by ``model_name``.

        Args:
            model_name: Model identifier or path passed to the pipeline as
                both the model and the tokenizer.
        """
        # Release whatever is reclaimable before loading the model.
        gc.collect()
        torch.cuda.empty_cache()
        print("Cleared GPU...")

        self.DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.BASE_MODEL_NAME = model_name
        self.SYSTEM_PROMPT = "You are mimicking a linux server. Respond with what the terminal would respond when a code given. I want you to only reply with the terminal outputs inside one unique code block and nothing else. Do not write any explanations. Do not type any commands unless I instruct you to do so."

        # Model configuration
        self.pipeline = pipeline(
            "text-generation",
            model=self.BASE_MODEL_NAME,
            tokenizer=self.BASE_MODEL_NAME,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device=self.DEVICE,
        )

        print("Loaded Model: ", self.BASE_MODEL_NAME)

    def answer(self, query, log_history=None, max_tokens=4096, temperature=0.01, top_p=0.8):
        """Generate the terminal output for ``query``.

        Args:
            query: The command text to answer.
            log_history: Optional flat sequence of earlier turns, alternating
                command, response, command, response, ... Even positions are
                replayed as user messages and odd positions as assistant
                messages. ``None`` (the default) means no history.
            max_tokens: Upper bound on newly generated tokens.
            temperature: Sampling temperature passed to the pipeline.
            top_p: Nucleus-sampling threshold passed to the pipeline.

        Returns:
            The generated text following the prompt, with a surrounding code
            fence (triple or single backticks) removed when present.
        """
        messages = [{"role": "system", "content": self.SYSTEM_PROMPT}]
        # ``None`` default instead of a shared mutable ``[]``.
        for position, item in enumerate(log_history or ()):
            role = "user" if position % 2 == 0 else "assistant"
            messages.append({"role": role, "content": item})
        messages.append({"role": "user", "content": query})

        prompt = self.pipeline.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        outputs = self.pipeline(
            prompt,
            max_new_tokens=max_tokens,
            eos_token_id=self.pipeline.tokenizer.eos_token_id,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
        )
        # The pipeline output starts with the prompt itself; keep only what
        # comes after it.
        response = outputs[0]["generated_text"][len(prompt):]

        return self._strip_code_fence(response)

    @staticmethod
    def _strip_code_fence(response):
        """Remove a Markdown code fence wrapped around ``response``.

        Whitespace around the fence is ignored when detecting it, so a reply
        such as ``"```\\nfoo\\n```\\n"`` is recognised. For a triple-backtick
        fence the newlines directly inside the fence are dropped as well.
        Text that is not fenced is returned unchanged.
        """
        trimmed = response.strip()
        if trimmed.startswith("```") and trimmed.endswith("```"):
            return trimmed[3:-3].strip("\n")
        if trimmed.startswith("`") and trimmed.endswith("`"):
            return trimmed[1:-1]
        return response
63 |
--------------------------------------------------------------------------------
/honeypot-server/server.key:
--------------------------------------------------------------------------------
1 | -----BEGIN OPENSSH PRIVATE KEY-----
2 | b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAABlwAAAAdzc2gtcn
3 | NhAAAAAwEAAQAAAYEA7KpAXEUnK8COa27gG2ohFLqwa7lOkd7OAVvIg0lwrVZQO/0ivcwT
4 | Z7MI1HSQyM4QOEP4G9Z96vo6fCL5p/H0Ibqjs33kGP5uYgMQOeO/5ZGK6acKUbiglJ+bgX
5 | 8CyNhveAXRlECNlwWFv36A5Eq/o9bGEJt12SZ45JJY0OxdJVgO20MeUXXEQEp9PyzGvnha
6 | pZBZiqhuwLDxwKRV7B7PBPuEBob3Q1Inwf/fWgwrYQwNCZ+TqTl6p2dpiJkHEg8HKBzr0x
7 | vQr0WNHwX0QV0zreD8PkVN6hcySR8f/MLKEecgRSQ6E0WLdyqWlsndM3ZIcUVT2Denj4aH
8 | wjJghvAibiTbQWGLnNab1jBoDXrHDTF96nkHNn0/Adz3rTJFNcmABcd1qrn+JiHW0lq9gP
9 | vA2Ld5T7yghxIXmPM+fmy0OWiAo/PYUKHgfIeAxaGqTANIAg0j5vjmv6f99n+81Adn+vFe
10 | +lBwONhCs0F8tFzdgdtC6iJ+5c+UGLpNEAvJIa4lAAAFoKiQHGqokBxqAAAAB3NzaC1yc2
11 | EAAAGBAOyqQFxFJyvAjmtu4BtqIRS6sGu5TpHezgFbyINJcK1WUDv9Ir3ME2ezCNR0kMjO
12 | EDhD+BvWfer6Onwi+afx9CG6o7N95Bj+bmIDEDnjv+WRiumnClG4oJSfm4F/AsjYb3gF0Z
13 | RAjZcFhb9+gORKv6PWxhCbddkmeOSSWNDsXSVYDttDHlF1xEBKfT8sxr54WqWQWYqobsCw
14 | 8cCkVewezwT7hAaG90NSJ8H/31oMK2EMDQmfk6k5eqdnaYiZBxIPBygc69Mb0K9FjR8F9E
15 | FdM63g/D5FTeoXMkkfH/zCyhHnIEUkOhNFi3cqlpbJ3TN2SHFFU9g3p4+Gh8IyYIbwIm4k
16 | 20Fhi5zWm9YwaA16xw0xfep5BzZ9PwHc960yRTXJgAXHdaq5/iYh1tJavYD7wNi3eU+8oI
17 | cSF5jzPn5stDlogKPz2FCh4HyHgMWhqkwDSAINI+b45r+n/fZ/vNQHZ/rxXvpQcDjYQrNB
18 | fLRc3YHbQuoifuXPlBi6TRALySGuJQAAAAMBAAEAAAGAXNBwKT+dmxULRarYDShUDPMiWT
19 | z1SVPd6r56JrLYk8Iz+TLPOywbuCGIpvmIBph51/cgCJrYCx1Tbnew/WJwYgH2TWBj4kF2
20 | PM4CwRFGotuvZO7zpxUHNQJbVC/hga8QGDv/82pVnRK3X2BXYDDIc5K5Xq35S85AvzN9zY
21 | HFDe449VifPUc4ThZohCwlTXGZ6zBvTWe896jtRqWsZMu4WLXjE0Q/GJHRsc2NZJ09LfQx
22 | 8MI6gLVgGuL9imqpUTW+0ZQAxLpHl+AHEGJPzm2zMSrjf1qij8ERudKlu3ucuLfJP6oq1Y
23 | TCy7ABOoQ5+tub+rW+27NG4ESwfhokWPgrDyfHr8urbiS01ZzXRbFNNINu+zki6ahQM2p/
24 | bK+kR0l04P3oaIGzyza5f8S/NzR4FwHRpVfH9S+V1tS6UkEU9RJfXTkR5jDHX44oZHv4mO
25 | ffbAxui7s/Wn4JTPwfKBuTdAJ8l4P2Y5IXPRkCl2lj9cxcc83q2VqlbxJuJEtyyhyBAAAA
26 | wQCGtIhzAtC2UM/6SpSJf0RwXgQij1MI/S+qGDsmwm06PLPtw7cgYmBLdwdapOyap4XYZc
27 | Mgt/Y65ooOW07tSN88XzjZLrRq5JEr2YKkLxRu23ZBET/UNWC9WoSrdkJTu5p150UzYjIx
28 | X5ulTNOwFpB5wNW7pK1Jvpobj7dPtr6wb5+aZixVM/HM95QvvFw18+BBv5Y1jSVKuSQd+t
29 | QpuKLhGB6nArQfLxd3C5WbPJ7JqA8WdlD1zkt51dxOo3qX2W4AAADBAP9pIkijafH+K6U5
30 | aT+bXA0hRRsTYCsN72WFTyt54eJzwZs9kDAeYhZNjJoA3KggGpqM7yC/AQEAUZ1B/qbwvN
31 | 5rPc9K7M10EtQMesKWeN8FG8CXp6NbRg/bp6sucy/xLJ/o5/zcSBV2H9aQArakNpkQA7vD
32 | 4olu5A2M/oOP3w14gcfz83BiKODESlE12Ob9HbdyUK6+p5wGFYc6mi13cxGPJGYGcuLnm9
33 | 8J+UsJNZ/Xid80/aXqsQa9Tpy8+eNz8QAAAMEA7TYLeI0dltsQHacCuPPEZVhCzHB69a6g
34 | qp/GwZ0hHX9DQiPqImzH9ke0cW0GDS7ybqhckx1jctvc1Q0LK1osvA9dbMu/FVhELe3agI
35 | hIdwljT/zDShyD/nfudvVUGGTN7N0V1SDmhdvQ5Tgp7ZWt8541qTuQGptNdQXcyQwNbQt2
36 | NThCHKQ4nHHMMRX4HJFtUztcCGuKFUuxiwuQNjj80wNww2pr3CkUKVXQ0ZyIHheKlgy74h
37 | zgQ6bYVcUHFcF1AAAAJ2hvdGFsQGR5bi0xNjktMjI2LTUzLTQ0LmV0ZWMuYWxiYW55LmVk
38 | dQECAw==
39 | -----END OPENSSH PRIVATE KEY-----
40 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LLM Honeypot: Leveraging Large Language Models as Advanced Interactive Honeypot Systems
2 |
3 | Code for our paper "LLM Honeypot: Leveraging Large Language Models as Advanced Interactive Honeypot Systems", published at the 2024 IEEE Conference on Communications and Network Security (CNS).
4 |
5 | You can download the paper via: [[IEEE]](https://ieeexplore.ieee.org/iel8/10735442/10735467/10735607.pdf) - [[DOI]](https://doi.org/10.1109/CNS62487.2024.10735607)
6 |
7 | [Dataset](https://huggingface.co/datasets/hotal/honeypot_logs)
8 |
9 | [Finetuned Model](https://huggingface.co/hotal/honeypot-llama3-8B)
10 |
11 | ## Training
12 |
13 | The training and fine-tuning process for the model presented in the paper utilized the [Llama-Factory](https://github.com/hiyouga/LLaMA-Factory) tool. Consequently, specific training scripts are not included in this repository.
14 |
15 | Please be aware that the Llama-Factory repository is actively maintained and frequently updated. The methods or scripts used during our research might require adjustments to remain compatible with the latest versions of Llama-Factory.
16 |
17 | To replicate the model training:
18 | 1. Use our custom dataset available on Hugging Face: [hotal/honeypot_logs](https://huggingface.co/datasets/hotal/honeypot_logs).
19 | 2. Follow the Llama-Factory documentation on using custom datasets, which can be found here: [Llama-Factory Custom Data Documentation](https://github.com/hiyouga/LLaMA-Factory/tree/main/data).
20 |
21 | Combining our dataset with the instructions provided by Llama-Factory should offer the most direct path to reproduce the training process.
22 |
23 | ## Running the Model
24 |
25 | The code relevant for running the honeypot server with the trained model can be found within the `/honeypot-server` directory of this repository.
26 |
27 | You can use the pre-trained model available on Hugging Face: [hotal/honeypot-llama3-8B](https://huggingface.co/hotal/honeypot-llama3-8B).
28 |
29 | **Note:** As with the training dependencies, the code in `/honeypot-server` may require updates to remain compatible with current versions of its dependencies (e.g., huggingface, transformers). Please check library compatibility if you encounter issues.
30 |
31 | ## Abstract
32 |
33 | The rapid evolution of cyber threats necessitates innovative solutions for detecting and analyzing malicious activity. Honeypots, which are decoy systems designed to lure and interact with attackers, have emerged as a critical component in cybersecurity. In this paper, we present a novel approach to creating realistic and interactive honeypot systems using Large Language Models (LLMs). By fine-tuning a pre-trained open-source language model on a diverse dataset of attacker-generated commands and responses, we developed a honeypot capable of sophisticated engagement with attackers. Our methodology involved several key steps: data collection and processing, prompt engineering, model selection, and supervised fine-tuning to optimize the model’s performance. Evaluation through similarity metrics and live deployment demonstrated that our approach effectively generates accurate and informative responses. The results highlight the potential of LLMs to revolutionize honeypot technology, providing cybersecurity professionals with a powerful tool to detect and analyze malicious activity, thereby enhancing overall security infrastructure.
34 |
35 | ## Citation
36 |
37 | If this work is helpful, please cite as:
38 |
39 | ```bibtex
40 | @INPROCEEDINGS{
41 | 10735607,
42 | author={Otal, Hakan T. and Canbaz, M. Abdullah},
43 | booktitle={2024 IEEE Conference on Communications and Network Security (CNS)},
44 | title={LLM Honeypot: Leveraging Large Language Models as Advanced Interactive Honeypot Systems},
45 | year={2024},
46 | pages={1-6},
47 | doi={10.1109/CNS62487.2024.10735607}
48 | }
49 | ```
50 |
51 | ## Contact
52 |
53 | hotal [AT] albany [DOT] edu
54 |
--------------------------------------------------------------------------------
/honeypot-server/ssh_server.py:
--------------------------------------------------------------------------------
1 | import socket, sys, threading
2 | import paramiko
3 | from datetime import datetime
4 |
# Host key added to every transport; loaded once at import time from the
# file 'server.key' (a relative path).
# Generate keys with 'ssh-keygen -t rsa -f server.key'
HOST_KEY = paramiko.RSAKey(filename='server.key')
# TCP port the listening socket is bound to in start_ssh_server.
SSH_PORT = 2222

# Log the user:password combinations to files
LOGFILE = 'logs/auth.log'
# Held while appending to LOGFILE; connections are served on separate threads.
LOGFILE_LOCK = threading.Lock()
12 |
class SSHServerHandler(paramiko.ServerInterface):
    """Paramiko server interface for one honeypot connection.

    Every channel, PTY and shell request is accepted, every password is
    accepted (and recorded in LOGFILE), and each command received on the
    channel is answered by ``llm_model.answer``.
    """

    def __init__(self, llm_model):
        """Store the model and start with an empty conversation history.

        Args:
            llm_model: Object exposing ``answer(command, log_history)``.
        """
        # Set once handle_shell has finished.
        self.event = threading.Event()
        self.llm_model = llm_model
        # Flat list: command, response, command, response, ...
        self.log_history = []

    def check_channel_request(self, kind, channelID):
        """Accept every channel request, whatever its kind."""
        return paramiko.OPEN_SUCCEEDED

    def check_channel_shell_request(self, channel):
        """Remember the channel a shell was requested on and accept it."""
        print("Channel", channel)
        self.channel = channel
        return True

    def check_channel_pty_request(self, c, t, w, h, p, ph, m):
        """Accept every PTY request; the terminal parameters are ignored."""
        return True

    def get_allowed_auths(self, username):
        """Advertise password authentication only."""
        return 'password'

    def check_auth_password(self, username, password):
        """Record the attempted credentials and accept the login.

        The ``username:password`` pair is appended to LOGFILE while holding
        LOGFILE_LOCK. The ``with`` blocks release the lock and close the file
        even if the write fails.
        """
        self.username = username

        # save login info to a file
        with LOGFILE_LOCK:
            with open(LOGFILE, "a") as logfile_handle:
                print("New login: " + username + ":" + password)
                logfile_handle.write(username + ":" + password + "\n")

        return paramiko.AUTH_SUCCESSFUL

    def handle_shell(self):
        """Run the prompt/read/answer loop until the channel ends.

        Each non-empty command is answered by the model, appended to the
        in-memory history and to a per-session log file named after the start
        time, and the response is sent back to the client. On exit, whether
        normal or after an error, the channel is closed and ``self.event`` is
        set.
        """
        log_filename = f"logs/log_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.txt"

        try:
            while not self.channel.exit_status_ready():
                # Receive user input
                self.channel.sendall(f'{self.username}@localhost:~/ $')
                raw = self.channel.recv(1024)
                if not raw:
                    # A zero-length read means the channel stream has closed.
                    break

                # Undecodable bytes are replaced rather than raising, so odd
                # input does not end the session.
                command = raw.decode("utf-8", errors="replace").strip()
                print("CMD:", command)
                if not command:
                    # Blank line: show the prompt again without asking the model.
                    continue

                # Produce output with LLM
                response = self.llm_model.answer(command, self.log_history)

                # Save the logs
                self.log_history.append(command)
                self.log_history.append(response)
                with open(log_filename, "a") as log_file:
                    log_file.write(f"@CMD: {command}\n@RESP: {response}\n\n")

                # Send response
                self.channel.sendall(f'{response}\n')
        except Exception as e:
            print("Channel closed:", e)
        finally:
            self.channel.close()
            self.event.set()
79 |
80 |
def handleConnection(client, llm_model):
    """Serve one accepted client socket as an SSH session.

    Wraps the socket in a paramiko transport using HOST_KEY, waits for the
    client to open a channel, and runs the handler's shell loop on it. The
    transport is always closed before returning, including when negotiation
    fails or no channel is opened.

    Args:
        client: Connected socket returned by ``accept``.
        llm_model: Model object handed to the SSHServerHandler.
    """
    transport = paramiko.Transport(client)
    try:
        transport.add_server_key(HOST_KEY)

        server_handler = SSHServerHandler(llm_model)
        transport.start_server(server=server_handler)

        channel = transport.accept()
        if channel is None:
            return

        server_handler.channel = channel
        server_handler.handle_shell()
    except Exception as e:
        # This function is the top of a worker thread: report the failure
        # instead of letting it escape the thread.
        print("ERROR: SSH session failed")
        print(e)
    finally:
        transport.close()
96 |
def start_ssh_server(llm_model):
    """Listen on SSH_PORT and serve each connection on its own thread.

    If the listening socket cannot be set up, the error is printed and the
    process exits with status 1. Errors while accepting or dispatching a
    single client are printed and the loop carries on.

    Args:
        llm_model: Model object passed through to handleConnection.
    """
    # Phase 1: bring up the listening socket.
    try:
        listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        listener.bind(('', SSH_PORT))
        listener.listen(100)
        print('Server started...')
    except Exception as err:
        print("ERROR: Failed to create socket")
        print(err)
        sys.exit(1)

    # Phase 2: accept forever, one worker thread per client.
    while True:
        try:
            conn, peer = listener.accept()
            print(f'New Connection: {peer}')
            worker = threading.Thread(target=handleConnection, args=(conn, llm_model))
            worker.start()
        except Exception as err:
            print("ERROR: Client handling")
            print(err)
118 |
--------------------------------------------------------------------------------
/notebooks/preprocess.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 10,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from datetime import datetime\n",
10 | "import pandas as pd\n",
11 | "import json\n",
12 | "from datasets import load_dataset, Dataset, DatasetDict"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 11,
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "name": "stderr",
22 | "output_type": "stream",
23 | "text": [
24 | "Downloading readme: 100%|██████████| 405/405 [00:00<00:00, 3.61MB/s]\n",
25 | "Downloading data: 100%|██████████| 116k/116k [00:10<00:00, 10.9kB/s]\n",
26 | "Generating train split: 100%|██████████| 481/481 [00:00<00:00, 108763.83 examples/s]\n"
27 | ]
28 | },
29 | {
30 | "data": {
31 | "text/plain": [
32 | "DatasetDict({\n",
33 | " train: Dataset({\n",
34 | " features: ['system', 'command', 'response', '__index_level_0__'],\n",
35 | " num_rows: 481\n",
36 | " })\n",
37 | "})"
38 | ]
39 | },
40 | "execution_count": 11,
41 | "metadata": {},
42 | "output_type": "execute_result"
43 | }
44 | ],
45 | "source": [
46 | "dataset = load_dataset(\"hotal/linux_commands\")\n",
47 | "dataset"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 12,
53 | "metadata": {},
54 | "outputs": [
55 | {
56 | "data": {
57 | "text/html": [
58 | "
\n",
59 | "\n",
72 | "
\n",
73 | " \n",
74 | " \n",
75 | " \n",
76 | " system \n",
77 | " command \n",
78 | " response \n",
79 | " __index_level_0__ \n",
80 | " \n",
81 | " \n",
82 | " \n",
83 | " \n",
84 | " 0 \n",
85 | " I want you to understand what every Linux term... \n",
86 | " chgrp \n",
87 | " # chgrp\\n> Change group ownership of files and... \n",
88 | " 0 \n",
89 | " \n",
90 | " \n",
91 | " 1 \n",
92 | " I want you to understand what every Linux term... \n",
93 | " more \n",
94 | " # more\\n> Open a file for interactive reading,... \n",
95 | " 1 \n",
96 | " \n",
97 | " \n",
98 | " 2 \n",
99 | " I want you to understand what every Linux term... \n",
100 | " git-hash-object \n",
101 | " # git hash-object\\n> Computes the unique hash ... \n",
102 | " 2 \n",
103 | " \n",
104 | " \n",
105 | " 3 \n",
106 | " I want you to understand what every Linux term... \n",
107 | " id \n",
108 | " # id\\n> Display current user and group identit... \n",
109 | " 3 \n",
110 | " \n",
111 | " \n",
112 | " 4 \n",
113 | " I want you to understand what every Linux term... \n",
114 | " nl \n",
115 | " # nl\\n> A utility for numbering lines, either ... \n",
116 | " 4 \n",
117 | " \n",
118 | " \n",
119 | " ... \n",
120 | " ... \n",
121 | " ... \n",
122 | " ... \n",
123 | " ... \n",
124 | " \n",
125 | " \n",
126 | " 476 \n",
127 | " I want you to understand what every Linux term... \n",
128 | " sleep \n",
129 | " # sleep\\n> Delay for a specified amount of tim... \n",
130 | " 476 \n",
131 | " \n",
132 | " \n",
133 | " 477 \n",
134 | " I want you to understand what every Linux term... \n",
135 | " manpath \n",
136 | " # manpath\\n> Determine the search path for man... \n",
137 | " 477 \n",
138 | " \n",
139 | " \n",
140 | " 478 \n",
141 | " I want you to understand what every Linux term... \n",
142 | " mv \n",
143 | " # mv\\n> Move or rename files and directories. ... \n",
144 | " 478 \n",
145 | " \n",
146 | " \n",
147 | " 479 \n",
148 | " I want you to understand what every Linux term... \n",
149 | " whereis \n",
150 | " # whereis\\n> Locate the binary, source, and ma... \n",
151 | " 479 \n",
152 | " \n",
153 | " \n",
154 | " 480 \n",
155 | " I want you to understand what every Linux term... \n",
156 | " git-daemon \n",
157 | " # git daemon\\n> A really simple server for Git... \n",
158 | " 480 \n",
159 | " \n",
160 | " \n",
161 | "
\n",
162 | "
481 rows × 4 columns
\n",
163 | "
"
164 | ],
165 | "text/plain": [
166 | " system command \\\n",
167 | "0 I want you to understand what every Linux term... chgrp \n",
168 | "1 I want you to understand what every Linux term... more \n",
169 | "2 I want you to understand what every Linux term... git-hash-object \n",
170 | "3 I want you to understand what every Linux term... id \n",
171 | "4 I want you to understand what every Linux term... nl \n",
172 | ".. ... ... \n",
173 | "476 I want you to understand what every Linux term... sleep \n",
174 | "477 I want you to understand what every Linux term... manpath \n",
175 | "478 I want you to understand what every Linux term... mv \n",
176 | "479 I want you to understand what every Linux term... whereis \n",
177 | "480 I want you to understand what every Linux term... git-daemon \n",
178 | "\n",
179 | " response __index_level_0__ \n",
180 | "0 # chgrp\\n> Change group ownership of files and... 0 \n",
181 | "1 # more\\n> Open a file for interactive reading,... 1 \n",
182 | "2 # git hash-object\\n> Computes the unique hash ... 2 \n",
183 | "3 # id\\n> Display current user and group identit... 3 \n",
184 | "4 # nl\\n> A utility for numbering lines, either ... 4 \n",
185 | ".. ... ... \n",
186 | "476 # sleep\\n> Delay for a specified amount of tim... 476 \n",
187 | "477 # manpath\\n> Determine the search path for man... 477 \n",
188 | "478 # mv\\n> Move or rename files and directories. ... 478 \n",
189 | "479 # whereis\\n> Locate the binary, source, and ma... 479 \n",
190 | "480 # git daemon\\n> A really simple server for Git... 480 \n",
191 | "\n",
192 | "[481 rows x 4 columns]"
193 | ]
194 | },
195 | "execution_count": 12,
196 | "metadata": {},
197 | "output_type": "execute_result"
198 | }
199 | ],
200 | "source": [
201 | "df = pd.DataFrame(dataset['train'])\n",
202 | "df"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 17,
208 | "metadata": {},
209 | "outputs": [
210 | {
211 | "data": {
212 | "text/html": [
213 | "\n",
214 | "\n",
227 | "
\n",
228 | " \n",
229 | " \n",
230 | " \n",
231 | " system \n",
232 | " command \n",
233 | " response \n",
234 | " \n",
235 | " \n",
236 | " \n",
237 | " \n",
238 | " 0 \n",
239 | " You are a linux expert. You understand what ev... \n",
240 | " chgrp \n",
241 | " # chgrp\\n> Change group ownership of files and... \n",
242 | " \n",
243 | " \n",
244 | " 1 \n",
245 | " You are a linux expert. You understand what ev... \n",
246 | " more \n",
247 | " # more\\n> Open a file for interactive reading,... \n",
248 | " \n",
249 | " \n",
250 | " 2 \n",
251 | " You are a linux expert. You understand what ev... \n",
252 | " git-hash-object \n",
253 | " # git hash-object\\n> Computes the unique hash ... \n",
254 | " \n",
255 | " \n",
256 | " 3 \n",
257 | " You are a linux expert. You understand what ev... \n",
258 | " id \n",
259 | " # id\\n> Display current user and group identit... \n",
260 | " \n",
261 | " \n",
262 | " 4 \n",
263 | " You are a linux expert. You understand what ev... \n",
264 | " nl \n",
265 | " # nl\\n> A utility for numbering lines, either ... \n",
266 | " \n",
267 | " \n",
268 | " ... \n",
269 | " ... \n",
270 | " ... \n",
271 | " ... \n",
272 | " \n",
273 | " \n",
274 | " 476 \n",
275 | " You are a linux expert. You understand what ev... \n",
276 | " sleep \n",
277 | " # sleep\\n> Delay for a specified amount of tim... \n",
278 | " \n",
279 | " \n",
280 | " 477 \n",
281 | " You are a linux expert. You understand what ev... \n",
282 | " manpath \n",
283 | " # manpath\\n> Determine the search path for man... \n",
284 | " \n",
285 | " \n",
286 | " 478 \n",
287 | " You are a linux expert. You understand what ev... \n",
288 | " mv \n",
289 | " # mv\\n> Move or rename files and directories. ... \n",
290 | " \n",
291 | " \n",
292 | " 479 \n",
293 | " You are a linux expert. You understand what ev... \n",
294 | " whereis \n",
295 | " # whereis\\n> Locate the binary, source, and ma... \n",
296 | " \n",
297 | " \n",
298 | " 480 \n",
299 | " You are a linux expert. You understand what ev... \n",
300 | " git-daemon \n",
301 | " # git daemon\\n> A really simple server for Git... \n",
302 | " \n",
303 | " \n",
304 | "
\n",
305 | "
481 rows × 3 columns
\n",
306 | "
"
307 | ],
308 | "text/plain": [
309 | " system command \\\n",
310 | "0 You are a linux expert. You understand what ev... chgrp \n",
311 | "1 You are a linux expert. You understand what ev... more \n",
312 | "2 You are a linux expert. You understand what ev... git-hash-object \n",
313 | "3 You are a linux expert. You understand what ev... id \n",
314 | "4 You are a linux expert. You understand what ev... nl \n",
315 | ".. ... ... \n",
316 | "476 You are a linux expert. You understand what ev... sleep \n",
317 | "477 You are a linux expert. You understand what ev... manpath \n",
318 | "478 You are a linux expert. You understand what ev... mv \n",
319 | "479 You are a linux expert. You understand what ev... whereis \n",
320 | "480 You are a linux expert. You understand what ev... git-daemon \n",
321 | "\n",
322 | " response \n",
323 | "0 # chgrp\\n> Change group ownership of files and... \n",
324 | "1 # more\\n> Open a file for interactive reading,... \n",
325 | "2 # git hash-object\\n> Computes the unique hash ... \n",
326 | "3 # id\\n> Display current user and group identit... \n",
327 | "4 # nl\\n> A utility for numbering lines, either ... \n",
328 | ".. ... \n",
329 | "476 # sleep\\n> Delay for a specified amount of tim... \n",
330 | "477 # manpath\\n> Determine the search path for man... \n",
331 | "478 # mv\\n> Move or rename files and directories. ... \n",
332 | "479 # whereis\\n> Locate the binary, source, and ma... \n",
333 | "480 # git daemon\\n> A really simple server for Git... \n",
334 | "\n",
335 | "[481 rows x 3 columns]"
336 | ]
337 | },
338 | "execution_count": 17,
339 | "metadata": {},
340 | "output_type": "execute_result"
341 | }
342 | ],
343 | "source": [
344 | "# Same system prompt for every row: assign the column once instead of looping with iterrows().\n",
345 | "df['system'] = \"You are a linux expert. You understand what every Linux terminal command does and you reply with the explanation when asked.\"\n",
346 | "\n",
347 | "df = df.drop('__index_level_0__', axis=1)\n",
348 | "df"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": 19,
354 | "metadata": {},
355 | "outputs": [
356 | {
357 | "data": {
358 | "text/html": [
359 | "\n",
360 | "\n",
373 | "
\n",
374 | " \n",
375 | " \n",
376 | " \n",
377 | " system \n",
378 | " command \n",
379 | " response \n",
380 | " \n",
381 | " \n",
382 | " \n",
383 | " \n",
384 | " 0 \n",
385 | " You are a linux expert. You understand what ev... \n",
386 | " w \n",
387 | " # w\\n> Show who is logged on and what they are... \n",
388 | " \n",
389 | " \n",
390 | " 1 \n",
391 | " You are a linux expert. You understand what ev... \n",
392 | " ar \n",
393 | " # ar\\n> Create, modify, and extract from Unix ... \n",
394 | " \n",
395 | " \n",
396 | " 2 \n",
397 | " You are a linux expert. You understand what ev... \n",
398 | " mv \n",
399 | " # mv\\n> Move or rename files and directories. ... \n",
400 | " \n",
401 | " \n",
402 | " 3 \n",
403 | " You are a linux expert. You understand what ev... \n",
404 | " ps \n",
405 | " # ps\\n> Information about running processes. M... \n",
406 | " \n",
407 | " \n",
408 | " 4 \n",
409 | " You are a linux expert. You understand what ev... \n",
410 | " ld \n",
411 | " # ld\\n> Link object files together. More infor... \n",
412 | " \n",
413 | " \n",
414 | " ... \n",
415 | " ... \n",
416 | " ... \n",
417 | " ... \n",
418 | " \n",
419 | " \n",
420 | " 476 \n",
421 | " You are a linux expert. You understand what ev... \n",
422 | " git-cvsexportcommit \n",
423 | " # git cvsexportcommit\\n> Export a single `Git`... \n",
424 | " \n",
425 | " \n",
426 | " 477 \n",
427 | " You are a linux expert. You understand what ev... \n",
428 | " update-alternatives \n",
429 | " # update-alternatives\\n> A convenient tool for... \n",
430 | " \n",
431 | " \n",
432 | " 478 \n",
433 | " You are a linux expert. You understand what ev... \n",
434 | " git-credential-store \n",
435 | " # git credential-store\\n> `git` helper to stor... \n",
436 | " \n",
437 | " \n",
438 | " 479 \n",
439 | " You are a linux expert. You understand what ev... \n",
440 | " git-credential-cache \n",
441 | " # git credential-cache\\n> Git helper to tempor... \n",
442 | " \n",
443 | " \n",
444 | " 480 \n",
445 | " You are a linux expert. You understand what ev... \n",
446 | " git-check-ref-format \n",
447 | " # git check-ref-format\\n> Checks if a given re... \n",
448 | " \n",
449 | " \n",
450 | "
\n",
451 | "
481 rows × 3 columns
\n",
452 | "
"
453 | ],
454 | "text/plain": [
455 | " system command \\\n",
456 | "0 You are a linux expert. You understand what ev... w \n",
457 | "1 You are a linux expert. You understand what ev... ar \n",
458 | "2 You are a linux expert. You understand what ev... mv \n",
459 | "3 You are a linux expert. You understand what ev... ps \n",
460 | "4 You are a linux expert. You understand what ev... ld \n",
461 | ".. ... ... \n",
462 | "476 You are a linux expert. You understand what ev... git-cvsexportcommit \n",
463 | "477 You are a linux expert. You understand what ev... update-alternatives \n",
464 | "478 You are a linux expert. You understand what ev... git-credential-store \n",
465 | "479 You are a linux expert. You understand what ev... git-credential-cache \n",
466 | "480 You are a linux expert. You understand what ev... git-check-ref-format \n",
467 | "\n",
468 | " response \n",
469 | "0 # w\\n> Show who is logged on and what they are... \n",
470 | "1 # ar\\n> Create, modify, and extract from Unix ... \n",
471 | "2 # mv\\n> Move or rename files and directories. ... \n",
472 | "3 # ps\\n> Information about running processes. M... \n",
473 | "4 # ld\\n> Link object files together. More infor... \n",
474 | ".. ... \n",
475 | "476 # git cvsexportcommit\\n> Export a single `Git`... \n",
476 | "477 # update-alternatives\\n> A convenient tool for... \n",
477 | "478 # git credential-store\\n> `git` helper to stor... \n",
478 | "479 # git credential-cache\\n> Git helper to tempor... \n",
479 | "480 # git check-ref-format\\n> Checks if a given re... \n",
480 | "\n",
481 | "[481 rows x 3 columns]"
482 | ]
483 | },
484 | "execution_count": 19,
485 | "metadata": {},
486 | "output_type": "execute_result"
487 | }
488 | ],
489 | "source": [
490 | "s = df.command.str.len().sort_values(ascending=True).index\n",
491 | "df = df.reindex(s).reset_index(drop=True)\n",
492 | "df"
493 | ]
494 | },
495 | {
496 | "cell_type": "code",
497 | "execution_count": 8,
498 | "metadata": {},
499 | "outputs": [],
500 | "source": [
501 | "import os; from huggingface_hub.hf_api import HfFolder; HfFolder.save_token(os.environ['HF_TOKEN'])  # read the token from the environment; never hard-code it in the notebook"
502 | ]
503 | },
504 | {
505 | "cell_type": "code",
506 | "execution_count": 20,
507 | "metadata": {},
508 | "outputs": [
509 | {
510 | "name": "stderr",
511 | "output_type": "stream",
512 | "text": [
513 | "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 958.04ba/s]\n",
514 | "Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00, 1.49s/it]\n"
515 | ]
516 | },
517 | {
518 | "data": {
519 | "text/plain": [
520 | "CommitInfo(commit_url='https://huggingface.co/datasets/hotal/linux_commands/commit/96c38caacb53d0b586cb93731b94e3e0dad1fa10', commit_message='Upload dataset', commit_description='', oid='96c38caacb53d0b586cb93731b94e3e0dad1fa10', pr_url=None, pr_revision=None, pr_num=None)"
521 | ]
522 | },
523 | "execution_count": 20,
524 | "metadata": {},
525 | "output_type": "execute_result"
526 | }
527 | ],
528 | "source": [
529 | "dataset = Dataset.from_pandas(df)\n",
530 | "dataset.push_to_hub(\"hotal/linux_commands\")"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": null,
536 | "metadata": {},
537 | "outputs": [],
538 | "source": []
539 | }
540 | ],
541 | "metadata": {
542 | "kernelspec": {
543 | "display_name": "venv",
544 | "language": "python",
545 | "name": "python3"
546 | },
547 | "language_info": {
548 | "codemirror_mode": {
549 | "name": "ipython",
550 | "version": 3
551 | },
552 | "file_extension": ".py",
553 | "mimetype": "text/x-python",
554 | "name": "python",
555 | "nbconvert_exporter": "python",
556 | "pygments_lexer": "ipython3",
557 | "version": "3.11.8"
558 | }
559 | },
560 | "nbformat": 4,
561 | "nbformat_minor": 2
562 | }
563 |
--------------------------------------------------------------------------------
/notebooks/postprocess.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Upload Model to HF"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
17 | "# Load with the causal-LM head: plain AutoModel builds the base model only, so the pushed checkpoint would lack lm_head.\n",
18 | "model = AutoModelForCausalLM.from_pretrained('../models/honeypot-llama3-8B')\n",
19 | "tokenizer = AutoTokenizer.from_pretrained('../models/honeypot-llama3-8B')\n",
20 | "\n",
21 | "model.push_to_hub(\"honeypot-llama3-8B\")\n",
22 | "tokenizer.push_to_hub(\"honeypot-llama3-8B\")"
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {},
28 | "source": [
29 | "## LLM Inference"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 29,
35 | "metadata": {},
36 | "outputs": [
37 | {
38 | "name": "stdout",
39 | "output_type": "stream",
40 | "text": [
41 | "Cleared GPU...\n"
42 | ]
43 | },
44 | {
45 | "name": "stderr",
46 | "output_type": "stream",
47 | "text": [
48 | "Loading checkpoint shards: 100%|██████████| 17/17 [00:05<00:00, 3.38it/s]\n",
49 | "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
50 | ]
51 | },
52 | {
53 | "name": "stdout",
54 | "output_type": "stream",
55 | "text": [
56 | "Loaded Model: ../models/honeypot-llama3-8B\n"
57 | ]
58 | }
59 | ],
60 | "source": [
61 | "import torch\n",
62 | "import gc\n",
63 | "import re\n",
64 | "from transformers import (\n",
65 | " AutoModelForCausalLM,\n",
66 | " AutoTokenizer,\n",
67 | " BitsAndBytesConfig,\n",
68 | " pipeline\n",
69 | ")\n",
70 | "\n",
71 | "class LLM:\n",
72 | " def __init__(self, model_name=\"NousResearch/Meta-Llama-3-8B-Instruct\"):\n",
73 | " gc.collect()\n",
74 | " torch.cuda.empty_cache()\n",
75 | " print(\"Cleared GPU...\")\n",
76 | "\n",
77 | " self.DEVICE = \"cuda:1\" if torch.cuda.is_available() else \"cpu\"\n",
78 | " self.BASE_MODEL_NAME = model_name\n",
79 | " self.SYSTEM_PROMPT = \"You are mimicking a linux server. Respond with what the terminal would respond when a code given. I want you to only reply with the terminal outputs inside one unique code block and nothing else. Do not write any explanations. Do not type any commands unless I instruct you to do so.\"\n",
80 | "\n",
81 | " # Model configuration\n",
82 | " self.pipeline = pipeline(\n",
83 | " \"text-generation\",\n",
84 | " model=self.BASE_MODEL_NAME,\n",
85 | " tokenizer=self.BASE_MODEL_NAME,\n",
86 | " model_kwargs={\"torch_dtype\": torch.bfloat16},\n",
87 | " device=self.DEVICE,\n",
88 | " )\n",
89 | "\n",
90 | " print(\"Loaded Model: \", self.BASE_MODEL_NAME)\n",
91 | "\n",
92 | " def answer(self, query, log_history=[], max_tokens=4096, temperature=0.01, top_p=0.8):\n",
93 | "\n",
94 | " message_history = [{\"role\": \"system\", \"content\": self.SYSTEM_PROMPT}]\n",
95 | " if len(log_history) > 0:\n",
96 | " for i, item in enumerate(log_history):\n",
97 | " if i % 2 == 0:\n",
98 | " message_history.append({\"role\": \"user\", \"content\": item})\n",
99 | " else:\n",
100 | " message_history.append({\"role\": \"assistant\", \"content\": item})\n",
101 | "\n",
102 | " user_prompt = message_history + [{\"role\": \"user\", \"content\": query}]\n",
103 | " prompt = self.pipeline.tokenizer.apply_chat_template(\n",
104 | " user_prompt, tokenize=False, add_generation_prompt=True\n",
105 | " )\n",
106 | " outputs = self.pipeline(\n",
107 | " prompt,\n",
108 | " max_new_tokens=max_tokens,\n",
109 | " eos_token_id=self.pipeline.tokenizer.eos_token_id,\n",
110 | " do_sample=True,\n",
111 | " temperature=temperature,\n",
112 | " top_p=top_p,\n",
113 | " )\n",
114 | " response = outputs[0][\"generated_text\"][len(prompt):]\n",
115 | "\n",
116 | " # remove unnecessary quotes\n",
117 | " if response.startswith(\"```\") and response.endswith(\"```\"):\n",
118 | " response = response[3:-3]\n",
119 | " elif response.startswith(\"`\") and response.endswith(\"`\"):\n",
120 | " response = response[1:-1]\n",
121 | " \n",
122 | " return response\n",
123 | " \n",
124 | "\n",
125 | "llama = LLM(\"../models/honeypot-llama3-8B\")\n",
126 | "#print(llama.answer('ls -al', ['mkdir test_directory; touch test.txt','\\n']))"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "metadata": {},
132 | "source": [
133 | "## Evaluation"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 52,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "from openai import OpenAI\n",
143 | "\n",
144 | "client = OpenAI(base_url=\"http://localhost:1234/v1\", api_key=\"lm-studio\")\n",
145 | "\n",
146 | "\n",
147 | "def measure_correctness(prompt, command, generated_answer, reference_answer):\n",
148 | "    try:\n",
149 | "        correctness_prompt = f\"\"\"###Task Description: An instruction (might include an Input inside it), a query, a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given. \n",
150 | " 1. Write a detailed feedback that assesses the quality of the response strictly based on the given score rubric, not evaluating in general. \n",
151 | " 2. After writing a feedback, write a score that is either 1 or 2 or 3 or 4 or 5. You should refer to the score rubric. \n",
152 | " 3. The output format should look as follows: \"Feedback: (write a feedback for criteria) [RESULT] (1 or 2 or 3 or 4 or 5)\" \n",
153 | " 4. Please do not generate any other opening, closing, and explanations. \n",
154 | " 5. Generated answer and reference answer can be completely different, consider if it makes sense for the given command.\n",
155 | " 6. Error messages in the generated answer are fine as long as the reference answer has error messages too.\n",
156 | "\n",
157 | " ###The instruction to evaluate: Your task is to evaluate the generated answer and reference answer for the query: {prompt}\\nCommand:{command}\n",
158 | "\n",
159 | " ###Generate answer to evaluate: {generated_answer} \n",
160 | "\n",
161 | " ###Reference Answer (Score 5): {reference_answer}\n",
162 | "\n",
163 | " ###Score Rubrics: \n",
164 | " Score 1: The response is completely irrelevant to the given command.\n",
165 | " Score 2: The response executes the command but provides incomplete or inaccurate information.\n",
166 | " Score 3: The response provides some accurate information but lacks detail or has minor errors.\n",
167 | " Score 4: The response provides mostly accurate and complete information but has some minor errors or omissions.\n",
168 | " Score 5: The response provides a complete and accurate response to the given command, identical to what a Linux terminal would output.\n",
169 | " \n",
170 | " ###Feedback:\"\"\"\n",
171 | "        # Ask the judge model for written feedback followed by a 1-5 score.\n",
172 | "        completion = client.chat.completions.create(\n",
173 | "            model=\"RichardErkhov/prometheus-eval_-_prometheus-7b-v2.0-gguf\",\n",
174 | "            messages=[\n",
175 | "                {\"role\": \"system\", \"content\": \"You are a language model evaluator. Respond according to the given task.\"},\n",
176 | "                {\"role\": \"user\", \"content\": correctness_prompt}\n",
177 | "            ],\n",
178 | "            temperature=0.01,\n",
179 | "        )\n",
180 | "        # The score is whatever follows '[RESULT] '; a reply without a parsable score is reported as -1.\n",
181 | "        return int(completion.choices[0].message.content.split('[RESULT] ')[1])\n",
182 | "    except Exception:  # not a bare except: KeyboardInterrupt/SystemExit must still stop the evaluation loop\n",
183 | "        return -1"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 53,
189 | "metadata": {},
190 | "outputs": [
191 | {
192 | "data": {
193 | "text/plain": [
194 | "2"
195 | ]
196 | },
197 | "execution_count": 53,
198 | "metadata": {},
199 | "output_type": "execute_result"
200 | }
201 | ],
202 | "source": [
203 | "prompt = 'You are mimicking a linux server. Respond with what the terminal would respond when a code given. I want you to only reply with the terminal outputs inside one unique code block and nothing else. Do not write any explanations. Do not type any commands unless I instruct you to do so.'\n",
204 | "command = 'ls'\n",
205 | "generated_answer = 'bin boot dev etc home '\n",
206 | "reference_answer = 'bin boot dev etc home lib lib64 lost+found media mnt opt proc root run sbin srv sys tmp usr var'\n",
207 | "\n",
208 | "measure_correctness(prompt, command, generated_answer, reference_answer)"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 72,
214 | "metadata": {},
215 | "outputs": [
216 | {
217 | "name": "stderr",
218 | "output_type": "stream",
219 | "text": [
220 | "12it [00:28, 6.20s/it]"
221 | ]
222 | },
223 | {
224 | "name": "stdout",
225 | "output_type": "stream",
226 | "text": [
227 | "Skipping row 9 due to timeout (> 15 seconds)\n"
228 | ]
229 | },
230 | {
231 | "name": "stderr",
232 | "output_type": "stream",
233 | "text": [
234 | "16it [00:45, 6.26s/it]"
235 | ]
236 | },
237 | {
238 | "name": "stdout",
239 | "output_type": "stream",
240 | "text": [
241 | "Skipping row 132 due to timeout (> 15 seconds)\n"
242 | ]
243 | },
244 | {
245 | "name": "stderr",
246 | "output_type": "stream",
247 | "text": [
248 | "42it [01:32, 5.52s/it]"
249 | ]
250 | },
251 | {
252 | "name": "stdout",
253 | "output_type": "stream",
254 | "text": [
255 | "Skipping row 42 due to timeout (> 15 seconds)\n"
256 | ]
257 | },
258 | {
259 | "name": "stderr",
260 | "output_type": "stream",
261 | "text": [
262 | "60it [02:28, 9.42s/it]"
263 | ]
264 | },
265 | {
266 | "name": "stdout",
267 | "output_type": "stream",
268 | "text": [
269 | "Skipping row 179 due to timeout (> 15 seconds)\n"
270 | ]
271 | },
272 | {
273 | "name": "stderr",
274 | "output_type": "stream",
275 | "text": [
276 | "67it [02:58, 6.99s/it]"
277 | ]
278 | },
279 | {
280 | "name": "stdout",
281 | "output_type": "stream",
282 | "text": [
283 | "Skipping row 60 due to timeout (> 15 seconds)\n"
284 | ]
285 | },
286 | {
287 | "name": "stderr",
288 | "output_type": "stream",
289 | "text": [
290 | "76it [03:24, 5.86s/it]"
291 | ]
292 | },
293 | {
294 | "name": "stdout",
295 | "output_type": "stream",
296 | "text": [
297 | "Skipping row 177 due to timeout (> 15 seconds)\n"
298 | ]
299 | },
300 | {
301 | "name": "stderr",
302 | "output_type": "stream",
303 | "text": [
304 | "90it [03:50, 3.73s/it]"
305 | ]
306 | },
307 | {
308 | "name": "stdout",
309 | "output_type": "stream",
310 | "text": [
311 | "Skipping row 185 due to timeout (> 15 seconds)\n"
312 | ]
313 | },
314 | {
315 | "name": "stderr",
316 | "output_type": "stream",
317 | "text": [
318 | "142it [05:04, 4.91s/it]"
319 | ]
320 | },
321 | {
322 | "name": "stdout",
323 | "output_type": "stream",
324 | "text": [
325 | "Skipping row 221 due to timeout (> 15 seconds)\n"
326 | ]
327 | },
328 | {
329 | "name": "stderr",
330 | "output_type": "stream",
331 | "text": [
332 | "150it [05:30, 2.20s/it]"
333 | ]
334 | },
335 | {
336 | "name": "stdout",
337 | "output_type": "stream",
338 | "text": [
339 | "Skipping row 222 due to timeout (> 15 seconds)\n"
340 | ]
341 | },
342 | {
343 | "name": "stderr",
344 | "output_type": "stream",
345 | "text": [
346 | "\n"
347 | ]
348 | },
349 | {
350 | "data": {
351 | "text/html": [
352 | "\n",
353 | "\n",
366 | "
\n",
367 | " \n",
368 | " \n",
369 | " \n",
370 | " system \n",
371 | " command \n",
372 | " response \n",
373 | " generated \n",
374 | " \n",
375 | " \n",
376 | " \n",
377 | " \n",
378 | " 25 \n",
379 | " You are mimicking a linux server. Respond with... \n",
380 | " ps \n",
381 | " PID TTY TIME COMMAND ... \n",
382 | " \\n PID TTY TIME CMD\\n 1234 pts/0 ... \n",
383 | " \n",
384 | " \n",
385 | " 309 \n",
386 | " You are mimicking a linux server. Respond with... \n",
387 | " uptime -u \n",
388 | " 18:42:29 up 3:14, 1 user, load average: 0.... \n",
389 | " \\n 00:00:00\\n \n",
390 | " \n",
391 | " \n",
392 | " 73 \n",
393 | " You are mimicking a linux server. Respond with... \n",
394 | " chmod u+rw /home/phil/file_or_directory1,/home... \n",
395 | " chmod: cannot access '/home/phil/file_or_direc... \n",
396 | " \\nchmod: cannot access '/home/phil/file_or_dir... \n",
397 | " \n",
398 | " \n",
399 | " 195 \n",
400 | " You are mimicking a linux server. Respond with... \n",
401 | " mv /home/phil/source/myfile.txt /home/phil/tar... \n",
402 | " mv: cannot create regular file `/home/phil/tar... \n",
403 | " mv: cannot move '/home/phil/source/myfile.txt'... \n",
404 | " \n",
405 | " \n",
406 | " 57 \n",
407 | " You are mimicking a linux server. Respond with... \n",
408 | " cat /test2 \n",
409 | " ELF\u0002\u0001\u0001 \n",
410 | " cat: /test2: No such file or directory \n",
411 | " \n",
412 | " \n",
413 | " ... \n",
414 | " ... \n",
415 | " ... \n",
416 | " ... \n",
417 | " ... \n",
418 | " \n",
419 | " \n",
420 | " 283 \n",
421 | " You are mimicking a linux server. Respond with... \n",
422 | " uname -h \n",
423 | " uname: invalid option -- 'h'\\nTry 'uname --hel... \n",
424 | " Linux \n",
425 | " \n",
426 | " \n",
427 | " 148 \n",
428 | " You are mimicking a linux server. Respond with... \n",
429 | " gcc -Wall -O2 -o path/to/output_executable pat... \n",
430 | " gcc: path/to/source.cpp: No such file or direc... \n",
431 | " \\ngcc: error: path/to/output_executable: No su... \n",
432 | " \n",
433 | " \n",
434 | " 291 \n",
435 | " You are mimicking a linux server. Respond with... \n",
436 | " uptime \n",
437 | " 18:42:08 up 3:13, 1 user, load average: 0.... \n",
438 | " \\n 21:34:12 up 2 days, 1:04, 1 user, load a... \n",
439 | " \n",
440 | " \n",
441 | " 239 \n",
442 | " You are mimicking a linux server. Respond with... \n",
443 | " rmdir -p /home/phil/directory1 /home/phil/dire... \n",
444 | " rmdir: failed to remove `-p': No such file or ... \n",
445 | " \\nrmdir: failed to remove `/home/phil/director... \n",
446 | " \n",
447 | " \n",
448 | " 222 \n",
449 | " You are mimicking a linux server. Respond with... \n",
450 | " ps aux \n",
451 | " USER PID %CPU %MEM VSZ ... \n",
452 | " None \n",
453 | " \n",
454 | " \n",
455 | "
\n",
456 | "
150 rows × 4 columns
\n",
457 | "
"
458 | ],
459 | "text/plain": [
460 | " system \\\n",
461 | "25 You are mimicking a linux server. Respond with... \n",
462 | "309 You are mimicking a linux server. Respond with... \n",
463 | "73 You are mimicking a linux server. Respond with... \n",
464 | "195 You are mimicking a linux server. Respond with... \n",
465 | "57 You are mimicking a linux server. Respond with... \n",
466 | ".. ... \n",
467 | "283 You are mimicking a linux server. Respond with... \n",
468 | "148 You are mimicking a linux server. Respond with... \n",
469 | "291 You are mimicking a linux server. Respond with... \n",
470 | "239 You are mimicking a linux server. Respond with... \n",
471 | "222 You are mimicking a linux server. Respond with... \n",
472 | "\n",
473 | " command \\\n",
474 | "25 ps \n",
475 | "309 uptime -u \n",
476 | "73 chmod u+rw /home/phil/file_or_directory1,/home... \n",
477 | "195 mv /home/phil/source/myfile.txt /home/phil/tar... \n",
478 | "57 cat /test2 \n",
479 | ".. ... \n",
480 | "283 uname -h \n",
481 | "148 gcc -Wall -O2 -o path/to/output_executable pat... \n",
482 | "291 uptime \n",
483 | "239 rmdir -p /home/phil/directory1 /home/phil/dire... \n",
484 | "222 ps aux \n",
485 | "\n",
486 | " response \\\n",
487 | "25 PID TTY TIME COMMAND ... \n",
488 | "309 18:42:29 up 3:14, 1 user, load average: 0.... \n",
489 | "73 chmod: cannot access '/home/phil/file_or_direc... \n",
490 | "195 mv: cannot create regular file `/home/phil/tar... \n",
491 | "57 ELF\u0002\u0001\u0001 \n",
492 | ".. ... \n",
493 | "283 uname: invalid option -- 'h'\\nTry 'uname --hel... \n",
494 | "148 gcc: path/to/source.cpp: No such file or direc... \n",
495 | "291 18:42:08 up 3:13, 1 user, load average: 0.... \n",
496 | "239 rmdir: failed to remove `-p': No such file or ... \n",
497 | "222 USER PID %CPU %MEM VSZ ... \n",
498 | "\n",
499 | " generated \n",
500 | "25 \\n PID TTY TIME CMD\\n 1234 pts/0 ... \n",
501 | "309 \\n 00:00:00\\n \n",
502 | "73 \\nchmod: cannot access '/home/phil/file_or_dir... \n",
503 | "195 mv: cannot move '/home/phil/source/myfile.txt'... \n",
504 | "57 cat: /test2: No such file or directory \n",
505 | ".. ... \n",
506 | "283 Linux \n",
507 | "148 \\ngcc: error: path/to/output_executable: No su... \n",
508 | "291 \\n 21:34:12 up 2 days, 1:04, 1 user, load a... \n",
509 | "239 \\nrmdir: failed to remove `/home/phil/director... \n",
510 | "222 None \n",
511 | "\n",
512 | "[150 rows x 4 columns]"
513 | ]
514 | },
515 | "execution_count": 72,
516 | "metadata": {},
517 | "output_type": "execute_result"
518 | }
519 | ],
520 | "source": [
521 | "import pandas as pd\n",
522 | "from datasets import load_dataset\n",
523 | "from tqdm import tqdm\n",
524 | "import time\n",
525 | "import spacy\n",
526 | "import signal\n",
527 | "nlp = spacy.load(\"en_core_web_sm\")\n",
528 | "\n",
529 | "dataset = load_dataset(\"hotal/honeypot_logs\")\n",
530 | "df = pd.DataFrame(dataset['train'])[:334].sample(n=150, random_state=42)\n",
531 | "\n",
532 | "def timeout_handler(signum, frame):\n",
533 | " raise TimeoutError\n",
534 | "\n",
535 | "for index, row in tqdm(df.iterrows()):\n",
536 | " signal.signal(signal.SIGALRM, timeout_handler)\n",
537 | " signal.alarm(15) # 15 seconds\n",
538 | " try:\n",
539 | " df.loc[index, 'generated'] = llama.answer(row['command'], [])\n",
540 | " except TimeoutError:\n",
541 | " print(f\"Skipping row {index} due to timeout (> 15 seconds)\")\n",
542 | " df.loc[index, 'generated'] = None\n",
543 | " finally:\n",
544 | " signal.alarm(0) # reset the alarm\n",
545 | "\n",
546 | "df\n"
547 | ]
548 | },
549 | {
550 | "cell_type": "code",
551 | "execution_count": 78,
552 | "metadata": {},
553 | "outputs": [
554 | {
555 | "name": "stderr",
556 | "output_type": "stream",
557 | "text": [
558 | "0it [00:00, ?it/s]/tmp/ipykernel_1638383/3606016577.py:5: UserWarning: [W007] The model you're using has no word vectors loaded, so the result of the Doc.similarity method will be based on the tagger, parser and NER, which may not give useful similarity judgements. This may happen if you're using one of the small models, e.g. `en_core_web_sm`, which don't ship with word vectors and only use context-sensitive tensors. You can always add your own word vectors, or use one of the larger models instead if available.\n",
559 | " similarity = doc1.similarity(doc2)\n",
560 | "150it [00:01, 130.00it/s]\n"
561 | ]
562 | },
563 | {
564 | "data": {
565 | "text/plain": [
566 | "[0.9266226889377728,\n",
567 | " 0.7483038693015535,\n",
568 | " 0.9854740744166057,\n",
569 | " 0.8750943762180343,\n",
570 | " 0.1222527798910181,\n",
571 | " 0.9599760967911799,\n",
572 | " 0.15790873052795754,\n",
573 | " 0.3427842031666325,\n",
574 | " 0.8901137910677026,\n",
575 | " 0.9250525849039357,\n",
576 | " 0.6146497515738819,\n",
577 | " 0.6783382553049272,\n",
578 | " 0.9551248995922451,\n",
579 | " 0.627599031453704,\n",
580 | " 0.8775345504719663,\n",
581 | " 0.8153339463875003,\n",
582 | " 0.43039500631938543,\n",
583 | " 0.4415520404516888,\n",
584 | " 0.6499677098298162,\n",
585 | " 0.7572771114785681,\n",
586 | " 0.7478363709157725,\n",
587 | " 0.517494247868626,\n",
588 | " 0.5214024835335245,\n",
589 | " 0.5217871483441932,\n",
590 | " 0.4151627115886709,\n",
591 | " 0.7464344056291038,\n",
592 | " 0.5545979492340203,\n",
593 | " 0.7469779958000291,\n",
594 | " 0.3084552815614088,\n",
595 | " 0.9770636258190092,\n",
596 | " 0.42111358743318955,\n",
597 | " 0.508898890243933,\n",
598 | " 0.4691406705359161,\n",
599 | " 0.8500015026444628,\n",
600 | " 0.7013135151145201,\n",
601 | " 0.9176107960715111,\n",
602 | " 0.8260988634215954,\n",
603 | " 0.47781991339817054,\n",
604 | " 0.41822584352315856,\n",
605 | " 0.40500443370038985,\n",
606 | " 0.6149629113333346,\n",
607 | " 0.672761011302793,\n",
608 | " 0.9757408913151834,\n",
609 | " 0.7929636702502307,\n",
610 | " 0.28980344659973345,\n",
611 | " 0.5850302585113578,\n",
612 | " 0.5611672372311479,\n",
613 | " 0.8562951543701436,\n",
614 | " 0.950702976797263,\n",
615 | " 0.91938663377379,\n",
616 | " 0.9457809333641432,\n",
617 | " 0.4807638058305509,\n",
618 | " 0.9092084540315909,\n",
619 | " 0.6367839109198675,\n",
620 | " 0.9812164222549482,\n",
621 | " 0.8186939002754022,\n",
622 | " 0.9771496763658669,\n",
623 | " 0.7486600345735321,\n",
624 | " 0.41910924509250236,\n",
625 | " 0.4091024332843984,\n",
626 | " 0.3836090746137256,\n",
627 | " 0.9759682589885805,\n",
628 | " 0.44465350719130636,\n",
629 | " 0.6120472117886202,\n",
630 | " 0.9789453902284974,\n",
631 | " 0.7153902543235714,\n",
632 | " 0.7426386062855006,\n",
633 | " 0.9744701036083163,\n",
634 | " 0.6558226640808129,\n",
635 | " 0.9532569160570045,\n",
636 | " 0.6264219005218772,\n",
637 | " 0.468927884859053,\n",
638 | " 0.5790208651494848,\n",
639 | " 0.9093797194028849,\n",
640 | " 0.8619641459279357,\n",
641 | " 0.40928139596157587,\n",
642 | " 0.05515600619347201,\n",
643 | " 0.5654284362318464,\n",
644 | " 0.705511556698048,\n",
645 | " 0.7566292069891919,\n",
646 | " 0.5345443710608845,\n",
647 | " 0.9750915300170969,\n",
648 | " 0.2812566174949539,\n",
649 | " 0.9016269139383067,\n",
650 | " 0.7629358538450524,\n",
651 | " 0.9885831370195848,\n",
652 | " 0.8946553398557442,\n",
653 | " 0.40125993982842567,\n",
654 | " 0.9216778170046728,\n",
655 | " 0.3842899033080288,\n",
656 | " 0.9172229385791985,\n",
657 | " 0.5771070729807252,\n",
658 | " 0.34359980264799184,\n",
659 | " 0.8792366989444065,\n",
660 | " 0.9686253387818091,\n",
661 | " 0.8772259641843925,\n",
662 | " 0.28082071452590873,\n",
663 | " 0.6648062023302003,\n",
664 | " 0.7091523172314665,\n",
665 | " 0.9308563527991272,\n",
666 | " 0.6379564087919317,\n",
667 | " 0.2941533246890943,\n",
668 | " 0.40961384119132643,\n",
669 | " 0.9830999518848895,\n",
670 | " 0.8573340272480829,\n",
671 | " 0.6096735436269064,\n",
672 | " 0.867486599832943,\n",
673 | " 0.3184837440305669,\n",
674 | " 0.3161855948383834,\n",
675 | " 0.9135009372705184,\n",
676 | " 0.8137313915768604,\n",
677 | " 0.8123733670376505,\n",
678 | " 0.9667515554007875,\n",
679 | " 0.8935746811734981,\n",
680 | " 0.8542122675981001,\n",
681 | " 0.9885607732671146,\n",
682 | " 0.9491577734752112,\n",
683 | " 0.6577316278883858,\n",
684 | " 0.8955741072535015,\n",
685 | " 0.35920635435693815,\n",
686 | " 0.9752386936788853,\n",
687 | " 0.7163785906107277,\n",
688 | " 0.7413162555648729,\n",
689 | " 0.76000288795456,\n",
690 | " 0.8606838733019466,\n",
691 | " 0.6496282700594403,\n",
692 | " 0.40762294957370837,\n",
693 | " 0.8426753638284821,\n",
694 | " 0.9070421984574294,\n",
695 | " 0.9451967510573314,\n",
696 | " 0.07869086945968735,\n",
697 | " 0.5564012886490949,\n",
698 | " 0.9267505365363619,\n",
699 | " 0.6127896978889558,\n",
700 | " 0.41232448670824684,\n",
701 | " 0.6403345808255426,\n",
702 | " 0.2019148676242682,\n",
703 | " 0.929285189521436,\n",
704 | " 0.9783733409331761,\n",
705 | " 0.906493423110178]"
706 | ]
707 | },
708 | "execution_count": 78,
709 | "metadata": {},
710 | "output_type": "execute_result"
711 | }
712 | ],
713 | "source": [
714 | "def measure_similarity(sentence1, sentence2):\n",
715 | " try:\n",
716 | " doc1 = nlp(sentence1)\n",
717 | " doc2 = nlp(sentence2)\n",
718 | " similarity = doc1.similarity(doc2)\n",
719 | " return similarity\n",
720 | " except Exception as e:\n",
721 | " return -1\n",
722 | "\n",
723 | "results = []\n",
724 | "for index, row in tqdm(df.iterrows()):\n",
725 | " \n",
726 | " prompt = row['system']\n",
727 | " command = row['command']\n",
728 | " reference_answer = row['response']\n",
729 | " generated_answer = row['generated']\n",
730 | "\n",
731 | " #result = measure_correctness(prompt, command, generated_answer, reference_answer)\n",
732 | " result = measure_similarity(generated_answer, reference_answer)\n",
733 | " if result != -1:\n",
734 | " results.append(result)\n",
735 | " #time.sleep(0.05)\n",
736 | "\n",
737 | "results"
738 | ]
739 | },
740 | {
741 | "cell_type": "markdown",
742 | "metadata": {},
743 | "source": [
744 | "## Visualizations"
745 | ]
746 | },
747 | {
748 | "cell_type": "code",
749 | "execution_count": 113,
750 | "metadata": {},
751 | "outputs": [
752 | {
753 | "data": {
754 | "text/plain": [
755 | "(0.6885405904371525, 140)"
756 | ]
757 | },
758 | "execution_count": 113,
759 | "metadata": {},
760 | "output_type": "execute_result"
761 | }
762 | ],
763 | "source": [
764 | "import numpy as np\n",
765 | "np.array(results).mean(), len(results)"
766 | ]
767 | },
768 | {
769 | "cell_type": "code",
770 | "execution_count": 146,
771 | "metadata": {},
772 | "outputs": [
773 | {
774 | "data": {
775 | "image/png": "",
776 | "text/plain": [
777 | ""
778 | ]
779 | },
780 | "metadata": {},
781 | "output_type": "display_data"
782 | }
783 | ],
784 | "source": [
785 | "import matplotlib.pyplot as plt\n",
786 | " \n",
787 | "n, bins = np.histogram(results, bins=np.arange(0, 1.1, 0.1))\n",
788 | "plt.figure(figsize=(10, 6), dpi=150)\n",
789 | "plt.bar(bins[:-1], n, width=0.1, align='edge', edgecolor='black')\n",
790 | "plt.xticks(np.arange(0, 1.1, 0.1))\n",
791 | " \n",
792 | "plt.xlabel('Cosine Similarity Score')\n",
793 | "plt.ylabel('Frequency')\n",
794 | "\n",
795 | "#plt.title('Similarity Score Distribution between Generated and Reference Output ', fontweight = \"bold\")\n",
796 | "plt.tight_layout()\n",
797 | "plt.savefig('similarity_score_distribution_140samples.png')\n",
798 | "plt.show()\n"
799 | ]
800 | },
801 | {
802 | "cell_type": "code",
803 | "execution_count": 145,
804 | "metadata": {},
805 | "outputs": [
806 | {
807 | "data": {
808 | "image/png": "",
809 | "text/plain": [
810 | ""
811 | ]
812 | },
813 | "metadata": {},
814 | "output_type": "display_data"
815 | }
816 | ],
817 | "source": [
818 | "logs = [\n",
819 | " {\"current_steps\": 1, \"total_steps\": 36, \"loss\": 1.7841, \"learning_rate\": 0.0004990486745229364, \"epoch\": 0.1, \"percentage\": 2.78, \"elapsed_time\": \"0:00:18\", \"remaining_time\": \"0:11:00\"},\n",
820 | "{\"current_steps\": 2, \"total_steps\": 36, \"loss\": 1.6008, \"learning_rate\": 0.000496201938253052, \"epoch\": 0.2, \"percentage\": 5.56, \"elapsed_time\": \"0:00:35\", \"remaining_time\": \"0:09:59\"},\n",
821 | "{\"current_steps\": 3, \"total_steps\": 36, \"loss\": 1.2327, \"learning_rate\": 0.0004914814565722671, \"epoch\": 0.3, \"percentage\": 8.33, \"elapsed_time\": \"0:00:56\", \"remaining_time\": \"0:10:26\"},\n",
822 | "{\"current_steps\": 4, \"total_steps\": 36, \"loss\": 1.4636, \"learning_rate\": 0.0004849231551964771, \"epoch\": 0.41, \"percentage\": 11.11, \"elapsed_time\": \"0:01:08\", \"remaining_time\": \"0:09:09\"},\n",
823 | "{\"current_steps\": 5, \"total_steps\": 36, \"loss\": 1.4043, \"learning_rate\": 0.00047657694675916254, \"epoch\": 0.51, \"percentage\": 13.89, \"elapsed_time\": \"0:01:22\", \"remaining_time\": \"0:08:31\"},\n",
824 | "{\"current_steps\": 6, \"total_steps\": 36, \"loss\": 1.1577, \"learning_rate\": 0.00046650635094610973, \"epoch\": 0.61, \"percentage\": 16.67, \"elapsed_time\": \"0:01:51\", \"remaining_time\": \"0:09:18\"},\n",
825 | "{\"current_steps\": 7, \"total_steps\": 36, \"loss\": 1.1639, \"learning_rate\": 0.00045478801107224796, \"epoch\": 0.71, \"percentage\": 19.44, \"elapsed_time\": \"0:02:05\", \"remaining_time\": \"0:08:40\"},\n",
826 | "{\"current_steps\": 8, \"total_steps\": 36, \"loss\": 1.001, \"learning_rate\": 0.0004415111107797445, \"epoch\": 0.81, \"percentage\": 22.22, \"elapsed_time\": \"0:02:22\", \"remaining_time\": \"0:08:18\"},\n",
827 | "{\"current_steps\": 9, \"total_steps\": 36, \"loss\": 1.0497, \"learning_rate\": 0.00042677669529663686, \"epoch\": 0.91, \"percentage\": 25.0, \"elapsed_time\": \"0:02:37\", \"remaining_time\": \"0:07:53\"},\n",
828 | "{\"current_steps\": 10, \"total_steps\": 36, \"loss\": 0.9004, \"learning_rate\": 0.0004106969024216348, \"epoch\": 1.01, \"percentage\": 27.78, \"elapsed_time\": \"0:03:00\", \"remaining_time\": \"0:07:49\"},\n",
829 | "{\"current_steps\": 11, \"total_steps\": 36, \"loss\": 0.9483, \"learning_rate\": 0.00039339410908776154, \"epoch\": 1.11, \"percentage\": 30.56, \"elapsed_time\": \"0:03:13\", \"remaining_time\": \"0:07:19\"},\n",
830 | "{\"current_steps\": 12, \"total_steps\": 36, \"loss\": 0.8806, \"learning_rate\": 0.000375, \"epoch\": 1.22, \"percentage\": 33.33, \"elapsed_time\": \"0:03:28\", \"remaining_time\": \"0:06:57\"},\n",
831 | "{\"current_steps\": 13, \"total_steps\": 36, \"loss\": 1.0342, \"learning_rate\": 0.00035565456543517487, \"epoch\": 1.32, \"percentage\": 36.11, \"elapsed_time\": \"0:03:43\", \"remaining_time\": \"0:06:35\"},\n",
832 | "{\"current_steps\": 14, \"total_steps\": 36, \"loss\": 0.8794, \"learning_rate\": 0.0003355050358314172, \"epoch\": 1.42, \"percentage\": 38.89, \"elapsed_time\": \"0:04:06\", \"remaining_time\": \"0:06:26\"},\n",
833 | "{\"current_steps\": 15, \"total_steps\": 36, \"loss\": 0.767, \"learning_rate\": 0.00031470476127563017, \"epoch\": 1.52, \"percentage\": 41.67, \"elapsed_time\": \"0:04:27\", \"remaining_time\": \"0:06:14\"},\n",
834 | "{\"current_steps\": 16, \"total_steps\": 36, \"loss\": 0.8725, \"learning_rate\": 0.00029341204441673266, \"epoch\": 1.62, \"percentage\": 44.44, \"elapsed_time\": \"0:04:41\", \"remaining_time\": \"0:05:51\"},\n",
835 | "{\"current_steps\": 17, \"total_steps\": 36, \"loss\": 0.7652, \"learning_rate\": 0.0002717889356869146, \"epoch\": 1.72, \"percentage\": 47.22, \"elapsed_time\": \"0:05:09\", \"remaining_time\": \"0:05:45\"},\n",
836 | "{\"current_steps\": 18, \"total_steps\": 36, \"loss\": 0.9222, \"learning_rate\": 0.00025, \"epoch\": 1.82, \"percentage\": 50.0, \"elapsed_time\": \"0:05:34\", \"remaining_time\": \"0:05:34\"},\n",
837 | "{\"current_steps\": 19, \"total_steps\": 36, \"loss\": 0.8331, \"learning_rate\": 0.00022821106431308543, \"epoch\": 1.92, \"percentage\": 52.78, \"elapsed_time\": \"0:05:49\", \"remaining_time\": \"0:05:12\"},\n",
838 | "{\"current_steps\": 20, \"total_steps\": 36, \"loss\": 0.9051, \"learning_rate\": 0.00020658795558326743, \"epoch\": 2.03, \"percentage\": 55.56, \"elapsed_time\": \"0:06:02\", \"remaining_time\": \"0:04:49\"},\n",
839 | "{\"current_steps\": 21, \"total_steps\": 36, \"loss\": 0.7643, \"learning_rate\": 0.0001852952387243698, \"epoch\": 2.13, \"percentage\": 58.33, \"elapsed_time\": \"0:06:16\", \"remaining_time\": \"0:04:28\"},\n",
840 | "{\"current_steps\": 22, \"total_steps\": 36, \"loss\": 0.7853, \"learning_rate\": 0.00016449496416858284, \"epoch\": 2.23, \"percentage\": 61.11, \"elapsed_time\": \"0:06:29\", \"remaining_time\": \"0:04:07\"},\n",
841 | "{\"current_steps\": 23, \"total_steps\": 36, \"loss\": 0.8496, \"learning_rate\": 0.0001443454345648252, \"epoch\": 2.33, \"percentage\": 63.89, \"elapsed_time\": \"0:06:53\", \"remaining_time\": \"0:03:53\"},\n",
842 | "{\"current_steps\": 24, \"total_steps\": 36, \"loss\": 0.8802, \"learning_rate\": 0.00012500000000000006, \"epoch\": 2.43, \"percentage\": 66.67, \"elapsed_time\": \"0:07:07\", \"remaining_time\": \"0:03:33\"},\n",
843 | "{\"current_steps\": 25, \"total_steps\": 36, \"loss\": 0.8165, \"learning_rate\": 0.00010660589091223854, \"epoch\": 2.53, \"percentage\": 69.44, \"elapsed_time\": \"0:07:21\", \"remaining_time\": \"0:03:14\"},\n",
844 | "{\"current_steps\": 26, \"total_steps\": 36, \"loss\": 0.9055, \"learning_rate\": 8.930309757836516e-05, \"epoch\": 2.63, \"percentage\": 72.22, \"elapsed_time\": \"0:07:38\", \"remaining_time\": \"0:02:56\"},\n",
845 | "{\"current_steps\": 27, \"total_steps\": 36, \"loss\": 0.7184, \"learning_rate\": 7.322330470336314e-05, \"epoch\": 2.73, \"percentage\": 75.0, \"elapsed_time\": \"0:08:09\", \"remaining_time\": \"0:02:43\"},\n",
846 | "{\"current_steps\": 28, \"total_steps\": 36, \"loss\": 0.8326, \"learning_rate\": 5.848888922025553e-05, \"epoch\": 2.84, \"percentage\": 77.78, \"elapsed_time\": \"0:08:30\", \"remaining_time\": \"0:02:25\"},\n",
847 | "{\"current_steps\": 29, \"total_steps\": 36, \"loss\": 0.8122, \"learning_rate\": 4.521198892775202e-05, \"epoch\": 2.94, \"percentage\": 80.56, \"elapsed_time\": \"0:08:52\", \"remaining_time\": \"0:02:08\"},\n",
848 | "{\"current_steps\": 30, \"total_steps\": 36, \"loss\": 0.8105, \"learning_rate\": 3.3493649053890325e-05, \"epoch\": 3.04, \"percentage\": 83.33, \"elapsed_time\": \"0:09:07\", \"remaining_time\": \"0:01:49\"},\n",
849 | "{\"current_steps\": 31, \"total_steps\": 36, \"loss\": 0.7703, \"learning_rate\": 2.3423053240837516e-05, \"epoch\": 3.14, \"percentage\": 86.11, \"elapsed_time\": \"0:09:22\", \"remaining_time\": \"0:01:30\"},\n",
850 | "{\"current_steps\": 32, \"total_steps\": 36, \"loss\": 0.8548, \"learning_rate\": 1.5076844803522921e-05, \"epoch\": 3.24, \"percentage\": 88.89, \"elapsed_time\": \"0:09:37\", \"remaining_time\": \"0:01:12\"},\n",
851 | "{\"current_steps\": 33, \"total_steps\": 36, \"loss\": 0.8, \"learning_rate\": 8.51854342773295e-06, \"epoch\": 3.34, \"percentage\": 91.67, \"elapsed_time\": \"0:09:59\", \"remaining_time\": \"0:00:54\"},\n",
852 | "{\"current_steps\": 34, \"total_steps\": 36, \"loss\": 0.7628, \"learning_rate\": 3.798061746947995e-06, \"epoch\": 3.44, \"percentage\": 94.44, \"elapsed_time\": \"0:10:17\", \"remaining_time\": \"0:00:36\"},\n",
853 | "{\"current_steps\": 35, \"total_steps\": 36, \"loss\": 0.7972, \"learning_rate\": 9.513254770636137e-07, \"epoch\": 3.54, \"percentage\": 97.22, \"elapsed_time\": \"0:10:31\", \"remaining_time\": \"0:00:18\"},\n",
854 | "{\"current_steps\": 36, \"total_steps\": 36, \"loss\": 0.7941, \"learning_rate\": 0.0, \"epoch\": 3.65, \"percentage\": 100.0, \"elapsed_time\": \"0:10:45\", \"remaining_time\": \"0:00:00\"}\n",
855 | "]\n",
856 | "\n",
857 | "import numpy as np\n",
858 | "import matplotlib.pyplot as plt\n",
859 | "from scipy.interpolate import interp1d\n",
860 | "\n",
861 | "# Plot the per-step training loss from `logs` together with a coarse 10-point trend line.\n",
862 | "plt.figure(figsize=(10, 6), dpi=150)\n",
863 | "losses = np.array([x[\"loss\"] for x in logs])\n",
864 | "steps = range(1, len(logs)+1)\n",
865 | "\n",
866 | "plt.plot(steps, losses, label='Losses')\n",
867 | "plt.scatter(steps, losses)\n",
868 | "\n",
869 | "# interp1d is linear by default; resampling it at 10 points yields the coarse 'Smoothed' curve.\n",
870 | "f_linear = interp1d(steps, losses)\n",
871 | "xnew = np.linspace(steps[0], steps[-1], num=10, endpoint=True)  # span follows len(logs), not a fixed 36\n",
872 | "plt.plot(xnew, f_linear(xnew), '--', label='Smoothed', c='r')\n",
873 | "\n",
874 | "plt.xlabel('Steps')\n",
875 | "plt.ylabel('Training Loss')\n",
876 | "plt.legend(loc='best')\n",
877 | "plt.tight_layout()\n",
878 | "plt.savefig('training_loss.png')\n",
879 | "plt.show()\n"
880 | ]
881 | },
882 | {
883 | "cell_type": "code",
884 | "execution_count": null,
885 | "metadata": {},
886 | "outputs": [],
887 | "source": []
888 | }
889 | ],
890 | "metadata": {
891 | "kernelspec": {
892 | "display_name": "python3-11",
893 | "language": "python",
894 | "name": "python3"
895 | },
896 | "language_info": {
897 | "codemirror_mode": {
898 | "name": "ipython",
899 | "version": 3
900 | },
901 | "file_extension": ".py",
902 | "mimetype": "text/x-python",
903 | "name": "python",
904 | "nbconvert_exporter": "python",
905 | "pygments_lexer": "ipython3",
906 | "version": "3.11.8"
907 | }
908 | },
909 | "nbformat": 4,
910 | "nbformat_minor": 2
911 | }
912 |
--------------------------------------------------------------------------------