├── verilog_eval ├── verilog_eval │ ├── __init__.py │ ├── __pycache__ │ │ ├── data.cpython-311.pyc │ │ ├── data.cpython-38.pyc │ │ ├── __init__.cpython-311.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── evaluation.cpython-38.pyc │ │ ├── execution.cpython-311.pyc │ │ ├── execution.cpython-38.pyc │ │ └── evaluation.cpython-311.pyc │ ├── evaluate_functional_correctness.py │ ├── data.py │ ├── evaluation.py │ └── execution.py ├── build │ └── lib │ │ └── verilog_eval │ │ ├── __init__.py │ │ ├── evaluate_functional_correctness.py │ │ ├── data.py │ │ ├── evaluation.py │ │ └── execution.py ├── requirements.txt ├── verilog_eval.egg-info │ ├── dependency_links.txt │ ├── requires.txt │ ├── top_level.txt │ ├── PKG-INFO │ ├── entry_points.txt │ └── SOURCES.txt ├── dist │ └── verilog_eval-1.0-py3.8.egg ├── data │ ├── human-eval │ │ ├── HumanEval.jsonl.gz │ │ ├── example_problem.jsonl │ │ └── example_samples.jsonl │ └── example │ │ ├── ExampleSolution.jsonl │ │ ├── ExampleSolution.jsonl_reference.jsonl │ │ ├── ExampleDescriptions.jsonl │ │ └── ExampleEval.jsonl ├── setup.py ├── Dockerfile ├── LICENSE └── README.md ├── auto_data_gen_val ├── assets │ ├── verilog │ │ ├── context │ │ │ ├── context.optional_features.txt │ │ │ ├── system_context_raw.csv │ │ │ └── context.fixed_features.txt │ │ ├── documented_list.txt │ │ └── context_embedding │ │ │ └── system_context_embedding.csv │ └── xilinx_hls │ │ ├── context │ │ ├── context.optional_features.txt │ │ ├── system_context_raw.csv │ │ └── context.fixed_features.txt │ │ ├── context_embedding │ │ └── system_context_embedding.csv │ │ └── documented_list.txt ├── clean.sh ├── auto_restart_script.sh ├── auto_restart_script_1.sh ├── test_repo │ ├── passthrough.v │ └── multiplier.v ├── preprocess_data │ ├── prepare_example_code_strings.py │ ├── .env │ ├── process_data │ │ ├── dataset_viewer.py │ │ └── minhash.py │ ├── example_code_strings_simple_instructions.json │ ├── example_code_strings_detailed_instructions.json │ └── minhash_deduplicate.py 
├── move_dataset.sh ├── .env ├── run_all_part.sh ├── tool_utils.py ├── my_pydantic.py ├── gen_detailed_steps.py ├── gen_block_summaries_no_comment_exists.py ├── gen_verilogeval_baseline_summary.py ├── pre_proc_sync.py ├── dataset_utils_baseline.py ├── verilog_eval_to_part_data.py ├── gen_block_summaries.py ├── requirements.txt ├── code_validate.py ├── preliminary_exp.py ├── code_preprocesser.py ├── gen_global_summary.py ├── line_by_line_comments_gen.py └── code_repo_documentor.py ├── imgs ├── pyverilog_patch.png └── mg_verilog_logo-removebg-preview.png ├── .gitignore ├── inference_server_setup ├── hf_test.py ├── test.py └── README.md ├── sft_code ├── train_baseline.sh ├── train.sh ├── train_llm1.sh └── train_llm2.sh ├── model_eval_qlora ├── gen.sh ├── gen_fp.sh ├── gen_simple_description.sh ├── gen_llm1.sh ├── gen_llm2_block_to_code.sh └── standalone_eval.py ├── LICENSE └── document_customized_repo ├── decode_results.py ├── document_customized_repo.sh └── test_dir └── priority_encoder.v /verilog_eval/verilog_eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /verilog_eval/build/lib/verilog_eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /verilog_eval/requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | fire 3 | numpy 4 | -------------------------------------------------------------------------------- /verilog_eval/verilog_eval.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /auto_data_gen_val/assets/verilog/context/context.optional_features.txt: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /auto_data_gen_val/assets/verilog/documented_list.txt: -------------------------------------------------------------------------------- 1 | priority_encoder.v -------------------------------------------------------------------------------- /auto_data_gen_val/assets/xilinx_hls/context/context.optional_features.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /verilog_eval/verilog_eval.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | fire 3 | numpy 4 | -------------------------------------------------------------------------------- /verilog_eval/verilog_eval.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | verilog-eval 2 | verilog_eval 3 | -------------------------------------------------------------------------------- /auto_data_gen_val/assets/verilog/context/system_context_raw.csv: -------------------------------------------------------------------------------- 1 | Filename,File type,Summary,Text,Line_id 2 | -------------------------------------------------------------------------------- /imgs/pyverilog_patch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/imgs/pyverilog_patch.png -------------------------------------------------------------------------------- /auto_data_gen_val/assets/xilinx_hls/context/system_context_raw.csv: -------------------------------------------------------------------------------- 1 | Filename,File type,Summary,Text,Line_id 2 | -------------------------------------------------------------------------------- 
/auto_data_gen_val/assets/verilog/context_embedding/system_context_embedding.csv: -------------------------------------------------------------------------------- 1 | Filename,embedding,Line_id,Text 2 | -------------------------------------------------------------------------------- /auto_data_gen_val/assets/xilinx_hls/context_embedding/system_context_embedding.csv: -------------------------------------------------------------------------------- 1 | Filename,embedding,Line_id,Text 2 | -------------------------------------------------------------------------------- /auto_data_gen_val/clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | rm -rf assets/verilog/converse/* 3 | rm -rf assets/xilinx_hls/converse/* 4 | -------------------------------------------------------------------------------- /imgs/mg_verilog_logo-removebg-preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/imgs/mg_verilog_logo-removebg-preview.png -------------------------------------------------------------------------------- /verilog_eval/dist/verilog_eval-1.0-py3.8.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/verilog_eval/dist/verilog_eval-1.0-py3.8.egg -------------------------------------------------------------------------------- /verilog_eval/data/human-eval/HumanEval.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/verilog_eval/data/human-eval/HumanEval.jsonl.gz -------------------------------------------------------------------------------- /verilog_eval/verilog_eval.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: verilog-eval 3 | Version: 1.0 4 
| Author: NVIDIA 5 | License-File: LICENSE 6 | -------------------------------------------------------------------------------- /verilog_eval/verilog_eval.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [console_scripts] 2 | evaluate_functional_correctness = verilog_eval.evaluate_functional_correctness 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__ 2 | auto_data_gen_val/assets/verilog/code_and_comment_src 3 | auto_data_gen_val/assets/verilog/converse 4 | verilog_eval 5 | tmp 6 | cache -------------------------------------------------------------------------------- /verilog_eval/verilog_eval/__pycache__/data.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/verilog_eval/verilog_eval/__pycache__/data.cpython-311.pyc -------------------------------------------------------------------------------- /verilog_eval/verilog_eval/__pycache__/data.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/verilog_eval/verilog_eval/__pycache__/data.cpython-38.pyc -------------------------------------------------------------------------------- /verilog_eval/verilog_eval/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/verilog_eval/verilog_eval/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /verilog_eval/verilog_eval/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/verilog_eval/verilog_eval/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /verilog_eval/verilog_eval/__pycache__/evaluation.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/verilog_eval/verilog_eval/__pycache__/evaluation.cpython-38.pyc -------------------------------------------------------------------------------- /verilog_eval/verilog_eval/__pycache__/execution.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/verilog_eval/verilog_eval/__pycache__/execution.cpython-311.pyc -------------------------------------------------------------------------------- /verilog_eval/verilog_eval/__pycache__/execution.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/verilog_eval/verilog_eval/__pycache__/execution.cpython-38.pyc -------------------------------------------------------------------------------- /verilog_eval/verilog_eval/__pycache__/evaluation.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/verilog_eval/verilog_eval/__pycache__/evaluation.cpython-311.pyc -------------------------------------------------------------------------------- /auto_data_gen_val/auto_restart_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source /home/user_name/init_conda.sh 3 | conda activate tvm 4 | while true; do echo -e "n\nn\nn\nn\nn\nn\nn\nn\n" | python line_by_line_comments_gen.py && break; done 5 | 
-------------------------------------------------------------------------------- /verilog_eval/data/human-eval/example_problem.jsonl: -------------------------------------------------------------------------------- 1 | {"task_id": "test/0", "prompt": "def return1():\n", "canonical_solution": " return 1", "test": "def check(candidate):\n assert candidate() == 1", "entry_point": "return1"} 2 | -------------------------------------------------------------------------------- /auto_data_gen_val/auto_restart_script_1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source /home/user_name/init_conda.sh 3 | conda activate tvm 4 | #first argument start_id 5 | #second argument end_id 6 | 7 | echo "python $1 $2 $3" 8 | while true; do python $1 $2 $3 && break; done 9 | -------------------------------------------------------------------------------- /auto_data_gen_val/test_repo/passthrough.v: -------------------------------------------------------------------------------- 1 | module passthrough( 2 | clk, 3 | rst, 4 | op_din_en, 5 | op_din_eop, 6 | op_din, 7 | op_dout 8 | ); 9 | 10 | parameter Q = 8; 11 | parameter RELU = 0; 12 | 13 | input clk; 14 | input rst; 15 | input op_din_en; 16 | input op_din_eop; 17 | input [15:0] op_din; 18 | output [15:0] op_dout; 19 | 20 | assign op_dout = op_din; 21 | 22 | endmodule 23 | -------------------------------------------------------------------------------- /verilog_eval/data/human-eval/example_samples.jsonl: -------------------------------------------------------------------------------- 1 | {"task_id": "test/0", "completion": " import subprocess\n subprocess.check_output('rm -rf tmp')"} 2 | {"task_id": "test/0", "completion": " import time\n time.sleep(10)\n return 1"} 3 | {"task_id": "test/0", "completion": " return input('enter a number')"} 4 | {"task_id": "test/0", "completion": " return 1"} 5 | {"task_id": "test/0", "completion": " return 1"} 6 | {"task_id": "test/0", 
"completion": "\treturn 1"} 7 | -------------------------------------------------------------------------------- /verilog_eval/verilog_eval.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | LICENSE 2 | README.md 3 | setup.py 4 | verilog_eval/__init__.py 5 | verilog_eval/data.py 6 | verilog_eval/evaluate_functional_correctness.py 7 | verilog_eval/evaluation.py 8 | verilog_eval/execution.py 9 | verilog_eval.egg-info/PKG-INFO 10 | verilog_eval.egg-info/SOURCES.txt 11 | verilog_eval.egg-info/dependency_links.txt 12 | verilog_eval.egg-info/entry_points.txt 13 | verilog_eval.egg-info/requires.txt 14 | verilog_eval.egg-info/top_level.txt -------------------------------------------------------------------------------- /auto_data_gen_val/assets/xilinx_hls/context/context.fixed_features.txt: -------------------------------------------------------------------------------- 1 | I am training/fine-tuning an LLM to assist with hardware code (Xilinx HLS) generation. 2 | You are helping me prepare the training data, with code block and comment pairs. 3 | You will help me document hardware code with comments line by line. 4 | Do not add too obvious comments; only add comments when you think they are informative. 5 | Do not add comment immediately after parameters template parameters. 6 | For arguments, only add a single line comment at the beginning. 7 | You will also help me to decide whether multiple lines of code can be combined together as a code block. 
-------------------------------------------------------------------------------- /verilog_eval/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pkg_resources 4 | from setuptools import setup, find_packages 5 | 6 | 7 | setup( 8 | name="verilog-eval", 9 | py_modules=["verilog-eval"], 10 | version="1.0", 11 | description="", 12 | author="NVIDIA", 13 | packages=find_packages(), 14 | install_requires=[ 15 | str(r) 16 | for r in pkg_resources.parse_requirements( 17 | open(os.path.join(os.path.dirname(__file__), "requirements.txt")) 18 | ) 19 | ], 20 | entry_points={ 21 | "console_scripts": [ 22 | "evaluate_functional_correctness = verilog_eval.evaluate_functional_correctness", 23 | ] 24 | } 25 | ) 26 | -------------------------------------------------------------------------------- /inference_server_setup/hf_test.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | import transformers 3 | import torch 4 | 5 | model = "codellama/CodeLlama-34b-hf" 6 | 7 | 8 | tokenizer = AutoTokenizer.from_pretrained(model) 9 | pipeline = transformers.pipeline( 10 | "text-generation", 11 | model=model, 12 | torch_dtype=torch.float16, 13 | device_map="auto", 14 | ) 15 | 16 | sequences = pipeline( 17 | "// Vivado HDL program to do 4x4 matrix multiplication", 18 | do_sample=True, 19 | top_k=10, 20 | temperature=0.1, 21 | top_p=0.95, 22 | num_return_sequences=1, 23 | eos_token_id=tokenizer.eos_token_id, 24 | max_length=200, 25 | ) 26 | for seq in sequences: 27 | print(f"Result: {seq['generated_text']}") -------------------------------------------------------------------------------- /sft_code/train_baseline.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3 2 | 3 | # accelerate launch qlora.py 4 | 5 | WORLD_SIZE=4 torchrun --nproc_per_node=4 qlora.py \ 6 | 
--model_name_or_path codellama/CodeLlama-7b-Instruct-hf \ 7 | --source_max_len 2048 \ 8 | --target_max_len 1024 \ 9 | --output_dir ./data/Verilog_code_generation/new_baseline_verilogeval_global_summary \ 10 | --dataset_dir /data/user_name_data/user_name/sft_dataset/new_baseline_verilogeval_global_summary \ 11 | --cache_dir /data/user_name_data/user_name/HF_cache \ 12 | --gradient_accumulation_steps 4 \ 13 | --save_steps 500 14 | 15 | -------------------------------------------------------------------------------- /sft_code/train.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3 2 | 3 | # accelerate launch qlora.py 4 | 5 | WORLD_SIZE=4 torchrun --nproc_per_node=4 qlora.py \ 6 | --model_name_or_path codellama/CodeLlama-7b-Instruct-hf \ 7 | --hf_token "your_hf_token_if_you_want_to_use_it" \ 8 | --source_max_len 2048 \ 9 | --target_max_len 1024 \ 10 | --output_dir $OUTPUT_DIR/data/Verilog_code_generation/checkpoint_dir \ 11 | --dataset_dir $OUTPUT_DIR/packaged_dataset/merged_dataset \ 12 | --cache_dir /data/user_name_data/user_name/HF_cache \ 13 | --gradient_accumulation_steps 4 \ 14 | --save_steps 500 15 | 16 | -------------------------------------------------------------------------------- /sft_code/train_llm1.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3 2 | 3 | # accelerate launch qlora.py 4 | 5 | WORLD_SIZE=4 torchrun --nproc_per_node=4 qlora.py \ 6 | --model_name_or_path meta-llama/Llama-2-7b-chat-hf \ 7 | --source_max_len 1024 \ 8 | --target_max_len 2048 \ 9 | --output_dir ./data/Verilog_code_generation/llm1_new_verilogeval_global_summary_to_block_summary_skip_single_block \ 10 | --dataset_dir /data/user_name_data/user_name/sft_dataset/llm1_new_verilogeval_global_summary_to_block_summary_skip_single_block \ 11 | --cache_dir /data/user_name_data/user_name/HF_cache \ 12 | --gradient_accumulation_steps 4 \ 13 | 
--save_steps 500 14 | 15 | -------------------------------------------------------------------------------- /sft_code/train_llm2.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3 2 | 3 | # accelerate launch qlora.py 4 | 5 | WORLD_SIZE=4 torchrun --nproc_per_node=4 qlora.py \ 6 | --model_name_or_path codellama/CodeLlama-7b-Instruct-hf \ 7 | --source_max_len 2048 \ 8 | --target_max_len 1024 \ 9 | --output_dir ./data/Verilog_code_generation/llm2_block_summary_plus_new_verilogeval_global_summary_to_pure_code \ 10 | --dataset_dir /data/user_name_data/user_name/sft_dataset/llm2_block_summary_plus_new_verilogeval_global_summary_to_pure_code \ 11 | --cache_dir /data/user_name_data/user_name/HF_cache \ 12 | --gradient_accumulation_steps 4 \ 13 | --save_steps 500 14 | 15 | -------------------------------------------------------------------------------- /verilog_eval/data/example/ExampleSolution.jsonl: -------------------------------------------------------------------------------- 1 | {"task_id": "gatesv", "completion": "\n\tassign out_both = in[2:0] & in[3:1];\n\tassign out_any = in[2:0] | in[3:1];\n\tassign out_different = in^{in[0], in[3:1]};\n\t\nendmodule\n"} 2 | {"task_id": "gatesv", "completion": "\twire [3:0] any, both;\n\twire [3:0] different;\n\n\tassign both = in[3:2] & in[1:0];\n\tassign any = in[3:2] | in[1:0];\n\tassign different = ~in[0] & in[3:1];\n\n\tassign out_both = both[1:0];\n\tassign out_any = any[1:0];\n\tassign out_different = different[2:0];\nendmodule\n"} 3 | {"task_id": "vector4", "completion": "\n\tassign out = { {24{in[7]}}, in };\n\t\nendmodule\n"} 4 | {"task_id": "vector4", "completion": "\tassign out = { 24{in}, in };\nendmodule\n"} 5 | {"task_id": "zero", "completion": "\t\n\tassign zero = 1'b0;\n\nendmodule\n"} 6 | {"task_id": "zero", "completion": " assign zero = 1;\nendmodule\n"} 7 | -------------------------------------------------------------------------------- 
/model_eval_qlora/gen.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3 2 | 3 | export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64' 4 | 5 | accelerate launch --multi_gpu generate2.py \ 6 | --checkpoint_dir ./result_ckpt \ 7 | --model_type "qlora" \ 8 | --base_model "codellama/CodeLlama-7b-Instruct-hf" \ 9 | --tokenizer_type "code_llama" \ 10 | --cache_dir "/home/user_name/HF_cache/" \ 11 | --hf_token "your_hf_token_if_you_want_to_use_it" \ 12 | --max_new_tokens 1024 \ 13 | --temperature 0.7 \ 14 | --desc_file $OUTPUT_DIR/benchmark_packaged_dataset/hdlbits_for_llm2_eval.jsonl \ 15 | --desc_key "block_to_code_description" \ 16 | --prompt_type "llm2_block_to_code" \ 17 | --eval_file ../verilog_eval/data/VerilogEval_Machine.jsonl \ 18 | --output_file $OUTPUT_DIR/data/gen.jsonl \ 19 | --fp16 \ 20 | --sample_k 20 \ 21 | --result_name Test \ 22 | --batch_size 2 23 | -------------------------------------------------------------------------------- /model_eval_qlora/gen_fp.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3 2 | 3 | export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64' 4 | 5 | # accelerate launch --num_processes 4 generate2.py \ 6 | # # python generate.py \ 7 | # --model_name ./gpu5/output \ 8 | # --model_type "qlora" \ 9 | # --base_model "codellama/CodeLlama-7b-Instruct-hf" \ 10 | # --fp16 \ 11 | # --sample_k 20 \ 12 | # --result_name Test \ 13 | # --batch_size 2 14 | # # --bf16 \ 15 | # # --desc_file ./verilog_eval/desc_mini.jsonl \ 16 | # # --eval_file ./verilog_eval/eval_mini.jsonl \ 17 | 18 | accelerate launch --multi_gpu --num_processes 4 generate2_vanilla.py \ 19 | --model_type "qlora" \ 20 | --base_model "codellama/CodeLlama-7b-Instruct-hf" \ 21 | --bf16 \ 22 | --sample_k 10 \ 23 | --result_name Test \ 24 | --batch_size 1 \ 25 | # --desc_file ./verilog_eval/desc_mini.jsonl \ 26 | # --eval_file 
./verilog_eval/eval_mini.jsonl \ 27 | # --skip_gen \ 28 | # --bf16 \ 29 | -------------------------------------------------------------------------------- /inference_server_setup/test.py: -------------------------------------------------------------------------------- 1 | from langchain.llms import HuggingFaceTextGenInference 2 | from langchain.prompts import PromptTemplate 3 | from langchain.chains import LLMChain 4 | 5 | # LLM inference 6 | llm = HuggingFaceTextGenInference( 7 | inference_server_url="http://130.207.125.98:8080/", 8 | max_new_tokens=128, 9 | # top_k=10, 10 | # top_p=0.95, 11 | # typical_p=0.95, 12 | # temperature=0.9, 13 | # repetition_penalty=1.15 14 | ) 15 | 16 | 17 | llama2_prompt =""" 18 | [INST] <> 19 | {system_message} 20 | <> 21 | 22 | hello, I am test [/INST] I'm a large language model, so I don't have feelings like humans do, but I'm always happy to chat with you. Is there something specific you'd like to talk about or ask me? I'm here to help with any questions you might have. 
[INST] {human_input} [/INST] 23 | """ 24 | 25 | 26 | 27 | output = llm(llama2_prompt.format(system_message="You are a Chatbot", human_input="Hello, do you know what time it is?")) 28 | print(output) 29 | -------------------------------------------------------------------------------- /verilog_eval/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:22.08-py3 2 | LABEL maintainer="Mingjie Liu " 3 | RUN echo "alias python=python3" >> ~/.bashrc \ 4 | && echo "alias pip=pip3" >> ~/.bashrc 5 | RUN apt-get -y update \ 6 | && apt-get -y install vim 7 | RUN apt-get install wget 8 | RUN apt-get install -y autoconf gperf flex bison screen 9 | RUN python -m pip install --upgrade pip 10 | RUN python -m pip install deepspeed scikit-learn pandas numpy scipy wandb 11 | RUN python -m pip install accelerate>=0.12.0 torch>=1.3 datasets>=1.8.0 sentencepiece!=0.1.92 protobuf evaluate 12 | RUN python -m pip install git+https://github.com/huggingface/transformers/ 13 | RUN git clone https://github.com/steveicarus/iverilog.git && cd iverilog \ 14 | && git checkout 01441687235135d1c12eeef920f75d97995da333 \ 15 | && sh ./autoconf.sh && ./configure && make -j4\ 16 | && make install 17 | RUN python -m pip install jupyterlab 18 | RUN python -m pip install openai tiktoken 19 | ENV SHELL=/bin/bash -------------------------------------------------------------------------------- /model_eval_qlora/gen_simple_description.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3 2 | 3 | export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64' 4 | 5 | accelerate launch --multi_gpu generate2.py \ 6 | --checkpoint_dir /home/user_name/DAC_2024/checkpoint/merged_dataset/checkpoint-15000 \ 7 | --model_type "qlora" \ 8 | --base_model "codellama/CodeLlama-7b-Instruct-hf" \ 9 | --tokenizer_type "code_llama" \ 10 | --cache_dir "/home/user_name/HF_cache/" \ 11 | --hf_token 
"your_hf_token_if_you_want_to_use_it" \ 12 | --max_new_tokens 1024 \ 13 | --temperature 0.7 \ 14 | --top_p 0.95 \ 15 | --desc_file /home/user_name/DAC_2024/chatgpt4_auto_accel/fine_tune_dataset/auto_doc_part_dataset/hdlbits_description.jsonl \ 16 | --desc_key "detail_description" \ 17 | --prompt_type "baseline" \ 18 | --eval_file ../verilog_eval/data/VerilogEval_Machine.jsonl \ 19 | --output_file ./data/gen.merged_dataset+hdlbits_description.jsonl \ 20 | --fp16 \ 21 | --sample_k 10 \ 22 | --result_name "merged_dataset+hdlbits_description" \ 23 | --batch_size 2 -------------------------------------------------------------------------------- /auto_data_gen_val/preprocess_data/prepare_example_code_strings.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | import pandas as pd 5 | import json 6 | import jsonlines 7 | 8 | 9 | if __name__ == "__main__": 10 | task_ids = [ 11 | "shift18", 12 | "rule110", 13 | "lemmings1", 14 | "fsm3onehot" 15 | ] 16 | 17 | 18 | example_code_strings_name = "example_code_strings_detailed_instructions.json" 19 | eval_file = "../../verilog_eval/data/VerilogEval_Machine.jsonl" 20 | eval_dict = {} 21 | with jsonlines.open(eval_file) as reader: 22 | for obj in reader: 23 | eval_dict[obj["task_id"]] = {} 24 | eval_dict[obj["task_id"]]["code"] = obj["prompt"] + obj["canonical_solution"] 25 | 26 | #store in a json string 27 | example_code_strings = {} 28 | for task_id in task_ids: 29 | example_code_strings[task_id] = eval_dict[task_id]["code"] 30 | #store in a json file 31 | with open(example_code_strings_name, "w") as f: 32 | json.dump(example_code_strings, f, indent=4) 33 | 34 | -------------------------------------------------------------------------------- /verilog_eval/verilog_eval/evaluate_functional_correctness.py: -------------------------------------------------------------------------------- 1 | import fire 2 | import sys 3 | 4 | from 
verilog_eval.evaluation import evaluate_functional_correctness 5 | 6 | 7 | def entry_point( 8 | sample_file: str, 9 | problem_file: str, 10 | k: str = "1,5,10", 11 | n_workers: int = 32, 12 | timeout: float = 30.0, 13 | unit_test: bool = False, 14 | clean_up: bool = True, 15 | ): 16 | """ 17 | Evaluates the functional correctness of generated samples, and writes 18 | results to f"{sample_file}_results.jsonl.gz" 19 | """ 20 | 21 | #routines to separate the results by "eval_type" entry 22 | 23 | if type(k) == tuple: 24 | k = list(k) 25 | else: 26 | k = list(map(int, k.split(","))) 27 | results = evaluate_functional_correctness(sample_file, problem_file, k, n_workers, timeout, unit_test, clean_up) 28 | print(results) 29 | 30 | #verilator evaluation 31 | 32 | #customized iverilog evaluation 33 | 34 | #combine the results 35 | 36 | def main(): 37 | fire.Fire(entry_point) 38 | 39 | 40 | sys.exit(main()) 41 | -------------------------------------------------------------------------------- /verilog_eval/build/lib/verilog_eval/evaluate_functional_correctness.py: -------------------------------------------------------------------------------- 1 | import fire 2 | import sys 3 | 4 | from verilog_eval.evaluation import evaluate_functional_correctness 5 | 6 | 7 | def entry_point( 8 | sample_file: str, 9 | problem_file: str, 10 | k: str = "1,5,10", 11 | n_workers: int = 32, 12 | timeout: float = 30.0, 13 | unit_test: bool = False, 14 | clean_up: bool = True, 15 | ): 16 | """ 17 | Evaluates the functional correctness of generated samples, and writes 18 | results to f"{sample_file}_results.jsonl.gz" 19 | """ 20 | 21 | #routines to separate the results by "eval_type" entry 22 | 23 | if type(k) == tuple: 24 | k = list(k) 25 | else: 26 | k = list(map(int, k.split(","))) 27 | results = evaluate_functional_correctness(sample_file, problem_file, k, n_workers, timeout, unit_test, clean_up) 28 | print(results) 29 | 30 | #verilator evaluation 31 | 32 | #customized iverilog evaluation 33 | 
# Generate VerilogEval completions with the "llm1" QLoRA checkpoint.
#
# Runs generate2.py under `accelerate launch` with 4 processes on the 4 GPUs
# exposed through CUDA_VISIBLE_DEVICES. The checkpoint/cache paths and the
# --hf_token value are placeholders and must be edited for the local machine.
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Allocator tuning for PyTorch's CUDA caching allocator (max split size 64 MB).
export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64'

# Prompts come from --desc_file (field "detail_description"); results go to
# --output_file. 10 samples are requested per problem (--sample_k 10).
# NOTE(review): --skip_iverilog presumably skips the iverilog-based check
# inside generate2.py -- confirm against that script.
accelerate launch --multi_gpu --num_processes 4 generate2.py \
    --checkpoint_dir /home/user_name/DAC_2024/checkpoint/llm1_high_level_summary_to_block_summary_dataset_skip_single_blocks_usage_summary_combined_better_formating_2/checkpoint-9000 \
    --model_type "qlora" \
    --base_model "meta-llama/Llama-2-7b-chat-hf" \
    --tokenizer_type "llama" \
    --cache_dir "/home/user_name/HF_cache/" \
    --hf_token "your_hf_token_if_you_want_to_use_it" \
    --max_new_tokens 2048 \
    --temperature 0.7 \
    --top_p 0.1 \
    --top_k 40 \
    --repetition_penalty 1.17 \
    --desc_file ../verilog_eval/descriptions/VerilogDescription_Machine.jsonl \
    --desc_key "detail_description" \
    --prompt_type "llm1" \
    --eval_file ../verilog_eval/data/VerilogEval_Machine.jsonl \
    --output_file ./data/gen.llm1.jsonl \
    --fp16 \
    --sample_k 10 \
    --result_name Test \
    --batch_size 2 \
    --skip_iverilog
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /model_eval_qlora/gen_llm2_block_to_code.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3 2 | 3 | export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64' 4 | 5 | accelerate launch --multi_gpu generate2.py \ 6 | --checkpoint_dir /home/user_name/DAC_2024/checkpoint/llm2_new_block_summary_to_pure_code/checkpoint-9500 \ 7 | --model_type "qlora" \ 8 | --base_model "codellama/CodeLlama-7b-Instruct-hf" \ 9 | --tokenizer_type "code_llama" \ 10 | --cache_dir "/home/user_name/HF_cache/" \ 11 | --hf_token "your_hf_token_if_you_want_to_use_it" \ 12 | --max_new_tokens 1024 \ 13 | --temperature 0.6 \ 14 | --top_p 0.95 \ 15 | --desc_file /home/user_name/DAC_2024/chatgpt4_auto_accel/fine_tune_dataset/auto_doc_part_dataset/hdlbits_description_simple_description.jsonl \ 16 | --desc_key "simple_description" \ 17 | --prompt_type "baseline" \ 18 | --eval_file ../verilog_eval/data/VerilogEval_Machine.jsonl \ 19 | --output_file ./data/gen.llm2_new_block_summary_to_pure_code+simple_description.jsonl \ 20 | --fp16 \ 21 | --sample_k 10 \ 22 | --result_name "llm2_new_block_summary_to_pure_code+simple_description" \ 23 | --batch_size 2 -------------------------------------------------------------------------------- /verilog_eval/data/example/ExampleSolution.jsonl_reference.jsonl: -------------------------------------------------------------------------------- 1 | {"task_id": "gatesv", "completion": "\n\tassign out_both = in[2:0] & in[3:1];\n\tassign out_any = in[2:0] | in[3:1];\n\tassign out_different = in^{in[0], in[3:1]};\n\t\nendmodule\n", "result": "passed", 
"passed": true} 2 | {"task_id": "gatesv", "completion": "\twire [3:0] any, both;\n\twire [3:0] different;\n\n\tassign both = in[3:2] & in[1:0];\n\tassign any = in[3:2] | in[1:0];\n\tassign different = ~in[0] & in[3:1];\n\n\tassign out_both = both[1:0];\n\tassign out_any = any[1:0];\n\tassign out_different = different[2:0];\nendmodule\n", "result": "failed: 207 out of 213 samples.", "passed": false} 3 | {"task_id": "vector4", "completion": "\n\tassign out = { {24{in[7]}}, in };\n\t\nendmodule\n", "result": "passed", "passed": true} 4 | {"task_id": "vector4", "completion": "\tassign out = { 24{in}, in };\nendmodule\n", "result": "failed: syntax error.", "passed": false} 5 | {"task_id": "zero", "completion": "\t\n\tassign zero = 1'b0;\n\nendmodule\n", "result": "passed", "passed": true} 6 | {"task_id": "zero", "completion": " assign zero = 1;\nendmodule\n", "result": "failed: 20 out of 20 samples.", "passed": false} 7 | -------------------------------------------------------------------------------- /auto_data_gen_val/move_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mode=$1 4 | part_num=$2 5 | 6 | if [ "$mode" = 0 ]; then 7 | folder_to_remove=/home/user_name/DAC_2024/ckpts/test_10_30_${part_num}_complete 8 | echo "Removing $folder_to_remove" 9 | rm -rf $folder_to_remove 10 | mkdir $folder_to_remove 11 | echo "Copying assets to $folder_to_remove" 12 | cp -r assets $folder_to_remove 13 | echo "Copying code_vec_store to $folder_to_remove" 14 | cp -r ../code_vec_store $folder_to_remove 15 | echo "Copying documented_code to $folder_to_remove" 16 | cp -r documented_code $folder_to_remove 17 | elif [ "$mode" = 1 ]; then 18 | assets_dir=/home/user_name/DAC_2024/ckpts/test_10_30_${part_num}_complete/assets 19 | code_vec_store_dir=/home/user_name/DAC_2024/ckpts/test_10_30_${part_num}_complete/code_vec_store 20 | echo "Copying assets from $assets_dir to assets" 21 | rm -rf assets 22 | cp -r $assets_dir 
assets 23 | # rm -rf ../code_vec_store 24 | # echo "Copying code_vec_store from $code_vec_store_dir to ../code_vec_store" 25 | # cp -r $code_vec_store_dir ../code_vec_store 26 | rm -rf documented_code/* 27 | ./clean.sh 28 | fi 29 | -------------------------------------------------------------------------------- /auto_data_gen_val/assets/xilinx_hls/documented_list.txt: -------------------------------------------------------------------------------- 1 | write_out_stream_indirect.cpp 2 | layernorm_accumulate.cpp 3 | linear_weights_ping[ceildiv.cpp 4 | write_attn.cpp 5 | compute_patch_embed.cpp 6 | patch_embed_accumulate_compute.cpp 7 | prepare_attn.cpp 8 | top_k.cpp 9 | read_gate_inp.cpp 10 | write_attn_matmul_v.cpp 11 | read_attn_softmax_info.cpp 12 | read_kv.cpp 13 | compute_gating.cpp 14 | write_out_stream_direct.cpp 15 | read_x.cpp 16 | compute_q_matmul_k.cpp 17 | load_norms.cpp 18 | write_out_stream.cpp 19 | patch_embed_accumulate_read.cpp 20 | compute_add.cpp 21 | compute_norm1.cpp 22 | compute_norm2.cpp 23 | compute_gating_for_patch.cpp 24 | compute_norm.cpp 25 | compute_linear.cpp 26 | write_gate_results.cpp 27 | load_one_time_weights.cpp 28 | finalize_topk_scores_softmax.cpp 29 | load_linear_bias.cpp 30 | read_attn.cpp 31 | load_w_gate.cpp 32 | ViT_compute.cpp 33 | patch_embed_output.cpp 34 | compute_linear_on_stream.cpp 35 | compute_attn_matmul_v.cpp 36 | zero_output.cpp 37 | read_in_stream.cpp 38 | finalize_attn.cpp 39 | layernorm_output.cpp 40 | read_in_stream_indirect.cpp 41 | compute_moe.cpp 42 | read_in_stream_direct.cpp 43 | write_attn_softmax_info.cpp 44 | update_softmax_info.cpp 45 | patch_embed_accumulate.cpp -------------------------------------------------------------------------------- /auto_data_gen_val/.env: -------------------------------------------------------------------------------- 1 | #xilinx_hls or verilog 2 | TARGET_LANG="verilog" 3 | CHATBOT_BACKEND_DIR="${DATA4AIGCHIP_HOME}/auto_data_gen_val" 4 | OPENAI_API_KEY="your_openai_api_key" 
"""Interactive viewer for the records of the ``ckpt_separated_modules`` dataset.

Run as a script: it repeatedly asks for a record index, prints the record and
a few of its fields, and dumps the record's pre-processing source to
``test.v``. Type ``exit`` (or send EOF) to quit.
"""
import sys
import os
sys.path.append(os.path.abspath("../../"))
from datasets import load_dataset, load_from_disk, Dataset
import uuid

# import sys
# sys.path.append("../finetuning/")
# from llama import Tokenizer
import tiktoken

from pyverilog.vparser.parser import parse
import pyverilog.vparser.ast as vast
from minhash import deduplicate_dataset

import os
import subprocess
import json
from io import StringIO
from utils import *
from tqdm import tqdm
import numpy as np

if __name__ == "__main__":
    dataset_path = "ckpt_separated_modules"
    dataset = load_from_disk(dataset_path)
    # input from user
    while True:
        try:
            index = input("Enter index of the module to view: ")
        except EOFError:
            # stdin closed (e.g. piped input exhausted): leave like "exit".
            break
        if index == "exit":
            break

        # A typo or an out-of-range index must not kill the session; report
        # the problem and prompt again.
        try:
            index = int(index)
            record = dataset[index]
        except ValueError:
            print("Please enter an integer index or 'exit'.")
            continue
        except IndexError as err:
            print(f"Index out of range: {err}")
            continue

        print(record)
        print(record["module_name"])
        print(record["text"])
        print(record["task_id"])
        print(record["code_str_before_preprocessing"])

        # save code_str_before_preprocessing to a file
        # (explicit UTF-8 so non-ASCII comments in the source survive on any locale)
        with open("test.v", "w", encoding="utf-8") as f:
            f.write(record["code_str_before_preprocessing"])
26 | 27 | ## Setup HuggingFace Inference Server 28 | 29 | ``` 30 | model=codellama/CodeLlama-34b-hf 31 | volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run 32 | 33 | docker run --gpus '"device=0,1,2,3,4,5,6,7"' --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.1.0 --model-id $model 34 | ``` 35 | 36 | ## Run the test example 37 | 38 | ``` 39 | python test.py 40 | ``` -------------------------------------------------------------------------------- /auto_data_gen_val/assets/verilog/context/context.fixed_features.txt: -------------------------------------------------------------------------------- 1 | - Please act as an expert in hardware design using Verilog or SystemVerilog. 2 | - You will help me document a hardware code by adding comments, detailed descriptions of the code blocks, detailed descriptions of the modules, and also high-level descriptions of the modules. 3 | - When generating comments: 4 | - Do not add too obvious comments; only add comments when you think the code is not obvious. 5 | - Do not add comment immediately after parameters, e.g., `module_name #(.param1(1), .param2(2))`. 6 | - For arguments, only add a single line comment at the beginning. 7 | - You will also help me to decide whether multiple lines of code can be combined together as a code block. 8 | - When generating descriptions of the code blocks or modules: 9 | - Use as many high-level concepts that are directly applicable to describe the code of the whole design. 10 | - When necessary, explicitly mention the specifications of inputs and outputs in terms of their bit-width, range, and any other constraints or considerations. 11 | - Pay special attention to the temporal logic of the signals; e.g., how the registers are updated, how the state machines transition, etc. 
#!/bin/bash
# Process every dataset part in sequence.
#
# For each part: restore that part's checkpointed assets (move_dataset.sh
# mode 1), point test_10_30.py at the part by rewriting its `code_part = N`
# assignment, run the auto-restarting job, then archive the results
# (move_dataset.sh mode 0).
#
# Parts 8 and 9 are intentionally left out; append them to the list below to
# enable them.
for part in 0 1 2 3 4 5 6 7; do
    ./move_dataset.sh 1 "${part}"
    sed -i "s/code_part = [0-9]*/code_part = ${part}/" test_10_30.py
    ./auto_restart_script.sh
    ./move_dataset.sh 0 "${part}"
done
from typing import Iterable, Dict
import gzip
import json
import os


ROOT = os.path.dirname(os.path.abspath(__file__))


def read_problems(evalset_file: str) -> Dict[str, Dict]:
    """
    Loads an evaluation set and indexes its tasks by their "task_id" field.

    If several records share a task_id, the last one in the file wins.
    Raises KeyError if a record has no "task_id".
    """
    return {task["task_id"]: task for task in stream_jsonl(evalset_file)}


def stream_jsonl(filename: str) -> Iterable[Dict]:
    """
    Parses each jsonl line and yields it as a dictionary

    Files whose name ends with ".gz" are transparently decompressed.
    Whitespace-only lines are skipped. A leading "~" in the path is expanded,
    mirroring write_jsonl.
    """
    filename = os.path.expanduser(filename)
    # Decode as UTF-8 explicitly: write_jsonl always encodes UTF-8, so relying
    # on the locale's default encoding here would break on non-UTF-8 systems.
    if filename.endswith(".gz"):
        fp = gzip.open(filename, "rt", encoding="utf-8")
    else:
        fp = open(filename, "r", encoding="utf-8")
    with fp:
        for line in fp:
            if line.strip():
                yield json.loads(line)


def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
    """
    Writes an iterable of dictionaries to jsonl

    Falsy items in data (None, but also empty dicts) are skipped.
    With append=True records are added to the end of an existing file; for
    ".gz" targets this appends a new gzip member, which gzip readers
    concatenate transparently.
    """
    mode = 'ab' if append else 'wb'
    filename = os.path.expanduser(filename)
    # Lazily serialise one record per line, always as UTF-8 bytes.
    encoded_lines = ((json.dumps(x) + "\n").encode('utf-8') for x in data if x)
    if filename.endswith(".gz"):
        with open(filename, mode) as fp:
            with gzip.GzipFile(fileobj=fp, mode='wb') as gzfp:
                for chunk in encoded_lines:
                    gzfp.write(chunk)
    else:
        with open(filename, mode) as fp:
            for chunk in encoded_lines:
                fp.write(chunk)
| parameter C_WIDTH = 2; // data channel, set to 2 when using 8 bit 16 | parameter D_WIDTH = 8; 17 | parameter W_WIDTH = 16; 18 | parameter M_WIDTH = 32; 19 | /* W_WIDTH should be 1/2/8/16 */ 20 | input clk; 21 | input [D_WIDTH-1:0] d_i; 22 | input [W_WIDTH-1:0] w_i; 23 | output wire [M_WIDTH-1:0] m_o; 24 | 25 | reg [M_WIDTH-1:0] m_o_reg; 26 | wire [M_WIDTH-1:0] m_o_tmp; 27 | 28 | wire [23:0] w_i_tmp; 29 | 30 | assign w_i_tmp = {{w_i[15:8] + {8{w_i[7]}}},{8{w_i[7]}}, w_i[7:0]}; 31 | 32 | generate 33 | case (W_WIDTH/C_WIDTH) 34 | 1: always @(posedge clk) 35 | begin 36 | m_o_reg <= w_i[0]?(~d_i[D_WIDTH-1:0]+1):D_WIDTH[D_WIDTH-1:0]; 37 | end 38 | 2: always @ (posedge clk) 39 | begin 40 | m_o_reg <= w_i[1]?(~d_i[D_WIDTH-1:0]+1):(w_i[0]?d_i[D_WIDTH-1:0]:{D_WIDTH{1'b0}}); 41 | end 42 | 8: if (D_WIDTH == 8) begin 43 | mul24x8_signed u_mul24x8_signed ( 44 | .CLK(clk), 45 | .A(w_i_tmp), 46 | .B(d_i[7:0]), 47 | .P(m_o_tmp) // output 48 | ); 49 | assign m_o[15:0] = m_o_tmp[15:0]; 50 | assign m_o[31:16] = m_o_tmp[31:16]-{16{m_o_tmp[15]}}; 51 | end 52 | else 53 | mul16x16_signed u_mul16x16_signed ( 54 | .CLK(clk), 55 | .A(w_i[15:0]), 56 | .B(d_i[15:0]), 57 | .P(m_o) // output 58 | ); 59 | default: mul16x16_signed u_mul16x16_signed ( 60 | .CLK(clk), 61 | .A(w_i[15:0]), 62 | .B(d_i[15:0]), 63 | .P(m_o) // output 64 | ); 65 | endcase 66 | if (W_WIDTH < 3) begin: gen_mo 67 | assign m_o = m_o_reg; 68 | end 69 | endgenerate 70 | 71 | endmodule 72 | 73 | 74 | -------------------------------------------------------------------------------- /verilog_eval/build/lib/verilog_eval/data.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Dict 2 | import gzip 3 | import json 4 | import os 5 | 6 | 7 | ROOT = os.path.dirname(os.path.abspath(__file__)) 8 | 9 | def read_problems(evalset_file: str) -> Dict[str, Dict]: 10 | return {task["task_id"]: task for task in stream_jsonl(evalset_file)} 11 | 12 | 13 | def stream_jsonl(filename: 
from typing import Optional, Type
from langchain.tools import BaseTool, StructuredTool, Tool, tool
from pydantic import BaseModel, Field
from langchain.retrievers.multi_vector import MultiVectorRetriever
from typing import Any

from langchain.callbacks.manager import (
    AsyncCallbackManagerForToolRun,
    CallbackManagerForToolRun,
)


# You can provide a custom args schema to add descriptions or custom validation
class SCodeRetrieveSchema(BaseModel):
    """Argument schema for GlobalCodeRetrieve: a single free-text query."""

    query: str = Field(description="should be the function name you want to search for")


#TODO: add similarity thresholding
#TODO: multiple doc retrieval
class GlobalCodeRetrieve(BaseTool):
    """LangChain tool that looks up the stored summary of a code function.

    The query is run as a similarity search against ``retriever.vectorstore``
    and the ``"summary"`` metadata entry of the top hit is returned.
    """

    name = "retrieve_code_function"
    description = "useful for when wantting to look for a function called in a code block to retriveve its summary"
    args_schema: Type[SCodeRetrieveSchema] = SCodeRetrieveSchema
    # Object exposing a ``vectorstore`` attribute with ``similarity_search``.
    retriever: Any

    def __init__(self, retriever: Any):
        super(GlobalCodeRetrieve,self).__init__(retriever=retriever)

    def _run(
        self,
        query: str,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Use the tool.

        Returns the summary of the best-matching document, or a message
        stating that nothing matched when the search comes back empty.
        """
        docs = self.retriever.vectorstore.similarity_search(query)
        if not docs:
            # An empty store / no hit used to surface as IndexError and abort
            # the calling agent; report it as a normal tool answer instead.
            return f"Document summary: no matching document found for query '{query}'"
        doc_summary = docs[0].metadata["summary"]
        return f"Document summary: {doc_summary}"

    async def _arun(
        self,
        query: str,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ) -> str:
        """Use the tool asynchronously (not supported)."""
        raise NotImplementedError(f"{self.name} does not support async")
f.write(metadata["priority_encoder.v"]["global_summary_detailed"]) 23 | 24 | 25 | #block metadata 26 | block_metadatapath = "/documented_code/dataset_metadata/part0/block_summary.json" 27 | #load block metadata 28 | with open(output_dir+ block_metadatapath, "r") as f: 29 | block_metadata = json.load(f) 30 | #store "block_summary" to block_summary.txt 31 | block_idx = 0 32 | with open( "block_summary.txt", "w") as f: 33 | for block_summary in block_metadata["priority_encoder.v"]["block_summary"]: 34 | f.write(f"Block {block_idx}: {block_summary}\n\n") 35 | block_idx += 1 36 | 37 | documented_code_path = "/documented_code/part0/priority_encoder/priority_encoder.v" 38 | #store "documented_code" to documented_code.v 39 | with open( "documented_code.v", "w") as f: 40 | with open(output_dir+ documented_code_path, "r") as f2: 41 | f.write(f2.read()) 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /verilog_eval/data/example/ExampleDescriptions.jsonl: -------------------------------------------------------------------------------- 1 | {"task_id": "gatesv", "simple_description": " This module takes in 4-bit inputs and outputs 3-bit outputs based on the logical operations of AND, OR, and XOR.", "detail_description": " This Verilog module takes four input bits and produces three output bits. The first output bit is the result of a bitwise AND operation between the two least significant bits of the input. The second output bit is the result of a bitwise OR operation between the two least significant bits of the input. 
The third output bit is the result of a bitwise XOR operation between the two least significant bits of the input and all the other bits of the input except for the least significant bit."} 2 | {"task_id": "vector4", "simple_description": " This module takes an 8-bit input and replicates it 24 times to create a 32-bit output.", "detail_description": " This Verilog module is a simple combinational circuit that takes an 8-bit input and produces a 32-bit output. The output is formed by replicating the 8-bit input 24 times and then concatenating it with the original 8-bit input. This module does not contain any instantiated modules or state transitions, so the functionality is straightforward. The input is connected directly to the output, with the 8-bit input being replicated 24 times. The output is 32 bits wide, with the first 24 bits being the replicated input and the last 8 bits being the original input."} 3 | {"task_id": "zero", "simple_description": "This module assigns the output 'zero' to a logic value of 0.", "detail_description": " This top Verilog module is a simple module that assigns the output zero to a value of 0. This module does not have any inputs and is used to assign a constant value to the output. This module is useful when a constant value is needed for a design. For example, if a design requires a signal to be always 0, this module can be used to assign the output zero to 0. 
import json
import re
from typing import Type, TypeVar

from langchain.output_parsers.format_instructions import PYDANTIC_FORMAT_INSTRUCTIONS
from langchain.pydantic_v1 import BaseModel, ValidationError
from langchain.schema import BaseOutputParser, OutputParserException
from langchain.schema import AIMessage

T = TypeVar("T", bound=BaseModel)


class PydanticOutputParserMessages(BaseOutputParser[T]):
    """Parse an output using a pydantic model."""

    pydantic_object: Type[T]
    """The pydantic model to parse."""

    def parse(self, text: str) -> T:
        """Validate the first JSON object found in *text* against the model.

        NOTE: despite the annotation inherited from the base class, the value
        returned is the validated object's ``.json()`` serialisation (a
        string), not the model instance.

        Raises:
            OutputParserException: if no valid JSON object is found or it does
                not satisfy the model.
        """
        try:
            # Greedy search for 1st json candidate.
            match = re.search(
                r"\{.*\}", text.strip(), re.MULTILINE | re.IGNORECASE | re.DOTALL
            )
            # No candidate -> "" -> json.loads raises JSONDecodeError below.
            json_str = match.group() if match else ""
            json_object = json.loads(json_str, strict=False)
            return self.pydantic_object.parse_obj(json_object).json()

        except (json.JSONDecodeError, ValidationError) as e:
            name = self.pydantic_object.__name__
            msg = f"Failed to parse {name} from completion {text}. Got: {e}"
            raise OutputParserException(msg, llm_output=text) from e

    def get_format_instructions(self) -> str:
        """Return prompt instructions embedding the model's JSON schema."""
        schema = self.pydantic_object.schema()

        # Remove extraneous fields. Build a new dict rather than deleting in
        # place: the dict returned by schema() belongs to the model class, and
        # mutating it would strip "title"/"type" for every later caller.
        reduced_schema = {
            key: value
            for key, value in schema.items()
            if key not in ("title", "type")
        }
        # Ensure json in context is well-formed with double quotes.
        schema_str = json.dumps(reduced_schema)

        return PYDANTIC_FORMAT_INSTRUCTIONS.format(schema=schema_str)

    @property
    def _type(self) -> str:
        return "pydantic"
import os
import sys
import shutil
from dotenv import load_dotenv
load_dotenv()
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR")))
from embedding_lookup_utils import CodeDataset
from langchain.callbacks import get_openai_callback #with get_openai_callback() as cb:

if __name__ == "__main__":
    # Root folder that receives one "part<N>" sub-folder per processed part.
    dataset_metadata_dir = "./dataset_metadata/"
    if not os.path.exists(dataset_metadata_dir):
        os.makedirs(dataset_metadata_dir)

    # Parts [code_part_start_id, total_code_parts) are processed.
    total_code_parts = 6
    code_part_start_id = 2

    # A single callback object is shared by all parts and handed to CodeDataset.
    with get_openai_callback() as cb:
        for code_part_id in range(code_part_start_id, total_code_parts):
            part_dir = "{}/part{}".format(dataset_metadata_dir, code_part_id)
            if not os.path.exists(part_dir):
                os.makedirs(part_dir)

            # Inputs: the documented code and CSV exports of this part's checkpoint.
            documented_code_dir = "/home/user_name/DAC_2024/ckpts_test/test_10_30_{}_complete/documented_code/".format(code_part_id)
            csv_code_dir = "/home/user_name/DAC_2024/ckpts_test/test_10_30_{}_complete/assets/verilog/code_and_comment_src/csv_src/csv_code_src".format(code_part_id)
            csv_comment_dir = "/home/user_name/DAC_2024/ckpts_test/test_10_30_{}_complete/assets/verilog/code_and_comment_src/csv_src/csv_new_comment_src".format(code_part_id)

            # Outputs: everything is written below this part's metadata folder.
            bookkeeping_dir = "{}/part{}/bookkeeping/".format(dataset_metadata_dir,code_part_id)
            vectorembedding_dir = "{}/part{}/vectorembedding/".format(dataset_metadata_dir, code_part_id)
            detailed_steps_file = "{}/part{}/detailed_steps.json".format(dataset_metadata_dir, code_part_id)

            codedb = CodeDataset(
                documented_code_dir,
                bookkeeping_dir=bookkeeping_dir,
                vectorembedding_dir=vectorembedding_dir,
                force_refresh=False,
                cb=cb
            )
            codedb.load_and_split_code(
                skip_small_doc=True,
                split_by_line=True,
                based_on_code_lines_only=True,
                csv_code_dir=csv_code_dir,
                csv_comment_dir=csv_comment_dir
            )
            codedb.init_vectorstore()
            codedb.supplement_detailed_steps()

            codedb.save_detail_steps(detailed_steps_file, split_by_line = True)
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | 24 | This project contains code from human-eval (https://github.com/openai/human-eval/). 25 | 26 | The MIT License 27 | 28 | Copyright (c) OpenAI (https://openai.com) 29 | 30 | Permission is hereby granted, free of charge, to any person obtaining a copy 31 | of this software and associated documentation files (the "Software"), to deal 32 | in the Software without restriction, including without limitation the rights 33 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 34 | copies of the Software, and to permit persons to whom the Software is 35 | furnished to do so, subject to the following conditions: 36 | 37 | The above copyright notice and this permission notice shall be included in 38 | all copies or substantial portions of the Software. 39 | 40 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 41 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 42 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 43 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 44 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 45 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 46 | THE SOFTWARE. 
import os
import sys
import shutil
from dotenv import load_dotenv
load_dotenv()
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR")))
from embedding_lookup_utils import CodeDataset
from langchain.callbacks import get_openai_callback #with get_openai_callback() as cb:

# Driver script: for code parts 0..9, load the documented code of that part
# into a CodeDataset, let it fill in block summaries, and save them to
# ./dataset_metadata/part<N>/block_summary.json.
# Comments are read from the "csv_comment_src" directory of each checkpoint.

if __name__ == "__main__":
    dataset_metadata_dir = "./dataset_metadata/"
    if not os.path.exists(dataset_metadata_dir):
        os.makedirs(dataset_metadata_dir)

    code_part_start_id = 0
    total_code_parts = 10
    # One OpenAI callback is shared by every part; it is handed to CodeDataset as `cb`.
    with get_openai_callback() as cb:
        for code_part_id in range(code_part_start_id, total_code_parts):
            # Per-part output directory for bookkeeping, embeddings and summaries.
            part_dir = "{}/part{}".format(dataset_metadata_dir, code_part_id)
            if not os.path.exists(part_dir):
                os.makedirs(part_dir)

            # Per-part checkpoint produced by an earlier documentation run.
            ckpt_dir = "/home/user_name/DAC_2024/ckpts/test_10_30_{}_complete".format(code_part_id)
            csv_src_dir = ckpt_dir + "/assets/verilog/code_and_comment_src/csv_src"

            codedb = CodeDataset(
                ckpt_dir + "/documented_code/",
                bookkeeping_dir=part_dir + "/bookkeeping/",
                vectorembedding_dir=part_dir + "/vectorembedding/",
                force_refresh=False,
                cb=cb
            )
            codedb.load_and_split_code(
                skip_small_doc=True,
                split_by_line=True,
                based_on_code_lines_only=True,
                csv_code_dir=csv_src_dir + "/csv_code_src",
                csv_comment_dir=csv_src_dir + "/csv_comment_src"
            )
            codedb.init_vectorstore()
            codedb.supplement_summary(
                block_summary_placeholding=False,
                use_global_summary_for_block_summary=False
            )
            codedb.save_block_summary(
                part_dir + "/block_summary.json",
                split_by_line=True
            )
import os
import sys
from dotenv import load_dotenv
load_dotenv()
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR")))
import openai
import requests
import json
import copy
import time
import datetime
import shutil
from embedding_lookup_utils import *
from utils import *
from completion_handler import *
from code_preprocesser import *
from code_repo_documentor import *

#documenting the first version with module instantiation
#one_shot 5 lines
#pure llama 2 70B
#around 12k samples

# Driver script: builds a CodeRepoDocumentor for one code part, creates its
# embedding, preprocesses the code and packages the result into
# ./documented_code. Directory settings come from environment variables
# (loaded from .env by load_dotenv above).

if __name__ == "__main__":
    #NOTE: run utils.py first to partition the code
    code_part = 0
    code_dir = "/home/user_name/DAC_2024/ckpt3_user_name_valid_content_renamed/part{}".format(code_part)
    code_metadata_file = "/home/user_name/DAC_2024/ckpt3_user_name_valid_content_code_metadata/part{}/codes.json".format(code_part)
    code_lib_path = "/home/user_name/DAC_2024/ckpt3_user_name_valid_content_shared_lib/"
    code_vec_store = "../code_vec_store/test_10_30/"

    # Source-file suffixes handled for each supported target language.
    language = os.environ.get("TARGET_LANG")
    suffixes_by_language = {
        "verilog": [".v", ".sv", ".vh"],
        "xilinx_hls": [".c", ".cpp", ".h", ".hpp"],
    }
    if language not in suffixes_by_language:
        # Previously an unset/unknown TARGET_LANG left `code_suffix` unbound
        # and surfaced later as a NameError; fail here with the actual cause.
        raise ValueError(
            "TARGET_LANG must be one of {}, got {!r}".format(
                sorted(suffixes_by_language), language
            )
        )
    code_suffix = suffixes_by_language[language]

    store_src_code_dir = os.environ.get("STORE_SRC_CODE_DIR")
    csv_code_dir = os.environ.get("CSV_CODE_DIR")
    csv_comment_dir = os.environ.get("CSV_COMMENT_DIR")
    csv_new_comment_dir = os.environ.get("CSV_NEW_COMMENT_DIR")
    csv_pure_gen_comment_dir = os.environ.get("CSV_PURE_GEN_COMMENT_DIR")
    code_summary_dir = os.environ.get("CODE_SUMMARY_DIR")
    documented_code_dir = os.environ.get("DOCUMENTED_CODE_DIR")

    # NOTE(review): get_openai_callback is not imported by name in this file;
    # presumably one of the star imports above provides it -- confirm.
    with get_openai_callback() as cb:
        # When True, the original comments are dropped from the raw code copy
        # and from the raw code CSV.
        discard_original_comment = True

        code_repo_documentor = CodeRepoDocumentor(code_dir, store_src_code_dir,
                                                  csv_code_dir, csv_comment_dir, csv_new_comment_dir,
                                                  csv_pure_gen_comment_dir, code_summary_dir, documented_code_dir,
                                                  code_metadata_file=code_metadata_file,
                                                  code_suffix=code_suffix, language=language,
                                                  discard_original_comment=discard_original_comment,
                                                  code_lib_path=code_lib_path, code_vec_store=code_vec_store,
                                                  skip_rag_db=True,
                                                  cb=cb)
        code_repo_documentor.create_embedding()
        code_repo_documentor.code_preprocess()
        code_repo_documentor.package_documented_code("./documented_code")
WL: WR;\n\t\tendcase\n end\n \n always @(posedge clk, posedge areset) begin\n\t\tif (areset) state <= WL;\n else state <= next;\n\tend\n\t\t\n\tassign walk_left = (state==WL);\n\tassign walk_right = (state==WR);\n\n\t\nendmodule\n", 3 | "rotate100": "module top_module(\n\tinput clk,\n\tinput load,\n\tinput [1:0] ena,\n\tinput [99:0] data,\n\toutput reg [99:0] q);\n\t\n\t\n\talways @(posedge clk) begin\n\t\tif (load)\n\t\t\tq <= data;\n\t\telse if (ena == 2'h1)\n\t\t\tq <= {q[0], q[99:1]};\n\t\telse if (ena == 2'h2)\n\t\t\tq <= {q[98:0], q[99]};\n\tend\nendmodule\n", 4 | "vector2": "module top_module (\n\tinput [31:0] in,\n\toutput [31:0] out\n);\n\n\tassign out = {in[7:0], in[15:8], in[23:16], in[31:24]};\t\n\t\nendmodule\n", 5 | "gatesv100": "module top_module (\n\tinput [99:0] in,\n\toutput [98:0] out_both,\n\toutput [99:1] out_any,\n\toutput [99:0] out_different\n);\n\n\tassign out_both = in & in[99:1];\n\tassign out_any = in | in[99:1];\n\tassign out_different = in^{in[0], in[99:1]};\n\t\nendmodule\n", 6 | "history_shift": "module top_module\n(\n input clk,\n input areset,\n input predict_valid,\n input predict_taken,\n output logic [31:0] predict_history,\n \n input train_mispredicted,\n input train_taken,\n input [31:0] train_history\n);\n always@(posedge clk, posedge areset)\n\t\tif (areset) begin\n\t\t\tpredict_history = 0;\n end\telse begin\n\t\t\tif (train_mispredicted)\n\t\t\t\tpredict_history <= {train_history, train_taken};\n\t\t\telse if (predict_valid)\n\t\t\t\tpredict_history <= {predict_history, predict_taken};\n\t\tend\nendmodule\n", 7 | "ece241_2013_q2": "module top_module (\n\tinput a,\n\tinput b,\n\tinput c,\n\tinput d,\n\toutput out_sop,\n\toutput out_pos\n);\n\t\n\twire pos0, pos1;\n\tassign out_sop = c&d | ~a&~b&c;\n\tassign pos0 = c & (~b|d)&(~a|b);\n\tassign pos1 = c & (~b|d)&(~a|d);\n\t\n\tassign out_pos = (pos0 == pos1) ? 
pos0 : 1'bx;\nendmodule\n", 8 | "dff16e": "module top_module(\n\tinput clk,\n\tinput resetn,\n\tinput [1:0] byteena,\n\tinput [15:0] d,\n\toutput reg [15:0] q);\n\t\n\talways @(posedge clk) begin\n\t\tif (!resetn)\n\t\t\tq <= 0;\n\t\telse begin\n\t\t\tif (byteena[0])\n\t\t\t\tq[7:0] <= d[7:0];\n\t\t\tif (byteena[1])\n\t\t\t\tq[15:8] <= d[15:8];\n\t\tend\n\tend\n\t\nendmodule\n", 9 | "fsm2": "module top_module (\n\tinput clk,\n\tinput j,\n\tinput k,\n\tinput areset,\n\toutput out\n);\n\tparameter A=0, B=1;\n\treg state;\n\treg next;\n \n always_comb begin\n\t\tcase (state)\n\t\t\tA: next = j ? B : A;\n\t\t\tB: next = k ? A : B;\n\t\tendcase\n end\n \n always @(posedge clk, posedge areset) begin\n\t\tif (areset) state <= A;\n else state <= next;\n\tend\n\t\t\n\tassign out = (state==B);\n\n\t\nendmodule\n", 10 | "vector100r": "module top_module (\n\tinput [99:0] in,\n\toutput reg [99:0] out\n);\n\t\n\talways_comb \n\t\tfor (int i=0;i<$bits(out);i++)\n\t\t\tout[i] = in[$bits(out)-i-1];\n\t\nendmodule\n" 11 | } -------------------------------------------------------------------------------- /auto_data_gen_val/dataset_utils_baseline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from dotenv import load_dotenv 4 | load_dotenv() 5 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR"))) 6 | import openai 7 | import requests 8 | import json 9 | import copy 10 | import time 11 | import datetime 12 | import shutil 13 | import pandas as pd 14 | import tiktoken 15 | from openai.embeddings_utils import get_embedding, cosine_similarity 16 | from ast import literal_eval 17 | import numpy as np 18 | import re 19 | from tqdm import tqdm 20 | from datasets import load_from_disk 21 | from datasets import Dataset 22 | import jsonlines 23 | from utils import extract_module_header, preprocess 24 | 25 | 26 | 27 | def 
def random_sample_dataset(datasetpath, sample_percent, savepath):
    """Save and return a random subset of a dataset stored on disk.

    Args:
        datasetpath: Directory to read with ``load_from_disk``.
        sample_percent: Fraction of rows to keep; the subset has
            ``int(len(dataset) * sample_percent)`` rows (so 0.1 keeps 10%).
        savepath: Directory the subset is written to with ``save_to_disk``.

    Returns:
        The sampled dataset.
    """
    dataset = load_from_disk(datasetpath)
    # Fixed seed so repeated runs select the same rows.
    dataset = dataset.shuffle(seed=42)
    dataset = dataset.select(range(int(len(dataset)*sample_percent)))
    dataset.save_to_disk(savepath)
    return dataset


def find_fail_all_entries(result_file):
    """Return the task ids for which no record in ``result_file`` passed.

    Args:
        result_file: JSON Lines file whose records carry ``"task_id"`` and
            ``"result"`` keys; a record counts as a pass when ``"result"``
            equals ``"passed"``.

    Returns:
        A set with every task id that appears in the file and has no passing
        record.
    """
    all_entries = set()
    passed_entries = set()
    # Single pass: remember every task id and, separately, those that passed.
    with jsonlines.open(result_file) as reader:
        for obj in reader:
            all_entries.add(obj["task_id"])
            if obj["result"] == "passed":
                passed_entries.add(obj["task_id"])
    return all_entries - passed_entries


def form_new_prob_file(orig_prob_file, new_prob_file, fail_entries):
    """Copy the problems whose task id is in ``fail_entries`` to a new file.

    Args:
        orig_prob_file: JSON Lines file of problems, each with a ``"task_id"``.
        new_prob_file: Destination JSON Lines file; it is overwritten, and its
            parent directory is created when missing.
        fail_entries: Container of task ids to keep (membership is tested
            with ``in``).

    Returns:
        ``new_prob_file``.
    """
    # The driver below writes into a sub-directory that nothing else creates.
    out_dir = os.path.dirname(new_prob_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with jsonlines.open(orig_prob_file) as reader:
        with jsonlines.open(new_prob_file, mode='w') as writer:
            for obj in reader:
                if obj["task_id"] in fail_entries:
                    writer.write(obj)
    return new_prob_file


if __name__ == "__main__":
    # datasetpath = "/home/user_name/DAC_2024/sft_dataset/vanilla_baseline"
    # sample_percent = 0.1
    # savepath = "/home/user_name/DAC_2024/sft_dataset/vanilla_baseline_sample_0_{}".format(int(sample_percent*10))
    # random_sample_dataset(datasetpath, sample_percent, savepath)
    result_file = "/home/user_name/DAC_2024/chatgpt4_auto_accel/model_eval_qlora_kevin/data/gen.merged_dataset+simple_description.jsonl_results.jsonl"
    new_prob_dir = "./special_set"
    hdlbits_hlvl = "hdlbits_description.jsonl"
    hdlbits_simple_desc = "hdlbits_description_simple_description.jsonl"
    hdlbits_detail_desc = "hdlbits_description_detail_description.jsonl"
    hdlbits_block_desc = "hdlbits_for_llm2_eval.jsonl"
    eval_file = "/home/user_name/DAC_2024/chatgpt4_auto_accel/verilog_eval/data/VerilogEval_Machine.jsonl"
    fail_entries = find_fail_all_entries(result_file)
    print(len(fail_entries))
    print(fail_entries)
71 | new_prob_file = form_new_prob_file(hdlbits_hlvl, os.path.join(new_prob_dir, hdlbits_hlvl), fail_entries) 72 | new_prob_file = form_new_prob_file(hdlbits_simple_desc, os.path.join(new_prob_dir, hdlbits_simple_desc), fail_entries) 73 | new_prob_file = form_new_prob_file(hdlbits_detail_desc, os.path.join(new_prob_dir, hdlbits_detail_desc), fail_entries) 74 | new_prob_file = form_new_prob_file(hdlbits_block_desc, os.path.join(new_prob_dir, hdlbits_block_desc), fail_entries) 75 | new_prob_file = form_new_prob_file(eval_file, os.path.join(new_prob_dir, "VerilogEval_Machine.jsonl"), fail_entries) 76 | -------------------------------------------------------------------------------- /document_customized_repo/test_dir/priority_encoder.v: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (c) 2014-2021 Alex Forencich 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | 23 | */ 24 | 25 | // Language: Verilog 2001 26 | 27 | `resetall 28 | `timescale 1ns / 1ps 29 | `default_nettype none 30 | 31 | /* 32 | * Priority encoder module 33 | */ 34 | module priority_encoder # 35 | ( 36 | parameter WIDTH = 4, 37 | // LSB priority selection 38 | parameter LSB_HIGH_PRIORITY = 0 39 | ) 40 | ( 41 | input wire [WIDTH-1:0] input_unencoded, 42 | output wire output_valid, 43 | output wire [$clog2(WIDTH)-1:0] output_encoded, 44 | output wire [WIDTH-1:0] output_unencoded 45 | ); 46 | 47 | parameter LEVELS = WIDTH > 2 ? $clog2(WIDTH) : 1; 48 | parameter W = 2**LEVELS; 49 | 50 | // pad input to even power of two 51 | wire [W-1:0] input_padded = {{W-WIDTH{1'b0}}, input_unencoded}; 52 | 53 | wire [W/2-1:0] stage_valid[LEVELS-1:0]; 54 | wire [W/2-1:0] stage_enc[LEVELS-1:0]; 55 | 56 | generate 57 | genvar l, n; 58 | 59 | // process input bits; generate valid bit and encoded bit for each pair 60 | for (n = 0; n < W/2; n = n + 1) begin : loop_in 61 | assign stage_valid[0][n] = |input_padded[n*2+1:n*2]; 62 | if (LSB_HIGH_PRIORITY) begin 63 | // bit 0 is highest priority 64 | assign stage_enc[0][n] = !input_padded[n*2+0]; 65 | end else begin 66 | // bit 0 is lowest priority 67 | assign stage_enc[0][n] = input_padded[n*2+1]; 68 | end 69 | end 70 | 71 | // compress down to single valid bit and encoded bus 72 | for (l = 1; l < LEVELS; l = l + 1) begin : loop_levels 73 | for (n = 0; n < W/(2*2**l); n = n + 1) begin : loop_compress 74 | assign stage_valid[l][n] = |stage_valid[l-1][n*2+1:n*2]; 75 | if (LSB_HIGH_PRIORITY) begin 76 | // bit 0 is highest priority 77 | assign stage_enc[l][(n+1)*(l+1)-1:n*(l+1)] = stage_valid[l-1][n*2+0] ? {1'b0, stage_enc[l-1][(n*2+1)*l-1:(n*2+0)*l]} : {1'b1, stage_enc[l-1][(n*2+2)*l-1:(n*2+1)*l]}; 78 | end else begin 79 | // bit 0 is lowest priority 80 | assign stage_enc[l][(n+1)*(l+1)-1:n*(l+1)] = stage_valid[l-1][n*2+1] ? 
import os
import sys
from dotenv import load_dotenv
load_dotenv()
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR")))
import openai
import requests
import json
import copy
import time
import datetime
import shutil
import pandas as pd
import tiktoken
from openai.embeddings_utils import get_embedding, cosine_similarity
from ast import literal_eval
import numpy as np
import re
from tqdm import tqdm
import tiktoken
from datasets import load_from_disk
from datasets import Dataset
from utils import *

# Prompt templates (not referenced elsewhere in this file).
# NOTE(review): the bare "<>" markers look like a garbled form of the Llama-2
# "<<SYS>>" / "<</SYS>>" system tags -- confirm against the original source.
llama2_prompt_with_memory ="""
[INST] <>
{system_message}
<>

{chat_history} {human_input} [/INST]
"""

llama2_prompt_without_memory ="""
[INST] <>
{system_message}
<>

{human_input} [/INST]
"""

llama2_prompt_without_memory_without_sys ="""
[INST] {human_input} [/INST]
"""

llama2_pompt_with_memory_without_sys ="""
[INST] {chat_history} {human_input} [/INST]
"""

llama2_memory_prompt ="""{human_input} [/INST] {model_reply}[INST]"""


def eval_file_to_part_data(eval_file, data_dir, meta_data_dir):
    """Convert a JSON Lines evaluation file into per-module ``.v`` files.

    Every record must provide ``"task_id"``, ``"prompt"`` and
    ``"canonical_solution"``. The prompt has ``top_module`` replaced by the
    task id, is joined with the canonical solution, and the result is passed
    through ``part_verilog_module_string``; the single module it returns is
    written to ``<data_dir>/<task_id>.v``. A placeholder metadata entry per
    task is collected and written to ``<meta_data_dir>/codes.json``.

    WARNING: ``data_dir`` and ``meta_data_dir`` are deleted and recreated.

    Args:
        eval_file: Path of the JSON Lines evaluation file.
        data_dir: Output directory for the Verilog files.
        meta_data_dir: Output directory for ``codes.json``.

    Returns:
        ``data_dir``.

    Raises:
        AssertionError: If a record does not yield exactly one module, or the
            module name differs from the task id.
    """
    # Start from empty output directories.
    for out_dir in (data_dir, meta_data_dir):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        os.makedirs(out_dir)

    # Blank lines (e.g. a trailing newline) are skipped instead of crashing json.loads.
    with open(eval_file, "r") as f:
        code_pieces = [json.loads(line) for line in f if line.strip()]

    meta_data = {}
    for code in tqdm(code_pieces, total=len(code_pieces), desc="Preparing data"):
        code_name = code["task_id"]
        module_header = code["prompt"].replace("top_module", code_name)
        code_content = module_header+code["canonical_solution"]

        #preprocess code
        output_str_list, module_name_list = part_verilog_module_string(code_content)
        # Explicit raises instead of `assert`, so the checks survive `python -O`;
        # the exception type is kept for existing callers.
        if len(module_name_list) != 1:
            raise AssertionError(
                "expected exactly one module for task {!r}, got {}".format(
                    code_name, len(module_name_list)
                )
            )
        if code_name != module_name_list[0]:
            raise AssertionError(
                "module name {!r} does not match task id {!r}".format(
                    module_name_list[0], code_name
                )
            )
        code_content = output_str_list[0]

        #enter dummy meta data
        meta_data[code_name] = {"code_name": code_name, "module_inst_list": []}

        #save code to file
        code_file = os.path.join(data_dir, code_name+".v")
        with open(code_file, "w") as f:
            f.write(code_content)

    #save meta data to file (once, after every module has been processed)
    meta_data_file = os.path.join(meta_data_dir, "codes.json")
    with open(meta_data_file, "w") as f:
        json.dump(meta_data, f, indent=4)

    return data_dir




if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--eval_file", type=str, default="/home/user_name/DAC_2024/chatgpt4_auto_accel/verilog_eval/data/VerilogEval_Machine.jsonl")
    parser.add_argument("--data_dir", type=str, default="/home/user_name/DAC_2024/ckpt3_user_name_valid_content_renamed/part10")
    parser.add_argument("--meta_data_dir", type=str, default="/home/user_name/DAC_2024/ckpt3_user_name_valid_content_code_metadata/part10")
    args = parser.parse_args()
    eval_file = args.eval_file
    data_dir = args.data_dir
    meta_data_dir = args.meta_data_dir
    eval_file_to_part_data(eval_file, data_dir, meta_data_dir)
import os
import sys
import shutil
import argparse
from dotenv import load_dotenv
load_dotenv()
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR")))
from embedding_lookup_utils import CodeDataset
from langchain.callbacks import get_openai_callback #with get_openai_callback() as cb:


def main():
    """Generate and save block summaries for a range of code parts.

    Command line: ``start_id total_code_parts`` followed by optional flags.
    Parts ``start_id`` .. ``total_code_parts - 1`` are processed; for each one
    the summaries are saved to
    ``<documented_code_dir>/dataset_metadata/part<N>/block_summary.json``.
    """
    parser = argparse.ArgumentParser()
    #positional arguments
    #start_id, total_code_parts
    parser.add_argument("start_id", help="start id of the code parts", type=int)
    parser.add_argument("total_code_parts", help="total number of code parts", type=int)

    #optional arguments
    parser.add_argument("--documented_code_dir", help="documented code directory", type=str, default="/home/user_name/DAC_2024/ckpts/")
    parser.add_argument("--code_metadata_dir", help="code metadata directory", type=str, default="/home/user_name/DAC_2024/ckpt3_user_name_valid_content_code_metadata/")
    parser.add_argument("--block_line_length", help="block line length", type=int, default=10)
    parser.add_argument("--model", help="model", type=str, default="gpt-3.5-turbo-1106")
    args = parser.parse_args()

    # All generated metadata lives next to the documented code.
    dataset_metadata_dir = os.path.join(args.documented_code_dir, "dataset_metadata")
    if not os.path.exists(dataset_metadata_dir):
        os.makedirs(dataset_metadata_dir)

    # One OpenAI callback is shared by every part; it is handed to CodeDataset as `cb`.
    with get_openai_callback() as cb:
        for code_part_id in range(args.start_id, args.total_code_parts):
            part_name = "part{}".format(code_part_id)
            part_metadata_dir = "{}/{}".format(dataset_metadata_dir, part_name)
            if not os.path.exists(part_metadata_dir):
                os.makedirs(part_metadata_dir)

            # Code and comment CSVs of this part, below the code metadata directory.
            csv_src_root = os.path.join(
                args.code_metadata_dir, part_name,
                "assets", "verilog", "code_and_comment_src", "csv_src",
            )

            codedb = CodeDataset(
                os.path.join(args.documented_code_dir, part_name),
                bookkeeping_dir=part_metadata_dir + "/bookkeeping/",
                vectorembedding_dir=part_metadata_dir + "/vectorembedding/",
                force_refresh=False,
                cb=cb
            )
            codedb.load_and_split_code(
                skip_small_doc=True,
                split_by_line=True,
                line_length=args.block_line_length,
                based_on_code_lines_only=True,
                csv_code_dir=os.path.join(csv_src_root, "csv_code_src"),
                csv_comment_dir=os.path.join(csv_src_root, "csv_new_comment_src")
            )
            codedb.init_vectorstore(block_summary_model=args.model)
            codedb.supplement_summary(block_summary_placeholding=False)
            codedb.save_block_summary(
                part_metadata_dir + "/block_summary.json",
                split_by_line=True
            )


if __name__ == "__main__":
    main()
contourpy==1.1.0 21 | cryptography==41.0.3 22 | cycler==0.11.0 23 | dataclasses-json==0.5.14 24 | decorator==5.1.1 25 | docx2txt==0.8 26 | dominate==2.8.0 27 | executing==1.2.0 28 | filelock==3.12.2 29 | filetype==1.2.0 30 | Flask==2.2.3 31 | Flask-Bootstrap==3.3.7.1 32 | fonttools==4.42.0 33 | frozenlist==1.4.0 34 | fsspec==2023.6.0 35 | gevent==23.7.0 36 | greenlet==2.0.2 37 | huggingface-hub==0.16.4 38 | idna==3.4 39 | importlib-metadata==6.8.0 40 | importlib-resources==6.0.1 41 | ipython==8.12.2 42 | ipywidgets==8.1.0 43 | isodate==0.6.1 44 | itsdangerous==2.1.2 45 | jedi==0.19.0 46 | Jinja2==3.1.2 47 | joblib==1.3.1 48 | jupyterlab-widgets==3.0.8 49 | kiwisolver==1.4.4 50 | langchain==0.0.257 51 | langsmith==0.0.19 52 | lit==16.0.6 53 | llama-index==0.7.21 54 | llvmlite==0.40.1 55 | load-dotenv==0.1.0 56 | lxml==4.9.3 57 | MarkupSafe==2.1.3 58 | marshmallow==3.20.1 59 | matplotlib==3.7.2 60 | matplotlib-inline==0.1.6 61 | microsoft-bing-autosuggest==1.0.0 62 | microsoft-bing-customimagesearch==1.0.0 63 | microsoft-bing-customwebsearch==1.0.0 64 | microsoft-bing-entitysearch==1.0.0 65 | microsoft-bing-imagesearch==1.0.0 66 | microsoft-bing-newssearch==1.0.0 67 | microsoft-bing-spellcheck==1.0.0 68 | microsoft-bing-videosearch==1.0.0 69 | microsoft-bing-visualsearch==1.0.0 70 | microsoft-bing-websearch==1.0.0 71 | more-itertools==10.1.0 72 | mpmath==1.3.0 73 | msrest==0.7.1 74 | msrestazure==0.6.4 75 | multidict==6.0.4 76 | mypy-extensions==1.0.0 77 | nest-asyncio==1.5.7 78 | networkx==3.1 79 | nltk==3.8.1 80 | numba==0.57.1 81 | numexpr==2.8.5 82 | numpy==1.24.4 83 | nvidia-cublas-cu11==11.10.3.66 84 | nvidia-cuda-cupti-cu11==11.7.101 85 | nvidia-cuda-nvrtc-cu11==11.7.99 86 | nvidia-cuda-runtime-cu11==11.7.99 87 | nvidia-cudnn-cu11==8.5.0.96 88 | nvidia-cufft-cu11==10.9.0.58 89 | nvidia-curand-cu11==10.2.10.91 90 | nvidia-cusolver-cu11==11.4.0.1 91 | nvidia-cusparse-cu11==11.7.4.91 92 | nvidia-nccl-cu11==2.14.3 93 | nvidia-nvtx-cu11==11.7.91 94 | 
oauthlib==3.2.2 95 | openai==0.27.8 96 | openai-whisper @ git+https://github.com/openai/whisper.git@e8622f9afc4eba139bf796c210f5c01081000472 97 | openapi-schema-pydantic==1.2.4 98 | packaging==23.1 99 | pandas==2.0.3 100 | parso==0.8.3 101 | pexpect==4.8.0 102 | pickleshare==0.7.5 103 | Pillow==9.4.0 104 | plotly==5.15.0 105 | prompt-toolkit==3.0.39 106 | psutil==5.9.5 107 | ptyprocess==0.7.0 108 | pure-eval==0.2.2 109 | pycparser==2.21 110 | pydantic==1.10.12 111 | pydub==0.25.1 112 | Pygments==2.16.1 113 | PyJWT==2.8.0 114 | pyparsing==3.0.9 115 | PyPDF2==3.0.1 116 | python-dateutil==2.8.2 117 | python-dotenv==1.0.0 118 | python-magic==0.4.27 119 | python-pptx==0.6.21 120 | pytz==2023.3 121 | PyYAML==6.0.1 122 | regex==2023.6.3 123 | requests==2.31.0 124 | requests-oauthlib==1.3.1 125 | scikit-learn==1.3.0 126 | scipy==1.10.1 127 | sentencepiece==0.1.97 128 | six==1.16.0 129 | soupsieve==2.4.1 130 | SQLAlchemy==2.0.19 131 | stack-data==0.6.2 132 | sympy==1.12 133 | tabulate==0.9.0 134 | tenacity==8.2.2 135 | threadpoolctl==3.2.0 136 | tiktoken==0.3.3 137 | tokenizers==0.13.3 138 | torch==2.0.1 139 | tornado==6.3.2 140 | tqdm==4.65.0 141 | traitlets==5.9.0 142 | transformers==4.27.3 143 | triton==2.0.0 144 | typing-inspect==0.9.0 145 | typing_extensions==4.7.1 146 | tzdata==2023.3 147 | unstructured==0.9.1 148 | urllib3==1.26.16 149 | visitor==0.1.3 150 | wcwidth==0.2.6 151 | Werkzeug==2.3.6 152 | widgetsnbextension==4.0.8 153 | xgboost==1.7.6 154 | XlsxWriter==3.1.2 155 | yarl==1.9.2 156 | zipp==3.16.2 157 | zope.event==5.0 158 | zope.interface==6.0 159 | -------------------------------------------------------------------------------- /auto_data_gen_val/code_validate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from dotenv import load_dotenv 4 | load_dotenv() 5 | 
def compile_syntax_check(code_str):
    """Return True if ``code_str`` is accepted by the pyverilog parser.

    The source is written to a uniquely named scratch file under ``tmp/``
    (relative to the current working directory) and handed to ``parse``.
    Every scratch artifact created for this call is removed before the
    function returns, whether or not parsing succeeded.

    Args:
        code_str: Verilog source text to check.

    Returns:
        True when parsing and dumping the AST complete without raising,
        False when any exception is raised along the way.
    """
    file_id = str(uuid.uuid4())
    path = "tmp/file{}.v".format(file_id)
    asset_dir = "tmp/asset{}".format(file_id)
    preprocess_output = "tmp/preprocess.output.{}".format(file_id)

    # Create the per-call asset directory itself (this also creates "tmp").
    # Creating only its parent would leave the rmtree below pointing at a
    # directory that may never have existed.
    os.makedirs(asset_dir, exist_ok=True)

    try:
        with open(path, "w") as f:
            f.write(code_str)
        try:
            ast, directives = parse(
                [path],
                debug=False,
                outputdir=asset_dir,
                preprocess_output=preprocess_output,
            )
            # Dump the AST and directives into a throwaway buffer; an error
            # raised while doing so also counts as a failed check.
            output = StringIO()
            ast.show(buf=output)
            for lineno, directive in directives:
                output.write('Line %d : %s' % (lineno, directive))
            return True
        except Exception:
            # Deliberately broad: any parser failure means "does not pass".
            return False
    finally:
        # Single cleanup path, tolerant of artifacts that were never created,
        # so cleanup itself can no longer turn a result into an exception.
        for scratch_file in (path, preprocess_output):
            try:
                os.remove(scratch_file)
            except FileNotFoundError:
                pass
        shutil.rmtree(asset_dir, ignore_errors=True)


def reverse_codegen(description, code_str, model="gpt-4-0613", max_trials=10):
    """Ask an LLM to regenerate code from ``description`` until it parses.

    Args:
        description: Natural-language description of the module.
        code_str: Reference Verilog code; only its module header (obtained
            via ``extract_module_header``) is placed in the prompt.
        model: Model name passed to ``SimpleConverseChain``.
        max_trials: Maximum number of completions to request.

    Returns:
        ``(True, completion)`` for the first completion that passes
        ``compile_syntax_check``; otherwise ``(False, last_completion)``,
        where ``last_completion`` is ``""`` if no trial was run.
    """
    system_prompt = "You only complete chats with syntax correct Verilog code. End the Verilog module code completion with 'endmodule'. Do not include module, input and output definitions."
    question_prompt = "Implement the Verilog module based on the following description. Assume that signals are positive clock/clk edge triggered unless otherwise stated."
    problem_description = "\n\n {description} \n\n Module header:\n\n {module_header}\n"

    # Only the module header of the reference code is shown to the model.
    module_header = extract_module_header(code_str, code_str=True)
    user_prompt = question_prompt + problem_description.format(
        description=description, module_header=module_header
    )
    chain = SimpleConverseChain(
        system_prompt=system_prompt,
        model=model,
        temperature=0.7,
        max_tokens=512,
        top_p=0.95,
        have_memory=False,
        verbose=False,
    )

    # Defined up front so max_trials <= 0 returns cleanly instead of raising
    # UnboundLocalError at the final return.
    completion = ""
    for _ in range(max_trials):
        completion = chain.chat(user_prompt, system_prompt=system_prompt)
        # NOTE(review): the completion is checked on its own, while the system
        # prompt asks the model to omit the module header -- confirm whether
        # the header should be prepended before the syntax check.
        if compile_syntax_check(completion):
            return True, completion
    return False, completion
def process_jsonl_file(src_file, dst_file):
    """Copy a JSONL file, dropping each completion's text up to the first ';'.

    Records whose ``completion`` contains no ';' are copied unchanged.

    Args:
        src_file: Path of the JSONL file to read.
        dst_file: Path of the JSONL file to write (overwritten).
    """
    with jsonlines.open(src_file) as reader:
        with jsonlines.open(dst_file, mode='w') as writer:
            for obj in reader:
                _, separator, remainder = obj['completion'].partition(';')
                if separator:
                    obj['completion'] = remainder
                writer.write(obj)


def evaluate(gen_file, prob_file):
    """Run ``evaluate_functional_correctness`` for k in (1, 5, 10) and print it."""
    res = evaluate_functional_correctness(gen_file, problem_file=prob_file, k=[1,5,10])
    print("Eval Results:", res)


def results_profile(result_file, prob_file):
    """Plot how pass/fail outcomes relate to the reference code's token count.

    Reads per-sample results and the problem definitions, annotates every
    result with size statistics of its reference code, and writes two figures
    to the current working directory:

    * ``passed_code_token_count.png`` -- token-count histograms of passed and
      failed samples.
    * ``success_failure_rates.png`` -- per-bin share of passed and failed
      samples.

    Args:
        result_file: JSONL file whose records carry ``task_id``, ``passed``
            and ``prompt``.
        prob_file: JSONL file whose records carry ``task_id``, ``prompt`` and
            ``canonical_solution``.
    """
    tokenizer = tiktoken.encoding_for_model("gpt-4")

    passed_list = []
    failed_list = []
    with jsonlines.open(result_file) as reader:
        for obj in reader:
            if obj['passed']:
                passed_list.append(obj)
            else:
                failed_list.append(obj)

    problems = {}
    with jsonlines.open(prob_file) as reader:
        for obj in reader:
            problems[obj['task_id']] = obj

    # Passed and failed records get exactly the same annotations, so one loop
    # over both lists replaces the two duplicated ones.
    for obj in passed_list + failed_list:
        problem = problems[obj['task_id']]
        obj['module_header'] = problem['prompt']
        obj['canonical_solution'] = problem['canonical_solution']
        obj["code_lines"] = len(obj['module_header'].split('\n')) + len(obj['canonical_solution'].split('\n'))
        obj["code_token_count"] = len(tokenizer.encode(obj["module_header"] + "\n" + obj["canonical_solution"]))
        obj["prompt_token_count"] = len(tokenizer.encode(obj["prompt"]))

    data1 = [obj["code_token_count"] for obj in passed_list]
    data2 = [obj["code_token_count"] for obj in failed_list]
    data3 = data1 + data2

    if not data3:
        # Nothing to plot; taking min/max of an empty list would raise.
        print("No results found in {}; nothing to plot.".format(result_file))
        return

    # Plotting the distributions
    sns.set(style="whitegrid")
    plt.figure(figsize=(10, 6))
    if data1:
        sns.histplot(data1, kde=True, color="blue", label="Passed", bins=10)
    if data2:
        sns.histplot(data2, kde=True, color="red", label="Failed", bins=10)

    plt.title('Distribution of Code Token Count')
    plt.xlabel('Code Token Count')
    plt.ylabel('Frequency')
    plt.legend()  # the labels above are only visible once a legend is drawn
    plt.savefig("passed_code_token_count.png")
    plt.clf()

    # Common bin edges (10 edges -> 9 bins). data3 already contains every
    # value, so its extremes are the overall extremes even when one of the
    # two groups is empty.
    bins = np.linspace(min(data3), max(data3), 10)
    hist1, _ = np.histogram(data1, bins=bins)
    hist2, _ = np.histogram(data2, bins=bins)
    hist3, _ = np.histogram(data3, bins=bins)
    # Per-bin share of each outcome; the small constant avoids dividing by
    # zero for empty bins.
    normalized_hist1 = hist1 / (hist3 + 1e-6)
    normalized_hist2 = hist2 / (hist3 + 1e-6)

    plt.figure(figsize=(10, 6))
    plt.plot(bins[:-1], normalized_hist1, label='Normalized Dataset 1', marker='o', color="blue")
    plt.plot(bins[:-1], normalized_hist2, label='Normalized Dataset 2', marker='o', color="red")

    plt.title('Success / Failure Rates')
    plt.xlabel('Code Token Count')
    plt.ylabel('Success / Failure Rates')
    plt.legend()
    plt.savefig("success_failure_rates.png")
    plt.clf()
{}".format(obj["task_id"])) 40 | task_id = obj["task_id"] 41 | # print("description: {}".format(desc_dict[task_id])) 42 | passed, code = validate_global_summary_openai(obj["detail_description"], task_id, eval_file, max_trials=1) 43 | results.append({"task_id":task_id, "completion": code, "passed":passed}) 44 | print("passed: {}".format(passed)) 45 | with jsonlines.open(result_file, "w") as writer: 46 | writer.write_all(results) 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | if __name__ == "__main__": 55 | verilogeval0 = VerilogEval(model="llama2") 56 | example_cstr_json = "/home/user_name/DAC_2024/chatgpt4_auto_accel/fine_tune_dataset/auto_doc_part_dataset/verilogeval_datagen/example_code_strings.json" 57 | with open(example_cstr_json, "r") as f: 58 | example_code_strings = json.load(f) 59 | example_code_description_file = "/home/user_name/DAC_2024/verilogeval/verilog-eval/descriptions/VerilogDescription_Machine.jsonl" 60 | eval_file = "/home/user_name/DAC_2024/verilogeval/verilog-eval/data/VerilogEval_Machine.jsonl" 61 | global_summary_chain = verilogeval0.verilog_eval_sft_data 62 | code_gen_chain = verilogeval0.code_gen 63 | 64 | tested_model = "llama2-70b" 65 | generated_description_file = "gen_{}.jsonl".format(tested_model) 66 | reverse_code_gen_openai_file = "reverse_code_gen_{}-openai-gpt4.jsonl".format(tested_model) 67 | reverse_code_gen_file = "reverse_code_gen_{}.jsonl".format(tested_model) 68 | repeat_times = 10 69 | 70 | gen_description = False 71 | openai_code_gen = True 72 | 73 | 74 | reverse_code_gen = False 75 | 76 | if gen_description: 77 | #read the code_content from eval_file 78 | code_content = {} 79 | with jsonlines.open(eval_file) as reader: 80 | for obj in reader: 81 | code_content[obj["task_id"]] = obj["prompt"] + "\n"+obj["canonical_solution"] 82 | generated_description = [] 83 | for task_id in code_content: 84 | print("generating description for task_id: {}".format(task_id)) 85 | for i in range(repeat_times): 86 | print(i, end = " ") 87 | 
def folder_create(folder_name):
    """Create ``folder_name``, asking before replacing an existing directory.

    If the path does not exist it is created (including parents). If it
    exists, the user is prompted on stdin; answering exactly ``y`` deletes the
    directory tree and recreates it empty, any other answer leaves it as is.
    """
    if not os.path.exists(folder_name):
        # recursively create the directory
        os.makedirs(folder_name)
        return

    print("The directory {} already exists. Do you want to delete it and create a new one?".format(folder_name))
    print("Type 'y' for yes and 'n' for no.")
    answer = input()
    if answer == "y":
        shutil.rmtree(folder_name)
        os.makedirs(folder_name)
    else:
        print("Leave the directory as it is.")


class CodePreprocesser:
    """Copies raw source files and derives per-file CSV assets from them.

    The constructor records the working directories and makes sure each of
    them exists via ``folder_create`` (which may prompt on stdin when a
    directory is already present).
    """

    def __init__(self, code_dir, store_src_code_dir,
                 csv_code_dir, csv_comment_dir, csv_new_comment_dir,
                 csv_pure_gen_comment_dir, code_summary_dir, documented_code_dir,
                 code_suffix=(".v", ".sv", ".vh"), discard_original_comment=False):
        """Store the configuration and create the working directories.

        Args:
            code_dir: Directory holding the original source files.
            store_src_code_dir: Directory the raw sources are copied into.
            csv_code_dir: Output directory for the code CSV files.
            csv_comment_dir: Output directory for the comment CSV files.
            csv_new_comment_dir: Working directory (only created here).
            csv_pure_gen_comment_dir: Working directory (only created here).
            code_summary_dir: Working directory (only created here).
            documented_code_dir: Working directory (only created here).
            code_suffix: File suffixes treated as source code.
            discard_original_comment: Forwarded to
                ``convert_raw_src_code_to_csv`` by ``create_code_assets``.
        """
        self.code_dir = code_dir
        # Copy into a fresh list so instances never share (or mutate) the
        # default value, while the attribute stays a list as before.
        self.code_suffix = list(code_suffix)
        self.store_src_code_dir = store_src_code_dir
        self.csv_code_dir = csv_code_dir
        self.csv_comment_dir = csv_comment_dir
        self.csv_new_comment_dir = csv_new_comment_dir
        self.csv_pure_gen_comment_dir = csv_pure_gen_comment_dir
        self.code_summary_dir = code_summary_dir
        self.documented_code_dir = documented_code_dir
        self.discard_original_comment = discard_original_comment
        # Filled by raw_code_copy(); defined here so create_code_assets() is a
        # no-op rather than an AttributeError when called first.
        self.code_files = []
        # check if the directory exists
        for directory in (
            self.store_src_code_dir,
            self.csv_code_dir,
            self.csv_comment_dir,
            self.csv_new_comment_dir,
            self.csv_pure_gen_comment_dir,
            self.code_summary_dir,
            self.documented_code_dir,
        ):
            folder_create(directory)

    def raw_code_copy(self, src_dir, dst_dir, skip_preprocess=False):
        """Record the source files found in ``src_dir`` and optionally copy them.

        Every directory entry whose name ends with one of ``self.code_suffix``
        is recorded in ``self.code_files`` (replacing any earlier list). The
        file is also copied to ``dst_dir`` unless ``skip_preprocess`` is true.
        """
        suffixes = tuple(self.code_suffix)
        self.code_files = []
        for file in os.listdir(src_dir):
            if not file.endswith(suffixes):
                continue
            if not skip_preprocess:
                shutil.copy(os.path.join(src_dir, file), dst_dir)
            self.code_files.append(file)

    def create_code_assets(self):
        """Split each recorded source file into a code CSV and a comment CSV."""
        for code_file in tqdm(self.code_files, total=len(self.code_files), desc="Creating code assets"):
            src_code_file = os.path.join(self.store_src_code_dir, code_file)
            # NOTE(review): the CSV name is everything before the FIRST dot, so
            # "a.b.v" and "a.v" map to the same "a.csv" -- confirm whether
            # other pipeline stages rely on this naming before changing it.
            csv_name = code_file.split(".")[0] + ".csv"
            csv_code_file = os.path.join(self.csv_code_dir, csv_name)
            csv_comment_file = os.path.join(self.csv_comment_dir, csv_name)
            convert_raw_src_code_to_csv(src_code_file, csv_code_file, csv_comment_file, discard_original_comment=self.discard_original_comment)

    def pre_process_routines(self, dst_dir, discard_original_comment=True, rtl=True):
        """Run ``preprocess`` on every entry of ``dst_dir``."""
        for file in os.listdir(dst_dir):
            preprocess(os.path.join(dst_dir, file), discard_original_comment=discard_original_comment, rtl=rtl)
(\n\tinput clk,\n\tinput areset,\n\tinput bump_left,\n\tinput bump_right,\n\toutput walk_left,\n\toutput walk_right\n);\n\tparameter WL=0, WR=1;\n\treg state;\n\treg next;\n \n always_comb begin\n\t\tcase (state)\n\t\t\tWL: next = bump_left ? WR : WL;\n\t\t\tWR: next = bump_right ? WL: WR;\n\t\tendcase\n end\n \n always @(posedge clk, posedge areset) begin\n\t\tif (areset) state <= WL;\n else state <= next;\n\tend\n\t\t\n\tassign walk_left = (state==WL);\n\tassign walk_right = (state==WR);\n\n\t\nendmodule\n", 3 | "rotate100": "module top_module(\n\tinput clk,\n\tinput load,\n\tinput [1:0] ena,\n\tinput [99:0] data,\n\toutput reg [99:0] q);\n\t\n\t\n\talways @(posedge clk) begin\n\t\tif (load)\n\t\t\tq <= data;\n\t\telse if (ena == 2'h1)\n\t\t\tq <= {q[0], q[99:1]};\n\t\telse if (ena == 2'h2)\n\t\t\tq <= {q[98:0], q[99]};\n\tend\nendmodule\n", 4 | "vector2": "module top_module (\n\tinput [31:0] in,\n\toutput [31:0] out\n);\n\n\tassign out = {in[7:0], in[15:8], in[23:16], in[31:24]};\t\n\t\nendmodule\n", 5 | "gatesv100": "module top_module (\n\tinput [99:0] in,\n\toutput [98:0] out_both,\n\toutput [99:1] out_any,\n\toutput [99:0] out_different\n);\n\n\tassign out_both = in & in[99:1];\n\tassign out_any = in | in[99:1];\n\tassign out_different = in^{in[0], in[99:1]};\n\t\nendmodule\n", 6 | "history_shift": "module top_module\n(\n input clk,\n input areset,\n input predict_valid,\n input predict_taken,\n output logic [31:0] predict_history,\n \n input train_mispredicted,\n input train_taken,\n input [31:0] train_history\n);\n always@(posedge clk, posedge areset)\n\t\tif (areset) begin\n\t\t\tpredict_history = 0;\n end\telse begin\n\t\t\tif (train_mispredicted)\n\t\t\t\tpredict_history <= {train_history, train_taken};\n\t\t\telse if (predict_valid)\n\t\t\t\tpredict_history <= {predict_history, predict_taken};\n\t\tend\nendmodule\n", 7 | "ece241_2013_q2": "module top_module (\n\tinput a,\n\tinput b,\n\tinput c,\n\tinput d,\n\toutput out_sop,\n\toutput 
out_pos\n);\n\t\n\twire pos0, pos1;\n\tassign out_sop = c&d | ~a&~b&c;\n\tassign pos0 = c & (~b|d)&(~a|b);\n\tassign pos1 = c & (~b|d)&(~a|d);\n\t\n\tassign out_pos = (pos0 == pos1) ? pos0 : 1'bx;\nendmodule\n", 8 | "dff16e": "module top_module(\n\tinput clk,\n\tinput resetn,\n\tinput [1:0] byteena,\n\tinput [15:0] d,\n\toutput reg [15:0] q);\n\t\n\talways @(posedge clk) begin\n\t\tif (!resetn)\n\t\t\tq <= 0;\n\t\telse begin\n\t\t\tif (byteena[0])\n\t\t\t\tq[7:0] <= d[7:0];\n\t\t\tif (byteena[1])\n\t\t\t\tq[15:8] <= d[15:8];\n\t\tend\n\tend\n\t\nendmodule\n", 9 | "fsm2": "module top_module (\n\tinput clk,\n\tinput j,\n\tinput k,\n\tinput areset,\n\toutput out\n);\n\tparameter A=0, B=1;\n\treg state;\n\treg next;\n \n always_comb begin\n\t\tcase (state)\n\t\t\tA: next = j ? B : A;\n\t\t\tB: next = k ? A : B;\n\t\tendcase\n end\n \n always @(posedge clk, posedge areset) begin\n\t\tif (areset) state <= A;\n else state <= next;\n\tend\n\t\t\n\tassign out = (state==B);\n\n\t\nendmodule\n", 10 | "vector100r": "module top_module (\n\tinput [99:0] in,\n\toutput reg [99:0] out\n);\n\t\n\talways_comb \n\t\tfor (int i=0;i<$bits(out);i++)\n\t\t\tout[i] = in[$bits(out)-i-1];\n\t\nendmodule\n", 11 | "gatesv": "module top_module (\n\tinput [3:0] in,\n\toutput [2:0] out_both,\n\toutput [3:1] out_any,\n\toutput [3:0] out_different\n);\n\n\tassign out_both = in[2:0] & in[3:1];\n\tassign out_any = in[2:0] | in[3:1];\n\tassign out_different = in^{in[0], in[3:1]};\n\t\nendmodule\n", 12 | "review2015_fsmseq": "module top_module(\n\tinput clk,\n\tinput reset,\n\tinput data,\n\toutput start_shifting);\n\n\tparameter S=0, S1=1, S11=2, S110=3, Done=4;\n\t\n\treg [2:0] state, next;\n\t\n\talways_comb begin\n\t\tcase (state)\n\t\t\tS: next = data ? S1: S;\n\t\t\tS1: next = data ? S11: S;\n\t\t\tS11: next = data ? S11 : S110;\n\t\t\tS110: next = data ? 
def estimate_pass_at_k(
    num_samples: Union[int, List[int], np.ndarray],
    num_correct: Union[List[int], np.ndarray],
    k: int
) -> np.ndarray:
    """
    Estimates pass@k of each problem and returns them in an array.

    ``num_samples`` is either one sample count shared by every problem or a
    sequence with one count per problem; ``num_correct`` holds the number of
    passing samples per problem.
    """

    def estimator(n: int, c: int, k: int) -> float:
        """
        Calculates 1 - comb(n - c, k) / comb(n, k).
        """
        if n - c < k:
            # Fewer than k failing samples: every size-k draw contains a pass.
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    if isinstance(num_samples, int):
        num_samples_it = itertools.repeat(num_samples, len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        num_samples_it = iter(num_samples)

    return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])


def contain_passing_completion(
    problem: Dict,
    completions: List[str],
    n_workers: int = 4,
    timeout: float = 30.0,
    unit_test_length: Optional[int] = None,
    clean_up: bool = True,
) -> Tuple[bool, str]:
    """
    Checks ``completions`` against ``problem`` in parallel and returns
    ``(True, completion)`` for the first one reported as passing, or
    ``(False, "")`` when none passes.

    When ``clean_up`` is true, ``clean_up_simulation()`` is called before
    returning on every path, including the early return for a passing
    completion.
    """
    try:
        with ProcessPoolExecutor(max_workers=n_workers) as executor:
            futures = [
                executor.submit(check_correctness, problem, completion, timeout, idx, unit_test_length)
                for idx, completion in enumerate(completions)
            ]

            for future in as_completed(futures):
                result = future.result()
                if result["passed"]:
                    # completion_id is the index that was submitted above.
                    return True, completions[result["completion_id"]]

        return False, ""
    finally:
        # Runs after the executor has shut down, for both outcomes.
        if clean_up:
            clean_up_simulation()


def evaluate_functional_correctness(
    sample_file: str,
    problem_file: str,
    k: List[int] = [1, 10, 100],
    n_workers: int = 4,
    timeout: float = 30.0,
    unit_test: bool = False,
    clean_up: bool = True,
):
    """
    Evaluates the functional correctness of generated samples, and writes
    results to f"{sample_file}_results.jsonl"

    Returns a dict mapping "pass@{k}" to the mean estimate over all problems,
    for every requested k that does not exceed the number of samples of any
    problem.
    """

    problems = read_problems(problem_file)

    # Check the generated samples against test suites.
    with ProcessPoolExecutor(max_workers=n_workers) as executor:

        futures = []
        completion_id = Counter()
        n_samples = 0
        results = defaultdict(list)

        print("Reading samples...")
        for sample in tqdm.tqdm(stream_jsonl(sample_file)):
            task_id = sample["task_id"]
            completion = sample["completion"]
            if unit_test:
                args = (problems[task_id], completion, timeout, completion_id[task_id], 100)
            else:
                args = (problems[task_id], completion, timeout, completion_id[task_id])
            future = executor.submit(check_correctness, *args)
            futures.append(future)
            completion_id[task_id] += 1
            n_samples += 1

        assert len(completion_id) == len(problems), "Some problems are not attempted."

        print("Running test suites...")
        for future in tqdm.tqdm(as_completed(futures), total=len(futures)):
            result = future.result()
            results[result["task_id"]].append((result["completion_id"], result))

    if clean_up:
        clean_up_simulation()

    # Calculate pass@k.
    total, correct = [], []
    for task_results in results.values():
        # Order by completion id only, so the result dicts are never compared.
        task_results.sort(key=lambda entry: entry[0])
        passed = [entry[1]["passed"] for entry in task_results]
        total.append(len(passed))
        correct.append(sum(passed))
    total = np.array(total)
    correct = np.array(correct)

    # A separate loop name keeps the parameter ``k`` from being shadowed.
    pass_at_k = {f"pass@{k_value}": estimate_pass_at_k(total, correct, k_value).mean()
                 for k_value in k if (total >= k_value).all()}

    # Finally, save the results in one file:
    def combine_results():
        # Samples are re-read in file order; each task's results were sorted
        # by completion id above, so popping from the front pairs them up.
        for sample in stream_jsonl(sample_file):
            task_id = sample["task_id"]
            result = results[task_id].pop(0)
            sample["result"] = result[1]["result"]
            sample["passed"] = result[1]["passed"]
            yield sample

    out_file = sample_file + "_results.jsonl"
    print(f"Writing results to {out_file}...")
    write_jsonl(out_file, tqdm.tqdm(combine_results(), total=n_samples))

    return pass_at_k
25 | """ 26 | if n - c < k: 27 | return 1.0 28 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) 29 | 30 | if isinstance(num_samples, int): 31 | num_samples_it = itertools.repeat(num_samples, len(num_correct)) 32 | else: 33 | assert len(num_samples) == len(num_correct) 34 | num_samples_it = iter(num_samples) 35 | 36 | return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]) 37 | 38 | 39 | def contain_passing_completion( 40 | problem: Dict, 41 | completions: List[str], 42 | n_workers: int = 4, 43 | timeout: float = 30.0, 44 | unit_test_length: Optional[int] = None, 45 | clean_up: bool = True, 46 | ) -> Tuple[bool, str]: 47 | 48 | with ProcessPoolExecutor(max_workers=n_workers) as executor: 49 | 50 | futures = [] 51 | 52 | for idx, completion in enumerate(completions): 53 | args = (problem, completion, timeout, idx, unit_test_length) 54 | future = executor.submit(check_correctness, *args) 55 | futures.append(future) 56 | 57 | for future in as_completed(futures): 58 | result = future.result() 59 | if result["passed"]: 60 | return True, completions[result["completion_id"]] 61 | 62 | if clean_up: 63 | clean_up_simulation() 64 | 65 | return False, "" 66 | 67 | def evaluate_functional_correctness( 68 | sample_file: str, 69 | problem_file: str, 70 | k: List[int] = [1, 10, 100], 71 | n_workers: int = 4, 72 | timeout: float = 30.0, 73 | unit_test: bool = False, 74 | clean_up: bool = True, 75 | ): 76 | """ 77 | Evaluates the functional correctness of generated samples, and writes 78 | results to f"{sample_file}_results.jsonl.gz" 79 | """ 80 | 81 | problems = read_problems(problem_file) 82 | 83 | # Check the generated samples against test suites. 
84 | with ProcessPoolExecutor(max_workers=n_workers) as executor: 85 | 86 | futures = [] 87 | completion_id = Counter() 88 | n_samples = 0 89 | results = defaultdict(list) 90 | 91 | print("Reading samples...") 92 | for sample in tqdm.tqdm(stream_jsonl(sample_file)): 93 | task_id = sample["task_id"] 94 | completion = sample["completion"] 95 | if unit_test: 96 | args = (problems[task_id], completion, timeout, completion_id[task_id], 100) 97 | else: 98 | args = (problems[task_id], completion, timeout, completion_id[task_id]) 99 | future = executor.submit(check_correctness, *args) 100 | futures.append(future) 101 | completion_id[task_id] += 1 102 | n_samples += 1 103 | 104 | assert len(completion_id) == len(problems), "Some problems are not attempted." 105 | 106 | print("Running test suites...") 107 | for future in tqdm.tqdm(as_completed(futures), total=len(futures)): 108 | result = future.result() 109 | results[result["task_id"]].append((result["completion_id"], result)) 110 | 111 | if clean_up: 112 | clean_up_simulation() 113 | 114 | # Calculate pass@k. 
import os
import sys
import shutil
import argparse
from dotenv import load_dotenv
load_dotenv()
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR")))
from embedding_lookup_utils import CodeDataset
from langchain.callbacks import get_openai_callback #with get_openai_callback() as cb:

if __name__ == "__main__":
    # Generates one global summary JSON per code part, either "detailed" or
    # "high level" depending on --detailed.
    parser = argparse.ArgumentParser()
    #positional arguments
    #start_id, total_code_parts
    parser.add_argument("start_id", help="start id of the code parts", type=int)
    parser.add_argument("total_code_parts", help="total number of code parts", type=int)

    #optional arguments
    parser.add_argument("--documented_code_dir", help="documented code directory", type=str, default="/home/user_name/DAC_2024/ckpts/")
    parser.add_argument("--code_metadata_dir", help="code metadata directory", type=str, default="/home/user_name/DAC_2024/ckpt3_user_name_valid_content_code_metadata/")
    parser.add_argument("--model", help="model", type=str, default="gpt-3.5-turbo-1106")
    parser.add_argument("--detailed", action="store_true", help="detailed summary")
    args = parser.parse_args()
    code_part_start_id = args.start_id
    total_code_parts = args.total_code_parts
    documented_code_dir = args.documented_code_dir
    code_metadata_dir = args.code_metadata_dir
    model = args.model
    detailed = args.detailed

    dataset_metadata_dir = os.path.join(documented_code_dir, "dataset_metadata")
    os.makedirs(dataset_metadata_dir, exist_ok=True)

    # Everything that differs between the two summary flavours lives here, so
    # the per-part loop below has a single code path.
    data_home = os.environ.get('DATA4AIGCHIP_HOME')
    if detailed:
        example_cstr_json = f"{data_home}/auto_data_gen_val/preprocess_data/example_code_strings_detailed_instructions.json"
        # The detailed flavour does not pass `detailed` at all and relies on
        # the default of CodeDataset.init_vectorstore.
        init_extra_kwargs = {}
        supplement_kwargs = {"force_refresh_global_summary_detailed": True,
                             "global_summary_example_desc_key": "detail_description"}
        summary_file_name = "global_detailed_summary.json"
    else:
        example_cstr_json = f"{data_home}/auto_data_gen_val/preprocess_data/example_code_strings_simple_instructions.json"
        init_extra_kwargs = {"detailed": False}
        supplement_kwargs = {"force_refresh_global_summary_high_level": True,
                             "global_summary_example_desc_key": "simple_description"}
        summary_file_name = "global_high_level_summary.json"
    example_description_file = f"{data_home}/verilog_eval/descriptions/VerilogDescription_Machine.jsonl"

    with get_openai_callback() as cb:
        for code_part_id in range(code_part_start_id, total_code_parts):
            part_name = "part{}".format(code_part_id)
            part_metadata_dir = "{}/{}".format(dataset_metadata_dir, part_name)
            os.makedirs(part_metadata_dir, exist_ok=True)

            src_code_dir = os.path.join(documented_code_dir, part_name)
            codedb = CodeDataset(
                                    src_code_dir,
                                    bookkeeping_dir="{}/bookkeeping/".format(part_metadata_dir),
                                    vectorembedding_dir="{}/vectorembedding/".format(part_metadata_dir),
                                    force_refresh=False,
                                    cb=cb
                                )

            csv_src_dir = os.path.join(code_metadata_dir, part_name, "assets", "verilog", "code_and_comment_src", "csv_src")
            csv_code_dir = os.path.join(csv_src_dir, "csv_code_src")
            csv_comment_dir = os.path.join(csv_src_dir, "csv_new_comment_src")
            codedb.load_and_split_code(skip_small_doc=True, split_by_line=True, based_on_code_lines_only=True,
                                       csv_code_dir=csv_code_dir,
                                       csv_comment_dir=csv_comment_dir
                                       )

            codedb.init_vectorstore(global_summary_chain_from_verilog_eval=False,
                                    global_summary_model=model,
                                    global_summary_example_cstr_json=example_cstr_json,
                                    global_summary_example_code_description_file=example_description_file,
                                    **init_extra_kwargs
                                    )
            codedb.supplement_summary(block_summary_placeholding=True, **supplement_kwargs)
            codedb.save_global_summary(
                "{}/{}".format(part_metadata_dir, summary_file_name)
            )
6 | 7 | This evaluation dataset consists of 156 problems from the Verilog 8 | instructional website [HDLBits](https://hdlbits.01xz.net/wiki/Problem_sets). 9 | We provide two sets of problem descriptions: machine generated and manually 10 | converted to text-only format. 11 | 12 | ## Installation 13 | 14 | We closely follow guidance from [HumanEval](https://github.com/openai/human-eval/tree/master). 15 | 16 | Make sure to use python 3.7 or later: 17 | ``` 18 | $ conda create -n codex python=3.7 19 | $ conda activate codex 20 | ``` 21 | 22 | Install [ICARUS Verilog](https://github.com/steveicarus/iverilog): 23 | ``` 24 | $ git clone https://github.com/steveicarus/iverilog.git && cd iverilog \ 25 | && git checkout 01441687235135d1c12eeef920f75d97995da333 \ 26 | && sh ./autoconf.sh && ./configure && make -j4\ 27 | && make install 28 | ``` 29 | 30 | It is recommended to use the provided [Dockerfile](https://github.com/NVlabs/verilog-eval/blob/main/Dockerfile), 31 | which already has the ICARUS Verilog simulator pre-installed. When using the Docker container, 32 | you still need to complete the following step. 33 | 34 | Check out and install this repository: 35 | ``` 36 | $ git clone https://github.com/NVlabs/verilog-eval 37 | $ pip install -e verilog-eval 38 | ``` 39 | 40 | ## Usage 41 | 42 | **This program makes system calls to *iverilog* and *vvp* to simulate 43 | untrusted model-generated code. Users are strongly 44 | encouraged not to do so outside of a robust security sandbox. The [execution 45 | call](https://github.com/NVlabs/verilog-eval/blob/main/verilog_eval/execution.py#L79-L112) 46 | in `execution.py` is deliberately commented out to ensure users read this 47 | disclaimer before running code in a potentially unsafe manner. 
See the comment in 48 | `execution.py` for more information and instructions.** 49 | 50 | After following the above instructions to enable execution, generate samples 51 | and save them in the following JSON Lines (jsonl) format, where each sample is 52 | formatted into a single line like so: 53 | ``` 54 | {"task_id": "Corresponding VerilogEval task ID", "completion": "Completion only without the prompt"} 55 | ``` 56 | We provide examples under `data/example` to illustrate the format and help with debugging. 57 | 58 | To evaluate the samples, run 59 | ``` 60 | $ evaluate_functional_correctness samples.jsonl --problem_file data/VerilogEval_Human.jsonl 61 | Reading samples... 62 | 3120it [00:00, 16077.44it/s] 63 | Running test suites... 64 | 100%|...| 3120/3120 [00:32<00:00, 97.47it/s] 65 | Killing all hanging simulation process. 66 | Writing results to samples.jsonl_results.jsonl... 67 | 100%|...| 3120/3120 [00:00<00:00, 30608.13it/s] 68 | {'pass@1': ..., 'pass@5': ..., 'pass@10': ...} 69 | ``` 70 | 71 | The user must specify `--problem_file` input argument. We provide two sets of problem 72 | evaluations `data/VerilogEval_Machine.jsonl` and `data/VerilogEval_Human.jsonl`. 73 | We also provide problem description files used to sample Verilog code completions 74 | in `descriptions` directory. 75 | 76 | This script provides more fine-grained information in a new file ending in 77 | `_results.jsonl`. Each row now contains whether the completion 78 | `passed` along with the execution `result` which is one of "passed", "timed 79 | out", or "failed". 80 | 81 | As a quick sanity-check, the example samples should yield 0.5 pass@1. The results can be 82 | verified against the provided output 83 | in `data/example/ExampleSolution.jsonl_reference.jsonl`. 84 | ``` 85 | $ evaluate_functional_correctness data/example/ExampleSolution.jsonl --problem_file=data/example/ExampleEval.jsonl 86 | Reading samples... 87 | 6it [00:00, 221.60it/s] 88 | Running example suites... 
89 | 100%|...| 6/6 [00:00<00:00, 142.09it/s] 90 | Killing all hanging simulation process. 91 | Writing results to data/example/ExampleSolution.jsonl_results.jsonl... 92 | 100%|...| 6/6 [00:00<00:00, 19941.22it/s] 93 | {'pass@1': 0.5} 94 | ``` 95 | 96 | Because there is no unbiased way of estimating pass@k when there are fewer 97 | samples than k, the script does not evaluate pass@k for these cases. To 98 | evaluate with other k values, pass `--k=<comma-separated values>`. For 99 | other options, see 100 | ``` 101 | $ evaluate_functional_correctness --help 102 | ``` 103 | However, we recommend that you use the default values for the rest. 104 | 105 | ## Issues 106 | Problem descriptions in `descriptions/VerilogDescription_Machine.jsonl` are machine 107 | generated and we cannot guarantee the absence of ambiguity and errors. We do not plan 108 | to maintain description correctness. 109 | 110 | Functional correctness is evaluated by comparing simulation outputs using 111 | [ICARUS Verilog](https://github.com/steveicarus/iverilog). The evaluation of Verilog syntax is limited by the simulator, which might not include all features of the Verilog HDL 112 | IEEE-1364 standard. 
import os
import sys
from dotenv import load_dotenv
load_dotenv(os.path.join(os.path.dirname(__file__), ".env"))
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR")))
import openai
import requests
import json
import copy
import time
import datetime
import shutil
from embedding_lookup_utils import *
from utils import *
from completion_handler import *
from code_preprocesser import *
from code_repo_documentor import *

#documenting the first version with module instantiation
#one_shot 5 lines
#pure llama 2 70B
#around 12k samples

if __name__ == "__main__":
    #NOTE: run utils.py first to partition the code first
    import argparse

    parser = argparse.ArgumentParser(description='Line-by-line Code Documentor')
    parser.add_argument('--total_parts', type=int, default=10, help='total parts')
    parser.add_argument('--output_dir', type=str, default="./documented_code", help='output directory')
    parser.add_argument('--src_code_dir', type=str, default="/home/user_name/DAC_2024/ckpt3_user_name_valid_content_renamed/", help='code directory')
    parser.add_argument('--code_metadata_dir', type=str, default="/home/user_name/DAC_2024/ckpt3_user_name_valid_content_code_metadata/", help='code metadata file')
    parser.add_argument('--code_lib_path', type=str, default="/home/user_name/DAC_2024/ckpt3_user_name_valid_content_shared_lib/", help='code library path')
    parser.add_argument('--code_vec_store', type=str, default="../code_vec_store/test_10_30/", help='code vector store')
    parser.add_argument('--skip_preprocess', action='store_true', help='skip preprocessing')
    parser.add_argument('--skip_supplement_summary', action='store_true', help='skip supplementing summary')
    parser.add_argument('--discard_original_comment', action='store_true', help='discard original comment')

    args = parser.parse_args()
    total_parts = args.total_parts
    output_dir = args.output_dir
    src_code_dir = args.src_code_dir
    code_metadata_dir = args.code_metadata_dir
    code_lib_path = args.code_lib_path
    code_vec_store = args.code_vec_store
    skip_preprocess = args.skip_preprocess
    skip_supplement_summary = args.skip_supplement_summary
    #This switch will discard 1. the comments in the raw code copy and 2. the comments will be converted to the raw code csv
    discard_original_comment = args.discard_original_comment

    # The environment does not change between parts, so read it once.
    # An unsupported TARGET_LANG used to surface as a NameError on
    # `code_suffix` deep inside the loop; fail early with a clear message.
    suffixes_by_language = {
        "verilog": [".v", ".sv", ".vh"],
        "xilinx_hls": [".c", ".cpp", ".h", ".hpp"],
    }
    language = os.environ.get("TARGET_LANG")
    if language not in suffixes_by_language:
        raise ValueError(
            "TARGET_LANG must be one of {}, got {!r}".format(sorted(suffixes_by_language), language)
        )
    code_suffix = suffixes_by_language[language]
    store_src_code_dir = os.environ.get("STORE_SRC_CODE_DIR")
    csv_code_dir = os.environ.get("CSV_CODE_DIR")
    csv_comment_dir = os.environ.get("CSV_COMMENT_DIR")
    csv_new_comment_dir = os.environ.get("CSV_NEW_COMMENT_DIR")
    csv_pure_gen_comment_dir = os.environ.get("CSV_PURE_GEN_COMMENT_DIR")
    code_summary_dir = os.environ.get("CODE_SUMMARY_DIR")
    documented_code_dir = os.environ.get("DOCUMENTED_CODE_DIR")

    for code_part in range(total_parts):
        part_name = "part{}".format(code_part)
        code_dir = os.path.join(src_code_dir, part_name)
        part_metadata_dir = os.path.join(code_metadata_dir, part_name)
        code_metadata_file = os.path.join(part_metadata_dir, "codes.json")

        with get_openai_callback() as cb:
            documentor = CodeRepoDocumentor(code_dir, store_src_code_dir,
                                            csv_code_dir, csv_comment_dir, csv_new_comment_dir,
                                            csv_pure_gen_comment_dir, code_summary_dir, documented_code_dir,
                                            code_metadata_file=code_metadata_file,
                                            code_suffix=code_suffix, language=language,
                                            discard_original_comment=discard_original_comment,
                                            code_lib_path=code_lib_path, code_vec_store=code_vec_store,
                                            skip_supplement_summary=skip_supplement_summary,
                                            cb = cb)
            documentor.create_embedding()
            documentor.code_preprocess(skip_preprocess=skip_preprocess)
            documentor.document_repo()

            output_dir_part = os.path.join(output_dir, part_name)
            #check if output dir exists
            if os.path.exists(output_dir_part):
                #ask for confirmation
                print("Output directory already exists. Do you want to overwrite? (y/n)")
                choice = input().lower()
                if choice != "y":
                    # NOTE(review): the message says "Exiting" but only this
                    # part is skipped; the loop moves on to the next part.
                    print("Exiting...")
                    continue
                shutil.rmtree(output_dir_part)
            os.makedirs(output_dir_part)
            documentor.package_documented_code(output_dir_part)
            #copy assets to output dir
            shutil.copytree(os.environ.get("ASSET_DIR"), os.path.join(part_metadata_dir, "assets"))
            #copy vector store to output dir
            shutil.copytree(code_vec_store, os.path.join(part_metadata_dir, "code_vec_store"))
from typing import Optional, Callable, Dict
import ast
import contextlib
import faulthandler
import io
import os
import multiprocessing
import platform
import shlex
import signal
import tempfile

import subprocess
import re
from threading import Timer


def clean_up_simulation() -> None:
    """
    Kill all simulation processes.

    Runs ``pkill`` for ``iverilog`` and ``vvp``. This is best-effort: the
    exit status of ``pkill`` is ignored and a missing ``pkill`` binary is
    tolerated.
    """
    print("Killing all hanging simulation process.")
    for process_name in ("iverilog", "vvp"):
        try:
            # Fixed argument list: no shell is needed to run pkill.
            subprocess.run(["pkill", process_name], check=False)
        except FileNotFoundError:
            # pkill itself is not installed, so there is nothing to run.
            break


def set_unit_test_length(verilog_test: str, unit_test_length: int) -> str:
    """
    Return ``verilog_test`` with every ``repeat(<digits>)`` rewritten to
    ``repeat(unit_test_length)``.
    """
    replacement = "repeat({})".format(unit_test_length)
    # A callable replacement keeps re.sub from interpreting the text.
    return re.sub(r"repeat\([0-9]*\)", lambda _match: replacement, verilog_test)


def check_correctness(problem: Dict, completion: str, timeout: float,
                      completion_id: Optional[int] = None, unit_test_length: Optional[int] = None) -> Dict:
    """
    Evaluates the functional correctness of a completion by running the test
    suite provided in the problem.

    The testbench (``problem["test"]``), the prompt and the completion are
    concatenated into ``<task_id>.sv`` inside a temporary directory, compiled
    with ``iverilog`` and simulated with ``vvp`` in a separate process.

    :param problem: dict providing the keys "task_id", "test" and "prompt".
    :param completion: text appended after the prompt.
    :param timeout: seconds allowed for compiling and simulating.
    :param completion_id: an optional completion ID so we can match
        the results later even if execution finishes asynchronously.
    :param unit_test_length: when truthy, every ``repeat(<digits>)`` in the
        assembled source is rewritten to use this count.
    :return: dict with "task_id", "passed" (bool), "result" ("passed",
        "timed out" or a "failed: ..." message) and "completion_id".
    """

    def unsafe_execute():

        with create_tempdir():

            # These system calls are needed when cleaning up tempdir.
            import os
            import shutil
            rmtree = shutil.rmtree
            rmdir = os.rmdir
            chdir = os.chdir
            # reliability_guard() sets os.kill and os.killpg to None, which
            # makes Popen.kill() unusable afterwards. Keep the real killpg so
            # a timed-out simulation can still be terminated.
            killpg = os.killpg

            def kill_simulation(proc):
                # The shell runs in its own session (see Popen below), so its
                # pid is also the process-group id shared with iverilog/vvp.
                try:
                    killpg(proc.pid, signal.SIGKILL)
                except ProcessLookupError:
                    pass

            # Disable functionalities that can make destructive changes to the test.
            # WARNING
            # subprocess.Popen is used to run shell command with calls to iveriog and vvp.
            # Please refer to reliability_guard function for details
            reliability_guard()

            try:
                # Output testbench with solution to Verilog file in temp directory.
                verilog_test = problem["test"] + "\n" + \
                               problem["prompt"] + "\n" + \
                               completion

                if unit_test_length:
                    verilog_test = set_unit_test_length(verilog_test, unit_test_length)

                with open("{}.sv".format(problem["task_id"]), 'w') as f:
                    f.write(verilog_test)

                try:
                    # WARNING PLEASE READ
                    # The following code use subprocess.Popen to run shell command with calls to iveriog and vvp.
                    # Please check that iverilog and vvp are installed and included in your current run path.
                    # For installation of Icarus Verilog, please refer to: https://github.com/steveicarus/iverilog
                    # This program exists to execute untrusted model-generated code. Although
                    # it is highly unlikely that model-generated code will do something overtly
                    # malicious in response to this test suite, model-generated code may act
                    # destructively due to a lack of model capability or alignment.
                    # Users are strongly encouraged to sandbox this evaluation suite so that it
                    # does not perform destructive actions on their host or network. For more
                    # information on how OpenAI sandboxes its code, see the original OpenAI paper.
                    # Once you have read this disclaimer and taken appropriate precautions,
                    # proceed at your own risk:
                    # BEGIN CODE BLOCK
                    with swallow_io():
                        with time_limit(timeout):
                            # Quote the file name so a task id containing shell
                            # metacharacters cannot alter the command.
                            source_file = shlex.quote("{}.sv".format(problem["task_id"]))
                            cmd = ("iverilog -Wall -Winfloop -Wno-timescale -g2012 "
                                   "-s tb -o test.vvp {}; vvp -n test.vvp".format(source_file))

                            # Watchdog for the child process, in addition to the
                            # SIGALRM raised by time_limit(). See
                            # https://stackoverflow.com/questions/1191374/using-module-subprocess-with-timeout
                            p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                                                 stderr=subprocess.PIPE, start_new_session=True)
                            timer = Timer(timeout, kill_simulation, args=(p,))
                            try:
                                timer.start()
                                out, err = p.communicate()
                            finally:
                                timer.cancel()
                                # Reached with a live child when time_limit()
                                # interrupted communicate(): do not leave the
                                # simulation running.
                                if p.poll() is None:
                                    kill_simulation(p)

                            out, err = out.decode("utf-8"), err.decode("utf-8")
                            match = re.search(r'Mismatches: ([0-9]*) in ([0-9]*) samples', out)
                            if "syntax error" in err:
                                result.append("failed: syntax error.")
                            elif len(err) > 0:
                                result.append("failed: compile error.")
                            elif match:
                                mismatches, samples = [int(i) for i in match.groups()]
                                if mismatches == 0:
                                    result.append("passed")
                                else:
                                    result.append(f"failed: {mismatches} out of {samples} samples.")
                            else:
                                result.append("failed: info string not matched.")
                    # END CODE BLOCK
                except TimeoutException:
                    result.append("timed out")
                except BaseException as e:
                    result.append(f"failed: {e}")
            finally:
                # Needed for cleaning up. Done in a finally so the temporary
                # directory can be removed even if preparing the test raised.
                shutil.rmtree = rmtree
                os.rmdir = rmdir
                os.chdir = chdir

    # NOTE(review): unsafe_execute is a nested function and shares `result`
    # through its closure; this presumably relies on the "fork" start method.
    with multiprocessing.Manager() as manager:
        result = manager.list()

        p = multiprocessing.Process(target=unsafe_execute)
        p.start()
        p.join(timeout=timeout + 1)
        if p.is_alive():
            p.kill()
            p.join()

        # The worker reported nothing: it crashed or had to be killed.
        if not result:
            result.append("timed out")
        outcome = result[0]

    return dict(
        task_id=problem["task_id"],
        passed=outcome == "passed",
        result=outcome,
        completion_id=completion_id,
    )


@contextlib.contextmanager
def time_limit(seconds: float):
    """
    Raise TimeoutException inside the with-body after ``seconds`` seconds.

    Uses SIGALRM and ITIMER_REAL. On exit the timer is cleared and the
    previously installed SIGALRM handler is put back.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")
    # Install the handler before arming the timer so that an early alarm is
    # never delivered to the previous handler.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    signal.setitimer(signal.ITIMER_REAL, seconds)
    try:
        yield
    finally:
        signal.setitimer(signal.ITIMER_REAL, 0)
        # signal.signal() returns None for handlers not installed from
        # Python; those cannot be passed back in.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)


@contextlib.contextmanager
def swallow_io():
    """ Redirect stdout, stderr and stdin to a write-only in-memory stream. """
    stream = WriteOnlyStringIO()
    with contextlib.redirect_stdout(stream):
        with contextlib.redirect_stderr(stream):
            with redirect_stdin(stream):
                yield


@contextlib.contextmanager
def create_tempdir():
    """ Create a temporary directory, chdir into it and yield its path. """
    with tempfile.TemporaryDirectory() as dirname:
        with chdir(dirname):
            yield dirname


class TimeoutException(Exception):
    """ Raised by time_limit() when the time budget is exhausted. """
    pass


class WriteOnlyStringIO(io.StringIO):
    """ StringIO that throws an exception when it's read from """

    def read(self, *args, **kwargs):
        raise IOError

    def readline(self, *args, **kwargs):
        raise IOError

    def readlines(self, *args, **kwargs):
        raise IOError

    def readable(self, *args, **kwargs):
        """ Returns True if the IO object can be read. """
        return False


class redirect_stdin(contextlib._RedirectStream):  # type: ignore
    """ Context manager that temporarily replaces sys.stdin. """
    _stream = 'stdin'


@contextlib.contextmanager
def chdir(root):
    """
    Change the working directory to ``root`` for the duration of the with
    block and change back afterwards, also when the body raises.
    ``root == "."`` is a no-op.
    """
    if root == ".":
        yield
        return
    cwd = os.getcwd()
    os.chdir(root)
    try:
        yield
    finally:
        os.chdir(cwd)


def reliability_guard(maximum_memory_bytes: Optional[int] = None):
    """
    Updated Comment:
    We have enabled subprocess.Popen to allow shell command calls to verilog
    compiler and simulator. Please use at own risk.
    Original Comment:
    This disables various destructive functions and prevents the generated code
    from interfering with the test (e.g. fork bomb, killing other processes,
    removing filesystem files, etc.)
    WARNING
    This function is NOT a security sandbox. Untrusted code, including, model-
    generated code, should not be blindly executed outside of one. See the
    Codex paper for more information about OpenAI's code sandbox, and proceed
    with caution.

    :param maximum_memory_bytes: when given, used as soft and hard limit for
        RLIMIT_AS and RLIMIT_DATA (and RLIMIT_STACK except on Darwin).
    """

    if maximum_memory_bytes is not None:
        import resource
        resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
        resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
        if not platform.uname().system == 'Darwin':
            resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))

    faulthandler.disable()

    import builtins
    builtins.exit = None
    builtins.quit = None

    os.environ['OMP_NUM_THREADS'] = '1'

    # os.unlink is deliberately left enabled.
    disabled_os_functions = (
        'kill', 'system', 'putenv', 'remove', 'removedirs', 'rmdir', 'fchdir',
        'setuid', 'fork', 'forkpty', 'killpg', 'rename', 'renames', 'truncate',
        'replace', 'fchmod', 'fchown', 'chmod', 'chown', 'chroot', 'lchflags',
        'lchmod', 'lchown', 'getcwd', 'chdir',
    )
    for function_name in disabled_os_functions:
        setattr(os, function_name, None)

    import shutil
    for function_name in ('rmtree', 'move', 'chown'):
        setattr(shutil, function_name, None)

    # WARNING
    # subprocess.Popen is allowed and used to make shell command calls to verilog compiler and simulator.

    # Go through the builtins module: `__builtins__` is a dict only in
    # imported modules, so item assignment on it fails in `__main__`.
    builtins.help = None

    import sys
    for module_name in ('ipdb', 'joblib', 'resource', 'psutil', 'tkinter'):
        sys.modules[module_name] = None
from typing import Optional, Callable, Dict
import ast
import contextlib
import faulthandler
import io
import os
import multiprocessing
import platform
import shlex
import signal
import tempfile

import subprocess
import re
from threading import Timer


def clean_up_simulation() -> None:
    """
    Kill all simulation processes.

    Runs ``pkill`` for ``iverilog`` and ``vvp``. This is best-effort: the
    exit status of ``pkill`` is ignored and a missing ``pkill`` binary is
    tolerated.
    """
    print("Killing all hanging simulation process.")
    for process_name in ("iverilog", "vvp"):
        try:
            # Fixed argument list: no shell is needed to run pkill.
            subprocess.run(["pkill", process_name], check=False)
        except FileNotFoundError:
            # pkill itself is not installed, so there is nothing to run.
            break


def set_unit_test_length(verilog_test: str, unit_test_length: int) -> str:
    """
    Return ``verilog_test`` with every ``repeat(<digits>)`` rewritten to
    ``repeat(unit_test_length)``.
    """
    replacement = "repeat({})".format(unit_test_length)
    # A callable replacement keeps re.sub from interpreting the text.
    return re.sub(r"repeat\([0-9]*\)", lambda _match: replacement, verilog_test)


def check_correctness(problem: Dict, completion: str, timeout: float,
                      completion_id: Optional[int] = None, unit_test_length: Optional[int] = None) -> Dict:
    """
    Evaluates the functional correctness of a completion by running the test
    suite provided in the problem.

    The testbench (``problem["test"]``), the prompt and the completion are
    concatenated into ``<task_id>.sv`` inside a temporary directory, compiled
    with ``iverilog`` and simulated with ``vvp`` in a separate process.

    :param problem: dict providing the keys "task_id", "test" and "prompt".
    :param completion: text appended after the prompt.
    :param timeout: seconds allowed for compiling and simulating.
    :param completion_id: an optional completion ID so we can match
        the results later even if execution finishes asynchronously.
    :param unit_test_length: when truthy, every ``repeat(<digits>)`` in the
        assembled source is rewritten to use this count.
    :return: dict with "task_id", "passed" (bool), "result" ("passed",
        "timed out" or a "failed: ..." message) and "completion_id".
    """

    def unsafe_execute():

        with create_tempdir():

            # These system calls are needed when cleaning up tempdir.
            import os
            import shutil
            rmtree = shutil.rmtree
            rmdir = os.rmdir
            chdir = os.chdir
            # reliability_guard() sets os.kill and os.killpg to None, which
            # makes Popen.kill() unusable afterwards. Keep the real killpg so
            # a timed-out simulation can still be terminated.
            killpg = os.killpg

            def kill_simulation(proc):
                # The shell runs in its own session (see Popen below), so its
                # pid is also the process-group id shared with iverilog/vvp.
                try:
                    killpg(proc.pid, signal.SIGKILL)
                except ProcessLookupError:
                    pass

            # Disable functionalities that can make destructive changes to the test.
            # WARNING
            # subprocess.Popen is used to run shell command with calls to iveriog and vvp.
            # Please refer to reliability_guard function for details
            reliability_guard()

            try:
                # Output testbench with solution to Verilog file in temp directory.
                verilog_test = problem["test"] + "\n" + \
                               problem["prompt"] + "\n" + \
                               completion

                if unit_test_length:
                    verilog_test = set_unit_test_length(verilog_test, unit_test_length)

                with open("{}.sv".format(problem["task_id"]), 'w') as f:
                    f.write(verilog_test)

                try:
                    # WARNING PLEASE READ
                    # The following code use subprocess.Popen to run shell command with calls to iveriog and vvp.
                    # Please check that iverilog and vvp are installed and included in your current run path.
                    # For installation of Icarus Verilog, please refer to: https://github.com/steveicarus/iverilog
                    # This program exists to execute untrusted model-generated code. Although
                    # it is highly unlikely that model-generated code will do something overtly
                    # malicious in response to this test suite, model-generated code may act
                    # destructively due to a lack of model capability or alignment.
                    # Users are strongly encouraged to sandbox this evaluation suite so that it
                    # does not perform destructive actions on their host or network. For more
                    # information on how OpenAI sandboxes its code, see the original OpenAI paper.
                    # Once you have read this disclaimer and taken appropriate precautions,
                    # proceed at your own risk:
                    # BEGIN CODE BLOCK
                    with swallow_io():
                        with time_limit(timeout):
                            # Quote the file name so a task id containing shell
                            # metacharacters cannot alter the command.
                            source_file = shlex.quote("{}.sv".format(problem["task_id"]))
                            cmd = ("iverilog -Wall -Winfloop -Wno-timescale -g2012 "
                                   "-s tb -o test.vvp {}; vvp -n test.vvp".format(source_file))

                            # Watchdog for the child process, in addition to the
                            # SIGALRM raised by time_limit(). See
                            # https://stackoverflow.com/questions/1191374/using-module-subprocess-with-timeout
                            p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                                                 stderr=subprocess.PIPE, start_new_session=True)
                            timer = Timer(timeout, kill_simulation, args=(p,))
                            try:
                                timer.start()
                                out, err = p.communicate()
                            finally:
                                timer.cancel()
                                # Reached with a live child when time_limit()
                                # interrupted communicate(): do not leave the
                                # simulation running.
                                if p.poll() is None:
                                    kill_simulation(p)

                            out, err = out.decode("utf-8"), err.decode("utf-8")
                            match = re.search(r'Mismatches: ([0-9]*) in ([0-9]*) samples', out)
                            if "syntax error" in err:
                                result.append("failed: syntax error.")
                            elif len(err) > 0:
                                result.append("failed: compile error.")
                            elif match:
                                mismatches, samples = [int(i) for i in match.groups()]
                                if mismatches == 0:
                                    result.append("passed")
                                else:
                                    result.append(f"failed: {mismatches} out of {samples} samples.")
                            else:
                                result.append("failed: info string not matched.")
                    # END CODE BLOCK
                except TimeoutException:
                    result.append("timed out")
                except BaseException as e:
                    result.append(f"failed: {e}")
            finally:
                # Needed for cleaning up. Done in a finally so the temporary
                # directory can be removed even if preparing the test raised.
                shutil.rmtree = rmtree
                os.rmdir = rmdir
                os.chdir = chdir

    # NOTE(review): unsafe_execute is a nested function and shares `result`
    # through its closure; this presumably relies on the "fork" start method.
    with multiprocessing.Manager() as manager:
        result = manager.list()

        p = multiprocessing.Process(target=unsafe_execute)
        p.start()
        p.join(timeout=timeout + 1)
        if p.is_alive():
            p.kill()
            p.join()

        # The worker reported nothing: it crashed or had to be killed.
        if not result:
            result.append("timed out")
        outcome = result[0]

    return dict(
        task_id=problem["task_id"],
        passed=outcome == "passed",
        result=outcome,
        completion_id=completion_id,
    )


@contextlib.contextmanager
def time_limit(seconds: float):
    """
    Raise TimeoutException inside the with-body after ``seconds`` seconds.

    Uses SIGALRM and ITIMER_REAL. On exit the timer is cleared and the
    previously installed SIGALRM handler is put back.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")
    # Install the handler before arming the timer so that an early alarm is
    # never delivered to the previous handler.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    signal.setitimer(signal.ITIMER_REAL, seconds)
    try:
        yield
    finally:
        signal.setitimer(signal.ITIMER_REAL, 0)
        # signal.signal() returns None for handlers not installed from
        # Python; those cannot be passed back in.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)


@contextlib.contextmanager
def swallow_io():
    """ Redirect stdout, stderr and stdin to a write-only in-memory stream. """
    stream = WriteOnlyStringIO()
    with contextlib.redirect_stdout(stream):
        with contextlib.redirect_stderr(stream):
            with redirect_stdin(stream):
                yield


@contextlib.contextmanager
def create_tempdir():
    """ Create a temporary directory, chdir into it and yield its path. """
    with tempfile.TemporaryDirectory() as dirname:
        with chdir(dirname):
            yield dirname


class TimeoutException(Exception):
    """ Raised by time_limit() when the time budget is exhausted. """
    pass


class WriteOnlyStringIO(io.StringIO):
    """ StringIO that throws an exception when it's read from """

    def read(self, *args, **kwargs):
        raise IOError

    def readline(self, *args, **kwargs):
        raise IOError

    def readlines(self, *args, **kwargs):
        raise IOError

    def readable(self, *args, **kwargs):
        """ Returns True if the IO object can be read. """
        return False


class redirect_stdin(contextlib._RedirectStream):  # type: ignore
    """ Context manager that temporarily replaces sys.stdin. """
    _stream = 'stdin'


@contextlib.contextmanager
def chdir(root):
    """
    Change the working directory to ``root`` for the duration of the with
    block and change back afterwards, also when the body raises.
    ``root == "."`` is a no-op.
    """
    if root == ".":
        yield
        return
    cwd = os.getcwd()
    os.chdir(root)
    try:
        yield
    finally:
        os.chdir(cwd)


def reliability_guard(maximum_memory_bytes: Optional[int] = None):
    """
    Updated Comment:
    We have enabled subprocess.Popen to allow shell command calls to verilog
    compiler and simulator. Please use at own risk.
    Original Comment:
    This disables various destructive functions and prevents the generated code
    from interfering with the test (e.g. fork bomb, killing other processes,
    removing filesystem files, etc.)
    WARNING
    This function is NOT a security sandbox. Untrusted code, including, model-
    generated code, should not be blindly executed outside of one. See the
    Codex paper for more information about OpenAI's code sandbox, and proceed
    with caution.

    :param maximum_memory_bytes: when given, used as soft and hard limit for
        RLIMIT_AS and RLIMIT_DATA (and RLIMIT_STACK except on Darwin).
    """

    if maximum_memory_bytes is not None:
        import resource
        resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
        resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
        if not platform.uname().system == 'Darwin':
            resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))

    faulthandler.disable()

    import builtins
    builtins.exit = None
    builtins.quit = None

    os.environ['OMP_NUM_THREADS'] = '1'

    # os.unlink is deliberately left enabled.
    disabled_os_functions = (
        'kill', 'system', 'putenv', 'remove', 'removedirs', 'rmdir', 'fchdir',
        'setuid', 'fork', 'forkpty', 'killpg', 'rename', 'renames', 'truncate',
        'replace', 'fchmod', 'fchown', 'chmod', 'chown', 'chroot', 'lchflags',
        'lchmod', 'lchown', 'getcwd', 'chdir',
    )
    for function_name in disabled_os_functions:
        setattr(os, function_name, None)

    import shutil
    for function_name in ('rmtree', 'move', 'chown'):
        setattr(shutil, function_name, None)

    # WARNING
    # subprocess.Popen is allowed and used to make shell command calls to verilog compiler and simulator.

    # Go through the builtins module: `__builtins__` is a dict only in
    # imported modules, so item assignment on it fails in `__main__`.
    builtins.help = None

    import sys
    for module_name in ('ipdb', 'joblib', 'resource', 'psutil', 'tkinter'):
        sys.modules[module_name] = None
$finish;\n\tend\n\t\t\nendmodule\n\nmodule tb();\n\n\ttypedef struct packed {\n\t\tint errors;\n\t\tint errortime;\n\t\tint errors_out_both;\n\t\tint errortime_out_both;\n\t\tint errors_out_any;\n\t\tint errortime_out_any;\n\t\tint errors_out_different;\n\t\tint errortime_out_different;\n\n\t\tint clocks;\n\t} stats;\n\t\n\tstats stats1;\n\t\n\t\n\twire[511:0] wavedrom_title;\n\twire wavedrom_enable;\n\tint wavedrom_hide_after_time;\n\t\n\treg clk=0;\n\tinitial forever\n\t\t#5 clk = ~clk;\n\n\tlogic [3:0] in;\n\tlogic [2:0] out_both_ref;\n\tlogic [2:0] out_both_dut;\n\tlogic [3:1] out_any_ref;\n\tlogic [3:1] out_any_dut;\n\tlogic [3:0] out_different_ref;\n\tlogic [3:0] out_different_dut;\n\n\tinitial begin \n\t\t$dumpfile(\"wave.vcd\");\n\t\t$dumpvars(1, stim1.clk, tb_mismatch ,in,out_both_ref,out_both_dut,out_any_ref,out_any_dut,out_different_ref,out_different_dut );\n\tend\n\n\n\twire tb_match;\t\t// Verification\n\twire tb_mismatch = ~tb_match;\n\t\n\tstimulus_gen stim1 (\n\t\t.clk,\n\t\t.* ,\n\t\t.in );\n\treference_module good1 (\n\t\t.in,\n\t\t.out_both(out_both_ref),\n\t\t.out_any(out_any_ref),\n\t\t.out_different(out_different_ref) );\n\t\t\n\ttop_module top_module1 (\n\t\t.in,\n\t\t.out_both(out_both_dut),\n\t\t.out_any(out_any_dut),\n\t\t.out_different(out_different_dut) );\n\n\t\n\tbit strobe = 0;\n\ttask wait_for_end_of_timestep;\n\t\trepeat(5) begin\n\t\t\tstrobe <= !strobe; // Try to delay until the very end of the time step.\n\t\t\t@(strobe);\n\t\tend\n\tendtask\t\n\n\t\n\tfinal begin\n\t\tif (stats1.errors_out_both) $display(\"Hint: Output '%s' has %0d mismatches. First mismatch occurred at time %0d.\", \"out_both\", stats1.errors_out_both, stats1.errortime_out_both);\n\t\telse $display(\"Hint: Output '%s' has no mismatches.\", \"out_both\");\n\t\tif (stats1.errors_out_any) $display(\"Hint: Output '%s' has %0d mismatches. 
First mismatch occurred at time %0d.\", \"out_any\", stats1.errors_out_any, stats1.errortime_out_any);\n\t\telse $display(\"Hint: Output '%s' has no mismatches.\", \"out_any\");\n\t\tif (stats1.errors_out_different) $display(\"Hint: Output '%s' has %0d mismatches. First mismatch occurred at time %0d.\", \"out_different\", stats1.errors_out_different, stats1.errortime_out_different);\n\t\telse $display(\"Hint: Output '%s' has no mismatches.\", \"out_different\");\n\n\t\t$display(\"Hint: Total mismatched samples is %1d out of %1d samples\\n\", stats1.errors, stats1.clocks);\n\t\t$display(\"Simulation finished at %0d ps\", $time);\n\t\t$display(\"Mismatches: %1d in %1d samples\", stats1.errors, stats1.clocks);\n\tend\n\t\n\t// Verification: XORs on the right makes any X in good_vector match anything, but X in dut_vector will only match X.\n\tassign tb_match = ( { out_both_ref, out_any_ref, out_different_ref } === ( { out_both_ref, out_any_ref, out_different_ref } ^ { out_both_dut, out_any_dut, out_different_dut } ^ { out_both_ref, out_any_ref, out_different_ref } ) );\n\t// Use explicit sensitivity list here. 
@(*) causes NetProc::nex_input() to be called when trying to compute\n\t// the sensitivity list of the @(strobe) process, which isn't implemented.\n\talways @(posedge clk, negedge clk) begin\n\n\t\tstats1.clocks++;\n\t\tif (!tb_match) begin\n\t\t\tif (stats1.errors == 0) stats1.errortime = $time;\n\t\t\tstats1.errors++;\n\t\tend\n\t\tif (out_both_ref !== ( out_both_ref ^ out_both_dut ^ out_both_ref ))\n\t\tbegin if (stats1.errors_out_both == 0) stats1.errortime_out_both = $time;\n\t\t\tstats1.errors_out_both = stats1.errors_out_both+1'b1; end\n\t\tif (out_any_ref !== ( out_any_ref ^ out_any_dut ^ out_any_ref ))\n\t\tbegin if (stats1.errors_out_any == 0) stats1.errortime_out_any = $time;\n\t\t\tstats1.errors_out_any = stats1.errors_out_any+1'b1; end\n\t\tif (out_different_ref !== ( out_different_ref ^ out_different_dut ^ out_different_ref ))\n\t\tbegin if (stats1.errors_out_different == 0) stats1.errortime_out_different = $time;\n\t\t\tstats1.errors_out_different = stats1.errors_out_different+1'b1; end\n\n\tend\nendmodule\n"} 2 | {"task_id": "vector4", "prompt": "module top_module (\n\tinput [7:0] in,\n\toutput [31:0] out\n);\n", "canonical_solution": "\n\tassign out = { {24{in[7]}}, in };\n\t\nendmodule\n", "test": "`timescale 1 ps/1 ps\n`define OK 12\n`define INCORRECT 13\nmodule reference_module (\n\tinput [7:0] in,\n\toutput [31:0] out\n);\n\n\tassign out = { {24{in[7]}}, in };\n\t\nendmodule\n\n\nmodule stimulus_gen (\n\tinput clk,\n\toutput logic [7:0] in\n);\n\n\tinitial begin\n\t\trepeat(100) @(posedge clk, negedge clk)\n\t\t\tin <= $random;\n\t\t$finish;\n\tend\n\t\nendmodule\n\nmodule tb();\n\n\ttypedef struct packed {\n\t\tint errors;\n\t\tint errortime;\n\t\tint errors_out;\n\t\tint errortime_out;\n\n\t\tint clocks;\n\t} stats;\n\t\n\tstats stats1;\n\t\n\t\n\twire[511:0] wavedrom_title;\n\twire wavedrom_enable;\n\tint wavedrom_hide_after_time;\n\t\n\treg clk=0;\n\tinitial forever\n\t\t#5 clk = ~clk;\n\n\tlogic [7:0] in;\n\tlogic [31:0] out_ref;\n\tlogic 
[31:0] out_dut;\n\n\tinitial begin \n\t\t$dumpfile(\"wave.vcd\");\n\t\t$dumpvars(1, stim1.clk, tb_mismatch ,in,out_ref,out_dut );\n\tend\n\n\n\twire tb_match;\t\t// Verification\n\twire tb_mismatch = ~tb_match;\n\t\n\tstimulus_gen stim1 (\n\t\t.clk,\n\t\t.* ,\n\t\t.in );\n\treference_module good1 (\n\t\t.in,\n\t\t.out(out_ref) );\n\t\t\n\ttop_module top_module1 (\n\t\t.in,\n\t\t.out(out_dut) );\n\n\t\n\tbit strobe = 0;\n\ttask wait_for_end_of_timestep;\n\t\trepeat(5) begin\n\t\t\tstrobe <= !strobe; // Try to delay until the very end of the time step.\n\t\t\t@(strobe);\n\t\tend\n\tendtask\t\n\n\t\n\tfinal begin\n\t\tif (stats1.errors_out) $display(\"Hint: Output '%s' has %0d mismatches. First mismatch occurred at time %0d.\", \"out\", stats1.errors_out, stats1.errortime_out);\n\t\telse $display(\"Hint: Output '%s' has no mismatches.\", \"out\");\n\n\t\t$display(\"Hint: Total mismatched samples is %1d out of %1d samples\\n\", stats1.errors, stats1.clocks);\n\t\t$display(\"Simulation finished at %0d ps\", $time);\n\t\t$display(\"Mismatches: %1d in %1d samples\", stats1.errors, stats1.clocks);\n\tend\n\t\n\t// Verification: XORs on the right makes any X in good_vector match anything, but X in dut_vector will only match X.\n\tassign tb_match = ( { out_ref } === ( { out_ref } ^ { out_dut } ^ { out_ref } ) );\n\t// Use explicit sensitivity list here. 
@(*) causes NetProc::nex_input() to be called when trying to compute\n\t// the sensitivity list of the @(strobe) process, which isn't implemented.\n\talways @(posedge clk, negedge clk) begin\n\n\t\tstats1.clocks++;\n\t\tif (!tb_match) begin\n\t\t\tif (stats1.errors == 0) stats1.errortime = $time;\n\t\t\tstats1.errors++;\n\t\tend\n\t\tif (out_ref !== ( out_ref ^ out_dut ^ out_ref ))\n\t\tbegin if (stats1.errors_out == 0) stats1.errortime_out = $time;\n\t\t\tstats1.errors_out = stats1.errors_out+1'b1; end\n\n\tend\nendmodule\n"} 3 | {"task_id": "zero", "prompt": "module top_module(\n\toutput zero);\n", "canonical_solution": "\t\n\tassign zero = 1'b0;\n\t\nendmodule\n", "test": "`timescale 1 ps/1 ps\n`define OK 12\n`define INCORRECT 13\nmodule reference_module(\n\toutput zero);\n\t\n\tassign zero = 1'b0;\n\t\nendmodule\n\n\nmodule stimulus_gen (\n\tinput clk,\n\toutput reg[511:0] wavedrom_title,\n\toutput reg wavedrom_enable\n);\n\n\n// Add two ports to module stimulus_gen:\n// output [511:0] wavedrom_title\n// output reg wavedrom_enable\n\n\ttask wavedrom_start(input[511:0] title = \"\");\n\tendtask\n\t\n\ttask wavedrom_stop;\n\t\t#1;\n\tendtask\t\n\n\n\n\tinitial begin\n\t\twavedrom_start(\"Output should 0\");\n\t\trepeat(20) @(posedge clk, negedge clk);\n\t\twavedrom_stop();\n\t\t\n\t\t#1 $finish;\n\tend\n\t\nendmodule\n\nmodule tb();\n\n\ttypedef struct packed {\n\t\tint errors;\n\t\tint errortime;\n\t\tint errors_zero;\n\t\tint errortime_zero;\n\n\t\tint clocks;\n\t} stats;\n\t\n\tstats stats1;\n\t\n\t\n\twire[511:0] wavedrom_title;\n\twire wavedrom_enable;\n\tint wavedrom_hide_after_time;\n\t\n\treg clk=0;\n\tinitial forever\n\t\t#5 clk = ~clk;\n\n\tlogic zero_ref;\n\tlogic zero_dut;\n\n\tinitial begin \n\t\t$dumpfile(\"wave.vcd\");\n\t\t$dumpvars(1, stim1.clk, tb_mismatch ,zero_ref,zero_dut );\n\tend\n\n\n\twire tb_match;\t\t// Verification\n\twire tb_mismatch = ~tb_match;\n\t\n\tstimulus_gen stim1 (\n\t\t.clk,\n\t\t.* );\n\treference_module good1 
(\n\t\t.zero(zero_ref) );\n\t\t\n\ttop_module top_module1 (\n\t\t.zero(zero_dut) );\n\n\t\n\tbit strobe = 0;\n\ttask wait_for_end_of_timestep;\n\t\trepeat(5) begin\n\t\t\tstrobe <= !strobe; // Try to delay until the very end of the time step.\n\t\t\t@(strobe);\n\t\tend\n\tendtask\t\n\n\t\n\tfinal begin\n\t\tif (stats1.errors_zero) $display(\"Hint: Output '%s' has %0d mismatches. First mismatch occurred at time %0d.\", \"zero\", stats1.errors_zero, stats1.errortime_zero);\n\t\telse $display(\"Hint: Output '%s' has no mismatches.\", \"zero\");\n\n\t\t$display(\"Hint: Total mismatched samples is %1d out of %1d samples\\n\", stats1.errors, stats1.clocks);\n\t\t$display(\"Simulation finished at %0d ps\", $time);\n\t\t$display(\"Mismatches: %1d in %1d samples\", stats1.errors, stats1.clocks);\n\tend\n\t\n\t// Verification: XORs on the right makes any X in good_vector match anything, but X in dut_vector will only match X.\n\tassign tb_match = ( { zero_ref } === ( { zero_ref } ^ { zero_dut } ^ { zero_ref } ) );\n\t// Use explicit sensitivity list here. 
class CodeRepoDocumentor:
    """Drive the documentation pipeline for one code repository.

    The class wires together a ``CodePreprocesser`` (copies and pre-processes
    the raw sources), an ``EmbedTool0`` (system-context embedding) and a
    ``Chatbot`` (generates comments and summaries), and keeps a
    ``documented_list`` file so that already documented sources are skipped.
    """

    def __init__(self, code_dir, store_src_code_dir,
                 csv_code_dir, csv_comment_dir, csv_new_comment_dir,
                 csv_pure_gen_comment_dir, code_summary_dir, documented_code_dir,
                 code_metadata_file=None,
                 code_suffix=None, language="verilog",
                 discard_original_comment=False,
                 code_lib_path="./lib", code_vec_store="./vector_store/",
                 skip_rag_db=False, skip_supplement_summary=False,
                 cb=None):
        """Store the configuration and build the helper objects.

        Args:
            code_dir: directory holding the raw source code.
            store_src_code_dir, csv_code_dir, csv_comment_dir,
            csv_new_comment_dir, csv_pure_gen_comment_dir, code_summary_dir,
            documented_code_dir: working directories handed to the
                ``CodePreprocesser``.
            code_metadata_file: optional JSON file; ``document_repo`` reads
                ``[<module>]["module_inst_list"]`` from it.
            code_suffix: file suffixes to process; defaults to
                ``[".v", ".sv", ".vh"]``.
            language: language name forwarded to the ``Chatbot``.
            discard_original_comment: forwarded to the pre-processing steps.
            code_lib_path, code_vec_store: forwarded to the ``Chatbot``.
            skip_rag_db: when true, ``create_embedding`` does not initialise
                code retrieval.
            skip_supplement_summary: forwarded to the ``Chatbot``.
            cb: callback object forwarded to the ``Chatbot``.

        Reads the environment variables ``ASSET_DIR``, ``TARGET_LANG``,
        ``CHATBOT_BACKEND_DIR``, ``SYSTEM_CONTEXT_DIR``,
        ``SYSTEM_CONTEXT_EMBEDDING_DIR`` and ``CONVERSE_DIR``, and prompts on
        stdin when a documented-list file already exists.
        """
        # A fresh list per instance: a list literal as the default value
        # would be shared by every instance.
        if code_suffix is None:
            code_suffix = [".v", ".sv", ".vh"]

        # raw code preprocessing
        self.code_dir = code_dir
        self.code_suffix = code_suffix
        self.language = language
        self.store_src_code_dir = store_src_code_dir
        self.csv_code_dir = csv_code_dir
        self.csv_comment_dir = csv_comment_dir
        self.csv_new_comment_dir = csv_new_comment_dir
        self.csv_pure_gen_comment_dir = csv_pure_gen_comment_dir
        self.code_summary_dir = code_summary_dir
        self.documented_code_dir = documented_code_dir
        self.discard_original_comment = discard_original_comment
        self.code_vec_store = code_vec_store
        self.code_Lib_path = code_lib_path
        self.cb = cb
        self.skip_rag_db = skip_rag_db
        self.skip_supplement_summary = skip_supplement_summary

        # Always define the metadata attributes so document_repo() can run
        # even when no metadata file was supplied.
        self.code_metadata_file = code_metadata_file
        self.code_metadata = {}
        if code_metadata_file is not None:
            with open(code_metadata_file, "r") as metadata_fh:
                self.code_metadata = json.load(metadata_fh)

        self.code_preprocesser = CodePreprocesser(code_dir, store_src_code_dir,
                                                  csv_code_dir, csv_comment_dir,
                                                  csv_new_comment_dir, csv_pure_gen_comment_dir,
                                                  code_summary_dir, documented_code_dir,
                                                  code_suffix=code_suffix,
                                                  discard_original_comment=discard_original_comment)

        self.documented_list = []
        self.documented_list_file = os.path.join(os.environ.get("ASSET_DIR"),
                                                 os.environ.get("TARGET_LANG"),
                                                 "documented_list.txt")
        if os.path.exists(self.documented_list_file):
            # ask if the user wants to remove the documented list
            print("Do you want to remove the documented list? (y/n)")
            answer = input()
            if answer == "y":
                os.remove(self.documented_list_file)
                print("Documented list removed")
            else:
                with open(self.documented_list_file, "r") as list_fh:
                    self.documented_list = [entry.strip() for entry in list_fh.readlines()]

        # Every backend path hangs off <this file's dir>/<CHATBOT_BACKEND_DIR>.
        here = os.path.dirname(os.path.abspath(__file__))
        backend_dir = os.environ.get("CHATBOT_BACKEND_DIR")
        system_context_dir = os.path.join(here, backend_dir, os.environ.get("SYSTEM_CONTEXT_DIR"))

        # context embedding
        self.embedding_fields = ["Filename", "File type", "Summary", "Text", "Line_id"]
        self.system_embedder = EmbedTool0(self.embedding_fields,
                                          system_context_dir,
                                          os.path.join(here, backend_dir,
                                                       os.environ.get("SYSTEM_CONTEXT_EMBEDDING_DIR")),
                                          "system_context_embedding.csv")

        # code documentor
        self.documentor = Chatbot(os.path.join(system_context_dir, "context.fixed_features.txt"),
                                  os.path.join(here, backend_dir, os.environ.get("CONVERSE_DIR")),
                                  os.path.join(system_context_dir, "context.converse_samples.txt"),
                                  code_suffix=self.code_suffix,
                                  language=self.language,
                                  code_lib_path=self.code_Lib_path,
                                  code_vec_store=self.code_vec_store,
                                  skip_supplement_summary=self.skip_supplement_summary,
                                  cb=self.cb
                                  )

    def create_embedding(self):
        """Build and load the system-context embedding; init code retrieval unless skipped."""
        self.system_embedder.create_raw_system_context()
        self.system_embedder.create_embedding()
        self.system_embedder.load_embedding()
        if not self.skip_rag_db:
            self.documentor.init_code_retrival()

    def code_preprocess(self, skip_preprocess=False):
        """Copy the raw sources and, unless *skip_preprocess*, pre-process them.

        Pre-processing runs ``pre_process_routines`` (with ``rtl`` set when any
        of ``.v``/``.sv``/``.vh`` is among the configured suffixes) followed
        by ``create_code_assets``.
        """
        self.code_preprocesser.raw_code_copy(self.code_dir, self.store_src_code_dir,
                                             skip_preprocess=skip_preprocess)
        if skip_preprocess:
            return
        is_rtl = any(suffix in self.code_suffix for suffix in (".v", ".sv", ".vh"))
        self.code_preprocesser.pre_process_routines(self.store_src_code_dir,
                                                    discard_original_comment=self.discard_original_comment,
                                                    rtl=is_rtl)
        self.code_preprocesser.create_code_assets()

    def _dependent_funcs(self, module_name):
        """Return the ``module_inst_list`` recorded for *module_name*, or ``[]`` if there is none."""
        # NOTE(review): an empty list is assumed to be an acceptable
        # "no dependencies" value for comment_a_code_file -- confirm.
        return self.code_metadata.get(module_name, {}).get("module_inst_list", [])

    def document_repo(self):
        """Document every source file that is not yet in the documented list.

        For each file this generates line-by-line comments, writes the merged
        code + comments into the documented-code directory, summarises the
        code blocks, and rewrites the documented-list file. Files with more
        than 200 lines are skipped.
        """
        self.documentor.system_context_embedding = self.system_embedder.df_embed
        preprocesser = self.code_preprocesser
        code_files = preprocesser.code_files
        for position, code_src in enumerate(code_files):
            start_time = time.time()
            if code_src in self.documented_list:
                continue

            print("Documenting {}".format(code_src))
            # clear the memory of the documentor
            converse_chain = self.documentor.line_by_line_comment_converse_chain
            converse_chain.memory.clear()
            converse_chain.memory_buffer = []

            # Everything before the first "." names the per-file assets.
            stem = code_src.split(".")[0]
            csv_code_file = os.path.join(preprocesser.csv_code_dir, stem + ".csv")
            csv_comment_file = os.path.join(preprocesser.csv_comment_dir, stem + ".csv")
            csv_new_comment_file = os.path.join(preprocesser.csv_new_comment_dir, stem + ".csv")
            csv_pure_gen_comment_file = os.path.join(preprocesser.csv_pure_gen_comment_dir, stem + ".csv")
            code_summary_file = os.path.join(preprocesser.code_summary_dir, stem + ".txt")

            # check the # of lines of code
            with open(os.path.join(preprocesser.store_src_code_dir, code_src), "r") as src_fh:
                line_count = len(src_fh.readlines())
            if line_count > 200:
                print("Skip {} because it has too many lines of code".format(code_src))
                continue

            self.documentor.comment_a_code_file(csv_code_file, csv_comment_file,
                                                csv_new_comment_file, csv_pure_gen_comment_file,
                                                dependent_funcs=self._dependent_funcs(stem))

            new_code_string = merge_code_and_comment(csv_code_file, csv_new_comment_file)
            with open(os.path.join(preprocesser.documented_code_dir, code_src), "w") as out_fh:
                out_fh.write(new_code_string)

            self.documentor.summarize_code_blocks(csv_code_file, csv_new_comment_file, code_summary_file)

            self.documented_list.append(code_src)
            with open(self.documented_list_file, "w") as list_fh:
                list_fh.write("\n".join(self.documented_list))

            # Rough estimate: time of this file times the number of files
            # from this one to the end of the list.
            elapsed = time.time() - start_time
            print("Time left to finish this repo: {}".format(elapsed * (len(code_files) - position)))

    def package_documented_code(self, package_dir):
        """Copy each documented source and its summary into ``package_dir/<stem>/``."""
        # create the package dir
        os.makedirs(package_dir, exist_ok=True)
        for entry in self.documented_list:
            # create a subdirectory for each of the documented code
            code_src = entry.strip()
            stem = code_src.split(".")[0]
            target_dir = os.path.join(package_dir, stem)
            os.makedirs(target_dir, exist_ok=True)
            shutil.copy(os.path.join(self.code_preprocesser.documented_code_dir, code_src),
                        os.path.join(target_dir, code_src))
            shutil.copy(os.path.join(self.code_preprocesser.code_summary_dir, stem + ".txt"),
                        os.path.join(target_dir, stem + ".txt"))
    # TODO: add a function to convert the documented code to original raw code


if __name__ == "__main__":
    # NOTE: run utils.py first to partition the code first
    code_dir = "./test_repo/"
    code_lib_path = "./test_repo/"
    code_vec_store = "../code_vec_store/DNNBuilder/"
    language = os.environ.get("TARGET_LANG")
    if language == "verilog":
        code_suffix = [".v", ".sv", ".vh"]
    elif language == "xilinx_hls":
        code_suffix = [".c", ".cpp", ".h", ".hpp"]
    else:
        # Fail with a clear message instead of a NameError on code_suffix below.
        raise ValueError("Unsupported TARGET_LANG: {!r}".format(language))
    store_src_code_dir = os.environ.get("STORE_SRC_CODE_DIR")
    csv_code_dir = os.environ.get("CSV_CODE_DIR")
    csv_comment_dir = os.environ.get("CSV_COMMENT_DIR")
    csv_new_comment_dir = os.environ.get("CSV_NEW_COMMENT_DIR")
    csv_pure_gen_comment_dir = os.environ.get("CSV_PURE_GEN_COMMENT_DIR")
    code_summary_dir = os.environ.get("CODE_SUMMARY_DIR")
    documented_code_dir = os.environ.get("DOCUMENTED_CODE_DIR")

    with get_openai_callback() as cb:
        # This switch will discard 1. the comments in the raw code copy and
        # 2. the comments will be converted to the raw code csv
        discard_original_comment = True

        code_repo_documentor = CodeRepoDocumentor(code_dir, store_src_code_dir,
                                                  csv_code_dir, csv_comment_dir, csv_new_comment_dir,
                                                  csv_pure_gen_comment_dir, code_summary_dir, documented_code_dir,
                                                  code_suffix=code_suffix, language=language,
                                                  discard_original_comment=discard_original_comment,
                                                  code_lib_path=code_lib_path, code_vec_store=code_vec_store,
                                                  cb=cb)
        code_repo_documentor.create_embedding()
        code_repo_documentor.code_preprocess()
        code_repo_documentor.document_repo()
        code_repo_documentor.package_documented_code("./documented_code")
NON_ALPHA = re.compile("[^A-Za-z_0-9]")
# parameters used in DuplicationIndex
MIN_NUM_TOKENS = 10
NUM_PERM = 256


def get_min_hash(tokens: List[str]) -> "Optional[MinHash]":
    """Compute the MinHash of a tokenized code snippet.

    Returns ``None`` when *tokens* has fewer than ``MIN_NUM_TOKENS`` entries
    (duplicates included in the count); otherwise a ``MinHash`` with
    ``NUM_PERM`` permutations updated with every distinct token.
    """
    if len(tokens) < MIN_NUM_TOKENS:
        return None
    min_hash = MinHash(num_perm=NUM_PERM)
    for token in set(tokens):
        min_hash.update(token.encode())
    return min_hash


def get_tokens(code: str) -> Set[str]:
    """Return the set of distinct tokens in *code*, split on ``NON_ALPHA``."""
    return {t for t in NON_ALPHA.split(code) if len(t.strip()) > 0}


class DuplicationIndex:
    """Incrementally group near-duplicate snippets with a MinHash LSH index."""

    def __init__(
        self,
        *,
        duplication_jaccard_threshold: float = 0.85,
    ):
        self._duplication_jaccard_threshold = duplication_jaccard_threshold
        self._num_perm = NUM_PERM
        self._index = MinHashLSH(threshold=self._duplication_jaccard_threshold, num_perm=self._num_perm)

        # base key -> set of keys that were found to be close to it
        self._duplicate_clusters = defaultdict(set)

    def add(self, code_key: Tuple, min_hash: "MinHash") -> None:
        """Add a key to _index (MinHashLSH).

        The min_hash is used to query the closest matches based on the
        jaccard threshold. The new key is either added to an existing cluster
        of one close match, or a new cluster is created. The clusters created
        in this way depend on the order of add.

        Args:
            code_key (Tuple of (index, repo_name, path)):
                Theoretically any hashable key. Here we use a tuple to
                retrieve the information later.
            min_hash: MinHash of the code_key.
        """
        # Reject an already indexed key before paying for the LSH query.
        if code_key in self._index.keys:
            print(f"Duplicate key {code_key}")
            return

        close_duplicates = self._index.query(min_hash)
        self._index.insert(code_key, min_hash)
        if not close_duplicates:
            return
        for base_duplicate in close_duplicates:
            if base_duplicate in self._duplicate_clusters:
                self._duplicate_clusters[base_duplicate].add(code_key)
                break
        else:
            # None of the matches is a cluster base yet: the first one becomes it.
            self._duplicate_clusters[close_duplicates[0]].add(code_key)

    def get_duplicate_clusters(self) -> List[List[Dict]]:
        """Export the duplicate clusters.

        For each cluster, the first element is the base element of the
        cluster. The base element has an estimated jaccard similarity higher
        than the threshold with all the other elements.

        Returns:
            duplicate_clusters (List[List[Dict]]):
                List of duplicate clusters.
        """
        duplicate_clusters = []
        for base, duplicates in self._duplicate_clusters.items():
            members = [base] + list(duplicates)
            # reformat the cluster to be a list of dict
            duplicate_clusters.append(
                [{"base_index": el[0], "repo_name": el[1], "path": el[2]} for el in members]
            )
        return duplicate_clusters

    def save(self, filepath) -> None:
        """Write the duplicate clusters to *filepath* as JSON."""
        duplicate_clusters = self.get_duplicate_clusters()
        with open(filepath, "w") as f:
            json.dump(duplicate_clusters, f)


def _compute_min_hash(element):
    """Map ``(index, data)`` to ``((index, repo_name, path), min_hash)``.

    Returns ``None`` when the snippet in ``data["text"]`` is too short for
    ``get_min_hash``.
    """
    index, data = element
    # A list (not a set) is passed on purpose: get_min_hash counts repeated
    # tokens towards MIN_NUM_TOKENS.
    min_hash = get_min_hash([t for t in NON_ALPHA.split(data["text"]) if len(t.strip()) > 0])
    if min_hash is None:
        return None
    return (index, data["repo_name"], data["path"]), min_hash


def minhash_iter(dataset_iterator: "Type[Dataset]"):
    """Yield ``(key, min_hash)`` pairs computed in a multiprocessing pool.

    Results arrive in arbitrary order (``imap_unordered``); snippets for which
    no MinHash could be computed are dropped.
    """
    with mp.Pool() as pool:
        for data in pool.imap_unordered(
            _compute_min_hash,
            ThreadedIterator(dataset_iterator, max_queue_size=10000),
            chunksize=100,
        ):
            if data is not None:
                yield data


def make_duplicate_clusters(dataset_iterator: "Type[Dataset]", jaccard_threshold: float):
    """Find duplicate clusters in the dataset in two steps:

    1. Compute MinHash for each code snippet. MinHash is a tool for fast
       jaccard similarity estimation. This step is computed using an
       asynchronous multiprocessing pool, minhash_iter.
    2. Find duplicate clusters. The computed MinHash is added sequentially to
       the DuplicationIndex. This step cannot be parallelized, so using an
       asynchronous thread in the previous step helps to speed up the process.
    """
    di = DuplicationIndex(duplication_jaccard_threshold=jaccard_threshold)

    for code_key, min_hash in tqdm(ThreadedIterator(minhash_iter(enumerate(dataset_iterator)), max_queue_size=100)):
        di.add(code_key, min_hash)

    # Returns a List[Cluster] where Cluster is a list of dicts with the keys
    # base_index, repo_name and path.
    return di.get_duplicate_clusters()


def jaccard_similarity(code1: str, code2: str) -> float:
    """Compute the Jaccard similarity of two code snippets.

    When neither snippet contains a token the two (empty) token sets are
    identical, so 1.0 is returned instead of dividing by zero.
    """
    tokens1 = get_tokens(code1)
    tokens2 = get_tokens(code2)
    union = tokens1 | tokens2
    if not union:
        return 1.0
    return len(tokens1 & tokens2) / len(union)


_shared_dataset = None


def _find_cluster_extremes_shared(cluster, jaccard_threshold):
    """Find a reduced cluster such that each code in the original cluster is similar to at least one code in the reduced cluster.

    Two codes are similar if their Jaccard similarity is above the threshold.
    The code text is read from the module-level ``_shared_dataset``.

    Args:
        cluster (List[dict]):
            cluster is a list of dict, each dict contains the following keys:
                - base_index
                - repo_name
                - path
            This is a typical output of DuplicationIndex.get_duplicate_clusters()
        jaccard_threshold (float):
            threshold for Jaccard similarity.
            Two codes are similar if their Jaccard similarity is above the threshold.

    Returns:
        extremes (List[dict]):
            A reduced representation of the cluster. The field copies is added to each dict.
            The copies field indicates the number of similar codes in the cluster for an extreme.
    """
    extremes = []
    for candidate in cluster:
        candidate_code = _shared_dataset[candidate["base_index"]]["text"]
        for extreme in extremes:
            extreme_code = _shared_dataset[extreme["base_index"]]["text"]
            if jaccard_similarity(candidate_code, extreme_code) >= jaccard_threshold:
                extreme["copies"] += 1
                break
        else:
            # Not similar to any extreme found so far: it becomes a new one.
            candidate["copies"] = 1
            extremes.append(candidate)
    return extremes


def find_extremes(cluster_list, dataset, jaccard_threshold):
    """Call the _find_cluster_extremes_shared function in a parallel fashion.

    Args:
        cluster_list (List[List[Dict]]):
            each cluster is a list of dicts with the key base_index,
            referring to the index of the base code in the dataset.
        dataset (Type[Dataset]):
            dataset is used to access the content of the code snippets,
            using the base_index from the cluster_list.
            dataset is shared between all the processes using a global variable
            (any other way to share the dataset?),
            otherwise the multiprocessing is not sped up.
        jaccard_threshold (float):
            the threshold for the jaccard similarity. The default value is 0.85

    Returns:
        extremes_list (List[List[Dict]]):
            One list of extremes per cluster, in completion order.
            See _find_cluster_extremes_shared for the definition of extremes.
    """
    global _shared_dataset
    _shared_dataset = dataset
    extremes_list = []
    reduce_cluster = partial(_find_cluster_extremes_shared, jaccard_threshold=jaccard_threshold)
    with mp.Pool() as pool:
        for extremes in tqdm(
            pool.imap_unordered(
                reduce_cluster,
                cluster_list,
            ),
            total=len(cluster_list),
        ):
            extremes_list.append(extremes)
    return extremes_list


def deduplicate_dataset(
    dataset: "Type[Dataset]", jaccard_threshold: float = 0.85
) -> "Tuple[Type[Dataset], List[List[Dict]]]":
    """Deduplicate the dataset using minhash and jaccard similarity.

    This function first generates duplicate clusters, then each cluster
    is reduced to the extremes that are similar to the other elements in the cluster.
    Codes are called similar if their Jaccard similarity is greater than jaccard_threshold (0.85 default).

    Args:
        dataset (Type[Dataset]):
            The dataset to deduplicate.
        jaccard_threshold (float, default=0.85):
            jaccard threshold to determine if two codes are similar

    Returns:
        ds_dedup (Type[Dataset]):
            The deduplicated dataset.
        duplicate_clusters (List[List[Dict]]):
            The list of duplicate clusters.
            Each cluster is a list of dicts with the following keys:
            - base_index : int
                The index of the code in the original dataset.
            - repo_name : str
            - path : str
            - copies : int
                The number of copies of the code in the cluster. (find_cluster_extremes)
            - is_extreme : bool
                Whether the code is an extreme in the cluster.
            All the codes in the cluster are removed from the dataset except the extremes.

    Example:
        >>> from datasets import load_dataset
        >>> from minhash_deduplicate import deduplicate_dataset
        >>> ds = load_dataset("lvwerra/codeparrot-clean", split="train")
        >>> ds_dedup, duplicate_clusters = deduplicate_dataset(ds, jaccard_threshold=0.85)
    """
    duplicate_clusters = make_duplicate_clusters(dataset, jaccard_threshold)
    duplicate_indices = {x["base_index"] for cluster in duplicate_clusters for x in cluster}
    extreme_dict = {}
    extremes_clusters = find_extremes(duplicate_clusters, dataset, jaccard_threshold)
    for extremes in extremes_clusters:
        for element in extremes:
            extreme_dict[element["base_index"]] = element
    # Everything that sits in a cluster but is not an extreme gets dropped.
    remove_indices = duplicate_indices - set(extreme_dict.keys())
    ds_filter = dataset.filter(lambda x, idx: idx not in remove_indices, with_indices=True)

    # update duplicate_clusters
    for cluster in duplicate_clusters:
        for element in cluster:
            element["is_extreme"] = element["base_index"] in extreme_dict
            if element["is_extreme"]:
                element["copies"] = extreme_dict[element["base_index"]]["copies"]

    print(f"Original dataset size: {len(dataset)}")
    print(f"Number of duplicate clusters: {len(duplicate_clusters)}")
    print(f"Files in duplicate cluster: {len(duplicate_indices)}")
    print(f"Unique files in duplicate cluster: {len(extreme_dict)}")
    print(f"Filtered dataset size: {len(ds_filter)}")

    return ds_filter, duplicate_clusters
T = TypeVar("T")


class ThreadedIterator(Iterator[T]):
    """Iterate over ``original_iterator`` while a background thread pre-computes its elements.

    A single daemon thread pulls elements from the wrapped iterator and pushes
    them onto a bounded FIFO queue, so elements are delivered in their original
    order. ``None`` is reserved as the end-of-stream marker, therefore the
    wrapped iterator must not yield ``None``.

    An exception raised while producing elements is re-raised, with its
    original traceback, by the ``next()`` call that reaches it; afterwards the
    iterator is exhausted.

    Args:
        original_iterator: The iterator (or iterable) to consume.
        max_queue_size: Maximum number of pre-computed elements held at once.
        enabled: When False no thread is started and elements are read
            directly from ``original_iterator``.
    """

    class _WorkerFailure:
        """Private envelope for an exception raised in the producer thread.

        Using a dedicated type (instead of a plain tuple) means a legitimate
        element such as ``(ValueError(...), x)`` is never mistaken for a failure.
        """

        def __init__(self, error, traceback):
            self.error = error
            self.traceback = traceback

    def __init__(self, original_iterator: Iterator[T], max_queue_size: int = 2, enabled: bool = True):
        self.__is_enabled = enabled
        if enabled:
            self.__queue = queue.Queue(maxsize=max_queue_size)  # type: queue.Queue
            self.__thread = threading.Thread(
                target=self.__worker,
                args=(self.__queue, original_iterator),
                daemon=True,
            )
            self.__thread.start()
        else:
            self.__original_iterator = original_iterator
            # Created lazily by __next__ so that __iter__ keeps returning a
            # fresh iterator over the wrapped object, exactly as before.
            self.__direct_iterator = None

    @staticmethod
    def __worker(out_queue: queue.Queue, original_iterator: Iterator[T]) -> None:
        """Producer loop: forward every element, then the ``None`` end marker."""
        try:
            for element in original_iterator:
                if element is None:
                    # Raised explicitly (not via ``assert``) so the check is
                    # not stripped under ``python -O``; the exception type is
                    # kept for callers that already catch AssertionError.
                    raise AssertionError(
                        'By convention, Iterables wrapped in ThreadedIterator may not contain None.'
                    )
                out_queue.put(element, block=True)
            out_queue.put(None, block=True)
        except Exception as e:
            out_queue.put(ThreadedIterator._WorkerFailure(e, e.__traceback__), block=True)

    def __finish(self) -> None:
        """Join the producer and re-arm the end marker for any later ``next()`` call."""
        self.__thread.join()
        self.__queue.put(None)

    def __next__(self) -> T:
        if not self.__is_enabled:
            # Pass-through mode: there is no queue, read the source directly.
            if self.__direct_iterator is None:
                self.__direct_iterator = iter(self.__original_iterator)
            return next(self.__direct_iterator)

        next_element = self.__queue.get(block=True)
        if next_element is None:
            self.__finish()
            raise StopIteration
        if isinstance(next_element, ThreadedIterator._WorkerFailure):
            # The producer is dead; without re-arming the end marker the next
            # call would block forever on an empty queue.
            self.__finish()
            raise next_element.error.with_traceback(next_element.traceback)
        return next_element

    def __iter__(self):
        if self.__is_enabled:
            return self
        else:
            return iter(self.__original_iterator)
def _compute_min_hash(element):
    """Build the MinHash entry for one enumerated dataset row.

    Args:
        element: An ``(index, data)`` pair where ``data["text"]`` is the code
            to hash.

    Returns:
        ``((index, "norepo", "nopath"), min_hash)`` when ``get_min_hash``
        produces a hash, otherwise ``None``.
    """
    position, record = element

    # Split on non-identifier characters and drop the blank fragments.
    words = []
    for piece in NON_ALPHA.split(record["text"]):
        if len(piece.strip()) > 0:
            words.append(piece)

    signature = get_min_hash(words)
    if signature is None:
        return None
    # Repo name and path are placeholders; only the index identifies the row.
    return (position, "norepo", "nopath"), signature
def jaccard_similarity(code1: str, code2: str) -> float:
    """Compute the Jaccard similarity of two code snippets.

    Both snippets are tokenized with ``get_tokens`` and the result is
    ``|A & B| / |A | B|`` over the two token sets.

    Args:
        code1: First code snippet.
        code2: Second code snippet.

    Returns:
        A value in ``[0.0, 1.0]``. When neither snippet contains any token the
        two (empty) token sets are identical, so ``1.0`` is returned instead of
        dividing by zero.
    """
    tokens1 = get_tokens(code1)
    tokens2 = get_tokens(code2)
    union_size = len(tokens1 | tokens2)
    if union_size == 0:
        # Both token sets are empty: avoid ZeroDivisionError.
        return 1.0
    return len(tokens1 & tokens2) / union_size
def _init_shared_dataset(dataset):
    """Publish ``dataset`` as the module-level ``_shared_dataset`` of the current process."""
    global _shared_dataset
    _shared_dataset = dataset


def find_extremes(cluster_list, dataset, jaccard_threshold):
    """Reduce every cluster to its extremes using a pool of worker processes.

    Args:
        cluster_list (List[List[Dict]]):
            Each cluster is a list of dicts with the key base_index,
            referring to the index of the base code in the dataset.
        dataset (Type[Dataset]):
            Used by the workers to read the content of the code snippets via
            the base_index values from cluster_list. It is shared with the
            workers through the global variable ``_shared_dataset`` rather
            than being sent along with every task.
        jaccard_threshold (float):
            The threshold for the Jaccard similarity.

    Returns:
        extremes_list (List[List[Dict]]):
            One reduced cluster per input cluster, in completion order (the
            pool is consumed with ``imap_unordered``).
            See _find_cluster_extremes_shared for the definition of extremes.
    """
    # Keep the global set in the calling process as well, as before.
    _init_shared_dataset(dataset)
    if not cluster_list:
        # Nothing to reduce: do not start a process pool for zero tasks.
        return []

    reduce_cluster = partial(_find_cluster_extremes_shared, jaccard_threshold=jaccard_threshold)
    # The initializer sets the global inside every worker. Relying on the
    # parent's global alone only works with the "fork" start method; with
    # "spawn" or "forkserver" the workers would see ``_shared_dataset = None``.
    with mp.Pool(initializer=_init_shared_dataset, initargs=(dataset,)) as pool:
        return list(
            tqdm(
                pool.imap_unordered(reduce_cluster, cluster_list),
                total=len(cluster_list),
            )
        )
290 | 291 | Example: 292 | >>> from datasets import load_dataset 293 | >>> from minhash_deduplication import deduplicate_dataset 294 | >>> ds = load_dataset("lvwerra/codeparrot-clean", split="train") 295 | >>> ds_dedup, duplicate_clusters = deduplicate_dataset(ds, jaccard_threshold=0.85) 296 | """ 297 | duplicate_clusters = make_duplicate_clusters(dataset, jaccard_threshold) 298 | duplicate_indices = {x["base_index"] for cluster in duplicate_clusters for x in cluster} 299 | extreme_dict = {} 300 | extremes_clusters = find_extremes(duplicate_clusters, dataset, jaccard_threshold) 301 | for extremes in extremes_clusters: 302 | for element in extremes: 303 | extreme_dict[element["base_index"]] = element 304 | remove_indices = duplicate_indices - set(extreme_dict.keys()) 305 | ds_filter = dataset.filter(lambda x, idx: idx not in remove_indices, with_indices=True) 306 | 307 | # update duplicate_clusters 308 | for cluster in duplicate_clusters: 309 | for element in cluster: 310 | element["is_extreme"] = element["base_index"] in extreme_dict 311 | if element["is_extreme"]: 312 | element["copies"] = extreme_dict[element["base_index"]]["copies"] 313 | 314 | print(f"Original dataset size: {len(dataset)}") 315 | print(f"Number of duplicate clusters: {len(duplicate_clusters)}") 316 | print(f"Files in duplicate cluster: {len(duplicate_indices)}") 317 | print(f"Unique files in duplicate cluster: {len(extreme_dict)}") 318 | print(f"Filtered dataset size: {len(ds_filter)}") 319 | 320 | return ds_filter, duplicate_clusters --------------------------------------------------------------------------------