├── verilog_eval
    ├── verilog_eval
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── data.cpython-311.pyc
    │   │   ├── data.cpython-38.pyc
    │   │   ├── __init__.cpython-311.pyc
    │   │   ├── __init__.cpython-38.pyc
    │   │   ├── evaluation.cpython-38.pyc
    │   │   ├── execution.cpython-311.pyc
    │   │   ├── execution.cpython-38.pyc
    │   │   └── evaluation.cpython-311.pyc
    │   ├── evaluate_functional_correctness.py
    │   ├── data.py
    │   ├── evaluation.py
    │   └── execution.py
    ├── build
    │   └── lib
    │   │   └── verilog_eval
    │   │       ├── __init__.py
    │   │       ├── evaluate_functional_correctness.py
    │   │       ├── data.py
    │   │       ├── evaluation.py
    │   │       └── execution.py
    ├── requirements.txt
    ├── verilog_eval.egg-info
    │   ├── dependency_links.txt
    │   ├── requires.txt
    │   ├── top_level.txt
    │   ├── PKG-INFO
    │   ├── entry_points.txt
    │   └── SOURCES.txt
    ├── dist
    │   └── verilog_eval-1.0-py3.8.egg
    ├── data
    │   ├── human-eval
    │   │   ├── HumanEval.jsonl.gz
    │   │   ├── example_problem.jsonl
    │   │   └── example_samples.jsonl
    │   └── example
    │   │   ├── ExampleSolution.jsonl
    │   │   ├── ExampleSolution.jsonl_reference.jsonl
    │   │   ├── ExampleDescriptions.jsonl
    │   │   └── ExampleEval.jsonl
    ├── setup.py
    ├── Dockerfile
    ├── LICENSE
    └── README.md
├── auto_data_gen_val
    ├── assets
    │   ├── verilog
    │   │   ├── context
    │   │   │   ├── context.optional_features.txt
    │   │   │   ├── system_context_raw.csv
    │   │   │   └── context.fixed_features.txt
    │   │   ├── documented_list.txt
    │   │   └── context_embedding
    │   │   │   └── system_context_embedding.csv
    │   └── xilinx_hls
    │   │   ├── context
    │   │       ├── context.optional_features.txt
    │   │       ├── system_context_raw.csv
    │   │       └── context.fixed_features.txt
    │   │   ├── context_embedding
    │   │       └── system_context_embedding.csv
    │   │   └── documented_list.txt
    ├── clean.sh
    ├── auto_restart_script.sh
    ├── auto_restart_script_1.sh
    ├── test_repo
    │   ├── passthrough.v
    │   └── multiplier.v
    ├── preprocess_data
    │   ├── prepare_example_code_strings.py
    │   ├── .env
    │   ├── process_data
    │   │   ├── dataset_viewer.py
    │   │   └── minhash.py
    │   ├── example_code_strings_simple_instructions.json
    │   ├── example_code_strings_detailed_instructions.json
    │   └── minhash_deduplicate.py
    ├── move_dataset.sh
    ├── .env
    ├── run_all_part.sh
    ├── tool_utils.py
    ├── my_pydantic.py
    ├── gen_detailed_steps.py
    ├── gen_block_summaries_no_comment_exists.py
    ├── gen_verilogeval_baseline_summary.py
    ├── pre_proc_sync.py
    ├── dataset_utils_baseline.py
    ├── verilog_eval_to_part_data.py
    ├── gen_block_summaries.py
    ├── requirements.txt
    ├── code_validate.py
    ├── preliminary_exp.py
    ├── code_preprocesser.py
    ├── gen_global_summary.py
    ├── line_by_line_comments_gen.py
    └── code_repo_documentor.py
├── imgs
    ├── pyverilog_patch.png
    └── mg_verilog_logo-removebg-preview.png
├── .gitignore
├── inference_server_setup
    ├── hf_test.py
    ├── test.py
    └── README.md
├── sft_code
    ├── train_baseline.sh
    ├── train.sh
    ├── train_llm1.sh
    └── train_llm2.sh
├── model_eval_qlora
    ├── gen.sh
    ├── gen_fp.sh
    ├── gen_simple_description.sh
    ├── gen_llm1.sh
    ├── gen_llm2_block_to_code.sh
    └── standalone_eval.py
├── LICENSE
└── document_customized_repo
    ├── decode_results.py
    ├── document_customized_repo.sh
    └── test_dir
        └── priority_encoder.v


/verilog_eval/verilog_eval/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/verilog_eval/build/lib/verilog_eval/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/verilog_eval/requirements.txt:
--------------------------------------------------------------------------------
1 | tqdm
2 | fire
3 | numpy
4 | 


--------------------------------------------------------------------------------
/verilog_eval/verilog_eval.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/assets/verilog/context/context.optional_features.txt:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/assets/verilog/documented_list.txt:
--------------------------------------------------------------------------------
1 | priority_encoder.v


--------------------------------------------------------------------------------
/auto_data_gen_val/assets/xilinx_hls/context/context.optional_features.txt:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/verilog_eval/verilog_eval.egg-info/requires.txt:
--------------------------------------------------------------------------------
1 | tqdm
2 | fire
3 | numpy
4 | 


--------------------------------------------------------------------------------
/verilog_eval/verilog_eval.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | verilog-eval
2 | verilog_eval
3 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/assets/verilog/context/system_context_raw.csv:
--------------------------------------------------------------------------------
1 | Filename,File type,Summary,Text,Line_id
2 | 


--------------------------------------------------------------------------------
/imgs/pyverilog_patch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/imgs/pyverilog_patch.png


--------------------------------------------------------------------------------
/auto_data_gen_val/assets/xilinx_hls/context/system_context_raw.csv:
--------------------------------------------------------------------------------
1 | Filename,File type,Summary,Text,Line_id
2 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/assets/verilog/context_embedding/system_context_embedding.csv:
--------------------------------------------------------------------------------
1 | Filename,embedding,Line_id,Text
2 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/assets/xilinx_hls/context_embedding/system_context_embedding.csv:
--------------------------------------------------------------------------------
1 | Filename,embedding,Line_id,Text
2 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/clean.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Delete everything inside the "converse" directories of both asset trees.
3 | # The paths are relative to the current working directory.
4 | rm -rf assets/verilog/converse/*
5 | rm -rf assets/xilinx_hls/converse/*
6 | 


--------------------------------------------------------------------------------
/imgs/mg_verilog_logo-removebg-preview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/imgs/mg_verilog_logo-removebg-preview.png


--------------------------------------------------------------------------------
/verilog_eval/dist/verilog_eval-1.0-py3.8.egg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/verilog_eval/dist/verilog_eval-1.0-py3.8.egg


--------------------------------------------------------------------------------
/verilog_eval/data/human-eval/HumanEval.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/verilog_eval/data/human-eval/HumanEval.jsonl.gz


--------------------------------------------------------------------------------
/verilog_eval/verilog_eval.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 2.1
2 | Name: verilog-eval
3 | Version: 1.0
4 | Author: NVIDIA
5 | License-File: LICENSE
6 | 


--------------------------------------------------------------------------------
/verilog_eval/verilog_eval.egg-info/entry_points.txt:
--------------------------------------------------------------------------------
1 | [console_scripts]
2 | evaluate_functional_correctness = verilog_eval.evaluate_functional_correctness
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | **/__pycache__
2 | auto_data_gen_val/assets/verilog/code_and_comment_src
3 | auto_data_gen_val/assets/verilog/converse
4 | verilog_eval
5 | tmp
6 | cache


--------------------------------------------------------------------------------
/verilog_eval/verilog_eval/__pycache__/data.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/verilog_eval/verilog_eval/__pycache__/data.cpython-311.pyc


--------------------------------------------------------------------------------
/verilog_eval/verilog_eval/__pycache__/data.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/verilog_eval/verilog_eval/__pycache__/data.cpython-38.pyc


--------------------------------------------------------------------------------
/verilog_eval/verilog_eval/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/verilog_eval/verilog_eval/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/verilog_eval/verilog_eval/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/verilog_eval/verilog_eval/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/verilog_eval/verilog_eval/__pycache__/evaluation.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/verilog_eval/verilog_eval/__pycache__/evaluation.cpython-38.pyc


--------------------------------------------------------------------------------
/verilog_eval/verilog_eval/__pycache__/execution.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/verilog_eval/verilog_eval/__pycache__/execution.cpython-311.pyc


--------------------------------------------------------------------------------
/verilog_eval/verilog_eval/__pycache__/execution.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/verilog_eval/verilog_eval/__pycache__/execution.cpython-38.pyc


--------------------------------------------------------------------------------
/verilog_eval/verilog_eval/__pycache__/evaluation.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GATECH-EIC/mg-verilog/HEAD/verilog_eval/verilog_eval/__pycache__/evaluation.cpython-311.pyc


--------------------------------------------------------------------------------
/auto_data_gen_val/auto_restart_script.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Activate the "tvm" conda environment, then re-run line_by_line_comments_gen.py
3 | # until it exits successfully.
4 | source /home/user_name/init_conda.sh
5 | conda activate tvm
6 | # Every attempt receives eight "n" lines on stdin -- presumably answers to
7 | # interactive prompts of the script; confirm against line_by_line_comments_gen.py.
8 | while true; do echo -e "n\nn\nn\nn\nn\nn\nn\nn\n" | python line_by_line_comments_gen.py && break; done
9 | 


--------------------------------------------------------------------------------
/verilog_eval/data/human-eval/example_problem.jsonl:
--------------------------------------------------------------------------------
1 | {"task_id": "test/0", "prompt": "def return1():\n", "canonical_solution": "    return 1", "test": "def check(candidate):\n    assert candidate() == 1", "entry_point": "return1"}
2 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/auto_restart_script_1.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | source /home/user_name/init_conda.sh
3 | conda activate tvm
4 | #$1: python script to run; $2 and $3: arguments forwarded to it
5 | #(presumably start_id and end_id -- confirm against the script being run)
6 | #the command is retried until it exits successfully
7 | echo "python $1 $2 $3"
8 | while true; do python $1 $2 $3 && break; done
9 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/test_repo/passthrough.v:
--------------------------------------------------------------------------------
 1 | module passthrough(
 2 |   clk,
 3 |   rst,
 4 |   op_din_en,
 5 |   op_din_eop,
 6 |   op_din,
 7 |   op_dout
 8 | );
 9 | // Combinational pass-through: op_dout always mirrors op_din.
10 | parameter Q = 8;    // not referenced inside this module
11 | parameter RELU = 0; // not referenced inside this module
12 | 
13 | input clk;        // clk, rst, op_din_en and op_din_eop are declared but never read here
14 | input rst;
15 | input op_din_en;
16 | input op_din_eop;
17 | input [15:0] op_din;
18 | output [15:0] op_dout;
19 | 
20 | assign op_dout = op_din;
21 | 
22 | endmodule
23 | 


--------------------------------------------------------------------------------
/verilog_eval/data/human-eval/example_samples.jsonl:
--------------------------------------------------------------------------------
1 | {"task_id": "test/0", "completion": "    import subprocess\n    subprocess.check_output('rm -rf tmp')"}
2 | {"task_id": "test/0", "completion": "    import time\n    time.sleep(10)\n    return 1"}
3 | {"task_id": "test/0", "completion": "    return input('enter a number')"}
4 | {"task_id": "test/0", "completion": "    return 1"}
5 | {"task_id": "test/0", "completion": "  return 1"}
6 | {"task_id": "test/0", "completion": "\treturn 1"}
7 | 


--------------------------------------------------------------------------------
/verilog_eval/verilog_eval.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
 1 | LICENSE
 2 | README.md
 3 | setup.py
 4 | verilog_eval/__init__.py
 5 | verilog_eval/data.py
 6 | verilog_eval/evaluate_functional_correctness.py
 7 | verilog_eval/evaluation.py
 8 | verilog_eval/execution.py
 9 | verilog_eval.egg-info/PKG-INFO
10 | verilog_eval.egg-info/SOURCES.txt
11 | verilog_eval.egg-info/dependency_links.txt
12 | verilog_eval.egg-info/entry_points.txt
13 | verilog_eval.egg-info/requires.txt
14 | verilog_eval.egg-info/top_level.txt


--------------------------------------------------------------------------------
/auto_data_gen_val/assets/xilinx_hls/context/context.fixed_features.txt:
--------------------------------------------------------------------------------
1 | I am training/fine-tuning an LLM to assist with hardware code (Xilinx HLS) generation.
2 | You are helping me prepare the training data, consisting of code block and comment pairs.
3 | You will help me document hardware code with comments, line by line.
4 | Do not add overly obvious comments; only add a comment when you think it is informative.
5 | Do not add a comment immediately after parameters or template parameters.
6 | For arguments, only add a single line comment at the beginning.
7 | You will also help me to decide whether multiple lines of code can be combined together as a code block.


--------------------------------------------------------------------------------
/verilog_eval/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import pkg_resources
 4 | from setuptools import setup, find_packages
 5 | 
 6 | 
 7 | setup(
 8 |     name="verilog-eval",
 9 |     py_modules=["verilog-eval"],
10 |     version="1.0",
11 |     description="",
12 |     author="NVIDIA",
13 |     packages=find_packages(),
14 |     install_requires=[
15 |         str(r)
16 |         for r in pkg_resources.parse_requirements(
17 |             open(os.path.join(os.path.dirname(__file__), "requirements.txt"))
18 |         )
19 |     ],
20 |     entry_points={
21 |         "console_scripts": [
22 |             "evaluate_functional_correctness = verilog_eval.evaluate_functional_correctness",
23 |         ]
24 |     }
25 | )
26 | 


--------------------------------------------------------------------------------
/inference_server_setup/hf_test.py:
--------------------------------------------------------------------------------
 1 | from transformers import AutoTokenizer
 2 | import transformers
 3 | import torch
 4 | 
 5 | model = "codellama/CodeLlama-34b-hf"
 6 | 
 7 | 
 8 | tokenizer = AutoTokenizer.from_pretrained(model)
 9 | pipeline = transformers.pipeline(
10 |     "text-generation",
11 |     model=model,
12 |     torch_dtype=torch.float16,
13 |     device_map="auto",
14 | )
15 | 
16 | sequences = pipeline(
17 |     "// Vivado HDL program to do 4x4 matrix multiplication",
18 |     do_sample=True,
19 |     top_k=10,
20 |     temperature=0.1,
21 |     top_p=0.95,
22 |     num_return_sequences=1,
23 |     eos_token_id=tokenizer.eos_token_id,
24 |     max_length=200,
25 | )
26 | for seq in sequences:
27 |     print(f"Result: {seq['generated_text']}")


--------------------------------------------------------------------------------
/sft_code/train_baseline.sh:
--------------------------------------------------------------------------------
 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3
 2 | # Fine-tuning launcher: torchrun starts qlora.py as 4 processes on the 4 visible GPUs.
 3 | # Alternative launcher (disabled): accelerate launch qlora.py
 4 | 
 5 | WORLD_SIZE=4 torchrun --nproc_per_node=4 qlora.py \
 6 |                       --model_name_or_path codellama/CodeLlama-7b-Instruct-hf \
 7 |                       --source_max_len 2048 \
 8 |                       --target_max_len 1024 \
 9 |                       --output_dir ./data/Verilog_code_generation/new_baseline_verilogeval_global_summary \
10 |                       --dataset_dir /data/user_name_data/user_name/sft_dataset/new_baseline_verilogeval_global_summary \
11 |                       --cache_dir /data/user_name_data/user_name/HF_cache \
12 |                       --gradient_accumulation_steps 4 \
13 |                       --save_steps 500
14 | 
15 | 


--------------------------------------------------------------------------------
/sft_code/train.sh:
--------------------------------------------------------------------------------
 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3
 2 | 
 3 | # Fine-tuning launcher: torchrun starts qlora.py as 4 processes on the 4 visible GPUs.
 4 | # OUTPUT_DIR must be provided by the caller; abort early instead of silently
 5 | # resolving the checkpoint and dataset paths against "/".
 6 | : "${OUTPUT_DIR:?OUTPUT_DIR must be set}"
 7 | 
 8 | # Alternative launcher (disabled): accelerate launch qlora.py
 9 | 
10 | WORLD_SIZE=4 torchrun --nproc_per_node=4 qlora.py \
11 |                       --model_name_or_path codellama/CodeLlama-7b-Instruct-hf \
12 |                       --hf_token "your_hf_token_if_you_want_to_use_it" \
13 |                       --source_max_len 2048 \
14 |                       --target_max_len 1024 \
15 |                       --output_dir "$OUTPUT_DIR/data/Verilog_code_generation/checkpoint_dir" \
16 |                       --dataset_dir "$OUTPUT_DIR/packaged_dataset/merged_dataset" \
17 |                       --cache_dir /data/user_name_data/user_name/HF_cache \
18 |                       --gradient_accumulation_steps 4 \
19 |                       --save_steps 500
20 | 
21 | 


--------------------------------------------------------------------------------
/sft_code/train_llm1.sh:
--------------------------------------------------------------------------------
 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3
 2 | # Fine-tuning launcher: torchrun starts qlora.py as 4 processes on the 4 visible GPUs.
 3 | # Alternative launcher (disabled): accelerate launch qlora.py
 4 | 
 5 | WORLD_SIZE=4 torchrun --nproc_per_node=4 qlora.py \
 6 |                       --model_name_or_path meta-llama/Llama-2-7b-chat-hf \
 7 |                       --source_max_len 1024 \
 8 |                       --target_max_len 2048 \
 9 |                       --output_dir ./data/Verilog_code_generation/llm1_new_verilogeval_global_summary_to_block_summary_skip_single_block \
10 |                       --dataset_dir /data/user_name_data/user_name/sft_dataset/llm1_new_verilogeval_global_summary_to_block_summary_skip_single_block \
11 |                       --cache_dir /data/user_name_data/user_name/HF_cache \
12 |                       --gradient_accumulation_steps 4 \
13 |                       --save_steps 500
14 | 
15 | 


--------------------------------------------------------------------------------
/sft_code/train_llm2.sh:
--------------------------------------------------------------------------------
 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3
 2 | # Fine-tuning launcher: torchrun starts qlora.py as 4 processes on the 4 visible GPUs.
 3 | # Alternative launcher (disabled): accelerate launch qlora.py
 4 | 
 5 | WORLD_SIZE=4 torchrun --nproc_per_node=4 qlora.py \
 6 |                       --model_name_or_path codellama/CodeLlama-7b-Instruct-hf \
 7 |                       --source_max_len 2048 \
 8 |                       --target_max_len 1024 \
 9 |                       --output_dir ./data/Verilog_code_generation/llm2_block_summary_plus_new_verilogeval_global_summary_to_pure_code \
10 |                       --dataset_dir /data/user_name_data/user_name/sft_dataset/llm2_block_summary_plus_new_verilogeval_global_summary_to_pure_code \
11 |                       --cache_dir /data/user_name_data/user_name/HF_cache \
12 |                       --gradient_accumulation_steps 4 \
13 |                       --save_steps 500
14 | 
15 | 


--------------------------------------------------------------------------------
/verilog_eval/data/example/ExampleSolution.jsonl:
--------------------------------------------------------------------------------
1 | {"task_id": "gatesv", "completion": "\n\tassign out_both = in[2:0] & in[3:1];\n\tassign out_any = in[2:0] | in[3:1];\n\tassign out_different = in^{in[0], in[3:1]};\n\t\nendmodule\n"}
2 | {"task_id": "gatesv", "completion": "\twire [3:0] any, both;\n\twire [3:0] different;\n\n\tassign both = in[3:2] & in[1:0];\n\tassign any = in[3:2] | in[1:0];\n\tassign different = ~in[0] & in[3:1];\n\n\tassign out_both = both[1:0];\n\tassign out_any = any[1:0];\n\tassign out_different = different[2:0];\nendmodule\n"}
3 | {"task_id": "vector4", "completion": "\n\tassign out = { {24{in[7]}}, in };\n\t\nendmodule\n"}
4 | {"task_id": "vector4", "completion": "\tassign out = { 24{in}, in };\nendmodule\n"}
5 | {"task_id": "zero", "completion": "\t\n\tassign zero = 1'b0;\n\nendmodule\n"}
6 | {"task_id": "zero", "completion": "   assign zero = 1;\nendmodule\n"}
7 | 


--------------------------------------------------------------------------------
/model_eval_qlora/gen.sh:
--------------------------------------------------------------------------------
 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3
 2 | 
 3 | export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64'
 4 | 
 5 | # Runs generate2.py under `accelerate launch --multi_gpu` on the 4 visible GPUs.
 6 | # OUTPUT_DIR must be provided by the caller; abort early instead of silently
 7 | # resolving the description and output paths against "/".
 8 | : "${OUTPUT_DIR:?OUTPUT_DIR must be set}"
 9 | 
10 | accelerate launch --multi_gpu generate2.py \
11 |     --checkpoint_dir ./result_ckpt \
12 |     --model_type "qlora" \
13 |     --base_model "codellama/CodeLlama-7b-Instruct-hf" \
14 |     --tokenizer_type "code_llama" \
15 |     --cache_dir "/home/user_name/HF_cache/" \
16 |     --hf_token "your_hf_token_if_you_want_to_use_it" \
17 |     --max_new_tokens 1024 \
18 |     --temperature 0.7 \
19 |     --desc_file "$OUTPUT_DIR/benchmark_packaged_dataset/hdlbits_for_llm2_eval.jsonl" \
20 |     --desc_key "block_to_code_description" \
21 |     --prompt_type "llm2_block_to_code" \
22 |     --eval_file ../verilog_eval/data/VerilogEval_Machine.jsonl \
23 |     --output_file "$OUTPUT_DIR/data/gen.jsonl" \
24 |     --fp16 \
25 |     --sample_k 20 \
26 |     --result_name Test \
27 |     --batch_size 2 
28 | 


--------------------------------------------------------------------------------
/model_eval_qlora/gen_fp.sh:
--------------------------------------------------------------------------------
 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3
 2 | # Runs generate2_vanilla.py under `accelerate launch` with 4 processes.
 3 | export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64'
 4 | # The commented block below is a disabled invocation of generate2.py.
 5 | # accelerate launch --num_processes 4 generate2.py \
 6 | # # python generate.py \
 7 | #     --model_name ./gpu5/output \
 8 | #     --model_type "qlora" \
 9 | #     --base_model "codellama/CodeLlama-7b-Instruct-hf" \
10 | #     --fp16 \
11 | #     --sample_k 20 \
12 | #     --result_name Test \
13 | #     --batch_size 2
14 | #     # --bf16 \
15 | #     # --desc_file ./verilog_eval/desc_mini.jsonl \
16 | #     # --eval_file ./verilog_eval/eval_mini.jsonl \
17 | # NOTE(review): the `--batch_size 1 \` line below ends in a continuation followed only by commented-out options; it looks harmless but could be dropped -- confirm.
18 | accelerate launch --multi_gpu --num_processes 4 generate2_vanilla.py \
19 |     --model_type "qlora" \
20 |     --base_model "codellama/CodeLlama-7b-Instruct-hf" \
21 |     --bf16 \
22 |     --sample_k 10 \
23 |     --result_name Test \
24 |     --batch_size 1 \
25 |     # --desc_file ./verilog_eval/desc_mini.jsonl \
26 |     # --eval_file ./verilog_eval/eval_mini.jsonl \
27 |     # --skip_gen \
28 |     # --bf16 \
29 | 


--------------------------------------------------------------------------------
/inference_server_setup/test.py:
--------------------------------------------------------------------------------
 1 | from langchain.llms import HuggingFaceTextGenInference
 2 | from langchain.prompts import PromptTemplate
 3 | from langchain.chains import LLMChain
 4 | 
 5 | # LLM inference
 6 | llm = HuggingFaceTextGenInference(
 7 |     inference_server_url="http://130.207.125.98:8080/",
 8 |     max_new_tokens=128,
 9 |     # top_k=10,
10 |     # top_p=0.95,
11 |     # typical_p=0.95,
12 |     # temperature=0.9,
13 |     # repetition_penalty=1.15
14 | )
15 | 
16 | 
17 | llama2_prompt ="""
18 |     <s>[INST] <<SYS>>
19 |     {system_message}
20 |     <</SYS>>
21 | 
22 |     hello, I am test [/INST] I'm a large language model, so I don't have feelings like humans do, but I'm always happy to chat with you. Is there something specific you'd like to talk about or ask me? I'm here to help with any questions you might have. </s><s>[INST] {human_input} [/INST]
23 | """
24 | 
25 | 
26 | 
27 | output = llm(llama2_prompt.format(system_message="You are a Chatbot", human_input="Hello, do you know what time it is?"))
28 | print(output)
29 | 


--------------------------------------------------------------------------------
/verilog_eval/Dockerfile:
--------------------------------------------------------------------------------
 1 | # NVIDIA PyTorch base image plus Python tooling and Icarus Verilog built from source.
 2 | FROM nvcr.io/nvidia/pytorch:22.08-py3
 3 | LABEL maintainer="Mingjie Liu <mingjiel@nvidia.com>"
 4 | RUN echo "alias python=python3" >> ~/.bashrc \
 5 |         && echo "alias pip=pip3" >> ~/.bashrc
 6 | RUN apt-get -y update \
 7 |         && apt-get -y install vim 
 8 | # -y keeps apt-get non-interactive; a confirmation prompt would abort the build.
 9 | RUN apt-get install -y wget
10 | RUN apt-get install -y autoconf gperf flex bison screen
11 | RUN python -m pip install --upgrade pip
12 | RUN python -m pip install deepspeed scikit-learn pandas numpy scipy wandb
13 | # Requirement specifiers are quoted: unquoted, the shell treats ">=..." as an output redirection.
14 | RUN python -m pip install "accelerate>=0.12.0" "torch>=1.3" "datasets>=1.8.0" "sentencepiece!=0.1.92" protobuf evaluate
15 | RUN python -m pip install git+https://github.com/huggingface/transformers/
16 | # Build Icarus Verilog from a pinned commit.
17 | RUN git clone https://github.com/steveicarus/iverilog.git && cd iverilog \
18 |         && git checkout 01441687235135d1c12eeef920f75d97995da333 \
19 |         && sh ./autoconf.sh && ./configure && make -j4\
20 |         && make install
21 | RUN python -m pip install jupyterlab
22 | RUN python -m pip install openai tiktoken
23 | ENV SHELL=/bin/bash


--------------------------------------------------------------------------------
/model_eval_qlora/gen_simple_description.sh:
--------------------------------------------------------------------------------
 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3
 2 | # Runs generate2.py under `accelerate launch --multi_gpu` with the "baseline" prompt type.
 3 | export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64'
 4 | # Descriptions are read from --desc_file under the "detail_description" key.
 5 | accelerate launch --multi_gpu generate2.py \
 6 |     --checkpoint_dir /home/user_name/DAC_2024/checkpoint/merged_dataset/checkpoint-15000 \
 7 |     --model_type "qlora" \
 8 |     --base_model "codellama/CodeLlama-7b-Instruct-hf" \
 9 |     --tokenizer_type "code_llama" \
10 |     --cache_dir "/home/user_name/HF_cache/" \
11 |     --hf_token "your_hf_token_if_you_want_to_use_it" \
12 |     --max_new_tokens 1024 \
13 |     --temperature 0.7 \
14 |     --top_p 0.95 \
15 |     --desc_file /home/user_name/DAC_2024/chatgpt4_auto_accel/fine_tune_dataset/auto_doc_part_dataset/hdlbits_description.jsonl \
16 |     --desc_key "detail_description" \
17 |     --prompt_type "baseline" \
18 |     --eval_file ../verilog_eval/data/VerilogEval_Machine.jsonl \
19 |     --output_file ./data/gen.merged_dataset+hdlbits_description.jsonl \
20 |     --fp16 \
21 |     --sample_k 10 \
22 |     --result_name "merged_dataset+hdlbits_description" \
23 |     --batch_size 2 


--------------------------------------------------------------------------------
/auto_data_gen_val/preprocess_data/prepare_example_code_strings.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import shutil
 4 | import pandas as pd
 5 | import json
 6 | import jsonlines
 7 | 
 8 | 
 9 | if __name__ == "__main__":
10 |     task_ids = [
11 |          "shift18",
12 |          "rule110",
13 |          "lemmings1",
14 |          "fsm3onehot"
15 |     ]
16 |     
17 | 
18 |     example_code_strings_name = "example_code_strings_detailed_instructions.json"
19 |     eval_file = "../../verilog_eval/data/VerilogEval_Machine.jsonl"
20 |     eval_dict = {}
21 |     with jsonlines.open(eval_file) as reader:
22 |         for obj in reader:
23 |             eval_dict[obj["task_id"]] = {}
24 |             eval_dict[obj["task_id"]]["code"] = obj["prompt"] + obj["canonical_solution"]
25 | 
26 |     #store in a json string 
27 |     example_code_strings = {}
28 |     for task_id in task_ids:
29 |         example_code_strings[task_id] = eval_dict[task_id]["code"]
30 |     #store in a json file
31 |     with open(example_code_strings_name, "w") as f:
32 |         json.dump(example_code_strings, f, indent=4)
33 |         
34 | 


--------------------------------------------------------------------------------
/verilog_eval/verilog_eval/evaluate_functional_correctness.py:
--------------------------------------------------------------------------------
 1 | import fire
 2 | import sys
 3 | 
 4 | from verilog_eval.evaluation import evaluate_functional_correctness
 5 | 
 6 | 
 7 | def entry_point(
 8 |     sample_file: str,
 9 |     problem_file: str,
10 |     k: str = "1,5,10",
11 |     n_workers: int = 32,
12 |     timeout: float = 30.0,
13 |     unit_test: bool = False,
14 |     clean_up: bool = True,
15 | ):
16 |     """
17 |     Evaluates the functional correctness of generated samples, and writes
18 |     results to f"{sample_file}_results.jsonl.gz"
19 |     """
20 |     
21 |     #routines to separate the results by "eval_type" entry
22 | 
23 |     if type(k) == tuple:
24 |         k = list(k)
25 |     else:
26 |         k = list(map(int, k.split(",")))
27 |     results = evaluate_functional_correctness(sample_file, problem_file, k, n_workers, timeout, unit_test, clean_up)
28 |     print(results)
29 | 
30 |     #verilator evaluation
31 | 
32 |     #customized iverilog evaluation
33 | 
34 |     #combine the results
35 | # CLI entry: fire.Fire exposes entry_point's parameters as command-line arguments.
36 | def main():
37 |     fire.Fire(entry_point)
38 | 
39 | # NOTE(review): executes on import as well (no __main__ guard); verify nothing depends on that before changing it.
40 | sys.exit(main())
41 | 


--------------------------------------------------------------------------------
/verilog_eval/build/lib/verilog_eval/evaluate_functional_correctness.py:
--------------------------------------------------------------------------------
 1 | import fire
 2 | import sys
 3 | 
 4 | from verilog_eval.evaluation import evaluate_functional_correctness
 5 | 
 6 | 
 7 | def entry_point(
 8 |     sample_file: str,
 9 |     problem_file: str,
10 |     k: str = "1,5,10",
11 |     n_workers: int = 32,
12 |     timeout: float = 30.0,
13 |     unit_test: bool = False,
14 |     clean_up: bool = True,
15 | ):
16 |     """
17 |     Evaluates the functional correctness of generated samples, and writes
18 |     results to f"{sample_file}_results.jsonl.gz"
19 |     """
20 |     
21 |     #routines to separate the results by "eval_type" entry
22 | 
23 |     if type(k) == tuple:
24 |         k = list(k)
25 |     else:
26 |         k = list(map(int, k.split(",")))
27 |     results = evaluate_functional_correctness(sample_file, problem_file, k, n_workers, timeout, unit_test, clean_up)
28 |     print(results)
29 | 
30 |     #verilator evaluation
31 | 
32 |     #customized iverilog evaluation
33 | 
34 |     #combine the results
35 | 
def main():
    """Expose ``entry_point`` as a command-line interface through ``fire.Fire``."""
    fire.Fire(entry_point)


# Runs at import time (there is no ``__main__`` guard), so merely importing this
# module starts the CLI and then exits the interpreter.
# NOTE(review): presumably a module-style console-script entry point relies on
# this -- confirm against setup.py / entry_points.txt before adding a guard.
sys.exit(main())
41 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/preprocess_data/.env:
--------------------------------------------------------------------------------
 1 | #xilinx_hls or verilog
 2 | TARGET_LANG="verilog"
 3 | CHATBOT_BACKEND_DIR="../"
 4 | OPENAI_API_KEY="your_openai_key_if_you_want_to_use_it"
 5 | #context and embedding 
 6 | ASSET_DIR="../assets"
 7 | CONVERSE_DIR="${ASSET_DIR}/${TARGET_LANG}/converse"
 8 | SYSTEM_CONTEXT_DIR="${ASSET_DIR}/${TARGET_LANG}/context"
 9 | SRC_DIR="./"
10 | SYSTEM_CONTEXT_EMBEDDING_DIR="${ASSET_DIR}/${TARGET_LANG}/context_embedding"
11 | #raw code processing
12 | STORE_SRC_CODE_DIR="${ASSET_DIR}/${TARGET_LANG}/code_and_comment_src/raw_src/raw_code_src"
13 | CSV_CODE_DIR="${ASSET_DIR}/${TARGET_LANG}/code_and_comment_src/csv_src/csv_code_src"
14 | CSV_COMMENT_DIR="${ASSET_DIR}/${TARGET_LANG}/code_and_comment_src/csv_src/csv_comment_src"
15 | CSV_NEW_COMMENT_DIR="${ASSET_DIR}/${TARGET_LANG}/code_and_comment_src/csv_src/csv_new_comment_src"
16 | CSV_PURE_GEN_COMMENT_DIR="${ASSET_DIR}/${TARGET_LANG}/code_and_comment_src/csv_src/csv_pure_gen_comment_src"
17 | CODE_SUMMARY_DIR="${ASSET_DIR}/${TARGET_LANG}/code_and_comment_src/code_summary"
18 | DOCUMENTED_CODE_DIR="${ASSET_DIR}/${TARGET_LANG}/code_and_comment_src/documented_code_src"
19 | 
20 | 


--------------------------------------------------------------------------------
/model_eval_qlora/gen_llm1.sh:
--------------------------------------------------------------------------------
 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3
 2 | 
 3 | export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64'
 4 | 
 5 | accelerate launch --multi_gpu --num_processes 4 generate2.py \
 6 |     --checkpoint_dir /home/user_name/DAC_2024/checkpoint/llm1_high_level_summary_to_block_summary_dataset_skip_single_blocks_usage_summary_combined_better_formating_2/checkpoint-9000 \
 7 |     --model_type "qlora" \
 8 |     --base_model "meta-llama/Llama-2-7b-chat-hf" \
 9 |     --tokenizer_type "llama" \
10 |     --cache_dir "/home/user_name/HF_cache/" \
11 |     --hf_token "your_hf_token_if_you_want_to_use_it" \
12 |     --max_new_tokens 2048 \
13 |     --temperature 0.7 \
14 |     --top_p 0.1 \
15 |     --top_k 40 \
16 |     --repetition_penalty 1.17 \
17 |     --desc_file ../verilog_eval/descriptions/VerilogDescription_Machine.jsonl \
18 |     --desc_key "detail_description" \
19 |     --prompt_type "llm1" \
20 |     --eval_file ../verilog_eval/data/VerilogEval_Machine.jsonl \
21 |     --output_file ./data/gen.llm1.jsonl \
22 |     --fp16 \
23 |     --sample_k 10 \
24 |     --result_name Test \
25 |     --batch_size 2  \
26 |     --skip_iverilog 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Yongan Zhang 
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/model_eval_qlora/gen_llm2_block_to_code.sh:
--------------------------------------------------------------------------------
 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3
 2 | 
 3 | export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64'
 4 | 
 5 | accelerate launch --multi_gpu generate2.py \
 6 |     --checkpoint_dir /home/user_name/DAC_2024/checkpoint/llm2_new_block_summary_to_pure_code/checkpoint-9500 \
 7 |     --model_type "qlora" \
 8 |     --base_model "codellama/CodeLlama-7b-Instruct-hf" \
 9 |     --tokenizer_type "code_llama" \
10 |     --cache_dir "/home/user_name/HF_cache/" \
11 |     --hf_token "your_hf_token_if_you_want_to_use_it" \
12 |     --max_new_tokens 1024 \
13 |     --temperature 0.6 \
14 |     --top_p 0.95 \
15 |     --desc_file /home/user_name/DAC_2024/chatgpt4_auto_accel/fine_tune_dataset/auto_doc_part_dataset/hdlbits_description_simple_description.jsonl \
16 |     --desc_key "simple_description" \
17 |     --prompt_type "baseline" \
18 |     --eval_file ../verilog_eval/data/VerilogEval_Machine.jsonl \
19 |     --output_file ./data/gen.llm2_new_block_summary_to_pure_code+simple_description.jsonl \
20 |     --fp16 \
21 |     --sample_k 10 \
22 |     --result_name "llm2_new_block_summary_to_pure_code+simple_description" \
23 |     --batch_size 2 


--------------------------------------------------------------------------------
/verilog_eval/data/example/ExampleSolution.jsonl_reference.jsonl:
--------------------------------------------------------------------------------
1 | {"task_id": "gatesv", "completion": "\n\tassign out_both = in[2:0] & in[3:1];\n\tassign out_any = in[2:0] | in[3:1];\n\tassign out_different = in^{in[0], in[3:1]};\n\t\nendmodule\n", "result": "passed", "passed": true}
2 | {"task_id": "gatesv", "completion": "\twire [3:0] any, both;\n\twire [3:0] different;\n\n\tassign both = in[3:2] & in[1:0];\n\tassign any = in[3:2] | in[1:0];\n\tassign different = ~in[0] & in[3:1];\n\n\tassign out_both = both[1:0];\n\tassign out_any = any[1:0];\n\tassign out_different = different[2:0];\nendmodule\n", "result": "failed: 207 out of 213 samples.", "passed": false}
3 | {"task_id": "vector4", "completion": "\n\tassign out = { {24{in[7]}}, in };\n\t\nendmodule\n", "result": "passed", "passed": true}
4 | {"task_id": "vector4", "completion": "\tassign out = { 24{in}, in };\nendmodule\n", "result": "failed: syntax error.", "passed": false}
5 | {"task_id": "zero", "completion": "\t\n\tassign zero = 1'b0;\n\nendmodule\n", "result": "passed", "passed": true}
6 | {"task_id": "zero", "completion": "   assign zero = 1;\nendmodule\n", "result": "failed: 20 out of 20 samples.", "passed": false}
7 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/move_dataset.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | mode=$1
 4 | part_num=$2
 5 | 
 6 | if [ "$mode" = 0 ]; then
 7 |     folder_to_remove=/home/user_name/DAC_2024/ckpts/test_10_30_${part_num}_complete
 8 |     echo "Removing $folder_to_remove"
 9 |     rm -rf $folder_to_remove
10 |     mkdir $folder_to_remove
11 |     echo "Copying assets to $folder_to_remove"
12 |     cp -r assets $folder_to_remove
13 |     echo "Copying code_vec_store to $folder_to_remove"
14 |     cp -r ../code_vec_store $folder_to_remove
15 |     echo "Copying documented_code to $folder_to_remove"
16 |     cp -r documented_code $folder_to_remove
17 | elif [ "$mode" = 1 ]; then
18 |     assets_dir=/home/user_name/DAC_2024/ckpts/test_10_30_${part_num}_complete/assets
19 |     code_vec_store_dir=/home/user_name/DAC_2024/ckpts/test_10_30_${part_num}_complete/code_vec_store
20 |     echo "Copying assets from $assets_dir to assets"
21 |     rm -rf assets
22 |     cp -r $assets_dir assets
23 |     # rm -rf ../code_vec_store
24 |     # echo "Copying code_vec_store from $code_vec_store_dir to ../code_vec_store"
25 |     # cp -r $code_vec_store_dir ../code_vec_store
26 |     rm -rf documented_code/*
27 |     ./clean.sh
28 | fi
29 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/assets/xilinx_hls/documented_list.txt:
--------------------------------------------------------------------------------
 1 | write_out_stream_indirect.cpp
 2 | layernorm_accumulate.cpp
 3 | linear_weights_ping[ceildiv.cpp
 4 | write_attn.cpp
 5 | compute_patch_embed.cpp
 6 | patch_embed_accumulate_compute.cpp
 7 | prepare_attn.cpp
 8 | top_k.cpp
 9 | read_gate_inp.cpp
10 | write_attn_matmul_v.cpp
11 | read_attn_softmax_info.cpp
12 | read_kv.cpp
13 | compute_gating.cpp
14 | write_out_stream_direct.cpp
15 | read_x.cpp
16 | compute_q_matmul_k.cpp
17 | load_norms.cpp
18 | write_out_stream.cpp
19 | patch_embed_accumulate_read.cpp
20 | compute_add.cpp
21 | compute_norm1.cpp
22 | compute_norm2.cpp
23 | compute_gating_for_patch.cpp
24 | compute_norm.cpp
25 | compute_linear.cpp
26 | write_gate_results.cpp
27 | load_one_time_weights.cpp
28 | finalize_topk_scores_softmax.cpp
29 | load_linear_bias<wt_bias_t>.cpp
30 | read_attn.cpp
31 | load_w_gate.cpp
32 | ViT_compute.cpp
33 | patch_embed_output.cpp
34 | compute_linear_on_stream.cpp
35 | compute_attn_matmul_v.cpp
36 | zero_output.cpp
37 | read_in_stream.cpp
38 | finalize_attn.cpp
39 | layernorm_output.cpp
40 | read_in_stream_indirect.cpp
41 | compute_moe.cpp
42 | read_in_stream_direct.cpp
43 | write_attn_softmax_info.cpp
44 | update_softmax_info.cpp
45 | patch_embed_accumulate.cpp


--------------------------------------------------------------------------------
/auto_data_gen_val/.env:
--------------------------------------------------------------------------------
 1 | #xilinx_hls or verilog
 2 | TARGET_LANG="verilog"
 3 | CHATBOT_BACKEND_DIR="${DATA4AIGCHIP_HOME}/auto_data_gen_val"
 4 | OPENAI_API_KEY="your_openai_api_key"
 5 | LLAMA_INFERENCE_SERVER_URL="http://your.server.ip:port/"
 6 | #context and embedding 
 7 | ASSET_DIR="${CHATBOT_BACKEND_DIR}/assets"
 8 | CONVERSE_DIR="${ASSET_DIR}/${TARGET_LANG}/converse"
 9 | SYSTEM_CONTEXT_DIR="${ASSET_DIR}/${TARGET_LANG}/context"
10 | SRC_DIR="${CHATBOT_BACKEND_DIR}/"
11 | SYSTEM_CONTEXT_EMBEDDING_DIR="${ASSET_DIR}/${TARGET_LANG}/context_embedding"
12 | #raw code processing
13 | STORE_SRC_CODE_DIR="${ASSET_DIR}/${TARGET_LANG}/code_and_comment_src/raw_src/raw_code_src"
14 | CSV_CODE_DIR="${ASSET_DIR}/${TARGET_LANG}/code_and_comment_src/csv_src/csv_code_src"
15 | CSV_COMMENT_DIR="${ASSET_DIR}/${TARGET_LANG}/code_and_comment_src/csv_src/csv_comment_src"
16 | CSV_NEW_COMMENT_DIR="${ASSET_DIR}/${TARGET_LANG}/code_and_comment_src/csv_src/csv_new_comment_src"
17 | CSV_PURE_GEN_COMMENT_DIR="${ASSET_DIR}/${TARGET_LANG}/code_and_comment_src/csv_src/csv_pure_gen_comment_src"
18 | CODE_SUMMARY_DIR="${ASSET_DIR}/${TARGET_LANG}/code_and_comment_src/code_summary"
19 | DOCUMENTED_CODE_DIR="${ASSET_DIR}/${TARGET_LANG}/code_and_comment_src/documented_code_src"
20 | 
21 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/preprocess_data/process_data/dataset_viewer.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | sys.path.append(os.path.abspath("../../"))
 4 | from datasets import load_dataset, load_from_disk, Dataset
 5 | import uuid
 6 | 
 7 | # import sys
 8 | # sys.path.append("../finetuning/")
 9 | # from llama import Tokenizer
10 | import tiktoken
11 | 
12 | from pyverilog.vparser.parser import parse
13 | import pyverilog.vparser.ast as vast
14 | from minhash import deduplicate_dataset
15 | 
16 | import os
17 | import subprocess
18 | import json
19 | from io import StringIO  
20 | from utils import *
21 | from tqdm import tqdm
22 | import numpy as np
23 | 
if __name__ == "__main__":
    # Interactive viewer: load the dataset saved on disk and print the record
    # at each index the user types, until they enter "exit".
    dataset_path = "ckpt_separated_modules"
    dataset = load_from_disk(dataset_path)
    while True:
        raw_index = input("Enter index of the module to view: ")
        if raw_index == "exit":
            break
        try:
            index = int(raw_index)
            record = dataset[index]
        except (ValueError, IndexError) as err:
            # A typo or an out-of-range index should not end the session.
            print(f"Invalid index {raw_index!r}: {err}")
            continue
        print(record)
        print(record["module_name"])
        print(record["text"])
        print(record["task_id"])
        print(record["code_str_before_preprocessing"])

        # Save the unprocessed source to test.v (overwritten on every lookup)
        with open("test.v", "w") as f:
            f.write(record["code_str_before_preprocessing"])


--------------------------------------------------------------------------------
/inference_server_setup/README.md:
--------------------------------------------------------------------------------
 1 | # codellm
 2 | 
 3 | This is an example of using CodeLlama within the LangChain framework.
 4 | 
 5 | ## Env Setup
 6 | 
 7 | ```
 8 | conda create -n codellm python==3.9
 9 | conda activate codellm
10 | conda install langchain -c conda-forge
11 | pip install langchain[all]
12 | pip install huggingface_hub
13 | pip install git+https://github.com/huggingface/transformers.git@main accelerate
14 | ```
15 | 
16 | ## Set Huggingface cache dir and access token
17 | 
18 | By default, Hugging Face uses ~/.cache/huggingface/ to cache datasets and models. However, on some servers you have only limited space in your home directory, or you may want the cache stored in a folder that can be shared among different servers. In such cases, you need to set your Hugging Face cache directory manually.
19 | 
20 | ```
21 | export HF_HOME=/path/to/cache/directory
22 | export HUGGINGFACEHUB_API_TOKEN=your_hf_token
23 | ```
24 | 
25 | You can also add the commands above to your `~/.bashrc` if you want to set these variables permanently.
26 | 
27 | ## Setup HuggingFace Inference Server
28 | 
29 | ```
30 | model=codellama/CodeLlama-34b-hf
31 | volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
32 | 
33 | docker run --gpus '"device=0,1,2,3,4,5,6,7"' --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.1.0 --model-id $model
34 | ```
35 | 
36 | ## Run the test example
37 | 
38 | ```
39 | python test.py
40 | ```


--------------------------------------------------------------------------------
/auto_data_gen_val/assets/verilog/context/context.fixed_features.txt:
--------------------------------------------------------------------------------
 1 | - Please act as an expert in hardware design using Verilog or SystemVerilog. 
 2 | - You will help me document a hardware code by adding comments, detailed descriptions of the code blocks, detailed descriptions of the modules, and also high-level descriptions of the modules.
 3 | - When generating comments:
 4 |     - Do not add too obvious comments; only add comments when you think the code is not obvious.
 5 |     - Do not add comment immediately after parameters, e.g., `module_name #(.param1(1), .param2(2))`.
 6 |     - For arguments, only add a single line comment at the beginning.
 7 |     - You will also help me to decide whether multiple lines of code can be combined together as a code block.
 8 | - When generating descriptions of the code blocks or modules:
 9 |     - Use as many high-level concepts that are directly applicable to describe the code of the whole design. 
10 |     - When necessary, explicitly mention the specifications of inputs and outputs in terms of their bit-width, range, and any other constraints or considerations.
11 |     - Pay special attention to the temporal logic of the signals; e.g., how the registers are updated, how the state machines transition, etc.
12 |     - Assume your response will be used by an experienced hardware designer as the only basis for implementing the equivalent functionality and provide the same top module input/output interface as described in the code.


--------------------------------------------------------------------------------
/auto_data_gen_val/run_all_part.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | ./move_dataset.sh 1 0
 3 | sed -i 's/code_part = [0-9]*/code_part = 0/' test_10_30.py
 4 | ./auto_restart_script.sh
 5 | ./move_dataset.sh 0 0
 6 | 
 7 | ./move_dataset.sh 1 1
 8 | sed -i 's/code_part = [0-9]*/code_part = 1/' test_10_30.py
 9 | ./auto_restart_script.sh
10 | ./move_dataset.sh 0 1
11 | 
12 | ./move_dataset.sh 1 2
13 | sed -i 's/code_part = [0-9]*/code_part = 2/' test_10_30.py
14 | ./auto_restart_script.sh
15 | ./move_dataset.sh 0 2
16 | 
17 | ./move_dataset.sh 1 3
18 | sed -i 's/code_part = [0-9]*/code_part = 3/' test_10_30.py
19 | ./auto_restart_script.sh
20 | ./move_dataset.sh 0 3
21 | 
22 | ./move_dataset.sh 1 4
23 | sed -i 's/code_part = [0-9]*/code_part = 4/' test_10_30.py
24 | ./auto_restart_script.sh
25 | ./move_dataset.sh 0 4
26 | 
27 | ./move_dataset.sh 1 5
28 | sed -i 's/code_part = [0-9]*/code_part = 5/' test_10_30.py
29 | ./auto_restart_script.sh
30 | ./move_dataset.sh 0 5
31 | 
32 | ./move_dataset.sh 1 6
33 | sed -i 's/code_part = [0-9]*/code_part = 6/' test_10_30.py
34 | ./auto_restart_script.sh
35 | ./move_dataset.sh 0 6
36 | 
37 | ./move_dataset.sh 1 7
38 | sed -i 's/code_part = [0-9]*/code_part = 7/' test_10_30.py
39 | ./auto_restart_script.sh
40 | ./move_dataset.sh 0 7
41 | 
42 | # ./move_dataset.sh 1 8
43 | # sed -i 's/code_part = [0-9]*/code_part = 8/' test_10_30.py
44 | # ./auto_restart_script.sh
45 | # ./move_dataset.sh 0 8
46 | 
47 | # ./move_dataset.sh 1 9
48 | # sed -i 's/code_part = [0-9]*/code_part = 9/' test_10_30.py
49 | # ./auto_restart_script.sh
50 | # ./move_dataset.sh 0 9
51 | 
52 | 
53 | 


--------------------------------------------------------------------------------
/verilog_eval/verilog_eval/data.py:
--------------------------------------------------------------------------------
 1 | from typing import Iterable, Dict
 2 | import gzip
 3 | import json
 4 | import os
 5 | 
 6 | 
 7 | ROOT = os.path.dirname(os.path.abspath(__file__))
 8 | 
 9 | def read_problems(evalset_file: str) -> Dict[str, Dict]:
10 |     return {task["task_id"]: task for task in stream_jsonl(evalset_file)}
11 | 
12 | 
def stream_jsonl(filename: str) -> Iterable[Dict]:
    """
    Parses each jsonl line and yields it as a dictionary

    Files whose name ends in ".gz" are decompressed on the fly. Lines that
    contain only whitespace are skipped. Input is decoded as UTF-8, matching
    the encoding write_jsonl uses, instead of the platform default.
    """
    opener = gzip.open if filename.endswith(".gz") else open
    with opener(filename, "rt", encoding="utf-8") as fp:
        for line in fp:
            if line.strip():
                yield json.loads(line)
28 | 
29 | 
def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
    """
    Serializes each item of data as one JSON line in filename.

    Falsy items (None, but also empty dicts) are skipped. A leading "~" in
    filename is expanded. Names ending in ".gz" are written gzip-compressed.
    With append=True the output is added to the end of an existing file
    instead of replacing it.
    """
    target = os.path.expanduser(filename)
    file_mode = 'ab' if append else 'wb'

    def _encoded_rows():
        # One UTF-8 encoded, newline-terminated JSON document per kept item.
        for record in data:
            if record:
                yield (json.dumps(record) + "\n").encode('utf-8')

    compressed = target.endswith(".gz")
    with open(target, file_mode) as raw:
        if compressed:
            with gzip.GzipFile(fileobj=raw, mode='wb') as sink:
                for row in _encoded_rows():
                    sink.write(row)
        else:
            for row in _encoded_rows():
                raw.write(row)
51 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/test_repo/multiplier.v:
--------------------------------------------------------------------------------
// multiplier: ports clk, d_i [D_WIDTH], w_i [W_WIDTH] and m_o [M_WIDTH].
// The implementation is chosen at elaboration time from W_WIDTH/C_WIDTH in the
// generate block below; with the default parameters (16/2 = 8, D_WIDTH == 8)
// the mul24x8_signed branch is selected.
module multiplier(
	clk,
	d_i,
	w_i,
	m_o
	);
	
// NOTE(review): test_mod_inst is wired to the same four signals, including m_o,
// which is also driven further down -- confirm test_mod's port directions.
test_mod test_mod_inst (
	.clk(clk),
	.d_i(d_i),
	.w_i(w_i),
	.m_o(m_o)
	);

parameter C_WIDTH = 2; // data channel, set to 2 when using 8 bit 
parameter D_WIDTH = 8;
parameter W_WIDTH = 16;
parameter M_WIDTH = 32;
/* W_WIDTH should be 1/2/8/16 */
input clk;
input [D_WIDTH-1:0] d_i;
input [W_WIDTH-1:0] w_i;
output wire [M_WIDTH-1:0] m_o;

// m_o_reg holds the registered result of the 1- and 2-way case branches;
// m_o_tmp carries the raw output of mul24x8_signed.
reg  [M_WIDTH-1:0] m_o_reg;
wire  [M_WIDTH-1:0] m_o_tmp;

wire [23:0] w_i_tmp;

// Pack the two bytes of w_i into one 24-bit operand: the upper byte with the
// replicated sign bit of the lower byte added to it, followed by the lower
// byte sign-extended to 16 bits.
assign w_i_tmp = {{w_i[15:8] + {8{w_i[7]}}},{8{w_i[7]}}, w_i[7:0]};

generate 
case (W_WIDTH/C_WIDTH)
	// w_i[0] set: two's-complement negation of d_i; otherwise the low D_WIDTH
	// bits of the D_WIDTH parameter.
	// NOTE(review): D_WIDTH[...] selects bits of the parameter itself --
	// presumably d_i[...] was intended; confirm.
	1: 	always @(posedge clk)
		begin
			m_o_reg <= w_i[0]?(~d_i[D_WIDTH-1:0]+1):D_WIDTH[D_WIDTH-1:0];
		end
	// w_i[1] set: negated d_i; else w_i[0] set: d_i; else zero.
	2:  always @ (posedge clk)
        begin
			m_o_reg <= w_i[1]?(~d_i[D_WIDTH-1:0]+1):(w_i[0]?d_i[D_WIDTH-1:0]:{D_WIDTH{1'b0}});
        end
	// 8-bit data: one 24x8 multiply of the packed operand. The lower half of
	// the product is passed through; the upper half has the replicated bit 15
	// of the raw product subtracted from it.
	8:  if (D_WIDTH == 8) begin
			mul24x8_signed u_mul24x8_signed (
			.CLK(clk),
			.A(w_i_tmp),
			.B(d_i[7:0]),
			.P(m_o_tmp) // output
			);
			assign m_o[15:0] = m_o_tmp[15:0];
			assign m_o[31:16] = m_o_tmp[31:16]-{16{m_o_tmp[15]}};
			end
		else 
			mul16x16_signed u_mul16x16_signed (
			.CLK(clk),
			.A(w_i[15:0]),
			.B(d_i[15:0]),
			.P(m_o) // output
			);
	// Any other ratio: 16x16 multiplier driving m_o directly.
	default: mul16x16_signed u_mul16x16_signed (
			.CLK(clk),
			.A(w_i[15:0]),
			.B(d_i[15:0]),
			.P(m_o) // output
			);
endcase
    // Drive m_o from m_o_reg when W_WIDTH < 3.
    // NOTE(review): this condition tests W_WIDTH, while the case above selects
    // on W_WIDTH/C_WIDTH -- confirm the two agree for the supported widths.
    if (W_WIDTH < 3) begin: gen_mo
        assign m_o = m_o_reg;
    end
endgenerate

endmodule
72 | 
73 | 
74 | 


--------------------------------------------------------------------------------
/verilog_eval/build/lib/verilog_eval/data.py:
--------------------------------------------------------------------------------
 1 | from typing import Iterable, Dict
 2 | import gzip
 3 | import json
 4 | import os
 5 | 
 6 | 
 7 | ROOT = os.path.dirname(os.path.abspath(__file__))
 8 | 
 9 | def read_problems(evalset_file: str) -> Dict[str, Dict]:
10 |     return {task["task_id"]: task for task in stream_jsonl(evalset_file)}
11 | 
12 | 
def stream_jsonl(filename: str) -> Iterable[Dict]:
    """
    Parses each jsonl line and yields it as a dictionary

    Files whose name ends in ".gz" are decompressed on the fly. Lines that
    contain only whitespace are skipped. Input is decoded as UTF-8, matching
    the encoding write_jsonl uses, instead of the platform default.
    """
    opener = gzip.open if filename.endswith(".gz") else open
    with opener(filename, "rt", encoding="utf-8") as fp:
        for line in fp:
            if line.strip():
                yield json.loads(line)
28 | 
29 | 
def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
    """
    Serializes each item of data as one JSON line in filename.

    Falsy items (None, but also empty dicts) are skipped. A leading "~" in
    filename is expanded. Names ending in ".gz" are written gzip-compressed.
    With append=True the output is added to the end of an existing file
    instead of replacing it.
    """
    target = os.path.expanduser(filename)
    file_mode = 'ab' if append else 'wb'

    def _encoded_rows():
        # One UTF-8 encoded, newline-terminated JSON document per kept item.
        for record in data:
            if record:
                yield (json.dumps(record) + "\n").encode('utf-8')

    compressed = target.endswith(".gz")
    with open(target, file_mode) as raw:
        if compressed:
            with gzip.GzipFile(fileobj=raw, mode='wb') as sink:
                for row in _encoded_rows():
                    sink.write(row)
        else:
            for row in _encoded_rows():
                raw.write(row)
51 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/tool_utils.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional, Type
 2 | from langchain.tools import BaseTool, StructuredTool, Tool, tool
 3 | from pydantic import BaseModel, Field
 4 | from langchain.retrievers.multi_vector import MultiVectorRetriever
 5 | from typing import Any
 6 | 
 7 | from langchain.callbacks.manager import (
 8 |     AsyncCallbackManagerForToolRun,
 9 |     CallbackManagerForToolRun,
10 | )
11 | 
12 | 
13 | # You can provide a custom args schema to add descriptions or custom validation
class SCodeRetrieveSchema(BaseModel):
    # Argument schema of the code-retrieval tool: a single string query.
    # Documented with comments rather than a docstring so that nothing extra
    # is added to the model itself; the Field description below is runtime text.
    query: str = Field(description="should be the function name you want to search for")
16 | 
17 | #TODO: add similarity thresholding
18 | #TODO: multiple doc retrieval
class GlobalCodeRetrieve(BaseTool):
    # Tool that looks up a function by name in the retriever's vector store and
    # returns the stored summary of the best match.
    name = "retrieve_code_function"
    description = "useful for when wanting to look for a function called in a code block to retrieve its summary"
    args_schema: Type[SCodeRetrieveSchema] = SCodeRetrieveSchema
    # Must expose ``.vectorstore.similarity_search(query)``; every returned
    # document is expected to carry a "summary" entry in its metadata.
    retriever: Any

    def __init__(self, retriever: Any):
        super().__init__(retriever=retriever)

    def _run(
        self,
        query: str,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Return the summary of the document most similar to ``query``.

        When the search yields no documents, a plain message is returned
        instead of failing with an IndexError on the empty result.
        """
        docs = self.retriever.vectorstore.similarity_search(query)
        if not docs:
            return f"No document found for query: {query}"
        doc_summary = docs[0].metadata["summary"]
        return f"Document summary: {doc_summary}"

    async def _arun(
        self,
        query: str,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ) -> str:
        """Use the tool asynchronously (not supported; always raises)."""
        raise NotImplementedError(f"{self.name} does not support async")
45 | 
46 | 
47 | 
48 | 


--------------------------------------------------------------------------------
/document_customized_repo/decode_results.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import argparse
 4 | 
 5 | if __name__ == "__main__":
 6 |     parser = argparse.ArgumentParser()
 7 |     parser.add_argument("output_dir", help="output dir where the results are stored")
 8 | 
 9 |     args = parser.parse_args()
10 |     output_dir = args.output_dir
11 | 
12 |     metadatapath = "/documented_code/dataset_metadata/part0/global_high_level_summary.json"
13 | 
14 |     #load metadata
15 |     with open(output_dir+metadatapath, "r") as f:
16 |         metadata = json.load(f)
17 |     #store "global_summary_high_level" to global_summary_high_level.txt
18 |     with open("global_summary_high_level.txt", "w") as f:
19 |         f.write(metadata["priority_encoder.v"]["global_summary_high_level"])
20 |     #store "global_summary_detailed" to global_summary_detailed.txt
21 |     with open("global_summary_detailed.txt", "w") as f:
22 |         f.write(metadata["priority_encoder.v"]["global_summary_detailed"])
23 | 
24 | 
25 |     #block metadata
26 |     block_metadatapath = "/documented_code/dataset_metadata/part0/block_summary.json"
27 |     #load block metadata
28 |     with open(output_dir+ block_metadatapath, "r") as f:
29 |         block_metadata = json.load(f)
30 |     #store "block_summary" to block_summary.txt
31 |     block_idx = 0
32 |     with open( "block_summary.txt", "w") as f:
33 |         for block_summary in block_metadata["priority_encoder.v"]["block_summary"]:
34 |             f.write(f"Block {block_idx}: {block_summary}\n\n")
35 |             block_idx += 1
36 |     
37 |     documented_code_path = "/documented_code/part0/priority_encoder/priority_encoder.v"
38 |     #store "documented_code" to documented_code.v
39 |     with open( "documented_code.v", "w") as f:
40 |         with open(output_dir+ documented_code_path, "r") as f2:
41 |             f.write(f2.read())
42 |             
43 | 
44 | 
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/verilog_eval/data/example/ExampleDescriptions.jsonl:
--------------------------------------------------------------------------------
1 | {"task_id": "gatesv", "simple_description": " This module takes in 4-bit inputs and outputs 3-bit outputs based on the logical operations of AND, OR, and XOR.", "detail_description": " This Verilog module takes four input bits and produces three output bits. The first output bit is the result of a bitwise AND operation between the two least significant bits of the input. The second output bit is the result of a bitwise OR operation between the two least significant bits of the input. The third output bit is the result of a bitwise XOR operation between the two least significant bits of the input and all the other bits of the input except for the least significant bit."}
2 | {"task_id": "vector4", "simple_description": " This module takes an 8-bit input and replicates it 24 times to create a 32-bit output.", "detail_description": " This Verilog module is a simple combinational circuit that takes an 8-bit input and produces a 32-bit output. The output is formed by replicating the 8-bit input 24 times and then concatenating it with the original 8-bit input. This module does not contain any instantiated modules or state transitions, so the functionality is straightforward. The input is connected directly to the output, with the 8-bit input being replicated 24 times. The output is 32 bits wide, with the first 24 bits being the replicated input and the last 8 bits being the original input."}
3 | {"task_id": "zero", "simple_description": "This module assigns the output 'zero' to a logic value of 0.", "detail_description": " This top Verilog module is a simple module that assigns the output zero to a value of 0. This module does not have any inputs and is used to assign a constant value to the output. This module is useful when a constant value is needed for a design. For example, if a design requires a signal to be always 0, this module can be used to assign the output zero to 0. This module can also be used to assign a constant value to a signal that is used as an input to another module."}


--------------------------------------------------------------------------------
/auto_data_gen_val/my_pydantic.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import re
 3 | from typing import Type, TypeVar
 4 | 
 5 | from langchain.output_parsers.format_instructions import PYDANTIC_FORMAT_INSTRUCTIONS
 6 | from langchain.pydantic_v1 import BaseModel, ValidationError
 7 | from langchain.schema import BaseOutputParser, OutputParserException
 8 | from langchain.schema import AIMessage
 9 | 
10 | T = TypeVar("T", bound=BaseModel)
11 | 
12 | 
class PydanticOutputParserMessages(BaseOutputParser[T]):
    """Parse an output using a pydantic model."""

    pydantic_object: Type[T]
    """The pydantic model to parse."""

    def parse(self, text: str) -> T:
        """Extract the JSON object embedded in ``text`` and validate it.

        The span from the first "{" to the last "}" is taken as the candidate,
        decoded leniently (``strict=False``) and validated against
        ``pydantic_object``.

        Returns:
            The validated object serialized back to a JSON string via
            ``.json()`` -- not a model instance, despite the annotation.

        Raises:
            OutputParserException: if no valid JSON is found or validation
                against the model fails.
        """
        try:
            # Greedy search for 1st json candidate.
            match = re.search(r"\{.*\}", text.strip(), re.DOTALL)
            json_str = match.group() if match else ""
            json_object = json.loads(json_str, strict=False)
            return self.pydantic_object.parse_obj(json_object).json()

        except (json.JSONDecodeError, ValidationError) as e:
            name = self.pydantic_object.__name__
            msg = f"Failed to parse {name} from completion {text}. Got: {e}"
            raise OutputParserException(msg, llm_output=text) from e

    def get_format_instructions(self) -> str:
        """Return the prompt snippet describing the expected JSON schema."""
        schema = self.pydantic_object.schema()

        # Remove extraneous fields. Build a new dict rather than deleting keys
        # in place: schema() may hand back a cached dict shared across calls,
        # and mutating it would strip "title"/"type" for every other user.
        reduced_schema = {
            key: value for key, value in schema.items() if key not in ("title", "type")
        }
        # Ensure json in context is well-formed with double quotes.
        schema_str = json.dumps(reduced_schema)

        return PYDANTIC_FORMAT_INSTRUCTIONS.format(schema=schema_str)

    @property
    def _type(self) -> str:
        return "pydantic"
53 | 
54 | 


--------------------------------------------------------------------------------
/document_customized_repo/document_customized_repo.sh:
--------------------------------------------------------------------------------
 1 | TEST_DIR=$1
 2 | OUTPUT_DIR=$2
 3 | CURRENT_DIR=$(cd $(dirname $0); pwd)
 4 | export DATA4AIGCHIP_HOME=$(cd $CURRENT_DIR/..; pwd)
 5 | echo "DATA4AIGCHIP_HOME=$DATA4AIGCHIP_HOME"
 6 | echo "TEST_DIR=$TEST_DIR"
 7 | echo "OUTPUT_DIR=$OUTPUT_DIR"
 8 | 
 9 | 
10 | 
11 | python ../auto_data_gen_val/preprocess_data/process_data/preprocess.py $OUTPUT_DIR/raw_code -customized_dataset_dir $TEST_DIR
12 | 
13 | python ../auto_data_gen_val/utils.py \
14 |     --src_code_dir $OUTPUT_DIR/raw_code \
15 |     --src_code_metadata_file $OUTPUT_DIR/module_inst.json \
16 |     --output_dir $OUTPUT_DIR/partitioned_dataset_output_path/ \
17 |     --shared_lib_dir $OUTPUT_DIR/directory_to_store_common_modules/ \
18 |     --output_code_metadata_dir $OUTPUT_DIR/output_dir_for_code_metadata/ \
19 |     --output_code_metadata_file codes.json \
20 |     --module_to_task_id_map_file $OUTPUT_DIR/module_name_to_task_id_mapping.json
21 | 
22 | 
23 | python ../auto_data_gen_val/line_by_line_comments_gen.py \
24 |     --total_parts 1 \
25 |     --output_dir $OUTPUT_DIR/documented_code \
26 |     --src_code_dir $OUTPUT_DIR/partitioned_dataset_output_path/ \
27 |     --code_metadata_dir $OUTPUT_DIR/output_dir_for_code_metadata/ \
28 |     --code_lib_path $OUTPUT_DIR/directory_to_store_common_modules/ \
29 |     --code_vec_store $OUTPUT_DIR/code_vec_store/test/ \
30 |     --discard_original_comment
31 | 
32 | 
33 | python ../auto_data_gen_val/gen_block_summaries.py 0 1 \
34 |     --code_metadata_dir $OUTPUT_DIR/output_dir_for_code_metadata/ \
35 |     --documented_code_dir $OUTPUT_DIR/documented_code \
36 |     --block_line_length 10 \
37 |     --model gpt-4-turbo
38 | 
39 | 
40 | 
41 | python ../auto_data_gen_val/gen_global_summary.py 0 1 \
42 |     --code_metadata_dir $OUTPUT_DIR/output_dir_for_code_metadata/ \
43 |     --documented_code_dir $OUTPUT_DIR/documented_code \
44 |     --model gpt-4-turbo \
45 |     --detailed
46 | 
47 | 
48 | python ../auto_data_gen_val/gen_global_summary.py 0 1 \
49 |     --code_metadata_dir $OUTPUT_DIR/output_dir_for_code_metadata/ \
50 |     --documented_code_dir $OUTPUT_DIR/documented_code \
51 |     --model gpt-4-turbo
52 | 
53 | 
54 | python decode_results.py $OUTPUT_DIR


--------------------------------------------------------------------------------
/auto_data_gen_val/gen_detailed_steps.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import shutil
 4 | from dotenv import load_dotenv
 5 | load_dotenv()
 6 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR")))
 7 | from embedding_lookup_utils import CodeDataset
 8 | from langchain.callbacks import get_openai_callback #with get_openai_callback() as cb:
 9 | 
if __name__ == "__main__":
    # Root folder that receives one "part<N>" sub-folder per processed part.
    dataset_metadata_dir = "./dataset_metadata/"
    if not os.path.exists(dataset_metadata_dir):
        os.makedirs(dataset_metadata_dir)

    # Parts 2..5 are processed: range() below excludes total_code_parts.
    total_code_parts = 6
    code_part_start_id = 2
    # Checkpoint folder of one code part; "{}" receives the part id.
    ckpt_root_template = "/home/user_name/DAC_2024/ckpts_test/test_10_30_{}_complete"

    # A single OpenAI callback is shared by every part of the run.
    with get_openai_callback() as cb:
        for code_part_id in range(code_part_start_id, total_code_parts):
            part_dir = "{}/part{}".format(dataset_metadata_dir, code_part_id)
            if not os.path.exists(part_dir):
                os.makedirs(part_dir)

            ckpt_root = ckpt_root_template.format(code_part_id)
            csv_src_root = ckpt_root + "/assets/verilog/code_and_comment_src/csv_src"

            codedb = CodeDataset(
                ckpt_root + "/documented_code/",
                bookkeeping_dir=part_dir + "/bookkeeping/",
                vectorembedding_dir=part_dir + "/vectorembedding/",
                force_refresh=False,
                cb=cb,
            )
            codedb.load_and_split_code(
                skip_small_doc=True,
                split_by_line=True,
                based_on_code_lines_only=True,
                csv_code_dir=csv_src_root + "/csv_code_src",
                csv_comment_dir=csv_src_root + "/csv_new_comment_src",
            )
            codedb.init_vectorstore()
            codedb.supplement_detailed_steps()

            # Persist the detailed steps next to the part's other metadata.
            codedb.save_detail_steps(
                part_dir + "/detailed_steps.json",
                split_by_line=True,
            )


--------------------------------------------------------------------------------
/verilog_eval/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 NVIDIA Research Projects
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 
24 | This project contains code from human-eval (https://github.com/openai/human-eval/).
25 | 
26 | The MIT License
27 | 
28 | Copyright (c) OpenAI (https://openai.com)
29 | 
30 | Permission is hereby granted, free of charge, to any person obtaining a copy
31 | of this software and associated documentation files (the "Software"), to deal
32 | in the Software without restriction, including without limitation the rights
33 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
34 | copies of the Software, and to permit persons to whom the Software is
35 | furnished to do so, subject to the following conditions:
36 | 
37 | The above copyright notice and this permission notice shall be included in
38 | all copies or substantial portions of the Software.
39 | 
40 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
41 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
42 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
43 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
44 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
45 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
46 | THE SOFTWARE.
47 | 
48 | 
49 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/gen_block_summaries_no_comment_exists.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import shutil
 4 | from dotenv import load_dotenv
 5 | load_dotenv()
 6 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR")))
 7 | from embedding_lookup_utils import CodeDataset
 8 | from langchain.callbacks import get_openai_callback #with get_openai_callback() as cb:
 9 | 
if __name__ == "__main__":
    # Root folder that receives one "part<N>" sub-folder per processed part.
    dataset_metadata_dir = "./dataset_metadata/"
    if not os.path.exists(dataset_metadata_dir):
        os.makedirs(dataset_metadata_dir)

    # Parts 0..9 are processed: range() below excludes total_code_parts.
    total_code_parts = 10
    code_part_start_id = 0
    # Checkpoint folder of one code part; "{}" receives the part id.
    ckpt_root_template = "/home/user_name/DAC_2024/ckpts/test_10_30_{}_complete"

    # A single OpenAI callback is shared by every part of the run.
    with get_openai_callback() as cb:
        for code_part_id in range(code_part_start_id, total_code_parts):
            part_dir = "{}/part{}".format(dataset_metadata_dir, code_part_id)
            if not os.path.exists(part_dir):
                os.makedirs(part_dir)

            ckpt_root = ckpt_root_template.format(code_part_id)
            csv_src_root = ckpt_root + "/assets/verilog/code_and_comment_src/csv_src"

            codedb = CodeDataset(
                ckpt_root + "/documented_code/",
                bookkeeping_dir=part_dir + "/bookkeeping/",
                vectorembedding_dir=part_dir + "/vectorembedding/",
                force_refresh=False,
                cb=cb,
            )
            # This script reads comments from csv_comment_src.
            codedb.load_and_split_code(
                skip_small_doc=True,
                split_by_line=True,
                based_on_code_lines_only=True,
                csv_code_dir=csv_src_root + "/csv_code_src",
                csv_comment_dir=csv_src_root + "/csv_comment_src",
            )
            codedb.init_vectorstore()
            codedb.supplement_summary(
                block_summary_placeholding=False,
                use_global_summary_for_block_summary=False,
            )
            # Persist the block summaries next to the part's other metadata.
            codedb.save_block_summary(
                part_dir + "/block_summary.json",
                split_by_line=True,
            )
38 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/gen_verilogeval_baseline_summary.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import shutil
 4 | import argparse
 5 | 
 6 | from dotenv import load_dotenv
 7 | load_dotenv()
 8 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR")))
 9 | from embedding_lookup_utils import CodeDataset
10 | from langchain.callbacks import get_openai_callback #with get_openai_callback() as cb:
11 | 
if __name__ == "__main__":
    # Two positional integers: the first part id to process and the
    # exclusive upper bound of the part-id range.
    cli = argparse.ArgumentParser()
    cli.add_argument("start_id", help="start id of the code parts", type=int)
    cli.add_argument("total_code_parts", help="total number of code parts", type=int)
    cli_args = cli.parse_args()
    code_part_start_id = cli_args.start_id
    total_code_parts = cli_args.total_code_parts

    # Root folder that receives one "part<N>" sub-folder per processed part.
    dataset_metadata_dir = "./dataset_metadata/"
    # Forwarded to supplement_summary as global_summary_example_desc_key.
    desc_key = "detail_description"
    if not os.path.exists(dataset_metadata_dir):
        os.makedirs(dataset_metadata_dir)

    # Checkpoint folder of one code part; "{}" receives the part id.
    ckpt_root_template = "/home/user_name/DAC_2024/ckpts/test_10_30_{}_complete"

    # A single OpenAI callback is shared by every part of the run.
    with get_openai_callback() as cb:
        for code_part_id in range(code_part_start_id, total_code_parts):
            part_dir = "{}/part{}".format(dataset_metadata_dir, code_part_id)
            if not os.path.exists(part_dir):
                os.makedirs(part_dir)

            ckpt_root = ckpt_root_template.format(code_part_id)
            csv_src_root = ckpt_root + "/assets/verilog/code_and_comment_src/csv_src"

            codedb = CodeDataset(
                ckpt_root + "/documented_code/",
                bookkeeping_dir=part_dir + "/bookkeeping/",
                vectorembedding_dir=part_dir + "/vectorembedding/",
                force_refresh=False,
                cb=cb,
            )
            codedb.load_and_split_code(
                skip_small_doc=True,
                split_by_line=True,
                based_on_code_lines_only=True,
                csv_code_dir=csv_src_root + "/csv_code_src",
                csv_comment_dir=csv_src_root + "/csv_new_comment_src",
            )
            codedb.init_vectorstore()
            codedb.supplement_summary(
                block_summary_placeholding=True,
                force_refresh_global_summary=True,
                global_summary_example_desc_key=desc_key,
            )

            # Persist the global summaries next to the part's other metadata.
            codedb.save_global_summary(part_dir + "/global_summary.json")
50 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/pre_proc_sync.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | from dotenv import load_dotenv
 4 | load_dotenv()
 5 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR")))
 6 | import openai
 7 | import requests
 8 | import json
 9 | import copy
10 | import time
11 | import datetime
12 | import shutil
13 | from embedding_lookup_utils import *
14 | from utils import *
15 | from completion_handler import *
16 | from code_preprocesser import *
17 | from code_repo_documentor import *
18 | 
19 | #documenting the first version with module instantiation
20 | #one_shot 5 lines
21 | #pure llama 2 70B
22 | #around 12k samples
23 | 
if __name__ == "__main__":
    #NOTE: run utils.py first to partition the code first
    code_part = 0
    code_dir = "/home/user_name/DAC_2024/ckpt3_user_name_valid_content_renamed/part{}".format(code_part)
    code_metadata_file = "/home/user_name/DAC_2024/ckpt3_user_name_valid_content_code_metadata/part{}/codes.json".format(code_part)
    code_lib_path =  "/home/user_name/DAC_2024/ckpt3_user_name_valid_content_shared_lib/"
    code_vec_store = "../code_vec_store/test_10_30/"

    # Source-file suffixes handled for each supported TARGET_LANG value.
    suffixes_by_language = {
        "verilog": [".v", ".sv", ".vh"],
        "xilinx_hls": [".c", ".cpp", ".h", ".hpp"],
    }
    language = os.environ.get("TARGET_LANG")
    if language not in suffixes_by_language:
        # Fail here with a clear message rather than with a NameError on
        # code_suffix once the documentor is being constructed.
        raise ValueError(
            "TARGET_LANG must be one of {}, got {!r}".format(
                sorted(suffixes_by_language), language
            )
        )
    code_suffix = suffixes_by_language[language]

    # Working directories are configured through the environment (.env).
    store_src_code_dir = os.environ.get("STORE_SRC_CODE_DIR")
    csv_code_dir = os.environ.get("CSV_CODE_DIR")
    csv_comment_dir = os.environ.get("CSV_COMMENT_DIR")
    csv_new_comment_dir = os.environ.get("CSV_NEW_COMMENT_DIR")
    csv_pure_gen_comment_dir = os.environ.get("CSV_PURE_GEN_COMMENT_DIR")
    code_summary_dir = os.environ.get("CODE_SUMMARY_DIR")
    documented_code_dir = os.environ.get("DOCUMENTED_CODE_DIR")


    with get_openai_callback() as cb:
        #This switch will discard 1. the comments in the raw code copy and 2. the comments will be converted to the raw code csv 
        discard_original_comment = True

        code_repo_documentor = CodeRepoDocumentor(code_dir, store_src_code_dir,
                                                    csv_code_dir, csv_comment_dir, csv_new_comment_dir, 
                                                    csv_pure_gen_comment_dir, code_summary_dir, documented_code_dir,
                                                    code_metadata_file=code_metadata_file,
                                                    code_suffix=code_suffix, language=language,
                                                    discard_original_comment=discard_original_comment,
                                                    code_lib_path=code_lib_path, code_vec_store=code_vec_store,
                                                    skip_rag_db=True,
                                                    cb = cb)
        code_repo_documentor.create_embedding()
        code_repo_documentor.code_preprocess()
        code_repo_documentor.package_documented_code("./documented_code")
61 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/preprocess_data/example_code_strings_simple_instructions.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lemmings1": "module top_module (\n\tinput clk,\n\tinput areset,\n\tinput bump_left,\n\tinput bump_right,\n\toutput walk_left,\n\toutput walk_right\n);\n\tparameter WL=0, WR=1;\n\treg state;\n\treg next;\n    \n    always_comb begin\n\t\tcase (state)\n\t\t\tWL: next = bump_left ? WR : WL;\n\t\t\tWR: next = bump_right ? WL: WR;\n\t\tendcase\n    end\n    \n    always @(posedge clk, posedge areset) begin\n\t\tif (areset) state <= WL;\n        else state <= next;\n\tend\n\t\t\n\tassign walk_left = (state==WL);\n\tassign walk_right = (state==WR);\n\n\t\nendmodule\n",
 3 |     "rotate100": "module top_module(\n\tinput clk,\n\tinput load,\n\tinput [1:0] ena,\n\tinput [99:0] data,\n\toutput reg [99:0] q);\n\t\n\t\n\talways @(posedge clk) begin\n\t\tif (load)\n\t\t\tq <= data;\n\t\telse if (ena == 2'h1)\n\t\t\tq <= {q[0], q[99:1]};\n\t\telse if (ena == 2'h2)\n\t\t\tq <= {q[98:0], q[99]};\n\tend\nendmodule\n",
 4 |     "vector2": "module top_module (\n\tinput [31:0] in,\n\toutput [31:0] out\n);\n\n\tassign out = {in[7:0], in[15:8], in[23:16], in[31:24]};\t\n\t\nendmodule\n",
 5 |     "gatesv100": "module top_module (\n\tinput [99:0] in,\n\toutput [98:0] out_both,\n\toutput [99:1] out_any,\n\toutput [99:0] out_different\n);\n\n\tassign out_both = in & in[99:1];\n\tassign out_any = in | in[99:1];\n\tassign out_different = in^{in[0], in[99:1]};\n\t\nendmodule\n",
 6 |     "history_shift": "module top_module\n(\n    input clk,\n    input areset,\n    input predict_valid,\n    input predict_taken,\n    output logic [31:0] predict_history,\n    \n    input train_mispredicted,\n    input train_taken,\n    input [31:0] train_history\n);\n    always@(posedge clk, posedge areset)\n\t\tif (areset) begin\n\t\t\tpredict_history = 0;\n        end\telse begin\n\t\t\tif (train_mispredicted)\n\t\t\t\tpredict_history <= {train_history, train_taken};\n\t\t\telse if (predict_valid)\n\t\t\t\tpredict_history <= {predict_history, predict_taken};\n\t\tend\nendmodule\n",
 7 |     "ece241_2013_q2": "module top_module (\n\tinput a,\n\tinput b,\n\tinput c,\n\tinput d,\n\toutput out_sop,\n\toutput out_pos\n);\n\t\n\twire pos0, pos1;\n\tassign out_sop = c&d | ~a&~b&c;\n\tassign pos0 = c & (~b|d)&(~a|b);\n\tassign pos1 = c & (~b|d)&(~a|d);\n\t\n\tassign out_pos = (pos0 == pos1) ? pos0 : 1'bx;\nendmodule\n",
 8 |     "dff16e": "module top_module(\n\tinput clk,\n\tinput resetn,\n\tinput [1:0] byteena,\n\tinput [15:0] d,\n\toutput reg [15:0] q);\n\t\n\talways @(posedge clk) begin\n\t\tif (!resetn)\n\t\t\tq <= 0;\n\t\telse begin\n\t\t\tif (byteena[0])\n\t\t\t\tq[7:0] <= d[7:0];\n\t\t\tif (byteena[1])\n\t\t\t\tq[15:8] <= d[15:8];\n\t\tend\n\tend\n\t\nendmodule\n",
 9 |     "fsm2": "module top_module (\n\tinput clk,\n\tinput j,\n\tinput k,\n\tinput areset,\n\toutput out\n);\n\tparameter A=0, B=1;\n\treg state;\n\treg next;\n    \n    always_comb begin\n\t\tcase (state)\n\t\t\tA: next = j ? B : A;\n\t\t\tB: next = k ? A : B;\n\t\tendcase\n    end\n    \n    always @(posedge clk, posedge areset) begin\n\t\tif (areset) state <= A;\n        else state <= next;\n\tend\n\t\t\n\tassign out = (state==B);\n\n\t\nendmodule\n",
10 |     "vector100r": "module top_module (\n\tinput [99:0] in,\n\toutput reg [99:0] out\n);\n\t\n\talways_comb \n\t\tfor (int i=0;i<$bits(out);i++)\n\t\t\tout[i] = in[$bits(out)-i-1];\n\t\nendmodule\n"
11 | }


--------------------------------------------------------------------------------
/auto_data_gen_val/dataset_utils_baseline.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | from dotenv import load_dotenv
 4 | load_dotenv()
 5 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR")))
 6 | import openai
 7 | import requests
 8 | import json
 9 | import copy
10 | import time
11 | import datetime
12 | import shutil
13 | import pandas as pd
14 | import tiktoken
15 | from openai.embeddings_utils import get_embedding, cosine_similarity
16 | from ast import literal_eval
17 | import numpy as np
18 | import re 
19 | from tqdm import tqdm
20 | from datasets import load_from_disk
21 | from datasets import Dataset
22 | import jsonlines
23 | from utils import extract_module_header, preprocess
24 | 
25 | 
26 | 
def random_sample_dataset(datasetpath, sample_percent, savepath):
    """Shuffle a dataset stored on disk and save a leading slice of it.

    Args:
        datasetpath: Location handed to ``load_from_disk``.
        sample_percent: Multiplier applied to the dataset length to get the
            number of rows kept (the product is truncated with ``int``).
        savepath: Location the sampled dataset is saved to.

    Returns:
        The sampled dataset that was written to ``savepath``.
    """
    # A fixed seed keeps the selection reproducible between runs.
    shuffled = load_from_disk(datasetpath).shuffle(seed=42)
    keep_count = int(len(shuffled) * sample_percent)
    sampled = shuffled.select(range(keep_count))
    sampled.save_to_disk(savepath)
    return sampled
33 | 
def find_fail_all_entries(result_file):
    """Return the task ids that never passed in a JSON-lines result file.

    Args:
        result_file: Path of a JSON-lines file whose records carry a
            ``"task_id"`` and a ``"result"`` key.

    Returns:
        The set of task ids for which no record has ``result == "passed"``.
    """
    seen_ids = set()
    passed_ids = set()
    # One pass is enough: collect every id and, separately, every id that
    # passed at least once; the difference is the set that always failed.
    with jsonlines.open(result_file) as reader:
        for obj in reader:
            task_id = obj["task_id"]
            seen_ids.add(task_id)
            if obj["result"] == "passed":
                passed_ids.add(task_id)
    return seen_ids - passed_ids
46 | 
def form_new_prob_file(orig_prob_file, new_prob_file, fail_entries):
    """Copy the records whose task id is in ``fail_entries`` to a new file.

    Args:
        orig_prob_file: Path of the JSON-lines file to read.
        new_prob_file: Path of the JSON-lines file to write; missing parent
            directories are created.
        fail_entries: Container of task ids to keep.

    Returns:
        ``new_prob_file``, unchanged.
    """
    with jsonlines.open(orig_prob_file) as reader:
        # Create the destination directory only once the source is known to
        # be readable; opening the writer fails if the directory is missing.
        out_dir = os.path.dirname(new_prob_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with jsonlines.open(new_prob_file, mode='w') as writer:
            for obj in reader:
                if obj["task_id"] in fail_entries:
                    writer.write(obj)
    return new_prob_file
54 | 
55 | 
if __name__ == "__main__":
    # Alternative usage kept for reference: subsample a saved dataset.
    # datasetpath = "/home/user_name/DAC_2024/sft_dataset/vanilla_baseline"
    # sample_percent = 0.1
    # savepath = "/home/user_name/DAC_2024/sft_dataset/vanilla_baseline_sample_0_{}".format(int(sample_percent*10))
    # random_sample_dataset(datasetpath, sample_percent, savepath)
    result_file = "/home/user_name/DAC_2024/chatgpt4_auto_accel/model_eval_qlora_kevin/data/gen.merged_dataset+simple_description.jsonl_results.jsonl"
    new_prob_dir = "./special_set"
    hdlbits_hlvl = "hdlbits_description.jsonl"
    hdlbits_simple_desc = "hdlbits_description_simple_description.jsonl"
    hdlbits_detail_desc = "hdlbits_description_detail_description.jsonl"
    hdlbits_block_desc = "hdlbits_for_llm2_eval.jsonl"
    eval_file = "/home/user_name/DAC_2024/chatgpt4_auto_accel/verilog_eval/data/VerilogEval_Machine.jsonl"

    # Task ids that did not pass in any record of the result file.
    fail_entries = find_fail_all_entries(result_file)
    print(len(fail_entries))
    print(fail_entries)

    # The filtered copies are written here; make sure the directory exists
    # before the first file is opened for writing.
    os.makedirs(new_prob_dir, exist_ok=True)

    # (source file, file name of the filtered copy inside new_prob_dir)
    problem_files = [
        (hdlbits_hlvl, hdlbits_hlvl),
        (hdlbits_simple_desc, hdlbits_simple_desc),
        (hdlbits_detail_desc, hdlbits_detail_desc),
        (hdlbits_block_desc, hdlbits_block_desc),
        (eval_file, "VerilogEval_Machine.jsonl"),
    ]
    for source_path, target_name in problem_files:
        new_prob_file = form_new_prob_file(source_path, os.path.join(new_prob_dir, target_name), fail_entries)
76 | 


--------------------------------------------------------------------------------
/document_customized_repo/test_dir/priority_encoder.v:
--------------------------------------------------------------------------------
 1 | /*
 2 | 
 3 | Copyright (c) 2014-2021 Alex Forencich
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 
23 | */
24 | 
25 | // Language: Verilog 2001
26 | 
27 | `resetall
28 | `timescale 1ns / 1ps
29 | `default_nettype none
30 | 
31 | /*
32 |  * Priority encoder module
33 |  */
34 | module priority_encoder # // combinational priority encoder built as a pairwise reduction tree
35 | (
36 |     parameter WIDTH = 4, // number of bits on input_unencoded
37 |     // LSB priority selection: nonzero gives bit 0 the highest priority, zero gives it the lowest
38 |     parameter LSB_HIGH_PRIORITY = 0
39 | )
40 | (
41 |     input  wire [WIDTH-1:0]         input_unencoded, // bits to encode
42 |     output wire                     output_valid, // taken from the final stage_valid level
43 |     output wire [$clog2(WIDTH)-1:0] output_encoded, // taken from the final stage_enc level
44 |     output wire [WIDTH-1:0]         output_unencoded // 1 << output_encoded; not qualified by output_valid
45 | );
46 | 
47 | parameter LEVELS = WIDTH > 2 ? $clog2(WIDTH) : 1; // number of reduction levels, at least 1
48 | parameter W = 2**LEVELS; // padded input width (a power of two)
49 | 
50 | // zero-pad the input on the MSB side up to W bits (a power of two)
51 | wire [W-1:0] input_padded = {{W-WIDTH{1'b0}}, input_unencoded};
52 | 
53 | wire [W/2-1:0] stage_valid[LEVELS-1:0]; // per-level valid flags; level l drives bits [W/(2*2**l)-1:0]
54 | wire [W/2-1:0] stage_enc[LEVELS-1:0]; // per-level codes; level l packs one (l+1)-bit field per group
55 | 
56 | generate
57 |     genvar l, n; // l: reduction level, n: pair/group index within the level
58 | 
59 |     // process input bits; generate valid bit and encoded bit for each pair
60 |     for (n = 0; n < W/2; n = n + 1) begin : loop_in
61 |         assign stage_valid[0][n] = |input_padded[n*2+1:n*2]; // valid when either bit of the pair is set
62 |         if (LSB_HIGH_PRIORITY) begin
63 |             // bit 0 is highest priority
64 |             assign stage_enc[0][n] = !input_padded[n*2+0]; // 0 when the low bit is set, otherwise 1
65 |         end else begin
66 |             // bit 0 is lowest priority
67 |             assign stage_enc[0][n] = input_padded[n*2+1]; // 1 when the high bit is set, otherwise 0
68 |         end
69 |     end
70 | 
71 |     // compress down to single valid bit and encoded bus
72 |     for (l = 1; l < LEVELS; l = l + 1) begin : loop_levels
73 |         for (n = 0; n < W/(2*2**l); n = n + 1) begin : loop_compress // each level has half as many groups as the previous one
74 |             assign stage_valid[l][n] = |stage_valid[l-1][n*2+1:n*2]; // valid when either child group is valid
75 |             if (LSB_HIGH_PRIORITY) begin
76 |                 // bit 0 is highest priority: keep the low child's code (new MSB 0) when it is valid, else the high child's (new MSB 1)
77 |                 assign stage_enc[l][(n+1)*(l+1)-1:n*(l+1)] = stage_valid[l-1][n*2+0] ? {1'b0, stage_enc[l-1][(n*2+1)*l-1:(n*2+0)*l]} : {1'b1, stage_enc[l-1][(n*2+2)*l-1:(n*2+1)*l]};
78 |             end else begin
79 |                 // bit 0 is lowest priority: keep the high child's code (new MSB 1) when it is valid, else the low child's (new MSB 0)
80 |                 assign stage_enc[l][(n+1)*(l+1)-1:n*(l+1)] = stage_valid[l-1][n*2+1] ? {1'b1, stage_enc[l-1][(n*2+2)*l-1:(n*2+1)*l]} : {1'b0, stage_enc[l-1][(n*2+1)*l-1:(n*2+0)*l]};
81 |             end
82 |         end
83 |     end
84 | endgenerate
85 | 
86 | assign output_valid = stage_valid[LEVELS-1]; // the final level has a single group (n = 0); the 1-bit target keeps bit 0
87 | assign output_encoded = stage_enc[LEVELS-1]; // the target keeps the low $clog2(WIDTH) bits of the final level
88 | assign output_unencoded = 1 << output_encoded; // computed regardless of output_valid
89 | 
90 | endmodule
91 | 
92 | `resetall
93 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/verilog_eval_to_part_data.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | from dotenv import load_dotenv
  4 | load_dotenv()
  5 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR")))
  6 | import openai
  7 | import requests
  8 | import json
  9 | import copy
 10 | import time
 11 | import datetime
 12 | import shutil
 13 | import pandas as pd
 14 | import tiktoken
 15 | from openai.embeddings_utils import get_embedding, cosine_similarity
 16 | from ast import literal_eval
 17 | import numpy as np
 18 | import re 
 19 | from tqdm import tqdm
 20 | import tiktoken
 21 | from datasets import load_from_disk
 22 | from datasets import Dataset
 23 | from utils import *
 24 | 
 25 | llama2_prompt_with_memory ="""
 26 |     <s>[INST] <<SYS>>
 27 |     {system_message}
 28 |     <</SYS>>
 29 | 
 30 |     {chat_history} {human_input} [/INST]
 31 | """
 32 | 
 33 | llama2_prompt_without_memory ="""
 34 |     <s>[INST] <<SYS>>
 35 |     {system_message}
 36 |     <</SYS>>
 37 | 
 38 |     {human_input} [/INST]
 39 | """
 40 | 
 41 | llama2_prompt_without_memory_without_sys ="""
 42 | <s>[INST] {human_input} [/INST]
 43 | """
 44 | 
 45 | llama2_pompt_with_memory_without_sys ="""
 46 | <s>[INST] {chat_history} {human_input} [/INST]
 47 | """
 48 | 
 49 | llama2_memory_prompt ="""{human_input} [/INST] {model_reply}</s><s>[INST]"""
 50 | 
 51 | 
 52 | def eval_file_to_part_data(eval_file, data_dir, meta_data_dir):
 53 |     if os.path.exists(data_dir):
 54 |         shutil.rmtree(data_dir)
 55 |     os.makedirs(data_dir)
 56 |     if os.path.exists(meta_data_dir):
 57 |         shutil.rmtree(meta_data_dir)
 58 |     os.makedirs(meta_data_dir)
 59 |     code_pieces = []
 60 |     with open(eval_file, "r") as f:
 61 |         for line in f:
 62 |             code_pieces.append(json.loads(line))
 63 |     meta_data = {}
 64 |     for code_idx, code in tqdm(enumerate(code_pieces), total=len(code_pieces), desc="Preparing data"):
 65 |         code_name = code["task_id"] 
 66 |         module_header = code["prompt"].replace("top_module", code_name)
 67 |         code_content = module_header+code["canonical_solution"]
 68 | 
 69 |         #preprocess code
 70 |         output_str_list, module_name_list = part_verilog_module_string(code_content) 
 71 |         assert len(module_name_list) == 1
 72 |         assert code_name == module_name_list[0]
 73 |         code_content = output_str_list[0]
 74 | 
 75 |         #enter dummy meta data
 76 |         meta_data[code_name] = {"code_name": code_name, "module_inst_list": []}
 77 | 
 78 |         #save code to file 
 79 |         code_file = os.path.join(data_dir, code_name+".v")
 80 |         with open(code_file, "w") as f:
 81 |             f.write(code_content)
 82 |     #save meta data to file
 83 |     meta_data_file = os.path.join(meta_data_dir, "codes.json")
 84 |     with open(meta_data_file, "w") as f:
 85 |         json.dump(meta_data, f, indent=4)
 86 | 
 87 |     return data_dir
 88 | 
 89 | 
 90 | 
 91 | 
 92 | if __name__ == "__main__":
 93 |     import argparse
 94 |     parser = argparse.ArgumentParser()
 95 |     parser.add_argument("--eval_file", type=str, default="/home/user_name/DAC_2024/chatgpt4_auto_accel/verilog_eval/data/VerilogEval_Machine.jsonl")
 96 |     parser.add_argument("--data_dir", type=str, default="/home/user_name/DAC_2024/ckpt3_user_name_valid_content_renamed/part10")
 97 |     parser.add_argument("--meta_data_dir", type=str, default="/home/user_name/DAC_2024/ckpt3_user_name_valid_content_code_metadata/part10")
 98 |     args = parser.parse_args()
 99 |     eval_file = args.eval_file
100 |     data_dir = args.data_dir
101 |     meta_data_dir = args.meta_data_dir
102 |     eval_file_to_part_data(eval_file, data_dir, meta_data_dir)
103 |                         


--------------------------------------------------------------------------------
/auto_data_gen_val/gen_block_summaries.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import shutil
 4 | import argparse
 5 | from dotenv import load_dotenv
 6 | load_dotenv()
 7 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR")))
 8 | from embedding_lookup_utils import CodeDataset
 9 | from langchain.callbacks import get_openai_callback #with get_openai_callback() as cb:
10 | 
if __name__ == "__main__":
    # Command-line interface: two positional integers select the code parts to
    # process, the remaining options tune locations and summarisation.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("start_id", help="start id of the code parts", type=int)
    arg_parser.add_argument("total_code_parts", help="total number of code parts", type=int)
    arg_parser.add_argument("--documented_code_dir", help="documented code directory", type=str, default="/home/user_name/DAC_2024/ckpts/")
    arg_parser.add_argument("--code_metadata_dir", help="code metadata directory", type=str, default="/home/user_name/DAC_2024/ckpt3_user_name_valid_content_code_metadata/")
    arg_parser.add_argument("--block_line_length", help="block line length", type=int, default=10)
    arg_parser.add_argument("--model", help="model", type=str, default="gpt-3.5-turbo-1106")
    cli_args = arg_parser.parse_args()

    dataset_metadata_dir = os.path.join(cli_args.documented_code_dir, "dataset_metadata")
    if not os.path.exists(dataset_metadata_dir):
        os.makedirs(dataset_metadata_dir)

    # A single callback object is shared by every part processed below.
    with get_openai_callback() as cb:
        # total_code_parts is the exclusive upper bound of the part ids.
        for code_part_id in range(cli_args.start_id, cli_args.total_code_parts):
            part_name = "part{}".format(code_part_id)
            part_metadata_dir = "{}/part{}".format(dataset_metadata_dir, code_part_id)
            if not os.path.exists(part_metadata_dir):
                os.makedirs(part_metadata_dir)

            codedb = CodeDataset(
                os.path.join(cli_args.documented_code_dir, part_name),
                bookkeeping_dir="{}/part{}/bookkeeping/".format(dataset_metadata_dir, code_part_id),
                vectorembedding_dir="{}/part{}/vectorembedding/".format(dataset_metadata_dir, code_part_id),
                force_refresh=False,
                cb=cb,
            )

            # Both CSV directories live under the same per-part root.
            csv_src_root = os.path.join(cli_args.code_metadata_dir, part_name, "assets", "verilog", "code_and_comment_src", "csv_src")
            codedb.load_and_split_code(
                skip_small_doc=True,
                split_by_line=True,
                line_length=cli_args.block_line_length,
                based_on_code_lines_only=True,
                csv_code_dir=os.path.join(csv_src_root, "csv_code_src"),
                csv_comment_dir=os.path.join(csv_src_root, "csv_new_comment_src"),
            )
            codedb.init_vectorstore(block_summary_model=cli_args.model)
            codedb.supplement_summary(block_summary_placeholding=False)
            codedb.save_block_summary(
                "{}/part{}/block_summary.json".format(dataset_metadata_dir, code_part_id),
                split_by_line=True,
            )
61 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/requirements.txt:
--------------------------------------------------------------------------------
  1 | adal==1.2.7
  2 | aiohttp==3.8.5
  3 | aiosignal==1.3.1
  4 | asttokens==2.2.1
  5 | async-timeout==4.0.2
  6 | attrs==23.1.0
  7 | azure-common==1.1.28
  8 | azure-core==1.29.2
  9 | backcall==0.2.0
 10 | beautifulsoup4==4.12.2
 11 | blinker==1.6.2
 12 | certifi==2023.7.22
 13 | cffi==1.15.1
 14 | chardet==5.2.0
 15 | charset-normalizer==3.2.0
 16 | click==8.1.6
 17 | cloudpickle==2.2.1
 18 | cmake==3.27.1
 19 | comm==0.1.4
 20 | contourpy==1.1.0
 21 | cryptography==41.0.3
 22 | cycler==0.11.0
 23 | dataclasses-json==0.5.14
 24 | decorator==5.1.1
 25 | docx2txt==0.8
 26 | dominate==2.8.0
 27 | executing==1.2.0
 28 | filelock==3.12.2
 29 | filetype==1.2.0
 30 | Flask==2.2.3
 31 | Flask-Bootstrap==3.3.7.1
 32 | fonttools==4.42.0
 33 | frozenlist==1.4.0
 34 | fsspec==2023.6.0
 35 | gevent==23.7.0
 36 | greenlet==2.0.2
 37 | huggingface-hub==0.16.4
 38 | idna==3.4
 39 | importlib-metadata==6.8.0
 40 | importlib-resources==6.0.1
 41 | ipython==8.12.2
 42 | ipywidgets==8.1.0
 43 | isodate==0.6.1
 44 | itsdangerous==2.1.2
 45 | jedi==0.19.0
 46 | Jinja2==3.1.2
 47 | joblib==1.3.1
 48 | jupyterlab-widgets==3.0.8
 49 | kiwisolver==1.4.4
 50 | langchain==0.0.257
 51 | langsmith==0.0.19
 52 | lit==16.0.6
 53 | llama-index==0.7.21
 54 | llvmlite==0.40.1
 55 | load-dotenv==0.1.0
 56 | lxml==4.9.3
 57 | MarkupSafe==2.1.3
 58 | marshmallow==3.20.1
 59 | matplotlib==3.7.2
 60 | matplotlib-inline==0.1.6
 61 | microsoft-bing-autosuggest==1.0.0
 62 | microsoft-bing-customimagesearch==1.0.0
 63 | microsoft-bing-customwebsearch==1.0.0
 64 | microsoft-bing-entitysearch==1.0.0
 65 | microsoft-bing-imagesearch==1.0.0
 66 | microsoft-bing-newssearch==1.0.0
 67 | microsoft-bing-spellcheck==1.0.0
 68 | microsoft-bing-videosearch==1.0.0
 69 | microsoft-bing-visualsearch==1.0.0
 70 | microsoft-bing-websearch==1.0.0
 71 | more-itertools==10.1.0
 72 | mpmath==1.3.0
 73 | msrest==0.7.1
 74 | msrestazure==0.6.4
 75 | multidict==6.0.4
 76 | mypy-extensions==1.0.0
 77 | nest-asyncio==1.5.7
 78 | networkx==3.1
 79 | nltk==3.8.1
 80 | numba==0.57.1
 81 | numexpr==2.8.5
 82 | numpy==1.24.4
 83 | nvidia-cublas-cu11==11.10.3.66
 84 | nvidia-cuda-cupti-cu11==11.7.101
 85 | nvidia-cuda-nvrtc-cu11==11.7.99
 86 | nvidia-cuda-runtime-cu11==11.7.99
 87 | nvidia-cudnn-cu11==8.5.0.96
 88 | nvidia-cufft-cu11==10.9.0.58
 89 | nvidia-curand-cu11==10.2.10.91
 90 | nvidia-cusolver-cu11==11.4.0.1
 91 | nvidia-cusparse-cu11==11.7.4.91
 92 | nvidia-nccl-cu11==2.14.3
 93 | nvidia-nvtx-cu11==11.7.91
 94 | oauthlib==3.2.2
 95 | openai==0.27.8
 96 | openai-whisper @ git+https://github.com/openai/whisper.git@e8622f9afc4eba139bf796c210f5c01081000472
 97 | openapi-schema-pydantic==1.2.4
 98 | packaging==23.1
 99 | pandas==2.0.3
100 | parso==0.8.3
101 | pexpect==4.8.0
102 | pickleshare==0.7.5
103 | Pillow==9.4.0
104 | plotly==5.15.0
105 | prompt-toolkit==3.0.39
106 | psutil==5.9.5
107 | ptyprocess==0.7.0
108 | pure-eval==0.2.2
109 | pycparser==2.21
110 | pydantic==1.10.12
111 | pydub==0.25.1
112 | Pygments==2.16.1
113 | PyJWT==2.8.0
114 | pyparsing==3.0.9
115 | PyPDF2==3.0.1
116 | python-dateutil==2.8.2
117 | python-dotenv==1.0.0
118 | python-magic==0.4.27
119 | python-pptx==0.6.21
120 | pytz==2023.3
121 | PyYAML==6.0.1
122 | regex==2023.6.3
123 | requests==2.31.0
124 | requests-oauthlib==1.3.1
125 | scikit-learn==1.3.0
126 | scipy==1.10.1
127 | sentencepiece==0.1.97
128 | six==1.16.0
129 | soupsieve==2.4.1
130 | SQLAlchemy==2.0.19
131 | stack-data==0.6.2
132 | sympy==1.12
133 | tabulate==0.9.0
134 | tenacity==8.2.2
135 | threadpoolctl==3.2.0
136 | tiktoken==0.3.3
137 | tokenizers==0.13.3
138 | torch==2.0.1
139 | tornado==6.3.2
140 | tqdm==4.65.0
141 | traitlets==5.9.0
142 | transformers==4.27.3
143 | triton==2.0.0
144 | typing-inspect==0.9.0
145 | typing_extensions==4.7.1
146 | tzdata==2023.3
147 | unstructured==0.9.1
148 | urllib3==1.26.16
149 | visitor==0.1.3
150 | wcwidth==0.2.6
151 | Werkzeug==2.3.6
152 | widgetsnbextension==4.0.8
153 | xgboost==1.7.6
154 | XlsxWriter==3.1.2
155 | yarl==1.9.2
156 | zipp==3.16.2
157 | zope.event==5.0
158 | zope.interface==6.0
159 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/code_validate.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | from dotenv import load_dotenv
  4 | load_dotenv()
  5 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR")))
  6 | 
  7 | sys.path.append("../verilog_eval/verilog_eval")
  8 | from evaluation import evaluate_functional_correctness
  9 | 
 10 | import requests
 11 | import json
 12 | import uuid
 13 | from io import StringIO  
 14 | import copy
 15 | import time
 16 | import datetime
 17 | import shutil
 18 | import pandas as pd
 19 | import tiktoken
 20 | from openai.embeddings_utils import get_embedding, cosine_similarity
 21 | from ast import literal_eval
 22 | import numpy as np
 23 | from utils import *
 24 | from tqdm import tqdm
 25 | import jsonlines
 26 | 
 27 | from chain_utils import SimpleConverseChain
 28 | from pyverilog.vparser.parser import parse
 29 | from datasets import load_dataset, load_from_disk, Dataset
 30 | 
 31 | 
 32 | def compile_syntax_check(code_str):
 33 |     row = code_str
 34 |     file_id = str(uuid.uuid4()) 
 35 |     path = "tmp/file{}.v".format(file_id)
 36 |     asset_dir = "tmp/asset{}".format(file_id)
 37 |     
 38 |     #check if tmp dir exists
 39 |     if not os.path.exists("tmp"):
 40 |         os.makedirs("tmp", exist_ok=True)
 41 | 
 42 |     #check and make asset dir
 43 |     if not os.path.exists(asset_dir):
 44 |         os.makedirs(os.path.dirname(asset_dir), exist_ok=True)
 45 | 
 46 |     with open(path, "w") as f:
 47 |         f.write(row)
 48 | 
 49 |     try:
 50 |         ast, directives = parse([path], debug=False, outputdir=asset_dir, preprocess_output="tmp/preprocess.output.{}".format(file_id))
 51 |         output = StringIO()
 52 |         ast.show(buf=output)
 53 |         for lineno, directive in directives:
 54 |             output.write('Line %d : %s' % (lineno, directive))
 55 |         #delete the file
 56 |         os.remove(path)
 57 |         shutil.rmtree(asset_dir)
 58 |         return True
 59 |     except Exception as e:
 60 |         #delete the file
 61 |         os.remove(path)
 62 |         shutil.rmtree(asset_dir)
 63 |         return False
 64 | 
 65 | def reverse_codegen(description, code_str, model="gpt-4-0613", max_trials=10):
 66 |     system_prompt = "You only complete chats with syntax correct Verilog code. End the Verilog module code completion with 'endmodule'. Do not include module, input and output definitions."
 67 |     question_prompt = "Implement the Verilog module based on the following description. Assume that signals are positive clock/clk edge triggered unless otherwise stated."
 68 |     problem_description = "\n\n {description} \n\n Module header:\n\n {module_header}\n"
 69 |     #retrieve the module header
 70 |     module_header = extract_module_header(code_str, code_str=True)
 71 |     #generate the prompt
 72 |     user_prompt = question_prompt + problem_description.format(description=description, module_header=module_header)
 73 |     chain = SimpleConverseChain(system_prompt=system_prompt, model=model, temperature=0.7, max_tokens=512, top_p=0.95, have_memory=False, verbose=False)
 74 |     for trial in range(max_trials):
 75 |         completion = chain.chat(user_prompt, system_prompt=system_prompt)
 76 |         #check if the completion is valid
 77 |         if compile_syntax_check(completion):
 78 |             return True, completion
 79 |     return False, completion
 80 |     
 81 |     
 82 | 
 83 | 
 84 | if __name__ == "__main__":
 85 |     import argparse
 86 |     parser = argparse.ArgumentParser()
 87 |     parser.add_argument("--dataset_dir", type=str, default="/home/user_name/DAC_2024/sft_dataset/detail_description_dataset")
 88 |     parser.add_argument("--output_dir", type=str, default="/home/user_name/DAC_2024/sft_dataset/detail_description_dataset_val")
 89 |     dataset_dir = parser.parse_args().dataset_dir
 90 |     output_dir = parser.parse_args().output_dir
 91 | 
 92 |     #load the dataset
 93 |     generated_dataset = load_from_disk(dataset_dir)
 94 | 
 95 |     new_dataset = {"code": [], "description": []}
 96 |     for i in range(len(generated_dataset)):
 97 |         code_str = generated_dataset[i]["code"]
 98 |         passed, completion = reverse_codegen(generated_dataset[i]["description"], code_str)
 99 |         if passed:
100 |             new_dataset["code"].append(code_str)
101 |             new_dataset["description"].append(generated_dataset[i]["description"])
102 |     new_dataset = Dataset.from_dict(new_dataset)
103 |     new_dataset.save_to_disk(output_dir)
104 |         
105 |     
106 |         
107 | 
108 | 
109 | 
110 | 


--------------------------------------------------------------------------------
/model_eval_qlora/standalone_eval.py:
--------------------------------------------------------------------------------
  1 | import jsonlines
  2 | import sys
  3 | import tiktoken
  4 | import matplotlib.pyplot as plt
  5 | import seaborn as sns
  6 | import numpy as np
  7 | 
  8 | sys.path.append("../verilog_eval/verilog_eval")
  9 | from evaluation import evaluate_functional_correctness
 10 | 
 11 | def process_jsonl_file(src_file, dst_file):
 12 |     with jsonlines.open(src_file) as reader:
 13 |         with jsonlines.open(dst_file, mode='w') as writer:
 14 |             for obj in reader:
 15 |                 split = obj['completion'].split(';', 1)
 16 |                 if len(split) > 1:
 17 |                     obj['completion'] = split[1]
 18 |                     writer.write(obj)
 19 |                 else:
 20 |                     writer.write(obj)
 21 | 
 22 | 
 23 | 
 24 | def evaluate(gen_file, prob_file):
 25 |     res = evaluate_functional_correctness(gen_file, problem_file=prob_file, k=[1,5,10])
 26 |     print("Eval Results:", res)
 27 | 
 28 | def results_profile(result_file, prob_file):
 29 |     tokenizer = tiktoken.encoding_for_model("gpt-4")
 30 |     passed_list = []
 31 |     failed_list = []
 32 |     with jsonlines.open(result_file) as reader:
 33 |         for obj in reader:
 34 |             if obj['passed']:
 35 |                 passed_list.append(obj)
 36 |             else:
 37 |                 failed_list.append(obj)
 38 | 
 39 |     problems = {}
 40 |     with jsonlines.open(prob_file) as reader:
 41 |         for obj in reader:
 42 |             problems[obj['task_id']] = obj
 43 |     
 44 |     for obj in passed_list:
 45 |         obj['module_header'] = problems[obj['task_id']]['prompt']
 46 |         obj['canonical_solution'] = problems[obj['task_id']]['canonical_solution']
 47 |         obj["code_lines"] = len(obj['module_header'].split('\n')) + len(obj['canonical_solution'].split('\n'))
 48 |         obj["code_token_count"] = len(tokenizer.encode(obj["module_header"] + "\n" + obj["canonical_solution"]))
 49 |         obj["prompt_token_count"] = len(tokenizer.encode(obj["prompt"]))
 50 | 
 51 |     for obj in failed_list:
 52 |         obj['module_header'] = problems[obj['task_id']]['prompt']
 53 |         obj['canonical_solution'] = problems[obj['task_id']]['canonical_solution']
 54 |         obj["code_lines"] = len(obj['module_header'].split('\n')) + len(obj['canonical_solution'].split('\n'))
 55 |         obj["code_token_count"] = len(tokenizer.encode(obj["module_header"] + "\n" + obj["canonical_solution"]))
 56 |         obj["prompt_token_count"] = len(tokenizer.encode(obj["prompt"]))
 57 | 
 58 | 
 59 | 
 60 |     data1 = [obj["code_token_count"] for obj in passed_list]
 61 |     data2 = [obj["code_token_count"] for obj in failed_list]
 62 |     data3 = data1 + data2
 63 | 
 64 | 
 65 |     # Plotting the distributions
 66 |     sns.set(style="whitegrid")  # Setting the style of the plot
 67 |     plt.figure(figsize=(10, 6))  # Setting the size of the plot
 68 |     #bin size 10 
 69 |     sns.histplot(data1, kde=True, color="blue", label="Passed", bins=10)
 70 |     sns.histplot(data2, kde=True, color="red", label="Failed", bins=10)
 71 | 
 72 |     plt.title('Distribution of Code Token Count')
 73 |     plt.xlabel('Code Token Count')
 74 |     plt.ylabel('Frequency')
 75 |     #save figure
 76 |     plt.savefig("passed_code_token_count.png")
 77 |     plt.clf()
 78 | 
 79 | 
 80 | 
 81 |     # Define common bin edges
 82 |     bins = np.linspace(min(np.min(data) for data in [data1, data2, data3]), 
 83 |                     max(np.max(data) for data in [data1, data2, data3]), 
 84 |                     10)
 85 |     # Calculate histograms
 86 |     hist1, _ = np.histogram(data1, bins=bins)
 87 |     hist2, _ = np.histogram(data2, bins=bins)
 88 |     hist3, _ = np.histogram(data3, bins=bins)
 89 |     # Normalize histograms
 90 |     normalized_hist1 = hist1 / (hist3 + 1e-6)  # Adding a small constant to avoid division by zero
 91 |     normalized_hist2 = hist2 / (hist3 + 1e-6)
 92 |     # Plotting
 93 |     plt.figure(figsize=(10, 6))
 94 | 
 95 |     plt.plot(bins[:-1], normalized_hist1, label='Normalized Dataset 1', marker='o', color="blue")
 96 |     plt.plot(bins[:-1], normalized_hist2, label='Normalized Dataset 2', marker='o', color="red")
 97 | 
 98 |     plt.title('Success / Failure Rates')
 99 |     plt.xlabel('Code Token Count')
100 |     plt.ylabel('Success / Failure Rates')
101 |     #save figure
102 |     plt.savefig("success_failure_rates.png")
103 |     plt.clf()
104 | 
105 | 
106 | 
107 |     
108 |         
109 | 
110 | 
if __name__ == '__main__':
    # Input locations, relative to the current working directory.
    gen_file = "./data/gen.jsonl"
    prob_file = "../verilog_eval/data/VerilogEval_Machine.jsonl"
    result_file = "./data/gen.jsonl_results.jsonl"

    # Earlier pipeline stages, currently disabled:
    #process_jsonl_file(gen_file, "test.jsonl")
    #evaluate(gen_file="test.jsonl", prob_file=prob_file)

    # Only the profiling of existing results is run.
    results_profile(result_file=result_file, prob_file=prob_file)
118 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/preliminary_exp.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | from dotenv import load_dotenv
  4 | load_dotenv()
  5 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR")))
  6 | 
  7 | sys.path.append("../verilog_eval/verilog_eval")
  8 | from evaluation import evaluate_functional_correctness
  9 | 
 10 | import openai
 11 | import requests
 12 | import json
 13 | import copy
 14 | import time
 15 | import datetime
 16 | import shutil
 17 | import pandas as pd
 18 | import tiktoken
 19 | from openai.embeddings_utils import get_embedding, cosine_similarity
 20 | from ast import literal_eval
 21 | import numpy as np
 22 | from utils import *
 23 | from tqdm import tqdm
 24 | import jsonlines
 25 | 
 26 | 
 27 | from chain_utils import gen_block_summary_chain, func_name_lookup_chain, VerilogEval, detail_steps_chain, openai_chat
 28 | 
 29 | from embedding_lookup_utils import openai_chat, validate_global_summary_openai
 30 | 
 31 | def reverse_code_gen_openai(desc_file, eval_file, result_file, repeat=10):
 32 |     desc_list = []
 33 |     with jsonlines.open(desc_file) as reader:
 34 |         for obj in reader:
 35 |             desc_list.append(obj)
 36 |     results = []
 37 |     for obj in desc_list:
 38 |         for r in range(repeat):
 39 |             print("task_id: {}".format(obj["task_id"]))
 40 |             task_id = obj["task_id"]
 41 |             # print("description: {}".format(desc_dict[task_id]))
 42 |             passed, code = validate_global_summary_openai(obj["detail_description"], task_id, eval_file, max_trials=1)
 43 |             results.append({"task_id":task_id, "completion": code, "passed":passed})
 44 |             print("passed: {}".format(passed))
 45 |     with jsonlines.open(result_file, "w") as writer:
 46 |         writer.write_all(results)
 47 |         
 48 | 
 49 |             
 50 |  
 51 | 
 52 | 
 53 | 
 54 | if __name__ == "__main__":
 55 |     verilogeval0 = VerilogEval(model="llama2")
 56 |     example_cstr_json = "/home/user_name/DAC_2024/chatgpt4_auto_accel/fine_tune_dataset/auto_doc_part_dataset/verilogeval_datagen/example_code_strings.json"
 57 |     with open(example_cstr_json, "r") as f:
 58 |         example_code_strings = json.load(f)
 59 |     example_code_description_file = "/home/user_name/DAC_2024/verilogeval/verilog-eval/descriptions/VerilogDescription_Machine.jsonl"
 60 |     eval_file = "/home/user_name/DAC_2024/verilogeval/verilog-eval/data/VerilogEval_Machine.jsonl"
 61 |     global_summary_chain = verilogeval0.verilog_eval_sft_data
 62 |     code_gen_chain = verilogeval0.code_gen
 63 | 
 64 |     tested_model = "llama2-70b"
 65 |     generated_description_file = "gen_{}.jsonl".format(tested_model)
 66 |     reverse_code_gen_openai_file = "reverse_code_gen_{}-openai-gpt4.jsonl".format(tested_model)
 67 |     reverse_code_gen_file = "reverse_code_gen_{}.jsonl".format(tested_model)
 68 |     repeat_times = 10 
 69 | 
 70 |     gen_description = False
 71 |     openai_code_gen = True
 72 | 
 73 | 
 74 |     reverse_code_gen = False
 75 | 
 76 |     if gen_description:
 77 |         #read the code_content from eval_file
 78 |         code_content = {}
 79 |         with jsonlines.open(eval_file) as reader:
 80 |             for obj in reader:
 81 |                 code_content[obj["task_id"]] = obj["prompt"] + "\n"+obj["canonical_solution"]
 82 |         generated_description = []
 83 |         for task_id in code_content:
 84 |             print("generating description for task_id: {}".format(task_id))
 85 |             for i in range(repeat_times):
 86 |                 print(i, end = " ")
 87 |                 description = global_summary_chain(   code_content[task_id],
 88 |                                         example_code_description_file=example_code_description_file,
 89 |                                         example_code_strings=example_code_strings,
 90 |                                         desc_key="detail_description")
 91 |                 #append task_id and description to generated_description
 92 |                 generated_description.append({"task_id":task_id, "detail_description":description})
 93 |             print(len(generated_description))
 94 |             print()
 95 |         #store in a jsonl file
 96 |         with jsonlines.open(generated_description_file, "w") as writer:
 97 |             writer.write_all(generated_description)
 98 | 
 99 |     if openai_code_gen:
100 |         generated_description_file = "/home/user_name/DAC_2024/chatgpt4_auto_accel/fine_tune_dataset/auto_doc_part_dataset/hdlbits_description.jsonl"
101 |         #reverse code generation
102 |         reverse_code_gen_openai(generated_description_file, eval_file, reverse_code_gen_openai_file)
103 | 
104 |     if reverse_code_gen:
105 |         #code gen from tested model
106 |         code_gen_chain(example_code_description_file, eval_file=eval_file, result_file=reverse_code_gen_file, repeat=repeat_times)
107 | 
108 | 
109 | 
110 |     
111 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/code_preprocesser.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | from dotenv import load_dotenv
 4 | load_dotenv()
 5 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR")))
 6 | import openai
 7 | import requests
 8 | import json
 9 | import copy
10 | import time
11 | import datetime
12 | import shutil
13 | from utils import *
14 | 
15 | 
def folder_create(folder_name):
    """Make sure ``folder_name`` exists, asking before wiping an existing one.

    A missing directory is created together with any missing parents. If the
    directory already exists, the user is prompted on stdin: answering ``y``
    deletes and recreates it, any other answer leaves it untouched.
    """
    if not os.path.exists(folder_name):
        # makedirs also creates the intermediate directories.
        os.makedirs(folder_name)
        return

    print("The directory {} already exists. Do you want to delete it and create a new one?".format(folder_name))
    print("Type 'y' for yes and 'n' for no.")
    if input() == "y":
        shutil.rmtree(folder_name)
        os.makedirs(folder_name)
        return

    print("Leave the directory as it is.")
30 | 
31 | 
class CodePreprocesser:
    """Prepare source files for documentation: copy them and split them into CSVs.

    The constructor records all working directories and makes sure each output
    directory exists via ``folder_create`` (which may prompt on stdin when a
    directory already exists).
    """

    def __init__(self, code_dir, store_src_code_dir,
                 csv_code_dir, csv_comment_dir, csv_new_comment_dir,
                 csv_pure_gen_comment_dir, code_summary_dir, documented_code_dir,
                 code_suffix=None, discard_original_comment = False):
        """Store the configuration and create the output directories.

        Args:
            code_dir: Directory holding the raw source code.
            store_src_code_dir: Directory the source files are copied into.
            csv_code_dir: Directory for the per-file code CSVs.
            csv_comment_dir: Directory for the per-file comment CSVs.
            csv_new_comment_dir: Output directory (created here, not otherwise
                used by this class).
            csv_pure_gen_comment_dir: Output directory (created here only).
            code_summary_dir: Output directory (created here only).
            documented_code_dir: Output directory (created here only).
            code_suffix: File suffixes treated as source code; defaults to
                ``[".v", ".sv", ".vh"]`` when omitted.
            discard_original_comment: Forwarded to
                ``convert_raw_src_code_to_csv`` by ``create_code_assets``.
        """
        self.code_dir = code_dir
        # A fresh list per instance avoids sharing one mutable default.
        self.code_suffix = [".v", ".sv", ".vh"] if code_suffix is None else code_suffix
        self.store_src_code_dir = store_src_code_dir
        self.csv_code_dir = csv_code_dir
        self.csv_comment_dir = csv_comment_dir
        self.csv_new_comment_dir = csv_new_comment_dir
        self.csv_pure_gen_comment_dir = csv_pure_gen_comment_dir
        self.code_summary_dir = code_summary_dir
        self.documented_code_dir = documented_code_dir
        self.discard_original_comment = discard_original_comment
        # Filled by raw_code_copy; defined here so create_code_assets is safe
        # to call on a fresh instance.
        self.code_files = []
        #check if the directory exists
        for folder in (self.store_src_code_dir, self.csv_code_dir,
                       self.csv_comment_dir, self.csv_new_comment_dir,
                       self.csv_pure_gen_comment_dir, self.code_summary_dir,
                       self.documented_code_dir):
            folder_create(folder)

    def raw_code_copy(self, src_dir, dst_dir, skip_preprocess = False):
        """Collect the source files in ``src_dir`` and optionally copy them.

        Files whose name ends with one of ``self.code_suffix`` are recorded in
        ``self.code_files`` (replacing any earlier list). Unless
        ``skip_preprocess`` is true, each of them is also copied to ``dst_dir``.
        """
        suffixes = tuple(self.code_suffix)
        self.code_files = []
        for file in os.listdir(src_dir):
            if file.endswith(suffixes):
                if not skip_preprocess:
                    shutil.copy(os.path.join(src_dir, file), dst_dir)
                self.code_files.append(file)

    def create_code_assets(self):
        """Split every collected source file into a code CSV and a comment CSV.

        The CSV name is the part of the file name before its first dot.
        """
        #separate the comments and code and create corresponding csv files
        for code_file in tqdm(self.code_files, total=len(self.code_files), desc="Creating code assets"):
            csv_name = code_file.split(".")[0] + ".csv"
            src_code_file = os.path.join(self.store_src_code_dir, code_file)
            csv_code_file = os.path.join(self.csv_code_dir, csv_name)
            csv_comment_file = os.path.join(self.csv_comment_dir, csv_name)
            convert_raw_src_code_to_csv(src_code_file, csv_code_file, csv_comment_file, discard_original_comment = self.discard_original_comment)

    def pre_process_routines(self, dst_dir, discard_original_comment = True, rtl = True):
        """Run ``preprocess`` on every entry of ``dst_dir``."""
        for file in os.listdir(dst_dir):
            preprocess(os.path.join(dst_dir, file),discard_original_comment=discard_original_comment, rtl=rtl)
76 | 
if __name__ == "__main__":
    # Script entry point: copy the raw sources and build their CSV assets.
    # All output locations come from environment variables (loaded from .env
    # at import time).
    code_dir = "../verilog/AccDNN/verilog"

    # The target language selects which file suffixes count as source code.
    target_lang = os.environ.get("TARGET_LANG")
    if target_lang == "verilog":
        code_suffix = [".v", ".sv", ".vh"]
    elif target_lang == "xilinx_hls":
        code_suffix = [".c", ".cpp", ".h", ".hpp"]
    else:
        # Fail with a clear message instead of a NameError on code_suffix
        # further down.
        raise ValueError(
            "Unsupported TARGET_LANG {!r}; expected 'verilog' or 'xilinx_hls'.".format(target_lang)
        )

    store_src_code_dir = os.environ.get("STORE_SRC_CODE_DIR")
    csv_code_dir = os.environ.get("CSV_CODE_DIR")
    csv_comment_dir = os.environ.get("CSV_COMMENT_DIR")
    csv_new_comment_dir = os.environ.get("CSV_NEW_COMMENT_DIR")
    csv_pure_gen_comment_dir = os.environ.get("CSV_PURE_GEN_COMMENT_DIR")
    code_summary_dir = os.environ.get("CODE_SUMMARY_DIR")
    documented_code_dir = os.environ.get("DOCUMENTED_CODE_DIR")

    code_preprocesser = CodePreprocesser(code_dir, store_src_code_dir,
                                         csv_code_dir, csv_comment_dir,
                                         csv_new_comment_dir, csv_pure_gen_comment_dir,
                                         code_summary_dir, documented_code_dir,
                                         code_suffix=code_suffix, discard_original_comment = False)
    code_preprocesser.raw_code_copy(code_dir, store_src_code_dir)
    code_preprocesser.create_code_assets()
98 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/preprocess_data/example_code_strings_detailed_instructions.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "lemmings1": "module top_module (\n\tinput clk,\n\tinput areset,\n\tinput bump_left,\n\tinput bump_right,\n\toutput walk_left,\n\toutput walk_right\n);\n\tparameter WL=0, WR=1;\n\treg state;\n\treg next;\n    \n    always_comb begin\n\t\tcase (state)\n\t\t\tWL: next = bump_left ? WR : WL;\n\t\t\tWR: next = bump_right ? WL: WR;\n\t\tendcase\n    end\n    \n    always @(posedge clk, posedge areset) begin\n\t\tif (areset) state <= WL;\n        else state <= next;\n\tend\n\t\t\n\tassign walk_left = (state==WL);\n\tassign walk_right = (state==WR);\n\n\t\nendmodule\n",
 3 |     "rotate100": "module top_module(\n\tinput clk,\n\tinput load,\n\tinput [1:0] ena,\n\tinput [99:0] data,\n\toutput reg [99:0] q);\n\t\n\t\n\talways @(posedge clk) begin\n\t\tif (load)\n\t\t\tq <= data;\n\t\telse if (ena == 2'h1)\n\t\t\tq <= {q[0], q[99:1]};\n\t\telse if (ena == 2'h2)\n\t\t\tq <= {q[98:0], q[99]};\n\tend\nendmodule\n",
 4 |     "vector2": "module top_module (\n\tinput [31:0] in,\n\toutput [31:0] out\n);\n\n\tassign out = {in[7:0], in[15:8], in[23:16], in[31:24]};\t\n\t\nendmodule\n",
 5 |     "gatesv100": "module top_module (\n\tinput [99:0] in,\n\toutput [98:0] out_both,\n\toutput [99:1] out_any,\n\toutput [99:0] out_different\n);\n\n\tassign out_both = in & in[99:1];\n\tassign out_any = in | in[99:1];\n\tassign out_different = in^{in[0], in[99:1]};\n\t\nendmodule\n",
 6 |     "history_shift": "module top_module\n(\n    input clk,\n    input areset,\n    input predict_valid,\n    input predict_taken,\n    output logic [31:0] predict_history,\n    \n    input train_mispredicted,\n    input train_taken,\n    input [31:0] train_history\n);\n    always@(posedge clk, posedge areset)\n\t\tif (areset) begin\n\t\t\tpredict_history = 0;\n        end\telse begin\n\t\t\tif (train_mispredicted)\n\t\t\t\tpredict_history <= {train_history, train_taken};\n\t\t\telse if (predict_valid)\n\t\t\t\tpredict_history <= {predict_history, predict_taken};\n\t\tend\nendmodule\n",
 7 |     "ece241_2013_q2": "module top_module (\n\tinput a,\n\tinput b,\n\tinput c,\n\tinput d,\n\toutput out_sop,\n\toutput out_pos\n);\n\t\n\twire pos0, pos1;\n\tassign out_sop = c&d | ~a&~b&c;\n\tassign pos0 = c & (~b|d)&(~a|b);\n\tassign pos1 = c & (~b|d)&(~a|d);\n\t\n\tassign out_pos = (pos0 == pos1) ? pos0 : 1'bx;\nendmodule\n",
 8 |     "dff16e": "module top_module(\n\tinput clk,\n\tinput resetn,\n\tinput [1:0] byteena,\n\tinput [15:0] d,\n\toutput reg [15:0] q);\n\t\n\talways @(posedge clk) begin\n\t\tif (!resetn)\n\t\t\tq <= 0;\n\t\telse begin\n\t\t\tif (byteena[0])\n\t\t\t\tq[7:0] <= d[7:0];\n\t\t\tif (byteena[1])\n\t\t\t\tq[15:8] <= d[15:8];\n\t\tend\n\tend\n\t\nendmodule\n",
 9 |     "fsm2": "module top_module (\n\tinput clk,\n\tinput j,\n\tinput k,\n\tinput areset,\n\toutput out\n);\n\tparameter A=0, B=1;\n\treg state;\n\treg next;\n    \n    always_comb begin\n\t\tcase (state)\n\t\t\tA: next = j ? B : A;\n\t\t\tB: next = k ? A : B;\n\t\tendcase\n    end\n    \n    always @(posedge clk, posedge areset) begin\n\t\tif (areset) state <= A;\n        else state <= next;\n\tend\n\t\t\n\tassign out = (state==B);\n\n\t\nendmodule\n",
10 |     "vector100r": "module top_module (\n\tinput [99:0] in,\n\toutput reg [99:0] out\n);\n\t\n\talways_comb \n\t\tfor (int i=0;i<$bits(out);i++)\n\t\t\tout[i] = in[$bits(out)-i-1];\n\t\nendmodule\n",
11 |     "gatesv": "module top_module (\n\tinput [3:0] in,\n\toutput [2:0] out_both,\n\toutput [3:1] out_any,\n\toutput [3:0] out_different\n);\n\n\tassign out_both = in[2:0] & in[3:1];\n\tassign out_any = in[2:0] | in[3:1];\n\tassign out_different = in^{in[0], in[3:1]};\n\t\nendmodule\n",
12 |     "review2015_fsmseq": "module top_module(\n\tinput clk,\n\tinput reset,\n\tinput data,\n\toutput start_shifting);\n\n\tparameter S=0, S1=1, S11=2, S110=3, Done=4;\n\t\n\treg [2:0] state, next;\n\t\n\talways_comb begin\n\t\tcase (state)\n\t\t\tS: next = data ? S1: S;\n\t\t\tS1: next = data ? S11: S;\n\t\t\tS11: next = data ? S11 : S110;\n\t\t\tS110: next = data ? Done : S;\n\t\t\tDone: next = Done;\n\t\tendcase\n\tend\n\t\n\talways @(posedge clk)\n\t\tif (reset) state <= S;\n\t\telse state <= next;\n\t\t\n\tassign start_shifting = state == Done;\n\t\n\t\nendmodule\n",
13 |     "2014_q3bfsm": "module top_module (\n\tinput clk,\n\tinput reset,\n\tinput x,\n\toutput reg z\n);\n\tparameter A=0, B=1, C=2, D=3, E=4;\n\treg [2:0] state, next;\n\n\talways @(posedge clk) begin\n\t\tif (reset) state <= A;\n\t\telse state <= next;\n\tend\n\t\n\talways_comb begin\n\t\tcase (state)\n\t\t\tA: next = x ? B : A;\n\t\t\tB: next = x ? E : B;\n\t\t\tC: next = x ? B : C;\n\t\t\tD: next = x ? C : B;\n\t\t\tE: next = x ? E : D;\t\t\n\t\t\tdefault: next = 'x;\n\t\tendcase\n\tend\n\t    \n\tassign z = (state == D) || (state == E);\n\t\nendmodule\n",
14 |     "ece241_2013_q7": "module top_module (\n\tinput clk,\n\tinput j,\n\tinput k,\n\toutput reg Q\n);\n\n\talways @(posedge clk)\n\t\tQ <= j&~Q | ~k&Q;\n\t\nendmodule\n",
15 |     "edgecapture": "module top_module(\n\tinput clk,\n\tinput reset,\n\tinput [31:0] in,\n\toutput reg [31:0] out);\n\t\n\treg [31:0] d_last;\t\n\t\t\t\n\talways @(posedge clk) begin\n\t\td_last <= in;\n\t\tif (reset)\n\t\t\tout <= '0;\n\t\telse\n\t\t\tout <= out | (~in & d_last);\n\tend\n\t\nendmodule\n"
16 | }


--------------------------------------------------------------------------------
/verilog_eval/verilog_eval/evaluation.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict, Counter
  2 | from concurrent.futures import ProcessPoolExecutor, as_completed
  3 | from typing import List, Union, Iterable, Dict, Tuple, Optional
  4 | import itertools
  5 | 
  6 | import numpy as np
  7 | import tqdm
  8 | 
  9 | from verilog_eval.data import read_problems, stream_jsonl, write_jsonl
 10 | from verilog_eval.execution import check_correctness, clean_up_simulation
 11 | 
 12 | 
 13 | def estimate_pass_at_k(
 14 |     num_samples: Union[int, List[int], np.ndarray],
 15 |     num_correct: Union[List[int], np.ndarray],
 16 |     k: int
 17 | ) -> np.ndarray:
 18 |     """
 19 |     Estimates pass@k of each problem and returns them in an array.
 20 |     """
 21 | 
 22 |     def estimator(n: int, c: int, k: int) -> float:
 23 |         """
 24 |         Calculates 1 - comb(n - c, k) / comb(n, k).
 25 |         """
 26 |         if n - c < k:
 27 |             return 1.0
 28 |         return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
 29 | 
 30 |     if isinstance(num_samples, int):
 31 |         num_samples_it = itertools.repeat(num_samples, len(num_correct))
 32 |     else:
 33 |         assert len(num_samples) == len(num_correct)
 34 |         num_samples_it = iter(num_samples)
 35 | 
 36 |     return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
 37 | 
 38 | 
 39 | def contain_passing_completion(
 40 |     problem: Dict,
 41 |     completions: List[str],
 42 |     n_workers: int = 4,
 43 |     timeout: float = 30.0,
 44 |     unit_test_length: Optional[int] = None,
 45 |     clean_up: bool = True,
 46 | ) -> Tuple[bool, str]:
 47 | 
 48 |     with ProcessPoolExecutor(max_workers=n_workers) as executor:
 49 |         
 50 |         futures = []
 51 |         
 52 |         for idx, completion in enumerate(completions):
 53 |             args = (problem, completion, timeout, idx, unit_test_length)
 54 |             future = executor.submit(check_correctness, *args)
 55 |             futures.append(future)
 56 |             
 57 |         for future in as_completed(futures):
 58 |             result = future.result()
 59 |             if result["passed"]:
 60 |                 return True, completions[result["completion_id"]]
 61 |             
 62 |     if clean_up:
 63 |         clean_up_simulation()
 64 |             
 65 |     return False, ""
 66 |             
 67 | def evaluate_functional_correctness(
 68 |     sample_file: str,
 69 |     problem_file: str,
 70 |     k: List[int] = [1, 10, 100],
 71 |     n_workers: int = 4,
 72 |     timeout: float = 30.0,
 73 |     unit_test: bool = False,
 74 |     clean_up: bool = True,
 75 | ):
 76 |     """
 77 |     Evaluates the functional correctness of generated samples, and writes
 78 |     results to f"{sample_file}_results.jsonl.gz"
 79 |     """
 80 | 
 81 |     problems = read_problems(problem_file)
 82 | 
 83 |     # Check the generated samples against test suites.
 84 |     with ProcessPoolExecutor(max_workers=n_workers) as executor:
 85 | 
 86 |         futures = []
 87 |         completion_id = Counter()
 88 |         n_samples = 0
 89 |         results = defaultdict(list)
 90 | 
 91 |         print("Reading samples...")
 92 |         for sample in tqdm.tqdm(stream_jsonl(sample_file)):
 93 |             task_id = sample["task_id"]
 94 |             completion = sample["completion"]
 95 |             if unit_test:
 96 |                 args = (problems[task_id], completion, timeout, completion_id[task_id], 100)
 97 |             else:
 98 |                 args = (problems[task_id], completion, timeout, completion_id[task_id])
 99 |             future = executor.submit(check_correctness, *args)
100 |             futures.append(future)
101 |             completion_id[task_id] += 1
102 |             n_samples += 1
103 | 
104 |         assert len(completion_id) == len(problems), "Some problems are not attempted."
105 | 
106 |         print("Running test suites...")
107 |         for future in tqdm.tqdm(as_completed(futures), total=len(futures)):
108 |             result = future.result()
109 |             results[result["task_id"]].append((result["completion_id"], result))
110 |     
111 |     if clean_up:
112 |         clean_up_simulation()
113 | 
114 |     # Calculate pass@k.
115 |     total, correct = [], []
116 |     for result in results.values():
117 |         result.sort()
118 |         passed = [r[1]["passed"] for r in result]
119 |         total.append(len(passed))
120 |         correct.append(sum(passed))
121 |     total = np.array(total)
122 |     correct = np.array(correct)
123 | 
124 |     ks = k
125 |     pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean()
126 |                  for k in ks if (total >= k).all()}
127 | 
128 |     # Finally, save the results in one file:
129 |     def combine_results():
130 |         for sample in stream_jsonl(sample_file):
131 |             task_id = sample["task_id"]
132 |             result = results[task_id].pop(0)
133 |             sample["result"] = result[1]["result"]
134 |             sample["passed"] = result[1]["passed"]
135 |             yield sample
136 | 
137 |     out_file = sample_file + "_results.jsonl"
138 |     print(f"Writing results to {out_file}...")
139 |     write_jsonl(out_file, tqdm.tqdm(combine_results(), total=n_samples))
140 | 
141 |     return pass_at_k
142 | 


--------------------------------------------------------------------------------
/verilog_eval/build/lib/verilog_eval/evaluation.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict, Counter
  2 | from concurrent.futures import ProcessPoolExecutor, as_completed
  3 | from typing import List, Union, Iterable, Dict, Tuple, Optional
  4 | import itertools
  5 | 
  6 | import numpy as np
  7 | import tqdm
  8 | 
  9 | from verilog_eval.data import read_problems, stream_jsonl, write_jsonl
 10 | from verilog_eval.execution import check_correctness, clean_up_simulation
 11 | 
 12 | 
 13 | def estimate_pass_at_k(
 14 |     num_samples: Union[int, List[int], np.ndarray],
 15 |     num_correct: Union[List[int], np.ndarray],
 16 |     k: int
 17 | ) -> np.ndarray:
 18 |     """
 19 |     Estimates pass@k of each problem and returns them in an array.
 20 |     """
 21 | 
 22 |     def estimator(n: int, c: int, k: int) -> float:
 23 |         """
 24 |         Calculates 1 - comb(n - c, k) / comb(n, k).
 25 |         """
 26 |         if n - c < k:
 27 |             return 1.0
 28 |         return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
 29 | 
 30 |     if isinstance(num_samples, int):
 31 |         num_samples_it = itertools.repeat(num_samples, len(num_correct))
 32 |     else:
 33 |         assert len(num_samples) == len(num_correct)
 34 |         num_samples_it = iter(num_samples)
 35 | 
 36 |     return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
 37 | 
 38 | 
 39 | def contain_passing_completion(
 40 |     problem: Dict,
 41 |     completions: List[str],
 42 |     n_workers: int = 4,
 43 |     timeout: float = 30.0,
 44 |     unit_test_length: Optional[int] = None,
 45 |     clean_up: bool = True,
 46 | ) -> Tuple[bool, str]:
 47 | 
 48 |     with ProcessPoolExecutor(max_workers=n_workers) as executor:
 49 |         
 50 |         futures = []
 51 |         
 52 |         for idx, completion in enumerate(completions):
 53 |             args = (problem, completion, timeout, idx, unit_test_length)
 54 |             future = executor.submit(check_correctness, *args)
 55 |             futures.append(future)
 56 |             
 57 |         for future in as_completed(futures):
 58 |             result = future.result()
 59 |             if result["passed"]:
 60 |                 return True, completions[result["completion_id"]]
 61 |             
 62 |     if clean_up:
 63 |         clean_up_simulation()
 64 |             
 65 |     return False, ""
 66 |             
 67 | def evaluate_functional_correctness(
 68 |     sample_file: str,
 69 |     problem_file: str,
 70 |     k: List[int] = [1, 10, 100],
 71 |     n_workers: int = 4,
 72 |     timeout: float = 30.0,
 73 |     unit_test: bool = False,
 74 |     clean_up: bool = True,
 75 | ):
 76 |     """
 77 |     Evaluates the functional correctness of generated samples, and writes
 78 |     results to f"{sample_file}_results.jsonl.gz"
 79 |     """
 80 | 
 81 |     problems = read_problems(problem_file)
 82 | 
 83 |     # Check the generated samples against test suites.
 84 |     with ProcessPoolExecutor(max_workers=n_workers) as executor:
 85 | 
 86 |         futures = []
 87 |         completion_id = Counter()
 88 |         n_samples = 0
 89 |         results = defaultdict(list)
 90 | 
 91 |         print("Reading samples...")
 92 |         for sample in tqdm.tqdm(stream_jsonl(sample_file)):
 93 |             task_id = sample["task_id"]
 94 |             completion = sample["completion"]
 95 |             if unit_test:
 96 |                 args = (problems[task_id], completion, timeout, completion_id[task_id], 100)
 97 |             else:
 98 |                 args = (problems[task_id], completion, timeout, completion_id[task_id])
 99 |             future = executor.submit(check_correctness, *args)
100 |             futures.append(future)
101 |             completion_id[task_id] += 1
102 |             n_samples += 1
103 | 
104 |         assert len(completion_id) == len(problems), "Some problems are not attempted."
105 | 
106 |         print("Running test suites...")
107 |         for future in tqdm.tqdm(as_completed(futures), total=len(futures)):
108 |             result = future.result()
109 |             results[result["task_id"]].append((result["completion_id"], result))
110 |     
111 |     if clean_up:
112 |         clean_up_simulation()
113 | 
114 |     # Calculate pass@k.
115 |     total, correct = [], []
116 |     for result in results.values():
117 |         result.sort()
118 |         passed = [r[1]["passed"] for r in result]
119 |         total.append(len(passed))
120 |         correct.append(sum(passed))
121 |     total = np.array(total)
122 |     correct = np.array(correct)
123 | 
124 |     ks = k
125 |     pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean()
126 |                  for k in ks if (total >= k).all()}
127 | 
128 |     # Finally, save the results in one file:
129 |     def combine_results():
130 |         for sample in stream_jsonl(sample_file):
131 |             task_id = sample["task_id"]
132 |             result = results[task_id].pop(0)
133 |             sample["result"] = result[1]["result"]
134 |             sample["passed"] = result[1]["passed"]
135 |             yield sample
136 | 
137 |     out_file = sample_file + "_results.jsonl"
138 |     print(f"Writing results to {out_file}...")
139 |     write_jsonl(out_file, tqdm.tqdm(combine_results(), total=n_samples))
140 | 
141 |     return pass_at_k
142 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/gen_global_summary.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import shutil
 4 | import argparse
 5 | from dotenv import load_dotenv
 6 | load_dotenv()
 7 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR")))
 8 | from embedding_lookup_utils import CodeDataset
 9 | from langchain.callbacks import get_openai_callback #with get_openai_callback() as cb:
10 | 
if __name__ == "__main__":
    # Command line: two positional integers bounding the range of code-part
    # ids to process, plus optional directory/model overrides.
    parser = argparse.ArgumentParser()
    parser.add_argument("start_id", help="start id of the code parts", type=int)
    parser.add_argument("total_code_parts", help="total number of code parts", type=int)
    parser.add_argument("--documented_code_dir", help="documented code directory", type=str, default="/home/user_name/DAC_2024/ckpts/")
    parser.add_argument("--code_metadata_dir", help="code metadata directory", type=str, default="/home/user_name/DAC_2024/ckpt3_user_name_valid_content_code_metadata/")
    parser.add_argument("--model", help="model", type=str, default="gpt-3.5-turbo-1106")
    parser.add_argument("--detailed", action="store_true", help="detailed summary")
    args = parser.parse_args()

    code_part_start_id = args.start_id
    total_code_parts = args.total_code_parts
    documented_code_dir = args.documented_code_dir
    code_metadata_dir = args.code_metadata_dir
    model = args.model
    detailed = args.detailed

    # All generated metadata lives under <documented_code_dir>/dataset_metadata.
    dataset_metadata_dir = os.path.join(documented_code_dir, "dataset_metadata")
    if not os.path.exists(dataset_metadata_dir):
        os.makedirs(dataset_metadata_dir)

    with get_openai_callback() as cb:
        # Parts are processed from start_id up to (excluding) total_code_parts.
        for code_part_id in range(code_part_start_id, total_code_parts):
            part_name = f"part{code_part_id}"
            part_metadata_dir = f"{dataset_metadata_dir}/{part_name}"
            if not os.path.exists(part_metadata_dir):
                os.makedirs(part_metadata_dir)

            src_code_dir = os.path.join(documented_code_dir, part_name)
            codedb = CodeDataset(
                src_code_dir,
                bookkeeping_dir=f"{part_metadata_dir}/bookkeeping/",
                vectorembedding_dir=f"{part_metadata_dir}/vectorembedding/",
                force_refresh=False,
                cb=cb,
            )

            # Code and (new) comment CSVs of this part share a common root.
            csv_src_root = os.path.join(
                code_metadata_dir, part_name,
                "assets", "verilog", "code_and_comment_src", "csv_src",
            )
            csv_code_dir = os.path.join(csv_src_root, "csv_code_src")
            csv_comment_dir = os.path.join(csv_src_root, "csv_new_comment_src")
            codedb.load_and_split_code(
                skip_small_doc=True,
                split_by_line=True,
                based_on_code_lines_only=True,
                csv_code_dir=csv_code_dir,
                csv_comment_dir=csv_comment_dir,
            )

            # Example files are located relative to DATA4AIGCHIP_HOME.
            data_home = os.environ.get('DATA4AIGCHIP_HOME')
            example_json_dir = f"{data_home}/auto_data_gen_val/preprocess_data"
            example_description_file = f"{data_home}/verilog_eval/descriptions/VerilogDescription_Machine.jsonl"

            if detailed:
                # Detailed summary: detailed example set and description key.
                codedb.init_vectorstore(
                    global_summary_chain_from_verilog_eval=False,
                    global_summary_model=model,
                    global_summary_example_cstr_json=f"{example_json_dir}/example_code_strings_detailed_instructions.json",
                    global_summary_example_code_description_file=example_description_file,
                )
                codedb.supplement_summary(
                    block_summary_placeholding=True,
                    force_refresh_global_summary_detailed=True,
                    global_summary_example_desc_key="detail_description",
                )
                codedb.save_global_summary(f"{part_metadata_dir}/global_detailed_summary.json")
            else:
                # High-level summary: simple example set and description key.
                codedb.init_vectorstore(
                    global_summary_chain_from_verilog_eval=False,
                    detailed=False,
                    global_summary_model=model,
                    global_summary_example_cstr_json=f"{example_json_dir}/example_code_strings_simple_instructions.json",
                    global_summary_example_code_description_file=example_description_file,
                )
                codedb.supplement_summary(
                    block_summary_placeholding=True,
                    force_refresh_global_summary_high_level=True,
                    global_summary_example_desc_key="simple_description",
                )
                codedb.save_global_summary(f"{part_metadata_dir}/global_high_level_summary.json")
75 | 


--------------------------------------------------------------------------------
/verilog_eval/README.md:
--------------------------------------------------------------------------------
  1 | # VerilogEval: Evaluating Large Language Models for Verilog Code Generation 
  2 | 
  3 | This is an evaluation harness for the VerilogEval problem solving dataset
  4 | described in the paper "[VerilogEval: Evaluating Large
  5 | Language Models for Verilog Code Generation](https://arxiv.org/abs/2309.07544)".
  6 | 
  7 | This evaluation dataset consists of 156 problems from the Verilog 
  8 | instructional website [HDLBits](https://hdlbits.01xz.net/wiki/Problem_sets).
  9 | We provide two sets of problem descriptions: machine generated and manually
 10 | converted to text-only format.
 11 | 
 12 | ## Installation
 13 | 
 14 | We closely follow guidance from [HumanEval](https://github.com/openai/human-eval/tree/master).
 15 | 
 16 | Make sure to use python 3.7 or later:
 17 | ```
 18 | $ conda create -n codex python=3.7
 19 | $ conda activate codex
 20 | ```
 21 | 
 22 | Install [ICARUS Verilog](https://github.com/steveicarus/iverilog):
 23 | ```
 24 | $ git clone https://github.com/steveicarus/iverilog.git && cd iverilog \
 25 |         && git checkout 01441687235135d1c12eeef920f75d97995da333 \
 26 |         && sh ./autoconf.sh && ./configure && make -j4\
 27 |         && make install
 28 | ```
 29 | 
 30 | It is recommended to use the provided [Dockerfile](https://github.com/NVlabs/verilog-eval/blob/main/Dockerfile) 
 31 | which comes with the ICARUS Verilog simulator pre-installed. When using the Docker
 32 | container, you still need to complete the following step.
 33 | 
 34 | Check out and install this repository:
 35 | ```
 36 | $ git clone https://github.com/NVlabs/verilog-eval
 37 | $ pip install -e verilog-eval
 38 | ```
 39 | 
 40 | ## Usage
 41 | 
 42 | **This program makes system calls to *iverilog* and *vvp* to simulate 
 43 | untrusted model-generated code. Users are strongly
 44 | encouraged not to do so outside of a robust security sandbox. The [execution
 45 | call](https://github.com/NVlabs/verilog-eval/blob/main/verilog_eval/execution.py#L79-L112)
 46 | in `execution.py` is deliberately commented out to ensure users read this
 47 | disclaimer before running code in a potentially unsafe manner. See the comment in
 48 | `execution.py` for more information and instructions.**
 49 | 
 50 | After following the above instructions to enable execution, generate samples
 51 | and save them in the following JSON Lines (jsonl) format, where each sample is
 52 | formatted into a single line like so:
 53 | ```
 54 | {"task_id": "Corresponding VerilogEval task ID", "completion": "Completion only without the prompt"}
 55 | ```
 56 | We provide examples under `data/example` to illustrate the format and help with debugging.
 57 | 
 58 | To evaluate the samples, run
 59 | ```
 60 | $ evaluate_functional_correctness samples.jsonl --problem_file data/VerilogEval_Human.jsonl
 61 | Reading samples...
 62 | 3120it [00:00, 16077.44it/s]
 63 | Running test suites...
 64 | 100%|...| 3120/3120 [00:32<00:00, 97.47it/s]
 65 | Killing all hanging simulation process.
 66 | Writing results to samples.jsonl_results.jsonl...
 67 | 100%|...| 3120/3120 [00:00<00:00, 30608.13it/s]
 68 | {'pass@1': ..., 'pass@5': ..., 'pass@10': ...}
 69 | ```
 70 | 
 71 | The user must specify `--problem_file` input argument. We provide two sets of problem
 72 | evaluations `data/VerilogEval_Machine.jsonl` and `data/VerilogEval_Human.jsonl`. 
 73 | We also provide problem description files used to sample Verilog code completions 
 74 | in `descriptions` directory.
 75 | 
 76 | This script provides more fine-grained information in a new file ending in
 77 | `<input_path>_results.jsonl`. Each row now contains whether the completion
 78 | `passed` along with the execution `result` which is one of "passed", "timed
 79 | out", or "failed".
 80 | 
 81 | As a quick sanity-check, the example samples should yield 0.5 pass@1. The results can be
 82 | verified against the provided output 
 83 | in `data/example/ExampleSolution.jsonl_reference.jsonl`.
 84 | ```
 85 | $ evaluate_functional_correctness data/example/ExampleSolution.jsonl --problem_file=data/example/ExampleEval.jsonl
 86 | Reading samples...
 87 | 6it [00:00, 221.60it/s]
 88 | Running test suites...
 89 | 100%|...| 6/6 [00:00<00:00, 142.09it/s]
 90 | Killing all hanging simulation process.
 91 | Writing results to data/example/ExampleSolution.jsonl_results.jsonl...
 92 | 100%|...| 6/6 [00:00<00:00, 19941.22it/s]
 93 | {'pass@1': 0.5}
 94 | ```
 95 | 
 96 | Because there is no unbiased way of estimating pass@k when there are fewer
 97 | samples than k, the script does not evaluate pass@k for these cases. To
 98 | evaluate with other k values, pass `--k=<comma-separated-values-here>`. For
 99 | other options, see
100 | ```
101 | $ evaluate_functional_correctness --help
102 | ```
103 | However, we recommend that you use the default values for the rest.
104 | 
105 | ## Issues
106 | Problem descriptions in `descriptions/VerilogDescription_Machine.jsonl` are
107 | machine-generated, and we cannot guarantee the absence of ambiguity or errors. We do not plan
108 | to maintain description correctness.
109 | 
110 | Functional correctness is evaluated by comparing simulation outputs using 
111 | [ICARUS Verilog](https://github.com/steveicarus/iverilog). The evaluation of Verilog syntax is limited by the simulator, which might not support all features of the Verilog HDL 
112 | IEEE-1364 standard.
113 | 
114 | 
115 | ## Citation
116 | 
117 | Please cite using the following bibtex entry:
118 | 
119 | ```
120 | @inproceedings{liu2023verilogeval,
121 |   title={{VerilogEval:} Evaluating Large Language Models for Verilog Code Generation},
122 |   author={Liu, Mingjie and Pinckney, Nathaniel and Khailany, Brucek and Ren, Haoxing},
123 |   booktitle={2023 IEEE/ACM International Conference on Computer-Aided Design (ICCAD)}, 
124 |   year={2023}
125 | }
126 | ```
127 | 


--------------------------------------------------------------------------------
/auto_data_gen_val/line_by_line_comments_gen.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | from dotenv import load_dotenv
  4 | load_dotenv(os.path.join(os.path.dirname(__file__), ".env"))
  5 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR")))
  6 | import openai
  7 | import requests
  8 | import json
  9 | import copy
 10 | import time
 11 | import datetime
 12 | import shutil
 13 | from embedding_lookup_utils import *
 14 | from utils import *
 15 | from completion_handler import *
 16 | from code_preprocesser import *
 17 | from code_repo_documentor import *
 18 | 
 19 | #documenting the first version with module instantiation
 20 | #one_shot 5 lines
 21 | #pure llama 2 70B
 22 | #around 12k samples
 23 | 
 24 | if __name__ == "__main__":
 25 |     #NOTE: run utils.py first to partition the code first
 26 |     import argparse
 27 | 
 28 |     parser = argparse.ArgumentParser(description='Line-by-line Code Documentor')
 29 |     parser.add_argument('--total_parts', type=int, default=10, help='total parts')
 30 |     parser.add_argument('--output_dir', type=str, default="./documented_code", help='output directory')
 31 |     parser.add_argument('--src_code_dir', type=str, default="/home/user_name/DAC_2024/ckpt3_user_name_valid_content_renamed/", help='code directory')
 32 |     parser.add_argument('--code_metadata_dir', type=str, default="/home/user_name/DAC_2024/ckpt3_user_name_valid_content_code_metadata/", help='code metadata file')
 33 |     parser.add_argument('--code_lib_path', type=str, default="/home/user_name/DAC_2024/ckpt3_user_name_valid_content_shared_lib/", help='code library path')
 34 |     parser.add_argument('--code_vec_store', type=str, default="../code_vec_store/test_10_30/", help='code vector store')
 35 |     parser.add_argument('--skip_preprocess', action='store_true', help='skip preprocessing')
 36 |     parser.add_argument('--skip_supplement_summary', action='store_true', help='skip supplementing summary')
 37 |     parser.add_argument('--discard_original_comment', action='store_true', help='discard original comment')
 38 | 
 39 |     args = parser.parse_args()
 40 |     total_parts = args.total_parts
 41 |     output_dir = args.output_dir
 42 |     src_code_dir = args.src_code_dir
 43 |     code_metadata_dir = args.code_metadata_dir
 44 |     code_lib_path = args.code_lib_path
 45 |     code_vec_store = args.code_vec_store
 46 |     skip_preprocess = args.skip_preprocess
 47 |     skip_supplement_summary = args.skip_supplement_summary
 48 |     discard_original_comment = args.discard_original_comment
 49 | 
 50 |     for code_part in range(total_parts):
 51 |         code_dir = os.path.join(src_code_dir, "part{}".format(code_part))
 52 |         code_metadata_file = os.path.join(code_metadata_dir, "part{}".format(code_part), "codes.json")
 53 |         # code_lib_path =  "/home/user_name/DAC_2024/ckpt3_user_name_valid_content_shared_lib/"
 54 |         # code_vec_store = "../code_vec_store/test_10_30/"
 55 | 
 56 |         
 57 |         language = os.environ.get("TARGET_LANG")
 58 |         if os.environ.get("TARGET_LANG") == "verilog":
 59 |             code_suffix = [".v", ".sv", ".vh"]
 60 |         elif os.environ.get("TARGET_LANG") == "xilinx_hls":
 61 |             code_suffix = [".c", ".cpp", ".h", ".hpp"]
 62 |         store_src_code_dir = os.environ.get("STORE_SRC_CODE_DIR")
 63 |         csv_code_dir = os.environ.get("CSV_CODE_DIR")
 64 |         csv_comment_dir = os.environ.get("CSV_COMMENT_DIR")
 65 |         csv_new_comment_dir = os.environ.get("CSV_NEW_COMMENT_DIR")
 66 |         csv_pure_gen_comment_dir = os.environ.get("CSV_PURE_GEN_COMMENT_DIR")
 67 |         code_summary_dir = os.environ.get("CODE_SUMMARY_DIR")
 68 |         documented_code_dir = os.environ.get("DOCUMENTED_CODE_DIR")
 69 | 
 70 | 
 71 |         with get_openai_callback() as cb:
 72 |             #This switch will discard 1. the comments in the raw code copy and 2. the comments will be converted to the raw code csv 
 73 |             # discard_original_comment = True
 74 |             # skip_preprocess = True
 75 |             # skip_supplement_summary = True
 76 | 
 77 |             code_repo_documentor = CodeRepoDocumentor(code_dir, store_src_code_dir,
 78 |                                                         csv_code_dir, csv_comment_dir, csv_new_comment_dir, 
 79 |                                                         csv_pure_gen_comment_dir, code_summary_dir, documented_code_dir,
 80 |                                                         code_metadata_file=code_metadata_file,
 81 |                                                         code_suffix=code_suffix, language=language,
 82 |                                                         discard_original_comment=discard_original_comment,
 83 |                                                         code_lib_path=code_lib_path, code_vec_store=code_vec_store,
 84 |                                                         skip_supplement_summary=skip_supplement_summary,
 85 |                                                         cb = cb)
 86 |             code_repo_documentor.create_embedding()
 87 |             code_repo_documentor.code_preprocess(skip_preprocess=skip_preprocess)
 88 |             code_repo_documentor.document_repo()
 89 | 
 90 |             output_dir_part = os.path.join(output_dir, "part{}".format(code_part))
 91 |             #check if output dir exists
 92 |             if not os.path.exists(output_dir_part):
 93 |                 os.makedirs(output_dir_part)
 94 |             else:
 95 |                 #ask for confirmation
 96 |                 print("Output directory already exists. Do you want to overwrite? (y/n)")
 97 |                 choice = input().lower()
 98 |                 if choice == "y":
 99 |                     shutil.rmtree(output_dir_part)
100 |                     os.makedirs(output_dir_part)
101 |                 else:
102 |                     print("Exiting...")
103 |                     continue
104 |             code_repo_documentor.package_documented_code(output_dir_part)
105 |             #copy assets to output dir
106 |             shutil.copytree(os.environ.get("ASSET_DIR"), os.path.join(os.path.join(code_metadata_dir, "part{}".format(code_part)), "assets"))
107 |             #copy vector store to output dir
108 |             shutil.copytree(code_vec_store, os.path.join(os.path.join(code_metadata_dir, "part{}".format(code_part)), "code_vec_store"))
109 |             
110 | 
111 | 


--------------------------------------------------------------------------------
/verilog_eval/verilog_eval/execution.py:
--------------------------------------------------------------------------------
  1 | from typing import Optional, Callable, Dict
  2 | import ast
  3 | import contextlib
  4 | import faulthandler
  5 | import io
  6 | import os
  7 | import multiprocessing
  8 | import platform
  9 | import signal
 10 | import tempfile
 11 | 
 12 | import subprocess
 13 | import re
 14 | from threading import Timer
 15 | 
 16 | def clean_up_simulation() -> None:
 17 |     """
 18 |     kill all simulation process.
 19 |     """
 20 |     print("Killing all hanging simulation process.")
 21 |     subprocess.run("pkill iverilog", shell=True)
 22 |     subprocess.run("pkill vvp", shell=True)
 23 | 
 24 | def check_correctness(problem: Dict, completion: str, timeout: float,
 25 |                       completion_id: Optional[int] = None, unit_test_length: Optional[int] = None) -> Dict:
 26 |     """
 27 |     Evaluates the functional correctness of a completion by running the test
 28 |     suite provided in the problem. 
 29 |     :param completion_id: an optional completion ID so we can match
 30 |         the results later even if execution finishes asynchronously.
 31 |     """
 32 | 
 33 |     def unsafe_execute():
 34 | 
 35 |         with create_tempdir():
 36 | 
 37 |             # These system calls are needed when cleaning up tempdir.
 38 |             import os
 39 |             import shutil
 40 |             rmtree = shutil.rmtree
 41 |             rmdir = os.rmdir
 42 |             chdir = os.chdir
 43 | 
 44 |             # Disable functionalities that can make destructive changes to the test.
 45 | # WARNING
 46 | # subprocess.Popen is used to run shell command with calls to iveriog and vvp.
 47 | # Please refer to reliability_guard function for details
 48 |             reliability_guard()
 49 | 
 50 |             # Output testbench with solution to Verilog file in temp directory.
 51 |             verilog_test = problem["test"] + "\n" + \
 52 |                     problem["prompt"] + "\n" + \
 53 |                     completion
 54 | 
 55 |                 
 56 |             if unit_test_length:
 57 |                 keywords = re.findall("repeat\([0-9]*\)", verilog_test)
 58 |                 for words in keywords:
 59 |                     verilog_test = verilog_test.replace(words, "repeat({})".format(unit_test_length))
 60 |                     
 61 |             with open("{}.sv".format(problem["task_id"]), 'w') as f:
 62 |                 f.write(verilog_test)
 63 |             
 64 |             try:
 65 | # WARNING PLEASE READ
 66 | # The following code use subprocess.Popen to run shell command with calls to iveriog and vvp.
 67 | # Please check that iverilog and vvp are installed and included in your current run path.
 68 | # For installation of Icarus Verilog, please refer to: https://github.com/steveicarus/iverilog
 69 | # This program exists to execute untrusted model-generated code. Although
 70 | # it is highly unlikely that model-generated code will do something overtly
 71 | # malicious in response to this test suite, model-generated code may act
 72 | # destructively due to a lack of model capability or alignment.
 73 | # Users are strongly encouraged to sandbox this evaluation suite so that it 
 74 | # does not perform destructive actions on their host or network. For more 
 75 | # information on how OpenAI sandboxes its code, see the original OpenAI paper.
 76 | # Once you have read this disclaimer and taken appropriate precautions, 
 77 | # proceed at your own risk:
 78 | # BEGIN CODE BLOCK
 79 |                 with swallow_io():
 80 |                     with time_limit(timeout):
 81 |                         cmd = "iverilog -Wall -Winfloop -Wno-timescale -g2012 \
 82 |                                     -s tb -o test.vvp {}.sv; vvp -n test.vvp".format(problem["task_id"])
 83 |                        
 84 |                         """
 85 |                         adding timeout options for Popen. something breaks if not using timeout. seems to be working for now.
 86 |                         not really sure if its the best/correct way. let me know if anyone has a better solution.
 87 |                         https://stackoverflow.com/questions/1191374/using-module-subprocess-with-timeout
 88 |                         """
 89 |                         p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 90 |                         timer = Timer(timeout, p.kill)
 91 |                         try:
 92 |                             timer.start()
 93 |                             out, err = p.communicate()
 94 |                         finally:
 95 |                             timer.cancel()
 96 |                             
 97 |                         out, err = out.decode("utf-8"), err.decode("utf-8") 
 98 |                         match = re.search(r'Mismatches: ([0-9]*) in ([0-9]*) samples', out)
 99 |                         if "syntax error" in err:
100 |                             result.append("failed: syntax error.")
101 |                         elif len(err) > 0:
102 |                             result.append("failed: compile error.")
103 |                         elif match:
104 |                             cor, tot = [int(i) for i in match.groups()]
105 |                             if cor == 0:
106 |                                 result.append("passed")
107 |                             else:
108 |                                 result.append(f"failed: {cor} out of {tot} samples.")
109 |                         else:
110 |                             result.append("failed: info string not matched.")
111 | # END CODE BLOCK
112 |             except TimeoutException:
113 |                 result.append("timed out")
114 |             except BaseException as e:
115 |                 result.append(f"failed: {e}")
116 | 
117 |             # Needed for cleaning up.
118 |             shutil.rmtree = rmtree
119 |             os.rmdir = rmdir
120 |             os.chdir = chdir
121 |             
122 |     manager = multiprocessing.Manager()
123 |     result = manager.list()
124 | 
125 |     p = multiprocessing.Process(target=unsafe_execute)
126 |     p.start()
127 |     p.join(timeout=timeout + 1)
128 |     if p.is_alive():
129 |         p.kill()
130 | 
131 |     if not result:
132 |         result.append("timed out")
133 | 
134 |     return dict(
135 |         task_id=problem["task_id"],
136 |         passed=result[0] == "passed",
137 |         result=result[0],
138 |         completion_id=completion_id,
139 |     )
140 | 
141 | 
@contextlib.contextmanager
def time_limit(seconds: float):
    """
    Context manager that raises TimeoutException in the body once `seconds`
    of real time have elapsed.

    Implemented with SIGALRM / ITIMER_REAL. On exit the timer is disarmed and
    the SIGALRM handler that was active on entry is put back.

    :param seconds: real-time budget for the body, in seconds.
    :raises TimeoutException: inside the body when the timer expires.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    # Install the handler before arming the timer so an early expiry can
    # never be delivered to whatever handler was active before.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    signal.setitimer(signal.ITIMER_REAL, seconds)
    try:
        yield
    finally:
        signal.setitimer(signal.ITIMER_REAL, 0)
        # signal.signal() returns None when the previous handler was not
        # installed from Python; that value cannot be passed back in.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)
152 | 
153 | 
@contextlib.contextmanager
def swallow_io():
    """
    Redirect sys.stdout, sys.stderr and sys.stdin to one WriteOnlyStringIO
    for the duration of the body.
    """
    sink = WriteOnlyStringIO()
    with contextlib.redirect_stdout(sink), \
            contextlib.redirect_stderr(sink), \
            redirect_stdin(sink):
        yield
161 | 
162 | 
@contextlib.contextmanager
def create_tempdir():
    """
    Create a temporary directory, make it the working directory for the body
    and yield its path.
    """
    with tempfile.TemporaryDirectory() as scratch_dir, chdir(scratch_dir):
        yield scratch_dir
168 | 
169 | 
class TimeoutException(Exception):
    """ Raised by time_limit's SIGALRM handler when the time budget is exhausted. """
172 | 
173 | 
class WriteOnlyStringIO(io.StringIO):
    """ StringIO variant that accepts writes but raises IOError on any read. """

    def read(self, *_args, **_kwargs):
        # Reading is never permitted on this stream.
        raise IOError

    def readline(self, *_args, **_kwargs):
        # Reading is never permitted on this stream.
        raise IOError

    def readlines(self, *_args, **_kwargs):
        # Reading is never permitted on this stream.
        raise IOError

    def readable(self, *_args, **_kwargs):
        """ Always False: this stream cannot be read. """
        return False
189 | 
190 | 
class redirect_stdin(contextlib._RedirectStream):  # type: ignore
    """ Context manager that temporarily replaces sys.stdin with the given stream. """

    # Name of the `sys` attribute swapped by contextlib._RedirectStream.
    _stream = 'stdin'
193 | 
194 | 
@contextlib.contextmanager
def chdir(root):
    """
    Run the body with `root` as the working directory, then switch back to
    the directory that was current on entry (also when the body raises).

    Passing "." leaves the working directory untouched.
    """
    if root != ".":
        previous_dir = os.getcwd()
        os.chdir(root)
        try:
            yield
        finally:
            os.chdir(previous_dir)
    else:
        yield
208 | 
209 | 
def reliability_guard(maximum_memory_bytes: Optional[int] = None):
    """
    Updated Comment:
    We have enabled subprocess.Popen to allow shell command calls to verilog
    compiler and simulator. Please use at own risk.
    Original Comment:
    This disables various destructive functions and prevents the generated code
    from interfering with the test (e.g. fork bomb, killing other processes,
    removing filesystem files, etc.)
    WARNING
    This function is NOT a security sandbox. Untrusted code, including, model-
    generated code, should not be blindly executed outside of one. See the
    Codex paper for more information about OpenAI's code sandbox, and proceed
    with caution.

    The changes are process-wide: attributes of the `os`, `shutil` and
    `builtins` modules are overwritten with None and several entries of
    `sys.modules` are set to None so that importing them fails.

    :param maximum_memory_bytes: when given, used as both soft and hard limit
        for RLIMIT_AS and RLIMIT_DATA (and RLIMIT_STACK except on Darwin).
    """

    if maximum_memory_bytes is not None:
        import resource
        limits = (maximum_memory_bytes, maximum_memory_bytes)
        resource.setrlimit(resource.RLIMIT_AS, limits)
        resource.setrlimit(resource.RLIMIT_DATA, limits)
        if not platform.uname().system == 'Darwin':
            resource.setrlimit(resource.RLIMIT_STACK, limits)

    faulthandler.disable()

    import builtins
    builtins.exit = None
    builtins.quit = None

    import os
    # Must happen before os.putenv is disabled below.
    os.environ['OMP_NUM_THREADS'] = '1'

    # NOTE: os.unlink is intentionally absent from this list (it was
    # commented out in the original); presumably temp-dir cleanup needs it.
    for name in (
        'kill', 'system', 'putenv', 'remove', 'removedirs', 'rmdir',
        'fchdir', 'setuid', 'fork', 'forkpty', 'killpg', 'rename',
        'renames', 'truncate', 'replace', 'fchmod', 'fchown', 'chmod',
        'chown', 'chroot', 'lchflags', 'lchmod', 'lchown', 'getcwd',
        'chdir',
    ):
        setattr(os, name, None)

    import shutil
    for name in ('rmtree', 'move', 'chown'):
        setattr(shutil, name, None)

    # WARNING
    # subprocess.Popen is allowed and used to make shell command calls to verilog compiler and simulator.
    #import subprocess
    #subprocess.Popen = None  # type: ignore

    # Go through the builtins module instead of subscripting __builtins__:
    # __builtins__ is a dict in imported modules but a module in __main__,
    # where item assignment raises TypeError.
    builtins.help = None

    import sys
    for name in ('ipdb', 'joblib', 'resource', 'psutil', 'tkinter'):
        sys.modules[name] = None


--------------------------------------------------------------------------------
/verilog_eval/build/lib/verilog_eval/execution.py:
--------------------------------------------------------------------------------
  1 | from typing import Optional, Callable, Dict
  2 | import ast
  3 | import contextlib
  4 | import faulthandler
  5 | import io
  6 | import os
  7 | import multiprocessing
  8 | import platform
  9 | import signal
 10 | import tempfile
 11 | 
 12 | import subprocess
 13 | import re
 14 | from threading import Timer
 15 | 
 16 | def clean_up_simulation() -> None:
 17 |     """
 18 |     kill all simulation process.
 19 |     """
 20 |     print("Killing all hanging simulation process.")
 21 |     subprocess.run("pkill iverilog", shell=True)
 22 |     subprocess.run("pkill vvp", shell=True)
 23 | 
 24 | def check_correctness(problem: Dict, completion: str, timeout: float,
 25 |                       completion_id: Optional[int] = None, unit_test_length: Optional[int] = None) -> Dict:
 26 |     """
 27 |     Evaluates the functional correctness of a completion by running the test
 28 |     suite provided in the problem. 
 29 |     :param completion_id: an optional completion ID so we can match
 30 |         the results later even if execution finishes asynchronously.
 31 |     """
 32 | 
 33 |     def unsafe_execute():
 34 | 
 35 |         with create_tempdir():
 36 | 
 37 |             # These system calls are needed when cleaning up tempdir.
 38 |             import os
 39 |             import shutil
 40 |             rmtree = shutil.rmtree
 41 |             rmdir = os.rmdir
 42 |             chdir = os.chdir
 43 | 
 44 |             # Disable functionalities that can make destructive changes to the test.
 45 | # WARNING
 46 | # subprocess.Popen is used to run shell command with calls to iveriog and vvp.
 47 | # Please refer to reliability_guard function for details
 48 |             reliability_guard()
 49 | 
 50 |             # Output testbench with solution to Verilog file in temp directory.
 51 |             verilog_test = problem["test"] + "\n" + \
 52 |                     problem["prompt"] + "\n" + \
 53 |                     completion
 54 | 
 55 |                 
 56 |             if unit_test_length:
 57 |                 keywords = re.findall("repeat\([0-9]*\)", verilog_test)
 58 |                 for words in keywords:
 59 |                     verilog_test = verilog_test.replace(words, "repeat({})".format(unit_test_length))
 60 |                     
 61 |             with open("{}.sv".format(problem["task_id"]), 'w') as f:
 62 |                 f.write(verilog_test)
 63 |             
 64 |             try:
 65 | # WARNING PLEASE READ
 66 | # The following code use subprocess.Popen to run shell command with calls to iveriog and vvp.
 67 | # Please check that iverilog and vvp are installed and included in your current run path.
 68 | # For installation of Icarus Verilog, please refer to: https://github.com/steveicarus/iverilog
 69 | # This program exists to execute untrusted model-generated code. Although
 70 | # it is highly unlikely that model-generated code will do something overtly
 71 | # malicious in response to this test suite, model-generated code may act
 72 | # destructively due to a lack of model capability or alignment.
 73 | # Users are strongly encouraged to sandbox this evaluation suite so that it 
 74 | # does not perform destructive actions on their host or network. For more 
 75 | # information on how OpenAI sandboxes its code, see the original OpenAI paper.
 76 | # Once you have read this disclaimer and taken appropriate precautions, 
 77 | # proceed at your own risk:
 78 | # BEGIN CODE BLOCK
 79 |                 with swallow_io():
 80 |                     with time_limit(timeout):
 81 |                         cmd = "iverilog -Wall -Winfloop -Wno-timescale -g2012 \
 82 |                                     -s tb -o test.vvp {}.sv; vvp -n test.vvp".format(problem["task_id"])
 83 |                        
 84 |                         """
 85 |                         adding timeout options for Popen. something breaks if not using timeout. seems to be working for now.
 86 |                         not really sure if its the best/correct way. let me know if anyone has a better solution.
 87 |                         https://stackoverflow.com/questions/1191374/using-module-subprocess-with-timeout
 88 |                         """
 89 |                         p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 90 |                         timer = Timer(timeout, p.kill)
 91 |                         try:
 92 |                             timer.start()
 93 |                             out, err = p.communicate()
 94 |                         finally:
 95 |                             timer.cancel()
 96 |                             
 97 |                         out, err = out.decode("utf-8"), err.decode("utf-8") 
 98 |                         match = re.search(r'Mismatches: ([0-9]*) in ([0-9]*) samples', out)
 99 |                         if "syntax error" in err:
100 |                             result.append("failed: syntax error.")
101 |                         elif len(err) > 0:
102 |                             result.append("failed: compile error.")
103 |                         elif match:
104 |                             cor, tot = [int(i) for i in match.groups()]
105 |                             if cor == 0:
106 |                                 result.append("passed")
107 |                             else:
108 |                                 result.append(f"failed: {cor} out of {tot} samples.")
109 |                         else:
110 |                             result.append("failed: info string not matched.")
111 | # END CODE BLOCK
112 |             except TimeoutException:
113 |                 result.append("timed out")
114 |             except BaseException as e:
115 |                 result.append(f"failed: {e}")
116 | 
117 |             # Needed for cleaning up.
118 |             shutil.rmtree = rmtree
119 |             os.rmdir = rmdir
120 |             os.chdir = chdir
121 |             
122 |     manager = multiprocessing.Manager()
123 |     result = manager.list()
124 | 
125 |     p = multiprocessing.Process(target=unsafe_execute)
126 |     p.start()
127 |     p.join(timeout=timeout + 1)
128 |     if p.is_alive():
129 |         p.kill()
130 | 
131 |     if not result:
132 |         result.append("timed out")
133 | 
134 |     return dict(
135 |         task_id=problem["task_id"],
136 |         passed=result[0] == "passed",
137 |         result=result[0],
138 |         completion_id=completion_id,
139 |     )
140 | 
141 | 
@contextlib.contextmanager
def time_limit(seconds: float):
    """
    Context manager that raises TimeoutException in the body once `seconds`
    of real time have elapsed.

    Implemented with SIGALRM / ITIMER_REAL. On exit the timer is disarmed and
    the SIGALRM handler that was active on entry is put back.

    :param seconds: real-time budget for the body, in seconds.
    :raises TimeoutException: inside the body when the timer expires.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    # Install the handler before arming the timer so an early expiry can
    # never be delivered to whatever handler was active before.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    signal.setitimer(signal.ITIMER_REAL, seconds)
    try:
        yield
    finally:
        signal.setitimer(signal.ITIMER_REAL, 0)
        # signal.signal() returns None when the previous handler was not
        # installed from Python; that value cannot be passed back in.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)
152 | 
153 | 
@contextlib.contextmanager
def swallow_io():
    """
    Redirect sys.stdout, sys.stderr and sys.stdin to one WriteOnlyStringIO
    for the duration of the body.
    """
    sink = WriteOnlyStringIO()
    with contextlib.redirect_stdout(sink), \
            contextlib.redirect_stderr(sink), \
            redirect_stdin(sink):
        yield
161 | 
162 | 
@contextlib.contextmanager
def create_tempdir():
    """
    Create a temporary directory, make it the working directory for the body
    and yield its path.
    """
    with tempfile.TemporaryDirectory() as scratch_dir, chdir(scratch_dir):
        yield scratch_dir
168 | 
169 | 
class TimeoutException(Exception):
    """ Raised by time_limit's SIGALRM handler when the time budget is exhausted. """
172 | 
173 | 
class WriteOnlyStringIO(io.StringIO):
    """ StringIO variant that accepts writes but raises IOError on any read. """

    def read(self, *_args, **_kwargs):
        # Reading is never permitted on this stream.
        raise IOError

    def readline(self, *_args, **_kwargs):
        # Reading is never permitted on this stream.
        raise IOError

    def readlines(self, *_args, **_kwargs):
        # Reading is never permitted on this stream.
        raise IOError

    def readable(self, *_args, **_kwargs):
        """ Always False: this stream cannot be read. """
        return False
189 | 
190 | 
class redirect_stdin(contextlib._RedirectStream):  # type: ignore
    """ Context manager that temporarily replaces sys.stdin with the given stream. """

    # Name of the `sys` attribute swapped by contextlib._RedirectStream.
    _stream = 'stdin'
193 | 
194 | 
@contextlib.contextmanager
def chdir(root):
    """
    Run the body with `root` as the working directory, then switch back to
    the directory that was current on entry (also when the body raises).

    Passing "." leaves the working directory untouched.
    """
    if root != ".":
        previous_dir = os.getcwd()
        os.chdir(root)
        try:
            yield
        finally:
            os.chdir(previous_dir)
    else:
        yield
208 | 
209 | 
def reliability_guard(maximum_memory_bytes: Optional[int] = None):
    """
    Updated Comment:
    We have enabled subprocess.Popen to allow shell command calls to verilog
    compiler and simulator. Please use at own risk.
    Original Comment:
    This disables various destructive functions and prevents the generated code
    from interfering with the test (e.g. fork bomb, killing other processes,
    removing filesystem files, etc.)
    WARNING
    This function is NOT a security sandbox. Untrusted code, including, model-
    generated code, should not be blindly executed outside of one. See the
    Codex paper for more information about OpenAI's code sandbox, and proceed
    with caution.

    The changes are process-wide: attributes of the `os`, `shutil` and
    `builtins` modules are overwritten with None and several entries of
    `sys.modules` are set to None so that importing them fails.

    :param maximum_memory_bytes: when given, used as both soft and hard limit
        for RLIMIT_AS and RLIMIT_DATA (and RLIMIT_STACK except on Darwin).
    """

    if maximum_memory_bytes is not None:
        import resource
        limits = (maximum_memory_bytes, maximum_memory_bytes)
        resource.setrlimit(resource.RLIMIT_AS, limits)
        resource.setrlimit(resource.RLIMIT_DATA, limits)
        if not platform.uname().system == 'Darwin':
            resource.setrlimit(resource.RLIMIT_STACK, limits)

    faulthandler.disable()

    import builtins
    builtins.exit = None
    builtins.quit = None

    import os
    # Must happen before os.putenv is disabled below.
    os.environ['OMP_NUM_THREADS'] = '1'

    # NOTE: os.unlink is intentionally absent from this list (it was
    # commented out in the original); presumably temp-dir cleanup needs it.
    for name in (
        'kill', 'system', 'putenv', 'remove', 'removedirs', 'rmdir',
        'fchdir', 'setuid', 'fork', 'forkpty', 'killpg', 'rename',
        'renames', 'truncate', 'replace', 'fchmod', 'fchown', 'chmod',
        'chown', 'chroot', 'lchflags', 'lchmod', 'lchown', 'getcwd',
        'chdir',
    ):
        setattr(os, name, None)

    import shutil
    for name in ('rmtree', 'move', 'chown'):
        setattr(shutil, name, None)

    # WARNING
    # subprocess.Popen is allowed and used to make shell command calls to verilog compiler and simulator.
    #import subprocess
    #subprocess.Popen = None  # type: ignore

    # Go through the builtins module instead of subscripting __builtins__:
    # __builtins__ is a dict in imported modules but a module in __main__,
    # where item assignment raises TypeError.
    builtins.help = None

    import sys
    for name in ('ipdb', 'joblib', 'resource', 'psutil', 'tkinter'):
        sys.modules[name] = None


--------------------------------------------------------------------------------
/verilog_eval/data/example/ExampleEval.jsonl:
--------------------------------------------------------------------------------
1 | {"task_id": "gatesv", "prompt": "module top_module (\n\tinput [3:0] in,\n\toutput [2:0] out_both,\n\toutput [3:1] out_any,\n\toutput [3:0] out_different\n);\n", "canonical_solution": "\n\tassign out_both = in[2:0] & in[3:1];\n\tassign out_any = in[2:0] | in[3:1];\n\tassign out_different = in^{in[0], in[3:1]};\n\t\nendmodule\n", "test": "`timescale 1 ps/1 ps\n`define OK 12\n`define INCORRECT 13\nmodule reference_module (\n\tinput [3:0] in,\n\toutput [2:0] out_both,\n\toutput [3:1] out_any,\n\toutput [3:0] out_different\n);\n\n\tassign out_both = in[2:0] & in[3:1];\n\tassign out_any = in[2:0] | in[3:1];\n\tassign out_different = in^{in[0], in[3:1]};\n\t\nendmodule\n\n\nmodule stimulus_gen (\n\tinput clk,\n\tinput tb_match,\n\toutput logic [3:0] in,\n\toutput reg[511:0] wavedrom_title,\n\toutput reg wavedrom_enable\t\n);\n\n\n// Add two ports to module stimulus_gen:\n//    output [511:0] wavedrom_title\n//    output reg wavedrom_enable\n\n\ttask wavedrom_start(input[511:0] title = \"\");\n\tendtask\n\t\n\ttask wavedrom_stop;\n\t\t#1;\n\tendtask\t\n\n\n\n\tinitial begin\n\t\tin <= 4'h3;\n\t\t@(negedge clk);\n\t\twavedrom_start();\n\t\t\t@(posedge clk) in <= 3;\n\t\t\t@(posedge clk) in <= 6;\n\t\t\t@(posedge clk) in <= 12;\n\t\t\t@(posedge clk) in <= 9;\n\t\t\t@(posedge clk) in <= 5;\n\t\t@(negedge clk);\n\t\twavedrom_stop();\n\t\tin <= $random;\n\t\trepeat(100) begin\n\t\t\t@(negedge clk) in <= $random;\n\t\t\t@(posedge clk) in <= $random;\n\t\tend\n\t\t#1 $finish;\n\tend\n\t\t\nendmodule\n\nmodule tb();\n\n\ttypedef struct packed {\n\t\tint errors;\n\t\tint errortime;\n\t\tint errors_out_both;\n\t\tint errortime_out_both;\n\t\tint errors_out_any;\n\t\tint errortime_out_any;\n\t\tint errors_out_different;\n\t\tint errortime_out_different;\n\n\t\tint clocks;\n\t} stats;\n\t\n\tstats stats1;\n\t\n\t\n\twire[511:0] wavedrom_title;\n\twire wavedrom_enable;\n\tint wavedrom_hide_after_time;\n\t\n\treg clk=0;\n\tinitial forever\n\t\t#5 clk = ~clk;\n\n\tlogic [3:0] 
in;\n\tlogic [2:0] out_both_ref;\n\tlogic [2:0] out_both_dut;\n\tlogic [3:1] out_any_ref;\n\tlogic [3:1] out_any_dut;\n\tlogic [3:0] out_different_ref;\n\tlogic [3:0] out_different_dut;\n\n\tinitial begin \n\t\t$dumpfile(\"wave.vcd\");\n\t\t$dumpvars(1, stim1.clk, tb_mismatch ,in,out_both_ref,out_both_dut,out_any_ref,out_any_dut,out_different_ref,out_different_dut );\n\tend\n\n\n\twire tb_match;\t\t// Verification\n\twire tb_mismatch = ~tb_match;\n\t\n\tstimulus_gen stim1 (\n\t\t.clk,\n\t\t.* ,\n\t\t.in );\n\treference_module good1 (\n\t\t.in,\n\t\t.out_both(out_both_ref),\n\t\t.out_any(out_any_ref),\n\t\t.out_different(out_different_ref) );\n\t\t\n\ttop_module top_module1 (\n\t\t.in,\n\t\t.out_both(out_both_dut),\n\t\t.out_any(out_any_dut),\n\t\t.out_different(out_different_dut) );\n\n\t\n\tbit strobe = 0;\n\ttask wait_for_end_of_timestep;\n\t\trepeat(5) begin\n\t\t\tstrobe <= !strobe;  // Try to delay until the very end of the time step.\n\t\t\t@(strobe);\n\t\tend\n\tendtask\t\n\n\t\n\tfinal begin\n\t\tif (stats1.errors_out_both) $display(\"Hint: Output '%s' has %0d mismatches. First mismatch occurred at time %0d.\", \"out_both\", stats1.errors_out_both, stats1.errortime_out_both);\n\t\telse $display(\"Hint: Output '%s' has no mismatches.\", \"out_both\");\n\t\tif (stats1.errors_out_any) $display(\"Hint: Output '%s' has %0d mismatches. First mismatch occurred at time %0d.\", \"out_any\", stats1.errors_out_any, stats1.errortime_out_any);\n\t\telse $display(\"Hint: Output '%s' has no mismatches.\", \"out_any\");\n\t\tif (stats1.errors_out_different) $display(\"Hint: Output '%s' has %0d mismatches. 
First mismatch occurred at time %0d.\", \"out_different\", stats1.errors_out_different, stats1.errortime_out_different);\n\t\telse $display(\"Hint: Output '%s' has no mismatches.\", \"out_different\");\n\n\t\t$display(\"Hint: Total mismatched samples is %1d out of %1d samples\\n\", stats1.errors, stats1.clocks);\n\t\t$display(\"Simulation finished at %0d ps\", $time);\n\t\t$display(\"Mismatches: %1d in %1d samples\", stats1.errors, stats1.clocks);\n\tend\n\t\n\t// Verification: XORs on the right makes any X in good_vector match anything, but X in dut_vector will only match X.\n\tassign tb_match = ( { out_both_ref, out_any_ref, out_different_ref } === ( { out_both_ref, out_any_ref, out_different_ref } ^ { out_both_dut, out_any_dut, out_different_dut } ^ { out_both_ref, out_any_ref, out_different_ref } ) );\n\t// Use explicit sensitivity list here. @(*) causes NetProc::nex_input() to be called when trying to compute\n\t// the sensitivity list of the @(strobe) process, which isn't implemented.\n\talways @(posedge clk, negedge clk) begin\n\n\t\tstats1.clocks++;\n\t\tif (!tb_match) begin\n\t\t\tif (stats1.errors == 0) stats1.errortime = $time;\n\t\t\tstats1.errors++;\n\t\tend\n\t\tif (out_both_ref !== ( out_both_ref ^ out_both_dut ^ out_both_ref ))\n\t\tbegin if (stats1.errors_out_both == 0) stats1.errortime_out_both = $time;\n\t\t\tstats1.errors_out_both = stats1.errors_out_both+1'b1; end\n\t\tif (out_any_ref !== ( out_any_ref ^ out_any_dut ^ out_any_ref ))\n\t\tbegin if (stats1.errors_out_any == 0) stats1.errortime_out_any = $time;\n\t\t\tstats1.errors_out_any = stats1.errors_out_any+1'b1; end\n\t\tif (out_different_ref !== ( out_different_ref ^ out_different_dut ^ out_different_ref ))\n\t\tbegin if (stats1.errors_out_different == 0) stats1.errortime_out_different = $time;\n\t\t\tstats1.errors_out_different = stats1.errors_out_different+1'b1; end\n\n\tend\nendmodule\n"}
2 | {"task_id": "vector4", "prompt": "module top_module (\n\tinput [7:0] in,\n\toutput [31:0] out\n);\n", "canonical_solution": "\n\tassign out = { {24{in[7]}}, in };\n\t\nendmodule\n", "test": "`timescale 1 ps/1 ps\n`define OK 12\n`define INCORRECT 13\nmodule reference_module (\n\tinput [7:0] in,\n\toutput [31:0] out\n);\n\n\tassign out = { {24{in[7]}}, in };\n\t\nendmodule\n\n\nmodule stimulus_gen (\n\tinput clk,\n\toutput logic [7:0] in\n);\n\n\tinitial begin\n\t\trepeat(100) @(posedge clk, negedge clk)\n\t\t\tin <= $random;\n\t\t$finish;\n\tend\n\t\nendmodule\n\nmodule tb();\n\n\ttypedef struct packed {\n\t\tint errors;\n\t\tint errortime;\n\t\tint errors_out;\n\t\tint errortime_out;\n\n\t\tint clocks;\n\t} stats;\n\t\n\tstats stats1;\n\t\n\t\n\twire[511:0] wavedrom_title;\n\twire wavedrom_enable;\n\tint wavedrom_hide_after_time;\n\t\n\treg clk=0;\n\tinitial forever\n\t\t#5 clk = ~clk;\n\n\tlogic [7:0] in;\n\tlogic [31:0] out_ref;\n\tlogic [31:0] out_dut;\n\n\tinitial begin \n\t\t$dumpfile(\"wave.vcd\");\n\t\t$dumpvars(1, stim1.clk, tb_mismatch ,in,out_ref,out_dut );\n\tend\n\n\n\twire tb_match;\t\t// Verification\n\twire tb_mismatch = ~tb_match;\n\t\n\tstimulus_gen stim1 (\n\t\t.clk,\n\t\t.* ,\n\t\t.in );\n\treference_module good1 (\n\t\t.in,\n\t\t.out(out_ref) );\n\t\t\n\ttop_module top_module1 (\n\t\t.in,\n\t\t.out(out_dut) );\n\n\t\n\tbit strobe = 0;\n\ttask wait_for_end_of_timestep;\n\t\trepeat(5) begin\n\t\t\tstrobe <= !strobe;  // Try to delay until the very end of the time step.\n\t\t\t@(strobe);\n\t\tend\n\tendtask\t\n\n\t\n\tfinal begin\n\t\tif (stats1.errors_out) $display(\"Hint: Output '%s' has %0d mismatches. 
First mismatch occurred at time %0d.\", \"out\", stats1.errors_out, stats1.errortime_out);\n\t\telse $display(\"Hint: Output '%s' has no mismatches.\", \"out\");\n\n\t\t$display(\"Hint: Total mismatched samples is %1d out of %1d samples\\n\", stats1.errors, stats1.clocks);\n\t\t$display(\"Simulation finished at %0d ps\", $time);\n\t\t$display(\"Mismatches: %1d in %1d samples\", stats1.errors, stats1.clocks);\n\tend\n\t\n\t// Verification: XORs on the right makes any X in good_vector match anything, but X in dut_vector will only match X.\n\tassign tb_match = ( { out_ref } === ( { out_ref } ^ { out_dut } ^ { out_ref } ) );\n\t// Use explicit sensitivity list here. @(*) causes NetProc::nex_input() to be called when trying to compute\n\t// the sensitivity list of the @(strobe) process, which isn't implemented.\n\talways @(posedge clk, negedge clk) begin\n\n\t\tstats1.clocks++;\n\t\tif (!tb_match) begin\n\t\t\tif (stats1.errors == 0) stats1.errortime = $time;\n\t\t\tstats1.errors++;\n\t\tend\n\t\tif (out_ref !== ( out_ref ^ out_dut ^ out_ref ))\n\t\tbegin if (stats1.errors_out == 0) stats1.errortime_out = $time;\n\t\t\tstats1.errors_out = stats1.errors_out+1'b1; end\n\n\tend\nendmodule\n"}
3 | {"task_id": "zero", "prompt": "module top_module(\n\toutput zero);\n", "canonical_solution": "\t\n\tassign zero = 1'b0;\n\t\nendmodule\n", "test": "`timescale 1 ps/1 ps\n`define OK 12\n`define INCORRECT 13\nmodule reference_module(\n\toutput zero);\n\t\n\tassign zero = 1'b0;\n\t\nendmodule\n\n\nmodule stimulus_gen (\n\tinput clk,\n\toutput reg[511:0] wavedrom_title,\n\toutput reg wavedrom_enable\n);\n\n\n// Add two ports to module stimulus_gen:\n//    output [511:0] wavedrom_title\n//    output reg wavedrom_enable\n\n\ttask wavedrom_start(input[511:0] title = \"\");\n\tendtask\n\t\n\ttask wavedrom_stop;\n\t\t#1;\n\tendtask\t\n\n\n\n\tinitial begin\n\t\twavedrom_start(\"Output should 0\");\n\t\trepeat(20) @(posedge clk, negedge clk);\n\t\twavedrom_stop();\n\t\t\n\t\t#1 $finish;\n\tend\n\t\nendmodule\n\nmodule tb();\n\n\ttypedef struct packed {\n\t\tint errors;\n\t\tint errortime;\n\t\tint errors_zero;\n\t\tint errortime_zero;\n\n\t\tint clocks;\n\t} stats;\n\t\n\tstats stats1;\n\t\n\t\n\twire[511:0] wavedrom_title;\n\twire wavedrom_enable;\n\tint wavedrom_hide_after_time;\n\t\n\treg clk=0;\n\tinitial forever\n\t\t#5 clk = ~clk;\n\n\tlogic zero_ref;\n\tlogic zero_dut;\n\n\tinitial begin \n\t\t$dumpfile(\"wave.vcd\");\n\t\t$dumpvars(1, stim1.clk, tb_mismatch ,zero_ref,zero_dut );\n\tend\n\n\n\twire tb_match;\t\t// Verification\n\twire tb_mismatch = ~tb_match;\n\t\n\tstimulus_gen stim1 (\n\t\t.clk,\n\t\t.*  );\n\treference_module good1 (\n\t\t.zero(zero_ref) );\n\t\t\n\ttop_module top_module1 (\n\t\t.zero(zero_dut) );\n\n\t\n\tbit strobe = 0;\n\ttask wait_for_end_of_timestep;\n\t\trepeat(5) begin\n\t\t\tstrobe <= !strobe;  // Try to delay until the very end of the time step.\n\t\t\t@(strobe);\n\t\tend\n\tendtask\t\n\n\t\n\tfinal begin\n\t\tif (stats1.errors_zero) $display(\"Hint: Output '%s' has %0d mismatches. 
First mismatch occurred at time %0d.\", \"zero\", stats1.errors_zero, stats1.errortime_zero);\n\t\telse $display(\"Hint: Output '%s' has no mismatches.\", \"zero\");\n\n\t\t$display(\"Hint: Total mismatched samples is %1d out of %1d samples\\n\", stats1.errors, stats1.clocks);\n\t\t$display(\"Simulation finished at %0d ps\", $time);\n\t\t$display(\"Mismatches: %1d in %1d samples\", stats1.errors, stats1.clocks);\n\tend\n\t\n\t// Verification: XORs on the right makes any X in good_vector match anything, but X in dut_vector will only match X.\n\tassign tb_match = ( { zero_ref } === ( { zero_ref } ^ { zero_dut } ^ { zero_ref } ) );\n\t// Use explicit sensitivity list here. @(*) causes NetProc::nex_input() to be called when trying to compute\n\t// the sensitivity list of the @(strobe) process, which isn't implemented.\n\talways @(posedge clk, negedge clk) begin\n\n\t\tstats1.clocks++;\n\t\tif (!tb_match) begin\n\t\t\tif (stats1.errors == 0) stats1.errortime = $time;\n\t\t\tstats1.errors++;\n\t\tend\n\t\tif (zero_ref !== ( zero_ref ^ zero_dut ^ zero_ref ))\n\t\tbegin if (stats1.errors_zero == 0) stats1.errortime_zero = $time;\n\t\t\tstats1.errors_zero = stats1.errors_zero+1'b1; end\n\n\tend\nendmodule\n"}


--------------------------------------------------------------------------------
/auto_data_gen_val/code_repo_documentor.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | from dotenv import load_dotenv
  4 | load_dotenv()
  5 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SRC_DIR")))
  6 | import openai
  7 | import requests
  8 | import json
  9 | import copy
 10 | import time
 11 | import datetime
 12 | import shutil
 13 | from embedding_lookup_utils import *
 14 | from utils import *
 15 | from completion_handler import *
 16 | from code_preprocesser import *
 17 | 
 18 | 
 19 | class CodeRepoDocumentor:
 20 |     def __init__(self, code_dir, store_src_code_dir, 
 21 |                  csv_code_dir, csv_comment_dir, csv_new_comment_dir, 
 22 |                  csv_pure_gen_comment_dir, code_summary_dir, documented_code_dir,
 23 |                  code_metadata_file = None,
 24 |                  code_suffix =[".v", ".sv", ".vh"], language="verilog",
 25 |                  discard_original_comment = False,
 26 |                  code_lib_path= "./lib", code_vec_store = "./vector_store/",
 27 |                  skip_rag_db = False, skip_supplement_summary = False,
 28 |                  cb = None):
 29 | 
 30 |         #raw code preprocessing
 31 |         self.code_dir = code_dir
 32 |         self.code_suffix = code_suffix
 33 |         self.language = language
 34 |         self.store_src_code_dir = store_src_code_dir
 35 |         self.csv_code_dir = csv_code_dir
 36 |         self.csv_comment_dir = csv_comment_dir
 37 |         self.csv_new_comment_dir = csv_new_comment_dir
 38 |         self.csv_pure_gen_comment_dir = csv_pure_gen_comment_dir
 39 |         self.code_summary_dir = code_summary_dir
 40 |         self.documented_code_dir = documented_code_dir
 41 |         self.discard_original_comment = discard_original_comment
 42 |         self.code_vec_store = code_vec_store
 43 |         self.code_Lib_path = code_lib_path
 44 |         self.cb = cb
 45 |         self.skip_rag_db = skip_rag_db
 46 |         self.skip_supplement_summary = skip_supplement_summary
 47 |         if code_metadata_file is not None:
 48 |             self.code_metadata_file = code_metadata_file
 49 |             self.code_metadata = json.load(open(self.code_metadata_file, "r"))
 50 | 
 51 |         self.code_preprocesser = CodePreprocesser(code_dir, store_src_code_dir, 
 52 |                                             csv_code_dir, csv_comment_dir, 
 53 |                                             csv_new_comment_dir, csv_pure_gen_comment_dir, 
 54 |                                             code_summary_dir, documented_code_dir,
 55 |                                             code_suffix=code_suffix, discard_original_comment=discard_original_comment)
 56 |         self.documented_list = []
 57 |         self.documented_list_file = os.path.join(os.environ.get("ASSET_DIR"), os.environ.get("TARGET_LANG"), "documented_list.txt")
 58 |         if os.path.exists(self.documented_list_file):
 59 |             #ask if the user wants to remove the documented list
 60 |             print("Do you want to remove the documented list? (y/n)")
 61 |             answer = input()
 62 |             if answer == "y":
 63 |                 os.remove(self.documented_list_file)
 64 |                 print("Documented list removed")
 65 |             else:
 66 |                 with open(self.documented_list_file, "r") as f:
 67 |                     self.documented_list = f.readlines()
 68 |                 self.documented_list = [x.strip() for x in self.documented_list]
 69 |         
 70 |         
 71 |         #context embedding
 72 |         self.embedding_fields = ["Filename", "File type", "Summary", "Text", "Line_id"]
 73 |         self.system_embedder = EmbedTool0(self.embedding_fields, 
 74 |                                     os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SYSTEM_CONTEXT_DIR")),
 75 |                                     os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SYSTEM_CONTEXT_EMBEDDING_DIR")),
 76 |                                     "system_context_embedding.csv")
 77 | 
 78 |         #code documentor
 79 |         self.documentor = Chatbot(os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SYSTEM_CONTEXT_DIR"), 
 80 |                             "context.fixed_features.txt"),
 81 |                             os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("CONVERSE_DIR")),
 82 |                             os.path.join(os.path.dirname(os.path.abspath(__file__)),os.environ.get("CHATBOT_BACKEND_DIR"),os.environ.get("SYSTEM_CONTEXT_DIR"), 
 83 |                             "context.converse_samples.txt"),
 84 |                             code_suffix=self.code_suffix,
 85 |                             language=self.language,
 86 |                             code_lib_path=self.code_Lib_path,
 87 |                             code_vec_store=self.code_vec_store,
 88 |                             skip_supplement_summary = self.skip_supplement_summary,
 89 |                             cb = self.cb
 90 |                             )
 91 | 
 92 |     
 93 |     def create_embedding(self):
 94 |         self.system_embedder.create_raw_system_context()
 95 |         self.system_embedder.create_embedding()
 96 |         self.system_embedder.load_embedding()
 97 |         if not self.skip_rag_db:
 98 |             self.documentor.init_code_retrival()
 99 | 
100 |     def code_preprocess(self, skip_preprocess=False):
101 |         self.code_preprocesser.raw_code_copy(self.code_dir, self.store_src_code_dir, skip_preprocess=skip_preprocess)
102 |         if not skip_preprocess:
103 |             self.code_preprocesser.pre_process_routines(self.store_src_code_dir, 
104 |                                                         discard_original_comment=self.discard_original_comment, 
105 |                                                         rtl=(".v" in self.code_suffix or ".sv" in self.code_suffix or ".vh" in self.code_suffix))
106 |             self.code_preprocesser.create_code_assets()
107 |         else:
108 |             pass
109 | 
110 |     def document_repo(self):
111 |         self.documentor.system_context_embedding = self.system_embedder.df_embed
112 |         for code_src in self.code_preprocesser.code_files:
113 |             start_time = time.time()
114 |             if code_src not in self.documented_list:
115 |                 print("Documenting {}".format(code_src))
116 |                 #clear the memory of the documentor 
117 |                 self.documentor.line_by_line_comment_converse_chain.memory.clear()
118 |                 self.documentor.line_by_line_comment_converse_chain.memory_buffer = []
119 |                 csv_code_file = os.path.join(self.code_preprocesser.csv_code_dir, code_src.split(".")[0] + ".csv")
120 |                 csv_comment_file = os.path.join(self.code_preprocesser.csv_comment_dir, code_src.split(".")[0] + ".csv")
121 |                 csv_new_comment_file = os.path.join(self.code_preprocesser.csv_new_comment_dir, code_src.split(".")[0] + ".csv")
122 |                 csv_pure_gen_comment_file = os.path.join(self.code_preprocesser.csv_pure_gen_comment_dir, code_src.split(".")[0] + ".csv")
123 |                 code_summary_file = os.path.join(self.code_preprocesser.code_summary_dir, code_src.split(".")[0] + ".txt")
124 | 
125 |                 #check the # of lines of code
126 |                 with open(os.path.join(self.code_preprocesser.store_src_code_dir, code_src), "r") as f:
127 |                     lines = f.readlines()
128 |                 if len(lines) > 200:
129 |                     print("Skip {} because it has too many lines of code".format(code_src))
130 |                     continue
131 | 
132 |                 dependent_funcs = self.code_metadata[code_src.split(".")[0]]["module_inst_list"]
133 |                 self.documentor.comment_a_code_file(csv_code_file, csv_comment_file, csv_new_comment_file, csv_pure_gen_comment_file, dependent_funcs=dependent_funcs)
134 |                 
135 |                 new_code_string = merge_code_and_comment(csv_code_file, csv_new_comment_file)
136 |                 with open(os.path.join(self.code_preprocesser.documented_code_dir, code_src), "w") as f:
137 |                     f.write(new_code_string)
138 | 
139 |                 self.documentor.summarize_code_blocks(csv_code_file, csv_new_comment_file, code_summary_file)
140 |                 # bot.reverse_code_gen(csv_pure_gen_comment_file, code_summary_file)
141 | 
142 |                 self.documented_list.append(code_src)
143 |                 with open(self.documented_list_file, "w") as f:
144 |                     f.write("\n".join(self.documented_list))
145 |             end_time = time.time()
146 |             print("Time left to finish this repo: {}".format((end_time - start_time) * (len(self.code_preprocesser.code_files) - self.code_preprocesser.code_files.index(code_src))))
147 |     def package_documented_code(self, package_dir):
148 |         #create the package dir
149 |         if not os.path.exists(package_dir):
150 |             os.makedirs(package_dir)
151 |         for code_src in self.documented_list:
152 |             #create a subdirectory for each of the documented code
153 |             code_src = code_src.strip()
154 |             code_src_dir = os.path.join(package_dir, code_src.split(".")[0])
155 |             if not os.path.exists(code_src_dir):
156 |                 os.makedirs(code_src_dir)
157 |             shutil.copy(os.path.join(self.code_preprocesser.documented_code_dir, code_src), os.path.join(package_dir, code_src.split(".")[0], code_src))
158 |             shutil.copy(os.path.join(self.code_preprocesser.code_summary_dir, code_src.split(".")[0] + ".txt"), os.path.join(package_dir, code_src.split(".")[0], code_src.split(".")[0] + ".txt"))
159 |     #TODO: add a function to convert the documented code to original raw code
160 | 
if __name__ == "__main__":
    #NOTE: run utils.py first to partition the code first
    code_dir = "./test_repo/"
    code_lib_path =  "./test_repo/"
    code_vec_store = "../code_vec_store/DNNBuilder/"
    language = os.environ.get("TARGET_LANG")

    # File suffixes handled for each supported TARGET_LANG value.
    suffixes_by_language = {
        "verilog": [".v", ".sv", ".vh"],
        "xilinx_hls": [".c", ".cpp", ".h", ".hpp"],
    }
    if language not in suffixes_by_language:
        # Fail early with a clear message instead of a NameError on code_suffix later.
        raise ValueError(
            "Unsupported TARGET_LANG {!r}; expected one of {}".format(language, sorted(suffixes_by_language))
        )
    code_suffix = suffixes_by_language[language]

    store_src_code_dir = os.environ.get("STORE_SRC_CODE_DIR")
    csv_code_dir = os.environ.get("CSV_CODE_DIR")
    csv_comment_dir = os.environ.get("CSV_COMMENT_DIR")
    csv_new_comment_dir = os.environ.get("CSV_NEW_COMMENT_DIR")
    csv_pure_gen_comment_dir = os.environ.get("CSV_PURE_GEN_COMMENT_DIR")
    code_summary_dir = os.environ.get("CODE_SUMMARY_DIR")
    documented_code_dir = os.environ.get("DOCUMENTED_CODE_DIR")


    with get_openai_callback() as cb:
        #This switch will discard 1. the comments in the raw code copy and 2. the comments will be converted to the raw code csv 
        discard_original_comment = True
        
        code_repo_documentor = CodeRepoDocumentor(code_dir, store_src_code_dir,
                                                    csv_code_dir, csv_comment_dir, csv_new_comment_dir, 
                                                    csv_pure_gen_comment_dir, code_summary_dir, documented_code_dir,
                                                    code_suffix=code_suffix, language=language,
                                                    discard_original_comment=discard_original_comment,
                                                    code_lib_path=code_lib_path, code_vec_store=code_vec_store,
                                                    cb = cb)
        # Pipeline: embeddings -> preprocessing -> documentation -> packaging.
        code_repo_documentor.create_embedding()
        code_repo_documentor.code_preprocess()
        code_repo_documentor.document_repo()
        code_repo_documentor.package_documented_code("./documented_code")


--------------------------------------------------------------------------------
/auto_data_gen_val/preprocess_data/minhash_deduplicate.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import multiprocessing as mp
  3 | import re
  4 | from collections import defaultdict
  5 | from functools import partial
  6 | from typing import Dict, List, Optional, Set, Tuple, Type
  7 | 
  8 | from datasets import Dataset
  9 | from datasketch import MinHash, MinHashLSH
 10 | from dpu_utils.utils.iterators import ThreadedIterator
 11 | from tqdm import tqdm
 12 | 
 13 | 
 14 | NON_ALPHA = re.compile("[^A-Za-z_0-9]")
 15 | # parameters used in DuplicationIndex
 16 | MIN_NUM_TOKENS = 10
 17 | NUM_PERM = 256
 18 | 
 19 | 
 20 | def get_min_hash(tokens: List[str]) -> Optional[MinHash]:
 21 |     """Compute the MinHash of a code snippet."""
 22 |     if len(tokens) < MIN_NUM_TOKENS:
 23 |         return None
 24 |     min_hash = MinHash(num_perm=NUM_PERM)
 25 |     for token in set(tokens):
 26 |         min_hash.update(token.encode())
 27 |     return min_hash
 28 | 
 29 | 
 30 | def get_tokens(code: str) -> Set[str]:
 31 |     """Tokenize a code snippet."""
 32 |     return {t for t in NON_ALPHA.split(code) if len(t.strip()) > 0}
 33 | 
 34 | 
 35 | class DuplicationIndex:
 36 |     def __init__(
 37 |         self,
 38 |         *,
 39 |         duplication_jaccard_threshold: float = 0.85,
 40 |     ):
 41 |         self._duplication_jaccard_threshold = duplication_jaccard_threshold
 42 |         self._num_perm = NUM_PERM
 43 |         self._index = MinHashLSH(threshold=self._duplication_jaccard_threshold, num_perm=self._num_perm)
 44 | 
 45 |         self._duplicate_clusters = defaultdict(set)
 46 | 
 47 |     def add(self, code_key: Tuple, min_hash: MinHash) -> None:
 48 |         """Add a key to _index (MinHashLSH)
 49 |         the min_hash is used to query closest matches based on the jaccard_threshold.
 50 |         The new key is either added to a existing cluster of one close match,
 51 |         or a new cluster is created. The clusters created in this way, depend on the order of add.
 52 | 
 53 |         Args:
 54 |             code_key (Tuple of (index, repo_name, path)):
 55 |                 Theoritically any hasbale key. Here we use a tuple to retrieve the information later.
 56 |             min_hash: MinHash of the code_key.
 57 |         """
 58 |         close_duplicates = self._index.query(min_hash)
 59 |         if code_key in self._index.keys:
 60 |             print(f"Duplicate key {code_key}")
 61 |             return
 62 | 
 63 |         self._index.insert(code_key, min_hash)
 64 |         if len(close_duplicates) > 0:
 65 |             for base_duplicate in close_duplicates:
 66 |                 if base_duplicate in self._duplicate_clusters:
 67 |                     self._duplicate_clusters[base_duplicate].add(code_key)
 68 |                     break
 69 |             else:
 70 |                 self._duplicate_clusters[close_duplicates[0]].add(code_key)
 71 | 
 72 |     def get_duplicate_clusters(self) -> List[List[Dict]]:
 73 |         """Export the duplicate clusters.
 74 |         For each cluster, the first element is the base element of the cluster.
 75 |         The base element has an estimation jaccard similarity higher than the threshold with all the other elements.
 76 | 
 77 |         Returns:
 78 |             duplicate_clusters (List[List[Dict]]):
 79 |                 List of duplicate clusters.
 80 |         """
 81 |         duplicate_clusters = []
 82 |         for base, duplicates in self._duplicate_clusters.items():
 83 |             cluster = [base] + list(duplicates)
 84 |             # reformat the cluster to be a list of dict
 85 |             cluster = [{"base_index": el[0], "repo_name": el[1], "path": el[2]} for el in cluster]
 86 |             duplicate_clusters.append(cluster)
 87 |         return duplicate_clusters
 88 | 
 89 |     def save(self, filepath) -> None:
 90 |         duplicate_clusters = self.get_duplicate_clusters()
 91 |         with open(filepath, "w") as f:
 92 |             json.dump(duplicate_clusters, f)
 93 | 
 94 | 
 95 | def _compute_min_hash(element):
 96 |     index, data = element
 97 |     min_hash = get_min_hash([t for t in NON_ALPHA.split(data["text"]) if len(t.strip()) > 0])
 98 |     if min_hash is not None:
 99 |         return (index, data["repo_name"], data["path"]), min_hash
100 | 
101 | 
def minhash_iter(dataset_iterator: Type[Dataset]):
    """Yield the non-``None`` results of ``_compute_min_hash`` over the input.

    The work is distributed over a multiprocessing pool with
    ``imap_unordered``, so results are yielded in completion order rather than
    input order.
    """
    with mp.Pool() as pool:
        source = ThreadedIterator(dataset_iterator, max_queue_size=10000)
        results = pool.imap_unordered(_compute_min_hash, source, chunksize=100)
        yield from (result for result in results if result is not None)
111 | 
112 | 
def make_duplicate_clusters(dataset_iterator: Type[Dataset], jaccard_threshold: float):
    """Find duplicate clusters in the dataset in two steps.

    1. ``minhash_iter`` computes a MinHash for each enumerated record using a
       multiprocessing pool.
    2. Each ``(key, min_hash)`` pair is added sequentially to a
       ``DuplicationIndex``; a ``ThreadedIterator`` prefetches the pairs while
       the index is being updated.

    Returns:
        The clusters exported by ``DuplicationIndex.get_duplicate_clusters()``.
    """
    index = DuplicationIndex(duplication_jaccard_threshold=jaccard_threshold)

    hashed_records = minhash_iter(enumerate(dataset_iterator))
    for code_key, min_hash in tqdm(ThreadedIterator(hashed_records, max_queue_size=100)):
        index.add(code_key, min_hash)

    # Each cluster is a list of dicts describing its members.
    return index.get_duplicate_clusters()
127 | 
128 | 
def jaccard_similarity(code1: str, code2: str) -> float:
    """Compute the Jaccard similarity of the token sets of two code snippets.

    Args:
        code1: first snippet.
        code2: second snippet.

    Returns:
        ``|A & B| / |A | B|`` where A and B are the token sets produced by
        ``get_tokens``. When neither snippet yields a token, the two (empty)
        sets are identical and 1.0 is returned instead of dividing by zero.
    """
    tokens1 = get_tokens(code1)
    tokens2 = get_tokens(code2)
    union = tokens1 | tokens2
    if not union:
        # Both token sets are empty: avoid ZeroDivisionError.
        return 1.0
    return len(tokens1 & tokens2) / len(union)
134 | 
135 | 
136 | _shared_dataset = None
137 | 
138 | 
def _find_cluster_extremes_shared(cluster, jaccard_threshold):
    """Reduce a cluster to representatives ("extremes") covering all its members.

    Members are visited in order. A member whose Jaccard similarity to an
    already chosen extreme reaches the threshold is counted as a copy of the
    first such extreme; otherwise the member itself becomes a new extreme.
    Code texts are read from the module-level ``_shared_dataset``.

    Args:
        cluster (List[dict]):
            each dict contains the keys ``base_index``, ``repo_name`` and
            ``path``, as produced by DuplicationIndex.get_duplicate_clusters().
        jaccard_threshold (float):
            two codes are considered similar when their Jaccard similarity is
            greater than or equal to this value.

    Returns:
        extremes (List[dict]):
            The chosen extremes (the same dict objects as in ``cluster``), each
            with an added ``copies`` field counting the members it stands for,
            itself included.
    """
    extremes = []
    for candidate in cluster:
        candidate_code = _shared_dataset[candidate["base_index"]]["text"]
        matched = None
        for representative in extremes:
            representative_code = _shared_dataset[representative["base_index"]]["text"]
            if jaccard_similarity(candidate_code, representative_code) >= jaccard_threshold:
                matched = representative
                break
        if matched is None:
            candidate["copies"] = 1
            extremes.append(candidate)
        else:
            matched["copies"] += 1
    return extremes
171 | 
172 | 
def find_extremes(cluster_list, dataset, jaccard_threshold):
    """Run ``_find_cluster_extremes_shared`` over all clusters in a process pool.

    Args:
        cluster_list (List[List[Dict]]):
            each cluster is a list of dicts with the key base_index,
            referring to the index of the code in the dataset.
        dataset (Type[Dataset]):
            used to look up the code text for a base_index. It is published to
            the workers through the module-level ``_shared_dataset`` global
            rather than passed as an argument.
        jaccard_threshold (float):
            the threshold for the jaccard similarity.

    Returns:
        extremes_list (List[List[Dict]]):
            One reduced cluster per input cluster, in completion order (the
            pool uses ``imap_unordered``). See _find_cluster_extremes_shared
            for the definition of extremes.
    """
    global _shared_dataset
    _shared_dataset = dataset
    reduce_cluster = partial(_find_cluster_extremes_shared, jaccard_threshold=jaccard_threshold)
    with mp.Pool() as pool:
        reduced = pool.imap_unordered(reduce_cluster, cluster_list)
        # Drain the iterator while the pool is still alive.
        return list(tqdm(reduced, total=len(cluster_list)))
207 | 
208 | 
def deduplicate_dataset(
    dataset: Type[Dataset], jaccard_threshold: float = 0.85
) -> Tuple[Type[Dataset], List[List[Dict]]]:
    """Deduplicate the dataset using MinHash clustering and Jaccard similarity.

    Duplicate clusters are built first; each cluster is then reduced to its
    extremes, and every clustered record that is not an extreme is filtered
    out of the dataset.

    Args:
        dataset (Type[Dataset]):
            The dataset to deduplicate.
        jaccard_threshold (float, default=0.85):
            jaccard threshold to determine if two codes are similar

    Returns:
        ds_dedup (Type[Dataset]):
            The deduplicated dataset.
        duplicate_clusters (List[List[Dict]]):
            The list of duplicate clusters.
            Each cluster is a list of dicts with the following keys:
            - base_index : int
                The index of the code in the original dataset.
            - repo_name : str
            - path : str
            - copies : int
                Only set on extremes: the number of cluster members the
                extreme stands for.
            - is_extreme : bool
                Whether the code is an extreme in the cluster.

    Example:
        >>> from datasets import load_dataset
        >>> from minhash_deduplicate import deduplicate_dataset
        >>> ds = load_dataset("lvwerra/codeparrot-clean", split="train")
        >>> ds_dedup, duplicate_clusters = deduplicate_dataset(ds, jaccard_threshold=0.85)
    """
    duplicate_clusters = make_duplicate_clusters(dataset, jaccard_threshold)
    duplicate_indices = {member["base_index"] for cluster in duplicate_clusters for member in cluster}

    extremes_clusters = find_extremes(duplicate_clusters, dataset, jaccard_threshold)
    extreme_dict = {
        extreme["base_index"]: extreme
        for extremes in extremes_clusters
        for extreme in extremes
    }

    remove_indices = duplicate_indices - set(extreme_dict)
    ds_filter = dataset.filter(lambda x, idx: idx not in remove_indices, with_indices=True)

    # Annotate the clusters with the outcome of the extreme search.
    for cluster in duplicate_clusters:
        for element in cluster:
            extreme = extreme_dict.get(element["base_index"])
            element["is_extreme"] = extreme is not None
            if extreme is not None:
                element["copies"] = extreme["copies"]

    print(f"Original dataset size: {len(dataset)}")
    print(f"Number of duplicate clusters: {len(duplicate_clusters)}")
    print(f"Files in duplicate cluster: {len(duplicate_indices)}")
    print(f"Unique files in duplicate cluster: {len(extreme_dict)}")
    print(f"Filtered dataset size: {len(ds_filter)}")

    return ds_filter, duplicate_clusters


--------------------------------------------------------------------------------
/auto_data_gen_val/preprocess_data/process_data/minhash.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import multiprocessing as mp
  3 | import re
  4 | from collections import defaultdict
  5 | from functools import partial
  6 | from typing import Dict, List, Optional, Set, Tuple, Type
  7 | 
  8 | from datasets import Dataset
  9 | from datasketch import MinHash, MinHashLSH
 10 | # from dpu_utils.utils.iterators import ThreadedIterator
 11 | from tqdm import tqdm
 12 | 
 13 | 
 14 | 
 15 | from functools import partial
 16 | import sys
 17 | import queue
 18 | import threading
 19 | from typing import TypeVar, Iterator, List, Optional, Tuple
 20 | 
 21 | T = TypeVar('T')
 22 | 
 23 | class ThreadedIterator(Iterator[T]):
 24 |     """An iterator object that computes its elements in a single parallel thread to be ready to be consumed.
 25 |     The iterator should *not* return `None`. Elements of the original iterable will be shuffled arbitrarily."""
 26 |     def __init__(self, original_iterator: Iterator[T], max_queue_size: int = 2, enabled: bool = True):
 27 |         self.__is_enabled = enabled
 28 |         if enabled:
 29 |             self.__queue = queue.Queue(maxsize=max_queue_size)  # type: queue.Queue[Optional[T]]
 30 |             self.__thread = threading.Thread(target=lambda: self.__worker(self.__queue, original_iterator), daemon=True)
 31 |             self.__thread.start()
 32 |         else:
 33 |             self.__original_iterator = original_iterator
 34 | 
 35 |     @staticmethod
 36 |     def __worker(queue: queue.Queue, original_iterator: Iterator[T])-> None:
 37 |         try:
 38 |             for element in original_iterator:
 39 |                 assert element is not None, 'By convention, Iterables wrapped in ThreadedIterator may not contain None.'
 40 |                 queue.put(element, block=True)
 41 |             queue.put(None, block=True)
 42 |         except Exception as e:
 43 |             _, __, tb = sys.exc_info()
 44 |             queue.put((e, tb), block=True)
 45 | 
 46 |     def __next__(self) -> T:
 47 |         next_element = self.__queue.get(block=True)
 48 |         if next_element is None:
 49 |             self.__thread.join()
 50 |             self.__queue.put(None)  # Make sure that we remember that we are done if we are called once more...
 51 |             raise StopIteration
 52 |         if isinstance(next_element, tuple) and isinstance(next_element[0], Exception):
 53 |             raise next_element[0].with_traceback(next_element[1])
 54 |         return next_element
 55 | 
 56 |     def __iter__(self):
 57 |         if self.__is_enabled:
 58 |             return self
 59 |         else:
 60 |             return iter(self.__original_iterator)
 61 | 
 62 | 
 63 | 
 64 | 
 65 | 
# Matches any single character that is not an ASCII letter, digit or underscore;
# used as the delimiter when splitting code into tokens.
NON_ALPHA = re.compile("[^A-Za-z_0-9]")
# Snippets with fewer tokens than this are not hashed (get_min_hash returns None).
MIN_NUM_TOKENS = 10
# Number of permutations used for each MinHash signature and for the LSH index
# built in DuplicationIndex.
NUM_PERM = 256
 70 | 
 71 | def get_min_hash(tokens: List[str]) -> Optional[MinHash]:
 72 |     """Compute the MinHash of a code snippet."""
 73 |     if len(tokens) < MIN_NUM_TOKENS:
 74 |         return None
 75 |     min_hash = MinHash(num_perm=NUM_PERM)
 76 |     for token in set(tokens):
 77 |         min_hash.update(token.encode())
 78 |     return min_hash
 79 | 
 80 | 
 81 | def get_tokens(code: str) -> Set[str]:
 82 |     """Tokenize a code snippet."""
 83 |     return {t for t in NON_ALPHA.split(code) if len(t.strip()) > 0}
 84 | 
 85 | 
 86 | class DuplicationIndex:
 87 |     def __init__(
 88 |         self,
 89 |         *,
 90 |         duplication_jaccard_threshold: float = 0.85,
 91 |     ):
 92 |         self._duplication_jaccard_threshold = duplication_jaccard_threshold
 93 |         self._num_perm = NUM_PERM
 94 |         self._index = MinHashLSH(threshold=self._duplication_jaccard_threshold, num_perm=self._num_perm)
 95 | 
 96 |         self._duplicate_clusters = defaultdict(set)
 97 | 
 98 |     def add(self, code_key: Tuple, min_hash: MinHash) -> None:
 99 |         """Add a key to _index (MinHashLSH)
100 |         the min_hash is used to query closest matches based on the jaccard_threshold.
101 |         The new key is either added to a existing cluster of one close match,
102 |         or a new cluster is created. The clusters created in this way, depend on the order of add.
103 | 
104 |         Args:
105 |             code_key (Tuple of (index, repo_name, path)):
106 |                 Theoritically any hasbale key. Here we use a tuple to retrieve the information later.
107 |             min_hash: MinHash of the code_key.
108 |         """
109 |         close_duplicates = self._index.query(min_hash)
110 |         if code_key in self._index.keys:
111 |             print(f"Duplicate key {code_key}")
112 |             return
113 | 
114 |         self._index.insert(code_key, min_hash)
115 |         if len(close_duplicates) > 0:
116 |             for base_duplicate in close_duplicates:
117 |                 if base_duplicate in self._duplicate_clusters:
118 |                     self._duplicate_clusters[base_duplicate].add(code_key)
119 |                     break
120 |             else:
121 |                 self._duplicate_clusters[close_duplicates[0]].add(code_key)
122 | 
123 |     def get_duplicate_clusters(self) -> List[List[Dict]]:
124 |         """Export the duplicate clusters.
125 |         For each cluster, the first element is the base element of the cluster.
126 |         The base element has an estimation jaccard similarity higher than the threshold with all the other elements.
127 | 
128 |         Returns:
129 |             duplicate_clusters (List[List[Dict]]):
130 |                 List of duplicate clusters.
131 |         """
132 |         duplicate_clusters = []
133 |         for base, duplicates in self._duplicate_clusters.items():
134 |             cluster = [base] + list(duplicates)
135 |             # reformat the cluster to be a list of dict
136 |             cluster = [{"base_index": el[0], "repo_name": el[1], "path": el[2]} for el in cluster]
137 |             duplicate_clusters.append(cluster)
138 |         return duplicate_clusters
139 | 
140 |     def save(self, filepath) -> None:
141 |         duplicate_clusters = self.get_duplicate_clusters()
142 |         with open(filepath, "w") as f:
143 |             json.dump(duplicate_clusters, f)
144 | 
145 | 
def _compute_min_hash(element):
    """Compute the MinHash for one ``(index, record)`` pair.

    The record's ``"text"`` field is split on ``NON_ALPHA`` and blank pieces
    are discarded before hashing.

    Returns:
        ``((index, "norepo", "nopath"), min_hash)``, or ``None`` when
        ``get_min_hash`` produced no signature for the record.
    """
    index, data = element
    tokens = [piece for piece in NON_ALPHA.split(data["text"]) if piece.strip()]
    signature = get_min_hash(tokens)
    if signature is None:
        return None
    # Repo name and path are placeholders; they can be supplied later and are
    # carried through by get_duplicate_clusters.
    return (index, "norepo", "nopath"), signature
152 | 
153 | 
def minhash_iter(dataset_iterator: Type[Dataset]):
    """Yield ``(key, min_hash)`` pairs computed by a multiprocessing pool.

    ``dataset_iterator`` is wrapped in a ``ThreadedIterator`` and fed to
    ``_compute_min_hash`` through ``imap_unordered``, so results arrive in
    arbitrary order. Records for which no MinHash was produced are skipped.
    """
    with mp.Pool() as pool:
        prefetched = ThreadedIterator(dataset_iterator, max_queue_size=10000)
        hashed = pool.imap_unordered(_compute_min_hash, prefetched, chunksize=100)
        for result in hashed:
            if result is None:
                continue
            yield result
163 | 
164 | 
def make_duplicate_clusters(dataset_iterator: Type[Dataset], jaccard_threshold: float):
    """Build the duplicate clusters of a dataset in two stages.

    1. MinHash signatures are computed for the enumerated records by
       ``minhash_iter`` (multiprocessing pool). MinHash is a tool for fast
       Jaccard similarity estimation.
    2. Each signature is added, one after the other, to a ``DuplicationIndex``.
       This stage runs sequentially; a ``ThreadedIterator`` buffers the
       signatures coming out of stage 1 while it does.

    Returns:
        The clusters exported by ``DuplicationIndex.get_duplicate_clusters()``.
    """
    index = DuplicationIndex(duplication_jaccard_threshold=jaccard_threshold)
    hashed_stream = ThreadedIterator(minhash_iter(enumerate(dataset_iterator)), max_queue_size=100)

    for key, signature in tqdm(hashed_stream):
        index.add(key, signature)

    return index.get_duplicate_clusters()
179 | 
180 | 
def jaccard_similarity(code1: str, code2: str) -> float:
    """Compute the Jaccard similarity of two code snippets.

    Both snippets are tokenized with ``get_tokens`` and the similarity is the
    size of the intersection of the two token sets divided by the size of
    their union.

    Args:
        code1: First code snippet.
        code2: Second code snippet.

    Returns:
        A value in ``[0.0, 1.0]``. When neither snippet contains any token the
        two (empty) token sets are identical and ``1.0`` is returned, instead
        of failing with a division by zero.
    """
    tokens1 = get_tokens(code1)
    tokens2 = get_tokens(code2)
    union_size = len(tokens1 | tokens2)
    if union_size == 0:
        # Both token sets are empty: avoid 0 / 0 and treat them as identical.
        return 1.0
    return len(tokens1 & tokens2) / union_size
186 | 
187 | 
# Dataset handle read by _find_cluster_extremes_shared; find_extremes assigns it
# before creating its worker pool.
_shared_dataset = None
189 | 
190 | 
def _find_cluster_extremes_shared(cluster, jaccard_threshold):
    """Reduce a cluster to a list of representatives ("extremes").

    Members are visited in order. A member whose Jaccard similarity with an
    already selected extreme is at least ``jaccard_threshold`` is counted as a
    copy of the first such extreme; otherwise the member becomes a new extreme.
    The snippet text is read from the module-level ``_shared_dataset``.

    Args:
        cluster (List[dict]):
            Each dict contains the keys ``base_index``, ``repo_name`` and
            ``path``, as produced by DuplicationIndex.get_duplicate_clusters().
        jaccard_threshold (float):
            Minimum Jaccard similarity for two snippets to count as similar.

    Returns:
        extremes (List[dict]):
            The selected members of ``cluster``. Each one gets a ``copies``
            field holding the number of cluster members it stands for,
            itself included. The dicts are modified in place.
    """
    representatives = []
    for candidate in cluster:
        candidate_code = _shared_dataset[candidate["base_index"]]["text"]
        matched = None
        for representative in representatives:
            representative_code = _shared_dataset[representative["base_index"]]["text"]
            if jaccard_similarity(candidate_code, representative_code) >= jaccard_threshold:
                matched = representative
                break
        if matched is None:
            # Nothing similar has been kept so far: the candidate is a new extreme.
            candidate["copies"] = 1
            representatives.append(candidate)
        else:
            matched["copies"] += 1
    return representatives
223 | 
224 | 
def find_extremes(cluster_list, dataset, jaccard_threshold):
    """Run ``_find_cluster_extremes_shared`` over all clusters with a process pool.

    Args:
        cluster_list (List[List[Dict]]):
            Each cluster is a list of dicts with the key ``base_index``,
            referring to the index of the snippet in ``dataset``.
        dataset (Type[Dataset]):
            Used to look up the snippet text by ``base_index``. It is published
            through the module-level global ``_shared_dataset`` before the pool
            is created rather than being passed with every task.
            NOTE(review): this presumably relies on the workers inheriting the
            global from the parent process (fork start method) - confirm.
        jaccard_threshold (float):
            Threshold for the Jaccard similarity.

    Returns:
        extremes_list (List[List[Dict]]):
            One reduced cluster per input cluster, in completion order (the
            pool uses ``imap_unordered``). See _find_cluster_extremes_shared
            for the definition of extremes.
    """
    global _shared_dataset
    _shared_dataset = dataset
    reduce_cluster = partial(_find_cluster_extremes_shared, jaccard_threshold=jaccard_threshold)
    with mp.Pool() as pool:
        reduced = pool.imap_unordered(reduce_cluster, cluster_list)
        # Drain the results while the pool is still alive.
        return list(tqdm(reduced, total=len(cluster_list)))
259 | 
260 | 
def deduplicate_dataset(
    dataset: Type[Dataset], jaccard_threshold: float = 0.85
) -> Tuple[Type[Dataset], List[List[Dict]]]:
    """Remove near-duplicate records from ``dataset`` using MinHash and Jaccard similarity.

    Duplicate clusters are built first; each cluster is then reduced to its
    extremes, and every clustered record that is not an extreme is filtered
    out of the dataset. A short summary is printed at the end.

    Args:
        dataset (Type[Dataset]):
            The dataset to deduplicate.
        jaccard_threshold (float, default=0.85):
            Jaccard threshold used to decide whether two snippets are similar.

    Returns:
        ds_dedup (Type[Dataset]):
            The deduplicated dataset.
        duplicate_clusters (List[List[Dict]]):
            The list of duplicate clusters.
            Each cluster is a list of dicts with the following keys:
            - base_index : int
                The index of the snippet in the original dataset.
            - repo_name : str
            - path : str
            - copies : int
                Number of cluster members represented by this snippet
                (set only on extremes).
            - is_extreme : bool
                Whether the snippet is an extreme of its cluster.

    Example:
        >>> from datasets import load_dataset
        >>> from minhash_deduplication import deduplicate_dataset
        >>> ds = load_dataset("lvwerra/codeparrot-clean", split="train")
        >>> ds_dedup, duplicate_clusters = deduplicate_dataset(ds, jaccard_threshold=0.85)
    """
    duplicate_clusters = make_duplicate_clusters(dataset, jaccard_threshold)

    # Every dataset index that appears in some duplicate cluster.
    duplicate_indices = set()
    for cluster in duplicate_clusters:
        duplicate_indices.update(member["base_index"] for member in cluster)

    # base_index -> extreme record returned by find_extremes.
    extreme_dict = {
        kept["base_index"]: kept
        for extremes in find_extremes(duplicate_clusters, dataset, jaccard_threshold)
        for kept in extremes
    }

    remove_indices = duplicate_indices.difference(extreme_dict)
    ds_filter = dataset.filter(lambda x, idx: idx not in remove_indices, with_indices=True)

    # Annotate the exported clusters with the outcome of the reduction.
    for cluster in duplicate_clusters:
        for member in cluster:
            kept = extreme_dict.get(member["base_index"])
            member["is_extreme"] = kept is not None
            if kept is not None:
                member["copies"] = kept["copies"]

    print(f"Original dataset size: {len(dataset)}")
    print(f"Number of duplicate clusters: {len(duplicate_clusters)}")
    print(f"Files in duplicate cluster: {len(duplicate_indices)}")
    print(f"Unique files in duplicate cluster: {len(extreme_dict)}")
    print(f"Filtered dataset size: {len(ds_filter)}")

    return ds_filter, duplicate_clusters


--------------------------------------------------------------------------------