├── apicoder ├── APIRetriever │ ├── src │ │ ├── dense │ │ │ ├── __init__.py │ │ │ ├── driver │ │ │ │ ├── __init__.py │ │ │ │ ├── __init__.pyc │ │ │ │ ├── train.py │ │ │ │ └── encode.py │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ └── format │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── convert_result_to_trec.py │ │ │ ├── faiss_retriever │ │ │ │ ├── __init__.py │ │ │ │ ├── retriever.py │ │ │ │ ├── reducer.py │ │ │ │ └── __main__.py │ │ │ ├── processor │ │ │ │ ├── __init__.py │ │ │ │ └── processors.py │ │ │ ├── dataset │ │ │ │ ├── __init__.py │ │ │ │ └── processor.py │ │ │ ├── loss.py │ │ │ ├── arguments.py │ │ │ └── trainer.py │ │ ├── run_trec_format_4.sh │ │ ├── run_search_3.sh │ │ ├── run_train_1.sh │ │ └── run_encode_2.sh │ ├── build │ │ └── lib │ │ │ └── dense │ │ │ ├── __init__.py │ │ │ ├── driver │ │ │ ├── __init__.py │ │ │ ├── train.py │ │ │ └── encode.py │ │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── format │ │ │ │ ├── __init__.py │ │ │ │ └── convert_result_to_trec.py │ │ │ ├── faiss_retriever │ │ │ ├── __init__.py │ │ │ ├── retriever.py │ │ │ ├── reducer.py │ │ │ └── __main__.py │ │ │ ├── processor │ │ │ ├── __init__.py │ │ │ └── processors.py │ │ │ ├── dataset │ │ │ ├── __init__.py │ │ │ └── processor.py │ │ │ ├── loss.py │ │ │ ├── arguments.py │ │ │ └── trainer.py │ ├── data │ │ └── inference │ │ │ └── README.md │ ├── requirements.txt │ ├── setup.py │ ├── scripts │ │ ├── run_extract_apiretriever_corpus.sh │ │ ├── run_prepare_test_private_code.py │ │ └── run_prepare_train_private_code.py │ └── README.md ├── private-eval │ ├── private_eval │ │ ├── __init__.py │ │ ├── evaluate_functional_correctness.py │ │ ├── data.py │ │ └── evaluation.py │ ├── requirements.txt │ ├── data │ │ ├── real_numpy_eval_v3.jsonl.gz │ │ ├── real_beatnum_eval_v3.jsonl.gz │ │ ├── real_monkey_eval_v3.jsonl.gz │ │ ├── real_pandas_eval_v3.jsonl.gz │ │ ├── real_torchdata_eval_v3.jsonl.gz │ │ ├── real_monkey_eval_v3_api_1.jsonl.gz │ │ ├── real_monkey_eval_v3_api_2.jsonl.gz │ │ ├── real_monkey_eval_v3_api_3.jsonl.gz │ │ ├── real_monkey_eval_v3_api_5.jsonl.gz │ │ ├── real_monkey_eval_v3_api_n.jsonl.gz │ │ ├── real_numpy_eval_v3_api_1.jsonl.gz │ │ ├── real_numpy_eval_v3_api_2.jsonl.gz │ │ ├── real_numpy_eval_v3_api_3.jsonl.gz │ │ ├── real_numpy_eval_v3_api_5.jsonl.gz │ │ ├── real_numpy_eval_v3_api_n.jsonl.gz │ │ ├── real_pandas_eval_v3_api_1.jsonl.gz │ │ ├── real_pandas_eval_v3_api_2.jsonl.gz │ │ ├── real_pandas_eval_v3_api_3.jsonl.gz │ │ ├── real_pandas_eval_v3_api_5.jsonl.gz │ │ ├── real_pandas_eval_v3_api_n.jsonl.gz │ │ ├── real_beatnum_eval_v3_api_1.jsonl.gz │ │ ├── real_beatnum_eval_v3_api_2.jsonl.gz │ │ ├── real_beatnum_eval_v3_api_3.jsonl.gz │ │ ├── real_beatnum_eval_v3_api_5.jsonl.gz │ │ ├── real_beatnum_eval_v3_api_n.jsonl.gz │ │ ├── real_torchdata_eval_v3_api_1.jsonl.gz │ │ ├── real_torchdata_eval_v3_api_2.jsonl.gz │ │ ├── real_torchdata_eval_v3_api_3.jsonl.gz │ │ ├── real_torchdata_eval_v3_api_5.jsonl.gz │ │ ├── real_torchdata_eval_v3_api_n.jsonl.gz │ │ ├── real_beatnum_eval_v3_human_labelled.jsonl.gz │ │ ├── real_monkey_eval_v3_human_labelled.jsonl.gz │ │ ├── real_torchdata_eval_v3_human_labelled.jsonl.gz │ │ ├── real_torchdata_eval_v3_api_1_make_sense.jsonl.gz │ │ ├── real_torchdata_eval_v3_api_2_make_sense.jsonl.gz │ │ ├── real_torchdata_eval_v3_api_3_make_sense.jsonl.gz │ │ ├── real_torchdata_eval_v3_api_5_make_sense.jsonl.gz │ │ ├── real_torchdata_eval_v3_api_n_make_sense.jsonl.gz │ │ ├── real_torchdata_eval_v3_human_labelled_make_sense.jsonl.gz │ │ ├── numpy_keywords.jsonl │ │ ├── pandas_keywords.jsonl 
│ │ ├── XXXAPIEval-make sense.ipynb │ │ └── TorchData_no.API_number_0.CodeGen.hm_False.machine.t0.1.p0.9.l100.n1.samples.jsonl │ ├── setup.py │ ├── LICENSE │ └── README.md ├── CodeGenAPI │ ├── scripts │ │ ├── __init__.py │ │ ├── requirements.txt │ │ ├── run_details_apis.sh │ │ ├── run_extract_apis.sh │ │ ├── run_encode_private_data.sh │ │ └── get_comments_from_evallibs.py │ ├── requirements.txt │ ├── run_evaluating_codes.sh │ ├── nl2code │ │ ├── __init__.py │ │ └── configuration_codegen.py │ ├── run_generating_codes.sh │ ├── APICoder │ │ ├── get_lib_comment_for_eval.py │ │ └── get_api_info_by_name.py │ ├── run_private.sh │ └── README.md └── data │ ├── CodeGenAPI │ └── README.md │ ├── Cleaned-Private-Code-Files │ └── README.md │ ├── EncodedCorpus4CodeGenAPI │ └── README.md │ └── API-Doc │ └── README.md ├── cert ├── pandas-numpy-eval │ ├── pandas_numpy_eval │ │ ├── __init__.py │ │ ├── evaluate_functional_correctness.py │ │ ├── data.py │ │ └── evaluation.py │ ├── requirements.txt │ ├── data │ │ ├── NumpyEval.jsonl.gz │ │ └── PandasEval.jsonl.gz │ ├── setup.py │ ├── LICENSE │ └── README.md ├── scripts │ ├── requirements.txt │ ├── run_encode_domain.sh │ └── ast_utils.py ├── requirements.txt ├── run_evaluating_codes.sh ├── nl2code │ ├── __init__.py │ └── dynamic_block_dataset.py ├── run_generating_codes.sh ├── README.md └── run_training_cert.sh ├── requirements.txt ├── CODE_OF_CONDUCT.md ├── LICENSE ├── .github └── workflows │ └── codeql.yml ├── SECURITY.md ├── eval_human_eval.py └── README.md /apicoder/APIRetriever/src/dense/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apicoder/private-eval/private_eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/build/lib/dense/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/dense/driver/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/dense/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cert/pandas-numpy-eval/pandas_numpy_eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/build/lib/dense/driver/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/build/lib/dense/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/dense/utils/format/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/build/lib/dense/utils/format/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | transformers 3 | sentencepiece 4 | protobuf -------------------------------------------------------------------------------- /apicoder/CodeGenAPI/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("scripts") 3 | -------------------------------------------------------------------------------- /apicoder/data/CodeGenAPI/README.md: -------------------------------------------------------------------------------- 1 | The weights, vocabulary, and tokenizer of CodeGenAPI-350M-mono. -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/dense/faiss_retriever/__init__.py: -------------------------------------------------------------------------------- 1 | from .retriever import BaseFaissIPRetriever 2 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/build/lib/dense/faiss_retriever/__init__.py: -------------------------------------------------------------------------------- 1 | from .retriever import BaseFaissIPRetriever 2 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/data/inference/README.md: -------------------------------------------------------------------------------- 1 | Download all embedding files for our benchmarks and put them under this folder. -------------------------------------------------------------------------------- /apicoder/private-eval/requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | fire 3 | numpy==1.21.4 4 | pandas==1.3.5 5 | docformatter 6 | autopep8 7 | ipdb -------------------------------------------------------------------------------- /cert/pandas-numpy-eval/requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | fire 3 | numpy==1.21.4 4 | pandas==1.3.5 5 | docformatter 6 | autopep8 7 | ipdb 8 | -------------------------------------------------------------------------------- /cert/pandas-numpy-eval/data/NumpyEval.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/cert/pandas-numpy-eval/data/NumpyEval.jsonl.gz -------------------------------------------------------------------------------- /cert/pandas-numpy-eval/data/PandasEval.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/cert/pandas-numpy-eval/data/PandasEval.jsonl.gz -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/dense/driver/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/APIRetriever/src/dense/driver/__init__.pyc -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_numpy_eval_v3.jsonl.gz: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_numpy_eval_v3.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_beatnum_eval_v3.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_beatnum_eval_v3.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_monkey_eval_v3.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_monkey_eval_v3.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_pandas_eval_v3.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_pandas_eval_v3.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_torchdata_eval_v3.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_monkey_eval_v3_api_1.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_monkey_eval_v3_api_1.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_monkey_eval_v3_api_2.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_monkey_eval_v3_api_2.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_monkey_eval_v3_api_3.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_monkey_eval_v3_api_3.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_monkey_eval_v3_api_5.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_monkey_eval_v3_api_5.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_monkey_eval_v3_api_n.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_monkey_eval_v3_api_n.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_numpy_eval_v3_api_1.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_numpy_eval_v3_api_1.jsonl.gz 
-------------------------------------------------------------------------------- /apicoder/private-eval/data/real_numpy_eval_v3_api_2.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_numpy_eval_v3_api_2.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_numpy_eval_v3_api_3.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_numpy_eval_v3_api_3.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_numpy_eval_v3_api_5.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_numpy_eval_v3_api_5.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_numpy_eval_v3_api_n.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_numpy_eval_v3_api_n.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_pandas_eval_v3_api_1.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_pandas_eval_v3_api_1.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_pandas_eval_v3_api_2.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_pandas_eval_v3_api_2.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_pandas_eval_v3_api_3.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_pandas_eval_v3_api_3.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_pandas_eval_v3_api_5.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_pandas_eval_v3_api_5.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_pandas_eval_v3_api_n.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_pandas_eval_v3_api_n.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_beatnum_eval_v3_api_1.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_beatnum_eval_v3_api_1.jsonl.gz -------------------------------------------------------------------------------- 
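Note: the private-eval benchmark files listed here are gzipped JSONL, one task per line, and the `_api_1`/`_api_2`/`_api_3`/`_api_5`/`_api_n` suffixes appear to correspond to prompt variants carrying different numbers of API hints. A minimal sketch for streaming any of these files, assuming a locally downloaded copy and a HumanEval-style `task_id` field (both assumptions, not guaranteed by this listing):

```python
import gzip
import json

def stream_jsonl_gz(path: str):
    # Each line of a *.jsonl.gz benchmark file holds one JSON-encoded task.
    with gzip.open(path, "rt", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                yield json.loads(line)

# Hypothetical usage: inspect the fields of the first task in one file.
for task in stream_jsonl_gz("real_torchdata_eval_v3.jsonl.gz"):
    print(sorted(task.keys()))  # a HumanEval-style file would expose "task_id"
    break
```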
/apicoder/private-eval/data/real_beatnum_eval_v3_api_2.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_beatnum_eval_v3_api_2.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_beatnum_eval_v3_api_3.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_beatnum_eval_v3_api_3.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_beatnum_eval_v3_api_5.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_beatnum_eval_v3_api_5.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_beatnum_eval_v3_api_n.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_beatnum_eval_v3_api_n.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_torchdata_eval_v3_api_1.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_api_1.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_torchdata_eval_v3_api_2.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_api_2.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_torchdata_eval_v3_api_3.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_api_3.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_torchdata_eval_v3_api_5.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_api_5.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_torchdata_eval_v3_api_n.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_api_n.jsonl.gz -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/dense/processor/__init__.py: -------------------------------------------------------------------------------- 1 | from .processors import SimpleTrainProcessor, SimpleCollectionProcessor 2 | 3 | MarcoPassageTrainProcessor = SimpleTrainProcessor 4 | -------------------------------------------------------------------------------- 
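The `dense/processor/__init__.py` just shown keeps `MarcoPassageTrainProcessor` as a backward-compatible alias of `SimpleTrainProcessor` (the `build/lib` copy that follows is the setuptools build artifact mirroring `src`). A one-line sketch, assuming the package has been installed via the APIRetriever `setup.py`, makes the aliasing explicit:

```python
from dense.processor import MarcoPassageTrainProcessor, SimpleTrainProcessor

# Both names are bound to the same class object, so older code written
# against the MS MARCO-specific name keeps working unchanged.
assert MarcoPassageTrainProcessor is SimpleTrainProcessor
```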
/apicoder/APIRetriever/build/lib/dense/processor/__init__.py: -------------------------------------------------------------------------------- 1 | from .processors import SimpleTrainProcessor, SimpleCollectionProcessor 2 | 3 | MarcoPassageTrainProcessor = SimpleTrainProcessor 4 | -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_beatnum_eval_v3_human_labelled.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_beatnum_eval_v3_human_labelled.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_monkey_eval_v3_human_labelled.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_monkey_eval_v3_human_labelled.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_torchdata_eval_v3_human_labelled.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_human_labelled.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_torchdata_eval_v3_api_1_make_sense.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_api_1_make_sense.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_torchdata_eval_v3_api_2_make_sense.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_api_2_make_sense.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_torchdata_eval_v3_api_3_make_sense.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_api_3_make_sense.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_torchdata_eval_v3_api_5_make_sense.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_api_5_make_sense.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_torchdata_eval_v3_api_n_make_sense.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_api_n_make_sense.jsonl.gz -------------------------------------------------------------------------------- /apicoder/private-eval/data/real_torchdata_eval_v3_human_labelled_make_sense.jsonl.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_human_labelled_make_sense.jsonl.gz -------------------------------------------------------------------------------- /cert/scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.12.5 2 | sentencepiece 3 | protobuf 4 | wandb 5 | datasets 6 | numpy 7 | cython 8 | fairseq 9 | autopep8 10 | docformatter 11 | zstandard 12 | beautifulsoup4 13 | lxml 14 | ipdb 15 | redbaron 16 | func-timeout -------------------------------------------------------------------------------- /apicoder/CodeGenAPI/scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.12.5 2 | sentencepiece 3 | protobuf 4 | wandb 5 | datasets 6 | numpy 7 | cython 8 | fairseq 9 | autopep8 10 | docformatter 11 | zstandard 12 | beautifulsoup4 13 | lxml 14 | ipdb 15 | redbaron 16 | func-timeout -------------------------------------------------------------------------------- /apicoder/APIRetriever/requirements.txt: -------------------------------------------------------------------------------- 1 | torch<=1.8.0 2 | faiss-cpu>=1.6.5 3 | transformers==4.2.0 4 | datasets==1.1.3 5 | wandb==0.13.3 6 | sentencepiece 7 | protobuf 8 | numpy 9 | cython 10 | fairseq 11 | autopep8 12 | docformatter 13 | zstandard 14 | beautifulsoup4 15 | lxml 16 | ipdb 17 | redbaron 18 | func-timeout -------------------------------------------------------------------------------- /apicoder/data/Cleaned-Private-Code-Files/README.md: -------------------------------------------------------------------------------- 1 | This folder contains all code files of the 31 public libraries we defined. 2 | ``` 3 | pandas,numpy,sklearn,torch,tensorflow,django,selenium,matplotlib,flask,scipy,seaborn,nltk,beautifulsoup,pygame,PIL,jieba,gensim,spacy,transformers,fairseq,sqlalchemy,scrapy,allennlp,datasets,tokenizers,mxnet,imageio,pytest,metpy,ansible 4 | ``` -------------------------------------------------------------------------------- /cert/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.12.5 2 | sentencepiece 3 | protobuf 4 | wandb 5 | tqdm 6 | datasets 7 | tensorboard 8 | fairseq 9 | fairscale 10 | zstandard 11 | openpyxl 12 | matplotlib 13 | pandas>=1.1.2 14 | torchvision>=0.7.0 15 | seaborn>=0.11.2 16 | pyyaml 17 | ipdb 18 | numpy 19 | cython 20 | autopep8 21 | docformatter 22 | beautifulsoup4 23 | lxml 24 | redbaron 25 | func-timeout -------------------------------------------------------------------------------- /apicoder/data/EncodedCorpus4CodeGenAPI/README.md: -------------------------------------------------------------------------------- 1 | This folder contains all encoded code files (i.e., after tokenization) of the 31 public libraries we defined.
2 | ``` 3 | pandas,numpy,sklearn,torch,tensorflow,django,selenium,matplotlib,flask,scipy,seaborn,nltk,beautifulsoup,pygame,PIL,jieba,gensim,spacy,transformers,fairseq,sqlalchemy,scrapy,allennlp,datasets,tokenizers,mxnet,imageio,pytest,metpy,ansible 4 | ``` -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/run_trec_format_4.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LIBRARIES=( "pandas" "numpy" "monkey" "beatnum" "torchdata") 4 | 5 | for LIBRARY in ${LIBRARIES[@]}; do 6 | echo "Library: $LIBRARY" 7 | INPUT_DIR="../data/inference" 8 | RUN="$INPUT_DIR/${LIBRARY}_id_score.txt" 9 | TREC_RUN="$INPUT_DIR/${LIBRARY}_id_score.trec" 10 | 11 | python -m dense.utils.format.convert_result_to_trec --input $RUN --output $TREC_RUN 12 | done 13 | -------------------------------------------------------------------------------- /apicoder/CodeGenAPI/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.12.5 2 | sentencepiece 3 | wandb 4 | tqdm 5 | datasets 6 | tensorboard 7 | fairseq 8 | fairscale 9 | deepspeed 10 | zstandard 11 | openpyxl 12 | matplotlib 13 | pandas>=1.1.2 14 | torch>=1.6.0 15 | torchvision>=0.7.0 16 | seaborn>=0.11.2 17 | pyyaml 18 | ipdb 19 | numpy 20 | cython 21 | autopep8==1.6.0 22 | docformatter==1.4 23 | redbaron==0.9.2 24 | func-timeout 25 | torchdata==0.3.0 26 | protobuf==3.20.1 -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /apicoder/data/API-Doc/README.md: -------------------------------------------------------------------------------- 1 | This folder stores the crawled API documentation of the 65 public libraries listed below. 2 | ``` 3 | pandas,numpy,sklearn,torch,tensorflow,django,selenium,matplotlib,flask,scipy,seaborn,nltk,beautifulsoup,pygame,PIL,jieba,gensim,spacy,transformers,fairseq,sqlalchemy,scrapy,allennlp,datasets,tokenizers,mxnet,imageio,pytest,metpy,ansible,requests,datetime,zlib,random,math,sys,glob,os,urllib,time,re,json,unittest,collections,subprocess,copy,functools,itertools,six,threading,tempfile,io,pickle,pathlib,socket,struct,hashlib,traceback,csv,uuid,pprint 4 | ``` -------------------------------------------------------------------------------- /apicoder/APIRetriever/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='apiretriever', 5 | version='0.0.1', 6 | packages=find_packages("src"), 7 | package_dir={'': 'src'}, 8 | install_requires=open('requirements.txt').read().splitlines(), 9 | url='https://github.com/microsoft/PyCodeGPT', 10 | license='Apache 2.0', 11 | author='MSRA-DKI', 12 | author_email='daoguang@iscas.ac.cn', 13 | description='A toolkit for learning and running deep dense retrieval models.'
14 | ) 15 | -------------------------------------------------------------------------------- /apicoder/CodeGenAPI/run_evaluating_codes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASE_DIR="your/base/dir" 4 | 5 | TEMP=$1 6 | 7 | # The temperature placeholder in the path below is filled in by $TEMP. 8 | # Remember to change the eval data path, add torchdata to requirements.txt, and start from CERT/. 9 | POST_PATH="XXX/codeparrot-small/official_TorchData_machine_gpt2_apinum_5_temp_$TEMP.samples.jsonl" 10 | 11 | EVALUATION_FILE="$BASE_DIR/$POST_PATH" 12 | echo "Evaluation File Path: $EVALUATION_FILE" 13 | evaluate_functional_correctness $EVALUATION_FILE 14 | 15 | echo "All Done!" -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/run_search_3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LIBRARIES=( "pandas" "numpy" "monkey" "beatnum" "torchdata") 4 | 5 | for LIBRARY in ${LIBRARIES[@]}; do 6 | echo "Library: $LIBRARY" 7 | INPUT_DIR="../data/inference" 8 | DEPTH=100 9 | RUN="$INPUT_DIR/${LIBRARY}_id_score.txt" 10 | 11 | python -m dense.faiss_retriever \ 12 | --query_reps "$INPUT_DIR/${LIBRARY}_comment.pt" \ 13 | --passage_reps "$INPUT_DIR/${LIBRARY}_api.pt" \ 14 | --depth $DEPTH \ 15 | --batch_size -1 \ 16 | --save_text \ 17 | --save_ranking_to $RUN 18 | done 19 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/run_train_1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WANDB_PROJECT="Your Project Name" 4 | export WANDB_API_KEY="Your WANDB API Key" 5 | 6 | TRAIN_DIR="../data/train" 7 | OUTDIR="../outputs" 8 | MODEL_PATH="/your/path/of/bert-base-uncased" 9 | 10 | python -m dense.driver.train \ 11 | --output_dir $OUTDIR \ 12 | --model_name_or_path ${MODEL_PATH} \ 13 | --do_train \ 14 | --save_steps 200 \ 15 | --train_dir $TRAIN_DIR \ 16 | --fp16 \ 17 | --per_device_train_batch_size 5 \ 18 | --train_n_passages 8 \ 19 | --learning_rate 1e-5 \ 20 | --q_max_len 256 \ 21 | --p_max_len 256 \ 22 | --num_train_epochs 150 23 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/dense/utils/format/convert_result_to_trec.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | parser = ArgumentParser() 4 | parser.add_argument('--input', type=str, required=True) 5 | parser.add_argument('--output', type=str, required=True) 6 | args = parser.parse_args() 7 | 8 | with open(args.input) as f_in, open(args.output, 'w+') as f_out: 9 | cur_qid = None 10 | rank = 0 11 | for line in f_in: 12 | qid, docid, score = line.split() 13 | if cur_qid != qid: 14 | cur_qid = qid 15 | rank = 0 16 | rank += 1 17 | f_out.write(f'{qid} Q0 {docid} {rank} {score} dense\n') 18 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/build/lib/dense/utils/format/convert_result_to_trec.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | parser = ArgumentParser() 4 | parser.add_argument('--input', type=str, required=True) 5 | parser.add_argument('--output', type=str, required=True) 6 | args = parser.parse_args() 7 | 8 | with open(args.input) as f_in, open(args.output, 'w') as f_out: 9 |
cur_qid = None 10 | rank = 0 11 | for line in f_in: 12 | qid, docid, score = line.split() 13 | if cur_qid != qid: 14 | cur_qid = qid 15 | rank = 0 16 | rank += 1 17 | f_out.write(f'{qid} Q0 {docid} {rank} {score} dense\n') 18 | -------------------------------------------------------------------------------- /apicoder/private-eval/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pkg_resources 4 | from setuptools import setup, find_packages 5 | 6 | 7 | setup( 8 | name="private-eval", 9 | py_modules=["private-eval"], 10 | version="1.0", 11 | description="", 12 | author="OpenAI", 13 | packages=find_packages(), 14 | install_requires=[ 15 | str(r) 16 | for r in pkg_resources.parse_requirements( 17 | open(os.path.join(os.path.dirname(__file__), "requirements.txt")) 18 | ) 19 | ], 20 | entry_points={ 21 | "console_scripts": [ 22 | "evaluate_functional_correctness = private_eval.evaluate_functional_correctness", 23 | ] 24 | } 25 | ) 26 | -------------------------------------------------------------------------------- /cert/pandas-numpy-eval/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pkg_resources 4 | from setuptools import setup, find_packages 5 | 6 | 7 | setup( 8 | name="pandas-numpy-eval", 9 | py_modules=["pandas-numpy-eval"], 10 | version="1.0", 11 | description="", 12 | author="OpenAI", 13 | packages=find_packages(), 14 | install_requires=[ 15 | str(r) 16 | for r in pkg_resources.parse_requirements( 17 | open(os.path.join(os.path.dirname(__file__), "requirements.txt")) 18 | ) 19 | ], 20 | entry_points={ 21 | "console_scripts": [ 22 | "evaluate_functional_correctness = pandas_numpy_eval.evaluate_functional_correctness", 23 | ] 24 | } 25 | ) 26 | -------------------------------------------------------------------------------- /cert/run_evaluating_codes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # Licensed under the MIT license. 4 | 5 | BASE_DIR="Your base data directory" 6 | 7 | # ---------------------------------------------------------------------------------------------------- 8 | # The variable below should be changed according to the output file path of `run_eval_monitor.sh`. 9 | # ---------------------------------------------------------------------------------------------------- 10 | POST_PATH="CERT/pandas-numpy-eval/data/Example_Pandas_PYCODEGPT_samples.jsonl" 11 | EVALUATION_FILE="$BASE_DIR/$POST_PATH" 12 | echo "Evaluation File Path: $EVALUATION_FILE" 13 | 14 | evaluate_functional_correctness $EVALUATION_FILE 15 | 16 | echo "File: $EVALUATION_FILE" 17 | echo "All Done!"
18 | -------------------------------------------------------------------------------- /apicoder/private-eval/private_eval/evaluate_functional_correctness.py: -------------------------------------------------------------------------------- 1 | import fire 2 | import sys 3 | 4 | from private_eval.data import HUMAN_EVAL 5 | from private_eval.evaluation import evaluate_functional_correctness 6 | 7 | 8 | def entry_point( 9 | sample_file: str, 10 | k: str = "1,10,100", 11 | n_workers: int = 4, 12 | timeout: float = 3.0, 13 | problem_file: str = HUMAN_EVAL, 14 | ): 15 | """ 16 | Evaluates the functional correctness of generated samples, and writes 17 | results to f"{sample_file}_results.jsonl.gz" 18 | """ 19 | k = list(map(int, k.split(","))) 20 | results = evaluate_functional_correctness(sample_file, k, n_workers, timeout, problem_file) 21 | print(results) 22 | 23 | 24 | def main(): 25 | fire.Fire(entry_point) 26 | 27 | 28 | sys.exit(main()) 29 | -------------------------------------------------------------------------------- /cert/pandas-numpy-eval/pandas_numpy_eval/evaluate_functional_correctness.py: -------------------------------------------------------------------------------- 1 | import fire 2 | import sys 3 | 4 | from pandas_numpy_eval.data import HUMAN_EVAL 5 | from pandas_numpy_eval.evaluation import evaluate_functional_correctness 6 | 7 | 8 | def entry_point( 9 | sample_file: str, 10 | k: str = "1,10,100", 11 | n_workers: int = 4, 12 | timeout: float = 3.0, 13 | problem_file: str = HUMAN_EVAL, 14 | ): 15 | """ 16 | Evaluates the functional correctness of generated samples, and writes 17 | results to f"{sample_file}_results.jsonl.gz" 18 | """ 19 | k = list(map(int, k.split(","))) 20 | results = evaluate_functional_correctness(sample_file, k, n_workers, timeout, problem_file) 21 | print(results) 22 | 23 | 24 | def main(): 25 | fire.Fire(entry_point) 26 | 27 | 28 | sys.exit(main()) 29 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/run_encode_2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LIBRARIES=( "pandas" "numpy" "monkey" "beatnum" "torchdata") 4 | MODES=( "comment" "api") 5 | 6 | for MODE in ${MODES[@]}; do 7 | echo "Mode: $MODE" 8 | for LIBRARY in ${LIBRARIES[@]}; do 9 | echo "Library: $LIBRARY" 10 | OUTDIR="../data/inference" 11 | MODEL_DIR="../outputs/APIRetrieverCheckPoint/" 12 | CORPUS_DIR="../data/inference" 13 | ENCODE_DIR="../data/inference" 14 | PER_BATCH_SIZE=50 15 | 16 | CUDA_VISIBLE_DEVICES=0 python -m dense.driver.encode \ 17 | --output_dir=$OUTDIR \ 18 | --model_name_or_path $MODEL_DIR \ 19 | --fp16 \ 20 | --per_device_eval_batch_size ${PER_BATCH_SIZE} \ 21 | --local_rank -1 \ 22 | --encode_in_path "${CORPUS_DIR}/${LIBRARY}_${MODE}.json" \ 23 | --encoded_save_path "${ENCODE_DIR}/${LIBRARY}_${MODE}.pt" 24 | done 25 | done 26 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/dense/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .processor import TrainProcessor, TestProcessor, CorpusProcessor 2 | 3 | PROCESSOR_INFO = { 4 | 'Tevatron/wikipedia-nq': { 5 | 'train': TrainProcessor, 6 | 'dev': TrainProcessor, 7 | 'test': TestProcessor, 8 | 'corpus': CorpusProcessor, 9 | }, 10 | 'Tevatron/wikipedia-trivia': { 11 | 'train': TrainProcessor, 12 | 'dev': TrainProcessor, 13 | 'test': TestProcessor, 14 | 'corpus': CorpusProcessor, 15 | }, 16 |
'Tevatron/msmarco-passage': { 17 | 'train': TrainProcessor, 18 | 'dev': TestProcessor, 19 | 'corpus': CorpusProcessor, 20 | }, 21 | 'Tevatron/scifact': { 22 | 'train': TrainProcessor, 23 | 'dev': TestProcessor, 24 | 'test': TestProcessor, 25 | 'corpus': CorpusProcessor, 26 | }, 27 | } 28 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/build/lib/dense/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .processor import TrainProcessor, TestProcessor, CorpusProcessor 2 | 3 | PROCESSOR_INFO = { 4 | 'Tevatron/wikipedia-nq': { 5 | 'train': TrainProcessor, 6 | 'dev': TrainProcessor, 7 | 'test': TestProcessor, 8 | 'corpus': CorpusProcessor, 9 | }, 10 | 'Tevatron/wikipedia-trivia': { 11 | 'train': TrainProcessor, 12 | 'dev': TrainProcessor, 13 | 'test': TestProcessor, 14 | 'corpus': CorpusProcessor, 15 | }, 16 | 'Tevatron/msmarco-passage': { 17 | 'train': TrainProcessor, 18 | 'dev': TestProcessor, 19 | 'corpus': CorpusProcessor, 20 | }, 21 | 'Tevatron/scifact': { 22 | 'train': TrainProcessor, 23 | 'dev': TestProcessor, 24 | 'test': TestProcessor, 25 | 'corpus': CorpusProcessor, 26 | }, 27 | } 28 | -------------------------------------------------------------------------------- /apicoder/CodeGenAPI/scripts/run_details_apis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # for example: "pandas,numpy,sklearn,tensorflow,keras" 4 | # PS: there must be no space after each comma 5 | 6 | # Third party libraries 7 | LIBRARIES="${LIBRARIES},pandas,numpy,sklearn,torch,tensorflow,django,selenium,matplotlib,flask,scipy,seaborn,nltk,beautifulsoup,pygame,PIL,jieba,gensim,spacy,transformers,fairseq,sqlalchemy,scrapy,allennlp,datasets,tokenizers" 8 | LIBRARIES="${LIBRARIES},mxnet,imageio,pytest,metpy,ansible,requests" 9 | # Built-in libraries 10 | # LIBRARIES="${LIBRARIES},datetime,zlib,random,math,sys,glob,os,urllib,time,re,json,unittest,collections,subprocess,copy,functools,itertools,six,threading" 11 | # LIBRARIES="${LIBRARIES},tempfile,io,pickle,pathlib,socket,struct,hashlib,traceback,csv,uuid,pprint" 12 | 13 | OUTPUT_DIR="data/API-Doc" 14 | PROCESS_NUM=16 15 | 16 | Run_Args="-o ${OUTPUT_DIR}" 17 | Run_Args="${Run_Args} -l ${LIBRARIES}" 18 | Run_Args="${Run_Args} -pn ${PROCESS_NUM}" 19 | 20 | echo "Run_Args: ${Run_Args}" 21 | 22 | python run_extract_details_from_apis.py ${Run_Args} 23 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/dense/faiss_retriever/retriever.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import faiss 3 | 4 | import logging 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class BaseFaissIPRetriever: 9 | def __init__(self, init_reps: np.ndarray): 10 | index = faiss.IndexFlatIP(init_reps.shape[1]) 11 | self.index = index 12 | 13 | def search(self, q_reps: np.ndarray, k: int): 14 | return self.index.search(q_reps, k) 15 | 16 | def add(self, p_reps: np.ndarray): 17 | self.index.add(p_reps) 18 | 19 | def batch_search(self, q_reps: np.ndarray, k: int, batch_size: int): 20 | num_query = q_reps.shape[0] 21 | all_scores = [] 22 | all_indices = [] 23 | for start_idx in range(0, num_query, batch_size): 24 | nn_scores, nn_indices = self.search(q_reps[start_idx: start_idx + batch_size], k) 25 | all_scores.append(nn_scores) 26 | all_indices.append(nn_indices) 27 | all_scores =
np.concatenate(all_scores, axis=0) 28 | all_indices = np.concatenate(all_indices, axis=0) 29 | 30 | return all_scores, all_indices -------------------------------------------------------------------------------- /apicoder/APIRetriever/build/lib/dense/faiss_retriever/retriever.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import faiss 3 | 4 | import logging 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class BaseFaissIPRetriever: 9 | def __init__(self, init_reps: np.ndarray): 10 | index = faiss.IndexFlatIP(init_reps.shape[1]) 11 | self.index = index 12 | 13 | def search(self, q_reps: np.ndarray, k: int): 14 | return self.index.search(q_reps, k) 15 | 16 | def add(self, p_reps: np.ndarray): 17 | self.index.add(p_reps) 18 | 19 | def batch_search(self, q_reps: np.ndarray, k: int, batch_size: int): 20 | num_query = q_reps.shape[0] 21 | all_scores = [] 22 | all_indices = [] 23 | for start_idx in range(0, num_query, batch_size): 24 | nn_scores, nn_indices = self.search(q_reps[start_idx: start_idx + batch_size], k) 25 | all_scores.append(nn_scores) 26 | all_indices.append(nn_indices) 27 | all_scores = np.concatenate(all_scores, axis=0) 28 | all_indices = np.concatenate(all_indices, axis=0) 29 | 30 | return all_scores, all_indices -------------------------------------------------------------------------------- /apicoder/private-eval/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) OpenAI (https://openai.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /cert/pandas-numpy-eval/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) OpenAI (https://openai.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /cert/nl2code/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | import os 4 | from transformers import AutoTokenizer 5 | from .code_dataset import CodeBlockDataset, CodeDatasetCallBack 6 | 7 | huggingface_model_mappings = { 8 | 'gpt-neo-125M'.lower() : 'EleutherAI/gpt-neo-125M', 9 | 'gpt-neo-1.3B'.lower() : 'EleutherAI/gpt-neo-1.3B' 10 | } 11 | 12 | _Proj_Abs_Dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 13 | _Data_Abs_Dir = os.path.join(_Proj_Abs_Dir, 'data') 14 | 15 | def load_pretrained_tokenizer(name_or_path: str): 16 | name_or_path = resolve_model_name_or_path(name_or_path) 17 | return AutoTokenizer.from_pretrained(name_or_path) 18 | 19 | def resolve_model_name_or_path(name_or_path: str): 20 | if name_or_path.lower() in huggingface_model_mappings: 21 | name_or_path = huggingface_model_mappings[name_or_path.lower()] 22 | 23 | data_dir = _Data_Abs_Dir if 'AMLT_DATA_DIR' not in os.environ else os.environ['AMLT_DATA_DIR'] 24 | model_local_path = os.path.join(data_dir, 'pretrained_models', name_or_path) 25 | if os.path.exists(model_local_path): 26 | name_or_path = model_local_path 27 | 28 | # Return the local path if it exists; otherwise fall back to the original hub name. 29 | return name_or_path 30 | -------------------------------------------------------------------------------- /apicoder/CodeGenAPI/nl2code/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from transformers import AutoTokenizer 3 | from .code_dataset import CodeBlockDataset, CodeDatasetCallBack 4 | from .code_dataset_codegen import CodeBlockDatasetCodeGen, CodeDatasetCallBackCodeGen 5 | 6 | huggingface_model_mappings = { 7 | 'gpt-neo-125M'.lower() : 'EleutherAI/gpt-neo-125M', 8 | 'gpt-neo-1.3B'.lower() : 'EleutherAI/gpt-neo-1.3B' 9 | } 10 | 11 | _Proj_Abs_Dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 12 | _Data_Abs_Dir = os.path.join(_Proj_Abs_Dir, 'data') 13 | 14 | def load_pretrained_tokenizer(name_or_path: str): 15 | name_or_path = resolve_model_name_or_path(name_or_path) 16 | return AutoTokenizer.from_pretrained(name_or_path) 17 | 18 | def resolve_model_name_or_path(name_or_path: str): 19 | if name_or_path.lower() in huggingface_model_mappings: 20 | name_or_path = huggingface_model_mappings[name_or_path.lower()] 21 | 22 | data_dir = _Data_Abs_Dir if 'AMLT_DATA_DIR' not in os.environ else os.environ['AMLT_DATA_DIR'] 23 | model_local_path = os.path.join(data_dir, 'pretrained_models', name_or_path) 24 | if os.path.exists(model_local_path): 25 | name_or_path = model_local_path 26 | 27 | # Return the local path if it exists; otherwise fall back to the original hub name. 28 | return name_or_path 29 | -------------------------------------------------------------------------------- /apicoder/CodeGenAPI/scripts/run_extract_apis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export DJANGO_SETTINGS_MODULE=bay.settings 4 | 5 | # for example: "pandas,numpy,sklearn,tensorflow,keras" 6 | # PS: there must be no space after each comma 7 | # Third party libraries 8 | LIBRARIES="${LIBRARIES},pandas,numpy,sklearn,torch,tensorflow,django,selenium,matplotlib,flask,scipy,seaborn,nltk,beautifulsoup,pygame,PIL,jieba,gensim,spacy,transformers,fairseq,sqlalchemy,scrapy,allennlp,datasets,tokenizers,torchdata" 9 | LIBRARIES="${LIBRARIES},mxnet,imageio,pytest,metpy,ansible,requests" 10 | # Built-in libraries 11 | # LIBRARIES="${LIBRARIES},datetime,zlib,random,math,sys,glob,os,urllib,time,re,json,unittest,collections,subprocess,copy,functools,itertools,six,threading" 12 | # LIBRARIES="${LIBRARIES},tempfile,io,pickle,pathlib,socket,struct,hashlib,traceback,csv,uuid,pprint" 13 | 14 | ID=$(date +"%m%d") 15 |
OUTPUT_DIR="data/API-Doc" 16 | PROCESS_NUM=16 17 | OVER_WRITE="True" # [True, False] 18 | GET_SIG="True" # [True, False] 19 | 20 | Run_Args="-o ${OUTPUT_DIR}" 21 | Run_Args="${Run_Args} -ls ${LIBRARIES}" 22 | Run_Args="${Run_Args} -pn ${PROCESS_NUM}" 23 | Run_Args="${Run_Args} -ow ${OVER_WRITE}" 24 | Run_Args="${Run_Args} -gs ${GET_SIG}" 25 | 26 | echo "Run_Args: ${Run_Args}" 27 | 28 | python -u extract_api.py ${Run_Args} 29 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/scripts/run_extract_apiretriever_corpus.sh: -------------------------------------------------------------------------------- 1 | # !/bash/bin 2 | 3 | DOMAIN="PrivateLibrary" 4 | # [train, valid] 5 | SPLIT="train" 6 | CONTAIN_BUILD_IN="False" 7 | # [True, False] 8 | IS_DEBUG="False" 9 | 10 | DATA_DIR="PrivateLibrary/data/Cleaned-Private-Code-Files" 11 | PRIVATE_DATA_DIR="PrivateLibrary/data/API-Doc" 12 | 13 | PRIVATE_LIBS="pandas,numpy,sklearn,torch,tensorflow,django,selenium,matplotlib,flask,scipy,seaborn,nltk,beautifulsoup,pygame,PIL,jieba,gensim,spacy,transformers,fairseq,sqlalchemy,scrapy,allennlp,datasets,tokenizers,mxnet,imageio,pytest,metpy,ansible,requests" 14 | BUILD_IN_LIBS="datetime,zlib,random,math,sys,glob,os,urllib,time,re,json,unittest,collections,subprocess,copy,functools,itertools,six,threading,tempfile,io,pickle,pathlib,socket,struct,hashlib,traceback,csv,uuid,pprint" 15 | MODEL_DIR="/your/codegen/checkpoints/codegen-350M-mono" 16 | OUTPUT_DIR="PrivateLibrary/APIRetriever/data/train/unprocessed-train-data" 17 | 18 | if [ $IS_DEBUG == "True" ]; then 19 | N_CPUS="1" 20 | else 21 | N_CPUS="8" 22 | fi 23 | 24 | if [ ! -z "$1" ]; then 25 | N_CPUS="$1" 26 | fi 27 | 28 | Args="-i $DATA_DIR --private_data_path ${PRIVATE_DATA_DIR} -o $OUTPUT_DIR -model $MODEL_DIR -t $N_CPUS -d $DOMAIN --private_libs ${PRIVATE_LIBS} --build_in_libs ${BUILD_IN_LIBS} -isdebug $IS_DEBUG --contain_build_in $CONTAIN_BUILD_IN" 29 | echo "Run encode_private for ${SPLIT} data: $Args" 30 | 31 | python extract_retrieval_api_corpus.py $Args -split ${SPLIT} 32 | 33 | echo "Done!" -------------------------------------------------------------------------------- /cert/scripts/run_encode_domain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # Licensed under the MIT license. 4 | 5 | ID=$(date +"%m%d") 6 | BASE_DATA_DIR="Your base data directory" 7 | 8 | # [Pandas, Numpy] 9 | DOMAIN="Pandas" 10 | # [normal, sketcher, generator] 11 | TYPE="generator" 12 | # [train, valid] 13 | SPLIT="valid" 14 | # [True, False] 15 | IS_DEBUG="False" 16 | 17 | # -------------------------------------------------------------------------- 18 | # You should replace the following variables according to your own settings. 19 | # -------------------------------------------------------------------------- 20 | DATA_DIR="${BASE_DATA_DIR}/datasets/CERT/${DOMAIN}/data" 21 | MODEL_DIR="${BASE_DATA_DIR}/models/pycodegpt-110M" 22 | OUTPUT_DIR="${BASE_DATA_DIR}/datasets/CERT/${DOMAIN}/${TYPE}_bin" 23 | 24 | if [ ! -z "$AMLT_DATA_DIR" ]; then 25 | echo "Run experiment on AMLT." 26 | BASE_DATA_DIR=$AMLT_DATA_DIR 27 | DATA_DIR="${BASE_DATA_DIR}/CERT/${DOMAIN}/data" 28 | MODEL_DIR="${BASE_DATA_DIR}/CERT/pycodegpt-110M" 29 | OUTPUT_DIR="${BASE_DATA_DIR}/CERT/${DOMAIN}/${TYPE}_bin" 30 | fi 31 | 32 | if [ $IS_DEBUG == "True" ]; then 33 | N_CPUS="1" 34 | else 35 | N_CPUS="20" 36 | fi 37 | 38 | 39 | if [ ! 
-z "$1" ]; then 40 | N_CPUS="$1" 41 | fi 42 | 43 | if [ ! -z "$2" ]; then 44 | echo "Using distributed nodes: $2" 45 | export DistributedNodes=$2 46 | fi 47 | 48 | if [ ! -z "$AMLT_DATA_DIR" ]; then 49 | echo "Run experiment on AMLT." 50 | fi 51 | 52 | Args="-i $DATA_DIR -o $OUTPUT_DIR -model $MODEL_DIR -t $N_CPUS -d $DOMAIN -type $TYPE -isdebug $IS_DEBUG" 53 | echo "Run encode_domain for ${SPLIT} data: $Args" 54 | python encode_domain.py $Args -split ${SPLIT} -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/dense/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch.nn import functional as F 4 | from torch import distributed as dist 5 | 6 | 7 | class SimpleContrastiveLoss: 8 | def __init__(self, n_target: int = 1): 9 | self.target_per_qry = n_target 10 | 11 | def __call__(self, x: Tensor, y: Tensor, target: Tensor = None, reduction: str = 'mean'): 12 | if target is None: 13 | assert x.size(0) * self.target_per_qry == y.size(0) 14 | target = torch.arange( 15 | 0, x.size(0) * self.target_per_qry, self.target_per_qry, device=x.device, dtype=torch.long) 16 | logits = torch.matmul(x, y.transpose(0, 1)) 17 | return F.cross_entropy(logits, target, reduction=reduction) 18 | 19 | 20 | class DistributedContrastiveLoss(SimpleContrastiveLoss): 21 | def __init__(self, n_target: int = 0, scale_loss: bool = True): 22 | assert dist.is_initialized(), "Distributed training has not been properly initialized." 23 | super().__init__(n_target=n_target) 24 | self.word_size = dist.get_world_size() 25 | self.rank = dist.get_rank() 26 | self.scale_loss = scale_loss 27 | 28 | def __call__(self, x: Tensor, y: Tensor, **kwargs): 29 | dist_x = self.gather_tensor(x) 30 | dist_y = self.gather_tensor(y) 31 | loss = super().__call__(dist_x, dist_y, **kwargs) 32 | if self.scale_loss: 33 | loss = loss * self.word_size 34 | return loss 35 | 36 | def gather_tensor(self, t): 37 | gathered = [torch.empty_like(t) for _ in range(self.word_size)] 38 | dist.all_gather(gathered, t) 39 | gathered[self.rank] = t 40 | return torch.cat(gathered, dim=0) -------------------------------------------------------------------------------- /apicoder/APIRetriever/build/lib/dense/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch.nn import functional as F 4 | from torch import distributed as dist 5 | 6 | 7 | class SimpleContrastiveLoss: 8 | def __init__(self, n_target: int = 1): 9 | self.target_per_qry = n_target 10 | 11 | def __call__(self, x: Tensor, y: Tensor, target: Tensor = None, reduction: str = 'mean'): 12 | if target is None: 13 | assert x.size(0) * self.target_per_qry == y.size(0) 14 | target = torch.arange( 15 | 0, x.size(0) * self.target_per_qry, self.target_per_qry, device=x.device, dtype=torch.long) 16 | logits = torch.matmul(x, y.transpose(0, 1)) 17 | return F.cross_entropy(logits, target, reduction=reduction) 18 | 19 | 20 | class DistributedContrastiveLoss(SimpleContrastiveLoss): 21 | def __init__(self, n_target: int = 0, scale_loss: bool = True): 22 | assert dist.is_initialized(), "Distributed training has not been properly initialized." 
23 | super().__init__(n_target=n_target) 24 | self.world_size = dist.get_world_size() 25 | self.rank = dist.get_rank() 26 | self.scale_loss = scale_loss 27 | 28 | def __call__(self, x: Tensor, y: Tensor, **kwargs): 29 | dist_x = self.gather_tensor(x) 30 | dist_y = self.gather_tensor(y) 31 | loss = super().__call__(dist_x, dist_y, **kwargs) 32 | if self.scale_loss: 33 | loss = loss * self.world_size # offset DDP's 1/world_size gradient averaging 34 | return loss 35 | 36 | def gather_tensor(self, t): 37 | gathered = [torch.empty_like(t) for _ in range(self.world_size)] 38 | dist.all_gather(gathered, t) 39 | gathered[self.rank] = t # re-insert the local tensor so gradients flow through this rank's shard 40 | return torch.cat(gathered, dim=0) -------------------------------------------------------------------------------- /apicoder/CodeGenAPI/scripts/run_encode_private_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DOMAIN="PrivateLibrary" 4 | # [train, valid] 5 | SPLIT="train" 6 | CONTAIN_BUILD_IN="False" 7 | # [True, False] 8 | IS_DEBUG="False" 9 | # v1: normal 10 | # v2: # [start] ... 11 | # v3: # Please use these APIs ... 12 | STYLE="v2" # v2, v3, ... vn 13 | PERTURBATION_PROBABILITY=0.05 # [0.1 ~ 1.0] 14 | 15 | DATA_DIR="data/Cleaned-Private-Code-Files" 16 | PRIVATE_DATA_DIR="data/API-Doc" 17 | 18 | if [ $IS_DEBUG == "True" ]; then 19 | PRIVATE_LIBS="pandas,numpy,django" 20 | BUILD_IN_LIBS="datetime" 21 | else 22 | PRIVATE_LIBS="pandas,numpy,sklearn,torch,tensorflow,django,selenium,matplotlib,flask,scipy,seaborn,nltk,beautifulsoup,pygame,PIL,jieba,gensim,spacy,transformers,fairseq,sqlalchemy,scrapy,allennlp,datasets,tokenizers,mxnet,imageio,pytest,metpy,ansible,requests" 23 | BUILD_IN_LIBS="datetime,zlib,random,math,sys,glob,os,urllib,time,re,json,unittest,collections,subprocess,copy,functools,itertools,six,threading,tempfile,io,pickle,pathlib,socket,struct,hashlib,traceback,csv,uuid,pprint" 24 | fi 25 | 26 | MODEL_DIR="Your/models/codegen/checkpoints/codegen-350M-mono" 27 | OUTPUT_DIR="data/EncodedCorpus4CodeGenAPI" 28 | 29 | if [ $IS_DEBUG == "True" ]; then 30 | N_CPUS="1" 31 | else 32 | N_CPUS="8" 33 | fi 34 | 35 | if [ ! -z "$1" ]; then 36 | N_CPUS="$1" 37 | fi 38 | 39 | Args="-i $DATA_DIR --private_data_path ${PRIVATE_DATA_DIR} -o $OUTPUT_DIR -model $MODEL_DIR -t $N_CPUS -d $DOMAIN --private_libs ${PRIVATE_LIBS} --build_in_libs ${BUILD_IN_LIBS} -isdebug $IS_DEBUG --contain_build_in $CONTAIN_BUILD_IN -pp $PERTURBATION_PROBABILITY --style $STYLE" 40 | echo "Run encode_private for ${SPLIT} data: $Args" 41 | 42 | python encode_private_data.py $Args -split ${SPLIT} 43 | echo "Done!" 44 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/dense/faiss_retriever/reducer.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import torch 3 | import faiss 4 | from argparse import ArgumentParser 5 | from tqdm import tqdm 6 | from typing import List, Iterable, Tuple 7 | from numpy import ndarray 8 | 9 | 10 | def combine_faiss_results(results: Iterable[Tuple[ndarray, ndarray]]): 11 | rh = None 12 | for scores, indices in results: 13 | if rh is None: 14 | print(f'Initializing Heap.
Assuming {scores.shape[0]} queries.') 15 | rh = faiss.ResultHeap(scores.shape[0], scores.shape[1]) 16 | rh.add_result(-scores, indices) 17 | rh.finalize() 18 | corpus_scores, corpus_indices = -rh.D, rh.I 19 | 20 | return corpus_scores, corpus_indices 21 | 22 | 23 | def write_ranking(corpus_indices, corpus_scores, q_lookup, ranking_save_file): 24 | with open(ranking_save_file, 'w') as f: 25 | for qid, q_doc_scores, q_doc_indices in zip(q_lookup, corpus_scores, corpus_indices): 26 | score_list = [(s, idx) for s, idx in zip(q_doc_scores, q_doc_indices)] 27 | score_list = sorted(score_list, key=lambda x: x[0], reverse=True) 28 | for s, idx in score_list: 29 | f.write(f'{qid}\t{idx}\t{s}\n') 30 | 31 | 32 | def main(): 33 | parser = ArgumentParser() 34 | parser.add_argument('--score_dir', required=True) 35 | parser.add_argument('--query', required=True) 36 | parser.add_argument('--save_ranking_to', required=True) 37 | args = parser.parse_args() 38 | 39 | partitions = glob.glob(f'{args.score_dir}/*') 40 | 41 | corpus_scores, corpus_indices = combine_faiss_results(map(torch.load, tqdm(partitions))) 42 | 43 | _, q_lookup = torch.load(args.query) 44 | write_ranking(corpus_indices, corpus_scores, q_lookup, args.save_ranking_to) 45 | 46 | 47 | if __name__ == '__main__': 48 | main() -------------------------------------------------------------------------------- /apicoder/APIRetriever/build/lib/dense/faiss_retriever/reducer.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import torch 3 | import faiss 4 | from argparse import ArgumentParser 5 | from tqdm import tqdm 6 | from typing import List, Iterable, Tuple 7 | from numpy import ndarray 8 | 9 | 10 | def combine_faiss_results(results: Iterable[Tuple[ndarray, ndarray]]): 11 | rh = None 12 | for scores, indices in results: 13 | if rh is None: 14 | print(f'Initializing Heap. 
Assuming {scores.shape[0]} queries.') 15 | rh = faiss.ResultHeap(scores.shape[0], scores.shape[1]) 16 | rh.add_result(-scores, indices) 17 | rh.finalize() 18 | corpus_scores, corpus_indices = -rh.D, rh.I 19 | 20 | return corpus_scores, corpus_indices 21 | 22 | 23 | def write_ranking(corpus_indices, corpus_scores, q_lookup, ranking_save_file): 24 | with open(ranking_save_file, 'w') as f: 25 | for qid, q_doc_scores, q_doc_indices in zip(q_lookup, corpus_scores, corpus_indices): 26 | score_list = [(s, idx) for s, idx in zip(q_doc_scores, q_doc_indices)] 27 | score_list = sorted(score_list, key=lambda x: x[0], reverse=True) 28 | for s, idx in score_list: 29 | f.write(f'{qid}\t{idx}\t{s}\n') 30 | 31 | 32 | def main(): 33 | parser = ArgumentParser() 34 | parser.add_argument('--score_dir', required=True) 35 | parser.add_argument('--query', required=True) 36 | parser.add_argument('--save_ranking_to', required=True) 37 | args = parser.parse_args() 38 | 39 | partitions = glob.glob(f'{args.score_dir}/*') 40 | 41 | corpus_scores, corpus_indices = combine_faiss_results(map(torch.load, tqdm(partitions))) 42 | 43 | _, q_lookup = torch.load(args.query) 44 | write_ranking(corpus_indices, corpus_scores, q_lookup, args.save_ranking_to) 45 | 46 | 47 | if __name__ == '__main__': 48 | main() -------------------------------------------------------------------------------- /apicoder/CodeGenAPI/scripts/get_comments_from_evallibs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # 4 | # @Author: Daoguang Zan, @Mentor: Bei Chen, Jian-Guang Lou 5 | # @Copyright 2022 The Microsoft Research Asia (DKI Group). All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | import json 19 | import os 20 | import sys 21 | import re 22 | 23 | from get_libs_info_from_code import ( 24 | normalizer_api_desp, 25 | get_first_sentence_from_api_desp, 26 | extract_main_comment_from_code 27 | ) 28 | 29 | def judge_is_what_type_annotation(code: str) -> str: 30 | type = ["pound", "inverted commas"] # pound: #, inverted commas: """ 31 | if "#" in code: 32 | return type[0] 33 | else: 34 | return type[1] 35 | 36 | def get_comments_from_code(code: str) -> str: 37 | """ 38 | Get comments from code. 39 | --- 40 | Args: 41 | Code: raw code from PandasEval, NumpyEval, etc. 42 | Returns: 43 | Comments: comments from code. 
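Illustrative example (added; assumes normalizer_api_desp only normalizes whitespace and casing): for pound-style input such as "# create a dataframe\n# drop the NaN rows\ndf = ...", the pound branch strips the markers and joins the comment lines, yielding roughly "create a dataframe drop the NaN rows".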
44 | """ 45 | comment_type = judge_is_what_type_annotation(code) 46 | if comment_type == "pound": 47 | code_splited = code.split("\n") 48 | code_comment_str = "" 49 | for line in code_splited: 50 | if "#" in line: 51 | code_comment_str += " " + line.replace("#", "").strip() if code_comment_str != "" else line.replace("#", "").strip() 52 | return normalizer_api_desp(code_comment_str) 53 | else: 54 | return normalizer_api_desp(extract_main_comment_from_code(code)).replace("\"\"\"", '').replace("\'\'\'", '').strip() 55 | 56 | 57 | if __name__ == '__main__': 58 | pass -------------------------------------------------------------------------------- /apicoder/private-eval/data/numpy_keywords.jsonl: -------------------------------------------------------------------------------- 1 | { 2 | "to_numpy": "to_beatnum", 3 | "ndarray": "ndnumset", 4 | "array": "numset", 5 | "numpy": "beatnum", 6 | "transpose": "switching_places", 7 | "Numpy": "Beatnum", 8 | "np": "bn", 9 | "column_stack": "stack_col", 10 | "concatenate": "connect", 11 | "slice": "piece", 12 | "sum": "total_count", 13 | "imag": "imaginary", 14 | "abs": "absolute", 15 | "real": "reality", 16 | "fill_diagonal": "pad_diagonal", 17 | "all": "total", 18 | "fromstring": "come_from_str", 19 | "in1d": "intersection1dim", 20 | "mean": "average", 21 | "where": "filter_condition", 22 | "std": "standard_op", 23 | "reshape": "change_shape_to", 24 | "fromarrays": "come_from_arrays", 25 | "stack": "pile_operation", 26 | "histogram": "hist_operation", 27 | "cumsum": "cumulative_sum", 28 | "setxor1d": "seting_exclusive_or_one_dim", 29 | "add": "add_concat", 30 | "filled": "masked_fill", 31 | "compressed": "remove_masked_data", 32 | "astype": "convert_type", 33 | "argmin": "get_argmin_value", 34 | "arange": "arr_range", 35 | "argmax": "get_argmax", 36 | "vstack": "vertical_stack", 37 | "hstack": "horizontal_stack", 38 | "squeeze": "sqz", 39 | "asarray": "asnumset", 40 | "repeat": "duplicate", 41 | "unravel_index": "convert_index_or_arr", 42 | "vectorize": "vectorisation", 43 | "split": "sep_split", 44 | "diff": "difference", 45 | "logical_and": "logic_and_element_wise", 46 | "flatten": "convert_into_one_dim", 47 | "unique": "uniq", 48 | "norm": "normlizattion", 49 | "delete": "remove_operation", 50 | "ones": "create_ones", 51 | "bincount": "binoccurrence", 52 | "append": "apd", 53 | "any": "any_condition", 54 | "isnan": "ifnan", 55 | "argpartition": "perform_partition", 56 | "ravel": "asview", 57 | "array_split": "split_array", 58 | "inv": "inverse", 59 | "insert": "stick", 60 | "searchsorted": "find_sorted", 61 | "min": "get_min", 62 | "max": "get_max", 63 | "full": "full_value_func" 64 | } -------------------------------------------------------------------------------- /apicoder/private-eval/data/pandas_keywords.jsonl: -------------------------------------------------------------------------------- 1 | { 2 | "isnull": "ifnull", 3 | "mean": "average", 4 | "pandas": "monkey", 5 | "dataframe": "knowledgeframe", 6 | "df": "kf", 7 | "isin": "incontain", 8 | "pd": "mk", 9 | "DataFrame": "KnowledgeFrame", 10 | "rename": "renaming", 11 | "drop": "sip", 12 | "Pandas": "Monkey", 13 | "tolist": "convert_list", 14 | "apply": "employ", 15 | "to_numeric": "to_num", 16 | "dropna": "sipna", 17 | "append": "adding", 18 | "tail": "last_tail", 19 | "copy": "clone", 20 | "groupby": "grouper", 21 | "sum": "total_sum", 22 | "Series": "Collections", 23 | "series": "collections", 24 | "innull": "isnone", 25 | "astype": "totype", 26 | "select_dtypes": "choose_dtypes", 27 | 
"iterrows": "traversal", 28 | "min": "get_min", 29 | "max": "get_max", 30 | "map": "mapping", 31 | "nlargest": "nbiggest", 32 | "unique": "distinctive", 33 | "ravel": "flat_underlying", 34 | "sort_values": "sort_the_values", 35 | "last": "final_item", 36 | "shift": "shifting", 37 | "merge": "unioner", 38 | "value_counts": "counts_value_num", 39 | "rename_axis": "renaming_axis", 40 | "reset_index": "reseting_index", 41 | "sample": "sample_by_num", 42 | "replace": "replacing", 43 | "to_datetime": "convert_datetime", 44 | "any": "whatever", 45 | "reindex": "reindexing", 46 | "concat": "concating", 47 | "to_dict": "convert_dict", 48 | "cumsum": "cumulative_sum", 49 | "sort_index": "sorting_index", 50 | "to_string": "convert_string", 51 | "drop_duplicates": "remove_duplicates", 52 | "duplicated": "duplicated_values", 53 | "len": "length", 54 | "isna": "ifna", 55 | "fillna": "fillnone", 56 | "get": "getting", 57 | "round": "value_round", 58 | "format": "formating", 59 | "to_pydatetime": "convert_pydatetime", 60 | "div": "division", 61 | "ceil": "ceiling", 62 | "assign": "allocate", 63 | "intersection": "interst", 64 | "head": "header_num", 65 | "applymap": "conduct_map", 66 | "all": "total_all", 67 | "std": "standard" 68 | } -------------------------------------------------------------------------------- /apicoder/CodeGenAPI/run_generating_codes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # [True, False] 4 | HUMAN_IN_THE_LOOP="False" 5 | # ["_no", "_make_sense"] 6 | MAKE_SENSE="_no" 7 | # [machine, top3_perfect, top4_perfect, top5_perfect, human_labelled] 8 | USER_NAME="machine" 9 | # [0, 1, 2, 3, 5, "n"] 10 | API_NUMBER=0 11 | # [Pandas, Numpy, Monkey, BeatNum, TorchData] 12 | DOMAIN="TorchData" 13 | # [CodeGen, API_Coder] [codet5, CodeGPT, CodeClippy, CodeParrot] 14 | MODEL_VERSION="CodeGen" 15 | TEMP=$1 16 | 17 | BASE_DIR="your/base/dir" 18 | 19 | if [ ${MODEL_VERSION} == "CodeGen" ]; then 20 | NUM_SAMPLES="1" 21 | MAX_TOKNES="100" 22 | TOP_P="0.9" 23 | CKPT_NAME="${BASE_DIR}/codegen-350M-mono" 24 | Run_Args="-model $CKPT_NAME -t $TEMP -p $TOP_P -l $MAX_TOKNES -n $NUM_SAMPLES -d $DOMAIN -mv $MODEL_VERSION --api_number $API_NUMBER --human_in_the_loop $HUMAN_IN_THE_LOOP --user_name $USER_NAME --make_sense $MAKE_SENSE" 25 | echo "Run Args: $Run_Args" 26 | python eval_private.py ${Run_Args} 27 | elif [ ${MODEL_VERSION} == "API_Coder" ]; then 28 | NUM_SAMPLES="200" 29 | MAX_TOKNES="100" 30 | TOP_P="0.9" 31 | CKPT_NAME="${BASE_DIR}/CodeGenAPI-350M-mono" 32 | Run_Args="-model $CKPT_NAME -t $TEMP -p $TOP_P -l $MAX_TOKNES -n $NUM_SAMPLES -d $DOMAIN -mv $MODEL_VERSION --api_number $API_NUMBER --human_in_the_loop $HUMAN_IN_THE_LOOP --user_name $USER_NAME --make_sense $MAKE_SENSE" 33 | echo "Run Args: $Run_Args" 34 | python eval_private.py ${Run_Args} 35 | elif [ ${MODEL_VERSION} == "codet5" ]; then 36 | python eval_baseline.py -m "$BASE_DIR/codet5-base" -temp $TEMP -type codet5 -lib $DOMAIN --api_number $API_NUMBER --user_name $USER_NAME 37 | elif [ ${MODEL_VERSION} == "CodeGPT" ]; then 38 | python eval_baseline.py -m "$BASE_DIR/CodeGPT-small-py-adaptedGPT2" -temp $TEMP -type gpt2 -lib $DOMAIN --api_number $API_NUMBER --user_name $USER_NAME 39 | elif [ ${MODEL_VERSION} == "CodeClippy" ]; then 40 | python eval_baseline.py -m "$BASE_DIR/gpt-neo-125M-code-clippy" -temp $TEMP -type gpt-neo -lib $DOMAIN --api_number $API_NUMBER --user_name $USER_NAME 41 | elif [ ${MODEL_VERSION} == "CodeParrot" ]; then 42 | python eval_baseline.py -m 
"$BASE_DIR/codeparrot-small" -temp $TEMP -type gpt2 -lib $DOMAIN --api_number $API_NUMBER --user_name $USER_NAME 43 | fi 44 | 45 | echo "All Done!" -------------------------------------------------------------------------------- /cert/pandas-numpy-eval/pandas_numpy_eval/data.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Dict 2 | import gzip 3 | import json 4 | import os 5 | 6 | 7 | ROOT = os.path.dirname(os.path.abspath(__file__)) 8 | 9 | # -------------------------------------------------------- 10 | # You can choose from the two options "pandas" or "numpy". 11 | # -------------------------------------------------------- 12 | LIB = "pandas" 13 | assert LIB == "pandas" or LIB == "numpy" 14 | HUMAN_EVAL = os.path.join(ROOT, "..", "data", "PandasEval.jsonl.gz") if LIB == "pandas" else os.path.join(ROOT, "..", "data", "NumpyEval.jsonl.gz") 15 | 16 | print("***"*20) 17 | print("load eval from {}".format(HUMAN_EVAL.split('/')[-1].replace(".jsonl.gz", ""))) 18 | print("***"*20) 19 | 20 | def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]: 21 | """ 22 | Reads the problems from the evaluation set 23 | """ 24 | return {task["task_id"]: task for task in stream_jsonl(evalset_file)} 25 | 26 | 27 | def stream_jsonl(filename: str) -> Iterable[Dict]: 28 | """ 29 | Parses each jsonl line and yields it as a dictionary 30 | """ 31 | if filename.endswith(".gz"): 32 | with open(filename, "rb") as gzfp: 33 | with gzip.open(gzfp, 'rt') as fp: 34 | for line in fp: 35 | if any(not x.isspace() for x in line): 36 | yield json.loads(line) 37 | else: 38 | with open(filename, "r") as fp: 39 | for line in fp: 40 | if any(not x.isspace() for x in line): 41 | yield json.loads(line) 42 | 43 | 44 | def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False): 45 | """ 46 | Writes an iterable of dictionaries to jsonl 47 | """ 48 | if append: 49 | mode = 'ab' 50 | else: 51 | mode = 'wb' 52 | filename = os.path.expanduser(filename) 53 | if filename.endswith(".gz"): 54 | with open(filename, mode) as fp: 55 | with gzip.GzipFile(fileobj=fp, mode='wb') as gzfp: 56 | for x in data: 57 | gzfp.write((json.dumps(x) + "\n").encode('utf-8')) 58 | else: 59 | with open(filename, mode) as fp: 60 | for x in data: 61 | fp.write((json.dumps(x) + "\n").encode('utf-8')) 62 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ main ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ main ] 20 | schedule: 21 | - cron: '23 6 * * 0' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v2 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v1 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 52 | 53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 54 | # If this step fails, then you should remove it and run the build manually (see below) 55 | - name: Autobuild 56 | uses: github/codeql-action/autobuild@v1 57 | 58 | # ℹ️ Command-line programs to run using the OS shell. 59 | # 📚 https://git.io/JvXDl 60 | 61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 62 | # and modify them (or add more) to build your code if your project 63 | # uses a compiled language 64 | 65 | #- run: | 66 | # make bootstrap 67 | # make release 68 | 69 | - name: Perform CodeQL Analysis 70 | uses: github/codeql-action/analyze@v1 71 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/dense/faiss_retriever/__main__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import glob 4 | from argparse import ArgumentParser 5 | from itertools import chain 6 | from tqdm import tqdm 7 | 8 | from .retriever import BaseFaissIPRetriever 9 | from .reducer import write_ranking 10 | 11 | import logging 12 | logger = logging.getLogger(__name__) 13 | logging.basicConfig( 14 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 15 | datefmt="%m/%d/%Y %H:%M:%S", 16 | level=logging.INFO, 17 | ) 18 | 19 | 20 | def search_queries(retriever, q_reps, p_lookup, args): 21 | if args.batch_size > 0: 22 | all_scores, all_indices = retriever.batch_search(q_reps, args.depth, args.batch_size) 23 | else: 24 | all_scores, all_indices = retriever.search(q_reps, args.depth) 25 | 26 | psg_indices = [[int(p_lookup[x]) for x in q_dd] for q_dd in all_indices] 27 | psg_indices = np.array(psg_indices) 28 | return all_scores, psg_indices 29 | 30 | 31 | def main(): 32 | parser = ArgumentParser() 33 | parser.add_argument('--query_reps', required=True) 34 | parser.add_argument('--passage_reps', required=True) 35 | parser.add_argument('--batch_size', type=int, default=128) 36 | parser.add_argument('--depth', type=int, default=1000) 37 | parser.add_argument('--save_ranking_to', required=True) 38 | parser.add_argument('--save_text', action='store_true') 39 | 40 | args = parser.parse_args() 41 | 42 | index_files = 
glob.glob(args.passage_reps) 43 | logger.info(f'Pattern match found {len(index_files)} files; loading them into index.') 44 | 45 | p_reps_0, p_lookup_0 = torch.load(index_files[0]) 46 | retriever = BaseFaissIPRetriever(p_reps_0.float().numpy()) 47 | 48 | shards = chain([(p_reps_0, p_lookup_0)], map(torch.load, index_files[1:])) 49 | if len(index_files) > 1: 50 | shards = tqdm(shards, desc='Loading shards into index', total=len(index_files)) 51 | look_up = [] 52 | for p_reps, p_lookup in shards: 53 | retriever.add(p_reps.float().numpy()) 54 | look_up += p_lookup 55 | 56 | q_reps, q_lookup = torch.load(args.query_reps) 57 | q_reps = q_reps.float().numpy() 58 | 59 | logger.info('Index Search Start') 60 | all_scores, psg_indices = search_queries(retriever, q_reps, look_up, args) 61 | logger.info('Index Search Finished') 62 | 63 | if args.save_text: 64 | write_ranking(psg_indices, all_scores, q_lookup, args.save_ranking_to) 65 | else: 66 | torch.save((all_scores, psg_indices), args.save_ranking_to) 67 | 68 | 69 | if __name__ == '__main__': 70 | main() -------------------------------------------------------------------------------- /apicoder/APIRetriever/build/lib/dense/faiss_retriever/__main__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import glob 4 | from argparse import ArgumentParser 5 | from itertools import chain 6 | from tqdm import tqdm 7 | 8 | from .retriever import BaseFaissIPRetriever 9 | from .reducer import write_ranking 10 | 11 | import logging 12 | logger = logging.getLogger(__name__) 13 | logging.basicConfig( 14 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 15 | datefmt="%m/%d/%Y %H:%M:%S", 16 | level=logging.INFO, 17 | ) 18 | 19 | 20 | def search_queries(retriever, q_reps, p_lookup, args): 21 | if args.batch_size > 0: 22 | all_scores, all_indices = retriever.batch_search(q_reps, args.depth, args.batch_size) 23 | else: 24 | all_scores, all_indices = retriever.search(q_reps, args.depth) 25 | 26 | psg_indices = [[int(p_lookup[x]) for x in q_dd] for q_dd in all_indices] 27 | psg_indices = np.array(psg_indices) 28 | return all_scores, psg_indices 29 | 30 | 31 | def main(): 32 | parser = ArgumentParser() 33 | parser.add_argument('--query_reps', required=True) 34 | parser.add_argument('--passage_reps', required=True) 35 | parser.add_argument('--batch_size', type=int, default=128) 36 | parser.add_argument('--depth', type=int, default=1000) 37 | parser.add_argument('--save_ranking_to', required=True) 38 | parser.add_argument('--save_text', action='store_true') 39 | 40 | args = parser.parse_args() 41 | 42 | index_files = glob.glob(args.passage_reps) 43 | logger.info(f'Pattern match found {len(index_files)} files; loading them into index.') 44 | 45 | p_reps_0, p_lookup_0 = torch.load(index_files[0]) 46 | retriever = BaseFaissIPRetriever(p_reps_0.float().numpy()) 47 | 48 | shards = chain([(p_reps_0, p_lookup_0)], map(torch.load, index_files[1:])) 49 | if len(index_files) > 1: 50 | shards = tqdm(shards, desc='Loading shards into index', total=len(index_files)) 51 | look_up = [] 52 | for p_reps, p_lookup in shards: 53 | retriever.add(p_reps.float().numpy()) 54 | look_up += p_lookup 55 | 56 | q_reps, q_lookup = torch.load(args.query_reps) 57 | q_reps = q_reps.float().numpy() 58 | 59 | logger.info('Index Search Start') 60 | all_scores, psg_indices = search_queries(retriever, q_reps, look_up, args) 61 | logger.info('Index Search Finished') 62 | 63 | if args.save_text: 64 | 
write_ranking(psg_indices, all_scores, q_lookup, args.save_ranking_to) 65 | else: 66 | torch.save((all_scores, psg_indices), args.save_ranking_to) 67 | 68 | 69 | if __name__ == '__main__': 70 | main() -------------------------------------------------------------------------------- /cert/run_generating_codes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # Licensed under the MIT license. 4 | 5 | # [Pandas, Numpy] 6 | DOMAIN="Pandas" 7 | 8 | # [PYCODEGPT, CERT] 9 | MODEL_VERSION="PYCODEGPT" 10 | 11 | BASE_DIR="On your local machine, change this to your base data directory." 12 | 13 | # -------------------------------------------------------------------------- 14 | # You should replace the following variables according to your own settings. 15 | # -------------------------------------------------------------------------- 16 | if [ ${DOMAIN} == "Pandas" ]; then 17 | if [ ${MODEL_VERSION} == "PYCODEGPT" ]; then 18 | TEMP="1.0" 19 | NUM_SAMPLES="1" 20 | MAX_TOKENS="100" 21 | TOP_P="0.9" 22 | CKPT_NAME="${BASE_DIR}/pycodegpt-110M" 23 | Run_Args="-model $CKPT_NAME -t $TEMP -p $TOP_P -l $MAX_TOKENS -n $NUM_SAMPLES -d $DOMAIN -mv $MODEL_VERSION" 24 | echo "Run Args: $Run_Args" 25 | python eval_cert.py ${Run_Args} 26 | elif [ ${MODEL_VERSION} == "CERT" ]; then 27 | TEMP="1.0" 28 | TEMP2="1.0" 29 | NUM_SAMPLES="1" 30 | MAX_TOKENS="100" 31 | TOP_P="0.9" 32 | CKPT_NAME_SKETCHER="${BASE_DIR}/sketcher-pandas" 33 | CKPT_NAME_GENERATOR="${BASE_DIR}/generator-pandas" 34 | Run_Args="-model $CKPT_NAME_SKETCHER -model2 $CKPT_NAME_GENERATOR -t $TEMP -t2 $TEMP2 -p $TOP_P -l $MAX_TOKENS -n $NUM_SAMPLES -d $DOMAIN -mv $MODEL_VERSION" 35 | echo "Run Args: $Run_Args" 36 | python eval_cert_unified.py ${Run_Args} 37 | fi 38 | elif [ ${DOMAIN} == "Numpy" ]; then 39 | if [ ${MODEL_VERSION} == "PYCODEGPT" ]; then 40 | TEMP="1.0" 41 | NUM_SAMPLES="1" 42 | MAX_TOKENS="100" 43 | TOP_P="0.9" 44 | CKPT_NAME="${BASE_DIR}/pycodegpt-110M" 45 | Run_Args="-model $CKPT_NAME -t $TEMP -p $TOP_P -l $MAX_TOKENS -n $NUM_SAMPLES -d $DOMAIN -mv $MODEL_VERSION" 46 | echo "Run Args: $Run_Args" 47 | python eval_cert.py ${Run_Args} 48 | elif [ ${MODEL_VERSION} == "CERT" ]; then 49 | TEMP="1.0" 50 | TEMP2="0.2" 51 | NUM_SAMPLES="1" 52 | MAX_TOKENS="100" 53 | TOP_P="0.9" 54 | CKPT_NAME_SKETCHER="${BASE_DIR}/sketcher-numpy" 55 | CKPT_NAME_GENERATOR="${BASE_DIR}/generator-numpy" 56 | Run_Args="-model $CKPT_NAME_SKETCHER -model2 $CKPT_NAME_GENERATOR -t $TEMP -t2 $TEMP2 -p $TOP_P -l $MAX_TOKENS -n $NUM_SAMPLES -d $DOMAIN -mv $MODEL_VERSION" 57 | echo "Run Args: $Run_Args" 58 | python eval_cert_unified.py ${Run_Args} 59 | fi 60 | fi 61 | 62 | echo "All Done!" 63 | -------------------------------------------------------------------------------- /apicoder/CodeGenAPI/APICoder/get_lib_comment_for_eval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # 4 | # @Author: Daoguang Zan, @Mentor: Bei Chen, Jian-Guang Lou 5 | # @Copyright 2022 The Microsoft Research Asia (DKI Group). All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | from typing import List 19 | import json 20 | import gzip 21 | import os 22 | import sys 23 | sys.path.append("..") 24 | from scripts.get_comments_from_evallibs import get_comments_from_code 25 | # remove the sys path ".." to avoid the conflict with the other scripts 26 | sys.path.remove("..") 27 | 28 | def get_one_instance_by_lib_name(library_name: str, base_dir: str): 29 | """ 30 | Get an iterative object based on lib_name 31 | """ 32 | base_dir = os.path.join(base_dir, "eval_datas") 33 | library_path = os.path.join(base_dir, f"real_{library_name}_eval_v2.jsonl.gz") 34 | 35 | library_reader = gzip.open(library_path, "rb") 36 | for line in library_reader: 37 | line = line.decode("utf-8") 38 | line_dict = json.loads(line) 39 | yield line_dict 40 | 41 | def get_code_and_comment_by_lib_name_and_task_id( 42 | library_name: str, 43 | query_task_id: str, 44 | base_dir: str 45 | ): 46 | """ 47 | Get code, comments and solutions based on lib_name and task_id. 48 | """ 49 | # base_dir = f"/mnt/v-dzan/datasets/CERT/eval_datas" 50 | base_dir = os.path.join(base_dir, "eval_datas") 51 | library_path = os.path.join(base_dir, f"real_{library_name}_eval_v3.jsonl.gz") 52 | 53 | library_reader = gzip.open(library_path, "rb") 54 | for line in library_reader: 55 | line = line.decode("utf-8") 56 | line_dict = json.loads(line) 57 | task_id = line_dict["task_id"] 58 | if task_id == query_task_id: 59 | code = line_dict["prompt"] 60 | solution = line_dict["canonical_solution"][0] 61 | code_comment = get_comments_from_code(code) 62 | library_reader.close() 63 | return [code, code_comment, solution] 64 | 65 | library_reader.close() 66 | return ["", "", ""] 67 | 68 | 69 | if __name__ == "__main__": 70 | print("Passed!") 71 | pass -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). 
If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /apicoder/private-eval/private_eval/data.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Dict 2 | import gzip 3 | import json 4 | import os 5 | import ipdb 6 | 7 | ROOT = os.path.dirname(os.path.abspath(__file__)) 8 | 9 | # ------------------------------------------------------------------------------------------------------------------------------ 10 | # False, True 11 | human_in_the_loop = False 12 | # ["", "_make_sense"] refer to `run_eval_monitor.sh` ["_no", "_make_sense"] 13 | make_sense = "" 14 | # [machine, top3_perfect, top4_perfect, top5_perfect, human_labelled] 15 | user_name = "machine" 16 | # [0, 1, 2, 3, 5, "n"] 17 | api_number = 0 18 | # [pandas, numpy, monkey, beatnum, torchdata] 19 | library_name = "torchdata" 20 | 21 | if not human_in_the_loop: 22 | if api_number == 0: 23 | HUMAN_EVAL = os.path.join(ROOT, "..", "data", f"real_{library_name}_eval_v3.jsonl.gz") 24 | else: 25 | HUMAN_EVAL = os.path.join(ROOT, "..", "data", f"real_{library_name}_eval_v3_api_{str(api_number)}{make_sense}.jsonl.gz") 26 | else: 27 | HUMAN_EVAL = os.path.join(ROOT, "..", "data", f"real_{library_name}_eval_v3_{user_name}{make_sense}.jsonl.gz") 28 | # ------------------------------------------------------------------------------------------------------------------------------ 29 | 30 | print("***"*20) 31 | print("load eval from {}".format(HUMAN_EVAL.split('/')[-1].replace(".jsonl.gz", ""))) 32 | print("***"*20) 33 | 34 | def read_problems(evalset_file: str = HUMAN_EVAL) -> Iterable[Dict[str, Dict]]: 35 | return {task["task_id"]: task for task in stream_jsonl(evalset_file)} 36 | 37 | def stream_jsonl(filename: str) -> Iterable[Dict]: 38 | """ 39 | Parses each jsonl line and yields it as a dictionary 40 
| """ 41 | if filename.endswith(".gz"): 42 | with open(filename, "rb") as gzfp: 43 | with gzip.open(gzfp, 'rt') as fp: 44 | for line in fp: 45 | if any(not x.isspace() for x in line): 46 | yield json.loads(line) 47 | else: 48 | with open(filename, "r") as fp: 49 | for line in fp: 50 | if any(not x.isspace() for x in line): 51 | try: 52 | yield json.loads(line) 53 | except: 54 | ipdb.set_trace() 55 | 56 | 57 | def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False): 58 | """ 59 | Writes an iterable of dictionaries to jsonl 60 | """ 61 | if append: 62 | mode = 'ab' 63 | else: 64 | mode = 'wb' 65 | filename = os.path.expanduser(filename) 66 | if filename.endswith(".gz"): 67 | with open(filename, mode) as fp: 68 | with gzip.GzipFile(fileobj=fp, mode='wb') as gzfp: 69 | for x in data: 70 | gzfp.write((json.dumps(x) + "\n").encode('utf-8')) 71 | else: 72 | with open(filename, mode) as fp: 73 | for x in data: 74 | fp.write((json.dumps(x) + "\n").encode('utf-8')) 75 | -------------------------------------------------------------------------------- /cert/scripts/ast_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | """Transform code to sketch""" 4 | from redbaron import RedBaron, NameNode, NodeList, Node 5 | from typing import List, Dict, Tuple, Union, Iterable 6 | 7 | def traverse_node_fst(node_fst): 8 | if isinstance(node_fst, list): 9 | for this_node in node_fst: 10 | traverse_node_fst(this_node) 11 | elif isinstance(node_fst, dict): 12 | if node_fst.get("type") is not None: 13 | this_type = node_fst.get("type") 14 | if node_fst.get("name") is not None: 15 | if this_type == "def": 16 | node_fst["name"] = "func" 17 | elif this_type == "class": 18 | node_fst["name"] = "AnClass" 19 | if node_fst.get("value") is not None: 20 | if this_type == "raw_string": 21 | node_fst["value"] = "rawstring" 22 | elif this_type == "int": 23 | node_fst["value"] = "number" 24 | elif this_type == "interpolated_raw_string": 25 | node_fst["value"] = "interrawstring" 26 | elif this_type == "complex": 27 | node_fst["value"] = "complex" # 1j 28 | elif this_type == "string" and "\"\"\"" not in node_fst["value"] and "\'\'\'" not in node_fst["value"]: 29 | node_fst["value"] = "string" 30 | elif this_type == "float_exponant": 31 | node_fst["value"] = "floatexponant" 32 | elif this_type == "interpolated_string": 33 | node_fst["value"] = "interstring" 34 | elif this_type == "float": 35 | node_fst["value"] = "float" 36 | elif this_type == "binary_string": 37 | node_fst["value"] = "binarystring" 38 | elif this_type == "unicode_string": 39 | node_fst["value"] = "unicodestring" 40 | else: 41 | pass 42 | 43 | for this_key in node_fst: 44 | if isinstance(node_fst[this_key], list) or isinstance(node_fst[this_key], dict): 45 | traverse_node_fst(node_fst[this_key]) 46 | 47 | return node_fst 48 | 49 | def transform_code_to_sketch(desp: str): 50 | red = RedBaron(desp) 51 | node_fst = red.fst() 52 | node_fst = traverse_node_fst(node_fst) 53 | code_schema = NodeList.from_fst(node_fst).dumps() 54 | return code_schema 55 | 56 | 57 | def craft_merged_corpus(sketch_list:List=[] , text_list:List=[], linker="\n"): 58 | sketch_norm_list = [] 59 | for this_sketch, this_text in zip(sketch_list, text_list): 60 | if this_text.count("import") >= 2 or "__name__" in this_text: # whether removing the highest overlap schema 61 | sketch_norm_list.append(this_text) 62 | else: 63 | 
sketch_norm_list.append(this_sketch+linker+this_text) 64 | return "\n\n\n".join(sketch_norm_list) -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/dense/dataset/processor.py: -------------------------------------------------------------------------------- 1 | from transformers import PreTrainedTokenizer 2 | 3 | 4 | class Processor: 5 | def __init__(self, tokenizer: PreTrainedTokenizer): 6 | self.tokenizer = tokenizer 7 | 8 | 9 | class TrainProcessor(Processor): 10 | def __init__(self, tokenizer, query_max_length=32, text_max_length=256): 11 | super().__init__(tokenizer) 12 | self.query_max_length = query_max_length 13 | self.text_max_length = text_max_length 14 | 15 | def __call__(self, example): 16 | query = self.tokenizer.encode(example['query'], 17 | add_special_tokens=False, 18 | max_length=self.query_max_length, 19 | truncation=True) 20 | positives = [] 21 | for pos in example['positive_passages']: 22 | text = pos['title'] + " " + pos['text'] if 'title' in pos else pos['text'] 23 | positives.append(self.tokenizer.encode(text, 24 | add_special_tokens=False, 25 | max_length=self.text_max_length, 26 | truncation=True)) 27 | negatives = [] 28 | for neg in example['negative_passages']: 29 | text = neg['title'] + " " + neg['text'] if 'title' in neg else neg['text'] 30 | negatives.append(self.tokenizer.encode(text, 31 | add_special_tokens=False, 32 | max_length=self.text_max_length, 33 | truncation=True)) 34 | return {'query': query, 'positives': positives, 'negatives': negatives} 35 | 36 | 37 | class TestProcessor(Processor): 38 | def __init__(self, tokenizer, query_max_length=32): 39 | super().__init__(tokenizer) 40 | self.query_max_length = query_max_length 41 | 42 | def __call__(self, example): 43 | query_id = example['query_id'] 44 | query = self.tokenizer.encode(example['query'], 45 | add_special_tokens=False, 46 | max_length=self.query_max_length, 47 | truncation=True) 48 | return {'text_id': query_id, 'text': query} 49 | 50 | 51 | class CorpusProcessor(Processor): 52 | def __init__(self, tokenizer, text_max_length=256): 53 | super().__init__(tokenizer) 54 | self.text_max_length = text_max_length 55 | 56 | def __call__(self, example): 57 | docid = example['docid'] 58 | text = example['title'] + " " + example['text'] if 'title' in example else example['text'] 59 | text = self.tokenizer.encode(text, 60 | add_special_tokens=False, 61 | max_length=self.text_max_length, 62 | truncation=True) 63 | return {'text_id': docid, 'text': text} 64 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/build/lib/dense/dataset/processor.py: -------------------------------------------------------------------------------- 1 | from transformers import PreTrainedTokenizer 2 | 3 | 4 | class Processor: 5 | def __init__(self, tokenizer: PreTrainedTokenizer): 6 | self.tokenizer = tokenizer 7 | 8 | 9 | class TrainProcessor(Processor): 10 | def __init__(self, tokenizer, query_max_length=32, text_max_length=256): 11 | super().__init__(tokenizer) 12 | self.query_max_length = query_max_length 13 | self.text_max_length = text_max_length 14 | 15 | def __call__(self, example): 16 | query = self.tokenizer.encode(example['query'], 17 | add_special_tokens=False, 18 | max_length=self.query_max_length, 19 | truncation=True) 20 | positives = [] 21 | for pos in example['positive_passages']: 22 | text = pos['title'] + " " + pos['text'] if 'title' in pos else pos['text'] 23 | 
positives.append(self.tokenizer.encode(text, 24 | add_special_tokens=False, 25 | max_length=self.text_max_length, 26 | truncation=True)) 27 | negatives = [] 28 | for neg in example['negative_passages']: 29 | text = neg['title'] + " " + neg['text'] if 'title' in neg else neg['text'] 30 | negatives.append(self.tokenizer.encode(text, 31 | add_special_tokens=False, 32 | max_length=self.text_max_length, 33 | truncation=True)) 34 | return {'query': query, 'positives': positives, 'negatives': negatives} 35 | 36 | 37 | class TestProcessor(Processor): 38 | def __init__(self, tokenizer, query_max_length=32): 39 | super().__init__(tokenizer) 40 | self.query_max_length = query_max_length 41 | 42 | def __call__(self, example): 43 | query_id = example['query_id'] 44 | query = self.tokenizer.encode(example['query'], 45 | add_special_tokens=False, 46 | max_length=self.query_max_length, 47 | truncation=True) 48 | return {'text_id': query_id, 'text': query} 49 | 50 | 51 | class CorpusProcessor(Processor): 52 | def __init__(self, tokenizer, text_max_length=256): 53 | super().__init__(tokenizer) 54 | self.text_max_length = text_max_length 55 | 56 | def __call__(self, example): 57 | docid = example['docid'] 58 | text = example['title'] + " " + example['text'] if 'title' in example else example['text'] 59 | text = self.tokenizer.encode(text, 60 | add_special_tokens=False, 61 | max_length=self.text_max_length, 62 | truncation=True) 63 | return {'text_id': docid, 'text': text} 64 | -------------------------------------------------------------------------------- /apicoder/CodeGenAPI/nl2code/configuration_codegen.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
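# (Added illustrative sketch; not part of the original file.) The class below
# follows the standard Hugging Face config pattern, so, assuming transformers
# is installed, it could be exercised as:
#   config = CodeGenConfig(n_embd=1024, n_layer=20, n_head=16)
#   assert config.hidden_size == 1024         # property alias for n_embd
#   assert config.num_hidden_layers == 20     # property alias for n_layer
#   config.save_pretrained("./codegen-config")  # inherited from PretrainedConfig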
15 | 16 | # Modified configuration implementation based on https://github.com/huggingface/transformers/blob/main/src/transformers/models/gptj/configuration_gptj.py 17 | 18 | from transformers.configuration_utils import PretrainedConfig 19 | from transformers.utils import logging 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | 24 | class CodeGenConfig(PretrainedConfig): 25 | model_type = "codegen" 26 | 27 | def __init__( 28 | self, 29 | vocab_size=50400, 30 | n_positions=2048, 31 | n_ctx=2048, 32 | n_embd=4096, 33 | n_layer=28, 34 | n_head=16, 35 | rotary_dim=64, 36 | n_inner=None, 37 | activation_function="gelu_new", 38 | resid_pdrop=0.0, 39 | embd_pdrop=0.0, 40 | attn_pdrop=0.0, 41 | layer_norm_epsilon=1e-5, 42 | initializer_range=0.02, 43 | scale_attn_weights=True, 44 | gradient_checkpointing=False, 45 | use_cache=True, 46 | bos_token_id=50256, 47 | eos_token_id=50256, 48 | **kwargs 49 | ): 50 | super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 51 | 52 | self.vocab_size = vocab_size 53 | self.n_ctx = n_ctx 54 | self.n_positions = n_positions 55 | self.n_embd = n_embd 56 | self.n_layer = n_layer 57 | self.n_head = n_head 58 | self.n_inner = n_inner 59 | self.rotary_dim = rotary_dim 60 | self.activation_function = activation_function 61 | self.resid_pdrop = resid_pdrop 62 | self.embd_pdrop = embd_pdrop 63 | self.attn_pdrop = attn_pdrop 64 | self.layer_norm_epsilon = layer_norm_epsilon 65 | self.initializer_range = initializer_range 66 | self.gradient_checkpointing = gradient_checkpointing 67 | self.scale_attn_weights = scale_attn_weights 68 | self.use_cache = use_cache 69 | 70 | self.bos_token_id = bos_token_id 71 | self.eos_token_id = eos_token_id 72 | 73 | @property 74 | def max_position_embeddings(self): 75 | return self.n_positions 76 | 77 | @property 78 | def hidden_size(self): 79 | return self.n_embd 80 | 81 | @property 82 | def num_attention_heads(self): 83 | return self.n_head 84 | 85 | @property 86 | def num_hidden_layers(self): 87 | return self.n_layer 88 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/dense/processor/processors.py: -------------------------------------------------------------------------------- 1 | import json 2 | import csv 3 | import datasets 4 | from transformers import PreTrainedTokenizer 5 | from dataclasses import dataclass 6 | 7 | 8 | @dataclass 9 | class SimpleTrainProcessor: 10 | query_file: str 11 | collection_file: str 12 | tokenizer: PreTrainedTokenizer 13 | 14 | max_length: int = 128 15 | columns = ['text_id', 'title', 'text'] 16 | title_field = 'title' 17 | text_field = 'text' 18 | 19 | def __post_init__(self): 20 | self.queries = self.read_queries(self.query_file) 21 | self.collection = datasets.load_dataset( 22 | 'csv', 23 | data_files=self.collection_file, 24 | column_names=self.columns, 25 | delimiter='\t', 26 | )['train'] 27 | 28 | @staticmethod 29 | def read_queries(queries): 30 | qmap = {} 31 | with open(queries) as f: 32 | for l in f: 33 | qid, qry = l.strip().split('\t') 34 | qmap[qid] = qry 35 | return qmap 36 | 37 | @staticmethod 38 | def read_qrel(relevance_file): 39 | qrel = {} 40 | with open(relevance_file, encoding='utf8') as f: 41 | tsvreader = csv.reader(f, delimiter="\t") 42 | for [topicid, _, docid, rel] in tsvreader: 43 | assert rel == "1" 44 | if topicid in qrel: 45 | qrel[topicid].append(docid) 46 | else: 47 | qrel[topicid] = [docid] 48 | return qrel 49 | 50 | def get_query(self, q): 51 | query_encoded = 
self.tokenizer.encode( 52 | self.queries[q], 53 | add_special_tokens=False, 54 | max_length=self.max_length, 55 | truncation=True 56 | ) 57 | return query_encoded 58 | 59 | def get_passage(self, p): 60 | entry = self.collection[int(p)] 61 | title = entry[self.title_field] 62 | title = "" if title is None else title 63 | body = entry[self.text_field] 64 | content = title + self.tokenizer.sep_token + body 65 | 66 | passage_encoded = self.tokenizer.encode( 67 | content, 68 | add_special_tokens=False, 69 | max_length=self.max_length, 70 | truncation=True 71 | ) 72 | 73 | return passage_encoded 74 | 75 | def process_one(self, train): 76 | q, pp, nn = train 77 | train_example = { 78 | 'query': self.get_query(q), 79 | 'positives': [self.get_passage(p) for p in pp], 80 | 'negatives': [self.get_passage(n) for n in nn], 81 | } 82 | 83 | return json.dumps(train_example) 84 | 85 | 86 | @dataclass 87 | class SimpleCollectionProcessor: 88 | tokenizer: PreTrainedTokenizer 89 | separator: str = '\t' 90 | max_length: int = 128 91 | 92 | def process_line(self, line: str): 93 | xx = line.strip().split(self.separator) 94 | text_id, text = xx[0], xx[1:] 95 | text_encoded = self.tokenizer.encode( 96 | self.tokenizer.sep_token.join(text), 97 | add_special_tokens=False, 98 | max_length=self.max_length, 99 | truncation=True 100 | ) 101 | encoded = { 102 | 'text_id': text_id, 103 | 'text': text_encoded 104 | } 105 | return json.dumps(encoded) 106 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/build/lib/dense/processor/processors.py: -------------------------------------------------------------------------------- 1 | import json 2 | import csv 3 | import datasets 4 | from transformers import PreTrainedTokenizer 5 | from dataclasses import dataclass 6 | 7 | 8 | @dataclass 9 | class SimpleTrainProcessor: 10 | query_file: str 11 | collection_file: str 12 | tokenizer: PreTrainedTokenizer 13 | 14 | max_length: int = 128 15 | columns = ['text_id', 'title', 'text'] 16 | title_field = 'title' 17 | text_field = 'text' 18 | 19 | def __post_init__(self): 20 | self.queries = self.read_queries(self.query_file) 21 | self.collection = datasets.load_dataset( 22 | 'csv', 23 | data_files=self.collection_file, 24 | column_names=self.columns, 25 | delimiter='\t', 26 | )['train'] 27 | 28 | @staticmethod 29 | def read_queries(queries): 30 | qmap = {} 31 | with open(queries) as f: 32 | for l in f: 33 | qid, qry = l.strip().split('\t') 34 | qmap[qid] = qry 35 | return qmap 36 | 37 | @staticmethod 38 | def read_qrel(relevance_file): 39 | qrel = {} 40 | with open(relevance_file, encoding='utf8') as f: 41 | tsvreader = csv.reader(f, delimiter="\t") 42 | for [topicid, _, docid, rel] in tsvreader: 43 | assert rel == "1" 44 | if topicid in qrel: 45 | qrel[topicid].append(docid) 46 | else: 47 | qrel[topicid] = [docid] 48 | return qrel 49 | 50 | def get_query(self, q): 51 | query_encoded = self.tokenizer.encode( 52 | self.queries[q], 53 | add_special_tokens=False, 54 | max_length=self.max_length, 55 | truncation=True 56 | ) 57 | return query_encoded 58 | 59 | def get_passage(self, p): 60 | entry = self.collection[int(p)] 61 | title = entry[self.title_field] 62 | title = "" if title is None else title 63 | body = entry[self.text_field] 64 | content = title + self.tokenizer.sep_token + body 65 | 66 | passage_encoded = self.tokenizer.encode( 67 | content, 68 | add_special_tokens=False, 69 | max_length=self.max_length, 70 | truncation=True 71 | ) 72 | 73 | return passage_encoded 74 | 75 | def 
process_one(self, train): 76 | q, pp, nn = train 77 | train_example = { 78 | 'query': self.get_query(q), 79 | 'positives': [self.get_passage(p) for p in pp], 80 | 'negatives': [self.get_passage(n) for n in nn], 81 | } 82 | 83 | return json.dumps(train_example) 84 | 85 | 86 | @dataclass 87 | class SimpleCollectionProcessor: 88 | tokenizer: PreTrainedTokenizer 89 | separator: str = '\t' 90 | max_length: int = 128 91 | 92 | def process_line(self, line: str): 93 | xx = line.strip().split(self.separator) 94 | text_id, text = xx[0], xx[1:] 95 | text_encoded = self.tokenizer.encode( 96 | self.tokenizer.sep_token.join(text), 97 | add_special_tokens=False, 98 | max_length=self.max_length, 99 | truncation=True 100 | ) 101 | encoded = { 102 | 'text_id': text_id, 103 | 'text': text_encoded 104 | } 105 | return json.dumps(encoded) 106 | -------------------------------------------------------------------------------- /apicoder/CodeGenAPI/APICoder/get_api_info_by_name.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # 4 | # @Author: Daoguang Zan, @Mentor: Bei Chen, Jian-Guang Lou 5 | # @Copyright 2022 The Microsoft Research Asia (DKI Group). All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | import json 19 | import os 20 | 21 | def get_api_name_4_api_sign_and_desps(library_name: str, base_dir: str): 22 | """ 23 | According to library_name, get all the API info of this library in the format shown in the following format. 24 | """ 25 | # load the library_name's all api info 26 | # base_dir = "/mnt/v-dzan/datasets/CERT/PrivateLibrary/Train" 27 | base_dir = os.path.join(base_dir, "PrivateLibrary", "Train") 28 | library_path = os.path.join(base_dir, library_name, f"{library_name}_apis_doc_details.jsonl") 29 | 30 | library_apis_reader = open(library_path, "r") 31 | api_name_4_api_sign_and_desps = {} 32 | # The api_name_4_api_sign_and_desps format is: 33 | # { 34 | # "api_name": { 35 | # api_path1: [api_sign1, api_desp1], 36 | # api_path2: [api_sign2, api_desp2], 37 | # ... 38 | # } 39 | # ... 
40 | # } 41 | for line in library_apis_reader: 42 | api_info = json.loads(line) 43 | # (['api_path', 'api_name', 'api_doc', 'api_signature', 'api_description', 'api_parameters', 'api_parameters_number', 'api_returns', 'api_see_also', 'api_notes', 'api_examples']) 44 | api_path = api_info["api_path"] 45 | api_name = api_info["api_name"] 46 | api_signature = api_info["api_signature"] 47 | api_description = api_info["api_description"] 48 | tmp_api_path_api_info = {api_path: [api_signature, api_description]} 49 | if api_name_4_api_sign_and_desps.get(api_name) is None: 50 | api_name_4_api_sign_and_desps[api_name] = tmp_api_path_api_info 51 | else: 52 | api_name_4_api_sign_and_desps[api_name] = dict(api_name_4_api_sign_and_desps[api_name], **tmp_api_path_api_info) 53 | 54 | library_apis_reader.close() 55 | return api_name_4_api_sign_and_desps 56 | 57 | def get_all_api_info_prompt_list_by_api_name(api_name_4_api_sign_and_desps, API_NAME): 58 | """ 59 | Get a dictionary of all {API_path: [API_signature, API_description]} based on the name of the API. 60 | """ 61 | import sys 62 | from scripts.get_libs_info_from_code import ( 63 | normalizer_api_desp, 64 | get_first_sentence_from_api_desp 65 | ) 66 | 67 | result_api_path_info_dict = dict() 68 | for api_name, api_path_info_dict in api_name_4_api_sign_and_desps.items(): 69 | if api_name == API_NAME: 70 | for api_path, api_info_list in api_path_info_dict.items(): 71 | api_signature, api_description = api_info_list[0], get_first_sentence_from_api_desp(normalizer_api_desp(api_info_list[1])) 72 | 73 | result_api_path_info_dict[api_path] = [api_signature, api_description] 74 | break 75 | return result_api_path_info_dict 76 | -------------------------------------------------------------------------------- /apicoder/private-eval/data/XXXAPIEval-make sense.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "50it [00:00, 16642.74it/s]" [rest of the notebook (tqdm progress output and later cells) truncated in the dump] -------------------------------------------------------------------------------- /apicoder/APIRetriever/build/lib/dense/driver/encode.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | from contextlib import nullcontext 5 | from tqdm import tqdm 6 | 7 | import torch 8 | 9 | from torch.utils.data import DataLoader 10 | from transformers import AutoConfig, AutoTokenizer 11 | from transformers import ( 12 | HfArgumentParser, 13 | ) 14 | 15 | from dense.arguments import ModelArguments, DataArguments, \ 16 | DenseTrainingArguments as TrainingArguments 17 | from dense.data import EncodeDataset, EncodeCollator 18 | from dense.modeling import DenseOutput, DenseModelForInference 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def main(): 24 | parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments)) 25 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 26 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 27 | else: 28 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 29 | model_args: ModelArguments 30 | data_args: DataArguments 31 | training_args: TrainingArguments 32 | 33 | if training_args.local_rank > 0 or training_args.n_gpu > 1: 34 | raise NotImplementedError('Multi-GPU encoding is not supported.') 35 | 36 | # Setup logging 37 | logging.basicConfig( 38 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 39 | datefmt="%m/%d/%Y %H:%M:%S", 40 | level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, 41 | ) 42 | 43 | num_labels = 1 44 | config = AutoConfig.from_pretrained( 45 | model_args.config_name if model_args.config_name else model_args.model_name_or_path, 46 | num_labels=num_labels, 47 | cache_dir=model_args.cache_dir, 48 | ) 49 | tokenizer = AutoTokenizer.from_pretrained( 50 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, 51 | cache_dir=model_args.cache_dir, 52 | use_fast=False, 53 | ) 54 | 55 | model = DenseModelForInference.build( 56 | model_name_or_path=model_args.model_name_or_path, 57 | config=config, 58 | cache_dir=model_args.cache_dir, 59 | ) 60 | 61 | text_max_length = data_args.q_max_len if data_args.encode_is_qry else data_args.p_max_len 62 | 63 | encode_dataset = EncodeDataset(data_args.encode_in_path, tokenizer, max_len=text_max_length) 64 | encode_loader = DataLoader( 65 | encode_dataset, 66 | batch_size=training_args.per_device_eval_batch_size, 67 | collate_fn=EncodeCollator( 68 | tokenizer, 69 | max_length=text_max_length, 70 | padding='max_length' 71 | ), 72 | shuffle=False, 73 |
drop_last=False, 74 | num_workers=training_args.dataloader_num_workers, 75 | ) 76 | encoded = [] 77 | lookup_indices = [] 78 | model = model.to(training_args.device) 79 | model.eval() 80 | 81 | for (batch_ids, batch) in tqdm(encode_loader): 82 | lookup_indices.extend(batch_ids) 83 | with torch.cuda.amp.autocast() if training_args.fp16 else nullcontext(): 84 | with torch.no_grad(): 85 | for k, v in batch.items(): 86 | batch[k] = v.to(training_args.device) 87 | if data_args.encode_is_qry: 88 | model_output: DenseOutput = model(query=batch) 89 | encoded.append(model_output.q_reps.cpu()) 90 | else: 91 | model_output: DenseOutput = model(passage=batch) 92 | encoded.append(model_output.p_reps.cpu()) 93 | 94 | encoded = torch.cat(encoded) 95 | torch.save((encoded, lookup_indices), data_args.encoded_save_path) 96 | 97 | 98 | if __name__ == "__main__": 99 | main() 100 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/dense/arguments.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass, field 3 | from typing import Optional, List, Union 4 | from transformers import TrainingArguments 5 | 6 | 7 | @dataclass 8 | class ModelArguments: 9 | model_name_or_path: str = field( 10 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 11 | ) 12 | target_model_path: str = field( 13 | default=None, 14 | metadata={"help": "Path to pretrained reranker target model"} 15 | ) 16 | config_name: Optional[str] = field( 17 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 18 | ) 19 | tokenizer_name: Optional[str] = field( 20 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 21 | ) 22 | cache_dir: Optional[str] = field( 23 | default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} 24 | ) 25 | 26 | # modeling 27 | untie_encoder: bool = field( 28 | default=False, 29 | metadata={"help": "no weight sharing between qry passage encoders"} 30 | ) 31 | 32 | # out projection 33 | add_pooler: bool = field(default=False) 34 | projection_in_dim: int = field(default=768) 35 | projection_out_dim: int = field(default=768) 36 | 37 | 38 | @dataclass 39 | class DataArguments: 40 | train_dir: str = field( 41 | default=None, metadata={"help": "Path to train directory"} 42 | ) 43 | dataset_name: str = field( 44 | default=None, metadata={"help": "huggingface dataset name"} 45 | ) 46 | dataset_proc_num: int = field( 47 | default=12, metadata={"help": "number of proc used in dataset preprocess"} 48 | ) 49 | train_n_passages: int = field(default=8) 50 | 51 | encode_in_path: List[str] = field(default=None, metadata={"help": "Path to data to encode"}) 52 | encoded_save_path: str = field(default=None, metadata={"help": "where to save the encode"}) 53 | encode_is_qry: bool = field(default=False) 54 | encode_num_shard: int = field(default=1) 55 | encode_shard_index: int = field(default=0) 56 | 57 | q_max_len: int = field( 58 | default=32, 59 | metadata={ 60 | "help": "The maximum total input sequence length after tokenization for query. Sequences longer " 61 | "than this will be truncated, sequences shorter will be padded." 62 | }, 63 | ) 64 | p_max_len: int = field( 65 | default=128, 66 | metadata={ 67 | "help": "The maximum total input sequence length after tokenization for passage. 
Sequences longer " 68 | "than this will be truncated, sequences shorter will be padded." 69 | }, 70 | ) 71 | 72 | def __post_init__(self): 73 | if self.dataset_name is not None: 74 | info = self.dataset_name.split('/') 75 | self.dataset_split = info[-1] if len(info) == 3 else 'train' 76 | self.dataset_name = "/".join(info[:-1]) if len(info) == 3 else '/'.join(info) 77 | if self.train_dir is not None: 78 | files = os.listdir(self.train_dir) 79 | self.train_path = [ 80 | os.path.join(self.train_dir, f) 81 | for f in files 82 | if f.endswith('tsv') or f.endswith('json') 83 | ] 84 | 85 | 86 | @dataclass 87 | class DenseTrainingArguments(TrainingArguments): 88 | warmup_ratio: float = field(default=0.1) 89 | negatives_x_device: bool = field(default=False, metadata={"help": "share negatives across devices"}) 90 | do_encode: bool = field(default=False, metadata={"help": "run the encoding loop"}) 91 | 92 | grad_cache: bool = field(default=False, metadata={"help": "Use gradient cache update"}) 93 | gc_q_chunk_size: int = field(default=4) 94 | gc_p_chunk_size: int = field(default=32) 95 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/build/lib/dense/arguments.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass, field 3 | from typing import Optional, List, Union 4 | from transformers import TrainingArguments 5 | 6 | 7 | @dataclass 8 | class ModelArguments: 9 | model_name_or_path: str = field( 10 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 11 | ) 12 | target_model_path: str = field( 13 | default=None, 14 | metadata={"help": "Path to pretrained reranker target model"} 15 | ) 16 | config_name: Optional[str] = field( 17 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 18 | ) 19 | tokenizer_name: Optional[str] = field( 20 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 21 | ) 22 | cache_dir: Optional[str] = field( 23 | default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} 24 | ) 25 | 26 | # modeling 27 | untie_encoder: bool = field( 28 | default=False, 29 | metadata={"help": "no weight sharing between qry passage encoders"} 30 | ) 31 | 32 | # out projection 33 | add_pooler: bool = field(default=False) 34 | projection_in_dim: int = field(default=768) 35 | projection_out_dim: int = field(default=768) 36 | 37 | 38 | @dataclass 39 | class DataArguments: 40 | train_dir: str = field( 41 | default=None, metadata={"help": "Path to train directory"} 42 | ) 43 | dataset_name: str = field( 44 | default=None, metadata={"help": "huggingface dataset name"} 45 | ) 46 | dataset_proc_num: int = field( 47 | default=12, metadata={"help": "number of proc used in dataset preprocess"} 48 | ) 49 | train_n_passages: int = field(default=8) 50 | 51 | encode_in_path: List[str] = field(default=None, metadata={"help": "Path to data to encode"}) 52 | encoded_save_path: str = field(default=None, metadata={"help": "where to save the encode"}) 53 | encode_is_qry: bool = field(default=False) 54 | encode_num_shard: int = field(default=1) 55 | encode_shard_index: int = field(default=0) 56 | 57 | q_max_len: int = field( 58 | default=32, 59 | metadata={ 60 | "help": "The maximum total input sequence length after tokenization for query. 
Sequences longer " 61 | "than this will be truncated, sequences shorter will be padded." 62 | }, 63 | ) 64 | p_max_len: int = field( 65 | default=128, 66 | metadata={ 67 | "help": "The maximum total input sequence length after tokenization for passage. Sequences longer " 68 | "than this will be truncated, sequences shorter will be padded." 69 | }, 70 | ) 71 | 72 | def __post_init__(self): 73 | if self.dataset_name is not None: 74 | info = self.dataset_name.split('/') 75 | self.dataset_split = info[-1] if len(info) == 3 else 'train' 76 | self.dataset_name = "/".join(info[:-1]) if len(info) == 3 else '/'.join(info) 77 | if self.train_dir is not None: 78 | files = os.listdir(self.train_dir) 79 | self.train_path = [ 80 | os.path.join(self.train_dir, f) 81 | for f in files 82 | if f.endswith('tsv') or f.endswith('json') 83 | ] 84 | 85 | 86 | @dataclass 87 | class DenseTrainingArguments(TrainingArguments): 88 | warmup_ratio: float = field(default=0.1) 89 | negatives_x_device: bool = field(default=False, metadata={"help": "share negatives across devices"}) 90 | do_encode: bool = field(default=False, metadata={"help": "run the encoding loop"}) 91 | 92 | grad_cache: bool = field(default=False, metadata={"help": "Use gradient cache update"}) 93 | gc_q_chunk_size: int = field(default=4) 94 | gc_p_chunk_size: int = field(default=32) 95 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/dense/driver/encode.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | from contextlib import nullcontext 5 | from tqdm import tqdm 6 | import ipdb 7 | 8 | import torch 9 | 10 | from torch.utils.data import DataLoader 11 | from transformers import AutoConfig, AutoTokenizer 12 | from transformers import ( 13 | HfArgumentParser, 14 | ) 15 | 16 | from dense.arguments import ModelArguments, DataArguments, \ 17 | DenseTrainingArguments as TrainingArguments 18 | from dense.data import EncodeDataset, EncodeCollator 19 | from dense.modeling import DenseOutput, DenseModelForInference 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | def main(): 25 | parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments)) 26 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 27 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 28 | else: 29 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 30 | model_args: ModelArguments 31 | data_args: DataArguments 32 | training_args: TrainingArguments 33 | 34 | if training_args.local_rank > 0 or training_args.n_gpu > 1: 35 | raise NotImplementedError('Multi-GPU encoding is not supported.') 36 | 37 | # Setup logging 38 | logging.basicConfig( 39 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 40 | datefmt="%m/%d/%Y %H:%M:%S", 41 | level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, 42 | ) 43 | 44 | num_labels = 1 45 | config = AutoConfig.from_pretrained( 46 | model_args.config_name if model_args.config_name else model_args.model_name_or_path, 47 | num_labels=num_labels, 48 | cache_dir=model_args.cache_dir, 49 | ) 50 | tokenizer = AutoTokenizer.from_pretrained( 51 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, 52 | cache_dir=model_args.cache_dir, 53 | use_fast=False, 54 | ) 55 | 56 | model = DenseModelForInference.build( 57 | model_name_or_path=model_args.model_name_or_path, 58 | config=config, 59 | cache_dir=model_args.cache_dir, 60 | ) 61 | 62 | text_max_length = data_args.q_max_len if data_args.encode_is_qry else data_args.p_max_len 63 | 64 | encode_dataset = EncodeDataset(data_args.encode_in_path, tokenizer, max_len=text_max_length) 65 | encode_loader = DataLoader( 66 | encode_dataset, 67 | batch_size=training_args.per_device_eval_batch_size, 68 | collate_fn=EncodeCollator( 69 | tokenizer, 70 | max_length=text_max_length, 71 | padding='max_length' 72 | ), 73 | shuffle=False, 74 | drop_last=False, 75 | num_workers=training_args.dataloader_num_workers, 76 | ) 77 | encoded = [] 78 | lookup_indices = [] 79 | model = model.to(training_args.device) 80 | model.eval() 81 | for (batch_ids, batch) in tqdm(encode_loader): 82 | lookup_indices.extend(batch_ids) 83 | with torch.cuda.amp.autocast() if training_args.fp16 else nullcontext():  # no-op context when fp16 is off 84 | with torch.no_grad(): 85 | for k, v in batch.items(): 86 | batch[k] = v.to(training_args.device) 87 | if data_args.encode_is_qry: 88 | model_output: DenseOutput = model(query=batch) 89 | encoded.append(model_output.q_reps.cpu()) 90 | else: 91 | model_output: DenseOutput = model(passage=batch) 92 | encoded.append(model_output.p_reps.cpu()) 93 | 94 | encoded = torch.cat(encoded) 95 | torch.save((encoded, lookup_indices), data_args.encoded_save_path) 96 | 97 | 98 | if __name__ == "__main__": 99 | main() 100 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/README.md: -------------------------------------------------------------------------------- 1 | # What is it? 2 | APIRetriever is a dense retrieval system that finds potentially useful APIs for a programming problem. We build on a toolkit named [Dense](https://github.com/luyug/Dense) to implement our APIRetriever. 3 | 4 | --- 5 | 6 | ## Installation 7 | Our dependencies are as follows: 8 | ``` 9 | pytorch==1.8.0 10 | faiss-cpu==1.6.5 11 | transformers==4.2.0 12 | datasets==1.1.3 13 | wandb==0.13.3 14 | ``` 15 | You can install the above dependencies automatically by running: 16 | ``` 17 | cd Your/Own/Path/.../PrivateLibrary/APIRetriever 18 | pip install . 19 | ``` 20 | Besides, if you would like to use FP16 mixed precision to speed up training, you also need to install the apex library. 21 | ``` 22 | git clone https://github.com/NVIDIA/apex 23 | cd apex 24 | pip install -v --no-cache-dir ./ 25 | ``` 26 | 27 | ## Project Directory 28 | ```shell 29 | ├── apex 30 | ├── data 31 | │   ├── inference # The test data for five libraries. The below `XXX` can be `pandas`, `numpy`, `monkey`, `beatnum`, and `torchdata`. 32 | │   │   ├── XXX_api.json # API and its id. 33 | │   │   ├── XXX_api.pt # API embeddings encoded by our APIRetriever. 34 | │   │   ├── XXX_comment.json # Code comment and its id. 35 | │   │   ├── XXX_comment.pt # Code comment embeddings encoded by our APIRetriever. 36 | │   │   ├── XXX_id_score.trec # The score between comment and API in an easy-to-read format. 37 | │   │   ├── XXX_id_score.txt # The score between comment and API in an obscure format.
38 | │   └── train 39 | │   ├── processed-train-data 40 | │   └── unprocessed-train-data 41 | ├── outputs 42 | ├── requirements.txt 43 | ├── scripts 44 | │   ├── extract_retrieval_api_corpus.py 45 | │   ├── run_extract_apiretriever_corpus.sh 46 | │   ├── run_prepare_test_private_code.py 47 | │   └── run_prepare_train_private_code.py 48 | ├── setup.py 49 | └── src 50 | ├── dense 51 | ├── run_train_1.sh 52 | ├── run_encode_2.sh 53 | ├── run_search_3.sh 54 | └── run_trec_format_4.sh 55 | ``` 56 | 57 | ## Training 58 | 59 | First, you need to process the crawled Python files into comment-API pairs. 60 | ```shell 61 | bash APIRetriever/scripts/run_extract_apiretriever_corpus.sh 62 | ``` 63 | Then, you should convert these data pairs into a trainable format for training our APIRetriever. 64 | ```shell 65 | python APIRetriever/scripts/run_prepare_train_private_code.py 66 | ``` 67 | After preparing the training corpus, you should start training your own APIRetriever. 68 | ```shell 69 | bash APIRetriever/src/run_train_1.sh 70 | ``` 71 | 72 | 73 | 74 | ## Inference 75 | After the training phase, we can use APIRetriever to retrieve private APIs for each programming problem description. In detail, we apply $E_{\mathbf{a}}$ to all the APIs and index them by [FAISS](https://github.com/facebookresearch/faiss) offline. Given a new programming problem description $\mathbf{p}$ at run-time, we only need to produce its embedding $v_{\mathbf{p}}=E_{\mathbf{p}}(\mathbf{p})$ and recall the top-$k$ APIs with the embeddings closest to $v_{\mathbf{p}}$. 76 | 77 | First, you should encode the code comments and APIs. 78 | ```shell 79 | bash APIRetriever/src/run_encode_2.sh 80 | ``` 81 | Then, you need to retrieve and rank the APIs for each code comment. 82 | ```shell 83 | bash APIRetriever/src/run_search_3.sh 84 | ``` 85 | Next, you can get the final scores between code comments and their APIs. 86 | ```shell 87 | bash APIRetriever/src/run_trec_format_4.sh 88 | ``` 89 | 90 | > The retrieval results are placed in `APIRetriever/data/inference`. They can be used to add API prompts (top-1, 2, 3, 5, and human-labelled) to our crafted benchmarks. 91 | 92 | ## Citation 93 | If you find our work useful, please cite the paper: 94 | ``` 95 | @inproceedings{APICoder, 96 | title={When Language Model Meets Private Library}, 97 | author={Zan, Daoguang and Chen, Bei and Lin, Zeqi and Guan, Bei and Wang, Yongji and Lou, Jian-Guang}, 98 | booktitle={EMNLP findings}, 99 | year={2022} 100 | } 101 | ``` 102 | -------------------------------------------------------------------------------- /cert/pandas-numpy-eval/pandas_numpy_eval/evaluation.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, Counter 2 | from concurrent.futures import ThreadPoolExecutor, as_completed 3 | from typing import List, Union, Iterable, Dict 4 | import itertools 5 | 6 | import numpy as np 7 | import tqdm 8 | 9 | from pandas_numpy_eval.data import HUMAN_EVAL, read_problems, stream_jsonl, write_jsonl 10 | from pandas_numpy_eval.execution import check_correctness 11 | 12 | 13 | def estimate_pass_at_k( 14 | num_samples: Union[int, List[int], np.ndarray], 15 | num_correct: Union[List[int], np.ndarray], 16 | k: int 17 | ) -> np.ndarray: 18 | """ 19 | Estimates pass@k of each problem and returns them in an array. 20 | """ 21 | 22 | def estimator(n: int, c: int, k: int) -> float: 23 | """ 24 | Calculates 1 - comb(n - c, k) / comb(n, k).
25 | """ 26 | if n - c < k: 27 | return 1.0 28 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) 29 | 30 | if isinstance(num_samples, int): 31 | num_samples_it = itertools.repeat(num_samples, len(num_correct)) 32 | else: 33 | assert len(num_samples) == len(num_correct) 34 | num_samples_it = iter(num_samples) 35 | 36 | return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]) 37 | 38 | 39 | def evaluate_functional_correctness( 40 | sample_file: str, 41 | k: List[int] = [1, 10, 100], 42 | n_workers: int = 4, 43 | timeout: float = 3.0, 44 | problem_file: str = HUMAN_EVAL, 45 | ): 46 | """ 47 | Evaluates the functional correctness of generated samples, and writes 48 | results to f"{sample_file}_results.jsonl.gz" 49 | """ 50 | 51 | problems = read_problems(problem_file) 52 | 53 | # Check the generated samples against test suites. 54 | with ThreadPoolExecutor(max_workers=n_workers) as executor: 55 | 56 | futures = [] 57 | completion_id = Counter() 58 | n_samples = 0 59 | results = defaultdict(list) 60 | 61 | print("Reading samples...") 62 | for sample in tqdm.tqdm(stream_jsonl(sample_file)): 63 | task_id = sample["task_id"] 64 | completion = sample["completion"] 65 | args = (problems[task_id], completion, timeout, completion_id[task_id]) 66 | future = executor.submit(check_correctness, *args) 67 | futures.append(future) 68 | completion_id[task_id] += 1 69 | n_samples += 1 70 | 71 | assert len(completion_id) == len(problems), "Some problems are not attempted." 72 | 73 | print("Running test suites...") 74 | for future in tqdm.tqdm(as_completed(futures), total=len(futures)): 75 | result = future.result() 76 | results[result["task_id"]].append((result["completion_id"], result)) 77 | 78 | # Calculate pass@k. 79 | total, correct = [], [] 80 | for result in results.values(): 81 | result.sort() 82 | passed = [r[1]["passed"] for r in result] 83 | total.append(len(passed)) 84 | correct.append(sum(passed)) 85 | total = np.array(total) 86 | correct = np.array(correct) 87 | 88 | ks = k 89 | pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean() 90 | for k in ks if (total >= k).all()} 91 | 92 | # Finally, save the results in one file: 93 | def combine_results(): 94 | for sample in stream_jsonl(sample_file): 95 | task_id = sample["task_id"] 96 | result = results[task_id].pop(0) 97 | sample["result"] = result[1]["result"] 98 | sample["passed"] = result[1]["passed"] 99 | yield sample 100 | 101 | out_file = sample_file + "_results.jsonl" 102 | print(f"Writing results to {out_file}...") 103 | write_jsonl(out_file, tqdm.tqdm(combine_results(), total=n_samples)) 104 | 105 | def return_pass_at_k(): 106 | yield pass_at_k 107 | metric_file = sample_file + "_metrics.jsonl" 108 | print(f"Writing metrics to {metric_file}...") 109 | write_jsonl(metric_file, return_pass_at_k()) 110 | 111 | return pass_at_k 112 | -------------------------------------------------------------------------------- /apicoder/private-eval/private_eval/evaluation.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, Counter 2 | from concurrent.futures import ThreadPoolExecutor, as_completed 3 | from typing import List, Union, Iterable, Dict 4 | import itertools 5 | 6 | import numpy as np 7 | import tqdm 8 | 9 | from private_eval.data import HUMAN_EVAL, read_problems, stream_jsonl, write_jsonl 10 | from private_eval.execution import check_correctness 11 | 12 | 13 | def estimate_pass_at_k( 14 | num_samples: Union[int, 
List[int], np.ndarray], 15 | num_correct: Union[List[int], np.ndarray], 16 | k: int 17 | ) -> np.ndarray: 18 | """ 19 | Estimates pass@k of each problem and returns them in an array. 20 | """ 21 | 22 | def estimator(n: int, c: int, k: int) -> float: 23 | """ 24 | Calculates 1 - comb(n - c, k) / comb(n, k). 25 | """ 26 | if n - c < k: 27 | return 1.0 28 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) 29 | 30 | if isinstance(num_samples, int): 31 | num_samples_it = itertools.repeat(num_samples, len(num_correct)) 32 | else: 33 | assert len(num_samples) == len(num_correct) 34 | num_samples_it = iter(num_samples) 35 | 36 | return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]) 37 | 38 | 39 | def evaluate_functional_correctness( 40 | sample_file: str, 41 | k: List[int] = [1, 10, 100], 42 | n_workers: int = 4, 43 | timeout: float = 3.0, 44 | problem_file: str = HUMAN_EVAL, 45 | ): 46 | """ 47 | Evaluates the functional correctness of generated samples, and writes 48 | results to f"{sample_file}_results.jsonl" 49 | """ 50 | 51 | problems = read_problems(problem_file) 52 | 53 | # Check the generated samples against test suites. 54 | with ThreadPoolExecutor(max_workers=n_workers) as executor: 55 | 56 | futures = [] 57 | completion_id = Counter() 58 | n_samples = 0 59 | results = defaultdict(list) 60 | 61 | print("Reading samples...") 62 | for sample in tqdm.tqdm(stream_jsonl(sample_file)): 63 | task_id = sample["task_id"] 64 | completion = sample["completion"] 65 | args = (problems[task_id], completion, timeout, completion_id[task_id]) 66 | future = executor.submit(check_correctness, *args) 67 | futures.append(future) 68 | completion_id[task_id] += 1 69 | n_samples += 1 70 | 71 | assert len(completion_id) == len(problems), "Some problems are not attempted." 72 | 73 | print("Running test suites...") 74 | for future in tqdm.tqdm(as_completed(futures), total=len(futures)): 75 | result = future.result() 76 | results[result["task_id"]].append((result["completion_id"], result)) 77 | 78 | print("Starting to calculate pass@k...") 79 | # Calculate pass@k.
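# A worked example of the estimator above (illustrative values only, not part
# of the original script): with n = 5 samples, c = 2 correct, and k = 2,
#     pass@2 = 1 - comb(5 - 2, 2) / comb(5, 2) = 1 - 3 / 10 = 0.7,
# and the numerically stable product form used in estimator() agrees:
#     1 - (1 - 2/4) * (1 - 2/5) = 1 - 0.5 * 0.6 = 0.7.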
80 | total, correct = [], [] 81 | for result in results.values(): 82 | result.sort() 83 | passed = [r[1]["passed"] for r in result] 84 | total.append(len(passed)) 85 | correct.append(sum(passed)) 86 | total = np.array(total) 87 | correct = np.array(correct) 88 | 89 | ks = k 90 | pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean() 91 | for k in ks if (total >= k).all()} 92 | print("Pass@k:", pass_at_k) 93 | 94 | # Finally, save the results in one file: 95 | def combine_results(): 96 | for sample in stream_jsonl(sample_file): 97 | task_id = sample["task_id"] 98 | result = results[task_id].pop(0) 99 | sample["result"] = result[1]["result"] 100 | sample["passed"] = result[1]["passed"] 101 | yield sample 102 | 103 | out_file = sample_file + "_results.jsonl" 104 | print(f"Writing results to {out_file}...") 105 | write_jsonl(out_file, tqdm.tqdm(combine_results(), total=n_samples)) 106 | 107 | def return_pass_at_k(): 108 | yield pass_at_k 109 | metric_file = sample_file + "_metrics.jsonl" 110 | print(f"Writing metrics to {metric_file}...") 111 | write_jsonl(metric_file, return_pass_at_k()) 112 | 113 | return pass_at_k 114 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/src/dense/trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from itertools import repeat 3 | from typing import Dict, List, Tuple, Optional, Any, Union 4 | 5 | from transformers.trainer import Trainer 6 | 7 | import torch 8 | from torch.utils.data import DataLoader 9 | import torch.distributed as dist 10 | 11 | from .loss import SimpleContrastiveLoss, DistributedContrastiveLoss 12 | 13 | import logging 14 | logger = logging.getLogger(__name__) 15 | 16 | try: 17 | from grad_cache import GradCache 18 | _grad_cache_available = True 19 | except ModuleNotFoundError: 20 | _grad_cache_available = False 21 | 22 | 23 | class DenseTrainer(Trainer): 24 | def __init__(self, *args, **kwargs): 25 | super(DenseTrainer, self).__init__(*args, **kwargs) 26 | self._dist_loss_scale_factor = dist.get_world_size() if self.args.negatives_x_device else 1 27 | 28 | def _save(self, output_dir: Optional[str] = None): 29 | output_dir = output_dir if output_dir is not None else self.args.output_dir 30 | os.makedirs(output_dir, exist_ok=True) 31 | logger.info("Saving model checkpoint to %s", output_dir) 32 | self.model.save(output_dir) 33 | 34 | def _prepare_inputs( 35 | self, 36 | inputs: Tuple[Dict[str, Union[torch.Tensor, Any]], ...] 
37 | ) -> List[Dict[str, Union[torch.Tensor, Any]]]: 38 | prepared = [] 39 | for x in inputs: 40 | if isinstance(x, torch.Tensor): 41 | prepared.append(x.to(self.args.device)) 42 | else: 43 | prepared.append(super()._prepare_inputs(x)) 44 | return prepared 45 | 46 | def get_train_dataloader(self) -> DataLoader: 47 | if self.train_dataset is None: 48 | raise ValueError("Trainer: training requires a train_dataset.") 49 | train_sampler = self._get_train_sampler() 50 | 51 | return DataLoader( 52 | self.train_dataset, 53 | batch_size=self.args.train_batch_size, 54 | sampler=train_sampler, 55 | collate_fn=self.data_collator, 56 | drop_last=True, 57 | num_workers=self.args.dataloader_num_workers, 58 | ) 59 | 60 | def compute_loss(self, model, inputs): 61 | query, passage = inputs 62 | return model(query=query, passage=passage).loss 63 | 64 | def training_step(self, *args): 65 | return super(DenseTrainer, self).training_step(*args) / self._dist_loss_scale_factor 66 | 67 | 68 | def split_dense_inputs(model_input: dict, chunk_size: int): 69 | assert len(model_input) == 1 70 | arg_key = list(model_input.keys())[0] 71 | arg_val = model_input[arg_key] 72 | 73 | keys = list(arg_val.keys()) 74 | chunked_tensors = [arg_val[k].split(chunk_size, dim=0) for k in keys] 75 | chunked_arg_val = [dict(zip(kk, tt)) for kk, tt in zip(repeat(keys), zip(*chunked_tensors))] 76 | 77 | return [{arg_key: c} for c in chunked_arg_val] 78 | 79 | 80 | def get_dense_rep(x): 81 | if x.q_reps is None: 82 | return x.p_reps 83 | else: 84 | return x.q_reps 85 | 86 | 87 | class GCTrainer(DenseTrainer): 88 | def __init__(self, *args, **kwargs): 89 | logger.info('Initializing Gradient Cache Trainer') 90 | if not _grad_cache_available: 91 | raise ValueError( 92 | 'Grad Cache package not available. 
You can obtain it from https://github.com/luyug/GradCache.') 93 | super(GCTrainer, self).__init__(*args, **kwargs) 94 | 95 | loss_fn_cls = DistributedContrastiveLoss if self.args.negatives_x_device else SimpleContrastiveLoss 96 | loss_fn = loss_fn_cls(self.model.data_args.train_n_passages) 97 | 98 | self.gc = GradCache( 99 | models=[self.model, self.model], 100 | chunk_sizes=[self.args.gc_q_chunk_size, self.args.gc_p_chunk_size], 101 | loss_fn=loss_fn, 102 | split_input_fn=split_dense_inputs, 103 | get_rep_fn=get_dense_rep, 104 | fp16=self.args.fp16, 105 | scaler=self.scaler 106 | ) 107 | 108 | def training_step(self, model, inputs) -> torch.Tensor: 109 | model.train() 110 | queries, passages = self._prepare_inputs(inputs) 111 | queries, passages = {'query': queries}, {'passage': passages} 112 | 113 | _distributed = self.args.local_rank > -1 114 | self.gc.models = [model, model] 115 | loss = self.gc(queries, passages, no_sync_except_last=_distributed) 116 | 117 | return loss / self._dist_loss_scale_factor 118 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/build/lib/dense/trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from itertools import repeat 3 | from typing import Dict, List, Tuple, Optional, Any, Union 4 | 5 | from transformers.trainer import Trainer 6 | 7 | import torch 8 | from torch.utils.data import DataLoader 9 | import torch.distributed as dist 10 | 11 | from .loss import SimpleContrastiveLoss, DistributedContrastiveLoss 12 | 13 | import logging 14 | logger = logging.getLogger(__name__) 15 | 16 | try: 17 | from grad_cache import GradCache 18 | _grad_cache_available = True 19 | except ModuleNotFoundError: 20 | _grad_cache_available = False 21 | 22 | 23 | class DenseTrainer(Trainer): 24 | def __init__(self, *args, **kwargs): 25 | super(DenseTrainer, self).__init__(*args, **kwargs) 26 | self._dist_loss_scale_factor = dist.get_world_size() if self.args.negatives_x_device else 1 27 | 28 | def _save(self, output_dir: Optional[str] = None): 29 | output_dir = output_dir if output_dir is not None else self.args.output_dir 30 | os.makedirs(output_dir, exist_ok=True) 31 | logger.info("Saving model checkpoint to %s", output_dir) 32 | self.model.save(output_dir) 33 | 34 | def _prepare_inputs( 35 | self, 36 | inputs: Tuple[Dict[str, Union[torch.Tensor, Any]], ...] 
37 | ) -> List[Dict[str, Union[torch.Tensor, Any]]]: 38 | prepared = [] 39 | for x in inputs: 40 | if isinstance(x, torch.Tensor): 41 | prepared.append(x.to(self.args.device)) 42 | else: 43 | prepared.append(super()._prepare_inputs(x)) 44 | return prepared 45 | 46 | def get_train_dataloader(self) -> DataLoader: 47 | if self.train_dataset is None: 48 | raise ValueError("Trainer: training requires a train_dataset.") 49 | train_sampler = self._get_train_sampler() 50 | 51 | return DataLoader( 52 | self.train_dataset, 53 | batch_size=self.args.train_batch_size, 54 | sampler=train_sampler, 55 | collate_fn=self.data_collator, 56 | drop_last=True, 57 | num_workers=self.args.dataloader_num_workers, 58 | ) 59 | 60 | def compute_loss(self, model, inputs): 61 | query, passage = inputs 62 | return model(query=query, passage=passage).loss 63 | 64 | def training_step(self, *args): 65 | return super(DenseTrainer, self).training_step(*args) / self._dist_loss_scale_factor 66 | 67 | 68 | def split_dense_inputs(model_input: dict, chunk_size: int): 69 | assert len(model_input) == 1 70 | arg_key = list(model_input.keys())[0] 71 | arg_val = model_input[arg_key] 72 | 73 | keys = list(arg_val.keys()) 74 | chunked_tensors = [arg_val[k].split(chunk_size, dim=0) for k in keys] 75 | chunked_arg_val = [dict(zip(kk, tt)) for kk, tt in zip(repeat(keys), zip(*chunked_tensors))] 76 | 77 | return [{arg_key: c} for c in chunked_arg_val] 78 | 79 | 80 | def get_dense_rep(x): 81 | if x.q_reps is None: 82 | return x.p_reps 83 | else: 84 | return x.q_reps 85 | 86 | 87 | class GCTrainer(DenseTrainer): 88 | def __init__(self, *args, **kwargs): 89 | logger.info('Initializing Gradient Cache Trainer') 90 | if not _grad_cache_available: 91 | raise ValueError( 92 | 'Grad Cache package not available. You can obtain it from https://github.com/luyug/GradCache.') 93 | super(GCTrainer, self).__init__(*args, **kwargs) 94 | 95 | loss_fn_cls = DistributedContrastiveLoss if self.args.negatives_x_device else SimpleContrastiveLoss 96 | loss_fn = loss_fn_cls(self.model.data_args.train_n_passages) 97 | 98 | self.gc = GradCache( 99 | models=[self.model, self.model], 100 | chunk_sizes=[self.args.gc_q_chunk_size, self.args.gc_p_chunk_size], 101 | loss_fn=loss_fn, 102 | split_input_fn=split_dense_inputs, 103 | get_rep_fn=get_dense_rep, 104 | fp16=self.args.fp16, 105 | scaler=self.scaler 106 | ) 107 | 108 | def training_step(self, model, inputs) -> torch.Tensor: 109 | model.train() 110 | queries, passages = self._prepare_inputs(inputs) 111 | queries, passages = {'query': queries}, {'passage': passages} 112 | 113 | _distributed = self.args.local_rank > -1 114 | self.gc.models = [model, model] 115 | loss = self.gc(queries, passages, no_sync_except_last=_distributed) 116 | 117 | return loss / self._dist_loss_scale_factor 118 | -------------------------------------------------------------------------------- /apicoder/CodeGenAPI/run_private.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WANDB_PROJECT="Your Project Name" 4 | export WANDB_API_KEY="Your API Key of WANDB" 5 | 6 | BASE_DATA_DIR="your/base/dir" 7 | if [ ! -z "$AMLT_DATA_DIR" ]; then 8 | echo "Run experiment on AMLT." 9 | BASE_DATA_DIR=$AMLT_DATA_DIR 10 | fi 11 | 12 | DOMAIN="PrivateLibrary" 13 | TYPE="private_libs_bin_codegen_v2" 14 | 15 | DATA_DIR="${BASE_DATA_DIR}/XXX/${DOMAIN}/${TYPE}" 16 | 17 | N_GPUS="8" 18 | NODE_SIZE="1" 19 | 20 | if [ ! 
-z "$1" ]; then 21 | N_GPUS=$1 22 | fi 23 | 24 | BATCH_SIZE=8 # 24G:7 32G:8 16G:6 25 | MAX_STEPS=500_000 26 | BLOCK_SIZE=1024 27 | GRAD_ACC_STEPS=2 28 | WARMUP_STEPS=1_000 29 | SAVE_STEPS=2_000 30 | 31 | LR="5e-4" 32 | WD="0.1" 33 | 34 | # DO NOT take func score into consideration for resampling by setting a const weight 1.0 35 | RS_WEIGHTS="1.0_0.5_1.0" #_0.5" 36 | GAS="" #"512K_150K" #default is const 37 | 38 | OUTPUT_DIR="${BASE_DATA_DIR}/XXX/${DOMAIN}/experiments_codegen_v2/" 39 | CKPT_NAME="" 40 | 41 | STEP_SUMMARY="${WARMUP_STEPS}K_${MAX_STEPS}K_${SAVE_STEPS}K" 42 | STEP_SUMMARY=${STEP_SUMMARY//_000/} 43 | 44 | # Resampling with weight 0.4 45 | # GRAD_ACC_STEPS + 1.0 per epoch 46 | ID="" 47 | 48 | if [ ! -z "$RS_WEIGHTS" ]; then 49 | ID="RS_${RS_WEIGHTS}" 50 | fi 51 | 52 | if [ ! -z "$GAS" ]; then 53 | ID="${ID}-GAS_${GAS}" 54 | else 55 | GAS="const" 56 | fi 57 | 58 | if [ ! -z "$CKPT_NAME" ]; then 59 | ID="${ID}-RSUME" 60 | fi 61 | 62 | ACTUAL_GPUS=$((${N_GPUS}*${NODE_SIZE})) 63 | RUN_NAME="${BATCH_SIZE}x${GRAD_ACC_STEPS}x${ACTUAL_GPUS}x${BLOCK_SIZE}-${LR}-${WD}-${STEP_SUMMARY}-${ID}" 64 | RUN_OUTPUT_DIR="$OUTPUT_DIR/$RUN_NAME" 65 | 66 | echo "Experiment Run Name: $RUN_NAME" 67 | echo "Data Dir:" $DATA_DIR 68 | echo "Actual GPUs:" $ACTUAL_GPUS 69 | export DISTRIBUTED_GPU_SIZE=$ACTUAL_GPUS 70 | 71 | echo "Output Dir:" $OUTPUT_DIR 72 | echo "Init Actual Batch Size: ${BATCH_SIZE}x${GRAD_ACC_STEPS}x${N_GPUS}x${NODE_SIZE}" 73 | 74 | Run_Command_Args=" --model_name_or_path $DATA_DIR/model" 75 | Run_Command_Args="$Run_Command_Args --run_name $RUN_NAME" 76 | Run_Command_Args="$Run_Command_Args --output_dir $RUN_OUTPUT_DIR" 77 | Run_Command_Args="$Run_Command_Args --train_file $DATA_DIR/train" 78 | Run_Command_Args="$Run_Command_Args --validation_file $DATA_DIR/valid" 79 | Run_Command_Args="$Run_Command_Args --do_train" 80 | Run_Command_Args="$Run_Command_Args --do_eval" 81 | 82 | Run_Command_Args="$Run_Command_Args --block_size $BLOCK_SIZE" 83 | Run_Command_Args="$Run_Command_Args --logging_steps 100" 84 | Run_Command_Args="$Run_Command_Args --evaluation_strategy steps" 85 | Run_Command_Args="$Run_Command_Args --eval_steps $SAVE_STEPS" 86 | Run_Command_Args="$Run_Command_Args --save_steps $SAVE_STEPS" 87 | Run_Command_Args="$Run_Command_Args --warmup_steps $WARMUP_STEPS" 88 | Run_Command_Args="$Run_Command_Args --learning_rate $LR" 89 | Run_Command_Args="$Run_Command_Args --adam_beta2 0.95" 90 | Run_Command_Args="$Run_Command_Args --lr_scheduler_type cosine" 91 | Run_Command_Args="$Run_Command_Args --resampling_weights $RS_WEIGHTS" 92 | 93 | Run_Command_Args="$Run_Command_Args --max_steps $MAX_STEPS" 94 | Run_Command_Args="$Run_Command_Args --per_device_train_batch_size $BATCH_SIZE" 95 | Run_Command_Args="$Run_Command_Args --per_device_eval_batch_size $BATCH_SIZE" 96 | Run_Command_Args="$Run_Command_Args --gradient_accumulation_steps $GRAD_ACC_STEPS" 97 | Run_Command_Args="$Run_Command_Args --weight_decay $WD" 98 | Run_Command_Args="$Run_Command_Args --fp16" 99 | Run_Command_Args="$Run_Command_Args --report_to wandb" 100 | 101 | if [ ! -z "$GAS" ]; then 102 | Run_Command_Args="$Run_Command_Args --gradient_accumulation_strategy $GAS" 103 | fi 104 | 105 | if [ ! 
-z "$CKPT_NAME" ]; then 106 | CKPT_PATH="$OUTPUT_DIR/$CKPT_NAME" 107 | echo "Resume from checkpoint: $CKPT_PATH" 108 | Run_Command_Args="$Run_Command_Args --resume_from_checkpoint $CKPT_PATH --ignore_data_skip" 109 | fi 110 | 111 | 112 | echo "Run Command Args: $Run_Command_Args" 113 | 114 | # deepspeed --num_gpus $N_GPUS run_gpt.py --deepspeed configs/ds_config.json $Run_Command_Args 115 | # deepspeed --num_gpus $N_GPUS run_gpt.py --deepspeed configs/ds_config_zero3.json $Run_Command_Args 116 | 117 | # if [ ! -z "$NODE_RANK" ]; then 118 | # echo "Run distributed training on multi nodes $NODE_RANK/$NODE_SIZE, master ip = $MASTER_ADDR:$MASTER_PORT" 119 | # python -m torch.distributed.launch \ 120 | # --nproc_per_node=$N_GPUS \ 121 | # --nnodes=$NODE_SIZE \ 122 | # --node_rank=$NODE_RANK \ 123 | # --master_addr=$MASTER_ADDR \ 124 | # --master_port=$MASTER_PORT \ 125 | # --use_env run_cert.py $Run_Command_Args 126 | # else 127 | python -m torch.distributed.launch --nproc_per_node $N_GPUS --use_env run_private.py $Run_Command_Args 128 | # fi -------------------------------------------------------------------------------- /eval_human_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import argparse 4 | from tqdm import tqdm 5 | 6 | from transformers import pipeline, set_seed 7 | from transformers import AutoTokenizer, AutoModelForCausalLM 8 | from transformers.pipelines.base import Pipeline 9 | 10 | from human_eval.data import write_jsonl, read_problems 11 | 12 | def load_generation_pipe(model_name_or_path: str, gpu_device: int=0): 13 | model = AutoModelForCausalLM.from_pretrained(model_name_or_path) 14 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 15 | 16 | pipe = pipeline( 17 | 'text-generation', 18 | model=model, 19 | tokenizer=tokenizer, 20 | device=gpu_device 21 | ) 22 | 23 | print("Loaded generation pipeline from {}: vocab size = {}, eos id = {}, gpu device = {}.".format( 24 | model_name_or_path, len(tokenizer), tokenizer.eos_token_id, gpu_device) 25 | ) 26 | 27 | return pipe 28 | 29 | def extract_function_block(string): 30 | return re.split("\nclass|\ndef|\n#|\n@|\nprint|\nif", string)[0].rstrip() 31 | 32 | def run_code_generation(pipe, prompt, num_completions=1, **gen_kwargs): 33 | set_seed(123) 34 | 35 | code_gens = pipe(prompt, 36 | num_return_sequences=num_completions, 37 | **gen_kwargs 38 | ) 39 | 40 | return [extract_function_block(code_gen["generated_text"][len(prompt):]) for code_gen in code_gens] 41 | 42 | def evaluate_on_human_eval( 43 | model_name_or_path: str, 44 | temperature: float, 45 | top_p: float, 46 | num_samples_per_task: int, 47 | max_new_tokens: int, 48 | gpu_device: int, 49 | output_dir: str, 50 | ) -> str: 51 | 52 | pipe: Pipeline = load_generation_pipe(model_name_or_path, gpu_device=gpu_device) 53 | eval_name = f"human_eval.t{temperature}.p{top_p}.l{max_new_tokens}.n{num_samples_per_task}" 54 | 55 | if output_dir is None: 56 | if os.path.exists(model_name_or_path): 57 | output_dir = model_name_or_path 58 | else: 59 | raise ValueError("Output dir can't be null if you are not evaluating a local model.") 60 | 61 | os.makedirs(output_dir, exist_ok=True) 62 | saved_path = os.path.join(output_dir, f"{eval_name}.samples.jsonl") 63 | 64 | gen_kwargs = { 65 | "do_sample": True, 66 | "temperature": temperature, 67 | "max_new_tokens": max_new_tokens, 68 | "top_p": top_p, 69 | "top_k": 0, 70 | "pad_token_id": pipe.tokenizer.pad_token_id if pipe.tokenizer.pad_token_id else
pipe.tokenizer.eos_token_id, 71 | "eos_token_id": pipe.tokenizer.eos_token_id 72 | } 73 | 74 | problems = read_problems() 75 | samples = [] 76 | generate_batch_size = min(50, num_samples_per_task) 77 | 78 | bos_token = pipe.tokenizer.bos_token if pipe.tokenizer.bos_token else pipe.tokenizer.eos_token 79 | 80 | for task_id in tqdm(problems): 81 | # The strip operation is important, as the new tokenizer will not treat '\n' as an independent token 82 | prompt = problems[task_id]["prompt"].strip() 83 | 84 | for _ in range(num_samples_per_task // generate_batch_size): 85 | input_prompt = bos_token + prompt 86 | gen_results = run_code_generation(pipe, input_prompt, num_completions=generate_batch_size, **gen_kwargs) 87 | for gen_result in gen_results: 88 | samples.append(dict(task_id=task_id, completion=gen_result)) 89 | 90 | write_jsonl(saved_path, samples) 91 | print("Generation finished. Saved {} samples to {}.".format(len(samples), saved_path)) 92 | 93 | if __name__ == '__main__': 94 | parser = argparse.ArgumentParser(description='Run evaluation for code generation model on human-eval.') 95 | 96 | parser.add_argument('-model', '--model_name_or_path', type=str, required=True) 97 | parser.add_argument('-o', '--output_dir', type=str, default=None) 98 | parser.add_argument('-n', '--num_completions', type=int, default=100) 99 | parser.add_argument('-t', '--temperature', type=float, default=0.2) 100 | parser.add_argument('-p', '--top_p', type=float, default=0.95) 101 | parser.add_argument('-l', '--max_new_tokens', type=int, default=100) 102 | parser.add_argument('-gpu', "--gpu_device", type=int, default=0) 103 | 104 | args = parser.parse_args() 105 | 106 | evaluate_on_human_eval( 107 | model_name_or_path=args.model_name_or_path, 108 | temperature=args.temperature, 109 | top_p=args.top_p, 110 | num_samples_per_task=args.num_completions, 111 | max_new_tokens=args.max_new_tokens, 112 | gpu_device=args.gpu_device, 113 | output_dir=args.output_dir, 114 | ) 115 | pass -------------------------------------------------------------------------------- /cert/pandas-numpy-eval/README.md: -------------------------------------------------------------------------------- 1 | # PandasEval and NumpyEval 2 | 3 | Two benchmarks for evaluating the performance of library-oriented code generation. They are proposed in the paper "[CERT: Continual Pre-Training on Sketches for Library-Oriented Code Generation](https://arxiv.org/pdf/2206.06888.pdf)". 4 | 5 | The evaluation script is adapted from OpenAI's [humaneval](https://github.com/openai/human-eval/tree/master/human_eval). 6 | 7 | ## Installation 8 | 9 | Make sure to use Python 3.7 or later: 10 | ``` 11 | $ conda create -n pycodegpt python=3.7 12 | $ conda activate pycodegpt 13 | ``` 14 | 15 | Check out and install this repository: 16 | ``` 17 | $ pip install -e pandas-numpy-eval 18 | ``` 19 | 20 | ## Configuration 21 | ``` 22 | ├── data # The directory of our crafted benchmarks. 23 | │   ├── NumpyEval.jsonl.gz 24 | │   └── PandasEval.jsonl.gz 25 | ├── pandas_numpy_eval 26 | │   ├── data.py # Choosing whether to load PandasEval or NumpyEval. 27 | │   ├── evaluate_functional_correctness.py # Calculating the evaluation results. 28 | │   ├── evaluation.py # Calculating the evaluation results. 29 | │   └── execution.py # Executing the predicted code. 30 | ``` 31 | 32 | ## Running Environment Testing 33 | 34 | You need to replace `XXX` with your local path for testing the pandas results. (Make sure that the `LIB` variable in `pandas-numpy-eval/pandas_numpy_eval/data.py` is set to `pandas`.)
35 | ``` 36 | $ evaluate_functional_correctness XXX/CERT/pandas-numpy-eval/data/Example_Pandas_PYCODEGPT_samples.jsonl 37 | ``` 38 | 39 | If you can successfully run the above command and obtain the following results, the evaluation environment is ready to use. 40 | ``` 41 | {'pass@1': 0.06930693069306931} 42 | ``` 43 | 44 | # The Process of Constructing PandasEval and NumpyEval 45 | 46 | We refer to [StackOverFlow](https://stackoverflow.com/), a Q&A website for programmers, to build the benchmarks. We search for posts using the library tag on StackOverFlow, and select those with high votes. To ensure quality, we only refer to posts with accepted answers. We go through a post's question and its accepted answer, then manually organize them into the form needed for our benchmarks, containing both context and target code. We also polish all programming problems so that the problem descriptions are clear and the code is correct. Note that we keep the intentions and the descriptions of the programming problems consistent with the posts to the maximum extent. Finally, two programmers with more than three years of coding experience in the library are invited to act as code generation models and check the quality of the data. 47 | 48 | As a result, we craft 101 programming problems for PandasEval and NumpyEval, respectively. Each programming problem is equipped with test cases for evaluation. 49 | 50 | # Two Examples of Programming Problems 51 | 52 | Context is shown with a white background and the target code with a gray background. 53 | 54 | 55 | 56 | 57 | 58 | ## Reference 59 | 60 | If you use PandasEval or NumpyEval in your work, please cite the paper: 61 | 62 | ``` 63 | @inproceedings{CERT, 64 | title={{CERT}: Continual Pre-training on Sketches for Library-oriented Code Generation}, 65 | author={Zan, Daoguang and Chen, Bei and Yang, Dejian and Lin, Zeqi and Kim, Minsu and Guan, Bei and Wang, Yongji and Chen, Weizhu and Lou, Jian-Guang}, 66 | booktitle={The 2022 International Joint Conference on Artificial Intelligence}, 67 | year={2022} 68 | } 69 | ``` 70 | 71 | If you use the evaluation script, please also cite the following paper: 72 | ``` 73 | @article{codex, 74 | title={Evaluating Large Language Models Trained on Code}, 75 | author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N.
Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba}, 76 | year={2021}, 77 | eprint={2107.03374}, 78 | archivePrefix={arXiv}, 79 | primaryClass={cs.LG} 80 | } 81 | ``` 82 | -------------------------------------------------------------------------------- /cert/README.md: -------------------------------------------------------------------------------- 1 | # CERT: Continual Pre-Training on Sketches for Library-Oriented Code Generation 2 | 3 | Official repository for our paper ["CERT: Continual Pre-Training on Sketches for Library-Oriented Code Generation"](https://arxiv.org/pdf/2206.06888.pdf), containing crafted benchmarks, code, and pre-trained models. 4 | 5 | --- 6 | 7 | ## Overview 8 | 9 | In our paper, we focus on investigating whether and how language models pre-trained on a large-scale unlabelled code corpus can generate library-oriented code snippets. To meet this challenge, we propose CERT (for sket**C**her and g**E**ne**R**a**T**or), a continual pre-training approach on sketches for library-oriented code generation. In CERT, a sketcher first predicts a sketch, which omits user-defined details; then, a generator uses the sketch as a prompt to generate the complete code. In addition, we craft two evaluation benchmarks for Python libraries, called PandasEval and NumpyEval, each including 101 programming problems using Pandas and NumPy, respectively. 10 | 11 | 12 | 13 | Figure 1: Overview of CERT: a sketcher and a generator. 14 | 15 | ## Project Directory 16 | ``` 17 | ├── nl2code # Basic scripts for loading corpus and training CERT. 18 | ├── code_dataset.py 19 | ├── dynamic_block_dataset.py 20 | ├── hf_trainer.py 21 | └── indexed_dataset.py 22 | ├── pandas-numpy-eval # Benchmarks and evaluation scripts. Please go to the folder for details. 23 | ├── scripts 24 | ├── ast_utils.py # Tools to handle the AST of Python code, for example, converting a code block to its code sketch. 25 | ├── encode_domain.py # Implementation of encoding. 26 | ├── file_utils.py # Tools for managing files. 27 | ├── multiprocessing_utils.py # Tools for managing multiple processes. 28 | └── run_encode_domain.sh # Encoding the crafted corpus (sketcher corpus and generator corpus). 29 | ├── eval_cert_unified.py # Implementation of code generation for CERT-sketcher and CERT-generator. 30 | ├── eval_cert.py # Implementation of code generation for PyCodeGPT and other baseline models. 31 | ├── run_cert.py # Implementation of CERT training. 32 | ├── run_evaluating_codes.sh # The entry script for evaluating the generated code snippets, and outputting the final results (pass@k). 33 | ├── run_generating_codes.sh # The entry script for CERT inference, which can generate a lot of code snippets for each programming problem in PandasEval and NumpyEval. 34 | ├── run_training_cert.sh # The entry script for training CERT. 35 | ``` 36 | 37 | ## Quickstart 38 | 39 | This section covers environment, data preparation, model inference, and model training.
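Before the step-by-step guide, here is a minimal sketch of how the two stages fit together at inference time (illustrative only: the checkpoint paths and the prompt are placeholders, and the actual logic lives in `eval_cert_unified.py`):

```
from transformers import pipeline

# Hypothetical local checkpoints produced by run_training_cert.sh.
sketcher = pipeline("text-generation", model="models/cert-pandas-sketcher")
generator = pipeline("text-generation", model="models/cert-pandas-generator")

prompt = "# Sort the dataframe df by the column 'age'\n"

# Stage 1: the sketcher predicts a sketch that omits user-defined details.
sketch = sketcher(prompt, max_new_tokens=100)[0]["generated_text"]

# Stage 2: the generator uses the sketch as a prompt to produce complete code.
code = generator(sketch, max_new_tokens=100)[0]["generated_text"]
print(code)
```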
40 | 41 | ### Preparation 42 | 43 | 1. Configuring your runtime environment 44 | 45 | ``` 46 | $ cd CERT/ 47 | $ pip install -r requirements.txt 48 | ``` 49 | 50 | 2. Preparation of pre-trained models 51 | 52 | Download the pre-trained checkpoint (e.g., `pycodegpt-110M`) from `Releases` in this GitHub project and place it in the corresponding folder (e.g., `CERT/models/pycodegpt-110M`). 53 | 54 | 3. Updating the scripts according to your local path 55 | 56 | - Update `run_training_cert.sh`. 57 | - Update `run_generating_codes.sh`. 58 | - Update `run_evaluating_codes.sh`. 59 | 60 | ### Use PyCodeGPT or CERT 61 | 62 | First, multiple code snippets are generated for each programming problem (`run_generating_codes.sh`). Then, the code snippets are evaluated (`run_evaluating_codes.sh`). 63 | 64 | ``` 65 | $ bash run_generating_codes.sh 66 | $ bash run_evaluating_codes.sh 67 | ``` 68 | 69 | ### Train CERT 70 | 71 | Train CERT (sketcher and generator) on the large-scale code corpus with the following command. 72 | 73 | ``` 74 | $ bash run_training_cert.sh 75 | ``` 76 | 77 | ## Experiments and Some Cases 78 | 79 | In the inference phase, we set the `temperature` to one of `[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]`, the number of samples (`NUM_SAMPLES`) to `200`, the max number of generated tokens (`MAX_TOKNES`) to `100`, and the `top_p` to `0.9`. The best number is reported across the above hyper-parameters. 80 | 81 | Here are some cases: 82 | 83 | 1. Both the sketcher and the generator predict successfully. (This usually occurs when there are more user-defined terms.) 84 | 85 | 2. The sketcher predicts the correct answer directly. (This usually occurs when there are relatively few or no user-defined terms.) 86 | 87 | 3. The sketcher predicts a wrong sketch, but the generator can rectify it and predict the correct answer. 88 | 89 | 90 | 91 | ## Citation 92 | If you find our work useful, please cite the paper: 93 | ``` 94 | @inproceedings{CERT, 95 | title={{CERT}: Continual Pre-training on Sketches for Library-oriented Code Generation}, 96 | author={Zan, Daoguang and Chen, Bei and Yang, Dejian and Lin, Zeqi and Kim, Minsu and Guan, Bei and Wang, Yongji and Chen, Weizhu and Lou, Jian-Guang}, 97 | booktitle={The 2022 International Joint Conference on Artificial Intelligence}, 98 | year={2022} 99 | } 100 | ``` -------------------------------------------------------------------------------- /cert/run_training_cert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # Licensed under the MIT license. 4 | 5 | # setup for wandb 6 | export WANDB_PROJECT="CERT" 7 | export WANDB_API_KEY="Your wandb api key" 8 | 9 | BASE_DATA_DIR="Your base data directory" 10 | if [ ! -z "$AMLT_DATA_DIR" ]; then 11 | echo "Run experiment on AMLT." 12 | BASE_DATA_DIR=$AMLT_DATA_DIR 13 | fi 14 | 15 | # [Pandas, Numpy] 16 | DOMAIN="Pandas" 17 | 18 | # [normal, sketcher, generator] 19 | TYPE="generator" 20 | 21 | # ------------------Pandas------# ------Numpy-------------------------------- 22 | # You should replace the following variables according to your own settings. 23 | # ------------------Pandas------# ------Numpy-------------------------------- 24 | DATA_DIR="${BASE_DATA_DIR}/CERT/${DOMAIN}/${TYPE}_bin" 25 | 26 | N_GPUS="1" 27 | NODE_SIZE="1" 28 | 29 | if [ ! -z "$1" ]; then 30 | N_GPUS=$1 31 | fi 32 | 33 | if [ !
-z "$2" ]; then 34 | NODE_SIZE=$2 35 | fi 36 | 37 | BATCH_SIZE=1 # 24G:7 32G:8 16G:6 38 | MAX_STEPS=100_000 39 | BLOCK_SIZE=1024 40 | GRAD_ACC_STEPS=2 41 | WARMUP_STEPS=1_000 42 | SAVE_STEPS=2_000 43 | 44 | LR="5e-4" 45 | WD="0.1" 46 | 47 | # DO NOT take func score into consideration for resampling by setting a const weight 1.0 48 | RS_WEIGHTS="1.0_0.5_1.0" #_0.5" 49 | GAS="" #"512K_150K" #default is const 50 | 51 | # -------------------------------------------------------------------------- 52 | # You should replace the following variables according to your own settings. 53 | # -------------------------------------------------------------------------- 54 | OUTPUT_DIR="${BASE_DATA_DIR}/CERT/${DOMAIN}/experiments/${TYPE}_models" 55 | CKPT_NAME="" 56 | 57 | if [ ! -z "$AMLT_DATA_DIR" ]; then 58 | OUTPUT_DIR="$AMLT_DATA_DIR/CERT/${DOMAIN}/experiments/${TYPE}_models" 59 | BATCH_SIZE=10 60 | GRAD_ACC_STEPS=3 61 | 62 | fi 63 | 64 | STEP_SUMMARY="${WARMUP_STEPS}K_${MAX_STEPS}K_${SAVE_STEPS}K" 65 | STEP_SUMMARY=${STEP_SUMMARY//_000/} 66 | 67 | # Resampling with weight 0.4 68 | # GRAD_ACC_STEPS + 1.0 per epoch 69 | ID="" 70 | 71 | if [ ! -z "$RS_WEIGHTS" ]; then 72 | ID="RS_${RS_WEIGHTS}" 73 | fi 74 | 75 | if [ ! -z "$GAS" ]; then 76 | ID="${ID}-GAS_${GAS}" 77 | else 78 | GAS="const" 79 | fi 80 | 81 | if [ ! -z "$CKPT_NAME" ]; then 82 | ID="${ID}-RSUME" 83 | fi 84 | 85 | ACTUAL_GPUS=$((${N_GPUS}*${NODE_SIZE})) 86 | RUN_NAME="${BATCH_SIZE}x${GRAD_ACC_STEPS}x${ACTUAL_GPUS}x${BLOCK_SIZE}-${LR}-${WD}-${STEP_SUMMARY}-${ID}" 87 | RUN_OUTPUT_DIR="$OUTPUT_DIR/$RUN_NAME" 88 | 89 | echo "Experiment Run Name: $RUN_NAME" 90 | echo "Data Dir:" $DATA_DIR 91 | echo "Actual GPUs:" $ACTUAL_GPUS 92 | export DISTRIBUTED_GPU_SIZE=$ACTUAL_GPUS 93 | 94 | echo "Output Dir:" $OUTPUT_DIR 95 | echo "Init Actual Batch Size: ${BATCH_SIZE}x${GRAD_ACC_STEPS}x${N_GPUS}x${NODE_SIZE}" 96 | 97 | Run_Command_Args=" --model_name_or_path $DATA_DIR/model" 98 | Run_Command_Args="$Run_Command_Args --run_name $RUN_NAME" 99 | Run_Command_Args="$Run_Command_Args --output_dir $RUN_OUTPUT_DIR" 100 | Run_Command_Args="$Run_Command_Args --train_file $DATA_DIR/train" 101 | Run_Command_Args="$Run_Command_Args --validation_file $DATA_DIR/valid" 102 | Run_Command_Args="$Run_Command_Args --do_train" 103 | Run_Command_Args="$Run_Command_Args --do_eval" 104 | 105 | Run_Command_Args="$Run_Command_Args --block_size $BLOCK_SIZE" 106 | Run_Command_Args="$Run_Command_Args --logging_steps 100" 107 | Run_Command_Args="$Run_Command_Args --evaluation_strategy steps" 108 | Run_Command_Args="$Run_Command_Args --eval_steps $SAVE_STEPS" 109 | Run_Command_Args="$Run_Command_Args --save_steps $SAVE_STEPS" 110 | Run_Command_Args="$Run_Command_Args --warmup_steps $WARMUP_STEPS" 111 | Run_Command_Args="$Run_Command_Args --learning_rate $LR" 112 | Run_Command_Args="$Run_Command_Args --adam_beta2 0.95" 113 | Run_Command_Args="$Run_Command_Args --lr_scheduler_type cosine" 114 | Run_Command_Args="$Run_Command_Args --resampling_weights $RS_WEIGHTS" 115 | 116 | Run_Command_Args="$Run_Command_Args --max_steps $MAX_STEPS" 117 | Run_Command_Args="$Run_Command_Args --per_device_train_batch_size $BATCH_SIZE" 118 | Run_Command_Args="$Run_Command_Args --per_device_eval_batch_size $BATCH_SIZE" 119 | Run_Command_Args="$Run_Command_Args --gradient_accumulation_steps $GRAD_ACC_STEPS" 120 | Run_Command_Args="$Run_Command_Args --weight_decay $WD" 121 | Run_Command_Args="$Run_Command_Args --fp16" 122 | Run_Command_Args="$Run_Command_Args --report_to wandb" 123 | 124 | if [ 
"$OMPI_COMM_WORLD_RANK" == "0" ]; then 125 | mkdir "$OUTPUT_DIR/code/" 126 | cp -r "./" "$OUTPUT_DIR/code/" 127 | echo "Save experiment source code over." 128 | fi 129 | 130 | if [ ! -z "$GAS" ]; then 131 | Run_Command_Args="$Run_Command_Args --gradient_accumulation_strategy $GAS" 132 | fi 133 | 134 | if [ ! -z "$CKPT_NAME" ]; then 135 | CKPT_PATH=$"$OUTPUT_DIR/$CKPT_NAME" 136 | echo "Resume from checkpoint: $CKPT_PATH" 137 | Run_Command_Args="$Run_Command_Args --resume_from_checkpoint $CKPT_PATH --ignore_data_skip" 138 | fi 139 | 140 | echo "Run Command Args: $Run_Command_Args" 141 | 142 | # deepspeed --num_gpus $N_GPUS run_gpt.py --deepspeed configs/ds_config.json $Run_Command_Args 143 | CUDA_VISIBLE_DEVICES=0 python -m torch.distributed.launch --nproc_per_node $N_GPUS --use_env run_cert.py $Run_Command_Args 144 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/scripts/run_prepare_test_private_code.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import glob 4 | from argparse import ArgumentParser 5 | 6 | from transformers import AutoTokenizer 7 | from tqdm import tqdm 8 | 9 | import sys 10 | # ------------------------------------------------------------------------------------------------- 11 | # you need to change this path to your own `APICoder-CodeGenAPI` path, better to use absolute path 12 | # ------------------------------------------------------------------------------------------------- 13 | sys.path.append('../../../APICoder-CodeGenAPI/') 14 | from scripts.get_libs_info_from_code import ( 15 | get_dict_of_api_name_lib_api_paths, 16 | get_dict_of_api_path_api_signature_and_api_desp, 17 | get_first_sentence_from_api_desp, 18 | normalizer_api_desp 19 | ) 20 | from APICoder.get_lib_comment_for_eval import ( 21 | get_code_and_comment_by_lib_name_and_task_id, 22 | get_one_instance_by_lib_name 23 | ) 24 | from APICoder.get_api_info_by_name import ( 25 | get_api_name_4_api_sign_and_desps, 26 | get_all_api_info_prompt_list_by_api_name 27 | ) 28 | 29 | # ------------------------------------------------------------------------------------------------- 30 | # your need to change this path to the path of your `crawl_code` path, better to use absolute path 31 | # ------------------------------------------------------------------------------------------------- 32 | YOUR_CRAWLED_API_PATH = "PrivateLibrary/data/API-Doc" 33 | api_path_api_signature_and_api_desp = get_dict_of_api_path_api_signature_and_api_desp( 34 | YOUR_CRAWLED_API_PATH, 35 | "pandas,numpy,monkey,beatnum,torchdata", 36 | "datetime", 37 | "False" 38 | ) 39 | 40 | parser = ArgumentParser() 41 | 42 | parser.add_argument('--base_model_dir', type=str, default="/your/base/dir/including/`eval_datas(benchmarks)`/and/others/") 43 | parser.add_argument('--benchmarks', type=list, default=["pandas", "numpy", "monkey", "beatnum", "torchdata"]) 44 | parser.add_argument('--output_dir', type=str, default="PrivateLibrary/APIRetriever/data/inference") 45 | parser.add_argument('--tokenizer', type=str, required=False, default='your/path/of/bert-base-uncased/') 46 | 47 | args = parser.parse_args() 48 | 49 | base_model_dir, benchmarks, output_dir = args.base_model_dir, args.benchmarks, args.output_dir 50 | benchmark_dir = os.path.join(base_model_dir, "eval_datas") 51 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True) 52 | 53 | x = [0, 1000, 10000, 100000, 1000000] # the unique id of each code comment 
54 | 55 | ''' 56 | for the `code comment` of the 5 benchmarks, build the query corpus (5 files in total, named: xxx_comment.json) 57 | ''' 58 | for idx, library_name in enumerate(tqdm(benchmarks)): 59 | if not os.path.exists(os.path.join(output_dir, library_name)): 60 | os.makedirs(os.path.join(output_dir, library_name)) 61 | comment_out_name = os.path.join(output_dir, library_name + "_comment.json") 62 | comment_writer = open(comment_out_name, 'w+') 63 | base_id = x[idx] 64 | 65 | lib_iter_obj = get_one_instance_by_lib_name(library_name, base_dir=base_model_dir) 66 | for this_instance_dict in tqdm(lib_iter_obj): 67 | # dict_keys(['task_id', 'prompt', 'entry_point', 'canonical_solution', 'test']) 68 | task_id = this_instance_dict["task_id"] 69 | text_id = base_id + int(task_id.split("/")[-1]) 70 | code_comment_solution = get_code_and_comment_by_lib_name_and_task_id(library_name, task_id, base_model_dir) 71 | this_code, this_comment, this_solution = code_comment_solution[0], code_comment_solution[1], code_comment_solution[2] 72 | save_dict = { 73 | "text_id": text_id, 74 | "task_id": task_id, 75 | "text": this_comment 76 | } 77 | comment_writer.write(json.dumps(save_dict) + "\n") 78 | 79 | comment_writer.close() 80 | 81 | ''' 82 | for the `API information` of the 5 benchmarks, build the API corpus (5 files in total, named: xxx_api.json) 83 | ''' 84 | y = [1000000, 1100000, 1200000, 1300000, 1400000] # the unique id of each API 85 | for idx, library_name in enumerate(tqdm(benchmarks)): 86 | if not os.path.exists(output_dir): 87 | os.makedirs(output_dir) 88 | api_out_name = os.path.join(output_dir, library_name + "_api.json") 89 | api_writer = open(api_out_name, 'w+') 90 | base_id = y[idx] 91 | api_name_4_api_sign_and_desps = get_api_name_4_api_sign_and_desps(library_name, base_model_dir) 92 | total_api, now_number = len(api_name_4_api_sign_and_desps), 0 93 | for api_name, api_path_info_dict in tqdm(api_name_4_api_sign_and_desps.items(), total=total_api): 94 | for api_idx, (api_path, api_info_list) in enumerate(api_path_info_dict.items()): 95 | api_signature, api_description = api_info_list[0].strip(), get_first_sentence_from_api_desp(normalizer_api_desp(api_info_list[1])) 96 | if api_signature == "": 97 | continue 98 | api_info_prompt=f"{api_name}{api_signature}: {api_description}" 99 | text_id = base_id + now_number 100 | save_dict = { 101 | "text_id": text_id, 102 | "text": api_info_prompt 103 | } 104 | now_number+=1 105 | api_writer.write(json.dumps(save_dict) + "\n") 106 | 107 | api_writer.close() 108 | 109 | print(f"Done!") -------------------------------------------------------------------------------- /apicoder/private-eval/data/TorchData_no.API_number_0.CodeGen.hm_False.machine.t0.1.p0.9.l100.n1.samples.jsonl: -------------------------------------------------------------------------------- 1 | {"task_id": "TorchDataEval/0", "completion": " datapipe.repeat(6)"} 2 | {"task_id": "TorchDataEval/1", "completion": " dp.index(0)"} 3 | {"task_id": "TorchDataEval/2", "completion": " batch_dp.sample(batch_size=1, shuffle=False)"} 4 | {"task_id": "TorchDataEval/4", "completion": " source_dp.split(odd_or_even)"} 5 | {"task_id": "TorchDataEval/5", "completion": " source_dp.clone(), source_dp.clone()\ndp1.iter.reset()\ndp2.iter.reset()"} 6 | {"task_id": "TorchDataEval/6", "completion": " dp1.merge(dp2)"} 7 | {"task_id": "TorchDataEval/7", "completion": " dp1.map(mapdp)"} 8 | {"task_id": "TorchDataEval/9", "completion": " SampleMultiplexer(source_dp1, source_dp2, weitghts)"} 9 | {"task_id": "TorchDataEval/10", "completion": " 
source_dp.unzip()"} 10 | {"task_id": "TorchDataEval/11", "completion": " dp.batch(3)\ndp = dp.drop(0)\ndp = dp.drop(1)\ndp = dp.drop(2)"} 11 | {"task_id": "TorchDataEval/12", "completion": " torchdata.datapipes.BatchDatapipe(source_dp, batch_size=3, batch_num=100, drop_last=False)"} 12 | {"task_id": "TorchDataEval/14", "completion": " IterableWrapper([\"a.png\", \"b.png\", \"a.json\", \"b.json\", \"a.jpg\", \"c.json\"])"} 13 | {"task_id": "TorchDataEval/16", "completion": " HttpReader(file_url, iterable_wrapper=IterableWrapper(file_url))"} 14 | {"task_id": "TorchDataEval/17", "completion": " source_dp.map(mutiple_fn)"} 15 | {"task_id": "TorchDataEval/18", "completion": " dp.map(lambda x: x + 1)"} 16 | {"task_id": "TorchDataEval/19", "completion": " dp.filter(lambda x: is_even(x))"} 17 | {"task_id": "TorchDataEval/20", "completion": " dp.get(0, 0)"} 18 | {"task_id": "TorchDataEval/21", "completion": " dp.map(lambda x: x['a'])"} 19 | {"task_id": "TorchDataEval/23", "completion": " map_dp_1.batch(2)\nnew_dp.map(lambda x: x + 1) #"} 20 | {"task_id": "TorchDataEval/24", "completion": " HttpReader(URL)\nag_news_train.process()"} 21 | {"task_id": "TorchDataEval/25", "completion": " HttpReader(URL, lambda_func_)\nag_news_train.map(lambda t: (int(t[0]), \" \".join(t[1:])))"} 22 | {"task_id": "TorchDataEval/26", "completion": " ag_news_train.map(lambda batch: lambda_batch(batch))"} 23 | {"task_id": "TorchDataEval/27", "completion": " dp.repeat(3)\ndp = dp.sample(3)"} 24 | {"task_id": "TorchDataEval/28", "completion": " dp1 + dp2"} 25 | {"task_id": "TorchDataEval/29", "completion": " dp1.merge(dp2)"} 26 | {"task_id": "TorchDataEval/30", "completion": " dp1.merge(dp2, keep_key=True)"} 27 | {"task_id": "TorchDataEval/31", "completion": " dp1.transform(merge_fn)\nres_dp = res_dp.map(lambda x: x[1])\nres_dp = res_dp.map(lambda x: x[0])"} 28 | {"task_id": "TorchDataEval/32", "completion": " dp1.merge_fn(dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1,"} 29 | {"task_id": "TorchDataEval/33", "completion": " dp1.map(merge_fn, dp1)\nres_map = mapdp.map(merge_fn, mapdp)"} 30 | {"task_id": "TorchDataEval/34", "completion": " dp1.map(merge_fn, dp1.map(mapdp, dp1.map(mapdp, dp1.map(mapdp, dp1.map(mapdp, dp1.map(mapdp, dp1.map(mapdp, dp1.map(mapdp, dp1.map(mapdp, dp1.map(mapdp, dp1.map(mapdp"} 31 | {"task_id": "TorchDataEval/35", "completion": " source_dp.group(group_fn)\ndp1 = source_dp.group(group_fn, length=1)\ndp2 = source_dp.group(group_fn, length=1, reserve=1)\ndp3 = source_dp.group(group_fn, length=1, reserve=1, reserve=1)"} 32 | {"task_id": "TorchDataEval/37", "completion": " ds.collate(collate_fn)"} 33 | {"task_id": "TorchDataEval/38", "completion": " ds.collate(collate_fn)"} 34 | {"task_id": "TorchDataEval/40", "completion": " source_dp.split(great_than_5)"} 35 | {"task_id": "TorchDataEval/41", "completion": " SampleMultiplexer(dp1, dp2, weight_)"} 36 | {"task_id": "TorchDataEval/42", "completion": " raw_dp"} 37 | {"task_id": "TorchDataEval/43", "completion": " dp.batch(3)\ndp = dp.drop_last(2)\ndp = dp.drop_last(2)\ndp = dp.drop_last(2)\ndp = dp.drop_last(2)\ndp = dp.drop_last(2)\ndp = dp.drop_last(2)\ndp = dp.drop_last(2)\ndp = dp.drop_last"} 38 | {"task_id": "TorchDataEval/44", "completion": " IterableWrapper([\"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\"])\ndp_4 = IterableWrapper([\"1\", \"2\", \"3\", \"4\", \"5\", \"6\", \"7\", \"8\", \"9\", \"10\"])\ndp_5 = IterableWrapper([\"1\", \"2\", \"3\", \"4\", \""} 39 | 
{"task_id": "TorchDataEval/45", "completion": " dp_source_1.concatenate(dp_source_2)\nindex_dp.index.name = 'Ids'"} 40 | {"task_id": "TorchDataEval/46", "completion": " dp_source_1.join(dp_source_2, index_name='Ids')"} 41 | {"task_id": "TorchDataEval/47", "completion": " dp_source_1.join(dp_source_2)"} 42 | {"task_id": "TorchDataEval/48", "completion": " IterableWrapper(flatted_func)"} 43 | {"task_id": "TorchDataEval/49", "completion": " HttpReader(AG_NEWS_CSV_URL)\nag_news_train.map(lambda x: x.decode('utf-8'))"} 44 | {"task_id": "TorchDataEval/3", "completion": " dp_one.concatenate(dp_two)"} 45 | {"task_id": "TorchDataEval/8", "completion": " dp1.map(lambda x: x[0])\nresult = dp2.map(lambda x: x[0])"} 46 | {"task_id": "TorchDataEval/13", "completion": " ds.collate(int2tensor)"} 47 | {"task_id": "TorchDataEval/15", "completion": " torchdata.datapipes.unbatch(source_dp, level=1)"} 48 | {"task_id": "TorchDataEval/22", "completion": " dp.generate_by_chunk(1)"} 49 | {"task_id": "TorchDataEval/39", "completion": " dp_dog.remove_final_number(remove_final_number(dp_dog.get_data()))\nresult = dp_cat.remove_final_number(remove_final_number(dp_cat.get_data()))"} 50 | {"task_id": "TorchDataEval/36", "completion": " source_dp.group_by(group_fn)"} 51 | -------------------------------------------------------------------------------- /apicoder/APIRetriever/scripts/run_prepare_train_private_code.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import glob 4 | from argparse import ArgumentParser 5 | 6 | from transformers import AutoTokenizer 7 | from tqdm import tqdm 8 | 9 | import sys 10 | # ------------------------------------------------------------------------------------------------- 11 | # you need to change this path to your own `APICoder-CodeGenAPI` path, better to use absolute path 12 | # ------------------------------------------------------------------------------------------------- 13 | sys.path.append('../../../CodeGenAPI/') 14 | from scripts.get_libs_info_from_code import ( 15 | get_dict_of_api_name_lib_api_paths, 16 | get_dict_of_api_path_api_signature_and_api_desp, 17 | get_first_sentence_from_api_desp, 18 | normalizer_api_desp 19 | ) 20 | 21 | # ------------------------------------------------------------------------------------------------- 22 | # your need to change this path to the path of your `crawl_code` path, better to use absolute path 23 | # ------------------------------------------------------------------------------------------------- 24 | YOUR_CRAWLED_API_PATH = "PrivateLibrary/data/API-Doc" 25 | api_path_api_signature_and_api_desp = get_dict_of_api_path_api_signature_and_api_desp( 26 | YOUR_CRAWLED_API_PATH, 27 | "pandas,numpy,monkey,beatnum,torchdata", 28 | "datetime", 29 | "False" 30 | ) 31 | 32 | # ------------------------------------------------------------------------------------------------- 33 | # your need to change the below path to the your own ones, better to use absolute path 34 | # ------------------------------------------------------------------------------------------------- 35 | parser = ArgumentParser() 36 | parser.add_argument('--input', type=str, default="PrivateLibrary/APIRetriever/data/train/unprocessed-train-data", help="each jsonl file in such path contains many json lines, where each line's format is {'code_doc': '', 'positive_APIs': ['A', ...], 'negative_APIs': ['B', ...]}") 37 | parser.add_argument('--data_mode', type=str, default="", help="the prefix of the input jsonl file, 
default is empty") 38 | parser.add_argument('--output', type=str, default="PrivateLibrary/APIRetriever/data/train/processed-train-data", help="the output path") 39 | parser.add_argument('--tokenizer', type=str, required=False, default='/your/path/of/bert-base-uncased') 40 | 41 | args = parser.parse_args() 42 | 43 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True) 44 | 45 | if not os.path.exists(args.output): 46 | os.makedirs(args.output) 47 | 48 | all_data_paths = glob.glob(os.path.join(args.input, f'{args.data_mode}*.jsonl')) 49 | print(f"Now, all data paths are: {all_data_paths}") 50 | 51 | # ------------------------------------------------------------------------------------------------- 52 | # your training data name default is `private_data_train.json`, you can change it to your own name 53 | # ------------------------------------------------------------------------------------------------- 54 | with open(os.path.join(args.output, 'private_data_train.json'), 'w+') as f: 55 | for data_path in tqdm(all_data_paths): 56 | data_reader = open(data_path, 'r') 57 | for line in tqdm(data_reader): 58 | group = {} 59 | # dict_keys(['code_block', 'code_doc', 'code_all_doc', 'positive_APIs', 'negative_APIs']) 60 | line_dict = json.loads(line) 61 | comment, positive_apis, negative_apis = line_dict["code_doc"], line_dict["positive_APIs"], line_dict["negative_APIs"] 62 | query = tokenizer.encode(comment, add_special_tokens=False, max_length=256, truncation=True) 63 | 64 | group['query'] = query 65 | group['positives'] = [] 66 | group['negatives'] = [] 67 | for positive_api in positive_apis: 68 | if api_path_api_signature_and_api_desp.get(positive_api) is None: 69 | continue 70 | positive_api_info_dict = api_path_api_signature_and_api_desp[positive_api] 71 | if positive_api_info_dict['api_signature'] == "": 72 | continue 73 | positive_api_prompt = f"{positive_api_info_dict['api_name']}{positive_api_info_dict['api_signature']}: {get_first_sentence_from_api_desp(normalizer_api_desp(positive_api_info_dict['api_description']))}" 74 | text = tokenizer.encode(positive_api_prompt, add_special_tokens=False, max_length=256, truncation=True) 75 | group['positives'].append(text) 76 | for negative_api in negative_apis: 77 | if api_path_api_signature_and_api_desp.get(negative_api) is None: 78 | continue 79 | negative_api_info_dict = api_path_api_signature_and_api_desp[negative_api] 80 | if negative_api_info_dict['api_signature'] == "": 81 | continue 82 | negative_api_prompt = f"{negative_api_info_dict['api_name']}{negative_api_info_dict['api_signature']}: {get_first_sentence_from_api_desp(normalizer_api_desp(negative_api_info_dict['api_description']))}" 83 | text = tokenizer.encode(negative_api_prompt, add_special_tokens=False, max_length=256, truncation=True) 84 | group['negatives'].append(text) 85 | if len(group['positives']) == 0 or len(group['negatives']) == 0 or len(group['query']) == 0: 86 | print("Skip this group") 87 | continue 88 | f.write(json.dumps(group) + '\n') 89 | 90 | print(f"Done!") -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyCodeGPT 2 | A pre-trained GPT model for Python code completion and generation 3 | 4 | ## What is it? 
5 | 6 | PyCodeGPT is an efficient and effective GPT-Neo-based model for the Python code generation task, similar to [OpenAI Codex](https://openai.com/blog/openai-codex/), [GitHub Copilot](https://copilot.github.com/), [CodeParrot](https://huggingface.co/blog/codeparrot), and [AlphaCode](https://deepmind.com/blog/article/Competitive-programming-with-AlphaCode). 7 | 8 | ## Training Data 9 | Due to the small size of publicly released datasets, we collected data from GitHub from scratch. We first crawled 1.2M Python-related repositories hosted on GitHub. Then, we used these repository URLs to download all contents of each repository from GitHub. After that, we got 60M raw Python files under 1MB with a total size of 330GB. Finally, we carefully designed various data-cleaning strategies to obtain about 96GB of data for training. Please refer to the following table for the details. 10 | 11 | |Model|Repositories|Size and number of files after filtering| 12 | |:------:|:---:|:---:| 13 | | CodeParrot | 0.56M | 12GB (compressed), 5.4M | 14 | | Codex | 54M | 159GB | 15 | | PyCodeGPT | 1.2M | 96GB, 13M | 16 | 17 | 18 | ## Pretrained models 19 | 20 | We aim to train medium-sized pre-trained models (110M parameters) based on GPT-Neo: 21 | - PyCodeGPT-110M: derived from GPT-Neo 125M with a vocabulary size of 32K. 22 | 23 | PyCodeGPT-110M is available on [HuggingFace](https://huggingface.co/Daoguang/PyCodeGPT). 24 | 25 | ## Evaluation 26 | 1. Install requirements (python 3.7) 27 | ```bash 28 | $ pip install -r requirements.txt 29 | ``` 30 | 31 | 2. Install [HumanEval](https://github.com/openai/human-eval) 32 | - Note that you can successfully evaluate your model after uncommenting the 58th line of `human-eval/human_eval/execution.py` 33 | ```bash 34 | $ git clone https://github.com/openai/human-eval 35 | $ pip install -e human-eval 36 | ``` 37 | 38 | 3. Run `eval_human_eval.py` to generate programs 39 | - Arguments 40 | - `model_name_or_path` : Path to the model checkpoint to be evaluated. 41 | - `output_dir` : Path to save generated programs 42 | - `num_completions` : The number of programs to be generated 43 | - `temperature` : Temperature for sampling 44 | - `top_p` : p value for nucleus sampling 45 | - `max_new_tokens` : Maximum number of generated tokens 46 | - Example usage 47 | 48 | ```bash 49 | $ python eval_human_eval.py \ 50 | --model_name_or_path PyCodeGPT-110M/ \ 51 | --output_dir results/ \ 52 | --num_completions 100 \ 53 | --temperature 0.2 \ 54 | --top_p 0.95 \ 55 | --max_new_tokens 100 \ 56 | --gpu_device 0 57 | ``` 58 | 59 | 4. Evaluate functional correctness 60 | ```bash 61 | $ evaluate_functional_correctness 62 | # Example 63 | $ evaluate_functional_correctness results/human_eval.t0.2.p0.95.l100.n100.samples.jsonl 64 | ``` 65 | 66 | Here's our evaluation result on the HumanEval dataset: 67 | 68 | Note: our model achieves accuracy comparable to Codex models of similar size.
69 | 70 | |Model|Pass@1|Pass@10|Pass@100| 71 | |:------:|:---:|:---:|:---:| 72 | |PyCodeGPT-110M |**8.32%** |**13.53%** |**18.3%** | 73 | ||||| 74 | |GPT-Neo 125M |0.75% |1.88% |2.97% | 75 | |GPT-Neo 1.3B |4.97% |7.47% |16.3% | 76 | |GPT-Neo 2.7B |6.41% |11.27% |21.37% | 77 | |GPT-J 6B |11.62% |15.74% |27.74% | 78 | ||||| 79 | |TabNine |2.58% |4.35% |7.59% | 80 | ||||| 81 | |CodeParrot 110M |3.80% |6.57% |12.78% | 82 | |CodeParrot 1.5B |3.58% |8.03% |14.96% | 83 | ||||| 84 | |Codex 12M |2.00% |3.62% |8.58% | 85 | |Codex 25M |3.21% |7.1% |12.89% | 86 | |Codex 42M |5.06% |8.8% |15.55% | 87 | |Codex 85M |8.22% |12.81% |22.4% | 88 | |Codex 300M |13.17% |20.37% |36.27% | 89 | |Codex 679M |16.22% |25.7% |40.95% | 90 | |Codex 2.5B |21.36% |35.42% |59.5% | 91 | |Codex 12B |28.81% |46.81% |72.31% | 92 | ||||| 93 | |Pretrained Decoder-only 13M (AlphaCode) |1.5% |3.6% |8.6% | 94 | |Pretrained Decoder-only 29M (AlphaCode) |3.4% |5.8% |11.2% | 95 | |Pretrained Decoder-only 55M (AlphaCode) |4.2% |8.2% |16.9% | 96 | |Pretrained Decoder-only 89M (AlphaCode) |4.3% |12.2% |20.0% | 97 | |Pretrained Decoder-only 302M (AlphaCode) |11.6% |18.8% |31.8% | 98 | |Pretrained Decoder-only 685M (AlphaCode) |14.2% |24.4% |38.8% | 99 | |Pretrained Decoder-only 1.1B (AlphaCode) |17.1% |28.2% |45.3% | 100 | ||||| 101 | |PolyCoder 160M |2.13% |3.35% |4.88% | 102 | |PolyCoder 400M |2.96% |5.29% |11.59% | 103 | |PolyCoder 2.7B |5.59% |9.84% |17.68% | 104 | 105 | ## Reference 106 | If you use the models, please cite the following paper: 107 | 108 | ``` 109 | @inproceedings{CERT, 110 | title={{CERT}: Continual Pre-training on Sketches for Library-oriented Code Generation}, 111 | author={Zan, Daoguang and Chen, Bei and Yang, Dejian and Lin, Zeqi and Kim, Minsu and Guan, Bei and Wang, Yongji and Chen, Weizhu and Lou, Jian-Guang}, 112 | booktitle={The 2022 International Joint Conference on Artificial Intelligence}, 113 | year={2022} 114 | } 115 | ``` 116 | -------------------------------------------------------------------------------- /cert/nl2code/dynamic_block_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license.
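# This module packs variable-length tokenized examples into fixed-size training
# blocks: long examples are split into block-size spans, and shorter leftover
# spans are paired with complementary-length spans from a bounded cache
# (BlockCache) so that each emitted block is as close to `block_size` tokens
# as possible.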
3 | from typing import List 4 | from dataclasses import dataclass 5 | import numpy as np 6 | from datetime import datetime 7 | 8 | import torch 9 | from torch.utils.data.dataset import Dataset 10 | 11 | @dataclass 12 | class BlockSpan: 13 | index: int 14 | start: int 15 | end: int 16 | 17 | @property 18 | def length(self): 19 | return self.end - self.start + 1 20 | 21 | @dataclass 22 | class BlockItem: 23 | spans: List[BlockSpan] 24 | 25 | def __len__(self): 26 | return len(self.spans) 27 | 28 | def pad(self, item): 29 | self.spans += item.spans 30 | 31 | @property 32 | def size(self): 33 | return sum([x.length for x in self.spans]) 34 | 35 | class BlockCache: 36 | def __init__(self, block_size: int, cache_size: int): 37 | self.cache_size = cache_size 38 | self.block_size = block_size 39 | 40 | self.len2spans = [[] for _ in range(block_size)] 41 | self.items_count = 0 42 | self.length_counts = np.zeros(block_size, dtype=np.int32) 43 | 44 | def add_one(self, span: BlockSpan): 45 | length = span.length 46 | if length >= self.block_size: 47 | raise ValueError("Can't add one item with length {} >= block size {}.".format(length, self.block_size)) 48 | 49 | self.len2spans[length].append(span) 50 | self.items_count += 1 51 | self.length_counts[length] += 1 52 | 53 | def is_full(self): 54 | return self.items_count >= self.cache_size 55 | 56 | def __len__(self): 57 | return self.items_count 58 | 59 | def _pop_one_by_length(self, sel_length: int) -> BlockItem: 60 | assert len(self.len2spans[sel_length]) == self.length_counts[sel_length] 61 | 62 | if len(self.len2spans[sel_length]) == 0: 63 | raise ValueError("Pop from empty length spans: {}".format(sel_length)) 64 | 65 | self.length_counts[sel_length] -= 1 66 | sel_span = self.len2spans[sel_length].pop() 67 | pad_length = self.block_size - sel_length 68 | 69 | while pad_length > 0: 70 | # Find a perfect one to block 71 | if len(self.len2spans[pad_length]) > 0: 72 | pad_span = self.len2spans[pad_length].pop() 73 | self.length_counts[pad_length] -= 1 74 | block_item = BlockItem(spans=[sel_span, pad_span]) 75 | self.items_count -= 2 76 | return block_item 77 | 78 | pad_length -= 1 79 | 80 | # can't find one to pad 81 | self.items_count -= 1 82 | return BlockItem(spans=[sel_span]) 83 | 84 | def pop_one(self): 85 | sel_length = np.argmax(self.length_counts) 86 | return self._pop_one_by_length(sel_length) 87 | 88 | def pop_all(self): 89 | index = self.block_size - 1 90 | while index > 0: 91 | while self.len2spans[index]: 92 | yield self._pop_one_by_length(index) 93 | index -= 1 94 | 95 | class DynamicBlockDataset(Dataset): 96 | def __init__(self, src_dataset: Dataset, src_sizes: List[int], block_size: int, dynamic_factor: int=10) -> None: 97 | super().__init__() 98 | self.src_dataset = src_dataset 99 | self.src_sizes = src_sizes 100 | self.block_size = block_size 101 | self.dynamic_factor = dynamic_factor 102 | 103 | start = datetime.now() 104 | self.block_items: List[BlockItem] = self.build_block_index_mappings() 105 | self._block_sizes = [x.size for x in self.block_items] 106 | print("DynamicBlockDataset builds block indices over, {} => {} ({:.4f}), avg examples = {:.3f}, cost = {}.".format( 107 | len(self.src_dataset), 108 | len(self.block_items), 109 | self.get_block_ratio(), 110 | np.mean([len(x) for x in self.block_items]), 111 | datetime.now() - start 112 | )) 113 | 114 | @property 115 | def sizes(self): 116 | return self._block_sizes 117 | 118 | def size(self, index) -> int: 119 | return self.block_items[index].size 120 | 121 | def 
get_block_ratio(self) -> float: 122 | print(np.mean(self._block_sizes), np.mean(self._block_sizes) / self.block_size) 123 | return sum(self.sizes) / len(self.block_items) / self.block_size 124 | 125 | def __len__(self): 126 | return len(self.block_items) 127 | 128 | def __getitem__(self, index) -> torch.Tensor: 129 | item = self.block_items[index] 130 | tensors = [self.src_dataset[span.index][span.start:span.end+1] for span in item.spans] 131 | return torch.cat(tensors, dim=0) 132 | 133 | def build_block_index_mappings(self): 134 | cache = BlockCache(self.block_size, self.dynamic_factor * self.block_size) 135 | block_idx_items = [] 136 | 137 | for i, size in enumerate(self.src_sizes): 138 | start = 0 139 | while start < size: 140 | end = min(size, start + self.block_size) 141 | span = BlockSpan(index=i, start=start, end=end-1) 142 | 143 | if span.length == self.block_size: 144 | block_idx_items.append(BlockItem([span])) 145 | else: 146 | # Pop one if cache is full 147 | if cache.is_full(): 148 | block_idx_items.append(cache.pop_one()) 149 | cache.add_one(span) 150 | start = end 151 | 152 | for item in cache.pop_all(): 153 | block_idx_items.append(item) 154 | 155 | return block_idx_items 156 | -------------------------------------------------------------------------------- /apicoder/CodeGenAPI/README.md: -------------------------------------------------------------------------------- 1 | # APICoder - CodeGenAPI 2 | 3 | Official repository for our paper ["When Language Model Meets Private Library"](https://arxiv.org/pdf/2210.17236.pdf). 4 | 5 | --- 6 | 7 | ## Overview 8 | 9 | APIRetriever finds useful APIs for a programming problem, and APICoder then aims to generate code that solves the problem with these APIs. We adopt the most straightforward approach for APICoder: prepending the set of API information to the context. Each piece of API information is in the form `name(signature):description` (a minimal sketch of this prompt format is given at the end of the Experiments section). This is to mimic programmers learning the APIs properly before writing code using them. 10 | 11 | 12 | 13 | Figure 1: The training process of CodeGenAPI 14 | 15 | ## Project Directory 16 | ```shell 17 | ├── CodeGenAPI 18 | │   ├── APICoder 19 | │   │   ├── get_api_info_by_name.py 20 | │   │   ├── get_lib_comment_for_eval.py 21 | │   ├── apex 22 | │   ├── eval_baseline.py 23 | │   ├── eval_private.py 24 | │   ├── nl2code 25 | │   ├── requirements.txt 26 | │   ├── run_generating_codes.sh # The entry script for CodeGenAPI inference, which generates many code snippets for each programming problem. 27 | │   ├── run_evaluating_codes.sh # The entry script for evaluating the generated code snippets and outputting the final results (pass@k). 28 | │   ├── run_private.py 29 | │   ├── run_private.sh # Implementation of CodeGenAPI training. 30 | │   └── scripts 31 | │   ├── encode_private_data.py 32 | │   ├── extract_api.py 33 | │   ├── file_utils.py 34 | │   ├── get_comments_from_evallibs.py 35 | │   ├── get_libs_info_from_code.py 36 | │   ├── make_human_in_the_loop_test_corpus.py 37 | │   ├── multiprocessing_utils.py 38 | │   ├── pycode_visitor.py 39 | │   ├── requirements.txt 40 | │   ├── run_details_apis.sh # Extracting all kinds of API information (API name, signature, description, and so on) from the crawled API documentation of 35 libraries. 41 | │   ├── run_encode_private_data.sh # Encoding the private data 42 | │   ├── run_extract_apis.sh # Crawling the API documentation for 31 off-the-shelf public libraries.
43 | │   └── run_extract_details_from_apis.py 44 | ``` 45 | 46 | ## Quickstart 47 | 48 | This section covers environment, data preparation, model inference, and model training. 49 | 50 | ### Preparation 51 | 52 | 1、Configuring your runtime environment 53 | 54 | ``` 55 | $ cd PrivateLibrary/CodeGenAPI 56 | $ pip install -r requirements.txt 57 | ``` 58 | In addition, if you would like to use FP16 mixed precision to speed up training, you need to install the apex library. 59 | ``` 60 | git clone https://github.com/NVIDIA/apex 61 | cd apex 62 | pip install -v --no-cache-dir ./ 63 | ``` 64 | 65 | 2、Preparation of pre-trained models 66 | 67 | Download the pre-trained checkpoint (e.g., `CodeGenAPI-110M`) from [our released page](https://github.com/microsoft/PyCodeGPT/releases/download/Private-Library/CodeGenAPI-350M-mono.zip) and place it in the corresponding folder (e.g., `CodeGenAPI/models/CodeGenAPI-110M`). 68 | 69 | 3、Updating the scripts according to your local paths 70 | 71 | - Update `run_private.sh`. 72 | - Update `run_generating_codes.sh`. 73 | - Update `run_evaluating_codes.sh`. 74 | 75 | ### Use CodeGenAPI or other models 76 | 77 | First, multiple code snippets are generated for each programming problem (`run_generating_codes.sh`). Then, the code snippets are evaluated (`run_evaluating_codes.sh`). 78 | 79 | ``` 80 | $ bash run_generating_codes.sh 81 | $ bash run_evaluating_codes.sh 82 | ``` 83 | 84 | ### Train CodeGenAPI 85 | 86 | Train CodeGenAPI on the large-scale code corpus with the following command. 87 | 88 | ``` 89 | $ bash run_private.sh 90 | ``` 91 | 92 | ## Experiments 93 | 94 | In the inference phase, we set the `temperature` to one of `[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]`, the number of samples (`NUM_SAMPLES`) to `200`, the max number of generated tokens (`MAX_TOKNES`) to `100`, and the `top_p` to `0.9`. We report the best result across these hyper-parameter settings. 95 | 96 | Here are the main results: 97 | 98 | ![](https://s3.bmp.ovh/imgs/2022/09/27/1f28c06f5cc05bcc.png) 99 | 100 | From these experiments, we draw the following observations and insights. 101 | 102 | > (1) Prompting the API information set is useful for the private-library-oriented code generation task. 103 | 104 | > (2) Which of the API prompting strategies (Perfect, Top-N, and Human) is best? In general, Perfect, Human, and Top-N produce progressively decreasing benefits. However, Top-N is occasionally superior to Perfect, as noise exists when training the model. We also observe that Top-1,2 usually works better than Top-3,5, because the latter introduces more noisy APIs. 105 | 106 | > (3) Our continually pre-trained model is better at invoking APIs than its base model, and thus further improves code generation performance for private libraries in the majority of scenarios. 107 | 108 | > (4) APIRetriever is capable of retrieving useful APIs. 109 | 110 | > (5) Involving humans in the loop further boosts performance. 111 | 112 | > (6) As the k in pass@k grows, the gain brought by adding API information grows as well. 113 | 114 | > (7) Generating code that invokes private libraries is far more challenging than for public ones; even large models fail to do so if we do not prompt any APIs. 115 | 116 | For more details, please see our paper.
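As a concrete illustration of the prompt format described in the overview, the snippet below assembles an API-prompted context by rendering each API as `name(signature): description` and prepending the block to the problem context. It is a minimal sketch: the helper name `build_api_prompt` and the sample API entry are our own illustration, not the repository's actual interface.

```python
# Minimal sketch of API prompting (illustrative names; each API is assumed to
# be a (name, signature, description) triple, e.g., as found by APIRetriever).
def build_api_prompt(apis, context):
    # Render each API as `name(signature): description`, one per line,
    # then prepend the block to the original problem context.
    api_lines = [f"{name}{signature}: {description}" for name, signature, description in apis]
    return "\n".join(api_lines) + "\n" + context

apis = [("datapipe.repeat", "(times)", "Repeats the source DataPipe a given number of times.")]
print(build_api_prompt(apis, "# Repeat the datapipe 6 times.\nresult ="))
```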
117 | 118 | ## Citation 119 | If you find our work useful, please cite the paper: 120 | ``` 121 | @inproceedings{APICoder, 122 | title={When Language Model Meets Private Library}, 123 | author={Zan, Daoguang and Chen, Bei and Lin, Zeqi and Guan, Bei and Wang, Yongji and Lou, Jian-Guang}, 124 | booktitle={EMNLP findings}, 125 | year={2022} 126 | } 127 | ``` 128 | -------------------------------------------------------------------------------- /apicoder/private-eval/README.md: -------------------------------------------------------------------------------- 1 | # TorchDataEval, MonkeyEval and BeatNumEval 2 | 3 | Three benchmarks for evaluating the performance of private-library-oriented code generation. They were proposed in the paper "[When Language Model Meets Private Library](https://arxiv.org/pdf/2210.17236.pdf)". 4 | 5 | The evaluation script is adapted from OpenAI's [HumanEval](https://github.com/openai/human-eval/tree/master/human_eval). 6 | 7 | ## Installation 8 | 9 | Make sure to use python 3.7 or later: 10 | ``` 11 | $ conda create -n private python=3.7 12 | $ conda activate private 13 | ``` 14 | 15 | Check out and install this repository: 16 | ``` 17 | $ pip install -e private-eval 18 | ``` 19 | 20 | ## Configuration 21 | ``` 22 | ├── data # The directory of our crafted benchmarks. 23 | ├── private_eval 24 | │ ├── data.py # [key] Choosing whether to load TorchDataEval, MonkeyEval or BeatNumEval. 25 | │ ├── evaluate_functional_correctness.py # Calculating the evaluation results. 26 | │ ├── evaluation.py # Calculating the evaluation results. 27 | │ └── execution.py # [key] Executing the predicted code. Here, if you want to evaluate MonkeyEval and BeatNumEval, you need to set the `is_convert_back` variable in line 194 to `True` and `domain` to `pandas` or `numpy`. 28 | ``` 29 | 30 | ## Running Environment Testing 31 | 32 | You need to replace `XXX` with your local path to test the TorchData results. (Make sure that all settings in `private-eval/private_eval/data.py` are correct.) 33 | ``` 34 | $ evaluate_functional_correctness XXX/PrivateLibrary/private-eval/data/TorchData_no.API_number_0.CodeGen.hm_False.machine.t0.1.p0.9.l100.n1.samples.jsonl 35 | ``` 36 | 37 | If you can successfully run the above command and obtain the following results, the evaluation environment is ready to use. 38 | ``` 39 | {'pass@1': 0.06} 40 | ``` 41 | 42 | # The Process of Constructing TorchDataEval, MonkeyEval and BeatNumEval 43 | 44 | We craft three benchmarks, called TorchDataEval, MonkeyEval, and BeatNumEval. Each programming problem consists of context, target code, and the corresponding test cases. 45 | 46 | To create a realistic benchmark for evaluating code generation for private libraries, we make use of TorchData, a recently released Python library. We carefully studied the official API documentation of TorchData and made sure we were proficient in all its APIs. Then, we manually created $50$ programming problems based on the API usage examples in the documentation. Two volunteers with extensive experience in Python were invited to check the correctness of each problem. We control the difficulty of the programming problems by the number of APIs in the target code. The percentage of programming problems containing $1$ API, $2$ APIs, and more APIs is set to $6$:$3$:$1$. 47 | 48 | > Our base model, CODEGEN, is pre-trained with GitHub data before $2021$-$10$. TorchData was released after this time point, and no code files using it are available on GitHub so far; hence, we can consider it a private library. 49 | 50 | We also construct two pseudo private libraries, MonkeyEval and BeatNumEval, by modifying PandasEval and NumpyEval, which each contain $101$ programming problems and were proposed for the public libraries Pandas and NumPy. In detail, we manually modified all library-related keywords in PandasEval and NumpyEval, respectively. For example, as in the figure below, `pandas` is converted to `monkey`, `dataframe` is converted to `knowledgeframe`, and the API name `isin` is converted to `iscontain`. To craft the API documentation for Monkey and BeatNum, we manually paraphrased the descriptions of all the new APIs to ensure that they have never been seen by the pre-trained language models.
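To give a flavor of this keyword conversion, here is a minimal sketch of the mapping idea. The three mapping entries are the examples named above; everything else, including the helper name `to_private`, is our own illustration (the actual conversion in this repository was done manually, with the reverse mapping handled in `execution.py`).

```python
# Illustrative subset of the public-to-private keyword mapping; the full
# mapping used for MonkeyEval/BeatNumEval is larger and was applied manually.
KEYWORD_MAP = {
    "pandas": "monkey",
    "dataframe": "knowledgeframe",
    "isin": "iscontain",
}

def to_private(code: str) -> str:
    # Rewrite public-library keywords into their pseudo-private counterparts.
    for public, private in KEYWORD_MAP.items():
        code = code.replace(public, private)
    return code

print(to_private("import pandas\ndf.isin([1, 2])"))
# import monkey
# df.iscontain([1, 2])
```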
51 | 52 | 53 | 54 | # An Example of Converting PandasEval (public) to MonkeyEval (private) 55 | 56 | Context is shown with a white background and the target code with a gray background. The changed parts are highlighted in yellow. 57 | 58 | 59 | 60 | ## Reference 61 | 62 | If you use TorchDataEval, MonkeyEval or BeatNumEval in your work, please cite the paper: 63 | ``` 64 | @inproceedings{APICoder, 65 | title={When Language Model Meets Private Library}, 66 | author={Zan, Daoguang and Chen, Bei and Lin, Zeqi and Guan, Bei and Wang, Yongji and Lou, Jian-Guang}, 67 | booktitle={EMNLP findings}, 68 | year={2022} 69 | } 70 | ``` 71 | 72 | If you use PandasEval or NumpyEval in your work, please cite the paper: 73 | ``` 74 | @inproceedings{CERT, 75 | title={{CERT}: Continual Pre-training on Sketches for Library-oriented Code Generation}, 76 | author={Zan, Daoguang and Chen, Bei and Yang, Dejian and Lin, Zeqi and Kim, Minsu and Guan, Bei and Wang, Yongji and Chen, Weizhu and Lou, Jian-Guang}, 77 | booktitle={The 2022 International Joint Conference on Artificial Intelligence}, 78 | year={2022} 79 | } 80 | ``` 81 | 82 | If you use the evaluation script, please also cite the following paper: 83 | ``` 84 | @article{codex, 85 | title={Evaluating Large Language Models Trained on Code}, 86 | author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba}, 87 | year={2021}, 88 | eprint={2107.03374}, 89 | archivePrefix={arXiv}, 90 | primaryClass={cs.LG} 91 | } 92 | ``` 93 | --------------------------------------------------------------------------------