├── apicoder
│ ├── APIRetriever
│ │ ├── src
│ │ │ ├── dense
│ │ │ │ ├── __init__.py
│ │ │ │ ├── driver
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __init__.pyc
│ │ │ │ │ ├── train.py
│ │ │ │ │ └── encode.py
│ │ │ │ ├── utils
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── format
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── convert_result_to_trec.py
│ │ │ │ ├── faiss_retriever
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── retriever.py
│ │ │ │ │ ├── reducer.py
│ │ │ │ │ └── __main__.py
│ │ │ │ ├── processor
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── processors.py
│ │ │ │ ├── dataset
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── processor.py
│ │ │ │ ├── loss.py
│ │ │ │ ├── arguments.py
│ │ │ │ └── trainer.py
│ │ │ ├── run_trec_format_4.sh
│ │ │ ├── run_search_3.sh
│ │ │ ├── run_train_1.sh
│ │ │ └── run_encode_2.sh
│ │ ├── build
│ │ │ └── lib
│ │ │ │ └── dense
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── driver
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── train.py
│ │ │ │ │ │ └── encode.py
│ │ │ │ │ ├── utils
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── format
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ └── convert_result_to_trec.py
│ │ │ │ │ ├── faiss_retriever
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── retriever.py
│ │ │ │ │ │ ├── reducer.py
│ │ │ │ │ │ └── __main__.py
│ │ │ │ │ ├── processor
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── processors.py
│ │ │ │ │ ├── dataset
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── processor.py
│ │ │ │ │ ├── loss.py
│ │ │ │ │ ├── arguments.py
│ │ │ │ │ └── trainer.py
│ │ ├── data
│ │ │ └── inference
│ │ │ │ └── README.md
│ │ ├── requirements.txt
│ │ ├── setup.py
│ │ ├── scripts
│ │ │ ├── run_extract_apiretriever_corpus.sh
│ │ │ ├── run_prepare_test_private_code.py
│ │ │ └── run_prepare_train_private_code.py
│ │ └── README.md
│ ├── private-eval
│ │ ├── private_eval
│ │ │ ├── __init__.py
│ │ │ ├── evaluate_functional_correctness.py
│ │ │ ├── data.py
│ │ │ └── evaluation.py
│ │ ├── requirements.txt
│ │ ├── data
│ │ │ ├── real_numpy_eval_v3.jsonl.gz
│ │ │ ├── real_beatnum_eval_v3.jsonl.gz
│ │ │ ├── real_monkey_eval_v3.jsonl.gz
│ │ │ ├── real_pandas_eval_v3.jsonl.gz
│ │ │ ├── real_torchdata_eval_v3.jsonl.gz
│ │ │ ├── real_monkey_eval_v3_api_1.jsonl.gz
│ │ │ ├── real_monkey_eval_v3_api_2.jsonl.gz
│ │ │ ├── real_monkey_eval_v3_api_3.jsonl.gz
│ │ │ ├── real_monkey_eval_v3_api_5.jsonl.gz
│ │ │ ├── real_monkey_eval_v3_api_n.jsonl.gz
│ │ │ ├── real_numpy_eval_v3_api_1.jsonl.gz
│ │ │ ├── real_numpy_eval_v3_api_2.jsonl.gz
│ │ │ ├── real_numpy_eval_v3_api_3.jsonl.gz
│ │ │ ├── real_numpy_eval_v3_api_5.jsonl.gz
│ │ │ ├── real_numpy_eval_v3_api_n.jsonl.gz
│ │ │ ├── real_pandas_eval_v3_api_1.jsonl.gz
│ │ │ ├── real_pandas_eval_v3_api_2.jsonl.gz
│ │ │ ├── real_pandas_eval_v3_api_3.jsonl.gz
│ │ │ ├── real_pandas_eval_v3_api_5.jsonl.gz
│ │ │ ├── real_pandas_eval_v3_api_n.jsonl.gz
│ │ │ ├── real_beatnum_eval_v3_api_1.jsonl.gz
│ │ │ ├── real_beatnum_eval_v3_api_2.jsonl.gz
│ │ │ ├── real_beatnum_eval_v3_api_3.jsonl.gz
│ │ │ ├── real_beatnum_eval_v3_api_5.jsonl.gz
│ │ │ ├── real_beatnum_eval_v3_api_n.jsonl.gz
│ │ │ ├── real_torchdata_eval_v3_api_1.jsonl.gz
│ │ │ ├── real_torchdata_eval_v3_api_2.jsonl.gz
│ │ │ ├── real_torchdata_eval_v3_api_3.jsonl.gz
│ │ │ ├── real_torchdata_eval_v3_api_5.jsonl.gz
│ │ │ ├── real_torchdata_eval_v3_api_n.jsonl.gz
│ │ │ ├── real_beatnum_eval_v3_human_labelled.jsonl.gz
│ │ │ ├── real_monkey_eval_v3_human_labelled.jsonl.gz
│ │ │ ├── real_torchdata_eval_v3_human_labelled.jsonl.gz
│ │ │ ├── real_torchdata_eval_v3_api_1_make_sense.jsonl.gz
│ │ │ ├── real_torchdata_eval_v3_api_2_make_sense.jsonl.gz
│ │ │ ├── real_torchdata_eval_v3_api_3_make_sense.jsonl.gz
│ │ │ ├── real_torchdata_eval_v3_api_5_make_sense.jsonl.gz
│ │ │ ├── real_torchdata_eval_v3_api_n_make_sense.jsonl.gz
│ │ │ ├── real_torchdata_eval_v3_human_labelled_make_sense.jsonl.gz
│ │ │ ├── numpy_keywords.jsonl
│ │ │ ├── pandas_keywords.jsonl
│ │ │ ├── XXXAPIEval-make sense.ipynb
│ │ │ └── TorchData_no.API_number_0.CodeGen.hm_False.machine.t0.1.p0.9.l100.n1.samples.jsonl
│ │ ├── setup.py
│ │ ├── LICENSE
│ │ └── README.md
│ ├── CodeGenAPI
│ │ ├── scripts
│ │ │ ├── __init__.py
│ │ │ ├── requirements.txt
│ │ │ ├── run_details_apis.sh
│ │ │ ├── run_extract_apis.sh
│ │ │ ├── run_encode_private_data.sh
│ │ │ └── get_comments_from_evallibs.py
│ │ ├── requirements.txt
│ │ ├── run_evaluating_codes.sh
│ │ ├── nl2code
│ │ │ ├── __init__.py
│ │ │ └── configuration_codegen.py
│ │ ├── run_generating_codes.sh
│ │ ├── APICoder
│ │ │ ├── get_lib_comment_for_eval.py
│ │ │ └── get_api_info_by_name.py
│ │ ├── run_private.sh
│ │ └── README.md
│ └── data
│ │ ├── CodeGenAPI
│ │ │ └── README.md
│ │ ├── Cleaned-Private-Code-Files
│ │ │ └── README.md
│ │ ├── EncodedCorpus4CodeGenAPI
│ │ │ └── README.md
│ │ └── API-Doc
│ │ │ └── README.md
├── cert
│ ├── pandas-numpy-eval
│ │ ├── pandas_numpy_eval
│ │ │ ├── __init__.py
│ │ │ ├── evaluate_functional_correctness.py
│ │ │ ├── data.py
│ │ │ └── evaluation.py
│ │ ├── requirements.txt
│ │ ├── data
│ │ │ ├── NumpyEval.jsonl.gz
│ │ │ └── PandasEval.jsonl.gz
│ │ ├── setup.py
│ │ ├── LICENSE
│ │ └── README.md
│ ├── scripts
│ │ ├── requirements.txt
│ │ ├── run_encode_domain.sh
│ │ └── ast_utils.py
│ ├── requirements.txt
│ ├── run_evaluating_codes.sh
│ ├── nl2code
│ │ ├── __init__.py
│ │ └── dynamic_block_dataset.py
│ ├── run_generating_codes.sh
│ ├── README.md
│ └── run_training_cert.sh
├── requirements.txt
├── CODE_OF_CONDUCT.md
├── LICENSE
├── .github
│ └── workflows
│ │ └── codeql.yml
├── SECURITY.md
├── eval_human_eval.py
└── README.md
/apicoder/APIRetriever/src/dense/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/apicoder/private-eval/private_eval/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/build/lib/dense/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/dense/driver/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/dense/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/cert/pandas-numpy-eval/pandas_numpy_eval/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/build/lib/dense/driver/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/build/lib/dense/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/dense/utils/format/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/build/lib/dense/utils/format/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch
2 | transformers
3 | sentencepiece
4 | protobuf
--------------------------------------------------------------------------------
/apicoder/CodeGenAPI/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("scripts")
3 |
--------------------------------------------------------------------------------
/apicoder/data/CodeGenAPI/README.md:
--------------------------------------------------------------------------------
1 | The weights, vocabulary and tokenizer of CodeGenAPI-350M-mono.
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/dense/faiss_retriever/__init__.py:
--------------------------------------------------------------------------------
1 | from .retriever import BaseFaissIPRetriever
2 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/build/lib/dense/faiss_retriever/__init__.py:
--------------------------------------------------------------------------------
1 | from .retriever import BaseFaissIPRetriever
2 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/data/inference/README.md:
--------------------------------------------------------------------------------
1 | Download all embedding files for our benchmarks and put them under this folder.
--------------------------------------------------------------------------------
/apicoder/private-eval/requirements.txt:
--------------------------------------------------------------------------------
1 | tqdm
2 | fire
3 | numpy==1.21.4
4 | pandas==1.3.5
5 | docformatter
6 | autopep8
7 | ipdb
--------------------------------------------------------------------------------
/cert/pandas-numpy-eval/requirements.txt:
--------------------------------------------------------------------------------
1 | tqdm
2 | fire
3 | numpy==1.21.4
4 | pandas==1.3.5
5 | docformatter
6 | autopep8
7 | ipdb
8 |
--------------------------------------------------------------------------------
/cert/pandas-numpy-eval/data/NumpyEval.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/cert/pandas-numpy-eval/data/NumpyEval.jsonl.gz
--------------------------------------------------------------------------------
/cert/pandas-numpy-eval/data/PandasEval.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/cert/pandas-numpy-eval/data/PandasEval.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/dense/driver/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/APIRetriever/src/dense/driver/__init__.pyc
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_numpy_eval_v3.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_numpy_eval_v3.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_beatnum_eval_v3.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_beatnum_eval_v3.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_monkey_eval_v3.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_monkey_eval_v3.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_pandas_eval_v3.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_pandas_eval_v3.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_torchdata_eval_v3.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_monkey_eval_v3_api_1.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_monkey_eval_v3_api_1.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_monkey_eval_v3_api_2.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_monkey_eval_v3_api_2.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_monkey_eval_v3_api_3.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_monkey_eval_v3_api_3.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_monkey_eval_v3_api_5.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_monkey_eval_v3_api_5.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_monkey_eval_v3_api_n.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_monkey_eval_v3_api_n.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_numpy_eval_v3_api_1.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_numpy_eval_v3_api_1.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_numpy_eval_v3_api_2.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_numpy_eval_v3_api_2.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_numpy_eval_v3_api_3.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_numpy_eval_v3_api_3.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_numpy_eval_v3_api_5.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_numpy_eval_v3_api_5.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_numpy_eval_v3_api_n.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_numpy_eval_v3_api_n.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_pandas_eval_v3_api_1.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_pandas_eval_v3_api_1.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_pandas_eval_v3_api_2.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_pandas_eval_v3_api_2.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_pandas_eval_v3_api_3.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_pandas_eval_v3_api_3.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_pandas_eval_v3_api_5.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_pandas_eval_v3_api_5.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_pandas_eval_v3_api_n.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_pandas_eval_v3_api_n.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_beatnum_eval_v3_api_1.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_beatnum_eval_v3_api_1.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_beatnum_eval_v3_api_2.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_beatnum_eval_v3_api_2.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_beatnum_eval_v3_api_3.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_beatnum_eval_v3_api_3.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_beatnum_eval_v3_api_5.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_beatnum_eval_v3_api_5.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_beatnum_eval_v3_api_n.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_beatnum_eval_v3_api_n.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_torchdata_eval_v3_api_1.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_api_1.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_torchdata_eval_v3_api_2.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_api_2.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_torchdata_eval_v3_api_3.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_api_3.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_torchdata_eval_v3_api_5.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_api_5.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_torchdata_eval_v3_api_n.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_api_n.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/dense/processor/__init__.py:
--------------------------------------------------------------------------------
1 | from .processors import SimpleTrainProcessor, SimpleCollectionProcessor
2 |
3 | MarcoPassageTrainProcessor = SimpleTrainProcessor
4 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/build/lib/dense/processor/__init__.py:
--------------------------------------------------------------------------------
1 | from .processors import SimpleTrainProcessor, SimpleCollectionProcessor
2 |
3 | MarcoPassageTrainProcessor = SimpleTrainProcessor
4 |
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_beatnum_eval_v3_human_labelled.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_beatnum_eval_v3_human_labelled.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_monkey_eval_v3_human_labelled.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_monkey_eval_v3_human_labelled.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_torchdata_eval_v3_human_labelled.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_human_labelled.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_torchdata_eval_v3_api_1_make_sense.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_api_1_make_sense.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_torchdata_eval_v3_api_2_make_sense.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_api_2_make_sense.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_torchdata_eval_v3_api_3_make_sense.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_api_3_make_sense.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_torchdata_eval_v3_api_5_make_sense.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_api_5_make_sense.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_torchdata_eval_v3_api_n_make_sense.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_api_n_make_sense.jsonl.gz
--------------------------------------------------------------------------------
/apicoder/private-eval/data/real_torchdata_eval_v3_human_labelled_make_sense.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyCodeGPT/HEAD/apicoder/private-eval/data/real_torchdata_eval_v3_human_labelled_make_sense.jsonl.gz
--------------------------------------------------------------------------------
/cert/scripts/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==4.12.5
2 | sentencepiece
3 | protobuf
4 | wandb
5 | datasets
6 | numpy
7 | cython
8 | fairseq
9 | autopep8
10 | docformatter
11 | zstandard
12 | beautifulsoup4
13 | lxml
14 | ipdb
15 | redbaron
16 | func-timeout
--------------------------------------------------------------------------------
/apicoder/CodeGenAPI/scripts/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==4.12.5
2 | sentencepiece
3 | protobuf
4 | wandb
5 | datasets
6 | numpy
7 | cython
8 | fairseq
9 | autopep8
10 | docformatter
11 | zstandard
12 | beautifulsoup4
13 | lxml
14 | ipdb
15 | redbaron
16 | func-timeout
--------------------------------------------------------------------------------
/apicoder/APIRetriever/requirements.txt:
--------------------------------------------------------------------------------
1 | torch<=1.8.0
2 | faiss-cpu>=1.6.5
3 | transformers==4.2.0
4 | datasets==1.1.3
5 | wandb==0.13.3
6 | sentencepiece
7 | protobuf
8 | numpy
9 | cython
10 | fairseq
11 | autopep8
12 | docformatter
13 | zstandard
14 | beautifulsoup4
15 | lxml
16 | ipdb
17 | redbaron
18 | func-timeout
--------------------------------------------------------------------------------
/apicoder/data/Cleaned-Private-Code-Files/README.md:
--------------------------------------------------------------------------------
1 | This folder contains all code files for the 31 public libraries we defined.
2 | ```
3 | pandas,numpy,sklearn,torch,tensorflow,django,selenium,matplotlib,flask,scipy,seaborn,nltk,beautifulsoup,pygame,PIL,jieba,gensim,spacy,transformers,fairseq,sqlalchemy,scrapy,allennlp,datasets,tokenizers,mxnet,imageio,pytest,metpy,ansible
4 | ```
--------------------------------------------------------------------------------
/cert/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==4.12.5
2 | sentencepiece
3 | protobuf
4 | wandb
5 | tqdm
6 | datasets
7 | tensorboard
8 | fairseq
9 | fairscale
10 | zstandard
11 | openpyxl
12 | matplotlib
13 | pandas>=1.1.2
14 | torchvision>=0.7.0
15 | seaborn>=0.11.2
16 | pyyaml
17 | ipdb
18 | numpy
19 | cython
20 | autopep8
21 | docformatter
22 | beautifulsoup4
23 | lxml
24 | redbaron
25 | func-timeout
--------------------------------------------------------------------------------
/apicoder/data/EncodedCorpus4CodeGenAPI/README.md:
--------------------------------------------------------------------------------
1 | This folder contains all encoded code files (after tokenization) for the 31 public libraries we defined.
2 | ```
3 | pandas,numpy,sklearn,torch,tensorflow,django,selenium,matplotlib,flask,scipy,seaborn,nltk,beautifulsoup,pygame,PIL,jieba,gensim,spacy,transformers,fairseq,sqlalchemy,scrapy,allennlp,datasets,tokenizers,mxnet,imageio,pytest,metpy,ansible
4 | ```
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/run_trec_format_4.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | LIBRARIES=( "pandas" "numpy" "monkey" "beatnum" "torchdata" )
4 |
5 | for LIBRARY in "${LIBRARIES[@]}"; do
6 | echo "Library: $LIBRARY"
7 | INPUT_DIR="../data/inference"
8 | RUN="$INPUT_DIR/${LIBRARY}_id_score.txt"
9 | TREC_RUN="$INPUT_DIR/${LIBRARY}_id_score.trec"
10 |
11 | python -m dense.utils.format.convert_result_to_trec --input $RUN --output $TREC_RUN
12 | done
13 |
--------------------------------------------------------------------------------
/apicoder/CodeGenAPI/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==4.12.5
2 | sentencepiece
3 | wandb
4 | tqdm
5 | datasets
6 | tensorboard
7 | fairseq
8 | fairscale
9 | deepspeed
10 | zstandard
11 | openpyxl
12 | matplotlib
13 | pandas>=1.1.2
14 | torch>=1.6.0
15 | torchvision>=0.7.0
16 | seaborn>=0.11.2
17 | pyyaml
18 | ipdb
19 | numpy
20 | cython
21 | autopep8==1.6.0
22 | docformatter==1.4
23 | redbaron==0.9.2
24 | func-timeout
25 | torchdata==0.3.0
26 | protobuf==3.20.1
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 |
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 |
5 | Resources:
6 |
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 |
--------------------------------------------------------------------------------
/apicoder/data/API-Doc/README.md:
--------------------------------------------------------------------------------
1 | This folder stores the crawled API documentation of the 65 public libraries.
2 | ```
3 | pandas,numpy,sklearn,torch,tensorflow,django,selenium,matplotlib,flask,scipy,seaborn,nltk,beautifulsoup,pygame,PIL,jieba,gensim,spacy,transformers,fairseq,sqlalchemy,scrapy,allennlp,datasets,tokenizers,mxnet,imageio,pytest,metpy,ansible,requests,datetime,zlib,random,math,sys,glob,os,urllib,time,re,json,unittest,collections,subprocess,copy,functools,itertools,six,threading,tempfile,io,pickle,pathlib,socket,struct,hashlib,traceback,csv,uuid,pprint
4 | ```
--------------------------------------------------------------------------------
/apicoder/APIRetriever/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | setup(
4 | name='apiretriever',
5 | version='0.0.1',
6 | packages=find_packages("src"),
7 | package_dir={'': 'src'},
8 | install_requires=open('requirements.txt').read().splitlines(),
9 | url='https://github.com/microsoft/PyCodeGPT',
10 | license='Apache 2.0',
11 | author='MSRA-DKI',
12 | author_email='daoguang@iscas.ac.cn',
13 | description='A toolkit for learning and running deep dense retrieval models.'
14 | )
15 |
--------------------------------------------------------------------------------
/apicoder/CodeGenAPI/run_evaluating_codes.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | BASE_DIR="your/base/dir"
4 |
5 | TEMP=$1
6 |
7 | # The temperature in the file name below is taken from $TEMP.
8 | # Remember to change the human/data path; remember to add torchdata to requirements.txt; start with CERT/.
9 | POST_PATH="XXX/codeparrot-small/official_TorchData_machine_gpt2_apinum_5_temp_$TEMP.samples.jsonl"
10 |
11 | EVALUATION_FILE="$BASE_DIR/$POST_PATH"
12 | echo "Evaluation File Path: $EVALUATION_FILE"
13 | evaluate_functional_correctness $EVALUATION_FILE
14 |
15 | echo "All Done!"
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/run_search_3.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | LIBRARIES=( "pandas" "numpy" "monkey" "beatnum" "torchdata" )
4 |
5 | for LIBRARY in "${LIBRARIES[@]}"; do
6 | echo "Library: $LIBRARY"
7 | INPUT_DIR="../data/inference"
8 | DEPTH=100
9 | RUN="$INPUT_DIR/${LIBRARY}_id_score.txt"
10 |
11 | python -m dense.faiss_retriever \
12 | --query_reps "$INPUT_DIR/${LIBRARY}_comment.pt" \
13 | --passage_reps "$INPUT_DIR/${LIBRARY}_api.pt" \
14 | --depth $DEPTH \
15 | --batch_size -1 \
16 | --save_text \
17 | --save_ranking_to $RUN
18 | done
19 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/run_train_1.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export WANDB_PROJECT="Your Project Name"
4 | export WANDB_API_KEY="Your WANDB API Key"
5 |
6 | TRAIN_DIR="../data/train"
7 | OUTDIR="../outputs"
8 | MODEL_PATH="/your/path/of/bert-base-uncased"
9 |
10 | python -m dense.driver.train \
11 | --output_dir $OUTDIR \
12 | --model_name_or_path ${MODEL_PATH} \
13 | --do_train \
14 | --save_steps 200 \
15 | --train_dir $TRAIN_DIR \
16 | --fp16 \
17 | --per_device_train_batch_size 5 \
18 | --train_n_passages 8 \
19 | --learning_rate 1e-5 \
20 | --q_max_len 256 \
21 | --p_max_len 256 \
22 | --num_train_epochs 150
23 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/dense/utils/format/convert_result_to_trec.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 |
3 | parser = ArgumentParser()
4 | parser.add_argument('--input', type=str, required=True)
5 | parser.add_argument('--output', type=str, required=True)
6 | args = parser.parse_args()
7 |
8 | with open(args.input) as f_in, open(args.output, 'w+') as f_out:
9 | cur_qid = None
10 | rank = 0
11 | for line in f_in:
12 | qid, docid, score = line.split()
13 | if cur_qid != qid:
14 | cur_qid = qid
15 | rank = 0
16 | rank += 1
17 | f_out.write(f'{qid} Q0 {docid} {rank} {score} dense\n')
18 |
--------------------------------------------------------------------------------
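
A quick illustration of the conversion above (the input lines are made-up examples, not repository data): each whitespace-separated `<qid> <docid> <score>` line becomes one six-column TREC run line, and the rank counter restarts at 1 whenever the query id changes.

```python
# Sketch of the rank assignment performed by convert_result_to_trec.py.
lines = ["q1 d7 13.2", "q1 d2 11.8", "q2 d5 9.4"]

cur_qid, rank = None, 0
for line in lines:
    qid, docid, score = line.split()
    if cur_qid != qid:  # new query: restart the rank counter
        cur_qid, rank = qid, 0
    rank += 1
    print(f"{qid} Q0 {docid} {rank} {score} dense")
# q1 Q0 d7 1 13.2 dense
# q1 Q0 d2 2 11.8 dense
# q2 Q0 d5 1 9.4 dense
```
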
/apicoder/APIRetriever/build/lib/dense/utils/format/convert_result_to_trec.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 |
3 | parser = ArgumentParser()
4 | parser.add_argument('--input', type=str, required=True)
5 | parser.add_argument('--output', type=str, required=True)
6 | args = parser.parse_args()
7 |
8 | with open(args.input) as f_in, open(args.output, 'w') as f_out:
9 | cur_qid = None
10 | rank = 0
11 | for line in f_in:
12 | qid, docid, score = line.split()
13 | if cur_qid != qid:
14 | cur_qid = qid
15 | rank = 0
16 | rank += 1
17 | f_out.write(f'{qid} Q0 {docid} {rank} {score} dense\n')
18 |
--------------------------------------------------------------------------------
/apicoder/private-eval/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import pkg_resources
4 | from setuptools import setup, find_packages
5 |
6 |
7 | setup(
8 | name="private-eval",
9 | py_modules=["private-eval"],
10 | version="1.0",
11 | description="",
12 | author="OpenAI",
13 | packages=find_packages(),
14 | install_requires=[
15 | str(r)
16 | for r in pkg_resources.parse_requirements(
17 | open(os.path.join(os.path.dirname(__file__), "requirements.txt"))
18 | )
19 | ],
20 | entry_points={
21 | "console_scripts": [
22 | "evaluate_functional_correctness = private_eval.evaluate_functional_correctness",
23 | ]
24 | }
25 | )
26 |
--------------------------------------------------------------------------------
/cert/pandas-numpy-eval/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import pkg_resources
4 | from setuptools import setup, find_packages
5 |
6 |
7 | setup(
8 | name="pandas-numpy-eval",
9 | py_modules=["pandas-numpy-eval"],
10 | version="1.0",
11 | description="",
12 | author="OpenAI",
13 | packages=find_packages(),
14 | install_requires=[
15 | str(r)
16 | for r in pkg_resources.parse_requirements(
17 | open(os.path.join(os.path.dirname(__file__), "requirements.txt"))
18 | )
19 | ],
20 | entry_points={
21 | "console_scripts": [
22 | "evaluate_functional_correctness = pandas_numpy_eval.evaluate_functional_correctness",
23 | ]
24 | }
25 | )
26 |
--------------------------------------------------------------------------------
/cert/run_evaluating_codes.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # Licensed under the MIT license.
4 |
5 | BASE_DIR="Your base data directory"
6 |
7 | # ----------------------------------------------------------------------------------------------------
8 | # The variable below should be changed according to the output file path of `run_eval_monitor.sh`.
9 | # ----------------------------------------------------------------------------------------------------
10 | POST_PATH="CERT/pandas-numpy-eval/data/Example_Pandas_PYCODEGPT_samples.jsonl"
11 | EVALUATION_FILE="$BASE_DIR/$POST_PATH"
12 | echo "Evaluation File Path: $EVALUATION_FILE"
13 |
14 | evaluate_functional_correctness $EVALUATION_FILE
15 |
16 | echo "File: $"
17 | echo "All Done!"
18 |
--------------------------------------------------------------------------------
/apicoder/private-eval/private_eval/evaluate_functional_correctness.py:
--------------------------------------------------------------------------------
1 | import fire
2 | import sys
3 |
4 | from private_eval.data import HUMAN_EVAL
5 | from private_eval.evaluation import evaluate_functional_correctness
6 |
7 |
8 | def entry_point(
9 | sample_file: str,
10 | k: str = "1,10,100",
11 | n_workers: int = 4,
12 | timeout: float = 3.0,
13 | problem_file: str = HUMAN_EVAL,
14 | ):
15 | """
16 | Evaluates the functional correctness of generated samples, and writes
17 | results to f"{sample_file}_results.jsonl.gz"
18 | """
19 | k = list(map(int, k.split(",")))
20 | results = evaluate_functional_correctness(sample_file, k, n_workers, timeout, problem_file)
21 | print(results)
22 |
23 |
24 | def main():
25 | fire.Fire(entry_point)
26 |
27 |
28 | sys.exit(main())
29 |
--------------------------------------------------------------------------------
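
The same evaluation can also be driven from Python instead of the `evaluate_functional_correctness` console script that setup.py registers. A minimal sketch, assuming the positional argument list used by `entry_point` above; "samples.jsonl" is a placeholder file name.

```python
from private_eval.data import HUMAN_EVAL
from private_eval.evaluation import evaluate_functional_correctness

# Arguments: sample_file, k values, n_workers, timeout, problem_file.
results = evaluate_functional_correctness("samples.jsonl", [1, 10, 100], 4, 3.0, HUMAN_EVAL)
print(results)  # pass@k estimates
```
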
/cert/pandas-numpy-eval/pandas_numpy_eval/evaluate_functional_correctness.py:
--------------------------------------------------------------------------------
1 | import fire
2 | import sys
3 |
4 | from pandas_numpy_eval.data import HUMAN_EVAL
5 | from pandas_numpy_eval.evaluation import evaluate_functional_correctness
6 |
7 |
8 | def entry_point(
9 | sample_file: str,
10 | k: str = "1,10,100",
11 | n_workers: int = 4,
12 | timeout: float = 3.0,
13 | problem_file: str = HUMAN_EVAL,
14 | ):
15 | """
16 | Evaluates the functional correctness of generated samples, and writes
17 | results to f"{sample_file}_results.jsonl.gz"
18 | """
19 | k = list(map(int, k.split(",")))
20 | results = evaluate_functional_correctness(sample_file, k, n_workers, timeout, problem_file)
21 | print(results)
22 |
23 |
24 | def main():
25 | fire.Fire(entry_point)
26 |
27 |
28 | sys.exit(main())
29 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/run_encode_2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | LIBRARIES=( "pandas" "numpy" "monkey" "beatnum" "torchdata" )
4 | MODES=( "comment" "api" )
5 |
6 | for MODE in "${MODES[@]}"; do
7 | echo "Mode: $MODE"
8 | for LIBRARY in "${LIBRARIES[@]}"; do
9 | echo "Library: $LIBRARY"
10 | OUTDIR="../data/inference"
11 | MODEL_DIR="../outputs/APIRetrieverCheckPoint/"
12 | CORPUS_DIR="../data/inference"
13 | ENCODE_DIR="../data/inference"
14 | PER_BATCH_SIZE=50
15 |
16 | CUDA_VISIBLE_DEVICES=0 python -m dense.driver.encode \
17 | --output_dir=$OUTDIR \
18 | --model_name_or_path $MODEL_DIR \
19 | --fp16 \
20 | --per_device_eval_batch_size ${PER_BATCH_SIZE} \
21 | --local_rank -1 \
22 | --encode_in_path "${CORPUS_DIR}/${LIBRARY}_${MODE}.json" \
23 | --encoded_save_path "${ENCODE_DIR}/${LIBRARY}_${MODE}.pt"
24 | done
25 | done
26 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/dense/dataset/__init__.py:
--------------------------------------------------------------------------------
1 | from .processor import TrainProcessor, TestProcessor, CorpusProcessor
2 |
3 | PROCESSOR_INFO = {
4 | 'Tevatron/wikipedia-nq': {
5 | 'train': TrainProcessor,
6 | 'dev': TrainProcessor,
7 | 'test': TestProcessor,
8 | 'corpus': CorpusProcessor,
9 | },
10 | 'Tevatron/wikipedia-trivia': {
11 | 'train': TrainProcessor,
12 | 'dev': TrainProcessor,
13 | 'test': TestProcessor,
14 | 'corpus': CorpusProcessor,
15 | },
16 | 'Tevatron/msmarco-passage': {
17 | 'train': TrainProcessor,
18 | 'dev': TestProcessor,
19 | 'corpus': CorpusProcessor,
20 | },
21 | 'Tevatron/scifact': {
22 | 'train': TrainProcessor,
23 | 'dev': TestProcessor,
24 | 'test': TestProcessor,
25 | 'corpus': CorpusProcessor,
26 | },
27 | }
28 |
--------------------------------------------------------------------------------
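
A small usage sketch of the registry above (assuming the `dense` package is importable, e.g. after installing APIRetriever): the mapping simply resolves a dataset name and split to a processor class.

```python
from dense.dataset import PROCESSOR_INFO, TrainProcessor

# Look up the processor class registered for MS MARCO passage training data.
processor_cls = PROCESSOR_INFO['Tevatron/msmarco-passage']['train']
assert processor_cls is TrainProcessor
```
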
/apicoder/APIRetriever/build/lib/dense/dataset/__init__.py:
--------------------------------------------------------------------------------
1 | from .processor import TrainProcessor, TestProcessor, CorpusProcessor
2 |
3 | PROCESSOR_INFO = {
4 | 'Tevatron/wikipedia-nq': {
5 | 'train': TrainProcessor,
6 | 'dev': TrainProcessor,
7 | 'test': TestProcessor,
8 | 'corpus': CorpusProcessor,
9 | },
10 | 'Tevatron/wikipedia-trivia': {
11 | 'train': TrainProcessor,
12 | 'dev': TrainProcessor,
13 | 'test': TestProcessor,
14 | 'corpus': CorpusProcessor,
15 | },
16 | 'Tevatron/msmarco-passage': {
17 | 'train': TrainProcessor,
18 | 'dev': TestProcessor,
19 | 'corpus': CorpusProcessor,
20 | },
21 | 'Tevatron/scifact': {
22 | 'train': TrainProcessor,
23 | 'dev': TestProcessor,
24 | 'test': TestProcessor,
25 | 'corpus': CorpusProcessor,
26 | },
27 | }
28 |
--------------------------------------------------------------------------------
/apicoder/CodeGenAPI/scripts/run_details_apis.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # for example: "pandas,numpy,sklearn,tensorflow,keras"
4 | # PS: there must be no space after each comma
5 |
6 | # Third party libraries
7 | LIBRARIES="${LIBRARIES},pandas,numpy,sklearn,torch,tensorflow,django,selenium,matplotlib,flask,scipy,seaborn,nltk,beautifulsoup,pygame,PIL,jieba,gensim,spacy,transformers,fairseq,sqlalchemy,scrapy,allennlp,datasets,tokenizers"
8 | LIBRARIES="${LIBRARIES},mxnet,imageio,pytest,metpy,ansible,requests"
9 | # Built-in libraries
10 | # LIBRARIES="${LIBRARIES},datetime,zlib,random,math,sys,glob,os,urllib,time,re,json,unittest,collections,subprocess,copy,functools,itertools,six,threading"
11 | # LIBRARIES="${LIBRARIES},tempfile,io,pickle,pathlib,socket,struct,hashlib,traceback,csv,uuid,pprint"
12 |
13 | OUTPUT_DIR="data/API-Doc"
14 | PROCESS_NUM=16
15 |
16 | Run_Args="-o ${OUTPUT_DIR}"
17 | Run_Args="${Run_Args} -l ${LIBRARIES}"
18 | Run_Args="${Run_Args} -pn ${PROCESS_NUM}"
19 |
20 | echo "Run_Args: ${Run_Args}"
21 |
22 | python run_extract_details_from_apis.py ${Run_Args}
23 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/dense/faiss_retriever/retriever.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import faiss
3 |
4 | import logging
5 | logger = logging.getLogger(__name__)
6 |
7 |
8 | class BaseFaissIPRetriever:
9 | def __init__(self, init_reps: np.ndarray):
10 | index = faiss.IndexFlatIP(init_reps.shape[1])
11 | self.index = index
12 |
13 | def search(self, q_reps: np.ndarray, k: int):
14 | return self.index.search(q_reps, k)
15 |
16 | def add(self, p_reps: np.ndarray):
17 | self.index.add(p_reps)
18 |
19 | def batch_search(self, q_reps: np.ndarray, k: int, batch_size: int):
20 | num_query = q_reps.shape[0]
21 | all_scores = []
22 | all_indices = []
23 | for start_idx in range(0, num_query, batch_size):
24 | nn_scores, nn_indices = self.search(q_reps[start_idx: start_idx + batch_size], k)
25 | all_scores.append(nn_scores)
26 | all_indices.append(nn_indices)
27 | all_scores = np.concatenate(all_scores, axis=0)
28 | all_indices = np.concatenate(all_indices, axis=0)
29 |
30 | return all_scores, all_indices
--------------------------------------------------------------------------------
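
A minimal end-to-end sketch of the retriever above, with random float32 vectors standing in for the encoded representations that run_encode_2.sh produces (the shapes here are arbitrary):

```python
import numpy as np
from dense.faiss_retriever import BaseFaissIPRetriever

# Random stand-ins for encoded passage and query representations.
p_reps = np.random.rand(1000, 128).astype(np.float32)  # FAISS expects float32
q_reps = np.random.rand(16, 128).astype(np.float32)

retriever = BaseFaissIPRetriever(p_reps)  # fixes the index dimension (128)
retriever.add(p_reps)                     # index the passages
scores, indices = retriever.batch_search(q_reps, k=10, batch_size=4)
print(scores.shape, indices.shape)        # (16, 10) and (16, 10)
```
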
/apicoder/APIRetriever/build/lib/dense/faiss_retriever/retriever.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import faiss
3 |
4 | import logging
5 | logger = logging.getLogger(__name__)
6 |
7 |
8 | class BaseFaissIPRetriever:
9 | def __init__(self, init_reps: np.ndarray):
10 | index = faiss.IndexFlatIP(init_reps.shape[1])
11 | self.index = index
12 |
13 | def search(self, q_reps: np.ndarray, k: int):
14 | return self.index.search(q_reps, k)
15 |
16 | def add(self, p_reps: np.ndarray):
17 | self.index.add(p_reps)
18 |
19 | def batch_search(self, q_reps: np.ndarray, k: int, batch_size: int):
20 | num_query = q_reps.shape[0]
21 | all_scores = []
22 | all_indices = []
23 | for start_idx in range(0, num_query, batch_size):
24 | nn_scores, nn_indices = self.search(q_reps[start_idx: start_idx + batch_size], k)
25 | all_scores.append(nn_scores)
26 | all_indices.append(nn_indices)
27 | all_scores = np.concatenate(all_scores, axis=0)
28 | all_indices = np.concatenate(all_indices, axis=0)
29 |
30 | return all_scores, all_indices
--------------------------------------------------------------------------------
/apicoder/private-eval/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License
2 |
3 | Copyright (c) OpenAI (https://openai.com)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/cert/pandas-numpy-eval/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License
2 |
3 | Copyright (c) OpenAI (https://openai.com)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/cert/nl2code/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 | import os
4 | from transformers import AutoTokenizer
5 | from .code_dataset import CodeBlockDataset, CodeDatasetCallBack
6 |
7 | huggingface_model_mappings = {
8 | 'gpt-neo-125M'.lower() : 'EleutherAI/gpt-neo-125M',
9 | 'gpt-neo-1.3B'.lower() : 'EleutherAI/gpt-neo-1.3B'
10 | }
11 |
12 | _Proj_Abs_Dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
13 | _Data_Abs_Dir = os.path.join(_Proj_Abs_Dir, 'data')
14 |
15 | def load_pretrained_tokenizer(name_or_path: str):
16 | name_or_path = resolve_model_name_or_path(name_or_path)
17 | return AutoTokenizer.from_pretrained(name_or_path)
18 |
19 | def resolve_model_name_or_path(name_or_path: str):
20 | if name_or_path.lower() in huggingface_model_mappings:
21 | name_or_path = huggingface_model_mappings[name_or_path.lower()]
22 |
23 | data_dir = _Data_Abs_Dir if 'AMLT_DATA_DIR' not in os.environ else os.environ['AMLT_DATA_DIR']
24 | model_local_path = os.path.join(data_dir, 'pretrained_models', name_or_path)
25 | if os.path.exists(model_local_path):
26 | name_or_path = model_local_path
27 |
28 | return name_or_path
29 |
--------------------------------------------------------------------------------
/apicoder/CodeGenAPI/nl2code/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | from transformers import AutoTokenizer
3 | from .code_dataset import CodeBlockDataset, CodeDatasetCallBack
4 | from .code_dataset_codegen import CodeBlockDatasetCodeGen, CodeDatasetCallBackCodeGen
5 |
6 | huggingface_model_mappings = {
7 | 'gpt-neo-125M'.lower() : 'EleutherAI/gpt-neo-125M',
8 | 'gpt-neo-1.3B'.lower() : 'EleutherAI/gpt-neo-1.3B'
9 | }
10 |
11 | _Proj_Abs_Dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
12 | _Data_Abs_Dir = os.path.join(_Proj_Abs_Dir, 'data')
13 |
14 | def load_pretrained_tokenizer(name_or_path: str):
15 | name_or_path = resolve_model_name_or_path(name_or_path)
16 | return AutoTokenizer.from_pretrained(name_or_path)
17 |
18 | def resolve_model_name_or_path(name_or_path: str):
19 | if name_or_path.lower() in huggingface_model_mappings:
20 | name_or_path = huggingface_model_mappings[name_or_path.lower()]
21 |
22 | data_dir = _Data_Abs_Dir if 'AMLT_DATA_DIR' not in os.environ else os.environ['AMLT_DATA_DIR']
23 | model_local_path = os.path.join(data_dir, 'pretrained_models', name_or_path)
24 | if os.path.exists(model_local_path):
25 | name_or_path = model_local_path
26 |
27 | return name_or_path
28 |
--------------------------------------------------------------------------------
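
A usage sketch for the helpers above, assuming the `nl2code` package is on the path and the model can be fetched; the alias table and local-path fallback are exactly the ones defined in this file.

```python
from nl2code import load_pretrained_tokenizer

# 'gpt-neo-125M' is first mapped to 'EleutherAI/gpt-neo-125M'; if a copy exists
# under <data_dir>/pretrained_models/, that local directory is used instead.
tokenizer = load_pretrained_tokenizer("gpt-neo-125M")
print(tokenizer("def add(a, b):").input_ids)
```
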
/apicoder/CodeGenAPI/scripts/run_extract_apis.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export DJANGO_SETTINGS_MODULE=bay.settings
4 |
5 | # for example: "pandas,numpy,sklearn,tensorflow,keras"
6 | # PS: there must be no space after each comma
7 | # Third party libraries
8 | LIBRARIES="${LIBRARIES},pandas,numpy,sklearn,torch,tensorflow,django,selenium,matplotlib,flask,scipy,seaborn,nltk,beautifulsoup,pygame,PIL,jieba,gensim,spacy,transformers,fairseq,sqlalchemy,scrapy,allennlp,datasets,tokenizers,torchdata"
9 | LIBRARIES="${LIBRARIES},mxnet,imageio,pytest,metpy,ansible,requests"
10 | # Built-in libraries
11 | # LIBRARIES="${LIBRARIES},datetime,zlib,random,math,sys,glob,os,urllib,time,re,json,unittest,collections,subprocess,copy,functools,itertools,six,threading"
12 | # LIBRARIES="${LIBRARIES},tempfile,io,pickle,pathlib,socket,struct,hashlib,traceback,csv,uuid,pprint"
13 |
14 | ID=$(date +"%m%d")
15 | OUTPUT_DIR="data/API-Doc"
16 | PROCESS_NUM=16
17 | OVER_WRITE="True" # [True, False]
18 | GET_SIG="True" # [True, False]
19 |
20 | Run_Args="-o ${OUTPUT_DIR}"
21 | Run_Args="${Run_Args} -ls ${LIBRARIES}"
22 | Run_Args="${Run_Args} -pn ${PROCESS_NUM}"
23 | Run_Args="${Run_Args} -ow ${OVER_WRITE}"
24 | Run_Args="${Run_Args} -gs ${GET_SIG}"
25 |
26 | echo "Run_Args: ${Run_Args}"
27 |
28 | python -u extract_api.py ${Run_Args}
29 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/scripts/run_extract_apiretriever_corpus.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | DOMAIN="PrivateLibrary"
4 | # [train, valid]
5 | SPLIT="train"
6 | CONTAIN_BUILD_IN="False"
7 | # [True, False]
8 | IS_DEBUG="False"
9 |
10 | DATA_DIR="PrivateLibrary/data/Cleaned-Private-Code-Files"
11 | PRIVATE_DATA_DIR="PrivateLibrary/data/API-Doc"
12 |
13 | PRIVATE_LIBS="pandas,numpy,sklearn,torch,tensorflow,django,selenium,matplotlib,flask,scipy,seaborn,nltk,beautifulsoup,pygame,PIL,jieba,gensim,spacy,transformers,fairseq,sqlalchemy,scrapy,allennlp,datasets,tokenizers,mxnet,imageio,pytest,metpy,ansible,requests"
14 | BUILD_IN_LIBS="datetime,zlib,random,math,sys,glob,os,urllib,time,re,json,unittest,collections,subprocess,copy,functools,itertools,six,threading,tempfile,io,pickle,pathlib,socket,struct,hashlib,traceback,csv,uuid,pprint"
15 | MODEL_DIR="/your/codegen/checkpoints/codegen-350M-mono"
16 | OUTPUT_DIR="PrivateLibrary/APIRetriever/data/train/unprocessed-train-data"
17 |
18 | if [ $IS_DEBUG == "True" ]; then
19 | N_CPUS="1"
20 | else
21 | N_CPUS="8"
22 | fi
23 |
24 | if [ ! -z "$1" ]; then
25 | N_CPUS="$1"
26 | fi
27 |
28 | Args="-i $DATA_DIR --private_data_path ${PRIVATE_DATA_DIR} -o $OUTPUT_DIR -model $MODEL_DIR -t $N_CPUS -d $DOMAIN --private_libs ${PRIVATE_LIBS} --build_in_libs ${BUILD_IN_LIBS} -isdebug $IS_DEBUG --contain_build_in $CONTAIN_BUILD_IN"
29 | echo "Run encode_private for ${SPLIT} data: $Args"
30 |
31 | python extract_retrieval_api_corpus.py $Args -split ${SPLIT}
32 |
33 | echo "Done!"
--------------------------------------------------------------------------------
/cert/scripts/run_encode_domain.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # Licensed under the MIT license.
4 |
5 | ID=$(date +"%m%d")
6 | BASE_DATA_DIR="Your base data directory"
7 |
8 | # [Pandas, Numpy]
9 | DOMAIN="Pandas"
10 | # [normal, sketcher, generator]
11 | TYPE="generator"
12 | # [train, valid]
13 | SPLIT="valid"
14 | # [True, False]
15 | IS_DEBUG="False"
16 |
17 | # --------------------------------------------------------------------------
18 | # You should replace the following variables according to your own settings.
19 | # --------------------------------------------------------------------------
20 | DATA_DIR="${BASE_DATA_DIR}/datasets/CERT/${DOMAIN}/data"
21 | MODEL_DIR="${BASE_DATA_DIR}/models/pycodegpt-110M"
22 | OUTPUT_DIR="${BASE_DATA_DIR}/datasets/CERT/${DOMAIN}/${TYPE}_bin"
23 |
24 | if [ ! -z "$AMLT_DATA_DIR" ]; then
25 | echo "Run experiment on AMLT."
26 | BASE_DATA_DIR=$AMLT_DATA_DIR
27 | DATA_DIR="${BASE_DATA_DIR}/CERT/${DOMAIN}/data"
28 | MODEL_DIR="${BASE_DATA_DIR}/CERT/pycodegpt-110M"
29 | OUTPUT_DIR="${BASE_DATA_DIR}/CERT/${DOMAIN}/${TYPE}_bin"
30 | fi
31 |
32 | if [ $IS_DEBUG == "True" ]; then
33 | N_CPUS="1"
34 | else
35 | N_CPUS="20"
36 | fi
37 |
38 |
39 | if [ ! -z "$1" ]; then
40 | N_CPUS="$1"
41 | fi
42 |
43 | if [ ! -z "$2" ]; then
44 | echo "Using distributed nodes: $2"
45 | export DistributedNodes=$2
46 | fi
47 |
48 | if [ ! -z "$AMLT_DATA_DIR" ]; then
49 | echo "Run experiment on AMLT."
50 | fi
51 |
52 | Args="-i $DATA_DIR -o $OUTPUT_DIR -model $MODEL_DIR -t $N_CPUS -d $DOMAIN -type $TYPE -isdebug $IS_DEBUG"
53 | echo "Run encode_domain for ${SPLIT} data: $Args"
54 | python encode_domain.py $Args -split ${SPLIT}
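55 | 
56 | # NOTE (usage sketch, not in the original script): "$1" overrides N_CPUS
57 | # and "$2" sets DistributedNodes, e.g.
58 | #   bash scripts/run_encode_domain.sh 8 2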
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/dense/loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import Tensor
3 | from torch.nn import functional as F
4 | from torch import distributed as dist
5 |
6 |
7 | class SimpleContrastiveLoss:
8 | def __init__(self, n_target: int = 1):
9 | self.target_per_qry = n_target
10 |
11 | def __call__(self, x: Tensor, y: Tensor, target: Tensor = None, reduction: str = 'mean'):
12 | if target is None:
13 | assert x.size(0) * self.target_per_qry == y.size(0)
14 | target = torch.arange(
15 | 0, x.size(0) * self.target_per_qry, self.target_per_qry, device=x.device, dtype=torch.long)
16 | logits = torch.matmul(x, y.transpose(0, 1))
17 | return F.cross_entropy(logits, target, reduction=reduction)
18 |
19 |
20 | class DistributedContrastiveLoss(SimpleContrastiveLoss):
21 | def __init__(self, n_target: int = 0, scale_loss: bool = True):
22 | assert dist.is_initialized(), "Distributed training has not been properly initialized."
23 | super().__init__(n_target=n_target)
24 |         self.world_size = dist.get_world_size()
25 | self.rank = dist.get_rank()
26 | self.scale_loss = scale_loss
27 |
28 | def __call__(self, x: Tensor, y: Tensor, **kwargs):
29 | dist_x = self.gather_tensor(x)
30 | dist_y = self.gather_tensor(y)
31 | loss = super().__call__(dist_x, dist_y, **kwargs)
32 | if self.scale_loss:
33 |             loss = loss * self.world_size
34 | return loss
35 |
36 | def gather_tensor(self, t):
37 |         gathered = [torch.empty_like(t) for _ in range(self.world_size)]
38 | dist.all_gather(gathered, t)
39 | gathered[self.rank] = t
40 | return torch.cat(gathered, dim=0)
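41 | 
42 | 
43 | # NOTE: a minimal smoke-test sketch, not part of the original file. With
44 | # n_target passages per query, the positive for query i is passage
45 | # i * n_target; every other passage in the batch serves as an in-batch
46 | # negative.
47 | def _demo_simple_contrastive_loss():
48 |     q = torch.randn(4, 128)  # 4 query embeddings
49 |     p = torch.randn(8, 128)  # 2 passages per query, so n_target=2
50 |     loss = SimpleContrastiveLoss(n_target=2)(q, p)
51 |     print(f'contrastive loss: {loss.item():.4f}')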
--------------------------------------------------------------------------------
/apicoder/APIRetriever/build/lib/dense/loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import Tensor
3 | from torch.nn import functional as F
4 | from torch import distributed as dist
5 |
6 |
7 | class SimpleContrastiveLoss:
8 | def __init__(self, n_target: int = 1):
9 | self.target_per_qry = n_target
10 |
11 | def __call__(self, x: Tensor, y: Tensor, target: Tensor = None, reduction: str = 'mean'):
12 | if target is None:
13 | assert x.size(0) * self.target_per_qry == y.size(0)
14 | target = torch.arange(
15 | 0, x.size(0) * self.target_per_qry, self.target_per_qry, device=x.device, dtype=torch.long)
16 | logits = torch.matmul(x, y.transpose(0, 1))
17 | return F.cross_entropy(logits, target, reduction=reduction)
18 |
19 |
20 | class DistributedContrastiveLoss(SimpleContrastiveLoss):
21 | def __init__(self, n_target: int = 0, scale_loss: bool = True):
22 | assert dist.is_initialized(), "Distributed training has not been properly initialized."
23 | super().__init__(n_target=n_target)
24 |         self.world_size = dist.get_world_size()
25 | self.rank = dist.get_rank()
26 | self.scale_loss = scale_loss
27 |
28 | def __call__(self, x: Tensor, y: Tensor, **kwargs):
29 | dist_x = self.gather_tensor(x)
30 | dist_y = self.gather_tensor(y)
31 | loss = super().__call__(dist_x, dist_y, **kwargs)
32 | if self.scale_loss:
33 |             loss = loss * self.world_size
34 | return loss
35 |
36 | def gather_tensor(self, t):
37 |         gathered = [torch.empty_like(t) for _ in range(self.world_size)]
38 | dist.all_gather(gathered, t)
39 | gathered[self.rank] = t
40 | return torch.cat(gathered, dim=0)
--------------------------------------------------------------------------------
/apicoder/CodeGenAPI/scripts/run_encode_private_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | DOMAIN="PrivateLibrary"
4 | # [train, valid]
5 | SPLIT="train"
6 | CONTAIN_BUILD_IN="False"
7 | # [True, False]
8 | IS_DEBUG="False"
9 | # v1: normal
10 | # v2: # [start] ...
11 | # v3: # Please use these APIs ...
12 | STYLE="v2" # v2, v3, ... vn
13 | PERTURBATION_PROBABILITY=0.05 # [0.0 ~ 1.0]
14 |
15 | DATA_DIR="data/Cleaned-Private-Code-Files"
16 | PRIVATE_DATA_DIR="data/API-Doc"
17 |
18 | if [ $IS_DEBUG == "True" ]; then
19 | PRIVATE_LIBS="pandas,numpy,django"
20 | BUILD_IN_LIBS="datetime"
21 | else
22 | PRIVATE_LIBS="pandas,numpy,sklearn,torch,tensorflow,django,selenium,matplotlib,flask,scipy,seaborn,nltk,beautifulsoup,pygame,PIL,jieba,gensim,spacy,transformers,fairseq,sqlalchemy,scrapy,allennlp,datasets,tokenizers,mxnet,imageio,pytest,metpy,ansible,requests"
23 | BUILD_IN_LIBS="datetime,zlib,random,math,sys,glob,os,urllib,time,re,json,unittest,collections,subprocess,copy,functools,itertools,six,threading,tempfile,io,pickle,pathlib,socket,struct,hashlib,traceback,csv,uuid,pprint"
24 | fi
25 |
26 | MODEL_DIR="Your/models/codegen/checkpoints/codegen-350M-mono"
27 | OUTPUT_DIR="data/EncodedCorpus4CodeGenAPI"
28 |
29 | if [ $IS_DEBUG == "True" ]; then
30 | N_CPUS="1"
31 | else
32 | N_CPUS="8"
33 | fi
34 |
35 | if [ ! -z "$1" ]; then
36 | N_CPUS="$1"
37 | fi
38 |
39 | Args="-i $DATA_DIR --private_data_path ${PRIVATE_DATA_DIR} -o $OUTPUT_DIR -model $MODEL_DIR -t $N_CPUS -d $DOMAIN --private_libs ${PRIVATE_LIBS} --build_in_libs ${BUILD_IN_LIBS} -isdebug $IS_DEBUG --contain_build_in $CONTAIN_BUILD_IN -pp $PERTURBATION_PROBABILITY --style $STYLE"
40 | echo "Run encode_private for ${SPLIT} data: $Args"
41 |
42 | python encode_private_data.py $Args -split ${SPLIT}
43 | echo "Done!"
44 |
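45 | # NOTE (usage sketch, not in the original script): the optional positional
46 | # argument overrides N_CPUS, e.g. `bash scripts/run_encode_private_data.sh 16`.
47 | # STYLE selects how API hints are rendered into the corpus (the v2/v3 formats
48 | # above) and PERTURBATION_PROBABILITY is forwarded as -pp to encode_private_data.py.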
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/dense/faiss_retriever/reducer.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import torch
3 | import faiss
4 | from argparse import ArgumentParser
5 | from tqdm import tqdm
6 | from typing import List, Iterable, Tuple
7 | from numpy import ndarray
8 |
9 |
10 | def combine_faiss_results(results: Iterable[Tuple[ndarray, ndarray]]):
11 | rh = None
12 | for scores, indices in results:
13 | if rh is None:
14 | print(f'Initializing Heap. Assuming {scores.shape[0]} queries.')
15 | rh = faiss.ResultHeap(scores.shape[0], scores.shape[1])
16 | rh.add_result(-scores, indices)
17 | rh.finalize()
18 | corpus_scores, corpus_indices = -rh.D, rh.I
19 |
20 | return corpus_scores, corpus_indices
21 |
22 |
23 | def write_ranking(corpus_indices, corpus_scores, q_lookup, ranking_save_file):
24 | with open(ranking_save_file, 'w') as f:
25 | for qid, q_doc_scores, q_doc_indices in zip(q_lookup, corpus_scores, corpus_indices):
26 | score_list = [(s, idx) for s, idx in zip(q_doc_scores, q_doc_indices)]
27 | score_list = sorted(score_list, key=lambda x: x[0], reverse=True)
28 | for s, idx in score_list:
29 | f.write(f'{qid}\t{idx}\t{s}\n')
30 |
31 |
32 | def main():
33 | parser = ArgumentParser()
34 | parser.add_argument('--score_dir', required=True)
35 | parser.add_argument('--query', required=True)
36 | parser.add_argument('--save_ranking_to', required=True)
37 | args = parser.parse_args()
38 |
39 | partitions = glob.glob(f'{args.score_dir}/*')
40 |
41 | corpus_scores, corpus_indices = combine_faiss_results(map(torch.load, tqdm(partitions)))
42 |
43 | _, q_lookup = torch.load(args.query)
44 | write_ranking(corpus_indices, corpus_scores, q_lookup, args.save_ranking_to)
45 |
46 |
47 | if __name__ == '__main__':
48 | main()
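49 | 
50 | 
51 | # NOTE (hypothetical paths, not in the original file): typical usage, after
52 | # each search shard has saved a (scores, indices) pair via torch.save:
53 | #   python -m dense.faiss_retriever.reducer \
54 | #       --score_dir intermediate/shard_scores \
55 | #       --query encoding/qry.pt \
56 | #       --save_ranking_to ranking.txt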
--------------------------------------------------------------------------------
/apicoder/APIRetriever/build/lib/dense/faiss_retriever/reducer.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import torch
3 | import faiss
4 | from argparse import ArgumentParser
5 | from tqdm import tqdm
6 | from typing import List, Iterable, Tuple
7 | from numpy import ndarray
8 |
9 |
10 | def combine_faiss_results(results: Iterable[Tuple[ndarray, ndarray]]):
11 | rh = None
12 | for scores, indices in results:
13 | if rh is None:
14 | print(f'Initializing Heap. Assuming {scores.shape[0]} queries.')
15 | rh = faiss.ResultHeap(scores.shape[0], scores.shape[1])
16 | rh.add_result(-scores, indices)
17 | rh.finalize()
18 | corpus_scores, corpus_indices = -rh.D, rh.I
19 |
20 | return corpus_scores, corpus_indices
21 |
22 |
23 | def write_ranking(corpus_indices, corpus_scores, q_lookup, ranking_save_file):
24 | with open(ranking_save_file, 'w') as f:
25 | for qid, q_doc_scores, q_doc_indices in zip(q_lookup, corpus_scores, corpus_indices):
26 | score_list = [(s, idx) for s, idx in zip(q_doc_scores, q_doc_indices)]
27 | score_list = sorted(score_list, key=lambda x: x[0], reverse=True)
28 | for s, idx in score_list:
29 | f.write(f'{qid}\t{idx}\t{s}\n')
30 |
31 |
32 | def main():
33 | parser = ArgumentParser()
34 | parser.add_argument('--score_dir', required=True)
35 | parser.add_argument('--query', required=True)
36 | parser.add_argument('--save_ranking_to', required=True)
37 | args = parser.parse_args()
38 |
39 | partitions = glob.glob(f'{args.score_dir}/*')
40 |
41 | corpus_scores, corpus_indices = combine_faiss_results(map(torch.load, tqdm(partitions)))
42 |
43 | _, q_lookup = torch.load(args.query)
44 | write_ranking(corpus_indices, corpus_scores, q_lookup, args.save_ranking_to)
45 |
46 |
47 | if __name__ == '__main__':
48 | main()
--------------------------------------------------------------------------------
/apicoder/CodeGenAPI/scripts/get_comments_from_evallibs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | #
4 | # @Author: Daoguang Zan, @Mentor: Bei Chen, Jian-Guang Lou
5 | # @Copyright 2022 The Microsoft Research Asia (DKI Group). All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | import json
19 | import os
20 | import sys
21 | import re
22 |
23 | from get_libs_info_from_code import (
24 | normalizer_api_desp,
25 | get_first_sentence_from_api_desp,
26 | extract_main_comment_from_code
27 | )
28 |
29 | def judge_is_what_type_annotation(code: str) -> str:
30 |     types = ["pound", "inverted commas"] # pound: #, inverted commas: """
31 |     if "#" in code:
32 |         return types[0]
33 |     else:
34 |         return types[1]
35 |
36 | def get_comments_from_code(code: str) -> str:
37 | """
38 | Get comments from code.
39 | ---
40 | Args:
41 | Code: raw code from PandasEval, NumpyEval, etc.
42 | Returns:
43 | Comments: comments from code.
44 | """
45 | comment_type = judge_is_what_type_annotation(code)
46 | if comment_type == "pound":
47 | code_splited = code.split("\n")
48 | code_comment_str = ""
49 | for line in code_splited:
50 | if "#" in line:
51 |                 code_comment_str += (" " if code_comment_str != "" else "") + line.replace("#", "").strip()
52 | return normalizer_api_desp(code_comment_str)
53 | else:
54 | return normalizer_api_desp(extract_main_comment_from_code(code)).replace("\"\"\"", '').replace("\'\'\'", '').strip()
55 |
56 |
57 | if __name__ == '__main__':
58 | pass
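59 | 
60 | # NOTE: an assumed input/output sketch, not in the original file, modulo the
61 | # normalizer_api_desp post-processing:
62 | #   get_comments_from_code("# find the max value\nresult = df.max()")
63 | #   -> "find the max value"    (pound-style comments are stripped and joined)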
--------------------------------------------------------------------------------
/apicoder/private-eval/data/numpy_keywords.jsonl:
--------------------------------------------------------------------------------
1 | {
2 | "to_numpy": "to_beatnum",
3 | "ndarray": "ndnumset",
4 | "array": "numset",
5 | "numpy": "beatnum",
6 | "transpose": "switching_places",
7 | "Numpy": "Beatnum",
8 | "np": "bn",
9 | "column_stack": "stack_col",
10 | "concatenate": "connect",
11 | "slice": "piece",
12 | "sum": "total_count",
13 | "imag": "imaginary",
14 | "abs": "absolute",
15 | "real": "reality",
16 | "fill_diagonal": "pad_diagonal",
17 | "all": "total",
18 | "fromstring": "come_from_str",
19 | "in1d": "intersection1dim",
20 | "mean": "average",
21 | "where": "filter_condition",
22 | "std": "standard_op",
23 | "reshape": "change_shape_to",
24 | "fromarrays": "come_from_arrays",
25 | "stack": "pile_operation",
26 | "histogram": "hist_operation",
27 | "cumsum": "cumulative_sum",
28 | "setxor1d": "seting_exclusive_or_one_dim",
29 | "add": "add_concat",
30 | "filled": "masked_fill",
31 | "compressed": "remove_masked_data",
32 | "astype": "convert_type",
33 | "argmin": "get_argmin_value",
34 | "arange": "arr_range",
35 | "argmax": "get_argmax",
36 | "vstack": "vertical_stack",
37 | "hstack": "horizontal_stack",
38 | "squeeze": "sqz",
39 | "asarray": "asnumset",
40 | "repeat": "duplicate",
41 | "unravel_index": "convert_index_or_arr",
42 | "vectorize": "vectorisation",
43 | "split": "sep_split",
44 | "diff": "difference",
45 | "logical_and": "logic_and_element_wise",
46 | "flatten": "convert_into_one_dim",
47 | "unique": "uniq",
48 | "norm": "normlizattion",
49 | "delete": "remove_operation",
50 | "ones": "create_ones",
51 | "bincount": "binoccurrence",
52 | "append": "apd",
53 | "any": "any_condition",
54 | "isnan": "ifnan",
55 | "argpartition": "perform_partition",
56 | "ravel": "asview",
57 | "array_split": "split_array",
58 | "inv": "inverse",
59 | "insert": "stick",
60 | "searchsorted": "find_sorted",
61 | "min": "get_min",
62 | "max": "get_max",
63 | "full": "full_value_func"
64 | }
--------------------------------------------------------------------------------
/apicoder/private-eval/data/pandas_keywords.jsonl:
--------------------------------------------------------------------------------
1 | {
2 | "isnull": "ifnull",
3 | "mean": "average",
4 | "pandas": "monkey",
5 | "dataframe": "knowledgeframe",
6 | "df": "kf",
7 | "isin": "incontain",
8 | "pd": "mk",
9 | "DataFrame": "KnowledgeFrame",
10 | "rename": "renaming",
11 | "drop": "sip",
12 | "Pandas": "Monkey",
13 | "tolist": "convert_list",
14 | "apply": "employ",
15 | "to_numeric": "to_num",
16 | "dropna": "sipna",
17 | "append": "adding",
18 | "tail": "last_tail",
19 | "copy": "clone",
20 | "groupby": "grouper",
21 | "sum": "total_sum",
22 | "Series": "Collections",
23 | "series": "collections",
24 | "innull": "isnone",
25 | "astype": "totype",
26 | "select_dtypes": "choose_dtypes",
27 | "iterrows": "traversal",
28 | "min": "get_min",
29 | "max": "get_max",
30 | "map": "mapping",
31 | "nlargest": "nbiggest",
32 | "unique": "distinctive",
33 | "ravel": "flat_underlying",
34 | "sort_values": "sort_the_values",
35 | "last": "final_item",
36 | "shift": "shifting",
37 | "merge": "unioner",
38 | "value_counts": "counts_value_num",
39 | "rename_axis": "renaming_axis",
40 | "reset_index": "reseting_index",
41 | "sample": "sample_by_num",
42 | "replace": "replacing",
43 | "to_datetime": "convert_datetime",
44 | "any": "whatever",
45 | "reindex": "reindexing",
46 | "concat": "concating",
47 | "to_dict": "convert_dict",
48 | "cumsum": "cumulative_sum",
49 | "sort_index": "sorting_index",
50 | "to_string": "convert_string",
51 | "drop_duplicates": "remove_duplicates",
52 | "duplicated": "duplicated_values",
53 | "len": "length",
54 | "isna": "ifna",
55 | "fillna": "fillnone",
56 | "get": "getting",
57 | "round": "value_round",
58 | "format": "formating",
59 | "to_pydatetime": "convert_pydatetime",
60 | "div": "division",
61 | "ceil": "ceiling",
62 | "assign": "allocate",
63 | "intersection": "interst",
64 | "head": "header_num",
65 | "applymap": "conduct_map",
66 | "all": "total_all",
67 | "std": "standard"
68 | }
--------------------------------------------------------------------------------
/apicoder/CodeGenAPI/run_generating_codes.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # [True, False]
4 | HUMAN_IN_THE_LOOP="False"
5 | # ["_no", "_make_sense"]
6 | MAKE_SENSE="_no"
7 | # [machine, top3_perfect, top4_perfect, top5_perfect, human_labelled]
8 | USER_NAME="machine"
9 | # [0, 1, 2, 3, 5, "n"]
10 | API_NUMBER=0
11 | # [Pandas, Numpy, Monkey, BeatNum, TorchData]
12 | DOMAIN="TorchData"
13 | # [CodeGen, API_Coder] [codet5, CodeGPT, CodeClippy, CodeParrot]
14 | MODEL_VERSION="CodeGen"
15 | TEMP=$1
16 |
17 | BASE_DIR="your/base/dir"
18 |
19 | if [ ${MODEL_VERSION} == "CodeGen" ]; then
20 | NUM_SAMPLES="1"
21 |     MAX_TOKENS="100"
22 | TOP_P="0.9"
23 | CKPT_NAME="${BASE_DIR}/codegen-350M-mono"
24 |     Run_Args="-model $CKPT_NAME -t $TEMP -p $TOP_P -l $MAX_TOKENS -n $NUM_SAMPLES -d $DOMAIN -mv $MODEL_VERSION --api_number $API_NUMBER --human_in_the_loop $HUMAN_IN_THE_LOOP --user_name $USER_NAME --make_sense $MAKE_SENSE"
25 | echo "Run Args: $Run_Args"
26 | python eval_private.py ${Run_Args}
27 | elif [ ${MODEL_VERSION} == "API_Coder" ]; then
28 | NUM_SAMPLES="200"
29 |     MAX_TOKENS="100"
30 | TOP_P="0.9"
31 | CKPT_NAME="${BASE_DIR}/CodeGenAPI-350M-mono"
32 |     Run_Args="-model $CKPT_NAME -t $TEMP -p $TOP_P -l $MAX_TOKENS -n $NUM_SAMPLES -d $DOMAIN -mv $MODEL_VERSION --api_number $API_NUMBER --human_in_the_loop $HUMAN_IN_THE_LOOP --user_name $USER_NAME --make_sense $MAKE_SENSE"
33 | echo "Run Args: $Run_Args"
34 | python eval_private.py ${Run_Args}
35 | elif [ ${MODEL_VERSION} == "codet5" ]; then
36 | python eval_baseline.py -m "$BASE_DIR/codet5-base" -temp $TEMP -type codet5 -lib $DOMAIN --api_number $API_NUMBER --user_name $USER_NAME
37 | elif [ ${MODEL_VERSION} == "CodeGPT" ]; then
38 | python eval_baseline.py -m "$BASE_DIR/CodeGPT-small-py-adaptedGPT2" -temp $TEMP -type gpt2 -lib $DOMAIN --api_number $API_NUMBER --user_name $USER_NAME
39 | elif [ ${MODEL_VERSION} == "CodeClippy" ]; then
40 | python eval_baseline.py -m "$BASE_DIR/gpt-neo-125M-code-clippy" -temp $TEMP -type gpt-neo -lib $DOMAIN --api_number $API_NUMBER --user_name $USER_NAME
41 | elif [ ${MODEL_VERSION} == "CodeParrot" ]; then
42 | python eval_baseline.py -m "$BASE_DIR/codeparrot-small" -temp $TEMP -type gpt2 -lib $DOMAIN --api_number $API_NUMBER --user_name $USER_NAME
43 | fi
44 |
45 | echo "All Done!"
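46 | 
47 | # NOTE (usage sketch, not in the original script): the sampling temperature
48 | # is the only positional argument, e.g.
49 | #   bash run_generating_codes.sh 0.8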
--------------------------------------------------------------------------------
/cert/pandas-numpy-eval/pandas_numpy_eval/data.py:
--------------------------------------------------------------------------------
1 | from typing import Iterable, Dict
2 | import gzip
3 | import json
4 | import os
5 |
6 |
7 | ROOT = os.path.dirname(os.path.abspath(__file__))
8 |
9 | # --------------------------------------------------------
10 | # You can choose from the two options "pandas" or "numpy".
11 | # --------------------------------------------------------
12 | LIB = "pandas"
13 | assert LIB == "pandas" or LIB == "numpy"
14 | HUMAN_EVAL = os.path.join(ROOT, "..", "data", "PandasEval.jsonl.gz") if LIB == "pandas" else os.path.join(ROOT, "..", "data", "NumpyEval.jsonl.gz")
15 |
16 | print("***"*20)
17 | print("load eval from {}".format(HUMAN_EVAL.split('/')[-1].replace(".jsonl.gz", "")))
18 | print("***"*20)
19 |
20 | def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]:
21 | """
22 | Reads the problems from the evaluation set
23 | """
24 | return {task["task_id"]: task for task in stream_jsonl(evalset_file)}
25 |
26 |
27 | def stream_jsonl(filename: str) -> Iterable[Dict]:
28 | """
29 | Parses each jsonl line and yields it as a dictionary
30 | """
31 | if filename.endswith(".gz"):
32 | with open(filename, "rb") as gzfp:
33 | with gzip.open(gzfp, 'rt') as fp:
34 | for line in fp:
35 | if any(not x.isspace() for x in line):
36 | yield json.loads(line)
37 | else:
38 | with open(filename, "r") as fp:
39 | for line in fp:
40 | if any(not x.isspace() for x in line):
41 | yield json.loads(line)
42 |
43 |
44 | def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
45 | """
46 | Writes an iterable of dictionaries to jsonl
47 | """
48 | if append:
49 | mode = 'ab'
50 | else:
51 | mode = 'wb'
52 | filename = os.path.expanduser(filename)
53 | if filename.endswith(".gz"):
54 | with open(filename, mode) as fp:
55 | with gzip.GzipFile(fileobj=fp, mode='wb') as gzfp:
56 | for x in data:
57 | gzfp.write((json.dumps(x) + "\n").encode('utf-8'))
58 | else:
59 | with open(filename, mode) as fp:
60 | for x in data:
61 | fp.write((json.dumps(x) + "\n").encode('utf-8'))
62 |
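63 | # NOTE: an assumed round-trip sketch, not in the original file.
64 | # problems = read_problems()                       # {task_id: task_dict}
65 | # write_jsonl("samples.jsonl", ({"task_id": tid, "completion": "..."} for tid in problems))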
--------------------------------------------------------------------------------
/.github/workflows/codeql.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 |
14 | on:
15 | push:
16 | branches: [ main ]
17 | pull_request:
18 | # The branches below must be a subset of the branches above
19 | branches: [ main ]
20 | schedule:
21 | - cron: '23 6 * * 0'
22 |
23 | jobs:
24 | analyze:
25 | name: Analyze
26 | runs-on: ubuntu-latest
27 | permissions:
28 | actions: read
29 | contents: read
30 | security-events: write
31 |
32 | strategy:
33 | fail-fast: false
34 | matrix:
35 | language: [ 'python' ]
36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support
38 |
39 | steps:
40 | - name: Checkout repository
41 | uses: actions/checkout@v2
42 |
43 | # Initializes the CodeQL tools for scanning.
44 | - name: Initialize CodeQL
45 | uses: github/codeql-action/init@v1
46 | with:
47 | languages: ${{ matrix.language }}
48 | # If you wish to specify custom queries, you can do so here or in a config file.
49 | # By default, queries listed here will override any specified in a config file.
50 | # Prefix the list here with "+" to use these queries and those in the config file.
51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main
52 |
53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
54 | # If this step fails, then you should remove it and run the build manually (see below)
55 | - name: Autobuild
56 | uses: github/codeql-action/autobuild@v1
57 |
58 | # ℹ️ Command-line programs to run using the OS shell.
59 | # 📚 https://git.io/JvXDl
60 |
61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
62 | # and modify them (or add more) to build your code if your project
63 | # uses a compiled language
64 |
65 | #- run: |
66 | # make bootstrap
67 | # make release
68 |
69 | - name: Perform CodeQL Analysis
70 | uses: github/codeql-action/analyze@v1
71 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/dense/faiss_retriever/__main__.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import glob
4 | from argparse import ArgumentParser
5 | from itertools import chain
6 | from tqdm import tqdm
7 |
8 | from .retriever import BaseFaissIPRetriever
9 | from .reducer import write_ranking
10 |
11 | import logging
12 | logger = logging.getLogger(__name__)
13 | logging.basicConfig(
14 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
15 | datefmt="%m/%d/%Y %H:%M:%S",
16 | level=logging.INFO,
17 | )
18 |
19 |
20 | def search_queries(retriever, q_reps, p_lookup, args):
21 | if args.batch_size > 0:
22 | all_scores, all_indices = retriever.batch_search(q_reps, args.depth, args.batch_size)
23 | else:
24 | all_scores, all_indices = retriever.search(q_reps, args.depth)
25 |
26 | psg_indices = [[int(p_lookup[x]) for x in q_dd] for q_dd in all_indices]
27 | psg_indices = np.array(psg_indices)
28 | return all_scores, psg_indices
29 |
30 |
31 | def main():
32 | parser = ArgumentParser()
33 | parser.add_argument('--query_reps', required=True)
34 | parser.add_argument('--passage_reps', required=True)
35 | parser.add_argument('--batch_size', type=int, default=128)
36 | parser.add_argument('--depth', type=int, default=1000)
37 | parser.add_argument('--save_ranking_to', required=True)
38 | parser.add_argument('--save_text', action='store_true')
39 |
40 | args = parser.parse_args()
41 |
42 | index_files = glob.glob(args.passage_reps)
43 | logger.info(f'Pattern match found {len(index_files)} files; loading them into index.')
44 |
45 | p_reps_0, p_lookup_0 = torch.load(index_files[0])
46 | retriever = BaseFaissIPRetriever(p_reps_0.float().numpy())
47 |
48 | shards = chain([(p_reps_0, p_lookup_0)], map(torch.load, index_files[1:]))
49 | if len(index_files) > 1:
50 | shards = tqdm(shards, desc='Loading shards into index', total=len(index_files))
51 | look_up = []
52 | for p_reps, p_lookup in shards:
53 | retriever.add(p_reps.float().numpy())
54 | look_up += p_lookup
55 |
56 | q_reps, q_lookup = torch.load(args.query_reps)
57 | q_reps = q_reps.float().numpy()
58 |
59 | logger.info('Index Search Start')
60 | all_scores, psg_indices = search_queries(retriever, q_reps, look_up, args)
61 | logger.info('Index Search Finished')
62 |
63 | if args.save_text:
64 | write_ranking(psg_indices, all_scores, q_lookup, args.save_ranking_to)
65 | else:
66 | torch.save((all_scores, psg_indices), args.save_ranking_to)
67 |
68 |
69 | if __name__ == '__main__':
70 | main()
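71 | 
72 | 
73 | # NOTE (hypothetical paths, not in the original file): typical invocation;
74 | # --passage_reps accepts a glob over encoded corpus shards:
75 | #   python -m dense.faiss_retriever \
76 | #       --query_reps encoding/qry.pt \
77 | #       --passage_reps 'encoding/corpus_*.pt' \
78 | #       --depth 100 --batch_size 256 \
79 | #       --save_ranking_to ranking.txt --save_text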
--------------------------------------------------------------------------------
/apicoder/APIRetriever/build/lib/dense/faiss_retriever/__main__.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import glob
4 | from argparse import ArgumentParser
5 | from itertools import chain
6 | from tqdm import tqdm
7 |
8 | from .retriever import BaseFaissIPRetriever
9 | from .reducer import write_ranking
10 |
11 | import logging
12 | logger = logging.getLogger(__name__)
13 | logging.basicConfig(
14 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
15 | datefmt="%m/%d/%Y %H:%M:%S",
16 | level=logging.INFO,
17 | )
18 |
19 |
20 | def search_queries(retriever, q_reps, p_lookup, args):
21 | if args.batch_size > 0:
22 | all_scores, all_indices = retriever.batch_search(q_reps, args.depth, args.batch_size)
23 | else:
24 | all_scores, all_indices = retriever.search(q_reps, args.depth)
25 |
26 | psg_indices = [[int(p_lookup[x]) for x in q_dd] for q_dd in all_indices]
27 | psg_indices = np.array(psg_indices)
28 | return all_scores, psg_indices
29 |
30 |
31 | def main():
32 | parser = ArgumentParser()
33 | parser.add_argument('--query_reps', required=True)
34 | parser.add_argument('--passage_reps', required=True)
35 | parser.add_argument('--batch_size', type=int, default=128)
36 | parser.add_argument('--depth', type=int, default=1000)
37 | parser.add_argument('--save_ranking_to', required=True)
38 | parser.add_argument('--save_text', action='store_true')
39 |
40 | args = parser.parse_args()
41 |
42 | index_files = glob.glob(args.passage_reps)
43 | logger.info(f'Pattern match found {len(index_files)} files; loading them into index.')
44 |
45 | p_reps_0, p_lookup_0 = torch.load(index_files[0])
46 | retriever = BaseFaissIPRetriever(p_reps_0.float().numpy())
47 |
48 | shards = chain([(p_reps_0, p_lookup_0)], map(torch.load, index_files[1:]))
49 | if len(index_files) > 1:
50 | shards = tqdm(shards, desc='Loading shards into index', total=len(index_files))
51 | look_up = []
52 | for p_reps, p_lookup in shards:
53 | retriever.add(p_reps.float().numpy())
54 | look_up += p_lookup
55 |
56 | q_reps, q_lookup = torch.load(args.query_reps)
57 | q_reps = q_reps.float().numpy()
58 |
59 | logger.info('Index Search Start')
60 | all_scores, psg_indices = search_queries(retriever, q_reps, look_up, args)
61 | logger.info('Index Search Finished')
62 |
63 | if args.save_text:
64 | write_ranking(psg_indices, all_scores, q_lookup, args.save_ranking_to)
65 | else:
66 | torch.save((all_scores, psg_indices), args.save_ranking_to)
67 |
68 |
69 | if __name__ == '__main__':
70 | main()
--------------------------------------------------------------------------------
/cert/run_generating_codes.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # Licensed under the MIT license.
4 |
5 | # [Pandas, Numpy]
6 | DOMAIN="Pandas"
7 |
8 | # [PYCODEGPT, CERT]
9 | MODEL_VERSION="PYCODEGPT"
10 |
11 | BASE_DIR="On your local machine, change this to your base data directory."
12 |
13 | # --------------------------------------------------------------------------
14 | # You should replace the following variables according to your own settings.
15 | # --------------------------------------------------------------------------
16 | if [ ${DOMAIN} == "Pandas" ]; then
17 | if [ ${MODEL_VERSION} == "PYCODEGPT" ]; then
18 | TEMP="1.0"
19 | NUM_SAMPLES="1"
20 |         MAX_TOKENS="100"
21 | TOP_P="0.9"
22 | CKPT_NAME="${BASE_DIR}/pycodegpt-110M"
23 |         Run_Args="-model $CKPT_NAME -t $TEMP -p $TOP_P -l $MAX_TOKENS -n $NUM_SAMPLES -d $DOMAIN -mv $MODEL_VERSION"
24 | echo "Run Args: $Run_Args"
25 | python eval_cert.py ${Run_Args}
26 | elif [ ${MODEL_VERSION} == "CERT" ]; then
27 | TEMP="1.0"
28 | TEMP2="1.0"
29 | NUM_SAMPLES="1"
30 |         MAX_TOKENS="100"
31 | TOP_P="0.9"
32 | CKPT_NAME_SKETCHER="${BASE_DIR}/sketcher-pandas"
33 | CKPT_NAME_GENERATOR="${BASE_DIR}/generator-pandas"
34 |         Run_Args="-model $CKPT_NAME_SKETCHER -model2 $CKPT_NAME_GENERATOR -t $TEMP -t2 $TEMP2 -p $TOP_P -l $MAX_TOKENS -n $NUM_SAMPLES -d $DOMAIN -mv $MODEL_VERSION"
35 | echo "Run Args: $Run_Args"
36 | python eval_cert_unified.py ${Run_Args}
37 | fi
38 | elif [ ${DOMAIN} == "Numpy" ]; then
39 | if [ ${MODEL_VERSION} == "PYCODEGPT" ]; then
40 | TEMP="1.0"
41 | NUM_SAMPLES="1"
42 |         MAX_TOKENS="100"
43 | TOP_P="0.9"
44 | CKPT_NAME="${BASE_DIR}/pycodegpt-110M"
45 |         Run_Args="-model $CKPT_NAME -t $TEMP -p $TOP_P -l $MAX_TOKENS -n $NUM_SAMPLES -d $DOMAIN -mv $MODEL_VERSION"
46 | echo "Run Args: $Run_Args"
47 | python eval_cert.py ${Run_Args}
48 | elif [ ${MODEL_VERSION} == "CERT" ]; then
49 | TEMP="1.0"
50 | TEMP2="0.2"
51 | NUM_SAMPLES="1"
52 |         MAX_TOKENS="100"
53 | TOP_P="0.9"
54 | CKPT_NAME_SKETCHER="${BASE_DIR}/sketcher-numpy"
55 | CKPT_NAME_GENERATOR="${BASE_DIR}/generator-numpy"
56 |         Run_Args="-model $CKPT_NAME_SKETCHER -model2 $CKPT_NAME_GENERATOR -t $TEMP -t2 $TEMP2 -p $TOP_P -l $MAX_TOKENS -n $NUM_SAMPLES -d $DOMAIN -mv $MODEL_VERSION"
57 | echo "Run Args: $Run_Args"
58 | python eval_cert_unified.py ${Run_Args}
59 | fi
60 | fi
61 |
62 | echo "All Done!"
63 |
--------------------------------------------------------------------------------
/apicoder/CodeGenAPI/APICoder/get_lib_comment_for_eval.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | #
4 | # @Author: Daoguang Zan, @Mentor: Bei Chen, Jian-Guang Lou
5 | # @Copyright 2022 The Microsoft Research Asia (DKI Group). All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | from typing import List
19 | import json
20 | import gzip
21 | import os
22 | import sys
23 | sys.path.append("..")
24 | from scripts.get_comments_from_evallibs import get_comments_from_code
25 | # remove the sys path ".." to avoid the conflict with the other scripts
26 | sys.path.remove("..")
27 |
28 | def get_one_instance_by_lib_name(library_name: str, base_dir: str):
29 | """
30 | Get an iterative object based on lib_name
31 | """
32 | base_dir = os.path.join(base_dir, "eval_datas")
33 | library_path = os.path.join(base_dir, f"real_{library_name}_eval_v2.jsonl.gz")
34 |
35 | library_reader = gzip.open(library_path, "rb")
36 | for line in library_reader:
37 | line = line.decode("utf-8")
38 | line_dict = json.loads(line)
39 | yield line_dict
40 |
41 | def get_code_and_comment_by_lib_name_and_task_id(
42 | library_name: str,
43 | query_task_id: str,
44 | base_dir: str
45 | ):
46 | """
47 | Get code, comments and solutions based on lib_name and task_id.
48 | """
49 | # base_dir = f"/mnt/v-dzan/datasets/CERT/eval_datas"
50 | base_dir = os.path.join(base_dir, "eval_datas")
51 | library_path = os.path.join(base_dir, f"real_{library_name}_eval_v3.jsonl.gz")
52 |
53 | library_reader = gzip.open(library_path, "rb")
54 | for line in library_reader:
55 | line = line.decode("utf-8")
56 | line_dict = json.loads(line)
57 | task_id = line_dict["task_id"]
58 | if task_id == query_task_id:
59 | code = line_dict["prompt"]
60 | solution = line_dict["canonical_solution"][0]
61 | code_comment = get_comments_from_code(code)
62 | library_reader.close()
63 | return [code, code_comment, solution]
64 |
65 | library_reader.close()
66 | return ["", "", ""]
67 |
68 |
69 | if __name__ == "__main__":
70 | print("Passed!")
71 | pass
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
8 |
9 | ## Reporting Security Issues
10 |
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 |
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14 |
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16 |
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
18 |
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 |
21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | * Full paths of source file(s) related to the manifestation of the issue
23 | * The location of the affected source code (tag/branch/commit or direct URL)
24 | * Any special configuration required to reproduce the issue
25 | * Step-by-step instructions to reproduce the issue
26 | * Proof-of-concept or exploit code (if possible)
27 | * Impact of the issue, including how an attacker might exploit the issue
28 |
29 | This information will help us triage your report more quickly.
30 |
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32 |
33 | ## Preferred Languages
34 |
35 | We prefer all communications to be in English.
36 |
37 | ## Policy
38 |
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40 |
41 |
42 |
--------------------------------------------------------------------------------
/apicoder/private-eval/private_eval/data.py:
--------------------------------------------------------------------------------
1 | from typing import Iterable, Dict
2 | import gzip
3 | import json
4 | import os
5 | import ipdb
6 |
7 | ROOT = os.path.dirname(os.path.abspath(__file__))
8 |
9 | # ------------------------------------------------------------------------------------------------------------------------------
10 | # False, True
11 | human_in_the_loop = False
12 | # ["", "_make_sense"] refer to `run_eval_monitor.sh` ["_no", "_make_sense"]
13 | make_sense = ""
14 | # [machine, top3_perfect, top4_perfect, top5_perfect, human_labelled]
15 | user_name = "machine"
16 | # [0, 1, 2, 3, 5, "n"]
17 | api_number = 0
18 | # [pandas, numpy, monkey, beatnum, torchdata]
19 | library_name = "torchdata"
20 |
21 | if not human_in_the_loop:
22 | if api_number == 0:
23 | HUMAN_EVAL = os.path.join(ROOT, "..", "data", f"real_{library_name}_eval_v3.jsonl.gz")
24 | else:
25 | HUMAN_EVAL = os.path.join(ROOT, "..", "data", f"real_{library_name}_eval_v3_api_{str(api_number)}{make_sense}.jsonl.gz")
26 | else:
27 | HUMAN_EVAL = os.path.join(ROOT, "..", "data", f"real_{library_name}_eval_v3_{user_name}{make_sense}.jsonl.gz")
28 | # ------------------------------------------------------------------------------------------------------------------------------
29 |
30 | print("***"*20)
31 | print("load eval from {}".format(HUMAN_EVAL.split('/')[-1].replace(".jsonl.gz", "")))
32 | print("***"*20)
33 |
34 | def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]:
35 | return {task["task_id"]: task for task in stream_jsonl(evalset_file)}
36 |
37 | def stream_jsonl(filename: str) -> Iterable[Dict]:
38 | """
39 | Parses each jsonl line and yields it as a dictionary
40 | """
41 | if filename.endswith(".gz"):
42 | with open(filename, "rb") as gzfp:
43 | with gzip.open(gzfp, 'rt') as fp:
44 | for line in fp:
45 | if any(not x.isspace() for x in line):
46 | yield json.loads(line)
47 | else:
48 | with open(filename, "r") as fp:
49 | for line in fp:
50 | if any(not x.isspace() for x in line):
51 | try:
52 | yield json.loads(line)
53 |                     except json.JSONDecodeError:
54 |                         ipdb.set_trace()  # drop into the debugger on a malformed line
55 |
56 |
57 | def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
58 | """
59 | Writes an iterable of dictionaries to jsonl
60 | """
61 | if append:
62 | mode = 'ab'
63 | else:
64 | mode = 'wb'
65 | filename = os.path.expanduser(filename)
66 | if filename.endswith(".gz"):
67 | with open(filename, mode) as fp:
68 | with gzip.GzipFile(fileobj=fp, mode='wb') as gzfp:
69 | for x in data:
70 | gzfp.write((json.dumps(x) + "\n").encode('utf-8'))
71 | else:
72 | with open(filename, mode) as fp:
73 | for x in data:
74 | fp.write((json.dumps(x) + "\n").encode('utf-8'))
75 |
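76 | # NOTE (filenames derived from the branches above, for library_name="torchdata"):
77 | #   human_in_the_loop=False, api_number=0 -> real_torchdata_eval_v3.jsonl.gz
78 | #   human_in_the_loop=False, api_number=2 -> real_torchdata_eval_v3_api_2.jsonl.gz
79 | #   human_in_the_loop=True, user_name="human_labelled", make_sense="_make_sense"
80 | #       -> real_torchdata_eval_v3_human_labelled_make_sense.jsonl.gz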
--------------------------------------------------------------------------------
/cert/scripts/ast_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 | """Transform code to sketch"""
4 | from redbaron import RedBaron, NameNode, NodeList, Node
5 | from typing import List, Dict, Tuple, Union, Iterable
6 |
7 | def traverse_node_fst(node_fst):
8 | if isinstance(node_fst, list):
9 | for this_node in node_fst:
10 | traverse_node_fst(this_node)
11 | elif isinstance(node_fst, dict):
12 | if node_fst.get("type") is not None:
13 | this_type = node_fst.get("type")
14 | if node_fst.get("name") is not None:
15 | if this_type == "def":
16 | node_fst["name"] = "func"
17 | elif this_type == "class":
18 | node_fst["name"] = "AnClass"
19 | if node_fst.get("value") is not None:
20 | if this_type == "raw_string":
21 | node_fst["value"] = "rawstring"
22 | elif this_type == "int":
23 | node_fst["value"] = "number"
24 | elif this_type == "interpolated_raw_string":
25 | node_fst["value"] = "interrawstring"
26 | elif this_type == "complex":
27 | node_fst["value"] = "complex" # 1j
28 | elif this_type == "string" and "\"\"\"" not in node_fst["value"] and "\'\'\'" not in node_fst["value"]:
29 | node_fst["value"] = "string"
30 | elif this_type == "float_exponant":
31 | node_fst["value"] = "floatexponant"
32 | elif this_type == "interpolated_string":
33 | node_fst["value"] = "interstring"
34 | elif this_type == "float":
35 | node_fst["value"] = "float"
36 | elif this_type == "binary_string":
37 | node_fst["value"] = "binarystring"
38 | elif this_type == "unicode_string":
39 | node_fst["value"] = "unicodestring"
40 | else:
41 | pass
42 |
43 | for this_key in node_fst:
44 | if isinstance(node_fst[this_key], list) or isinstance(node_fst[this_key], dict):
45 | traverse_node_fst(node_fst[this_key])
46 |
47 | return node_fst
48 |
49 | def transform_code_to_sketch(desp: str):
50 | red = RedBaron(desp)
51 | node_fst = red.fst()
52 | node_fst = traverse_node_fst(node_fst)
53 | code_schema = NodeList.from_fst(node_fst).dumps()
54 | return code_schema
55 |
56 |
57 | def craft_merged_corpus(sketch_list: List = [], text_list: List = [], linker: str = "\n"):
58 | sketch_norm_list = []
59 | for this_sketch, this_text in zip(sketch_list, text_list):
60 |         if this_text.count("import") >= 2 or "__name__" in this_text: # skip the sketch when it would largely duplicate the text (import-heavy or __main__ snippets)
61 | sketch_norm_list.append(this_text)
62 | else:
63 | sketch_norm_list.append(this_sketch+linker+this_text)
64 | return "\n\n\n".join(sketch_norm_list)
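65 | 
66 | 
67 | # NOTE: an assumed output sketch, not in the original file:
68 | #   transform_code_to_sketch("def add(a, b):\n    return a + 1\n")
69 | #   -> "def func(a, b):\n    return a + number\n"
70 | # Function/class names and literal values are anonymized so that structurally
71 | # similar snippets collapse onto the same sketch.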
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/dense/dataset/processor.py:
--------------------------------------------------------------------------------
1 | from transformers import PreTrainedTokenizer
2 |
3 |
4 | class Processor:
5 | def __init__(self, tokenizer: PreTrainedTokenizer):
6 | self.tokenizer = tokenizer
7 |
8 |
9 | class TrainProcessor(Processor):
10 | def __init__(self, tokenizer, query_max_length=32, text_max_length=256):
11 | super().__init__(tokenizer)
12 | self.query_max_length = query_max_length
13 | self.text_max_length = text_max_length
14 |
15 | def __call__(self, example):
16 | query = self.tokenizer.encode(example['query'],
17 | add_special_tokens=False,
18 | max_length=self.query_max_length,
19 | truncation=True)
20 | positives = []
21 | for pos in example['positive_passages']:
22 | text = pos['title'] + " " + pos['text'] if 'title' in pos else pos['text']
23 | positives.append(self.tokenizer.encode(text,
24 | add_special_tokens=False,
25 | max_length=self.text_max_length,
26 | truncation=True))
27 | negatives = []
28 | for neg in example['negative_passages']:
29 | text = neg['title'] + " " + neg['text'] if 'title' in neg else neg['text']
30 | negatives.append(self.tokenizer.encode(text,
31 | add_special_tokens=False,
32 | max_length=self.text_max_length,
33 | truncation=True))
34 | return {'query': query, 'positives': positives, 'negatives': negatives}
35 |
36 |
37 | class TestProcessor(Processor):
38 | def __init__(self, tokenizer, query_max_length=32):
39 | super().__init__(tokenizer)
40 | self.query_max_length = query_max_length
41 |
42 | def __call__(self, example):
43 | query_id = example['query_id']
44 | query = self.tokenizer.encode(example['query'],
45 | add_special_tokens=False,
46 | max_length=self.query_max_length,
47 | truncation=True)
48 | return {'text_id': query_id, 'text': query}
49 |
50 |
51 | class CorpusProcessor(Processor):
52 | def __init__(self, tokenizer, text_max_length=256):
53 | super().__init__(tokenizer)
54 | self.text_max_length = text_max_length
55 |
56 | def __call__(self, example):
57 | docid = example['docid']
58 | text = example['title'] + " " + example['text'] if 'title' in example else example['text']
59 | text = self.tokenizer.encode(text,
60 | add_special_tokens=False,
61 | max_length=self.text_max_length,
62 | truncation=True)
63 | return {'text_id': docid, 'text': text}
64 |
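65 | # NOTE: an illustrative sketch, not in the original file; the tokenizer name
66 | # and example passages are assumptions.
67 | # from transformers import AutoTokenizer
68 | # tok = AutoTokenizer.from_pretrained("bert-base-uncased")
69 | # proc = TrainProcessor(tok, query_max_length=32, text_max_length=256)
70 | # enc = proc({'query': 'sort a dataframe by a column',
71 | #             'positive_passages': [{'title': 'pandas.DataFrame.sort_values',
72 | #                                    'text': 'Sort by the values along either axis.'}],
73 | #             'negative_passages': [{'text': 'Return the transpose.'}]})
74 | # # enc == {'query': [...], 'positives': [[...]], 'negatives': [[...]]}  (token-id lists)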
--------------------------------------------------------------------------------
/apicoder/APIRetriever/build/lib/dense/dataset/processor.py:
--------------------------------------------------------------------------------
1 | from transformers import PreTrainedTokenizer
2 |
3 |
4 | class Processor:
5 | def __init__(self, tokenizer: PreTrainedTokenizer):
6 | self.tokenizer = tokenizer
7 |
8 |
9 | class TrainProcessor(Processor):
10 | def __init__(self, tokenizer, query_max_length=32, text_max_length=256):
11 | super().__init__(tokenizer)
12 | self.query_max_length = query_max_length
13 | self.text_max_length = text_max_length
14 |
15 | def __call__(self, example):
16 | query = self.tokenizer.encode(example['query'],
17 | add_special_tokens=False,
18 | max_length=self.query_max_length,
19 | truncation=True)
20 | positives = []
21 | for pos in example['positive_passages']:
22 | text = pos['title'] + " " + pos['text'] if 'title' in pos else pos['text']
23 | positives.append(self.tokenizer.encode(text,
24 | add_special_tokens=False,
25 | max_length=self.text_max_length,
26 | truncation=True))
27 | negatives = []
28 | for neg in example['negative_passages']:
29 | text = neg['title'] + " " + neg['text'] if 'title' in neg else neg['text']
30 | negatives.append(self.tokenizer.encode(text,
31 | add_special_tokens=False,
32 | max_length=self.text_max_length,
33 | truncation=True))
34 | return {'query': query, 'positives': positives, 'negatives': negatives}
35 |
36 |
37 | class TestProcessor(Processor):
38 | def __init__(self, tokenizer, query_max_length=32):
39 | super().__init__(tokenizer)
40 | self.query_max_length = query_max_length
41 |
42 | def __call__(self, example):
43 | query_id = example['query_id']
44 | query = self.tokenizer.encode(example['query'],
45 | add_special_tokens=False,
46 | max_length=self.query_max_length,
47 | truncation=True)
48 | return {'text_id': query_id, 'text': query}
49 |
50 |
51 | class CorpusProcessor(Processor):
52 | def __init__(self, tokenizer, text_max_length=256):
53 | super().__init__(tokenizer)
54 | self.text_max_length = text_max_length
55 |
56 | def __call__(self, example):
57 | docid = example['docid']
58 | text = example['title'] + " " + example['text'] if 'title' in example else example['text']
59 | text = self.tokenizer.encode(text,
60 | add_special_tokens=False,
61 | max_length=self.text_max_length,
62 | truncation=True)
63 | return {'text_id': docid, 'text': text}
64 |
--------------------------------------------------------------------------------
/apicoder/CodeGenAPI/nl2code/configuration_codegen.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # Modified configuration implementation based on https://github.com/huggingface/transformers/blob/main/src/transformers/models/gptj/configuration_gptj.py
17 |
18 | from transformers.configuration_utils import PretrainedConfig
19 | from transformers.utils import logging
20 |
21 | logger = logging.get_logger(__name__)
22 |
23 |
24 | class CodeGenConfig(PretrainedConfig):
25 | model_type = "codegen"
26 |
27 | def __init__(
28 | self,
29 | vocab_size=50400,
30 | n_positions=2048,
31 | n_ctx=2048,
32 | n_embd=4096,
33 | n_layer=28,
34 | n_head=16,
35 | rotary_dim=64,
36 | n_inner=None,
37 | activation_function="gelu_new",
38 | resid_pdrop=0.0,
39 | embd_pdrop=0.0,
40 | attn_pdrop=0.0,
41 | layer_norm_epsilon=1e-5,
42 | initializer_range=0.02,
43 | scale_attn_weights=True,
44 | gradient_checkpointing=False,
45 | use_cache=True,
46 | bos_token_id=50256,
47 | eos_token_id=50256,
48 | **kwargs
49 | ):
50 | super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
51 |
52 | self.vocab_size = vocab_size
53 | self.n_ctx = n_ctx
54 | self.n_positions = n_positions
55 | self.n_embd = n_embd
56 | self.n_layer = n_layer
57 | self.n_head = n_head
58 | self.n_inner = n_inner
59 | self.rotary_dim = rotary_dim
60 | self.activation_function = activation_function
61 | self.resid_pdrop = resid_pdrop
62 | self.embd_pdrop = embd_pdrop
63 | self.attn_pdrop = attn_pdrop
64 | self.layer_norm_epsilon = layer_norm_epsilon
65 | self.initializer_range = initializer_range
66 | self.gradient_checkpointing = gradient_checkpointing
67 | self.scale_attn_weights = scale_attn_weights
68 | self.use_cache = use_cache
69 |
70 | self.bos_token_id = bos_token_id
71 | self.eos_token_id = eos_token_id
72 |
73 | @property
74 | def max_position_embeddings(self):
75 | return self.n_positions
76 |
77 | @property
78 | def hidden_size(self):
79 | return self.n_embd
80 |
81 | @property
82 | def num_attention_heads(self):
83 | return self.n_head
84 |
85 | @property
86 | def num_hidden_layers(self):
87 | return self.n_layer
88 |
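89 | # NOTE: an illustrative sketch, not in the original file; the defaults above
90 | # describe a large CodeGen variant, and smaller checkpoints override them, e.g.
91 | # config = CodeGenConfig(n_embd=1024, n_layer=20, n_head=16)  # ~350M-scale shape (assumption)
92 | # assert config.hidden_size == 1024 and config.num_hidden_layers == 20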
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/dense/processor/processors.py:
--------------------------------------------------------------------------------
1 | import json
2 | import csv
3 | import datasets
4 | from transformers import PreTrainedTokenizer
5 | from dataclasses import dataclass
6 |
7 |
8 | @dataclass
9 | class SimpleTrainProcessor:
10 | query_file: str
11 | collection_file: str
12 | tokenizer: PreTrainedTokenizer
13 |
14 | max_length: int = 128
15 | columns = ['text_id', 'title', 'text']
16 | title_field = 'title'
17 | text_field = 'text'
18 |
19 | def __post_init__(self):
20 | self.queries = self.read_queries(self.query_file)
21 | self.collection = datasets.load_dataset(
22 | 'csv',
23 | data_files=self.collection_file,
24 | column_names=self.columns,
25 | delimiter='\t',
26 | )['train']
27 |
28 | @staticmethod
29 | def read_queries(queries):
30 | qmap = {}
31 | with open(queries) as f:
32 | for l in f:
33 | qid, qry = l.strip().split('\t')
34 | qmap[qid] = qry
35 | return qmap
36 |
37 | @staticmethod
38 | def read_qrel(relevance_file):
39 | qrel = {}
40 | with open(relevance_file, encoding='utf8') as f:
41 | tsvreader = csv.reader(f, delimiter="\t")
42 | for [topicid, _, docid, rel] in tsvreader:
43 | assert rel == "1"
44 | if topicid in qrel:
45 | qrel[topicid].append(docid)
46 | else:
47 | qrel[topicid] = [docid]
48 | return qrel
49 |
50 | def get_query(self, q):
51 | query_encoded = self.tokenizer.encode(
52 | self.queries[q],
53 | add_special_tokens=False,
54 | max_length=self.max_length,
55 | truncation=True
56 | )
57 | return query_encoded
58 |
59 | def get_passage(self, p):
60 | entry = self.collection[int(p)]
61 | title = entry[self.title_field]
62 | title = "" if title is None else title
63 | body = entry[self.text_field]
64 | content = title + self.tokenizer.sep_token + body
65 |
66 | passage_encoded = self.tokenizer.encode(
67 | content,
68 | add_special_tokens=False,
69 | max_length=self.max_length,
70 | truncation=True
71 | )
72 |
73 | return passage_encoded
74 |
75 | def process_one(self, train):
76 | q, pp, nn = train
77 | train_example = {
78 | 'query': self.get_query(q),
79 | 'positives': [self.get_passage(p) for p in pp],
80 | 'negatives': [self.get_passage(n) for n in nn],
81 | }
82 |
83 | return json.dumps(train_example)
84 |
85 |
86 | @dataclass
87 | class SimpleCollectionProcessor:
88 | tokenizer: PreTrainedTokenizer
89 | separator: str = '\t'
90 | max_length: int = 128
91 |
92 | def process_line(self, line: str):
93 | xx = line.strip().split(self.separator)
94 | text_id, text = xx[0], xx[1:]
95 | text_encoded = self.tokenizer.encode(
96 | self.tokenizer.sep_token.join(text),
97 | add_special_tokens=False,
98 | max_length=self.max_length,
99 | truncation=True
100 | )
101 | encoded = {
102 | 'text_id': text_id,
103 | 'text': text_encoded
104 | }
105 | return json.dumps(encoded)
106 |
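107 | # NOTE: an illustrative sketch, not in the original file; `tok` is any
108 | # pretrained tokenizer with a sep_token, and the line content is an assumption.
109 | # line = "doc0\tpandas.DataFrame.sort_values\tSort by the values along either axis."
110 | # SimpleCollectionProcessor(tok).process_line(line)
111 | # -> '{"text_id": "doc0", "text": [...]}'   (title and body joined by the sep token)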
--------------------------------------------------------------------------------
/apicoder/APIRetriever/build/lib/dense/processor/processors.py:
--------------------------------------------------------------------------------
1 | import json
2 | import csv
3 | import datasets
4 | from transformers import PreTrainedTokenizer
5 | from dataclasses import dataclass
6 |
7 |
8 | @dataclass
9 | class SimpleTrainProcessor:
10 | query_file: str
11 | collection_file: str
12 | tokenizer: PreTrainedTokenizer
13 |
14 | max_length: int = 128
15 | columns = ['text_id', 'title', 'text']
16 | title_field = 'title'
17 | text_field = 'text'
18 |
19 | def __post_init__(self):
20 | self.queries = self.read_queries(self.query_file)
21 | self.collection = datasets.load_dataset(
22 | 'csv',
23 | data_files=self.collection_file,
24 | column_names=self.columns,
25 | delimiter='\t',
26 | )['train']
27 |
28 | @staticmethod
29 | def read_queries(queries):
30 | qmap = {}
31 | with open(queries) as f:
32 | for l in f:
33 | qid, qry = l.strip().split('\t')
34 | qmap[qid] = qry
35 | return qmap
36 |
37 | @staticmethod
38 | def read_qrel(relevance_file):
39 | qrel = {}
40 | with open(relevance_file, encoding='utf8') as f:
41 | tsvreader = csv.reader(f, delimiter="\t")
42 | for [topicid, _, docid, rel] in tsvreader:
43 | assert rel == "1"
44 | if topicid in qrel:
45 | qrel[topicid].append(docid)
46 | else:
47 | qrel[topicid] = [docid]
48 | return qrel
49 |
50 | def get_query(self, q):
51 | query_encoded = self.tokenizer.encode(
52 | self.queries[q],
53 | add_special_tokens=False,
54 | max_length=self.max_length,
55 | truncation=True
56 | )
57 | return query_encoded
58 |
59 | def get_passage(self, p):
60 | entry = self.collection[int(p)]
61 | title = entry[self.title_field]
62 | title = "" if title is None else title
63 | body = entry[self.text_field]
64 | content = title + self.tokenizer.sep_token + body
65 |
66 | passage_encoded = self.tokenizer.encode(
67 | content,
68 | add_special_tokens=False,
69 | max_length=self.max_length,
70 | truncation=True
71 | )
72 |
73 | return passage_encoded
74 |
75 | def process_one(self, train):
76 | q, pp, nn = train
77 | train_example = {
78 | 'query': self.get_query(q),
79 | 'positives': [self.get_passage(p) for p in pp],
80 | 'negatives': [self.get_passage(n) for n in nn],
81 | }
82 |
83 | return json.dumps(train_example)
84 |
85 |
86 | @dataclass
87 | class SimpleCollectionProcessor:
88 | tokenizer: PreTrainedTokenizer
89 | separator: str = '\t'
90 | max_length: int = 128
91 |
92 | def process_line(self, line: str):
93 | xx = line.strip().split(self.separator)
94 | text_id, text = xx[0], xx[1:]
95 | text_encoded = self.tokenizer.encode(
96 | self.tokenizer.sep_token.join(text),
97 | add_special_tokens=False,
98 | max_length=self.max_length,
99 | truncation=True
100 | )
101 | encoded = {
102 | 'text_id': text_id,
103 | 'text': text_encoded
104 | }
105 | return json.dumps(encoded)
106 |
--------------------------------------------------------------------------------
/apicoder/CodeGenAPI/APICoder/get_api_info_by_name.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | #
4 | # @Author: Daoguang Zan, @Mentor: Bei Chen, Jian-Guang Lou
5 | # @Copyright 2022 The Microsoft Research Asia (DKI Group). All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | import json
19 | import os
20 |
21 | def get_api_name_4_api_sign_and_desps(library_name: str, base_dir: str):
22 | """
23 | Given library_name, collect all of the library's API info in the format shown below.
24 | """
25 | # load the library_name's all api info
26 | # base_dir = "/mnt/v-dzan/datasets/CERT/PrivateLibrary/Train"
27 | base_dir = os.path.join(base_dir, "PrivateLibrary", "Train")
28 | library_path = os.path.join(base_dir, library_name, f"{library_name}_apis_doc_details.jsonl")
29 |
30 | library_apis_reader = open(library_path, "r")
31 | api_name_4_api_sign_and_desps = {}
32 | # The api_name_4_api_sign_and_desps format is:
33 | # {
34 | # "api_name": {
35 | # api_path1: [api_sign1, api_desp1],
36 | # api_path2: [api_sign2, api_desp2],
37 | # ...
38 | # }
39 | # ...
40 | # }
41 | for line in library_apis_reader:
42 | api_info = json.loads(line)
43 | # (['api_path', 'api_name', 'api_doc', 'api_signature', 'api_description', 'api_parameters', 'api_parameters_number', 'api_returns', 'api_see_also', 'api_notes', 'api_examples'])
44 | api_path = api_info["api_path"]
45 | api_name = api_info["api_name"]
46 | api_signature = api_info["api_signature"]
47 | api_description = api_info["api_description"]
48 | tmp_api_path_api_info = {api_path: [api_signature, api_description]}
49 | if api_name_4_api_sign_and_desps.get(api_name) is None:
50 | api_name_4_api_sign_and_desps[api_name] = tmp_api_path_api_info
51 | else:
52 | api_name_4_api_sign_and_desps[api_name].update(tmp_api_path_api_info)
53 |
54 | library_apis_reader.close()
55 | return api_name_4_api_sign_and_desps
56 |
57 | def get_all_api_info_prompt_list_by_api_name(api_name_4_api_sign_and_desps, API_NAME):
58 | """
59 | Return a dictionary {api_path: [api_signature, api_description]} for the given API name.
60 | """
61 | import sys
62 | from scripts.get_libs_info_from_code import (
63 | normalizer_api_desp,
64 | get_first_sentence_from_api_desp
65 | )
66 |
67 | result_api_path_info_dict = dict()
68 | for api_name, api_path_info_dict in api_name_4_api_sign_and_desps.items():
69 | if api_name == API_NAME:
70 | for api_path, api_info_list in api_path_info_dict.items():
71 | api_signature, api_description = api_info_list[0], get_first_sentence_from_api_desp(normalizer_api_desp(api_info_list[1]))
72 |
73 | result_api_path_info_dict[api_path] = [api_signature, api_description]
74 | break
75 | return result_api_path_info_dict
76 |
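A sketch of the intended call sequence (the library name, base directory, and API name below are hypothetical placeholders):

```python
# All values are placeholders; real paths follow the training-data layout.
api_index = get_api_name_4_api_sign_and_desps("torchdata", "/path/to/datasets/CERT")
# api_index: {api_name: {api_path: [api_signature, api_description], ...}, ...}

prompts = get_all_api_info_prompt_list_by_api_name(api_index, "Mapper")
for api_path, (signature, first_sentence) in prompts.items():
    print(api_path, signature, first_sentence)
```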
--------------------------------------------------------------------------------
/apicoder/private-eval/data/XXXAPIEval-make sense.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "50it [00:00, 16642.74it/s]0/7 [00:00, ?it/s]\n",
13 | "50it [00:00, 14979.66it/s]1/7 [00:01<00:06, 1.04s/it]\n",
14 | "50it [00:00, 14360.12it/s]2/7 [00:02<00:04, 1.00it/s]\n",
15 | "50it [00:00, 13214.57it/s]3/7 [00:02<00:03, 1.04it/s]\n",
16 | "50it [00:00, 11823.60it/s]4/7 [00:03<00:02, 1.46it/s]\n",
17 | "50it [00:00, 14344.40it/s]5/7 [00:03<00:01, 1.95it/s]\n",
18 | "50it [00:00, 15246.47it/s]6/7 [00:03<00:00, 2.38it/s]\n",
19 | "api_num: 100%|██████████| 7/7 [00:03<00:00, 1.81it/s]\n"
20 | ]
21 | }
22 | ],
23 | "source": [
24 | "import json\n",
25 | "import gzip\n",
26 | "import os\n",
27 | "from tqdm import tqdm\n",
28 | "\n",
29 | "base_dir = \"PrivateLibrary/private-eval/data\"\n",
30 | "\n",
31 | "api_nums = [\"0\", \"1\", \"2\", \"3\", \"5\", \"n\", \"human_labelled\"]\n",
32 | "for api_num in tqdm(api_nums, desc=\"api_num\"):\n",
33 | " if api_num == \"0\":\n",
34 | " raw_monkey_api_eval_path = os.path.join(base_dir, f\"real_torchdata_eval_v3.jsonl.gz\")\n",
35 | " output_monkey_api_eval_path = os.path.join(base_dir, f\"real_torchdata_eval_v3_make_sense.jsonl.gz\")\n",
36 | " elif api_num == \"human_labelled\":\n",
37 | " raw_monkey_api_eval_path = os.path.join(base_dir, f\"real_torchdata_eval_v3_{api_num}.jsonl.gz\")\n",
38 | " output_monkey_api_eval_path = os.path.join(base_dir, f\"real_torchdata_eval_v3_{api_num}_make_sense.jsonl.gz\")\n",
39 | " else:\n",
40 | " raw_monkey_api_eval_path = os.path.join(base_dir, f\"real_torchdata_eval_v3_api_{api_num}.jsonl.gz\")\n",
41 | " output_monkey_api_eval_path = os.path.join(base_dir, f\"real_torchdata_eval_v3_api_{api_num}_make_sense.jsonl.gz\")\n",
42 | "\n",
43 | " raw_monkey_reader = gzip.open(raw_monkey_api_eval_path, \"rb\")\n",
44 | " output_writer = gzip.open(output_monkey_api_eval_path, \"wb\")\n",
45 | "\n",
46 | " for line in tqdm(raw_monkey_reader):\n",
47 | " line_decoded = line.decode(\"utf-8\")\n",
48 | " line_dict = json.loads(line_decoded)\n",
49 | " # print(line_dict.keys())\n",
50 | " task_id = line_dict[\"task_id\"]\n",
51 | " prompt = line_dict[\"prompt\"]\n",
52 | " prompt = prompt.replace(\"[start]\", \"Please use the following APIs to solve the task:\")\n",
53 | " prompt = prompt.replace(\"# [end]\\n\", \"\")\n",
54 | " line_dict[\"prompt\"] = prompt\n",
55 | "\n",
56 | " output_writer.write(json.dumps(line_dict).encode(\"utf-8\") + \"\\n\".encode(\"utf-8\"))\n",
57 | "\n",
58 | " raw_monkey_reader.close()\n",
59 | " output_writer.close()\n"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": []
68 | }
69 | ],
70 | "metadata": {
71 | "interpreter": {
72 | "hash": "6b7426018b15da7c09748c02bfa198e6352f15be2a8ba8d90a42df1562657c8c"
73 | },
74 | "kernelspec": {
75 | "display_name": "Python 3.8.12 ('codex')",
76 | "language": "python",
77 | "name": "python3"
78 | },
79 | "language_info": {
80 | "codemirror_mode": {
81 | "name": "ipython",
82 | "version": 3
83 | },
84 | "file_extension": ".py",
85 | "mimetype": "text/x-python",
86 | "name": "python",
87 | "nbconvert_exporter": "python",
88 | "pygments_lexer": "ipython3",
89 | "version": "3.8.12"
90 | },
91 | "orig_nbformat": 4
92 | },
93 | "nbformat": 4,
94 | "nbformat_minor": 2
95 | }
96 |
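The notebook's core transformation, shown on a single made-up prompt (the API line is an assumption):

```python
# Made-up prompt for illustration; real prompts come from the *.jsonl.gz files.
prompt = (
    "# [start]\n"
    "# torchdata.datapipes.iter.Mapper(source_datapipe, fn)\n"
    "# [end]\n"
    "def solve():\n"
)
prompt = prompt.replace("[start]", "Please use the following APIs to solve the task:")
prompt = prompt.replace("# [end]\n", "")
print(prompt)
# -> "# Please use the following APIs to solve the task:\n# torchdata...\ndef solve():\n"
```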
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/dense/driver/train.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import sys
4 |
5 | from transformers import AutoConfig, AutoTokenizer
6 | from transformers import (
7 | HfArgumentParser,
8 | set_seed,
9 | )
10 | import ipdb
11 |
12 | from dense.arguments import ModelArguments, DataArguments, \
13 | DenseTrainingArguments as TrainingArguments
14 | from dense.data import TrainDataset, QPCollator
15 | from dense.modeling import DenseModel
16 | from dense.trainer import DenseTrainer as Trainer, GCTrainer
17 |
18 | logger = logging.getLogger(__name__)
19 |
20 |
21 | def main():
22 | parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
23 |
24 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
25 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
26 | else:
27 | model_args, data_args, training_args = parser.parse_args_into_dataclasses()
28 | model_args: ModelArguments
29 | data_args: DataArguments
30 | training_args: TrainingArguments
31 |
32 | if (
33 | os.path.exists(training_args.output_dir)
34 | and os.listdir(training_args.output_dir)
35 | and training_args.do_train
36 | and not training_args.overwrite_output_dir
37 | ):
38 | raise ValueError(
39 | f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
40 | )
41 |
42 | # Setup logging
43 | logging.basicConfig(
44 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
45 | datefmt="%m/%d/%Y %H:%M:%S",
46 | level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
47 | )
48 | logger.warning(
49 | "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
50 | training_args.local_rank,
51 | training_args.device,
52 | training_args.n_gpu,
53 | bool(training_args.local_rank != -1),
54 | training_args.fp16,
55 | )
56 | logger.info("Training/evaluation parameters %s", training_args)
57 | logger.info("MODEL parameters %s", model_args)
58 |
59 | set_seed(training_args.seed)
60 |
61 | num_labels = 1
62 | config = AutoConfig.from_pretrained(
63 | model_args.config_name if model_args.config_name else model_args.model_name_or_path,
64 | num_labels=num_labels,
65 | cache_dir=model_args.cache_dir,
66 | )
67 | tokenizer = AutoTokenizer.from_pretrained(
68 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
69 | cache_dir=model_args.cache_dir,
70 | use_fast=False,
71 | )
72 | model = DenseModel.build(
73 | model_args,
74 | data_args,
75 | training_args,
76 | config=config,
77 | cache_dir=model_args.cache_dir,
78 | )
79 |
80 | train_dataset = TrainDataset(
81 | data_args, data_args.train_path, tokenizer,
82 | )
83 |
84 | trainer_cls = GCTrainer if training_args.grad_cache else Trainer
85 | trainer = trainer_cls(
86 | model=model,
87 | args=training_args,
88 | train_dataset=train_dataset,
89 | data_collator=QPCollator(
90 | tokenizer,
91 | max_p_len=data_args.p_max_len,
92 | max_q_len=data_args.q_max_len
93 | ),
94 | )
95 | train_dataset.trainer = trainer
96 |
97 | trainer.train(
98 | model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
99 | )
100 | trainer.save_model()
101 | if trainer.is_world_process_zero():
102 | tokenizer.save_pretrained(training_args.output_dir)
103 |
104 | if __name__ == "__main__":
105 | main()
106 |
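For reference, a minimal sketch of how the three argument groups are parsed (every value below is a placeholder assumption, and the directories are assumed to exist):

```python
from transformers import HfArgumentParser
from dense.arguments import ModelArguments, DataArguments, DenseTrainingArguments

parser = HfArgumentParser((ModelArguments, DataArguments, DenseTrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses(args=[
    "--model_name_or_path", "bert-base-uncased",        # placeholder checkpoint
    "--train_dir", "data/train/processed-train-data",   # picks up *.tsv / *.json
    "--output_dir", "outputs/apiretriever",
    "--do_train",
    "--per_device_train_batch_size", "8",
    "--grad_cache",                                     # selects GCTrainer above
])
```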
--------------------------------------------------------------------------------
/apicoder/APIRetriever/build/lib/dense/driver/train.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import sys
4 |
5 | from transformers import AutoConfig, AutoTokenizer
6 | from transformers import (
7 | HfArgumentParser,
8 | set_seed,
9 | )
10 | import ipdb
11 |
12 | from dense.arguments import ModelArguments, DataArguments, \
13 | DenseTrainingArguments as TrainingArguments
14 | from dense.data import TrainDataset, QPCollator
15 | from dense.modeling import DenseModel
16 | from dense.trainer import DenseTrainer as Trainer, GCTrainer
17 |
18 | logger = logging.getLogger(__name__)
19 |
20 |
21 | def main():
22 | parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
23 |
24 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
25 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
26 | else:
27 | model_args, data_args, training_args = parser.parse_args_into_dataclasses()
28 | model_args: ModelArguments
29 | data_args: DataArguments
30 | training_args: TrainingArguments
31 |
32 | if (
33 | os.path.exists(training_args.output_dir)
34 | and os.listdir(training_args.output_dir)
35 | and training_args.do_train
36 | and not training_args.overwrite_output_dir
37 | ):
38 | raise ValueError(
39 | f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
40 | )
41 |
42 | # Setup logging
43 | logging.basicConfig(
44 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
45 | datefmt="%m/%d/%Y %H:%M:%S",
46 | level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
47 | )
48 | logger.warning(
49 | "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
50 | training_args.local_rank,
51 | training_args.device,
52 | training_args.n_gpu,
53 | bool(training_args.local_rank != -1),
54 | training_args.fp16,
55 | )
56 | logger.info("Training/evaluation parameters %s", training_args)
57 | logger.info("MODEL parameters %s", model_args)
58 |
59 | set_seed(training_args.seed)
60 |
61 | num_labels = 1
62 | config = AutoConfig.from_pretrained(
63 | model_args.config_name if model_args.config_name else model_args.model_name_or_path,
64 | num_labels=num_labels,
65 | cache_dir=model_args.cache_dir,
66 | )
67 | tokenizer = AutoTokenizer.from_pretrained(
68 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
69 | cache_dir=model_args.cache_dir,
70 | use_fast=False,
71 | )
72 | model = DenseModel.build(
73 | model_args,
74 | data_args,
75 | training_args,
76 | config=config,
77 | cache_dir=model_args.cache_dir,
78 | )
79 |
80 | train_dataset = TrainDataset(
81 | data_args, data_args.train_path, tokenizer,
82 | )
83 |
84 | trainer_cls = GCTrainer if training_args.grad_cache else Trainer
85 | trainer = trainer_cls(
86 | model=model,
87 | args=training_args,
88 | train_dataset=train_dataset,
89 | data_collator=QPCollator(
90 | tokenizer,
91 | max_p_len=data_args.p_max_len,
92 | max_q_len=data_args.q_max_len
93 | ),
94 | )
95 | train_dataset.trainer = trainer
96 |
97 | trainer.train(
98 | model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
99 | )
100 | trainer.save_model()
101 | if trainer.is_world_process_zero():
102 | tokenizer.save_pretrained(training_args.output_dir)
103 |
104 | if __name__ == "__main__":
105 | main()
106 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/build/lib/dense/driver/encode.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import sys
4 | from contextlib import nullcontext
5 | from tqdm import tqdm
6 |
7 | import torch
8 |
9 | from torch.utils.data import DataLoader
10 | from transformers import AutoConfig, AutoTokenizer
11 | from transformers import (
12 | HfArgumentParser,
13 | )
14 |
15 | from dense.arguments import ModelArguments, DataArguments, \
16 | DenseTrainingArguments as TrainingArguments
17 | from dense.data import EncodeDataset, EncodeCollator
18 | from dense.modeling import DenseOutput, DenseModelForInference
19 |
20 | logger = logging.getLogger(__name__)
21 |
22 |
23 | def main():
24 | parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
25 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
26 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
27 | else:
28 | model_args, data_args, training_args = parser.parse_args_into_dataclasses()
29 | model_args: ModelArguments
30 | data_args: DataArguments
31 | training_args: TrainingArguments
32 |
33 | if training_args.local_rank > 0 or training_args.n_gpu > 1:
34 | raise NotImplementedError('Multi-GPU encoding is not supported.')
35 |
36 | # Setup logging
37 | logging.basicConfig(
38 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
39 | datefmt="%m/%d/%Y %H:%M:%S",
40 | level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
41 | )
42 |
43 | num_labels = 1
44 | config = AutoConfig.from_pretrained(
45 | model_args.config_name if model_args.config_name else model_args.model_name_or_path,
46 | num_labels=num_labels,
47 | cache_dir=model_args.cache_dir,
48 | )
49 | tokenizer = AutoTokenizer.from_pretrained(
50 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
51 | cache_dir=model_args.cache_dir,
52 | use_fast=False,
53 | )
54 |
55 | model = DenseModelForInference.build(
56 | model_name_or_path=model_args.model_name_or_path,
57 | config=config,
58 | cache_dir=model_args.cache_dir,
59 | )
60 |
61 | text_max_length = data_args.q_max_len if data_args.encode_is_qry else data_args.p_max_len
62 |
63 | encode_dataset = EncodeDataset(data_args.encode_in_path, tokenizer, max_len=text_max_length)
64 | encode_loader = DataLoader(
65 | encode_dataset,
66 | batch_size=training_args.per_device_eval_batch_size,
67 | collate_fn=EncodeCollator(
68 | tokenizer,
69 | max_length=text_max_length,
70 | padding='max_length'
71 | ),
72 | shuffle=False,
73 | drop_last=False,
74 | num_workers=training_args.dataloader_num_workers,
75 | )
76 | encoded = []
77 | lookup_indices = []
78 | model = model.to(training_args.device)
79 | model.eval()
80 |
81 | for (batch_ids, batch) in tqdm(encode_loader):
82 | lookup_indices.extend(batch_ids)
83 | with torch.cuda.amp.autocast() if training_args.fp16 else nullcontext():
84 | with torch.no_grad():
85 | for k, v in batch.items():
86 | batch[k] = v.to(training_args.device)
87 | if data_args.encode_is_qry:
88 | model_output: DenseOutput = model(query=batch)
89 | encoded.append(model_output.q_reps.cpu())
90 | else:
91 | model_output: DenseOutput = model(passage=batch)
92 | encoded.append(model_output.p_reps.cpu())
93 |
94 | encoded = torch.cat(encoded)
95 | torch.save((encoded, lookup_indices), data_args.encoded_save_path)
96 |
97 |
98 | if __name__ == "__main__":
99 | main()
100 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/dense/arguments.py:
--------------------------------------------------------------------------------
1 | import os
2 | from dataclasses import dataclass, field
3 | from typing import Optional, List, Union
4 | from transformers import TrainingArguments
5 |
6 |
7 | @dataclass
8 | class ModelArguments:
9 | model_name_or_path: str = field(
10 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
11 | )
12 | target_model_path: str = field(
13 | default=None,
14 | metadata={"help": "Path to pretrained reranker target model"}
15 | )
16 | config_name: Optional[str] = field(
17 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
18 | )
19 | tokenizer_name: Optional[str] = field(
20 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
21 | )
22 | cache_dir: Optional[str] = field(
23 | default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
24 | )
25 |
26 | # modeling
27 | untie_encoder: bool = field(
28 | default=False,
29 | metadata={"help": "no weight sharing between qry passage encoders"}
30 | )
31 |
32 | # out projection
33 | add_pooler: bool = field(default=False)
34 | projection_in_dim: int = field(default=768)
35 | projection_out_dim: int = field(default=768)
36 |
37 |
38 | @dataclass
39 | class DataArguments:
40 | train_dir: str = field(
41 | default=None, metadata={"help": "Path to train directory"}
42 | )
43 | dataset_name: str = field(
44 | default=None, metadata={"help": "huggingface dataset name"}
45 | )
46 | dataset_proc_num: int = field(
47 | default=12, metadata={"help": "number of proc used in dataset preprocess"}
48 | )
49 | train_n_passages: int = field(default=8)
50 |
51 | encode_in_path: List[str] = field(default=None, metadata={"help": "Path to data to encode"})
52 | encoded_save_path: str = field(default=None, metadata={"help": "where to save the encode"})
53 | encode_is_qry: bool = field(default=False)
54 | encode_num_shard: int = field(default=1)
55 | encode_shard_index: int = field(default=0)
56 |
57 | q_max_len: int = field(
58 | default=32,
59 | metadata={
60 | "help": "The maximum total input sequence length after tokenization for query. Sequences longer "
61 | "than this will be truncated, sequences shorter will be padded."
62 | },
63 | )
64 | p_max_len: int = field(
65 | default=128,
66 | metadata={
67 | "help": "The maximum total input sequence length after tokenization for passage. Sequences longer "
68 | "than this will be truncated, sequences shorter will be padded."
69 | },
70 | )
71 |
72 | def __post_init__(self):
73 | if self.dataset_name is not None:
74 | info = self.dataset_name.split('/')
75 | self.dataset_split = info[-1] if len(info) == 3 else 'train'
76 | self.dataset_name = "/".join(info[:-1]) if len(info) == 3 else '/'.join(info)
77 | if self.train_dir is not None:
78 | files = os.listdir(self.train_dir)
79 | self.train_path = [
80 | os.path.join(self.train_dir, f)
81 | for f in files
82 | if f.endswith('tsv') or f.endswith('json')
83 | ]
84 |
85 |
86 | @dataclass
87 | class DenseTrainingArguments(TrainingArguments):
88 | warmup_ratio: float = field(default=0.1)
89 | negatives_x_device: bool = field(default=False, metadata={"help": "share negatives across devices"})
90 | do_encode: bool = field(default=False, metadata={"help": "run the encoding loop"})
91 |
92 | grad_cache: bool = field(default=False, metadata={"help": "Use gradient cache update"})
93 | gc_q_chunk_size: int = field(default=4)
94 | gc_p_chunk_size: int = field(default=32)
95 |
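A small check of how `DataArguments.__post_init__` interprets `dataset_name` (the dataset ids are made-up examples):

```python
args = DataArguments(dataset_name="org/dataset/train")  # three parts: explicit split
assert args.dataset_name == "org/dataset"
assert args.dataset_split == "train"

args = DataArguments(dataset_name="org/dataset")        # two parts: split defaults
assert args.dataset_name == "org/dataset"
assert args.dataset_split == "train"
```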
--------------------------------------------------------------------------------
/apicoder/APIRetriever/build/lib/dense/arguments.py:
--------------------------------------------------------------------------------
1 | import os
2 | from dataclasses import dataclass, field
3 | from typing import Optional, List, Union
4 | from transformers import TrainingArguments
5 |
6 |
7 | @dataclass
8 | class ModelArguments:
9 | model_name_or_path: str = field(
10 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
11 | )
12 | target_model_path: str = field(
13 | default=None,
14 | metadata={"help": "Path to pretrained reranker target model"}
15 | )
16 | config_name: Optional[str] = field(
17 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
18 | )
19 | tokenizer_name: Optional[str] = field(
20 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
21 | )
22 | cache_dir: Optional[str] = field(
23 | default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
24 | )
25 |
26 | # modeling
27 | untie_encoder: bool = field(
28 | default=False,
29 | metadata={"help": "no weight sharing between qry passage encoders"}
30 | )
31 |
32 | # out projection
33 | add_pooler: bool = field(default=False)
34 | projection_in_dim: int = field(default=768)
35 | projection_out_dim: int = field(default=768)
36 |
37 |
38 | @dataclass
39 | class DataArguments:
40 | train_dir: str = field(
41 | default=None, metadata={"help": "Path to train directory"}
42 | )
43 | dataset_name: str = field(
44 | default=None, metadata={"help": "huggingface dataset name"}
45 | )
46 | dataset_proc_num: int = field(
47 | default=12, metadata={"help": "number of proc used in dataset preprocess"}
48 | )
49 | train_n_passages: int = field(default=8)
50 |
51 | encode_in_path: List[str] = field(default=None, metadata={"help": "Path to data to encode"})
52 | encoded_save_path: str = field(default=None, metadata={"help": "where to save the encode"})
53 | encode_is_qry: bool = field(default=False)
54 | encode_num_shard: int = field(default=1)
55 | encode_shard_index: int = field(default=0)
56 |
57 | q_max_len: int = field(
58 | default=32,
59 | metadata={
60 | "help": "The maximum total input sequence length after tokenization for query. Sequences longer "
61 | "than this will be truncated, sequences shorter will be padded."
62 | },
63 | )
64 | p_max_len: int = field(
65 | default=128,
66 | metadata={
67 | "help": "The maximum total input sequence length after tokenization for passage. Sequences longer "
68 | "than this will be truncated, sequences shorter will be padded."
69 | },
70 | )
71 |
72 | def __post_init__(self):
73 | if self.dataset_name is not None:
74 | info = self.dataset_name.split('/')
75 | self.dataset_split = info[-1] if len(info) == 3 else 'train'
76 | self.dataset_name = "/".join(info[:-1]) if len(info) == 3 else '/'.join(info)
77 | if self.train_dir is not None:
78 | files = os.listdir(self.train_dir)
79 | self.train_path = [
80 | os.path.join(self.train_dir, f)
81 | for f in files
82 | if f.endswith('tsv') or f.endswith('json')
83 | ]
84 |
85 |
86 | @dataclass
87 | class DenseTrainingArguments(TrainingArguments):
88 | warmup_ratio: float = field(default=0.1)
89 | negatives_x_device: bool = field(default=False, metadata={"help": "share negatives across devices"})
90 | do_encode: bool = field(default=False, metadata={"help": "run the encoding loop"})
91 |
92 | grad_cache: bool = field(default=False, metadata={"help": "Use gradient cache update"})
93 | gc_q_chunk_size: int = field(default=4)
94 | gc_p_chunk_size: int = field(default=32)
95 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/dense/driver/encode.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import sys
4 | from contextlib import nullcontext
5 | from tqdm import tqdm
6 | import ipdb
7 |
8 | import torch
9 |
10 | from torch.utils.data import DataLoader
11 | from transformers import AutoConfig, AutoTokenizer
12 | from transformers import (
13 | HfArgumentParser,
14 | )
15 |
16 | from dense.arguments import ModelArguments, DataArguments, \
17 | DenseTrainingArguments as TrainingArguments
18 | from dense.data import EncodeDataset, EncodeCollator
19 | from dense.modeling import DenseOutput, DenseModelForInference
20 |
21 | logger = logging.getLogger(__name__)
22 |
23 |
24 | def main():
25 | parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
26 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
27 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
28 | else:
29 | model_args, data_args, training_args = parser.parse_args_into_dataclasses()
30 | model_args: ModelArguments
31 | data_args: DataArguments
32 | training_args: TrainingArguments
33 |
34 | if training_args.local_rank > 0 or training_args.n_gpu > 1:
35 | raise NotImplementedError('Multi-GPU encoding is not supported.')
36 |
37 | # Setup logging
38 | logging.basicConfig(
39 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
40 | datefmt="%m/%d/%Y %H:%M:%S",
41 | level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
42 | )
43 |
44 | num_labels = 1
45 | config = AutoConfig.from_pretrained(
46 | model_args.config_name if model_args.config_name else model_args.model_name_or_path,
47 | num_labels=num_labels,
48 | cache_dir=model_args.cache_dir,
49 | )
50 | tokenizer = AutoTokenizer.from_pretrained(
51 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
52 | cache_dir=model_args.cache_dir,
53 | use_fast=False,
54 | )
55 |
56 | model = DenseModelForInference.build(
57 | model_name_or_path=model_args.model_name_or_path,
58 | config=config,
59 | cache_dir=model_args.cache_dir,
60 | )
61 |
62 | text_max_length = data_args.q_max_len if data_args.encode_is_qry else data_args.p_max_len
63 |
64 | encode_dataset = EncodeDataset(data_args.encode_in_path, tokenizer, max_len=text_max_length)
65 | encode_loader = DataLoader(
66 | encode_dataset,
67 | batch_size=training_args.per_device_eval_batch_size,
68 | collate_fn=EncodeCollator(
69 | tokenizer,
70 | max_length=text_max_length,
71 | padding='max_length'
72 | ),
73 | shuffle=False,
74 | drop_last=False,
75 | num_workers=training_args.dataloader_num_workers,
76 | )
77 | encoded = []
78 | lookup_indices = []
79 | model = model.to(training_args.device)
80 | model.eval()
81 | for (batch_ids, batch) in tqdm(encode_loader):
82 | lookup_indices.extend(batch_ids)
83 | with torch.cuda.amp.autocast() if training_args.fp16 else nullcontext():
84 | with torch.no_grad():
85 | for k, v in batch.items():
86 | batch[k] = v.to(training_args.device)
87 | if data_args.encode_is_qry:
88 | model_output: DenseOutput = model(query=batch)
89 | encoded.append(model_output.q_reps.cpu())
90 | else:
91 | model_output: DenseOutput = model(passage=batch)
92 | encoded.append(model_output.p_reps.cpu())
93 |
94 | encoded = torch.cat(encoded)
95 | torch.save((encoded, lookup_indices), data_args.encoded_save_path)
96 |
97 |
98 | if __name__ == "__main__":
99 | main()
100 |
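The script saves an `(embeddings, ids)` tuple; a sketch of reading it back (the file name follows the `data/inference` naming scheme but is an assumption):

```python
import torch

reps, lookup_indices = torch.load("data/inference/pandas_api.pt")
print(reps.shape)          # (num_texts, hidden_dim): one row per encoded text
print(lookup_indices[:3])  # text ids aligned row-by-row with reps
```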
--------------------------------------------------------------------------------
/apicoder/APIRetriever/README.md:
--------------------------------------------------------------------------------
1 | # What is it?
2 | APIRetriever is a dense retrieval system that finds the APIs likely to be used for a given programming problem. We use a toolkit named [Dense](https://github.com/luyug/Dense) to implement our APIRetriever.
3 |
4 | ---
5 |
6 | ## Installation
7 | Our dependencies are as follows:
8 | ```
9 | pytorch==1.8.0
10 | faiss-cpu==1.6.5
11 | transformers==4.2.0
12 | datasets==1.1.3
13 | wandb==0.13.3
14 | ```
15 | Run the following pip commands to install the above dependencies automatically.
16 | ```
17 | cd Your/Own/Path/.../PrivateLibrary/APIRetriever
18 | pip install .
19 | ```
20 | In addition, if you would like to use mixed-precision (FP16) training to speed things up, you need to install the apex library.
21 | ```
22 | git clone https://github.com/NVIDIA/apex
23 | cd apex
24 | pip install -v --no-cache-dir ./
25 | ```
26 |
27 | ## Project Directory
28 | ```shell
29 | ├── apex
30 | ├── data
31 | │ ├── inference # The test data for five libraries. The below `XXX` can be `pandas`, `numpy`, `monkey`, `beatnum`, and `torchdata`.
32 | │ │ ├── XXX_api.json # API and its id.
33 | │ │ ├── XXX_api.pt # API embeddings encoded by our APIRetriever.
34 | │ │ ├── XXX_comment.json # Code comment and its id.
35 | │ │ ├── XXX_comment.pt # Code comment embeddings encoded by our APIRetriever.
36 | │ │ ├── XXX_id_score.trec # Comment-API relevance scores in an easy-to-read TREC format.
37 | │ │ ├── XXX_id_score.txt # Comment-API relevance scores in a raw, harder-to-read format.
38 | │ └── train
39 | │ ├── processed-train-data
40 | │ └── unprocessed-train-data
41 | ├── outputs
42 | ├── requirements.txt
43 | ├── scripts
44 | │ ├── extract_retrieval_api_corpus.py
45 | │ ├── run_extract_apiretriever_corpus.sh
46 | │ ├── run_prepare_test_private_code.py
47 | │ └── run_prepare_train_private_code.py
48 | ├── setup.py
49 | └── src
50 | ├── dense
51 | ├── run_train_1.sh
52 | ├── run_encode_2.sh
53 | ├── run_search_3.sh
54 | └── run_trec_format_4.sh
55 | ```
56 |
57 | ## Training
58 |
59 | First, you need to process the crawled Python files into comment-API pairs.
60 | ```shell
61 | bash APIRetriever/scripts/run_extract_apiretriever_corpus.sh
62 | ```
63 | Then, you should convert these data pairs into a format suitable for training our APIRetriever.
64 | ```shell
65 | python APIRetriever/scripts/run_prepare_train_private_code.py
66 | ```
67 | After preparing the training corpus, you can start training your own APIRetriever.
68 | ```shell
69 | bash APIRetriever/src/run_train_1.sh
70 | ```
71 |
72 |
73 |
74 | ## Inference
75 | After the training phase, we can use APIRetriever to retrieve private APIs for each programming problem description. In detail, we apply $E_{\mathbf{a}}$ to all the APIs and index them with [FAISS](https://github.com/facebookresearch/faiss) offline. Given a new programming problem description $\mathbf{p}$ at run time, we only need to produce its embedding $v_{\mathbf{p}}=E_{\mathbf{p}}(\mathbf{p})$ and recall the top-$k$ APIs whose embeddings are closest to $v_{\mathbf{p}}$, as sketched below.
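A minimal sketch of this index-then-search step (the file names follow the `data/inference` layout above; the exact paths and $k$ are assumptions):

```python
import faiss
import torch

# Offline: index all API embeddings produced by the encoding step.
p_reps, p_ids = torch.load("data/inference/pandas_api.pt")
index = faiss.IndexFlatIP(p_reps.shape[1])              # inner-product similarity
index.add(p_reps.numpy().astype("float32"))

# Online: embed problem descriptions and recall the top-k closest APIs.
q_reps, q_ids = torch.load("data/inference/pandas_comment.pt")
scores, topk = index.search(q_reps.numpy().astype("float32"), 5)
print(q_ids[0], [p_ids[i] for i in topk[0]])            # top-5 APIs for one comment
```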
76 |
77 | First, you should encode the code comments and APIs.
78 | ```shell
79 | bash APIRetriever/src/run_encode_2.sh
80 | ```
81 | Then, you need to retrieve and rank the APIs for each code comment.
82 | ```shell
83 | bash APIRetriever/src/run_search_3.sh
84 | ```
85 | Next, you can compute the final scores between each code comment and its candidate APIs.
86 | ```shell
87 | bash APIRetriever/src/run_trec_format_4.sh
88 | ```
89 |
90 | > The retrieval results are placed in `APIRetriever/data/inference`. They can also be used to prepend API prompts (top-1, 2, 3, 5, and human-labelled) to our crafted benchmarks.
91 |
92 | ## Citation
93 | If you find our work useful, please cite the paper:
94 | ```
95 | @inproceedings{APICoder,
96 | title={When Language Model Meets Private Library},
97 | author={Zan, Daoguang and Chen, Bei and Lin, Zeqi and Guan, Bei and Wang, Yongji and Lou, Jian-Guang},
98 | booktitle={EMNLP findings},
99 | year={2022}
100 | }
101 | ```
102 |
--------------------------------------------------------------------------------
/cert/pandas-numpy-eval/pandas_numpy_eval/evaluation.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict, Counter
2 | from concurrent.futures import ThreadPoolExecutor, as_completed
3 | from typing import List, Union, Iterable, Dict
4 | import itertools
5 |
6 | import numpy as np
7 | import tqdm
8 |
9 | from pandas_numpy_eval.data import HUMAN_EVAL, read_problems, stream_jsonl, write_jsonl
10 | from pandas_numpy_eval.execution import check_correctness
11 |
12 |
13 | def estimate_pass_at_k(
14 | num_samples: Union[int, List[int], np.ndarray],
15 | num_correct: Union[List[int], np.ndarray],
16 | k: int
17 | ) -> np.ndarray:
18 | """
19 | Estimates pass@k of each problem and returns them in an array.
20 | """
21 |
22 | def estimator(n: int, c: int, k: int) -> float:
23 | """
24 | Calculates 1 - comb(n - c, k) / comb(n, k).
25 | """
26 | if n - c < k:
27 | return 1.0
28 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
29 |
30 | if isinstance(num_samples, int):
31 | num_samples_it = itertools.repeat(num_samples, len(num_correct))
32 | else:
33 | assert len(num_samples) == len(num_correct)
34 | num_samples_it = iter(num_samples)
35 |
36 | return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
37 |
38 |
39 | def evaluate_functional_correctness(
40 | sample_file: str,
41 | k: List[int] = [1, 10, 100],
42 | n_workers: int = 4,
43 | timeout: float = 3.0,
44 | problem_file: str = HUMAN_EVAL,
45 | ):
46 | """
47 | Evaluates the functional correctness of generated samples, and writes
48 | results to f"{sample_file}_results.jsonl.gz"
49 | """
50 |
51 | problems = read_problems(problem_file)
52 |
53 | # Check the generated samples against test suites.
54 | with ThreadPoolExecutor(max_workers=n_workers) as executor:
55 |
56 | futures = []
57 | completion_id = Counter()
58 | n_samples = 0
59 | results = defaultdict(list)
60 |
61 | print("Reading samples...")
62 | for sample in tqdm.tqdm(stream_jsonl(sample_file)):
63 | task_id = sample["task_id"]
64 | completion = sample["completion"]
65 | args = (problems[task_id], completion, timeout, completion_id[task_id])
66 | future = executor.submit(check_correctness, *args)
67 | futures.append(future)
68 | completion_id[task_id] += 1
69 | n_samples += 1
70 |
71 | assert len(completion_id) == len(problems), "Some problems are not attempted."
72 |
73 | print("Running test suites...")
74 | for future in tqdm.tqdm(as_completed(futures), total=len(futures)):
75 | result = future.result()
76 | results[result["task_id"]].append((result["completion_id"], result))
77 |
78 | # Calculate pass@k.
79 | total, correct = [], []
80 | for result in results.values():
81 | result.sort()
82 | passed = [r[1]["passed"] for r in result]
83 | total.append(len(passed))
84 | correct.append(sum(passed))
85 | total = np.array(total)
86 | correct = np.array(correct)
87 |
88 | ks = k
89 | pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean()
90 | for k in ks if (total >= k).all()}
91 |
92 | # Finally, save the results in one file:
93 | def combine_results():
94 | for sample in stream_jsonl(sample_file):
95 | task_id = sample["task_id"]
96 | result = results[task_id].pop(0)
97 | sample["result"] = result[1]["result"]
98 | sample["passed"] = result[1]["passed"]
99 | yield sample
100 |
101 | out_file = sample_file + "_results.jsonl"
102 | print(f"Writing results to {out_file}...")
103 | write_jsonl(out_file, tqdm.tqdm(combine_results(), total=n_samples))
104 |
105 | def return_pass_at_k():
106 | yield pass_at_k
107 | metric_file = sample_file + "_metrics.jsonl"
108 | print(f"Writing metrics to {metric_file}...")
109 | write_jsonl(metric_file, return_pass_at_k())
110 |
111 | return pass_at_k
112 |
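A worked numeric check of the estimator (the numbers are a made-up example): with n = 5 samples, c = 2 correct, and k = 2, pass@2 = 1 - C(3, 2) / C(5, 2) = 1 - 3/10 = 0.7.

```python
# n = 5, c = 2, k = 2  ->  1 - C(3, 2) / C(5, 2) = 0.7
print(estimate_pass_at_k([5], [2], 2))  # -> array([0.7])
```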
--------------------------------------------------------------------------------
/apicoder/private-eval/private_eval/evaluation.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict, Counter
2 | from concurrent.futures import ThreadPoolExecutor, as_completed
3 | from typing import List, Union, Iterable, Dict
4 | import itertools
5 |
6 | import numpy as np
7 | import tqdm
8 |
9 | from private_eval.data import HUMAN_EVAL, read_problems, stream_jsonl, write_jsonl
10 | from private_eval.execution import check_correctness
11 |
12 |
13 | def estimate_pass_at_k(
14 | num_samples: Union[int, List[int], np.ndarray],
15 | num_correct: Union[List[int], np.ndarray],
16 | k: int
17 | ) -> np.ndarray:
18 | """
19 | Estimates pass@k of each problem and returns them in an array.
20 | """
21 |
22 | def estimator(n: int, c: int, k: int) -> float:
23 | """
24 | Calculates 1 - comb(n - c, k) / comb(n, k).
25 | """
26 | if n - c < k:
27 | return 1.0
28 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
29 |
30 | if isinstance(num_samples, int):
31 | num_samples_it = itertools.repeat(num_samples, len(num_correct))
32 | else:
33 | assert len(num_samples) == len(num_correct)
34 | num_samples_it = iter(num_samples)
35 |
36 | return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
37 |
38 |
39 | def evaluate_functional_correctness(
40 | sample_file: str,
41 | k: List[int] = [1, 10, 100],
42 | n_workers: int = 4,
43 | timeout: float = 3.0,
44 | problem_file: str = HUMAN_EVAL,
45 | ):
46 | """
47 | Evaluates the functional correctness of generated samples, and writes
48 | results to f"{sample_file}_results.jsonl.gz"
49 | """
50 |
51 | problems = read_problems(problem_file)
52 |
53 | # Check the generated samples against test suites.
54 | with ThreadPoolExecutor(max_workers=n_workers) as executor:
55 |
56 | futures = []
57 | completion_id = Counter()
58 | n_samples = 0
59 | results = defaultdict(list)
60 |
61 | print("Reading samples...")
62 | for sample in tqdm.tqdm(stream_jsonl(sample_file)):
63 | task_id = sample["task_id"]
64 | completion = sample["completion"]
65 | args = (problems[task_id], completion, timeout, completion_id[task_id])
66 | future = executor.submit(check_correctness, *args)
67 | futures.append(future)
68 | completion_id[task_id] += 1
69 | n_samples += 1
70 |
71 | assert len(completion_id) == len(problems), "Some problems are not attempted."
72 |
73 | print("Running test suites...")
74 | for future in tqdm.tqdm(as_completed(futures), total=len(futures)):
75 | result = future.result()
76 | results[result["task_id"]].append((result["completion_id"], result))
77 |
78 | print("Starting calculate pass@k...")
79 | # Calculate pass@k.
80 | total, correct = [], []
81 | for result in results.values():
82 | result.sort()
83 | passed = [r[1]["passed"] for r in result]
84 | total.append(len(passed))
85 | correct.append(sum(passed))
86 | total = np.array(total)
87 | correct = np.array(correct)
88 |
89 | ks = k
90 | pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean()
91 | for k in ks if (total >= k).all()}
92 | print("Pass@k:", pass_at_k)
93 |
94 | # Finally, save the results in one file:
95 | def combine_results():
96 | for sample in stream_jsonl(sample_file):
97 | task_id = sample["task_id"]
98 | result = results[task_id].pop(0)
99 | sample["result"] = result[1]["result"]
100 | sample["passed"] = result[1]["passed"]
101 | yield sample
102 |
103 | out_file = sample_file + "_results.jsonl"
104 | print(f"Writing results to {out_file}...")
105 | write_jsonl(out_file, tqdm.tqdm(combine_results(), total=n_samples))
106 |
107 | def return_pass_at_k():
108 | yield pass_at_k
109 | metric_file = sample_file + "_metrics.jsonl"
110 | print(f"Writing metrics to {metric_file}...")
111 | write_jsonl(metric_file, return_pass_at_k())
112 |
113 | return pass_at_k
114 |
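A hedged usage sketch (the sample file path is an assumption). The input must be JSONL with `task_id` and `completion` fields; pass@k is reported only for each k with at least k samples per problem:

```python
pass_at_k = evaluate_functional_correctness(
    "outputs/real_pandas_eval_samples.jsonl",  # hypothetical path
    k=[1, 10, 100],
    n_workers=4,
    timeout=3.0,
)
print(pass_at_k)  # {'pass@k': mean over problems} for each reportable k
```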
--------------------------------------------------------------------------------
/apicoder/APIRetriever/src/dense/trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | from itertools import repeat
3 | from typing import Dict, List, Tuple, Optional, Any, Union
4 |
5 | from transformers.trainer import Trainer
6 |
7 | import torch
8 | from torch.utils.data import DataLoader
9 | import torch.distributed as dist
10 |
11 | from .loss import SimpleContrastiveLoss, DistributedContrastiveLoss
12 |
13 | import logging
14 | logger = logging.getLogger(__name__)
15 |
16 | try:
17 | from grad_cache import GradCache
18 | _grad_cache_available = True
19 | except ModuleNotFoundError:
20 | _grad_cache_available = False
21 |
22 |
23 | class DenseTrainer(Trainer):
24 | def __init__(self, *args, **kwargs):
25 | super(DenseTrainer, self).__init__(*args, **kwargs)
26 | self._dist_loss_scale_factor = dist.get_world_size() if self.args.negatives_x_device else 1
27 |
28 | def _save(self, output_dir: Optional[str] = None):
29 | output_dir = output_dir if output_dir is not None else self.args.output_dir
30 | os.makedirs(output_dir, exist_ok=True)
31 | logger.info("Saving model checkpoint to %s", output_dir)
32 | self.model.save(output_dir)
33 |
34 | def _prepare_inputs(
35 | self,
36 | inputs: Tuple[Dict[str, Union[torch.Tensor, Any]], ...]
37 | ) -> List[Dict[str, Union[torch.Tensor, Any]]]:
38 | prepared = []
39 | for x in inputs:
40 | if isinstance(x, torch.Tensor):
41 | prepared.append(x.to(self.args.device))
42 | else:
43 | prepared.append(super()._prepare_inputs(x))
44 | return prepared
45 |
46 | def get_train_dataloader(self) -> DataLoader:
47 | if self.train_dataset is None:
48 | raise ValueError("Trainer: training requires a train_dataset.")
49 | train_sampler = self._get_train_sampler()
50 |
51 | return DataLoader(
52 | self.train_dataset,
53 | batch_size=self.args.train_batch_size,
54 | sampler=train_sampler,
55 | collate_fn=self.data_collator,
56 | drop_last=True,
57 | num_workers=self.args.dataloader_num_workers,
58 | )
59 |
60 | def compute_loss(self, model, inputs):
61 | query, passage = inputs
62 | return model(query=query, passage=passage).loss
63 |
64 | def training_step(self, *args):
65 | return super(DenseTrainer, self).training_step(*args) / self._dist_loss_scale_factor
66 |
67 |
68 | def split_dense_inputs(model_input: dict, chunk_size: int):
69 | assert len(model_input) == 1
70 | arg_key = list(model_input.keys())[0]
71 | arg_val = model_input[arg_key]
72 |
73 | keys = list(arg_val.keys())
74 | chunked_tensors = [arg_val[k].split(chunk_size, dim=0) for k in keys]
75 | chunked_arg_val = [dict(zip(kk, tt)) for kk, tt in zip(repeat(keys), zip(*chunked_tensors))]
76 |
77 | return [{arg_key: c} for c in chunked_arg_val]
78 |
79 |
80 | def get_dense_rep(x):
81 | if x.q_reps is None:
82 | return x.p_reps
83 | else:
84 | return x.q_reps
85 |
86 |
87 | class GCTrainer(DenseTrainer):
88 | def __init__(self, *args, **kwargs):
89 | logger.info('Initializing Gradient Cache Trainer')
90 | if not _grad_cache_available:
91 | raise ValueError(
92 | 'Grad Cache package not available. You can obtain it from https://github.com/luyug/GradCache.')
93 | super(GCTrainer, self).__init__(*args, **kwargs)
94 |
95 | loss_fn_cls = DistributedContrastiveLoss if self.args.negatives_x_device else SimpleContrastiveLoss
96 | loss_fn = loss_fn_cls(self.model.data_args.train_n_passages)
97 |
98 | self.gc = GradCache(
99 | models=[self.model, self.model],
100 | chunk_sizes=[self.args.gc_q_chunk_size, self.args.gc_p_chunk_size],
101 | loss_fn=loss_fn,
102 | split_input_fn=split_dense_inputs,
103 | get_rep_fn=get_dense_rep,
104 | fp16=self.args.fp16,
105 | scaler=self.scaler
106 | )
107 |
108 | def training_step(self, model, inputs) -> torch.Tensor:
109 | model.train()
110 | queries, passages = self._prepare_inputs(inputs)
111 | queries, passages = {'query': queries}, {'passage': passages}
112 |
113 | _distributed = self.args.local_rank > -1
114 | self.gc.models = [model, model]
115 | loss = self.gc(queries, passages, no_sync_except_last=_distributed)
116 |
117 | return loss / self._dist_loss_scale_factor
118 |
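A quick sketch of what `split_dense_inputs` does to a batched input (the shapes are made up):

```python
import torch

batch = {"query": {
    "input_ids": torch.zeros(8, 32, dtype=torch.long),
    "attention_mask": torch.ones(8, 32, dtype=torch.long),
}}
chunks = split_dense_inputs(batch, chunk_size=4)
print(len(chunks))                            # 2 sub-batches
print(chunks[0]["query"]["input_ids"].shape)  # torch.Size([4, 32])
```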
--------------------------------------------------------------------------------
/apicoder/APIRetriever/build/lib/dense/trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | from itertools import repeat
3 | from typing import Dict, List, Tuple, Optional, Any, Union
4 |
5 | from transformers.trainer import Trainer
6 |
7 | import torch
8 | from torch.utils.data import DataLoader
9 | import torch.distributed as dist
10 |
11 | from .loss import SimpleContrastiveLoss, DistributedContrastiveLoss
12 |
13 | import logging
14 | logger = logging.getLogger(__name__)
15 |
16 | try:
17 | from grad_cache import GradCache
18 | _grad_cache_available = True
19 | except ModuleNotFoundError:
20 | _grad_cache_available = False
21 |
22 |
23 | class DenseTrainer(Trainer):
24 | def __init__(self, *args, **kwargs):
25 | super(DenseTrainer, self).__init__(*args, **kwargs)
26 | self._dist_loss_scale_factor = dist.get_world_size() if self.args.negatives_x_device else 1
27 |
28 | def _save(self, output_dir: Optional[str] = None):
29 | output_dir = output_dir if output_dir is not None else self.args.output_dir
30 | os.makedirs(output_dir, exist_ok=True)
31 | logger.info("Saving model checkpoint to %s", output_dir)
32 | self.model.save(output_dir)
33 |
34 | def _prepare_inputs(
35 | self,
36 | inputs: Tuple[Dict[str, Union[torch.Tensor, Any]], ...]
37 | ) -> List[Dict[str, Union[torch.Tensor, Any]]]:
38 | prepared = []
39 | for x in inputs:
40 | if isinstance(x, torch.Tensor):
41 | prepared.append(x.to(self.args.device))
42 | else:
43 | prepared.append(super()._prepare_inputs(x))
44 | return prepared
45 |
46 | def get_train_dataloader(self) -> DataLoader:
47 | if self.train_dataset is None:
48 | raise ValueError("Trainer: training requires a train_dataset.")
49 | train_sampler = self._get_train_sampler()
50 |
51 | return DataLoader(
52 | self.train_dataset,
53 | batch_size=self.args.train_batch_size,
54 | sampler=train_sampler,
55 | collate_fn=self.data_collator,
56 | drop_last=True,
57 | num_workers=self.args.dataloader_num_workers,
58 | )
59 |
60 | def compute_loss(self, model, inputs):
61 | query, passage = inputs
62 | return model(query=query, passage=passage).loss
63 |
64 | def training_step(self, *args):
65 | return super(DenseTrainer, self).training_step(*args) / self._dist_loss_scale_factor
66 |
67 |
68 | def split_dense_inputs(model_input: dict, chunk_size: int):
69 | assert len(model_input) == 1
70 | arg_key = list(model_input.keys())[0]
71 | arg_val = model_input[arg_key]
72 |
73 | keys = list(arg_val.keys())
74 | chunked_tensors = [arg_val[k].split(chunk_size, dim=0) for k in keys]
75 | chunked_arg_val = [dict(zip(kk, tt)) for kk, tt in zip(repeat(keys), zip(*chunked_tensors))]
76 |
77 | return [{arg_key: c} for c in chunked_arg_val]
78 |
79 |
80 | def get_dense_rep(x):
81 | if x.q_reps is None:
82 | return x.p_reps
83 | else:
84 | return x.q_reps
85 |
86 |
87 | class GCTrainer(DenseTrainer):
88 | def __init__(self, *args, **kwargs):
89 | logger.info('Initializing Gradient Cache Trainer')
90 | if not _grad_cache_available:
91 | raise ValueError(
92 | 'Grad Cache package not available. You can obtain it from https://github.com/luyug/GradCache.')
93 | super(GCTrainer, self).__init__(*args, **kwargs)
94 |
95 | loss_fn_cls = DistributedContrastiveLoss if self.args.negatives_x_device else SimpleContrastiveLoss
96 | loss_fn = loss_fn_cls(self.model.data_args.train_n_passages)
97 |
98 | self.gc = GradCache(
99 | models=[self.model, self.model],
100 | chunk_sizes=[self.args.gc_q_chunk_size, self.args.gc_p_chunk_size],
101 | loss_fn=loss_fn,
102 | split_input_fn=split_dense_inputs,
103 | get_rep_fn=get_dense_rep,
104 | fp16=self.args.fp16,
105 | scaler=self.scaler
106 | )
107 |
108 | def training_step(self, model, inputs) -> torch.Tensor:
109 | model.train()
110 | queries, passages = self._prepare_inputs(inputs)
111 | queries, passages = {'query': queries}, {'passage': passages}
112 |
113 | _distributed = self.args.local_rank > -1
114 | self.gc.models = [model, model]
115 | loss = self.gc(queries, passages, no_sync_except_last=_distributed)
116 |
117 | return loss / self._dist_loss_scale_factor
118 |
--------------------------------------------------------------------------------
/apicoder/CodeGenAPI/run_private.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export WANDB_PROJECT="Your Project Name"
4 | export WANDB_API_KEY="Your API Key of WANDB"
5 |
6 | BASE_DATA_DIR="your/base/dir"
7 | if [ ! -z "$AMLT_DATA_DIR" ]; then
8 | echo "Run experiment on AMLT."
9 | BASE_DATA_DIR=$AMLT_DATA_DIR
10 | fi
11 |
12 | DOMAIN="PrivateLibrary"
13 | TYPE="private_libs_bin_codegen_v2"
14 |
15 | DATA_DIR="${BASE_DATA_DIR}/XXX/${DOMAIN}/${TYPE}"
16 |
17 | N_GPUS="8"
18 | NODE_SIZE="1"
19 |
20 | if [ ! -z "$1" ]; then
21 | N_GPUS=$1
22 | fi
23 |
24 | BATCH_SIZE=8 # per-GPU batch size by GPU memory: 24G:7, 32G:8, 16G:6
25 | MAX_STEPS=500_000
26 | BLOCK_SIZE=1024
27 | GRAD_ACC_STEPS=2
28 | WARMUP_STEPS=1_000
29 | SAVE_STEPS=2_000
30 |
31 | LR="5e-4"
32 | WD="0.1"
33 |
34 | # Do NOT take the func score into account for resampling: use a constant weight of 1.0
35 | RS_WEIGHTS="1.0_0.5_1.0" #_0.5"
36 | GAS="" #"512K_150K" #default is const
37 |
38 | OUTPUT_DIR="${BASE_DATA_DIR}/XXX/${DOMAIN}/experiments_codegen_v2/"
39 | CKPT_NAME=""
40 |
41 | STEP_SUMMARY="${WARMUP_STEPS}K_${MAX_STEPS}K_${SAVE_STEPS}K"
42 | STEP_SUMMARY=${STEP_SUMMARY//_000/}
43 |
44 | # Resampling with weight 0.4
45 | # GRAD_ACC_STEPS + 1.0 per epoch
46 | ID=""
47 |
48 | if [ ! -z "$RS_WEIGHTS" ]; then
49 | ID="RS_${RS_WEIGHTS}"
50 | fi
51 |
52 | if [ ! -z "$GAS" ]; then
53 | ID="${ID}-GAS_${GAS}"
54 | else
55 | GAS="const"
56 | fi
57 |
58 | if [ ! -z "$CKPT_NAME" ]; then
59 | ID="${ID}-RESUME"
60 | fi
61 |
62 | ACTUAL_GPUS=$((${N_GPUS}*${NODE_SIZE}))
63 | RUN_NAME="${BATCH_SIZE}x${GRAD_ACC_STEPS}x${ACTUAL_GPUS}x${BLOCK_SIZE}-${LR}-${WD}-${STEP_SUMMARY}-${ID}"
64 | RUN_OUTPUT_DIR="$OUTPUT_DIR/$RUN_NAME"
65 |
66 | echo "Experiment Run Name: $RUN_NAME"
67 | echo "Data Dir:" $DATA_DIR
68 | echo "Actual GPUs:" $ACTUAL_GPUS
69 | export DISTRIBUTED_GPU_SIZE=$ACTUAL_GPUS
70 |
71 | echo "Output Dir:" $OUTPUT_DIR
72 | echo "Init Actual Batch Size: ${BATCH_SIZE}x${GRAD_ACC_STEPS}x${N_GPUS}x${NODE_SIZE}"
73 |
74 | Run_Command_Args=" --model_name_or_path $DATA_DIR/model"
75 | Run_Command_Args="$Run_Command_Args --run_name $RUN_NAME"
76 | Run_Command_Args="$Run_Command_Args --output_dir $RUN_OUTPUT_DIR"
77 | Run_Command_Args="$Run_Command_Args --train_file $DATA_DIR/train"
78 | Run_Command_Args="$Run_Command_Args --validation_file $DATA_DIR/valid"
79 | Run_Command_Args="$Run_Command_Args --do_train"
80 | Run_Command_Args="$Run_Command_Args --do_eval"
81 |
82 | Run_Command_Args="$Run_Command_Args --block_size $BLOCK_SIZE"
83 | Run_Command_Args="$Run_Command_Args --logging_steps 100"
84 | Run_Command_Args="$Run_Command_Args --evaluation_strategy steps"
85 | Run_Command_Args="$Run_Command_Args --eval_steps $SAVE_STEPS"
86 | Run_Command_Args="$Run_Command_Args --save_steps $SAVE_STEPS"
87 | Run_Command_Args="$Run_Command_Args --warmup_steps $WARMUP_STEPS"
88 | Run_Command_Args="$Run_Command_Args --learning_rate $LR"
89 | Run_Command_Args="$Run_Command_Args --adam_beta2 0.95"
90 | Run_Command_Args="$Run_Command_Args --lr_scheduler_type cosine"
91 | Run_Command_Args="$Run_Command_Args --resampling_weights $RS_WEIGHTS"
92 |
93 | Run_Command_Args="$Run_Command_Args --max_steps $MAX_STEPS"
94 | Run_Command_Args="$Run_Command_Args --per_device_train_batch_size $BATCH_SIZE"
95 | Run_Command_Args="$Run_Command_Args --per_device_eval_batch_size $BATCH_SIZE"
96 | Run_Command_Args="$Run_Command_Args --gradient_accumulation_steps $GRAD_ACC_STEPS"
97 | Run_Command_Args="$Run_Command_Args --weight_decay $WD"
98 | Run_Command_Args="$Run_Command_Args --fp16"
99 | Run_Command_Args="$Run_Command_Args --report_to wandb"
100 |
101 | if [ ! -z "$GAS" ]; then
102 | Run_Command_Args="$Run_Command_Args --gradient_accumulation_strategy $GAS"
103 | fi
104 |
105 | if [ ! -z "$CKPT_NAME" ]; then
106 | CKPT_PATH=$"$OUTPUT_DIR/$CKPT_NAME"
107 | echo "Resume from checkpoint: $CKPT_PATH"
108 | Run_Command_Args="$Run_Command_Args --resume_from_checkpoint $CKPT_PATH --ignore_data_skip"
109 | fi
110 |
111 |
112 | echo "Run Command Args: $Run_Command_Args"
113 |
114 | # deepspeed --num_gpus $N_GPUS run_gpt.py --deepspeed configs/ds_config.json $Run_Command_Args
115 | # deepspeed --num_gpus $N_GPUS run_gpt.py --deepspeed configs/ds_config_zero3.json $Run_Command_Args
116 |
117 | # if [ ! -z "$NODE_RANK" ]; then
118 | # echo "Run distributed training on multi nodes $NODE_RANK/$NODE_SIZE, master ip = $MASTER_ADDR:$MASTER_PORT"
119 | # python -m torch.distributed.launch \
120 | # --nproc_per_node=$N_GPUS \
121 | # --nnodes=$NODE_SIZE \
122 | # --node_rank=$NODE_RANK \
123 | # --master_addr=$MASTER_ADDR \
124 | # --master_port=$MASTER_PORT \
125 | # --use_env run_cert.py $Run_Command_Args
126 | # else
127 | python -m torch.distributed.launch --nproc_per_node $N_GPUS --use_env run_private.py $Run_Command_Args
128 | # fi
--------------------------------------------------------------------------------
/eval_human_eval.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import argparse
4 | from tqdm import tqdm
5 |
6 | from transformers import pipeline, set_seed
7 | from transformers import AutoTokenizer, AutoModelForCausalLM
8 | from transformers.pipelines.base import Pipeline
9 |
10 | from human_eval.data import write_jsonl, read_problems
11 |
12 | def load_generation_pipe(model_name_or_path: str, gpu_device: int=0):
13 | model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
14 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
15 |
16 | pipe = pipeline(
17 | 'text-generation',
18 | model=model,
19 | tokenizer=tokenizer,
20 | device=gpu_device
21 | )
22 |
23 | print("load generation pipeline from {} over, vocab size = {}, eos id = {}, gpu device = {}.".format(
24 | model_name_or_path, len(tokenizer), tokenizer.eos_token_id, gpu_device)
25 | )
26 |
27 | return pipe
28 |
29 | def extract_function_block(string):
30 | return re.split("\nclass|\ndef|\n#|\n@|\nprint|\nif", string)[0].rstrip()
31 |
32 | def run_code_generation(pipe, prompt, num_completions=1, **gen_kwargs):
33 | set_seed(123)
34 |
35 | code_gens = pipe(prompt,
36 | num_return_sequences=num_completions,
37 | **gen_kwargs
38 | )
39 |
40 | return [extract_function_block(code_gen["generated_text"][len(prompt):]) for code_gen in code_gens]
41 |
42 | def evaluate_on_human_eval(
43 | model_name_or_path: str,
44 | temperature: float,
45 | top_p: float,
46 | num_samples_per_task: int,
47 | max_new_tokens: int,
48 | gpu_device: int,
49 | output_dir: str,
50 | ) -> str:
51 |
52 | pipe: Pipeline = load_generation_pipe(model_name_or_path, gpu_device=gpu_device)
53 | eval_name = f"human_eval.t{temperature}.p{top_p}.l{max_new_tokens}.n{num_samples_per_task}"
54 |
55 | if output_dir is None:
56 | if os.path.exists(model_name_or_path):
57 | output_dir = model_name_or_path
58 | else:
59 | raise ValueError("Output dir can't be null if you are not evaluation a local model.")
60 |
61 | os.makedirs(output_dir, exist_ok=True)
62 | saved_path = os.path.join(output_dir, f"{eval_name}.samples.jsonl")
63 |
64 | gen_kwargs = {
65 | "do_sample": True,
66 | "temperature": temperature,
67 | "max_new_tokens": max_new_tokens,
68 | "top_p": top_p,
69 | "top_k": 0,
70 | "pad_token_id": pipe.tokenizer.pad_token_id if pipe.tokenizer.pad_token_id else pipe.tokenizer.eos_token_id,
71 | "eos_token_id": pipe.tokenizer.eos_token_id
72 | }
73 |
74 | problems = read_problems()
75 | samples = []
76 | generate_batch_size = min(50, num_samples_per_task)
77 |
78 | bos_token = pipe.tokenizer.bos_token if pipe.tokenizer.bos_token else pipe.tokenizer.eos_token
79 |
80 | for task_id in tqdm(problems):
81 |         # The strip operation is important, as the tokenizer will not treat '\n' as an independent token
82 | prompt = problems[task_id]["prompt"].strip()
83 |
84 | for _ in range(num_samples_per_task // generate_batch_size):
85 | input_prompt = bos_token + prompt
86 | gen_results = run_code_generation(pipe, input_prompt, num_completions=generate_batch_size, **gen_kwargs)
87 | for gen_result in gen_results:
88 | samples.append(dict(task_id=task_id, completion=gen_result))
89 |
90 | write_jsonl(saved_path, samples)
91 |     print("Generation finished; saved {} samples to {}.".format(len(samples), saved_path))
92 |
93 | if __name__ == '__main__':
94 | parser = argparse.ArgumentParser(description='Run evaluation for code generation model on human-eval.')
95 |
96 | parser.add_argument('-model', '--model_name_or_path', type=str, required=True)
97 | parser.add_argument('-o', '--output_dir', type=str, default=None)
98 | parser.add_argument('-n', '--num_completions', type=int, default=100)
99 | parser.add_argument('-t', '--temperature', type=float, default=0.2)
100 | parser.add_argument('-p', '--top_p', type=float, default=0.95)
101 | parser.add_argument('-l', '--max_new_tokens', type=int, default=100)
102 | parser.add_argument('-gpu', "--gpu_device", type=int, default=0)
103 |
104 | args = parser.parse_args()
105 |
106 | evaluate_on_human_eval(
107 | model_name_or_path=args.model_name_or_path,
108 | temperature=args.temperature,
109 | top_p=args.top_p,
110 | num_samples_per_task=args.num_completions,
111 | max_new_tokens=args.max_new_tokens,
112 | gpu_device=args.gpu_device,
113 | output_dir=args.output_dir,
114 | )
--------------------------------------------------------------------------------
/cert/pandas-numpy-eval/README.md:
--------------------------------------------------------------------------------
1 | # PandasEval and NumpyEval
2 |
3 | Two benchmarks for evaluating the performance of library-oriented code generation. They are proposed in the paper "[CERT: Continual Pre-Training on Sketches for Library-Oriented Code Generation](https://arxiv.org/pdf/2206.06888.pdf)".
4 |
5 | The evaluation script is adapted from OpenAI's [humaneval](https://github.com/openai/human-eval/tree/master/human_eval).
6 |
7 | ## Installation
8 |
9 | Make sure to use python 3.7 or later:
10 | ```
11 | $ conda create -n pycodegpt python=3.7
12 | $ conda activate pycodegpt
13 | ```
14 |
15 | Check out and install this repository:
16 | ```
17 | $ pip install -e pandas-numpy-eval
18 | ```
19 |
20 | ## Configuration
21 | ```
22 | ├── data # The directory of our crafted benchmarks.
23 | │ ├── NumpyEval.jsonl.gz
24 | │ └── PandasEval.jsonl.gz
25 | ├── pandas_numpy_eval
26 | │ ├── data.py # Choosing whether to load PandasEval or NumpyEval.
27 | │ ├── evaluate_functional_correctness.py # Calculating the evaluation results.
28 | │ ├── evaluation.py # Calculating the evaluation results.
29 | │ └── execution.py # Executing the predicted code.
30 | ```
31 |
32 | ## Running Environment Testing
33 |
34 | You need to replace `XXX` with your local path to test the pandas results. (Make sure that the `LIB` variable in `pandas-numpy-eval/pandas_numpy_eval/data.py` is set to `pandas`.)
35 | ```
36 | $ evaluate_functional_correctness XXX/CERT/pandas-numpy-eval/data/Example_Pandas_PYCODEGPT_samples.jsonl
37 | ```
38 |
39 | If you can successfully run the above command and obtain the following results, the evaluation environment is ready to use.
40 | ```
41 | {'pass@1': 0.06930693069306931}
42 | ```
43 |
44 | # The Process of Constructing PandasEval and NumpyEval
45 |
46 | We refer to [StackOverFlow](https://stackoverflow.com/), a Q&A website for programmers, to build the benchmarks. We search for posts by library tag and select those with high votes. To ensure quality, we only refer to posts with accepted answers. We go through a post's question and its accepted answer, then manually organize them into the form required by our benchmarks, containing both context and target code. We also polish all programming problems so that the problem descriptions are clear and the code is correct. Note that we keep the intentions and descriptions of the programming problems consistent with the original posts to the greatest extent possible. Finally, two programmers, each with more than three years of coding experience in the library, are invited to act as code generation models and check the quality of the data.
47 |
48 | As a result, we craft 101 programming problems each for PandasEval and NumpyEval. Each programming problem is equipped with test cases for evaluation.
49 |
50 | # Two Examples of Programming Problems
51 |
52 | Context is shown with a white background and the target code with a gray background.
53 |
54 |
55 |
56 |
57 |
58 | ## Reference
59 |
60 | If you use PandasEval or NumpyEval in your work, please cite the paper:
61 |
62 | ```
63 | @inproceedings{CERT,
64 | title={{CERT}: Continual Pre-training on Sketches for Library-oriented Code Generation},
65 | author={Zan, Daoguang and Chen, Bei and Yang, Dejian and Lin, Zeqi and Kim, Minsu and Guan, Bei and Wang, Yongji and Chen, Weizhu and Lou, Jian-Guang},
66 | booktitle={The 2022 International Joint Conference on Artificial Intelligence},
67 | year={2022}
68 | }
69 | ```
70 |
71 | If you use the evaluation script, please also cite the following paper:
72 | ```
73 | @article{codex,
74 | title={Evaluating Large Language Models Trained on Code},
75 | author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
76 | year={2021},
77 | eprint={2107.03374},
78 | archivePrefix={arXiv},
79 | primaryClass={cs.LG}
80 | }
81 | ```
82 |
--------------------------------------------------------------------------------
/cert/README.md:
--------------------------------------------------------------------------------
1 | # CERT: Continual Pre-Training on Sketches for Library-Oriented Code Generation
2 |
3 | Official repository for our paper ["CERT: Continual Pre-Training on Sketches for Library-Oriented Code Generation"](https://arxiv.org/pdf/2206.06888.pdf), containing crafted benchmarks, codes, and pre-trained models.
4 |
5 | ---
6 |
7 | ## Overview
8 |
9 | In our paper, we investigate whether and how language models pre-trained on a large-scale unlabelled code corpus can generate library-oriented code snippets. To meet this challenge, we propose CERT (for sket**C**her and g**E**ne**R**a**T**or), a continual pre-training approach on sketches for library-oriented code generation. In CERT, a sketcher first predicts a sketch, which omits user-defined details; then a generator uses the sketch as a prompt to generate the complete code. In addition, we craft two evaluation benchmarks for Python libraries, called PandasEval and NumpyEval, each including 101 programming problems using Pandas and NumPy, respectively.
10 |
11 |
12 |
13 | Figure1: Overview of CERT: a sketcher and a generator.
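Conceptually, inference chains the two models back to back. Below is a minimal sketch of this two-stage pipeline, assuming hypothetical checkpoint paths (`models/cert-sketcher`, `models/cert-generator`); the actual implementation lives in `eval_cert_unified.py`.

```python
from transformers import pipeline

# Hypothetical checkpoint paths; substitute your trained sketcher and generator.
sketcher = pipeline("text-generation", model="models/cert-sketcher")
generator = pipeline("text-generation", model="models/cert-generator")

context = 'df = pd.DataFrame(data)\n# Drop rows where the "price" column is NaN\n'

# Stage 1: the sketcher predicts a sketch that omits user-defined details,
# e.g. something like: df = df.dropna(subset=[str])
sketch = sketcher(context, max_new_tokens=100)[0]["generated_text"][len(context):]

# Stage 2: the generator uses the sketch as a prompt and fills in the
# user-defined details to produce the complete code.
completion = generator(context + sketch, max_new_tokens=100)[0]["generated_text"]
print(completion)
```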
14 |
15 | ## Project Directory
16 | ```
17 | ├── nl2code # Basic scripts for loading corpus and training CERT.
18 | ├── code_dataset.py
19 | ├── dynamic_block_dataset.py
20 | ├── hf_trainer.py
21 | └── indexed_dataset.py
22 | ├── pandas-numpy-eval # Benchmarks and evaluation scripts. Please go to the folder for details.
23 | ├── scripts
24 | ├── ast_utils.py # Tools to handle the AST of Python code, for example, converting a code block to its code sketch.
25 | ├── encode_domain.py # Implementation of encoding.
26 | ├── file_utils.py # Tools for managing files.
27 | ├── multiprocessing_utils.py # Tools for managing multiple processes.
28 | └── run_encode_domain.sh # Encoding the crafted corpus (sketcher corpus and generator corpus).
29 | ├── eval_cert_unified.py # Implementation of code generation for CERT-sketcher and CERT-generator.
30 | ├── eval_cert.py # Implementation of code generation for PyCodeGPT and other baseline models.
31 | ├── run_cert.py # Implementation of CERT training.
32 | ├── run_evaluating_codes.sh # The entry script for evaluating the generated code snippets, and outputting the final results (pass@k).
33 | ├── run_generating_codes.sh # The entry script for CERT inference, which can generate a lot of code snippets for each programming problem in PandasEval and NumpyEval.
34 | ├── run_training_cert.sh # The entry script for training CERT.
35 | ```
36 |
37 | ## Quickstart
38 |
39 | This section covers environment, data preparation, model inference, and model training.
40 |
41 | ### Preparation
42 |
43 | 1. Configuring your runtime environment
44 |
45 | ```
46 | $ cd CERT/
47 | $ pip install -r requirements.txt
48 | ```
49 |
50 | 2. Preparation of pre-trained models
51 |
52 | Download the pre-trained checkpoint (e.g., `pycodegpt-110M`) from `Releases` in this GitHub project and place it in the corresponding folder (e.g., `CERT/models/pycodegpt-110M`).
53 |
54 | 3. Updating the scripts according to your local path
55 |
56 | - Update `run_training_cert.sh`.
57 | - Update `run_generating_codes.sh`.
58 | - Update `run_evaluating_codes.sh`.
59 |
60 | ### Use PyCodeGPT or CERT
61 |
62 | Firstly, multiple code snippets are generated for each programming problem (`run_generating_codes.sh`). Then, the code snippets are evaluated (`run_evaluating_codes.sh`).
63 |
64 | ```
65 | $ bash run_generating_codes.sh
66 | $ bash run_evaluating_codes.sh
67 | ```
68 |
69 | ### Train CERT
70 |
71 | Train CERT (sketcher and generator) on the large-scale code corpus with the following command.
72 |
73 | ```
74 | $ bash run_training_cert.sh
75 | ```
76 |
77 | ## Experiments and Some Cases
78 |
79 | In the inference phase, we set the `temperature` to one of `[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]`, the number of samples (`NUM_SAMPLES`) to `200`, the maximum number of generated tokens (`MAX_TOKENS`) to `100`, and `top_p` to `0.9`. The best result across these hyper-parameter settings is reported.
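For reference, these settings correspond to generation kwargs roughly like the following (an illustrative sketch; the keys mirror the `gen_kwargs` used in our generation scripts):

```python
# Sampling configuration for one inference run; the temperature is swept
# over [0.1, 0.2, ..., 1.0] across runs, and 200 samples are drawn per problem.
gen_kwargs = {
    "do_sample": True,
    "temperature": 0.2,     # one value from the sweep
    "top_p": 0.9,
    "top_k": 0,             # disable top-k so only nucleus sampling applies
    "max_new_tokens": 100,  # MAX_TOKENS
}
```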
80 |
81 | Here are some cases:
82 |
83 | 1. Both the sketcher and the generator predict successfully. (This usually occurs when there are many user-defined terms.)
84 | 
85 | 2. The sketcher directly predicts the correct answer. (This usually occurs when there are relatively few or no user-defined terms.)
86 | 
87 | 3. The sketcher predicts a wrong sketch, but the generator rectifies it and predicts the correct answer.
88 |
89 |
90 |
91 | ## Citation
92 | If you find our work useful, please cite the paper:
93 | ```
94 | @inproceedings{CERT,
95 | title={{CERT}: Continual Pre-training on Sketches for Library-oriented Code Generation},
96 | author={Zan, Daoguang and Chen, Bei and Yang, Dejian and Lin, Zeqi and Kim, Minsu and Guan, Bei and Wang, Yongji and Chen, Weizhu and Lou, Jian-Guang},
97 | booktitle={The 2022 International Joint Conference on Artificial Intelligence},
98 | year={2022}
99 | }
100 | ```
--------------------------------------------------------------------------------
/cert/run_training_cert.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) Microsoft Corporation.
3 | # Licensed under the MIT license.
4 |
5 | # setup for wandb
6 | export WANDB_PROJECT="CERT"
7 | export WANDB_API_KEY="Your wandb api key"
8 |
9 | BASE_DATA_DIR="Your base data directory"
10 | if [ ! -z "$AMLT_DATA_DIR" ]; then
11 | echo "Run experiment on AMLT."
12 | BASE_DATA_DIR=$AMLT_DATA_DIR
13 | fi
14 |
15 | # [Pandas, Numpy]
16 | DOMAIN="Pandas"
17 |
18 | # [normal, sketcher, generator]
19 | TYPE="generator"
20 |
21 | # --------------------------------------------------------------------------
22 | # You should replace the following variables according to your own settings.
23 | # --------------------------------------------------------------------------
24 | DATA_DIR="${BASE_DATA_DIR}/CERT/${DOMAIN}/${TYPE}_bin"
25 |
26 | N_GPUS="1"
27 | NODE_SIZE="1"
28 |
29 | if [ ! -z "$1" ]; then
30 | N_GPUS=$1
31 | fi
32 |
33 | if [ ! -z "$2" ]; then
34 | NODE_SIZE=$2
35 | fi
36 |
37 | BATCH_SIZE=1 # per-GPU batch size; rough guide by GPU memory: 24G: 7, 32G: 8, 16G: 6
38 | MAX_STEPS=100_000
39 | BLOCK_SIZE=1024
40 | GRAD_ACC_STEPS=2
41 | WARMUP_STEPS=1_000
42 | SAVE_STEPS=2_000
43 |
44 | LR="5e-4"
45 | WD="0.1"
46 |
47 | # Do NOT take the func score into consideration for resampling: set a constant weight of 1.0
48 | RS_WEIGHTS="1.0_0.5_1.0" #_0.5"
49 | GAS="" # e.g. "512K_150K"; empty means the constant strategy
50 |
51 | # --------------------------------------------------------------------------
52 | # You should replace the following variables according to your own settings.
53 | # --------------------------------------------------------------------------
54 | OUTPUT_DIR="${BASE_DATA_DIR}/CERT/${DOMAIN}/experiments/${TYPE}_models"
55 | CKPT_NAME=""
56 |
57 | if [ ! -z "$AMLT_DATA_DIR" ]; then
58 | OUTPUT_DIR="$AMLT_DATA_DIR/CERT/${DOMAIN}/experiments/${TYPE}_models"
59 | BATCH_SIZE=10
60 | GRAD_ACC_STEPS=3
61 |
62 | fi
63 |
64 | STEP_SUMMARY="${WARMUP_STEPS}K_${MAX_STEPS}K_${SAVE_STEPS}K"
65 | STEP_SUMMARY=${STEP_SUMMARY//_000/}
66 |
67 | # Resampling with weight 0.4
68 | # GRAD_ACC_STEPS + 1.0 per epoch
69 | ID=""
70 |
71 | if [ ! -z "$RS_WEIGHTS" ]; then
72 | ID="RS_${RS_WEIGHTS}"
73 | fi
74 |
75 | if [ ! -z "$GAS" ]; then
76 | ID="${ID}-GAS_${GAS}"
77 | else
78 | GAS="const"
79 | fi
80 |
81 | if [ ! -z "$CKPT_NAME" ]; then
82 |     ID="${ID}-RESUME"
83 | fi
84 |
85 | ACTUAL_GPUS=$((${N_GPUS}*${NODE_SIZE}))
86 | RUN_NAME="${BATCH_SIZE}x${GRAD_ACC_STEPS}x${ACTUAL_GPUS}x${BLOCK_SIZE}-${LR}-${WD}-${STEP_SUMMARY}-${ID}"
87 | RUN_OUTPUT_DIR="$OUTPUT_DIR/$RUN_NAME"
88 |
89 | echo "Experiment Run Name: $RUN_NAME"
90 | echo "Data Dir:" $DATA_DIR
91 | echo "Actual GPUs:" $ACTUAL_GPUS
92 | export DISTRIBUTED_GPU_SIZE=$ACTUAL_GPUS
93 |
94 | echo "Output Dir:" $OUTPUT_DIR
95 | echo "Init Actual Batch Size: ${BATCH_SIZE}x${GRAD_ACC_STEPS}x${N_GPUS}x${NODE_SIZE}"
96 |
97 | Run_Command_Args=" --model_name_or_path $DATA_DIR/model"
98 | Run_Command_Args="$Run_Command_Args --run_name $RUN_NAME"
99 | Run_Command_Args="$Run_Command_Args --output_dir $RUN_OUTPUT_DIR"
100 | Run_Command_Args="$Run_Command_Args --train_file $DATA_DIR/train"
101 | Run_Command_Args="$Run_Command_Args --validation_file $DATA_DIR/valid"
102 | Run_Command_Args="$Run_Command_Args --do_train"
103 | Run_Command_Args="$Run_Command_Args --do_eval"
104 |
105 | Run_Command_Args="$Run_Command_Args --block_size $BLOCK_SIZE"
106 | Run_Command_Args="$Run_Command_Args --logging_steps 100"
107 | Run_Command_Args="$Run_Command_Args --evaluation_strategy steps"
108 | Run_Command_Args="$Run_Command_Args --eval_steps $SAVE_STEPS"
109 | Run_Command_Args="$Run_Command_Args --save_steps $SAVE_STEPS"
110 | Run_Command_Args="$Run_Command_Args --warmup_steps $WARMUP_STEPS"
111 | Run_Command_Args="$Run_Command_Args --learning_rate $LR"
112 | Run_Command_Args="$Run_Command_Args --adam_beta2 0.95"
113 | Run_Command_Args="$Run_Command_Args --lr_scheduler_type cosine"
114 | Run_Command_Args="$Run_Command_Args --resampling_weights $RS_WEIGHTS"
115 |
116 | Run_Command_Args="$Run_Command_Args --max_steps $MAX_STEPS"
117 | Run_Command_Args="$Run_Command_Args --per_device_train_batch_size $BATCH_SIZE"
118 | Run_Command_Args="$Run_Command_Args --per_device_eval_batch_size $BATCH_SIZE"
119 | Run_Command_Args="$Run_Command_Args --gradient_accumulation_steps $GRAD_ACC_STEPS"
120 | Run_Command_Args="$Run_Command_Args --weight_decay $WD"
121 | Run_Command_Args="$Run_Command_Args --fp16"
122 | Run_Command_Args="$Run_Command_Args --report_to wandb"
123 |
124 | if [ "$OMPI_COMM_WORLD_RANK" == "0" ]; then
125 |     mkdir -p "$OUTPUT_DIR/code/"
126 | cp -r "./" "$OUTPUT_DIR/code/"
127 | echo "Save experiment source code over."
128 | fi
129 |
130 | if [ ! -z "$GAS" ]; then
131 | Run_Command_Args="$Run_Command_Args --gradient_accumulation_strategy $GAS"
132 | fi
133 |
134 | if [ ! -z "$CKPT_NAME" ]; then
135 |     CKPT_PATH="$OUTPUT_DIR/$CKPT_NAME"
136 | echo "Resume from checkpoint: $CKPT_PATH"
137 | Run_Command_Args="$Run_Command_Args --resume_from_checkpoint $CKPT_PATH --ignore_data_skip"
138 | fi
139 |
140 | echo "Run Command Args: $Run_Command_Args"
141 |
142 | # deepspeed --num_gpus $N_GPUS run_gpt.py --deepspeed configs/ds_config.json $Run_Command_Args
143 | CUDA_VISIBLE_DEVICES=0 python -m torch.distributed.launch --nproc_per_node $N_GPUS --use_env run_cert.py $Run_Command_Args
144 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/scripts/run_prepare_test_private_code.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import glob
4 | from argparse import ArgumentParser
5 |
6 | from transformers import AutoTokenizer
7 | from tqdm import tqdm
8 |
9 | import sys
10 | # -------------------------------------------------------------------------------------------------
11 | # You need to change this path to your own `APICoder-CodeGenAPI` path; it is better to use an absolute path.
12 | # -------------------------------------------------------------------------------------------------
13 | sys.path.append('../../../APICoder-CodeGenAPI/')
14 | from scripts.get_libs_info_from_code import (
15 | get_dict_of_api_name_lib_api_paths,
16 | get_dict_of_api_path_api_signature_and_api_desp,
17 | get_first_sentence_from_api_desp,
18 | normalizer_api_desp
19 | )
20 | from APICoder.get_lib_comment_for_eval import (
21 | get_code_and_comment_by_lib_name_and_task_id,
22 | get_one_instance_by_lib_name
23 | )
24 | from APICoder.get_api_info_by_name import (
25 | get_api_name_4_api_sign_and_desps,
26 | get_all_api_info_prompt_list_by_api_name
27 | )
28 |
29 | # -------------------------------------------------------------------------------------------------
30 | # You need to change this path to your own `crawl_code` path; it is better to use an absolute path.
31 | # -------------------------------------------------------------------------------------------------
32 | YOUR_CRAWLED_API_PATH = "PrivateLibrary/data/API-Doc"
33 | api_path_api_signature_and_api_desp = get_dict_of_api_path_api_signature_and_api_desp(
34 | YOUR_CRAWLED_API_PATH,
35 | "pandas,numpy,monkey,beatnum,torchdata",
36 | "datetime",
37 | "False"
38 | )
39 |
40 | parser = ArgumentParser()
41 |
42 | parser.add_argument('--base_model_dir', type=str, default="/your/base/dir/including/`eval_datas(benchmarks)`/and/others/")
43 | parser.add_argument('--benchmarks', type=str, nargs='+', default=["pandas", "numpy", "monkey", "beatnum", "torchdata"])  # nargs='+' yields a proper list; argparse's type=list would split a string into characters
44 | parser.add_argument('--output_dir', type=str, default="PrivateLibrary/APIRetriever/data/inference")
45 | parser.add_argument('--tokenizer', type=str, required=False, default='your/path/of/bert-base-uncased/')
46 |
47 | args = parser.parse_args()
48 |
49 | base_model_dir, benchmarks, output_dir = args.base_model_dir, args.benchmarks, args.output_dir
50 | benchmark_dir = os.path.join(base_model_dir, "eval_datas")
51 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True)
52 |
53 | x = [0, 1000, 10000, 100000, 1000000] # the unique id of each code comment
54 |
55 | '''
56 | For the `code comment` of the 5 benchmarks, build the query corpus (5 files in total, named xxx_comment.json).
57 | '''
58 | for idx, library_name in enumerate(tqdm(benchmarks)):
59 | if not os.path.exists(os.path.join(output_dir, library_name)):
60 | os.makedirs(os.path.join(output_dir, library_name))
61 | comment_out_name = os.path.join(output_dir, library_name + "_comment.json")
62 | comment_writer = open(comment_out_name, 'w+')
63 | base_id = x[idx]
64 |
65 | lib_iter_obj = get_one_instance_by_lib_name(library_name, base_dir=base_model_dir)
66 | for this_instance_dict in tqdm(lib_iter_obj):
67 | # dict_keys(['task_id', 'prompt', 'entry_point', 'canonical_solution', 'test'])
68 | task_id = this_instance_dict["task_id"]
69 | text_id = base_id + int(task_id.split("/")[-1])
70 | code_comment_solution = get_code_and_comment_by_lib_name_and_task_id(library_name, task_id, base_model_dir)
71 | this_code, this_comment, this_solution = code_comment_solution[0], code_comment_solution[1], code_comment_solution[2]
72 | save_dict = {
73 | "text_id": text_id,
74 | "task_id": task_id,
75 | "text": this_comment
76 | }
77 | comment_writer.write(json.dumps(save_dict) + "\n")
78 |
79 | comment_writer.close()
80 |
81 | '''
82 | For the `API information` of the 5 benchmarks, build the API corpus (5 files in total, named xxx_api.json).
83 | '''
84 | y = [1000000, 1100000, 1200000, 1300000, 1400000] # the unique id of each API
85 | for idx, library_name in enumerate(tqdm(benchmarks)):
86 | if not os.path.exists(output_dir):
87 | os.makedirs(output_dir)
88 | api_out_name = os.path.join(output_dir, library_name + "_api.json")
89 | api_writer = open(api_out_name, 'w+')
90 | base_id = y[idx]
91 | api_name_4_api_sign_and_desps = get_api_name_4_api_sign_and_desps(library_name, base_model_dir)
92 | total_api, now_number = len(api_name_4_api_sign_and_desps), 0
93 | for api_name, api_path_info_dict in tqdm(api_name_4_api_sign_and_desps.items(), total=total_api):
94 | for api_idx, (api_path, api_info_list) in enumerate(api_path_info_dict.items()):
95 | api_signature, api_description = api_info_list[0].strip(), get_first_sentence_from_api_desp(normalizer_api_desp(api_info_list[1]))
96 | if api_signature == "":
97 | continue
98 | api_info_prompt=f"{api_name}{api_signature}: {api_description}"
99 | text_id = base_id + now_number
100 | save_dict = {
101 | "text_id": text_id,
102 | "text": api_info_prompt
103 | }
104 | now_number+=1
105 | api_writer.write(json.dumps(save_dict) + "\n")
106 |
107 | api_writer.close()
108 |
109 | print("Done!")
--------------------------------------------------------------------------------
/apicoder/private-eval/data/TorchData_no.API_number_0.CodeGen.hm_False.machine.t0.1.p0.9.l100.n1.samples.jsonl:
--------------------------------------------------------------------------------
1 | {"task_id": "TorchDataEval/0", "completion": " datapipe.repeat(6)"}
2 | {"task_id": "TorchDataEval/1", "completion": " dp.index(0)"}
3 | {"task_id": "TorchDataEval/2", "completion": " batch_dp.sample(batch_size=1, shuffle=False)"}
4 | {"task_id": "TorchDataEval/4", "completion": " source_dp.split(odd_or_even)"}
5 | {"task_id": "TorchDataEval/5", "completion": " source_dp.clone(), source_dp.clone()\ndp1.iter.reset()\ndp2.iter.reset()"}
6 | {"task_id": "TorchDataEval/6", "completion": " dp1.merge(dp2)"}
7 | {"task_id": "TorchDataEval/7", "completion": " dp1.map(mapdp)"}
8 | {"task_id": "TorchDataEval/9", "completion": " SampleMultiplexer(source_dp1, source_dp2, weitghts)"}
9 | {"task_id": "TorchDataEval/10", "completion": " source_dp.unzip()"}
10 | {"task_id": "TorchDataEval/11", "completion": " dp.batch(3)\ndp = dp.drop(0)\ndp = dp.drop(1)\ndp = dp.drop(2)"}
11 | {"task_id": "TorchDataEval/12", "completion": " torchdata.datapipes.BatchDatapipe(source_dp, batch_size=3, batch_num=100, drop_last=False)"}
12 | {"task_id": "TorchDataEval/14", "completion": " IterableWrapper([\"a.png\", \"b.png\", \"a.json\", \"b.json\", \"a.jpg\", \"c.json\"])"}
13 | {"task_id": "TorchDataEval/16", "completion": " HttpReader(file_url, iterable_wrapper=IterableWrapper(file_url))"}
14 | {"task_id": "TorchDataEval/17", "completion": " source_dp.map(mutiple_fn)"}
15 | {"task_id": "TorchDataEval/18", "completion": " dp.map(lambda x: x + 1)"}
16 | {"task_id": "TorchDataEval/19", "completion": " dp.filter(lambda x: is_even(x))"}
17 | {"task_id": "TorchDataEval/20", "completion": " dp.get(0, 0)"}
18 | {"task_id": "TorchDataEval/21", "completion": " dp.map(lambda x: x['a'])"}
19 | {"task_id": "TorchDataEval/23", "completion": " map_dp_1.batch(2)\nnew_dp.map(lambda x: x + 1) #"}
20 | {"task_id": "TorchDataEval/24", "completion": " HttpReader(URL)\nag_news_train.process()"}
21 | {"task_id": "TorchDataEval/25", "completion": " HttpReader(URL, lambda_func_)\nag_news_train.map(lambda t: (int(t[0]), \" \".join(t[1:])))"}
22 | {"task_id": "TorchDataEval/26", "completion": " ag_news_train.map(lambda batch: lambda_batch(batch))"}
23 | {"task_id": "TorchDataEval/27", "completion": " dp.repeat(3)\ndp = dp.sample(3)"}
24 | {"task_id": "TorchDataEval/28", "completion": " dp1 + dp2"}
25 | {"task_id": "TorchDataEval/29", "completion": " dp1.merge(dp2)"}
26 | {"task_id": "TorchDataEval/30", "completion": " dp1.merge(dp2, keep_key=True)"}
27 | {"task_id": "TorchDataEval/31", "completion": " dp1.transform(merge_fn)\nres_dp = res_dp.map(lambda x: x[1])\nres_dp = res_dp.map(lambda x: x[0])"}
28 | {"task_id": "TorchDataEval/32", "completion": " dp1.merge_fn(dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1, dp1,"}
29 | {"task_id": "TorchDataEval/33", "completion": " dp1.map(merge_fn, dp1)\nres_map = mapdp.map(merge_fn, mapdp)"}
30 | {"task_id": "TorchDataEval/34", "completion": " dp1.map(merge_fn, dp1.map(mapdp, dp1.map(mapdp, dp1.map(mapdp, dp1.map(mapdp, dp1.map(mapdp, dp1.map(mapdp, dp1.map(mapdp, dp1.map(mapdp, dp1.map(mapdp, dp1.map(mapdp"}
31 | {"task_id": "TorchDataEval/35", "completion": " source_dp.group(group_fn)\ndp1 = source_dp.group(group_fn, length=1)\ndp2 = source_dp.group(group_fn, length=1, reserve=1)\ndp3 = source_dp.group(group_fn, length=1, reserve=1, reserve=1)"}
32 | {"task_id": "TorchDataEval/37", "completion": " ds.collate(collate_fn)"}
33 | {"task_id": "TorchDataEval/38", "completion": " ds.collate(collate_fn)"}
34 | {"task_id": "TorchDataEval/40", "completion": " source_dp.split(great_than_5)"}
35 | {"task_id": "TorchDataEval/41", "completion": " SampleMultiplexer(dp1, dp2, weight_)"}
36 | {"task_id": "TorchDataEval/42", "completion": " raw_dp"}
37 | {"task_id": "TorchDataEval/43", "completion": " dp.batch(3)\ndp = dp.drop_last(2)\ndp = dp.drop_last(2)\ndp = dp.drop_last(2)\ndp = dp.drop_last(2)\ndp = dp.drop_last(2)\ndp = dp.drop_last(2)\ndp = dp.drop_last(2)\ndp = dp.drop_last"}
38 | {"task_id": "TorchDataEval/44", "completion": " IterableWrapper([\"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\"])\ndp_4 = IterableWrapper([\"1\", \"2\", \"3\", \"4\", \"5\", \"6\", \"7\", \"8\", \"9\", \"10\"])\ndp_5 = IterableWrapper([\"1\", \"2\", \"3\", \"4\", \""}
39 | {"task_id": "TorchDataEval/45", "completion": " dp_source_1.concatenate(dp_source_2)\nindex_dp.index.name = 'Ids'"}
40 | {"task_id": "TorchDataEval/46", "completion": " dp_source_1.join(dp_source_2, index_name='Ids')"}
41 | {"task_id": "TorchDataEval/47", "completion": " dp_source_1.join(dp_source_2)"}
42 | {"task_id": "TorchDataEval/48", "completion": " IterableWrapper(flatted_func)"}
43 | {"task_id": "TorchDataEval/49", "completion": " HttpReader(AG_NEWS_CSV_URL)\nag_news_train.map(lambda x: x.decode('utf-8'))"}
44 | {"task_id": "TorchDataEval/3", "completion": " dp_one.concatenate(dp_two)"}
45 | {"task_id": "TorchDataEval/8", "completion": " dp1.map(lambda x: x[0])\nresult = dp2.map(lambda x: x[0])"}
46 | {"task_id": "TorchDataEval/13", "completion": " ds.collate(int2tensor)"}
47 | {"task_id": "TorchDataEval/15", "completion": " torchdata.datapipes.unbatch(source_dp, level=1)"}
48 | {"task_id": "TorchDataEval/22", "completion": " dp.generate_by_chunk(1)"}
49 | {"task_id": "TorchDataEval/39", "completion": " dp_dog.remove_final_number(remove_final_number(dp_dog.get_data()))\nresult = dp_cat.remove_final_number(remove_final_number(dp_cat.get_data()))"}
50 | {"task_id": "TorchDataEval/36", "completion": " source_dp.group_by(group_fn)"}
51 |
--------------------------------------------------------------------------------
/apicoder/APIRetriever/scripts/run_prepare_train_private_code.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import glob
4 | from argparse import ArgumentParser
5 |
6 | from transformers import AutoTokenizer
7 | from tqdm import tqdm
8 |
9 | import sys
10 | # -------------------------------------------------------------------------------------------------
11 | # You need to change this path to your own `APICoder-CodeGenAPI` path; it is better to use an absolute path.
12 | # -------------------------------------------------------------------------------------------------
13 | sys.path.append('../../../CodeGenAPI/')
14 | from scripts.get_libs_info_from_code import (
15 | get_dict_of_api_name_lib_api_paths,
16 | get_dict_of_api_path_api_signature_and_api_desp,
17 | get_first_sentence_from_api_desp,
18 | normalizer_api_desp
19 | )
20 |
21 | # -------------------------------------------------------------------------------------------------
22 | # You need to change this path to your own `crawl_code` path; it is better to use an absolute path.
23 | # -------------------------------------------------------------------------------------------------
24 | YOUR_CRAWLED_API_PATH = "PrivateLibrary/data/API-Doc"
25 | api_path_api_signature_and_api_desp = get_dict_of_api_path_api_signature_and_api_desp(
26 | YOUR_CRAWLED_API_PATH,
27 | "pandas,numpy,monkey,beatnum,torchdata",
28 | "datetime",
29 | "False"
30 | )
31 |
32 | # -------------------------------------------------------------------------------------------------
33 | # You need to change the paths below to your own; it is better to use absolute paths.
34 | # -------------------------------------------------------------------------------------------------
35 | parser = ArgumentParser()
36 | parser.add_argument('--input', type=str, default="PrivateLibrary/APIRetriever/data/train/unprocessed-train-data", help="each jsonl file under this path contains many JSON lines; each line's format is {'code_doc': '', 'positive_APIs': ['A', ...], 'negative_APIs': ['B', ...]}")
37 | parser.add_argument('--data_mode', type=str, default="", help="the prefix of the input jsonl file, default is empty")
38 | parser.add_argument('--output', type=str, default="PrivateLibrary/APIRetriever/data/train/processed-train-data", help="the output path")
39 | parser.add_argument('--tokenizer', type=str, required=False, default='/your/path/of/bert-base-uncased')
40 |
41 | args = parser.parse_args()
42 |
43 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True)
44 |
45 | if not os.path.exists(args.output):
46 | os.makedirs(args.output)
47 |
48 | all_data_paths = glob.glob(os.path.join(args.input, f'{args.data_mode}*.jsonl'))
49 | print(f"Now, all data paths are: {all_data_paths}")
50 |
51 | # -------------------------------------------------------------------------------------------------
52 | # your training data name default is `private_data_train.json`, you can change it to your own name
53 | # -------------------------------------------------------------------------------------------------
54 | with open(os.path.join(args.output, 'private_data_train.json'), 'w+') as f:
55 | for data_path in tqdm(all_data_paths):
56 | data_reader = open(data_path, 'r')
57 | for line in tqdm(data_reader):
58 | group = {}
59 | # dict_keys(['code_block', 'code_doc', 'code_all_doc', 'positive_APIs', 'negative_APIs'])
60 | line_dict = json.loads(line)
61 | comment, positive_apis, negative_apis = line_dict["code_doc"], line_dict["positive_APIs"], line_dict["negative_APIs"]
62 | query = tokenizer.encode(comment, add_special_tokens=False, max_length=256, truncation=True)
63 |
64 | group['query'] = query
65 | group['positives'] = []
66 | group['negatives'] = []
67 | for positive_api in positive_apis:
68 | if api_path_api_signature_and_api_desp.get(positive_api) is None:
69 | continue
70 | positive_api_info_dict = api_path_api_signature_and_api_desp[positive_api]
71 | if positive_api_info_dict['api_signature'] == "":
72 | continue
73 | positive_api_prompt = f"{positive_api_info_dict['api_name']}{positive_api_info_dict['api_signature']}: {get_first_sentence_from_api_desp(normalizer_api_desp(positive_api_info_dict['api_description']))}"
74 | text = tokenizer.encode(positive_api_prompt, add_special_tokens=False, max_length=256, truncation=True)
75 | group['positives'].append(text)
76 | for negative_api in negative_apis:
77 | if api_path_api_signature_and_api_desp.get(negative_api) is None:
78 | continue
79 | negative_api_info_dict = api_path_api_signature_and_api_desp[negative_api]
80 | if negative_api_info_dict['api_signature'] == "":
81 | continue
82 | negative_api_prompt = f"{negative_api_info_dict['api_name']}{negative_api_info_dict['api_signature']}: {get_first_sentence_from_api_desp(normalizer_api_desp(negative_api_info_dict['api_description']))}"
83 | text = tokenizer.encode(negative_api_prompt, add_special_tokens=False, max_length=256, truncation=True)
84 | group['negatives'].append(text)
85 | if len(group['positives']) == 0 or len(group['negatives']) == 0 or len(group['query']) == 0:
86 | print("Skip this group")
87 | continue
88 | f.write(json.dumps(group) + '\n')
89 |
90 | print("Done!")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyCodeGPT
2 | A pre-trained GPT model for Python code completion and generation
3 |
4 | ## What is it?
5 |
6 | PyCodeGPT is an efficient and effective GPT-Neo-based model for the Python code generation task, similar to [OpenAI Codex](https://openai.com/blog/openai-codex/), [GitHub Copilot](https://copilot.github.com/), [CodeParrot](https://huggingface.co/blog/codeparrot), and [AlphaCode](https://deepmind.com/blog/article/Competitive-programming-with-AlphaCode).
7 |
8 | ## Training Data
9 | Due to the small size of publicly released datasets, we collected data from GitHub from scratch. We first crawled 1.2M Python-related repositories hosted on GitHub, then used these repository URLs to download all contents of each repository. This yielded 60M raw Python files under 1MB each, with a total size of 330GB. Finally, we carefully designed various data-cleaning strategies to obtain about 96GB of data for training. Please refer to the following table for details.
10 |
11 | |Model|Repositories|Size and files after filtering|
12 | |:------:|:---:|:---:|
13 | | CodeParrot | 0.56M | 12GB (compressed), 5.4M |
14 | | Codex | 54M | 159GB |
15 | | PyCodeGPT | 1.2M | 96GB, 13M |
16 |
17 |
18 | ## Pretrained models
19 |
20 | We aim to train medium-sized pre-trained models (about 110M parameters) based on GPT-Neo:
21 | - PyCodeGPT-110M: derived from GPT-Neo 125M with a vocabulary size of 32K.
22 |
23 | PyCodeGPT-110M is available on [HuggingFace](https://huggingface.co/Daoguang/PyCodeGPT).
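For example, the checkpoint can be loaded and sampled with `transformers` (a minimal sketch):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Daoguang/PyCodeGPT")
model = AutoModelForCausalLM.from_pretrained("Daoguang/PyCodeGPT")

# Sample a completion for a simple prompt.
inputs = tokenizer("def fibonacci(n):", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64, do_sample=True, top_p=0.95)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```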
24 |
25 | ## Evaluation
26 | 1. Install requirements (python 3.7)
27 | ```bash
28 | $ pip install -r requirements.txt
29 | ```
30 |
31 | 2. Install [HumanEval](https://github.com/openai/human-eval)
32 | - Note that you can only evaluate your model after uncommenting the 58th line of `human-eval/human_eval/execution.py` (see the sketch after the install commands)
33 | ```bash
34 | $ git clone https://github.com/openai/human-eval
35 | $ pip install -e human-eval
36 | ```
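The line in question is the guarded `exec` call; after uncommenting, it should read as follows (a sketch, assuming the upstream file is unchanged):

```python
# human-eval/human_eval/execution.py, around line 58 (inside the sandboxed worker):
exec(check_program, exec_globals)  # actually executes model-generated code
```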
37 |
38 | 3. Run `eval_human_eval.py` to generate programs
39 | - Arguments
40 | - `model_name_or_path` : Path to the model checkpoint to be evaluated.
41 | - `output_dir` : Path to save generated programs
42 | - `num_completions` : The number of programs to be generated
43 | - `temperature` : Temperature for sampling
44 | - `top_p` : p value for nucleus sampling
45 | - `max_new_tokens` : Maximum number of generated tokens
46 | - Example usage
47 |
48 | ```bash
49 | $ python eval_human_eval.py \
50 | --model_name_or_path PyCodeGPT-110M/ \
51 | --output_dir results/ \
52 | --num_completions 100 \
53 | --temperature 0.2 \
54 | --top_p 0.95 \
55 | --max_new_tokens 100 \
56 | --gpu_device 0
57 | ```
58 |
59 | 4. Evaluate functional correctness
60 | ```bash
61 | $ evaluate_functional_correctness
62 | # Example
63 | $ evaluate_functional_correctness results/human_eval.t0.2.p0.95.l100.n100.samples.jsonl
64 | ```
65 |
66 | Here's our evaluation result on HumanEval dataset:
67 |
68 | Note: our model achieves accuracy comparable to Codex models of similar size.
69 |
70 | |Model|Pass@1|Pass@10|Pass@100|
71 | |:------:|:---:|:---:|:---:|
72 | |PyCodeGPT-110M |**8.32%** |**13.53%** |**18.3%** |
73 | |||||
74 | |GPT-Neo 125M |0.75% |1.88% |2.97% |
75 | |GPT-Neo 1.3B |4.97% |7.47% |16.3% |
76 | |GPT-Neo 2.7B |6.41% |11.27% |21.37% |
77 | |GPT-J 6B |11.62% |15.74% |27.74% |
78 | |||||
79 | |TabNine |2.58% |4.35% |7.59% |
80 | |||||
81 | |CodeParrot 110M |3.80% |6.57% |12.78% |
82 | |CodeParrot 1.5B |3.58% |8.03% |14.96% |
83 | |||||
84 | |Codex 12M |2.00% |3.62% |8.58% |
85 | |Codex 25M |3.21% |7.1% |12.89% |
86 | |Codex 42M |5.06% |8.8% |15.55% |
87 | |Codex 85M |8.22% |12.81% |22.4% |
88 | |Codex 300M |13.17% |20.37% |36.27% |
89 | |Codex 679M |16.22% |25.7% |40.95% |
90 | |Codex 2.5B |21.36% |35.42% |59.5% |
91 | |Codex 12B |28.81% |46.81% |72.31% |
92 | |||||
93 | |Pretrained Decoder-only 13M (AlphaCode) |1.5% |3.6% |8.6% |
94 | |Pretrained Decoder-only 29M (AlphaCode) |3.4% |5.8% |11.2% |
95 | |Pretrained Decoder-only 55M (AlphaCode) |4.2% |8.2% |16.9% |
96 | |Pretrained Decoder-only 89M (AlphaCode) |4.3% |12.2% |20.0% |
97 | |Pretrained Decoder-only 302M (AlphaCode) |11.6% |18.8% |31.8% |
98 | |Pretrained Decoder-only 685M (AlphaCode) |14.2% |24.4% |38.8% |
99 | |Pretrained Decoder-only 1.1B (AlphaCode) |17.1% |28.2% |45.3% |
100 | |||||
101 | |PolyCoder 160M |2.13% |3.35% |4.88% |
102 | |PolyCoder 400M |2.96% |5.29% |11.59% |
103 | |PolyCoder 2.7B |5.59% |9.84% |17.68% |
104 |
105 | ## Reference
106 | If you use our models, please cite the following paper:
107 |
108 | ```
109 | @inproceedings{CERT,
110 | title={{CERT}: Continual Pre-training on Sketches for Library-oriented Code Generation},
111 | author={Zan, Daoguang and Chen, Bei and Yang, Dejian and Lin, Zeqi and Kim, Minsu and Guan, Bei and Wang, Yongji and Chen, Weizhu and Lou, Jian-Guang},
112 | booktitle={The 2022 International Joint Conference on Artificial Intelligence},
113 | year={2022}
114 | }
115 | ```
116 |
--------------------------------------------------------------------------------
/cert/nl2code/dynamic_block_dataset.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 | from typing import List
4 | from dataclasses import dataclass
5 | import numpy as np
6 | from datetime import datetime
7 |
8 | import torch
9 | from torch.utils.data.dataset import Dataset
10 |
11 | @dataclass
12 | class BlockSpan:
13 | index: int
14 | start: int
15 | end: int
16 |
17 | @property
18 | def length(self):
19 | return self.end - self.start + 1
20 |
21 | @dataclass
22 | class BlockItem:
23 | spans: List[BlockSpan]
24 |
25 | def __len__(self):
26 | return len(self.spans)
27 |
28 | def pad(self, item):
29 | self.spans += item.spans
30 |
31 | @property
32 | def size(self):
33 | return sum([x.length for x in self.spans])
34 |
35 | class BlockCache:
36 | def __init__(self, block_size: int, cache_size: int):
37 | self.cache_size = cache_size
38 | self.block_size = block_size
39 |
40 | self.len2spans = [[] for _ in range(block_size)]
41 | self.items_count = 0
42 | self.length_counts = np.zeros(block_size, dtype=np.int32)
43 |
44 | def add_one(self, span: BlockSpan):
45 | length = span.length
46 | if length >= self.block_size:
47 | raise ValueError("Can't add one item with length {} >= block size {}.".format(length, self.block_size))
48 |
49 | self.len2spans[length].append(span)
50 | self.items_count += 1
51 | self.length_counts[length] += 1
52 |
53 | def is_full(self):
54 | return self.items_count >= self.cache_size
55 |
56 | def __len__(self):
57 | return self.items_count
58 |
59 | def _pop_one_by_length(self, sel_length: int) -> BlockItem:
60 | assert len(self.len2spans[sel_length]) == self.length_counts[sel_length]
61 |
62 | if len(self.len2spans[sel_length]) == 0:
63 | raise ValueError("Pop from empty length spans: {}".format(sel_length))
64 |
65 | self.length_counts[sel_length] -= 1
66 | sel_span = self.len2spans[sel_length].pop()
67 | pad_length = self.block_size - sel_length
68 |
69 | while pad_length > 0:
70 |             # Find a span that exactly fills the remaining space in the block
71 | if len(self.len2spans[pad_length]) > 0:
72 | pad_span = self.len2spans[pad_length].pop()
73 | self.length_counts[pad_length] -= 1
74 | block_item = BlockItem(spans=[sel_span, pad_span])
75 | self.items_count -= 2
76 | return block_item
77 |
78 | pad_length -= 1
79 |
80 | # can't find one to pad
81 | self.items_count -= 1
82 | return BlockItem(spans=[sel_span])
83 |
84 | def pop_one(self):
85 |         sel_length = np.argmax(self.length_counts)  # greedily pick the most frequent remaining length
86 | return self._pop_one_by_length(sel_length)
87 |
88 | def pop_all(self):
89 | index = self.block_size - 1
90 | while index > 0:
91 | while self.len2spans[index]:
92 | yield self._pop_one_by_length(index)
93 | index -= 1
94 |
95 | class DynamicBlockDataset(Dataset):
96 | def __init__(self, src_dataset: Dataset, src_sizes: List[int], block_size: int, dynamic_factor: int=10) -> None:
97 | super().__init__()
98 | self.src_dataset = src_dataset
99 | self.src_sizes = src_sizes
100 | self.block_size = block_size
101 | self.dynamic_factor = dynamic_factor
102 |
103 | start = datetime.now()
104 | self.block_items: List[BlockItem] = self.build_block_index_mappings()
105 | self._block_sizes = [x.size for x in self.block_items]
106 |         print("DynamicBlockDataset built block indices: {} => {} ({:.4f}), avg examples = {:.3f}, cost = {}.".format(
107 | len(self.src_dataset),
108 | len(self.block_items),
109 | self.get_block_ratio(),
110 | np.mean([len(x) for x in self.block_items]),
111 | datetime.now() - start
112 | ))
113 |
114 | @property
115 | def sizes(self):
116 | return self._block_sizes
117 |
118 | def size(self, index) -> int:
119 | return self.block_items[index].size
120 |
121 | def get_block_ratio(self) -> float:
122 |         # Average fill ratio of the packed blocks relative to the full block size.
123 | return sum(self.sizes) / len(self.block_items) / self.block_size
124 |
125 | def __len__(self):
126 | return len(self.block_items)
127 |
128 | def __getitem__(self, index) -> torch.Tensor:
129 | item = self.block_items[index]
130 | tensors = [self.src_dataset[span.index][span.start:span.end+1] for span in item.spans]
131 | return torch.cat(tensors, dim=0)
132 |
133 | def build_block_index_mappings(self):
134 | cache = BlockCache(self.block_size, self.dynamic_factor * self.block_size)
135 | block_idx_items = []
136 |
137 | for i, size in enumerate(self.src_sizes):
138 | start = 0
139 | while start < size:
140 | end = min(size, start + self.block_size)
141 | span = BlockSpan(index=i, start=start, end=end-1)
142 |
143 | if span.length == self.block_size:
144 | block_idx_items.append(BlockItem([span]))
145 | else:
146 | # Pop one if cache is full
147 | if cache.is_full():
148 | block_idx_items.append(cache.pop_one())
149 | cache.add_one(span)
150 | start = end
151 |
152 | for item in cache.pop_all():
153 | block_idx_items.append(item)
154 |
155 | return block_idx_items
156 |
--------------------------------------------------------------------------------
/apicoder/CodeGenAPI/README.md:
--------------------------------------------------------------------------------
1 | # APICoder - CodeGenAPI
2 |
3 | Official repository for our paper ["When Language Model Meets Private Library"](https://arxiv.org/pdf/2210.17236.pdf).
4 |
5 | ---
6 |
7 | ## Overview
8 |
9 | APIRetriever finds useful APIs for a programming problem, and APICoder then aims to generate code that solves the problem with these APIs. We adopt the most straightforward strategy for APICoder: prepending a set of API information to the context. Each piece of API information takes the form `name(signature): description`. This mimics programmers learning the APIs properly before writing code with them.
10 |
11 |
12 |
13 | Figure1: The training process of CodeGenAPI
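To make the format concrete, the sketch below assembles such a prompt. The API entry is an illustrative placeholder; the `name(signature): description` template matches the form produced by our data-preparation scripts.

```python
# Build an APICoder-style prompt by prepending API information to the context.
# The API entry below is an illustrative placeholder, not a real library API.
apis = [
    ("add_index", "(self, start: int = 0)", "Adds an index to each item of the DataPipe."),
]
api_block = "\n".join(f"{name}{signature}: {desc}" for name, signature, desc in apis)

context = "# Add an index to the source datapipe\nresult = "
prompt = api_block + "\n" + context  # fed to the code generation model
```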
14 |
15 | ## Project Directory
16 | ```shell
17 | ├── CodeGenAPI
18 | │ ├── APICoder
19 | │ │ ├── get_api_info_by_name.py
20 | │ │ ├── get_lib_comment_for_eval.py
21 | │ ├── apex
22 | │ ├── eval_baseline.py
23 | │ ├── eval_private.py
24 | │ ├── nl2code
25 | │ ├── requirements.txt
26 | │ ├── run_generating_codes.sh # The entry script for CodeGenAPI inference, which can generate a lot of code snippets for each programming problem.
27 | │ ├── run_evaluating_codes.sh # The entry script for evaluating the generated code snippets, and outputting the final results (pass@k).
28 | │ ├── run_private.py
29 | │ ├── run_private.sh # Implementation of CodeGenAPI training.
30 | │ └── scripts
31 | │ ├── encode_private_data.py
32 | │ ├── extract_api.py
33 | │ ├── file_utils.py
34 | │ ├── get_comments_from_evallibs.py
35 | │ ├── get_libs_info_from_code.py
36 | │ ├── make_human_in_the_loop_test_corpus.py
37 | │ ├── multiprocessing_utils.py
38 | │ ├── pycode_visitor.py
39 | │ ├── requirements.txt
│   ├── run_details_apis.sh # Extracting all kinds of API information (API name, signature, description, and so on) from the crawled API documentation of 35 libraries.
41 | │ ├── run_encode_private_data.sh # Encoding the private data
42 | │ ├── run_extract_apis.sh # Crawling the API documentation for 31 off-the-shelf public libraries.
43 | │ └── run_extract_details_from_apis.py
44 | ```
45 |
46 | ## Quickstart
47 |
48 | This section covers environment, data preparation, model inference, and model training.
49 |
50 | ### Preparation
51 |
52 | 1. Configuring your runtime environment
53 |
54 | ```
55 | $ cd PrivateLibrary/CodeGenAPI
56 | $ pip install -r requirements.txt
57 | ```
58 | In addition, if you would like to use mixed-precision (FP16) training to speed things up, you need to install the apex library:
59 | ```
60 | git clone https://github.com/NVIDIA/apex
61 | cd apex
62 | pip install -v --no-cache-dir ./
63 | ```
64 |
65 | 2. Preparation of pre-trained models
66 |
67 | Download the pre-trained checkpoint (e.g., `CodeGenAPI-110M`) from [our releases page](https://github.com/microsoft/PyCodeGPT/releases/download/Private-Library/CodeGenAPI-350M-mono.zip) and place it in the corresponding folder (e.g., `CodeGenAPI/models/CodeGenAPI-110M`).
68 |
69 | 3. Updating the scripts according to your local path
70 |
71 | - Update `run_private.sh`.
72 | - Update `run_generating_codes.sh`.
73 | - Update `run_evaluating_codes.sh`.
74 |
75 | ### Use CodeGenAPI or other models
76 |
77 | Firstly, multiple code snippets are generated for each programming problem (`run_generating_codes.sh`). Then, the code snippets are evaluated (`run_evaluating_codes.sh`).
78 |
79 | ```
80 | $ bash run_generating_codes.sh
81 | $ bash run_evaluating_codes.sh
82 | ```
83 |
84 | ### Train CodeGenAPI
85 |
86 | Train CodeGenAPI on the large-scale code corpus with the following command.
87 |
88 | ```
89 | $ bash run_private.sh
90 | ```
91 |
92 | ## Experiments
93 |
94 | In the inference phase, we set the `temperature` to one of `[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]`, the number of samples (`NUM_SAMPLES`) to `200`, the maximum number of generated tokens (`MAX_TOKENS`) to `100`, and `top_p` to `0.9`. The best result across these hyper-parameter settings is reported.
95 |
96 | Here are the main results:
97 |
98 | 
99 |
100 | After running these experiments, we drew the following observations and insights.
101 |
102 | > (1) Prompting API information set is useful on private-library oriented code generation task.
103 |
104 | > (2) Which API prompting strategy is best among Perfect, Top-N, and Human? In general, Perfect, Human, and Top-N produce progressively decreasing benefits. However, Top-N is occasionally superior to Perfect, since noise exists when training the model. We also observe that Top-1,2 usually works better than Top-3,5, because the latter introduces more noisy APIs.
105 |
106 | > (3) Our continually pre-trained model is better at invoking APIs than its base model, and thus can further elevate code generation performance for private libraries in the majority of scenarios.
107 |
108 | > (4) APIRetriever has the capability to retrieve useful APIs.
109 |
110 | > (5) Involving a human in the loop can further boost the performance.
111 |
112 | > (6) As the k in pass@k grows larger, the gain brought by adding API information becomes larger.
113 |
114 | > (7) Generating code that invokes private libraries is so much more challenging than invoking public ones that even large models fail to do so if we do not prompt any APIs.
115 |
116 | For more details, please see our paper.
117 |
118 | ## Citation
119 | If you find our work useful, please cite the paper:
120 | ```
121 | @inproceedings{APICoder,
122 |   title={When Language Model Meets Private Library},
123 | author={Zan, Daoguang and Chen, Bei and Lin, Zeqi and Guan, Bei and Wang, Yongji and Lou, Jian-Guang},
124 | booktitle={EMNLP findings},
125 | year={2022}
126 | }
127 | ```
128 |
--------------------------------------------------------------------------------
/apicoder/private-eval/README.md:
--------------------------------------------------------------------------------
1 | # TorchDataEval, MonkeyEval and BeatNumEval
2 |
3 | Three benchmarks for evaluating the performance of private-library-oriented code generation. They are proposed in the paper "[When Language Model Meets Private Library](https://arxiv.org/pdf/2210.17236.pdf)".
4 |
5 | The evaluation script is adapted from OpenAI's [HumanEval](https://github.com/openai/human-eval/tree/master/human_eval).
6 |
7 | ## Installation
8 |
9 | Make sure to use python 3.7 or later:
10 | ```
11 | $ conda create -n private python=3.7
12 | $ conda activate private
13 | ```
14 |
15 | Check out and install this repository:
16 | ```
17 | $ pip install -e private-eval
18 | ```
19 |
20 | ## Configuration
21 | ```
22 | ├── data # The directory of our crafted benchmarks.
23 | ├── private_eval
24 | │ ├── data.py # [key] Choosing whether to load TorchDataEval, MonkeyEval or BeatNumEval.
25 | │ ├── evaluate_functional_correctness.py # Calculating the evaluation results.
26 | │ ├── evaluation.py # Calculating the evaluation results.
│   └── execution.py # [key] Executing the predicted code. To evaluate MonkeyEval or BeatNumEval, set the `is_convert_back` variable in line 194 to `True` and `domain` to `pandas` or `numpy` (see the sketch below this tree).
28 | ```
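For example, to score MonkeyEval predictions, the settings in `execution.py` would look roughly like this (a sketch; check the actual file around line 194 for the exact layout):

```python
# private_eval/execution.py (sketch):
is_convert_back = True   # convert private-library keywords back before executing
domain = "pandas"        # "pandas" for MonkeyEval, "numpy" for BeatNumEval
```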
29 |
30 | ## Running Environment Testing
31 |
32 | You need to replace `XXX` with your local path to test the torchdata results. (Make sure that all settings in `private-eval/private_eval/data.py` are correct.)
33 | ```
34 | $ evaluate_functional_correctness XXX/PrivateLibrary/private-eval/data/TorchData_no.API_number_0.CodeGen.hm_False.machine.t0.1.p0.9.l100.n1.samples.jsonl
35 | ```
36 |
37 | If you can successfully run the above command and obtain the following results, the evaluation environment is ready to use.
38 | ```
39 | {'pass@1': 0.06}
40 | ```
41 |
42 | # The Process of Constructing TorchDataEval, MonkeyEval and BeatNumEval
43 |
44 | We craft three benchmarks, called TorchDataEval, MonkeyEval, and BeatNumEval. Each programming problem consists of context, target code, and the corresponding test cases.
45 |
46 | To create a realistic benchmark for evaluating code generation for private libraries, we make use of TorchData, a Python library released only recently. We carefully studied the official API documentation of TorchData and made sure we were proficient in all its APIs. Then, we manually created 50 programming problems based on the API usage examples in the documentation. Two volunteers with extensive experience in Python were invited to check the correctness of each problem. We control the difficulty of the programming problems by the number of APIs in the target code; the percentage of programming problems containing 1 API, 2 APIs, and more APIs is set to 6:3:1.
47 |
48 | > Our base model, CodeGen, is pre-trained on GitHub data from before 2021-10. TorchData was released after this time point, and no code files using it are available on GitHub so far; hence, we can consider it a private library.
49 |
50 | We also construct two pseudo private libraries, named MonkeyEval and BeatNumEval. They are modified from PandasEval and NumpyEval, which each contain 101 programming problems and were proposed for the public libraries Pandas and NumPy. In detail, we manually modified all library-related keywords in PandasEval and NumpyEval. For example, as in the figure below, `pandas` is converted to `monkey`, `dataframe` is converted to `knowledgeframe`, and the API name `isin` is converted to `iscontain`. To craft the API documentation for Monkey and BeatNum, we manually paraphrased the descriptions of all the new APIs to ensure that they have never been seen by the pre-trained language models.
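A minimal sketch of this keyword substitution is shown below; the mapping entries come from the example above, while the helper itself is illustrative, since the real conversion was curated manually.

```python
# Illustrative keyword mapping from a public library to its pseudo-private twin.
PANDAS_TO_MONKEY = {
    "pandas": "monkey",
    "dataframe": "knowledgeframe",
    "isin": "iscontain",
}

def to_monkey(code: str) -> str:
    # Naive substitution; the real benchmark conversion was done by hand.
    for public_kw, private_kw in PANDAS_TO_MONKEY.items():
        code = code.replace(public_kw, private_kw)
    return code

print(to_monkey("import pandas as pd"))  # -> import monkey as pd
```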
51 |
52 |
53 |
54 | # An Example of Converting PandasEval (public) to MonkeyEval (private)
55 |
56 | Context is shown with a white background and the target code with a gray background. The changed parts are highlighted in yellow.
57 |
58 |
59 |
60 | ## Reference
61 |
62 | If you use TorchDataEval, MonkeyEval or BeatNumEval in your work, please cite the paper:
63 | ```
64 | @inproceedings{APICoder,
65 |   title={When Language Model Meets Private Library},
66 | author={Zan, Daoguang and Chen, Bei and Lin, Zeqi and Guan, Bei and Wang, Yongji and Lou, Jian-Guang},
67 | booktitle={EMNLP findings},
68 | year={2022}
69 | }
70 | ```
71 |
72 | If you use PandasEval or NumpyEval in your work, please cite the paper:
73 | ```
74 | @inproceedings{CERT,
75 | title={{CERT}: Continual Pre-training on Sketches for Library-oriented Code Generation},
76 | author={Zan, Daoguang and Chen, Bei and Yang, Dejian and Lin, Zeqi and Kim, Minsu and Guan, Bei and Wang, Yongji and Chen, Weizhu and Lou, Jian-Guang},
77 | booktitle={The 2022 International Joint Conference on Artificial Intelligence},
78 | year={2022}
79 | }
80 | ```
81 |
82 | Also, if you use the evaluation script, please cite the following paper:
83 | ```
84 | @article{codex,
85 | title={Evaluating Large Language Models Trained on Code},
86 | author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
87 | year={2021},
88 | eprint={2107.03374},
89 | archivePrefix={arXiv},
90 | primaryClass={cs.LG}
91 | }
92 | ```
93 |
--------------------------------------------------------------------------------