├── .env ├── .gitignore ├── .python-version ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── codenets ├── __init__.py ├── codesearchnet │ ├── __init__.py │ ├── ast_build.py │ ├── code_ast │ │ ├── __init__.py │ │ └── ast_utils.py │ ├── copied_code │ │ ├── __init__.py │ │ ├── bpevocabulary.py │ │ ├── metadata.py │ │ └── utils.py │ ├── data.py │ ├── dataset_main.py │ ├── dataset_utils.py │ ├── eval.py │ ├── huggingface │ │ ├── __init__.py │ │ ├── models.py │ │ └── tokenizer_recs.py │ ├── notebooks │ │ ├── codesearchnet_distrib.ipynb │ │ └── predictions.ipynb │ ├── poolers.py │ ├── predictions.py │ ├── query_1_code_1 │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── model.py │ │ └── training_ctx.py │ ├── query_1_code_n │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── model.py │ │ └── training_ctx.py │ ├── query_code_siamese │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── model.py │ │ └── training_ctx.py │ ├── sbert_build.py │ ├── tokenizer_build.py │ ├── tokenizer_recs.py │ ├── train.py │ └── training_ctx.py ├── losses.py ├── main.py ├── recordable.py ├── save.py ├── tensorboard_utils.py └── utils.py ├── conf ├── code_search_bert_2020_02_01_1500.conf ├── code_search_bert_2020_02_03_20_00.conf ├── code_search_bert_lg_2020_02_04_15_00.conf ├── code_search_bert_lg_2020_02_04_21_00.conf ├── code_search_bert_lg_2020_02_05_00_00.conf ├── code_search_bert_lg_2020_02_06_18_00.conf ├── code_search_bert_lg_2020_02_06_22_30.conf ├── code_search_bert_lg_2020_02_07_10_00.conf ├── code_search_bert_query_1_code_1_2020_02_10_11_00 copy.conf ├── code_search_bert_query_1_code_1_2020_02_10_11_00.conf ├── code_search_bert_query_1_code_1_2020_02_11_22_00 copy.conf ├── code_search_bert_query_1_code_1_2020_02_11_22_00.conf ├── code_search_bert_query_code_siamese_2020_02_12_00_00 copy.conf ├── code_search_bert_query_code_siamese_2020_02_12_00_00.conf ├── code_search_bert_query_code_siamese_2020_02_14_16_00 copy.conf ├── code_search_bert_query_code_siamese_2020_02_14_16_00.conf ├── code_search_bert_query_code_siamese_2020_02_15_14_00.conf ├── default.conf ├── qc_ast_2020_03_13.conf ├── qc_ast_2020_03_15 copy.conf ├── qc_ast_2020_03_15.conf ├── qc_ast_2020_03_17.conf ├── qc_ast_2020_03_18 copy.conf ├── qc_ast_2020_03_18.conf ├── qc_ast_2020_03_19.conf ├── qc_ce_2020_02_23_01_00 copy.conf ├── qc_ce_2020_02_23_01_00.conf ├── qc_ce_long_seq_2020_02_24.conf ├── qc_ce_sbert_2020_02_27 copy.conf ├── qc_ce_sbert_2020_02_27.conf ├── qc_ce_sbert_2020_02_28 copy.conf ├── qc_ce_sbert_2020_02_28.conf ├── qc_ce_sbert_2020_02_29 copy.conf ├── qc_ce_sbert_2020_02_29.conf ├── qc_ce_sbert_2020_03_01 copy.conf ├── qc_ce_sbert_2020_03_01.conf ├── qc_ce_subtoken_2020_02_25 copy.conf ├── qc_ce_subtoken_2020_02_25.conf ├── qc_ce_subtoken_larger_2020_02_25.conf ├── qc_ce_subtoken_larger_2020_02_26 copy.conf ├── qc_ce_subtoken_larger_2020_02_26.conf ├── qc_lambda_2020_02_20_12_30 copy.conf ├── qc_lambda_2020_02_20_12_30.conf ├── qc_sbert_lambda_2020_03_02.conf ├── qc_sbert_lambda_2020_03_04 copy.conf ├── qc_sbert_lambda_2020_03_04.conf ├── qc_sbert_lambda_2020_03_05.conf ├── qc_sbert_lambda_2020_03_07 copy.conf ├── qc_sbert_lambda_2020_03_07.conf ├── query_code_siamese_2020_02_15_14_00 copy.conf ├── query_code_siamese_2020_02_15_14_00.conf ├── query_code_siamese_2020_02_17_21_30 copy.conf ├── query_code_siamese_2020_02_17_21_30.conf ├── query_code_siamese_2020_02_18_13_00.conf ├── query_code_siamese_2020_02_19_13_00 copy.conf ├── query_code_siamese_2020_02_19_13_00.conf ├── query_code_siamese_albert_2020_02_18_08_30 copy.conf ├── 
query_code_siamese_albert_2020_02_18_08_30.conf └── query_code_siamese_albert_2020_02_18_14_00.conf ├── guide.md ├── main.py ├── model_predictions.csv ├── mypy.ini ├── pylama.ini ├── pyproject.toml ├── requirements.txt ├── test ├── __init__.py ├── conf │ ├── default.conf │ └── test.conf └── test_recordable.py └── wandb └── settings /.env: -------------------------------------------------------------------------------- 1 | PYTHONPATH=./codenets 2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ts 2 | **/node_modules/ 3 | /webroot/scripts/*.js 4 | 5 | # vim 6 | **/*.swp 7 | 8 | # python 9 | **/*.pyc 10 | **/__pycache__/ 11 | 12 | # jupyter 13 | **/.ipynb_checkpoints/ 14 | 15 | # data 16 | resources/ 17 | !resources/README.md 18 | !tests/data/ 19 | # *.csv 20 | !model_predictions.csv 21 | 22 | # environment 23 | *.ftpconfig 24 | 25 | .idea 26 | /src/wandb/run-* 27 | /src/wandb/debug.log 28 | *.html 29 | 30 | .mypy_cache 31 | *.lock 32 | 33 | wandb 34 | checkpoints 35 | pickles 36 | runs 37 | vendor 38 | build 39 | build_tokenizers 40 | codenets.egg-info/ 41 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.10.10 -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | // "python.pythonPath": "/Users/Pascal/Library/Caches/pypoetry/virtualenvs/codenets-O5WbUhkp-py3.7/bin/python", 3 | "python.linting.lintOnSave": true, 4 | "python.linting.pylintEnabled": false, 5 | "python.linting.pylamaEnabled": true, 6 | "python.linting.mypyEnabled": true, 7 | "python.formatting.provider": "black", 8 | "python.formatting.blackArgs": ["--line-length", "120"], 9 | "[python]": { 10 | "editor.formatOnSave": true, 11 | "editor.formatOnSaveTimeout": 2000, 12 | "editor.rulers": [120] 13 | }, 14 | "autoDocstring.docstringFormat": "google", 15 | "git.ignoreLimitWarning": true, 16 | "python.testing.pytestArgs": ["test"], 17 | "python.testing.unittestEnabled": false, 18 | "python.testing.pytestEnabled": true 19 | // "mypy.executable": "/Users/Pascal/Library/Caches/pypoetry/virtualenvs/codenets-O5WbUhkp-py3.7/bin/mypyls", 20 | // "mypy.targets": [ 21 | // "./src" 22 | // ], 23 | } 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 Pascal Voitot 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
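The conf/ directory listed in the tree above holds the HOCON experiment configurations consumed by the scripts under codenets/codesearchnet/. As a rough sketch (not a file of this repository, and assuming the package is importable from the repository root), loading one of them follows the pattern used by ast_build.py and dataset_main.py later in this dump; conf/default.conf is taken from the tree above.

from pyhocon import ConfigFactory
from codenets.codesearchnet.training_ctx import CodeSearchTrainingContext

# Parse a HOCON experiment file (conf/default.conf appears in the tree above)
conf = ConfigFactory.parse_file("conf/default.conf")
# Build the training context (tokenizer, datasets, model wiring) from the parsed config,
# mirroring the calls shown in ast_build.py and dataset_main.py below
training_ctx = CodeSearchTrainingContext.build_context_from_hocon(conf)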
-------------------------------------------------------------------------------- /codenets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/__init__.py -------------------------------------------------------------------------------- /codenets/codesearchnet/__init__.py: -------------------------------------------------------------------------------- 1 | from codenets.codesearchnet.query_1_code_1 import model, training_ctx 2 | 3 | import codenets.codesearchnet.query_1_code_1.training_ctx as single_branch_ctx 4 | 5 | # single_branch_model = model 6 | import codenets.codesearchnet.query_1_code_1.model as single_branch_model 7 | -------------------------------------------------------------------------------- /codenets/codesearchnet/ast_build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Usage: 4 | ast_build.py [options] SAVE_FOLDER TRAIN_DATA_PATH VALID_DATA_PATH TEST_DATA_PATH 5 | ast_build.py [options] [SAVE_FOLDER] 6 | 7 | *_DATA_PATH arguments may either accept (1) directory filled with .jsonl.gz files that we use as data, 8 | or a (2) plain text file containing a list of such directories (used for multi-language training). 9 | 10 | In the case that you supply a (2) plain text file, all directory names must be separated by a newline. 11 | For example, if you want to read from multiple directories you might have a plain text file called 12 | data_dirs_train.txt with the below contents: 13 | 14 | > cat ~/src/data_dirs_train.txt 15 | azure://semanticcodesearch/pythondata/Processed_Data/jsonl/train 16 | azure://semanticcodesearch/csharpdata/split/csharpCrawl-train 17 | 18 | Options: 19 | -h --help Show this screen. 20 | --config FILE Specify HOCON config file. 21 | --debug Enable debug routines. 
[default: False] 22 | """ 23 | 24 | from dpu_utils.utils import run_and_debug 25 | from docopt import docopt 26 | from loguru import logger 27 | import os 28 | from pyhocon import ConfigFactory 29 | 30 | 31 | from codenets.codesearchnet.training_ctx import CodeSearchTrainingContext 32 | from codenets.codesearchnet.code_ast.ast_utils import build_language_ast 33 | 34 | """Evaluating SBert.""" 35 | 36 | 37 | def run(args, tag_in_vcs=False) -> None: 38 | os.environ["WANDB_MODE"] = "dryrun" 39 | 40 | logger.debug("Building Training Context") 41 | conf_file = args["--config"] 42 | conf = ConfigFactory.parse_file(conf_file) 43 | 44 | logger.info(f"Restoring Training Context from config {conf_file}") 45 | training_ctx = CodeSearchTrainingContext.build_context_from_hocon(conf) 46 | 47 | # dirs = [Path("/home/mandubian/workspaces/tools/CodeSearchNet/resources/data/ruby/final/jsonl/valid/")] 48 | # build_language_ast("val", training_ctx.val_dirs, training_ctx.pickle_path, training_ctx.val_data_params) 49 | # build_language_ast("train", training_ctx.train_dirs, training_ctx.pickle_path, training_ctx.train_data_params) 50 | build_language_ast("test", training_ctx.test_dirs, training_ctx.pickle_path, training_ctx.test_data_params) 51 | 52 | # Language.build_library( 53 | # # Store the library in the `build` directory 54 | # "build/my-languages.so", 55 | # # Include one or more languages 56 | # [ 57 | # "vendor/tree-sitter-go", 58 | # "vendor/tree-sitter-java", 59 | # "vendor/tree-sitter-javascript", 60 | # "vendor/tree-sitter-python", 61 | # "vendor/tree-sitter-php", 62 | # "vendor/tree-sitter-ruby", 63 | # ], 64 | # ) 65 | 66 | # parser = Parser() 67 | 68 | # code_php = """ 69 | # hasAuthentication($repositoryName)) { 73 | # $auth = $this->getAuthentication($repositoryName); 74 | # if ($auth['username'] === $username && $auth['password'] === $password) { 75 | # return; 76 | # } 77 | 78 | # $this->writeError( 79 | # sprintf( 80 | # "Warning: You should avoid overwriting already defined auth settings for %s.", 81 | # $repositoryName 82 | # ) 83 | # ); 84 | # } 85 | # $this->setAuthentication($repositoryName, $username, $password); 86 | # } 87 | # ?> 88 | # """ 89 | # PHP_LANGUAGE = Language("build/my-languages.so", "php") 90 | # parser.set_language(PHP_LANGUAGE) 91 | # tree = parser.parse(bytes(code_php, "utf8")) 92 | # cursor = tree.walk() 93 | # print(cursor.node.sexp()) 94 | 95 | # skip_node_types = ["ERROR", ""] 96 | # all_tokens_php, special_tokens_php = breadth_first_path("php", code_php, cursor, skip_node_types=skip_node_types) 97 | # print("all_tokens_php", all_tokens_php) 98 | # print("special_tokens_php", special_tokens_php) 99 | 100 | # JAVA_LANGUAGE = Language("build/my-languages.so", "java") 101 | # # parser = Parser() 102 | # parser.set_language(JAVA_LANGUAGE) 103 | # code_java = """ 104 | # class A { 105 | # public int b() { 106 | # int c = 5; 107 | # } 108 | # } 109 | # """ 110 | # tree = parser.parse(bytes(code_java, "utf8")) 111 | # cursor = tree.walk() 112 | # print("code_java", code_java) 113 | # print(cursor.node.sexp()) 114 | # all_tokens_java, special_tokens_java = breadth_first_path(code_java, cursor) 115 | # print("all_tokens_java", all_tokens_java) 116 | # print("special_tokens_java", special_tokens_java) 117 | 118 | # print("===================================================") 119 | 120 | # PY_LANGUAGE = Language("build/my-languages.so", "python") 121 | # parser.set_language(PY_LANGUAGE) 122 | # code_python = """ 123 | # def foo(): 124 | # if bar: 125 | # a: List[str] = 
["toto", "tata"] 126 | # baz(a, b, 5) 127 | # """ 128 | # tree = parser.parse(bytes(code_python, "utf8")) 129 | # cursor = tree.walk() 130 | # print("code_python", code_python) 131 | # print(cursor.node.sexp()) 132 | # all_tokens_python, special_tokens_python = breadth_first_path(code_python, cursor) 133 | # print("all_tokeall_tokens_pythonns", all_tokens_python) 134 | # print("special_tokens_python", special_tokens_python) 135 | 136 | # special_tokens = special_tokens_python.union(special_tokens_java) 137 | # print("special_tokens", special_tokens) 138 | # training_ctx.tokenizer.vocab.add_special_tokens(list(special_tokens)) 139 | 140 | # print("JAVA", training_ctx.tokenize_code_sentences([" ".join(all_tokens_java)], max_length=256)) 141 | # print("PYTHON", training_ctx.tokenize_code_sentences([" ".join(all_tokens_python)], max_length=256)) 142 | 143 | 144 | if __name__ == "__main__": 145 | args = docopt(__doc__) 146 | run_and_debug(lambda: run(args), args["--debug"]) 147 | -------------------------------------------------------------------------------- /codenets/codesearchnet/code_ast/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/codesearchnet/code_ast/__init__.py -------------------------------------------------------------------------------- /codenets/codesearchnet/code_ast/ast_utils.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | import time 3 | from typing import Dict, List, Tuple, IO, Set, Optional 4 | from pathlib import Path 5 | from tree_sitter import Language, Parser, Node 6 | import os 7 | import json 8 | from pyhocon import ConfigTree 9 | 10 | from codenets.codesearchnet.data import DatasetParams 11 | from codenets.utils import get_data_files_from_directory 12 | from codenets.codesearchnet.copied_code.utils import read_file_samples 13 | 14 | 15 | class TreeSitterParser: 16 | def __init__( 17 | self, 18 | langs: List[str], 19 | added_nodes: Dict[str, Dict[str, str]], 20 | skip_node_types: Dict[str, List[str]], 21 | vendors_path: Path = Path("./vendor"), 22 | ): 23 | super(TreeSitterParser, self).__init__() 24 | 25 | vendors = [] 26 | self.added_nodes = added_nodes 27 | self.skip_node_types = skip_node_types 28 | for lang in langs: 29 | vendors.append(vendors_path / f"tree-sitter-{lang}") 30 | if lang not in added_nodes: 31 | self.added_nodes[lang] = ConfigTree([("prefix", ""), ("suffix", "")]) 32 | if lang not in skip_node_types: 33 | self.skip_node_types[lang] = [] 34 | 35 | Language.build_library( 36 | # Store the library in the `build` directory 37 | "build/my-languages.so", 38 | # Include one or more languages 39 | vendors, 40 | ) 41 | 42 | self.parser = Parser() 43 | 44 | def repr_field_node( 45 | self, code, node, field: Optional[str] = None, skip_node_types: List[str] = [] 46 | ) -> Tuple[List[str], Set[str], bool]: 47 | skip_sub_nodes = False 48 | special_tokens: Set[str] = set() 49 | rpr: List[str] 50 | if field: 51 | rpr = ["", field] 52 | special_tokens.add("") 53 | else: 54 | rpr = [] 55 | 56 | if node.is_named: 57 | # no child, serialize it here 58 | if len(node.children) == 0: 59 | if node.type in skip_node_types: 60 | rpr.extend([f"{node.type}", ""]) 61 | special_tokens.add("") 62 | else: 63 | rpr.extend([f"<{node.type}>", code[node.start_byte : node.end_byte], ""]) 64 | special_tokens.update([f"<{node.type}>", ""]) 65 | 66 | else: 67 | if 
node.type not in skip_node_types: 68 | rpr.extend([f"<{node.type}>", ""]) 69 | special_tokens.update([f"<{node.type}>", ""]) 70 | else: 71 | skip_sub_nodes = True 72 | else: 73 | if node.type not in skip_node_types: 74 | rpr.extend([f"{node.type}", ""]) 75 | special_tokens.add("") 76 | else: 77 | skip_sub_nodes = True 78 | 79 | return rpr, special_tokens, skip_sub_nodes 80 | 81 | def repr_level(self, code, cursor, skip_node_types: List[str] = []): 82 | nodes: List[Node] = [] 83 | all_tokens: List[str] = [] 84 | special_tokens: Set[str] = set() 85 | 86 | if cursor.goto_first_child(): 87 | field = cursor.current_field_name() 88 | toks, specs, skip = self.repr_field_node(code, cursor.node, field, skip_node_types=skip_node_types) 89 | all_tokens.extend(toks) 90 | special_tokens.update(specs) 91 | if not skip: 92 | nodes.append(cursor.node) 93 | 94 | while cursor.goto_next_sibling(): 95 | field = cursor.current_field_name() 96 | toks, specs, skip = self.repr_field_node(code, cursor.node, field, skip_node_types=skip_node_types) 97 | all_tokens.extend(toks) 98 | special_tokens.update(specs) 99 | if not skip: 100 | nodes.append(cursor.node) 101 | 102 | all_tokens.append("") 103 | special_tokens.add("") 104 | return all_tokens, special_tokens, nodes 105 | 106 | def breadth_first_path(self, lang, code, cursor, skip_node_types: List[str] = []) -> Tuple[List[str], Set[str]]: 107 | all_tokens = [f"<{lang}>"] 108 | special_tokens = set([f"<{lang}>"]) 109 | all_tokens_1, special_tokens_1, skip = self.repr_field_node(code, cursor.node, skip_node_types=skip_node_types) 110 | all_tokens.extend(all_tokens_1) 111 | special_tokens.update(special_tokens_1) 112 | 113 | if not skip: 114 | all_tokens_lvl, special_tokens_lvl, nodes = self.repr_level(code, cursor, skip_node_types=skip_node_types) 115 | all_tokens.extend(all_tokens_lvl) 116 | special_tokens.update(special_tokens_lvl) 117 | 118 | while len(nodes) > 0: 119 | node = nodes.pop(0) 120 | cursor = node.walk() 121 | all_tokens_2, special_tokens_2, nodes_2 = self.repr_level(code, cursor, skip_node_types=skip_node_types) 122 | all_tokens.extend(all_tokens_2) 123 | special_tokens.update(special_tokens_2) 124 | nodes.extend(nodes_2) 125 | all_tokens.append("") 126 | special_tokens.add("") 127 | return all_tokens, special_tokens 128 | 129 | def breadth_first_path_light( 130 | self, lang, code, cursor, skip_node_types: List[str] = [], max_tokens: Optional[int] = None 131 | ) -> List[str]: 132 | all_tokens = [f"<{lang}>"] 133 | all_tokens_1, special_tokens_1, skip = self.repr_field_node(code, cursor.node, skip_node_types=skip_node_types) 134 | all_tokens.extend(all_tokens_1) 135 | 136 | if not skip: 137 | all_tokens_lvl, special_tokens_lvl, nodes = self.repr_level(code, cursor, skip_node_types=skip_node_types) 138 | all_tokens.extend(all_tokens_lvl) 139 | 140 | while len(nodes) > 0: 141 | if max_tokens is not None and len(all_tokens) >= max_tokens: 142 | break 143 | node = nodes.pop(0) 144 | cursor = node.walk() 145 | all_tokens_2, special_tokens_2, nodes_2 = self.repr_level(code, cursor, skip_node_types=skip_node_types) 146 | all_tokens.extend(all_tokens_2) 147 | nodes.extend(nodes_2) 148 | if max_tokens is not None: 149 | all_tokens = all_tokens[: (max_tokens - 1)] 150 | all_tokens.append("") 151 | return all_tokens 152 | 153 | def parse_full(self, lang: str, code: str) -> Tuple[List[str], Set[str]]: 154 | LANGUAGE = Language("build/my-languages.so", lang) 155 | self.parser.set_language(LANGUAGE) 156 | 157 | code = f"{self.added_nodes[lang]['prefix']} {code} 
{self.added_nodes[lang]['suffix']}" 158 | 159 | tree = self.parser.parse(bytes(code, "utf8")) 160 | cursor = tree.walk() 161 | 162 | tokens, special_tokens = self.breadth_first_path(lang, code, cursor, skip_node_types=self.skip_node_types[lang]) 163 | return tokens, special_tokens 164 | 165 | def parse(self, lang: str, code: str, max_tokens: Optional[int] = None) -> List[str]: 166 | LANGUAGE = Language("build/my-languages.so", lang) 167 | self.parser.set_language(LANGUAGE) 168 | 169 | code = f"{self.added_nodes[lang]['prefix']} {code} {self.added_nodes[lang]['suffix']}" 170 | 171 | tree = self.parser.parse(bytes(code, "utf8")) 172 | cursor = tree.walk() 173 | 174 | tokens = self.breadth_first_path_light( 175 | lang, code, cursor, skip_node_types=self.skip_node_types[lang], max_tokens=max_tokens 176 | ) 177 | return tokens 178 | 179 | 180 | def load_special_tokens(data_params: DatasetParams): 181 | special_tokens: List[str] = [] 182 | for f in data_params.ast_special_tokens_files: 183 | fp = open(f, "r") 184 | special_tokens.extend(json.load(fp)) 185 | 186 | return special_tokens 187 | 188 | 189 | def build_language_ast(name: str, dirs: List[Path], pickle_path: Path, data_params: DatasetParams): 190 | start = time.time() 191 | 192 | if data_params.use_ast == "tree-sitter": 193 | parser = TreeSitterParser( 194 | langs=["go", "java", "javascript", "python", "php", "ruby"], 195 | added_nodes=data_params.ast_added_nodes, 196 | skip_node_types=data_params.ast_skip_node_types, 197 | ) 198 | 199 | all_special_tokens: Set[str] = set() 200 | 201 | lengths: Dict[str, List[int]] = {"go": [], "java": [], "javascript": [], "python": [], "php": [], "ruby": []} 202 | 203 | for (idx, file_path) in enumerate(get_data_files_from_directory(dirs)): 204 | logger.info(f"Reading {file_path}") 205 | raw_samples = list(read_file_samples(file_path)) 206 | for raw_sample in raw_samples: 207 | lang = raw_sample["language"] 208 | tokens, special_tokens = parser.parse_full(lang, raw_sample["code"]) 209 | 210 | all_special_tokens.update(special_tokens) 211 | 212 | lengths[lang].append(len(tokens)) 213 | 214 | end = time.time() 215 | logger.debug(f"all_special_tokens ({len(all_special_tokens)}) {all_special_tokens}") 216 | 217 | if not os.path.exists(pickle_path): 218 | os.makedirs(pickle_path) 219 | 220 | json_file = Path(pickle_path) / f"{name}_special_tokens.json" 221 | with open(json_file, "w") as f: 222 | json.dump(list(all_special_tokens), f) 223 | 224 | import statistics 225 | 226 | for lang, lgs in lengths.items(): 227 | if len(lgs) > 0: 228 | max_lg = max(lgs) 229 | min_lg = min(lgs) 230 | mean_lg = statistics.mean(lgs) 231 | std_lg = statistics.stdev(lgs) 232 | logger.debug(f"{lang} [ min:{min_lg}, max:{max_lg}, mean:{mean_lg}, stddev:{std_lg} ]") 233 | 234 | time_p = end - start 235 | logger.info(f"Building AST took: {time_p} sec") 236 | -------------------------------------------------------------------------------- /codenets/codesearchnet/copied_code/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/codesearchnet/copied_code/__init__.py -------------------------------------------------------------------------------- /codenets/codesearchnet/copied_code/bpevocabulary.py: -------------------------------------------------------------------------------- 1 | # Code copied from https://github.com/github/CodeSearchNet for backward-compatible experimentations 2 | 3 | # Code 
adapted from https://github.com/soaxelbrooke/python-bpe/blob/master/bpe/encoder.py 4 | # MIT License (see repository) 5 | 6 | 7 | """ 8 | An encoder which learns byte pair encodings for white-space separated text. 9 | Can tokenize, encode, and decode. 10 | """ 11 | import typing 12 | from typing import Optional, Sized 13 | from collections import Counter 14 | # from dpu_utils.mlutils import Vocabulary 15 | 16 | try: 17 | from typing import Dict, Iterable, List, Iterator 18 | except ImportError: 19 | pass 20 | 21 | 22 | DEFAULT_EOW = "__eow" 23 | DEFAULT_SOW = "__sow" 24 | DEFAULT_UNK = "__unk" 25 | DEFAULT_PAD = "__pad" 26 | 27 | # pylint: disable= inherit-non-class 28 | 29 | 30 | class BpeVocabulary(Sized): 31 | """Encode white-space separated text using byte-pair encoding. See https://arxiv.org/abs/1508.07909 for details.""" 32 | 33 | def __init__( 34 | self, 35 | vocab_size: int = 8192, 36 | pct_bpe: float = 0.2, 37 | ngram_min: int = 2, 38 | ngram_max: int = 8, 39 | required_tokens: Optional[Iterable[str]] = None, 40 | strict=True, 41 | EOW=DEFAULT_EOW, 42 | SOW=DEFAULT_SOW, 43 | UNK=DEFAULT_UNK, 44 | PAD=DEFAULT_PAD, 45 | ): 46 | if vocab_size < 1: 47 | raise ValueError("vocab size must be greater than 0.") 48 | 49 | self.EOW = EOW 50 | self.SOW = SOW 51 | self.eow_len = len(EOW) 52 | self.sow_len = len(SOW) 53 | self.UNK = UNK 54 | self.PAD = PAD 55 | self.required_tokens = list(set(required_tokens or []).union({self.UNK, self.PAD})) 56 | self.vocab_size = vocab_size 57 | self.pct_bpe = pct_bpe 58 | self.word_vocab_size = max([int(vocab_size * (1 - pct_bpe)), len(self.required_tokens or [])]) 59 | self.bpe_vocab_size = vocab_size - self.word_vocab_size 60 | self.word_vocab = {} # type: Dict[str, int] 61 | self.bpe_vocab = {} # type: Dict[str, int] 62 | self.inverse_word_vocab = {} # type: Dict[int, str] 63 | self.inverse_bpe_vocab = {} # type: Dict[int, str] 64 | self.ngram_min = ngram_min 65 | self.ngram_max = ngram_max 66 | self.strict = strict 67 | 68 | def __len__(self): 69 | """Return vocab len""" 70 | return self.vocab_size 71 | 72 | def byte_pair_counts(self, words: Iterable[str]) -> Iterable[typing.Counter]: 73 | """ 74 | Count space separated token character pairs: 75 | [('T h i s ', 4}] -> {'Th': 4, 'hi': 4, 'is': 4} 76 | """ 77 | for token, count in self.count_tokens(words).items(): 78 | bp_counts = Counter() # type: Counter 79 | sub_tokens = token.split(" ") 80 | joined_tokens = "".join(sub_tokens) 81 | token_offsets = [0] 82 | length = 0 83 | for ngram in sub_tokens: 84 | bp_counts[ngram] += count 85 | length += len(ngram) 86 | token_offsets += [length] 87 | for ngram_size in range(self.ngram_min, min(self.ngram_max, len(sub_tokens)) + 1): 88 | for i in range(len(sub_tokens) - ngram_size + 1): 89 | bp_counts[joined_tokens[token_offsets[i] : token_offsets[i + ngram_size]]] += count 90 | 91 | yield bp_counts 92 | 93 | def count_tokens(self, words: Iterable[str]) -> Dict[str, int]: 94 | """Count tokens into a BPE vocab""" 95 | token_counts = Counter(words) 96 | return {" ".join(token): count for token, count in token_counts.items()} 97 | 98 | def learn_word_vocab(self, word_counts: typing.Counter[str]) -> Dict[str, int]: 99 | """Build vocab from self.word_vocab_size most common tokens in provided sentences""" 100 | for token in set(self.required_tokens or []): 101 | word_counts[token] = int(2 ** 31) 102 | word_counts[self.PAD] = int(2 ** 32) # Make sure that PAD gets id=0 103 | sorted_word_counts = sorted(word_counts.items(), key=lambda p: -p[1]) 104 | return {word: idx for 
idx, (word, count) in enumerate(sorted_word_counts[: self.word_vocab_size])} 105 | 106 | def learn_bpe_vocab(self, words: Iterable[str]) -> Dict[str, int]: 107 | """Learn a vocab of byte pair encodings""" 108 | vocab = Counter() # type: typing.Counter 109 | for token in {self.SOW, self.EOW}: 110 | vocab[token] = int(2 ** 63) 111 | for idx, byte_pair_count in enumerate(self.byte_pair_counts(words)): 112 | vocab.update(byte_pair_count) 113 | if (idx + 1) % 10000 == 0: 114 | self.trim_vocab(10 * self.bpe_vocab_size, vocab) 115 | 116 | sorted_bpe_counts = sorted(vocab.items(), key=lambda p: -p[1])[: self.bpe_vocab_size] 117 | return {bp: idx + self.word_vocab_size for idx, (bp, count) in enumerate(sorted_bpe_counts)} 118 | 119 | def fit(self, word_counts: typing.Counter[str]) -> None: 120 | """Learn vocab from text.""" 121 | 122 | # First, learn word vocab 123 | self.word_vocab = self.learn_word_vocab(word_counts) 124 | 125 | remaining_words = Counter({word: count for word, count in word_counts.items() if word not in self.word_vocab}) 126 | self.bpe_vocab = self.learn_bpe_vocab(remaining_words.elements()) 127 | 128 | self.inverse_word_vocab = {idx: token for token, idx in self.word_vocab.items()} 129 | self.inverse_bpe_vocab = {idx: token for token, idx in self.bpe_vocab.items()} 130 | 131 | @staticmethod 132 | def get_unk() -> str: 133 | return DEFAULT_UNK 134 | 135 | @staticmethod 136 | def get_pad() -> str: 137 | return DEFAULT_PAD 138 | 139 | @staticmethod 140 | def trim_vocab(n: int, vocab: Dict[str, int]) -> None: 141 | """Delete all pairs below 10 * vocab size to prevent memory problems""" 142 | pair_counts = sorted(vocab.items(), key=lambda p: -p[1]) 143 | pairs_to_trim = [pair for pair, count in pair_counts[n:]] 144 | for pair in pairs_to_trim: 145 | del vocab[pair] 146 | 147 | def subword_tokenize(self, word: str) -> List[str]: 148 | """Tokenize inside an unknown token using BPE""" 149 | end_idx = min([len(word), self.ngram_max]) 150 | sw_tokens = [self.SOW] 151 | start_idx = 0 152 | 153 | while start_idx < len(word): 154 | subword = word[start_idx:end_idx] 155 | if subword in self.bpe_vocab: 156 | sw_tokens.append(subword) 157 | start_idx = end_idx 158 | end_idx = min([len(word), start_idx + self.ngram_max]) 159 | elif len(subword) == 1: 160 | sw_tokens.append(self.UNK) 161 | start_idx = end_idx 162 | end_idx = min([len(word), start_idx + self.ngram_max]) 163 | else: 164 | end_idx -= 1 165 | 166 | sw_tokens.append(self.EOW) 167 | return sw_tokens 168 | 169 | def tokenize(self, word_tokens: List[str]) -> List[str]: 170 | """Split a sentence into word and subword tokens""" 171 | 172 | tokens = [] 173 | for word_token in word_tokens: 174 | if word_token in self.word_vocab: 175 | tokens.append(word_token) 176 | else: 177 | tokens.extend(self.subword_tokenize(word_token)) 178 | 179 | return tokens 180 | 181 | def transform(self, sentences: Iterable[List[str]], reverse=False, fixed_length=None) -> Iterable[List[int]]: 182 | """Turn tokens into vocab idxs""" 183 | direction = -1 if reverse else 1 184 | for sentence in sentences: 185 | encoded = [] 186 | tokens = list(self.tokenize(sentence)) 187 | for token in tokens: 188 | if token in self.word_vocab: 189 | encoded.append(self.word_vocab[token]) 190 | elif token in self.bpe_vocab: 191 | encoded.append(self.bpe_vocab[token]) 192 | else: 193 | encoded.append(self.word_vocab[self.UNK]) 194 | 195 | if fixed_length is not None: 196 | encoded = encoded[:fixed_length] 197 | while len(encoded) < fixed_length: 198 | 
encoded.append(self.word_vocab[self.PAD]) 199 | 200 | yield encoded[::direction] 201 | 202 | def inverse_transform(self, rows: Iterable[List[int]]) -> Iterator[str]: 203 | """Turn token indexes back into space-joined text.""" 204 | for row in rows: 205 | words = [] 206 | 207 | rebuilding_word = False 208 | current_word = "" 209 | for idx in row: 210 | if self.inverse_bpe_vocab.get(idx) == self.SOW: 211 | if rebuilding_word and self.strict: 212 | raise ValueError("Encountered second SOW token before EOW.") 213 | rebuilding_word = True 214 | 215 | elif self.inverse_bpe_vocab.get(idx) == self.EOW: 216 | if not rebuilding_word and self.strict: 217 | raise ValueError("Encountered EOW without matching SOW.") 218 | rebuilding_word = False 219 | words.append(current_word) 220 | current_word = "" 221 | 222 | elif rebuilding_word and (idx in self.inverse_bpe_vocab): 223 | current_word += self.inverse_bpe_vocab[idx] 224 | 225 | elif rebuilding_word and (idx in self.inverse_word_vocab): 226 | current_word += self.inverse_word_vocab[idx] 227 | 228 | elif idx in self.inverse_word_vocab: 229 | words.append(self.inverse_word_vocab[idx]) 230 | 231 | elif idx in self.inverse_bpe_vocab: 232 | if self.strict: 233 | raise ValueError("Found BPE index {} when not rebuilding word!".format(idx)) 234 | else: 235 | words.append(self.inverse_bpe_vocab[idx]) 236 | 237 | else: 238 | raise ValueError("Got index {} that was not in word or BPE vocabs!".format(idx)) 239 | 240 | yield " ".join(w for w in words if w != "") 241 | -------------------------------------------------------------------------------- /codenets/codesearchnet/copied_code/metadata.py: -------------------------------------------------------------------------------- 1 | # Code partially copied and adapted from https://github.com/github/CodeSearchNet for backward-compatible experimentations 2 | 3 | from collections import defaultdict 4 | 5 | from typing import DefaultDict, Dict, Iterable, List, Optional, Tuple 6 | 7 | from dpu_utils.mlutils import Vocabulary 8 | 9 | from dpu_utils.utils import RichPath 10 | 11 | from codenets.codesearchnet.copied_code.bpevocabulary import BpeVocabulary 12 | from codenets.codesearchnet.copied_code.utils import run_jobs_in_parallel 13 | 14 | from dataclasses import dataclass 15 | from pathlib import Path 16 | from collections import Counter 17 | from dataclasses import field 18 | from enum import Enum 19 | 20 | from codenets.utils import _to_subtoken_stream, get_data_files_from_directory 21 | 22 | 23 | class QueryType(Enum): 24 | DOCSTRING = "docstring_as_query" 25 | FUNCTION_NAME = "func_name_as_query" 26 | 27 | 28 | @dataclass 29 | class Metadata: 30 | token_counter: Counter = field(default_factory=Counter) 31 | token_vocab: Optional[BpeVocabulary] = None 32 | common_tokens: List[Tuple[str, int]] = field(default_factory=list) 33 | 34 | 35 | def load_metadata_from_sample( 36 | data_to_load: Iterable[str], raw_metadata: Metadata, use_subtokens: bool = False, mark_subtoken_end: bool = False 37 | ) -> Metadata: 38 | if use_subtokens: 39 | data_to_load = _to_subtoken_stream(data_to_load, mark_subtoken_end=mark_subtoken_end) 40 | # raw_metadata["token_counter"].update(data_to_load) 41 | raw_metadata.token_counter.update(data_to_load) 42 | return raw_metadata 43 | 44 | 45 | def append_metadata( 46 | encoder_label: str, 47 | vocab_size: int, 48 | vocab_count_threshold: int, 49 | # use_bpe: bool, 50 | pct_bpe: float, 51 | raw_metadata_list: List[Metadata], 52 | ) -> Metadata: 53 | merged_token_counter: Counter = Counter() 54 | for 
raw_metadata in raw_metadata_list: 55 | # merged_token_counter += raw_metadata["token_counter"] 56 | merged_token_counter += raw_metadata.token_counter 57 | 58 | # if hyperparameters["%s_use_bpe" % encoder_label]: 59 | # token_vocabulary: Vocabulary 60 | # if use_bpe: 61 | token_vocabulary = BpeVocabulary( 62 | # vocab_size=hyperparameters["%s_token_vocab_size" % encoder_label], 63 | vocab_size=vocab_size, 64 | # pct_bpe=hyperparameters["%s_pct_bpe" % encoder_label], 65 | pct_bpe=pct_bpe, 66 | ) 67 | token_vocabulary.fit(merged_token_counter) 68 | # else: 69 | # token_vocabulary = Vocabulary.create_vocabulary( 70 | # tokens=merged_token_counter, 71 | # # max_size=hyperparameters["%s_token_vocab_size" % encoder_label], 72 | # max_size=vocab_size, 73 | # # count_threshold=hyperparameters["%s_token_vocab_count_threshold" % encoder_label], 74 | # count_threshold=vocab_count_threshold, 75 | # ) 76 | 77 | # final_metadata["token_vocab"] = token_vocabulary 78 | # Save the most common tokens for use in data augmentation: 79 | # final_metadata["common_tokens"] = merged_token_counter.most_common(50) 80 | final_metadata = Metadata( 81 | token_vocab=token_vocabulary, 82 | token_counter=merged_token_counter, 83 | common_tokens=merged_token_counter.most_common(50), 84 | ) 85 | return final_metadata 86 | 87 | 88 | def build_tokenizer_metadata( 89 | data_dirs: List[Path], 90 | max_files_per_dir: Optional[int] = None, 91 | parallelize: bool = True, 92 | use_subtokens: bool = False, 93 | mark_subtoken_end: bool = False, 94 | ) -> Tuple[List[Metadata], Dict[str, List[Metadata]]]: 95 | raw_query_metadata_list = [] 96 | raw_code_language_metadata_lists: DefaultDict[str, List] = defaultdict(list) 97 | 98 | def metadata_parser_fn(_, file_path: Path) -> Iterable[Tuple[Metadata, Dict[str, Metadata]]]: 99 | raw_query_metadata = Metadata() 100 | per_code_language_metadata: DefaultDict[str, Metadata] = defaultdict(Metadata) 101 | 102 | for raw_sample in RichPath.create(str(file_path)).read_by_file_suffix(): 103 | sample_language = raw_sample["language"] 104 | per_code_language_metadata[sample_language] = load_metadata_from_sample( 105 | data_to_load=raw_sample["code_tokens"], 106 | raw_metadata=per_code_language_metadata[sample_language], 107 | use_subtokens=use_subtokens, 108 | mark_subtoken_end=mark_subtoken_end, 109 | ) 110 | 111 | raw_query_metadata = load_metadata_from_sample( 112 | data_to_load=[d.lower() for d in raw_sample["docstring_tokens"]], 113 | raw_metadata=raw_query_metadata, 114 | use_subtokens=use_subtokens, 115 | mark_subtoken_end=mark_subtoken_end, 116 | ) 117 | yield (raw_query_metadata, per_code_language_metadata) 118 | 119 | def received_result_callback(metadata_parser_result: Tuple[Metadata, Dict[str, Metadata]]): 120 | (raw_query_metadata, per_code_language_metadata) = metadata_parser_result 121 | raw_query_metadata_list.append(raw_query_metadata) 122 | for (metadata_language, raw_code_language_metadata) in per_code_language_metadata.items(): 123 | raw_code_language_metadata_lists[metadata_language].append(raw_code_language_metadata) 124 | 125 | def finished_callback(): 126 | pass 127 | 128 | if parallelize: 129 | run_jobs_in_parallel( 130 | get_data_files_from_directory(data_dirs, max_files_per_dir), 131 | metadata_parser_fn, 132 | received_result_callback, 133 | finished_callback, 134 | ) 135 | else: 136 | for (idx, file) in enumerate(get_data_files_from_directory(data_dirs, max_files_per_dir)): 137 | for res in metadata_parser_fn(idx, file): 138 | received_result_callback(res) 139 | 140 | 
return raw_query_metadata_list, raw_code_language_metadata_lists 141 | -------------------------------------------------------------------------------- /codenets/codesearchnet/copied_code/utils.py: -------------------------------------------------------------------------------- 1 | # Code copied from https://github.com/github/CodeSearchNet for backward-compatible experimentations 2 | 3 | import multiprocessing 4 | from typing import List, Iterable, Callable, TypeVar, Dict, Any, Union 5 | from dpu_utils.utils import RichPath 6 | from pathlib import Path 7 | 8 | JobType = TypeVar("JobType") 9 | ResultType = TypeVar("ResultType") 10 | 11 | 12 | def read_file_samples(file_path: Union[Path, str]) -> List[Dict[str, Any]]: 13 | return RichPath.create(str(file_path)).read_by_file_suffix() 14 | 15 | 16 | def __parallel_queue_worker( 17 | worker_id: int, 18 | job_queue: multiprocessing.Queue, 19 | result_queue: multiprocessing.Queue, 20 | worker_fn: Callable[[int, JobType], Iterable[ResultType]], 21 | ): 22 | while True: 23 | job = job_queue.get() 24 | 25 | # "None" is the signal for last job, put that back in for other workers and stop: 26 | if job is None: 27 | job_queue.put(job) 28 | break 29 | 30 | for result in worker_fn(worker_id, job): 31 | result_queue.put(result) 32 | result_queue.put(None) 33 | 34 | 35 | def run_jobs_in_parallel( 36 | all_jobs: List[JobType], 37 | worker_fn: Callable[[int, JobType], Iterable[ResultType]], 38 | received_result_callback: Callable[[ResultType], None], 39 | finished_callback: Callable[[], None], 40 | result_queue_size: int = 100, 41 | ) -> None: 42 | """ 43 | Run jobs in parallel and uses callbacks to collect results. 44 | :param all_jobs: Job descriptions; one at a time will be parsed into worker_fn. 45 | :param worker_fn: Worker function receiving a job; many copies may run in parallel. 46 | Can yield results, which will be processed (one at a time) by received_result_callback. 47 | :param received_result_callback: Called when a result was produced by any worker. Only one will run at a time. 48 | :param finished_callback: Called when all jobs have been processed. 
49 | """ 50 | job_queue: multiprocessing.Queue = multiprocessing.Queue(len(all_jobs) + 1) 51 | for job in all_jobs: 52 | job_queue.put(job) 53 | job_queue.put(None) # Marker that we are done 54 | 55 | # This will hold the actual results: 56 | result_queue: multiprocessing.Queue = multiprocessing.Queue(result_queue_size) 57 | 58 | # Create workers: 59 | num_workers = multiprocessing.cpu_count() - 1 60 | workers = [ 61 | multiprocessing.Process(target=__parallel_queue_worker, args=(worker_id, job_queue, result_queue, worker_fn)) 62 | for worker_id in range(num_workers) 63 | ] 64 | for worker in workers: 65 | worker.start() 66 | 67 | num_workers_finished = 0 68 | while True: 69 | result = result_queue.get() 70 | if result is None: 71 | num_workers_finished += 1 72 | if num_workers_finished == len(workers): 73 | finished_callback() 74 | break 75 | else: 76 | received_result_callback(result) 77 | 78 | for worker in workers: 79 | worker.join() 80 | -------------------------------------------------------------------------------- /codenets/codesearchnet/data.py: -------------------------------------------------------------------------------- 1 | # This code is nearly 100% copied from original repo 2 | 3 | from dataclasses import dataclass, fields as datafields 4 | import numpy as np 5 | 6 | from typing import Dict, TypeVar, List 7 | from dataclasses import field 8 | 9 | 10 | @dataclass 11 | class DatasetParams: 12 | """Description of parameters of a CodeSearchnet dataset""" 13 | 14 | fraction_using_func_name: float 15 | min_len_func_name_for_query: int 16 | use_subtokens: bool 17 | mark_subtoken_end: bool 18 | code_max_num_tokens: int 19 | query_max_num_tokens: int 20 | use_bpe: bool 21 | vocab_size: int 22 | pct_bpe: float 23 | vocab_count_threshold: int 24 | lang_ids: Dict[str, int] 25 | do_lowercase: bool 26 | special_tokens: List[str] 27 | parallelize: bool 28 | use_lang_weights: bool = False # for backward compat 29 | query_random_token_frequency: float = 0.2 30 | query_embeddings: str = "none" 31 | use_ast: str = "none" 32 | ast_added_nodes: Dict[str, Dict[str, str]] = field(default_factory=dict) 33 | ast_skip_node_types: Dict[str, List[str]] = field(default_factory=dict) 34 | ast_special_tokens_files: List[str] = field(default_factory=list) 35 | 36 | 37 | T_InputFeatures = TypeVar("T_InputFeatures", bound="InputFeatures") 38 | 39 | 40 | @dataclass 41 | class InputFeatures: 42 | """Structure gathering query and code tokens/mask after passing through tokenizer""" 43 | 44 | language: int 45 | similarity: float 46 | query_tokens: np.ndarray 47 | query_tokens_mask: np.ndarray 48 | 49 | query_docstring_tokens: np.ndarray 50 | query_docstring_tokens_mask: np.ndarray 51 | 52 | code_tokens: np.ndarray 53 | code_tokens_mask: np.ndarray 54 | 55 | 56 | def dataclass_from_dict(klass, dikt): 57 | """Load any dataclass from a dict""" 58 | fieldtypes = {f.name: f.type for f in datafields(klass)} 59 | return klass(**{f: dataclass_from_dict(fieldtypes[f], dikt[f]) for f in dikt}) 60 | -------------------------------------------------------------------------------- /codenets/codesearchnet/dataset_main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Main to test dataset loading 4 | 5 | Usage: 6 | dataset_main.py [options] 7 | dataset_main.py [options] 8 | 9 | Options: 10 | -h --help Show this screen. 11 | --config FILE Specify HOCON config file. [default: ./conf/default.conf] 12 | --debug Enable debug routines. 
[default: False] 13 | """ 14 | 15 | 16 | from docopt import docopt 17 | from loguru import logger 18 | import sys 19 | import torch 20 | import itertools 21 | from dpu_utils.utils import run_and_debug 22 | from pyhocon import ConfigFactory, ConfigTree 23 | from torch.utils.data import DataLoader 24 | from codenets.codesearchnet.dataset_utils import BalancedBatchSchedulerSampler, DatasetType 25 | from codenets.codesearchnet.training_ctx import CodeSearchTrainingContext 26 | 27 | 28 | print("Torch version", torch.__version__) 29 | 30 | logger.remove() 31 | logger.add(sys.stderr, level="DEBUG", colorize=True, backtrace=False) 32 | 33 | 34 | def run(args, tag_in_vcs=False) -> None: 35 | conf_file = args["--config"] 36 | logger.info(f"config file {conf_file}") 37 | 38 | conf: ConfigTree = ConfigFactory.parse_file(conf_file) 39 | logger.info(f"config {conf}") 40 | logger.info(f"Build Training Context from config {conf_file}") 41 | training_ctx = CodeSearchTrainingContext.build_context_from_hocon(conf) 42 | 43 | train_dataset = training_ctx.build_lang_dataset(DatasetType.TRAIN) 44 | train_dataloader = DataLoader( 45 | dataset=train_dataset, 46 | batch_size=conf["training.batch_size.train"], 47 | sampler=BalancedBatchSchedulerSampler(dataset=train_dataset, batch_size=conf["training.batch_size.train"]), 48 | ) 49 | logger.info(f"train_dataloader [{len(train_dataloader)} samples]") 50 | 51 | for batch in itertools.islice(train_dataloader, 5): 52 | logger.info(f"batch {batch}") 53 | 54 | # val_dataset = training_ctx.build_lang_dataset(DatasetType.VAL) 55 | # val_dataloader = DataLoader( 56 | # dataset=val_dataset, 57 | # batch_size=conf["training.batch_size.val"], 58 | # sampler=BalancedBatchSchedulerSampler(dataset=val_dataset, batch_size=conf["training.batch_size.val"]), 59 | # ) 60 | # logger.info(f"val_dataloader [{len(val_dataloader)} samples]") 61 | 62 | # for batch in itertools.islice(val_dataloader, 5): 63 | # logger.info(f"batch {batch}") 64 | 65 | 66 | if __name__ == "__main__": 67 | args = docopt(__doc__) 68 | run_and_debug(lambda: run(args), args["--debug"]) 69 | -------------------------------------------------------------------------------- /codenets/codesearchnet/eval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Usage: 4 | eval.py [options] SAVE_FOLDER TRAIN_DATA_PATH VALID_DATA_PATH TEST_DATA_PATH 5 | eval.py [options] [SAVE_FOLDER] 6 | 7 | *_DATA_PATH arguments may either accept (1) directory filled with .jsonl.gz files that we use as data, 8 | or a (2) plain text file containing a list of such directories (used for multi-language training). 9 | 10 | In the case that you supply a (2) plain text file, all directory names must be separated by a newline. 11 | For example, if you want to read from multiple directories you might have a plain text file called 12 | data_dirs_train.txt with the below contents: 13 | 14 | > cat ~/src/data_dirs_train.txt 15 | azure://semanticcodesearch/pythondata/Processed_Data/jsonl/train 16 | azure://semanticcodesearch/csharpdata/split/csharpCrawl-train 17 | 18 | Options: 19 | -h --help Show this screen. 20 | --restore DIR specify restoration dir. [optional] 21 | --debug Enable debug routines. 
[default: False] 22 | """ 23 | 24 | import os 25 | import torch 26 | from docopt import docopt 27 | from dpu_utils.utils import run_and_debug 28 | from loguru import logger 29 | from tqdm import tqdm 30 | 31 | from torch.utils.data import DataLoader 32 | 33 | # from codenets.codesearchnet.single_branch_ctx import SingleBranchTrainingContext 34 | from codenets.codesearchnet.dataset_utils import BalancedBatchSchedulerSampler, DatasetType 35 | from codenets.codesearchnet.training_ctx import ( 36 | CodeSearchTrainingContext, 37 | compute_loss_mrr, 38 | TotalLoss, 39 | TotalMrr, 40 | TotalSize, 41 | BatchSize, 42 | BatchLoss, 43 | ) 44 | 45 | 46 | def run(args, tag_in_vcs=False) -> None: 47 | os.environ["WANDB_MODE"] = "dryrun" 48 | 49 | logger.debug("Building Training Context") 50 | training_ctx: CodeSearchTrainingContext 51 | restore_dir = args["--restore"] 52 | logger.info(f"Restoring Training Context from directory{restore_dir}") 53 | training_ctx = CodeSearchTrainingContext.build_context_from_dir(restore_dir) 54 | 55 | # Build Val Dataloader 56 | # val_dataset = training_ctx.build_lang_dataset(DatasetType.VAL) 57 | # val_dataloader = DataLoader( 58 | # dataset=val_dataset, 59 | # batch_size=training_ctx.val_batch_size, 60 | # sampler=BalancedBatchSchedulerSampler(dataset=val_dataset, batch_size=training_ctx.val_batch_size), 61 | # ) 62 | # logger.info(f"Built val_dataloader [Length:{len(val_dataloader)} x Batch:{training_ctx.val_batch_size}]") 63 | 64 | # Build Test Dataloader 65 | test_dataset = training_ctx.build_lang_dataset(DatasetType.TEST) 66 | test_dataloader = DataLoader( 67 | dataset=test_dataset, 68 | batch_size=training_ctx.val_batch_size, 69 | sampler=BalancedBatchSchedulerSampler(dataset=test_dataset, batch_size=training_ctx.test_batch_size), 70 | ) 71 | logger.info(f"Built test_dataloader [Length:{len(test_dataloader)} x Batch:{training_ctx.test_batch_size}]") 72 | 73 | total_loss = TotalLoss(0.0) 74 | total_size = TotalSize(0) 75 | total_mrr = TotalMrr(0.0) 76 | training_ctx.eval_mode() 77 | with torch.no_grad(): 78 | training_ctx.zero_grad() 79 | with tqdm(total=len(test_dataloader)) as t_batch: 80 | for batch_idx, batch in enumerate(test_dataloader): 81 | languages, similarity, query_tokens, query_tokens_mask, code_tokens, code_tokens_mask = [ 82 | t.to(training_ctx.device) for t in batch 83 | ] 84 | 85 | batch_total_loss, similarity_scores = training_ctx.forward(batch, batch_idx) 86 | 87 | batch_size = BatchSize(batch[0].size()[0]) 88 | batch_loss = BatchLoss(batch_total_loss.item()) 89 | total_loss, avg_loss, total_mrr, avg_mrr, total_size = compute_loss_mrr( 90 | similarity_scores, batch_loss, batch_size, total_loss, total_mrr, total_size 91 | ) 92 | # languages=languages, 93 | # query_tokens=query_tokens, 94 | # query_tokens_mask=query_tokens_mask, 95 | # code_tokens=code_tokens, 96 | # code_tokens_mask=code_tokens_mask, 97 | # ) 98 | # batch_total_losses, similarity_scores = training_ctx.losses_scores_fn( 99 | # query_embedding, code_embedding, similarity 100 | # ) 101 | # batch_total_loss = torch.mean(batch_total_losses) 102 | 103 | # nb_samples = batch[0].size()[0] 104 | 105 | # # compute MRR 106 | # # extract the logits from the diagonal of the matrix, which are the logits corresponding to the ground-truth 107 | # correct_scores = similarity_scores.diagonal() 108 | # # compute how many queries have bigger logits than the ground truth (the diagonal) 109 | # # the elements that are incorrectly ranked 110 | # compared_scores = 
similarity_scores.ge(correct_scores.unsqueeze(dim=-1)).float() 111 | # compared_scores_nb = torch.sum(compared_scores, dim=1) 112 | # per_sample_mrr = torch.div(1.0, compared_scores_nb) 113 | # per_batch_mrr = torch.sum(per_sample_mrr) / nb_samples 114 | 115 | # epoch_samples += nb_samples 116 | # epoch_loss += batch_total_loss.item() * nb_samples 117 | # loss = epoch_loss / max(1, epoch_samples) 118 | 119 | # mrr_sum += per_batch_mrr.item() * nb_samples 120 | # mrr = mrr_sum / max(1, epoch_samples) 121 | 122 | t_batch.set_postfix({f"loss": f"{batch_total_loss.item():10}"}) 123 | t_batch.update(1) 124 | 125 | logger.info( 126 | f"total_loss:{total_loss}, avg_loss:{avg_loss}, total_mrr:{total_mrr}, avg_mrr:{avg_mrr}, total_size:{total_size}" 127 | ) 128 | 129 | 130 | if __name__ == "__main__": 131 | args = docopt(__doc__) 132 | run_and_debug(lambda: run(args), args["--debug"]) 133 | -------------------------------------------------------------------------------- /codenets/codesearchnet/huggingface/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/codesearchnet/huggingface/__init__.py -------------------------------------------------------------------------------- /codenets/codesearchnet/huggingface/models.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from pathlib import Path 5 | from typing import Union, TypeVar, Type, Generic 6 | from loguru import logger 7 | from transformers import PreTrainedModel 8 | 9 | from codenets.recordable import RecordableTorchModule 10 | from codenets.utils import full_classname, instance_full_classname, runtime_import 11 | 12 | 13 | PretrainedRec_T = TypeVar("PretrainedRec_T", bound="PreTrainedModelRecordable") 14 | Pretrained_T = TypeVar("Pretrained_T", bound="PreTrainedModel") 15 | 16 | 17 | class PreTrainedModelRecordable(Generic[Pretrained_T], RecordableTorchModule): 18 | """ 19 | Wrap any generic HuggingFace PreTrainedModel as a Recordable Torch module 20 | equipped with load/save 21 | """ 22 | 23 | def __init__(self, model: Pretrained_T): 24 | super().__init__() 25 | self.model = model 26 | 27 | def save(self, output_dir: Union[Path, str]) -> bool: 28 | full_dir = Path(output_dir) / instance_full_classname(self) / instance_full_classname(self.model) 29 | logger.info(f"Saving HuggingFace model to {full_dir}") 30 | os.makedirs(full_dir, exist_ok=True) 31 | self.model.save_pretrained(full_dir) 32 | return True 33 | 34 | @classmethod 35 | def load(cls: Type[PretrainedRec_T], restore_dir: Union[Path, str]) -> PretrainedRec_T: 36 | full_dir = Path(restore_dir) / full_classname(cls) 37 | logger.info(f"Loading HuggingFace Pretrained model from {full_dir}") 38 | _, dirs, _ = list(os.walk(full_dir))[0] 39 | model_cls_name = dirs[0] 40 | logger.info(f"Loading HuggingFace {model_cls_name} model from {full_dir}/{model_cls_name}") 41 | klass = runtime_import(model_cls_name) 42 | assert issubclass(klass, PreTrainedModel) 43 | 44 | model = klass.from_pretrained(str(full_dir / model_cls_name)) 45 | 46 | return cls(model) 47 | 48 | def forward(self, *args, **kwargs): 49 | return self.model.forward(*args, **kwargs) -------------------------------------------------------------------------------- /codenets/codesearchnet/huggingface/tokenizer_recs.py: -------------------------------------------------------------------------------- 1 | 
from __future__ import annotations 2 | from typing import Iterable, List, Optional, Tuple, Union, Dict, Callable, IO 3 | import numpy as np 4 | import os 5 | from loguru import logger 6 | from pathlib import Path 7 | from transformers import PreTrainedTokenizer, BertTokenizer 8 | 9 | from tokenizers import CharBPETokenizer, Encoding 10 | 11 | from codenets.recordable import instance_full_classname, full_classname 12 | from codenets.codesearchnet.data import DatasetParams 13 | from codenets.codesearchnet.tokenizer_recs import TokenizerRecordable 14 | from codenets.codesearchnet.copied_code.utils import read_file_samples 15 | from codenets.utils import get_data_files_from_directory 16 | from codenets.codesearchnet.training_ctx import default_sample_update 17 | 18 | 19 | class PreTrainedTokenizerRecordable(TokenizerRecordable): 20 | def __init__(self, vocab: PreTrainedTokenizer): 21 | self.vocab = vocab 22 | 23 | def tokenize(self, text: str, **kwargs) -> List[str]: 24 | return self.vocab.tokenize(text) 25 | 26 | def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]: 27 | return self.vocab.convert_tokens_to_ids(tokens) 28 | 29 | def unk_token(self) -> str: 30 | return self.vocab.unk_token() 31 | 32 | def encode_sentence(self, sentence: str, max_length: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]: 33 | encoded = self.vocab.encode_plus( 34 | sentence, 35 | max_length=max_length, 36 | pad_to_max_length=max_length is not None, 37 | return_token_type_ids=False, 38 | return_attention_mask=True, 39 | ) 40 | token_ids = np.array(encoded["input_ids"]) 41 | token_mask = np.array(encoded["attention_mask"]) 42 | return token_ids, token_mask 43 | 44 | def encode_sentences( 45 | self, sentences: List[str], max_length: Optional[int] = None 46 | ) -> Tuple[np.ndarray, np.ndarray]: 47 | encoded = self.vocab.batch_encode_plus( 48 | sentences, 49 | max_length=max_length, 50 | pad_to_max_length=max_length is not None, 51 | return_token_type_ids=False, 52 | return_attention_mask=True, 53 | ) 54 | token_ids = np.array(encoded["input_ids"]) 55 | token_mask = np.array(encoded["attention_mask"]) 56 | return (token_ids, token_mask) 57 | 58 | def encode_tokens( 59 | self, tokens: Iterable[List[str]], max_length: Optional[int] = None 60 | ) -> Tuple[np.ndarray, np.ndarray]: 61 | encoded = self.vocab( 62 | tokens, 63 | max_length=max_length, 64 | pad_to_max_length=max_length is not None, 65 | return_token_type_ids=False, 66 | return_attention_mask=True, 67 | ) 68 | token_ids = np.array(encoded["input_ids"]) 69 | token_mask = np.array(encoded["attention_mask"]) 70 | return (token_ids, token_mask) 71 | 72 | def decode_sequence(self, tokens_sequence: List[int]) -> str: 73 | return self.vocab.decode(tokens_sequence) 74 | 75 | def decode_sequences(self, tokens_sequences: Iterable[List[int]]) -> List[str]: 76 | return self.vocab.decode_batch(tokens_sequences) 77 | 78 | def add_special_tokens(self, special_tokens: List[str]) -> bool: 79 | self.vocab.add_special_tokens(special_tokens) 80 | return True 81 | 82 | 83 | class BertTokenizerRecordable(PreTrainedTokenizerRecordable): 84 | def __init__(self, vocab: BertTokenizer): 85 | super(BertTokenizerRecordable, self).__init__(vocab) 86 | 87 | def save(self, output_dir: Union[Path, str]) -> bool: 88 | full_dir = Path(output_dir) / instance_full_classname(self) 89 | logger.debug(f"Saving BertTokenizerRecordable to {full_dir}") 90 | os.makedirs(full_dir, exist_ok=True) 91 | self.vocab.save_pretrained(full_dir) 92 | return True 93 | 94 | @classmethod 95 | def 
load(cls, restore_dir: Union[Path, str]) -> "BertTokenizerRecordable": 96 | full_dir = Path(restore_dir) / full_classname(cls) 97 | logger.debug(f"Loading BertTokenizerRecordable from {full_dir}") 98 | vocab = BertTokenizer.from_pretrained(str(full_dir)) 99 | return BertTokenizerRecordable(vocab) 100 | 101 | 102 | class HuggingfaceBPETokenizerRecordable(TokenizerRecordable): 103 | def __init__(self, tokenizer: CharBPETokenizer): 104 | self.tokenizer = tokenizer 105 | 106 | def tokenize(self, text: str, **kwargs) -> List[str]: 107 | return self.tokenizer.encode(text).tokens 108 | 109 | def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]: 110 | return [self.tokenizer.token_to_id(tok) for tok in tokens] 111 | 112 | def unk_token(self) -> str: 113 | # no access to that in 114 | return "" 115 | 116 | # def pad_token(self) -> str: 117 | # return self.vocab.pad_token() 118 | 119 | def encode_sentence(self, sentence: str, max_length: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]: 120 | enc: Encoding = self.tokenizer.encode(sentence) 121 | if max_length is not None: 122 | enc.truncate(max_length) 123 | enc.pad(max_length) 124 | return np.array(enc.ids), np.array(enc.attention_mask) 125 | 126 | def encode_sentences( 127 | self, sentences: List[str], max_length: Optional[int] = None 128 | ) -> Tuple[np.ndarray, np.ndarray]: 129 | encs = self.tokenizer.encode_batch(sentences) 130 | if max_length is not None: 131 | for enc in encs: 132 | enc.truncate(max_length) 133 | enc.pad(max_length) 134 | # tokens_ids = [np.array(enc.ids) for enc in encs] 135 | # attention_mask = [np.array(enc.attention_mask) for enc in encs] 136 | tokens_ids = [enc.ids for enc in encs] 137 | attention_mask = [enc.attention_mask for enc in encs] 138 | return (np.array(tokens_ids), np.array(attention_mask)) 139 | 140 | def encode_tokens( 141 | self, tokens: Iterable[List[str]], max_length: Optional[int] = None 142 | ) -> Tuple[np.ndarray, np.ndarray]: 143 | # hack... 
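# i.e. the token lists are re-joined with whitespace and re-encoded as plain sentences below, so the original token boundaries are not guaranteed to survive the tokenizer's own pre-tokenization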
144 | sentences = [" ".join(toks) for toks in tokens] 145 | return self.encode_sentences(sentences, max_length) 146 | 147 | def decode_sequence(self, tokens_sequence: List[int]) -> str: 148 | return self.tokenizer.decode(tokens_sequence) 149 | 150 | def decode_sequences(self, tokens_sequences: Iterable[List[int]]) -> List[str]: 151 | return self.tokenizer.decode_batch(tokens_sequences) 152 | 153 | def save(self, output_dir: Union[Path, str]) -> bool: 154 | full_dir = Path(output_dir) / instance_full_classname(self) 155 | logger.debug(f"HuggingfaceBPETokenizerRecordable - Saving to {full_dir}") 156 | os.makedirs(full_dir, exist_ok=True) 157 | 158 | self.tokenizer._tokenizer.model.save(str(full_dir), name=str(instance_full_classname(self))) 159 | return True 160 | 161 | @classmethod 162 | def load(cls, restore_dir: Union[Path, str]) -> HuggingfaceBPETokenizerRecordable: 163 | full_dir = Path(restore_dir) / full_classname(cls) 164 | logger.debug(f"HuggingfaceBPETokenizerRecordable - Loading from {full_dir}") 165 | vocab = str(full_dir / f"{full_classname(cls)}-vocab.json") 166 | merges = str(full_dir / f"{full_classname(cls)}-merges.txt") 167 | tokenizer = CharBPETokenizer( 168 | vocab=vocab, 169 | merges=merges 170 | ) 171 | 172 | return HuggingfaceBPETokenizerRecordable(tokenizer) 173 | 174 | def add_special_tokens(self, special_tokens: List[str]) -> bool: 175 | self.tokenizer.add_special_tokens(special_tokens) 176 | return True 177 | 178 | 179 | def build_huggingface_token_files( 180 | data_dirs: List[Path], 181 | data_params: DatasetParams, 182 | output_path: Union[Path, str], 183 | sample_update: Callable[[str, str, List[str]], str] = default_sample_update, 184 | ) -> Tuple[List[Path], Dict[str, Path]]: 185 | tokenizers_path = Path(output_path) 186 | os.makedirs(tokenizers_path, exist_ok=True) 187 | # build files of strings 188 | lang_ios: Dict[str, Tuple[IO[str], IO[str]]] = {} 189 | 190 | query_files: List[Path] = [] 191 | lang_files: Dict[str, Path] = {} 192 | for (idx, file_path) in enumerate(get_data_files_from_directory(data_dirs)): 193 | logger.info(f"Reading {file_path}") 194 | for raw_sample in read_file_samples(file_path): 195 | lang = raw_sample["language"] 196 | if lang not in lang_ios: 197 | query_file = tokenizers_path / f"{lang}_query.txt" 198 | code_file = tokenizers_path / f"{lang}_code.txt" 199 | lang_ios[lang] = (open(query_file, "w"), open(code_file, "w")) 200 | query_files.append(query_file) 201 | lang_files[lang] = code_file 202 | lang_ios[lang][0].write(sample_update("query", lang, raw_sample["docstring_tokens"])) 203 | lang_ios[lang][1].write(sample_update("code", lang, raw_sample["code_tokens"])) 204 | 205 | return query_files, lang_files 206 | -------------------------------------------------------------------------------- /codenets/codesearchnet/poolers.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from abc import abstractmethod 3 | from torch import nn 4 | import torch 5 | from codenets.recordable import RecordableTorchModule 6 | 7 | 8 | class EmbeddingPooler(RecordableTorchModule): 9 | """ 10 | Compute pooler 11 | 12 | Args: 13 | seq_outputs (torch.tensor): [B x T x D] (B is batch, T is sequence size, D is embedding size) 14 | tokens_mask (torch.tensor): [B x T] 15 | 16 | Returns: 17 | tensor: [B x D] 18 | """ 19 | 20 | @abstractmethod 21 | def forward(self, seq_outputs: torch.Tensor, tokens_mask: torch.Tensor) -> torch.Tensor: 22 | pass 23 | 24 | 25 | class 
MeanPooler(EmbeddingPooler): 26 | def __init__(self, input_size: int = 128, eps: float = 1e-8): 27 | super().__init__() 28 | self.dense = nn.Linear(input_size, 1, bias=False) 29 | self.activation = nn.Sigmoid() 30 | self.eps = eps 31 | 32 | def forward(self, seq_outputs: torch.Tensor, tokens_mask: torch.Tensor) -> torch.Tensor: 33 | # TO TEST 34 | lg = torch.sum(tokens_mask, dim=-1) 35 | mask = tokens_mask.unsqueeze(dim=-1) 36 | seq_outputs_masked = seq_outputs * mask 37 | seq_outputs_sum = torch.sum(seq_outputs_masked, dim=-1) 38 | output = seq_outputs_sum / lg.unsqueeze(dim=-1).clamp(self.eps) 39 | return output 40 | 41 | 42 | class MeanWeightedPooler(EmbeddingPooler): 43 | def __init__(self, input_size: int = 512, eps: float = 1e-8): # default params required for module construction 44 | super().__init__() 45 | self.dense = nn.Linear(input_size, 1, bias=False) 46 | self.activation = nn.Sigmoid() 47 | self.eps = eps 48 | 49 | def forward(self, seq_outputs: torch.Tensor, tokens_mask: torch.Tensor) -> torch.Tensor: 50 | token_weights = self.activation(self.dense(seq_outputs)) # B x T x 1 51 | token_weights = token_weights * tokens_mask.unsqueeze(dim=-1) # B x T x 1 52 | # sum on the T dimension 53 | seq_weighted_sum = torch.sum(seq_outputs * token_weights, dim=1) # B x D 54 | output = seq_weighted_sum / torch.sum(token_weights, dim=1).clamp(min=self.eps) 55 | return output 56 | -------------------------------------------------------------------------------- /codenets/codesearchnet/predictions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Usage: 4 | train.py [options] SAVE_FOLDER TRAIN_DATA_PATH VALID_DATA_PATH TEST_DATA_PATH 5 | train.py [options] [SAVE_FOLDER] 6 | 7 | *_DATA_PATH arguments may either accept (1) directory filled with .jsonl.gz files that we use as data, 8 | or a (2) plain text file containing a list of such directories (used for multi-language training). 9 | 10 | In the case that you supply a (2) plain text file, all directory names must be separated by a newline. 11 | For example, if you want to read from multiple directories you might have a plain text file called 12 | data_dirs_train.txt with the below contents: 13 | 14 | > cat ~/src/data_dirs_train.txt 15 | azure://semanticcodesearch/pythondata/Processed_Data/jsonl/train 16 | azure://semanticcodesearch/csharpdata/split/csharpCrawl-train 17 | 18 | Options: 19 | -h --help Show this screen. 20 | --restore DIR specify restoration dir. 21 | --wandb_run_id // Specify Wandb Run 22 | --debug Enable debug routines. 
[default: False] 23 | """ 24 | 25 | import os 26 | import sys 27 | from pathlib import Path 28 | from typing import Tuple 29 | import torch 30 | import numpy as np 31 | from docopt import docopt 32 | from dpu_utils.utils import run_and_debug 33 | from loguru import logger 34 | import pandas as pd 35 | from annoy import AnnoyIndex 36 | from tqdm import tqdm 37 | import shutil 38 | from wandb.apis import InternalApi 39 | import wandb 40 | from codenets.codesearchnet.training_ctx import CodeSearchTrainingContext 41 | 42 | 43 | def compute_code_encodings_from_defs( 44 | language: str, training_ctx: CodeSearchTrainingContext, lang_token: str, batch_length: int = 1024 45 | ) -> Tuple[pd.DataFrame, pd.DataFrame]: 46 | logger.info(f"Computing Encoding for language: {language}") 47 | lang_id = training_ctx.train_data_params.lang_ids[language] 48 | h5_file = ( 49 | training_ctx.pickle_path 50 | / f"{language}_{training_ctx.training_full_name}_dedupe_definitions_v2_codes_encoded.h5" 51 | ) 52 | root_data_path = Path(training_ctx.conf["dataset.root_dir"]) 53 | 54 | def_file = root_data_path / f"data/{language}_dedupe_definitions_v2.pkl" 55 | definitions_df = pd.DataFrame(pd.read_pickle(open(def_file, "rb"), compression=None)) 56 | cols_to_remove = list(definitions_df.columns.difference(["function_tokens", "identifier", "url"])) 57 | for col in cols_to_remove: 58 | del definitions_df[col] 59 | # definitions_df.drop(cols_to_remove, inplace=True, axis=1) 60 | logger.debug(f"definitions_df {definitions_df.columns}") 61 | 62 | if not os.path.exists(h5_file): 63 | logger.info(f"Building encodings of code from {def_file}") 64 | 65 | # function_tokens = definitions_df["function_tokens"] 66 | # add language and lang_token () to tokens 67 | definitions_df["function_tokens"] = definitions_df["function_tokens"].apply( 68 | lambda row: [language, lang_token] + row 69 | ) 70 | function_tokens_batch = definitions_df["function_tokens"].groupby( 71 | np.arange(len(definitions_df["function_tokens"])) // batch_length 72 | ) 73 | 74 | code_embeddings = [] 75 | for g, df_batch in tqdm(function_tokens_batch): 76 | # logger.debug(f"df_batch {df_batch.values}") 77 | codes_encoded, codes_masks = training_ctx.tokenize_code_tokens( 78 | df_batch.values, max_length=training_ctx.conf["dataset.common_params.code_max_num_tokens"] 79 | ) 80 | 81 | # codes_encoded_t = torch.tensor(codes_encoded, dtype=torch.long).to(training_ctx.device) 82 | # codes_masks_t = torch.tensor(codes_masks, dtype=torch.long).to(training_ctx.device) 83 | 84 | # logger.debug(f"codes_encoded_t {codes_encoded_t}") 85 | # logger.debug(f"codes_masks_t {codes_masks_t}") 86 | 87 | emb_df = pd.DataFrame( 88 | training_ctx.encode_code( 89 | lang_id=lang_id, 90 | code_tokens=codes_encoded, 91 | code_tokens_mask=codes_masks 92 | ) 93 | # .cpu() 94 | # .numpy() 95 | ) 96 | # logger.debug(f"codes_encoded_t:{codes_encoded_t.shape} codes_masks_t:{codes_masks_t.shape}") 97 | if g < 2: 98 | logger.debug(f"emb_df {emb_df.head()}") 99 | code_embeddings.append(emb_df) 100 | 101 | # free memory or it explodes on 32GB... 
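# Dropping the tokenized column once all batch embeddings are computed leaves only
# "identifier" and "url" in the definitions frame before the per-batch embedding
# frames are concatenated below, which keeps peak memory bounded.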
102 | del definitions_df["function_tokens"] 103 | 104 | code_embeddings_df = pd.concat(code_embeddings) 105 | 106 | logger.debug(f"code_embeddings_df {code_embeddings_df.head(20)}") 107 | 108 | code_embeddings_df.to_hdf(h5_file, key="code_embeddings_df", mode="w") 109 | return (code_embeddings_df, definitions_df) 110 | else: 111 | code_embeddings_df = pd.read_hdf(h5_file, key="code_embeddings_df") 112 | return (code_embeddings_df, definitions_df) 113 | 114 | 115 | def run(args, tag_in_vcs=False) -> None: 116 | args_wandb_run_id = args["--wandb_run_id"] 117 | if args_wandb_run_id is not None: 118 | entity, project, name = args_wandb_run_id.split("/") 119 | os.environ["WANDB_RUN_ID"] = name 120 | os.environ["WANDB_RESUME"] = "must" 121 | 122 | wandb_api = wandb.Api() 123 | # retrieve saved model from W&B for this run 124 | logger.info("Fetching run from W&B...") 125 | try: 126 | wandb_api.run(args_wandb_run_id) 127 | except wandb.CommError: 128 | logger.error(f"ERROR: Problem querying W&B for wandb_run_id: {args_wandb_run_id}", file=sys.stderr) 129 | sys.exit(1) 130 | 131 | else: 132 | os.environ["WANDB_MODE"] = "dryrun" 133 | 134 | logger.debug("Building Training Context") 135 | training_ctx: CodeSearchTrainingContext 136 | restore_dir = args["--restore"] 137 | logger.info(f"Restoring Training Context from directory{restore_dir}") 138 | training_ctx = CodeSearchTrainingContext.build_context_from_dir(restore_dir) 139 | 140 | queries = pd.read_csv(training_ctx.queries_file) 141 | queries = list(map(lambda q: f" {q}", queries["query"].values)) 142 | queries_tokens, queries_masks = training_ctx.tokenize_query_sentences( 143 | queries, max_length=training_ctx.conf["dataset.common_params.query_max_num_tokens"] 144 | ) 145 | logger.info(f"queries: {queries}") 146 | 147 | training_ctx.eval_mode() 148 | with torch.no_grad(): 149 | query_embeddings = ( 150 | training_ctx.encode_query( 151 | query_tokens=queries_tokens, 152 | query_tokens_mask=queries_masks, 153 | ) 154 | # .cpu() 155 | # .numpy() 156 | ) 157 | logger.info(f"query_embeddings: {query_embeddings.shape}") 158 | 159 | topk = 100 160 | language_token = "" 161 | for lang_idx, language in enumerate( 162 | ("python", "go", "javascript", "java", "php", "ruby") 163 | # ("php", "ruby") 164 | ): # in enumerate(("python", "go", "javascript", "java", "php", "ruby")): 165 | predictions = [] 166 | # (codes_encoded_df, codes_masks_df, definitions) = get_language_defs(language, training_ctx, language_token) 167 | 168 | code_embeddings, definitions = compute_code_encodings_from_defs( 169 | language, training_ctx, language_token, batch_length=512 170 | ) 171 | logger.info(f"Building Annoy Index of length {len(code_embeddings.values[0])}") 172 | indices: AnnoyIndex = AnnoyIndex(len(code_embeddings.values[0]), "angular") 173 | # idx = 0 174 | for index, emb in enumerate(tqdm(code_embeddings.values)): 175 | indices.add_item(index, emb) 176 | indices.build(10) 177 | 178 | for i, (query, query_embedding) in enumerate(tqdm(zip(queries, query_embeddings))): 179 | idxs, distances = indices.get_nns_by_vector(query_embedding, topk, include_distances=True) 180 | for idx2, _ in zip(idxs, distances): 181 | predictions.append( 182 | (query, language, definitions.iloc[idx2]["identifier"], definitions.iloc[idx2]["url"]) 183 | ) 184 | 185 | logger.info(f"predictions {predictions[0]}") 186 | 187 | df = pd.DataFrame(predictions, columns=["query", "language", "identifier", "url"]) 188 | # BUT WHY DOESNT IT WORK AS EXPECTED???? 
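# NOTE: pandas str.replace substitutes every occurrence, so replacing " " with ""
# strips all whitespace from the query text rather than only a leading marker token,
# which may be why the cleanup below does not behave as expected. If the intent is
# only to drop a leading marker, an anchored regex such as
# df["query"].str.replace(r"^<qry> ", "", regex=True)  # marker name hypothetical
# would touch just the prefix.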
189 | df["query"] = df["query"].str.replace(" ", "") 190 | df["identifier"] = df["identifier"].str.replace(",", "") 191 | df["identifier"] = df["identifier"].str.replace('"', "") 192 | df["identifier"] = df["identifier"].str.replace(";", "") 193 | df.to_csv( 194 | training_ctx.output_dir / f"model_predictions_{training_ctx.training_tokenizer_type}.csv", 195 | index=False, 196 | header=True if lang_idx == 0 else False, 197 | # mode="w" if lang_idx == 0 else "a", 198 | mode="a", 199 | ) 200 | # Free memory 201 | del code_embeddings 202 | del definitions 203 | del predictions 204 | 205 | if args_wandb_run_id is not None: 206 | logger.info("Uploading predictions to W&B") 207 | # upload model predictions CSV file to W&B 208 | 209 | entity, project, name = args_wandb_run_id.split("/") 210 | 211 | # make sure the file is in our cwd, with the correct name 212 | predictions_csv = training_ctx.output_dir / f"model_predictions_{training_ctx.training_tokenizer_type}.csv" 213 | predictions_base_csv = "model_predictions.csv" 214 | shutil.copyfile(predictions_csv, predictions_base_csv) 215 | 216 | # Using internal wandb API. TODO: Update when available as a public API 217 | internal_api = InternalApi() 218 | internal_api.push([predictions_base_csv], run=name, entity=entity, project=project) 219 | 220 | 221 | if __name__ == "__main__": 222 | args = docopt(__doc__) 223 | run_and_debug(lambda: run(args), args["--debug"]) 224 | -------------------------------------------------------------------------------- /codenets/codesearchnet/query_1_code_1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/codesearchnet/query_1_code_1/__init__.py -------------------------------------------------------------------------------- /codenets/codesearchnet/query_1_code_1/model.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | from typing import MutableMapping, Optional, Union, Type 5 | 6 | import numpy as np 7 | from transformers import BertConfig, BertModel 8 | 9 | from codenets.codesearchnet.poolers import MeanWeightedPooler 10 | from codenets.codesearchnet.huggingface.models import PreTrainedModelRecordable 11 | from codenets.recordable import ( 12 | Recordable, 13 | RecordableTorchModule, 14 | runtime_load_recordable_mapping, 15 | save_recordable_mapping, 16 | ) 17 | from codenets.utils import full_classname, instance_full_classname 18 | from pyhocon import ConfigTree 19 | 20 | 21 | class Query1Code1(RecordableTorchModule): 22 | """ 23 | A generic Pytorch Model with: 24 | - one single-branch query encoder 25 | - one single-branch code encoder 26 | - one optional pooler to pool output embeddings from any branch 27 | """ 28 | 29 | def __init__( 30 | self, 31 | query_encoder: RecordableTorchModule, 32 | code_encoder: RecordableTorchModule, 33 | pooler: Optional[RecordableTorchModule] = None, 34 | ): 35 | super(Query1Code1, self).__init__() 36 | self.code_encoder = code_encoder 37 | self.query_encoder = query_encoder 38 | self.pooler = pooler 39 | 40 | def save(self, output_dir: Union[Path, str]) -> bool: 41 | d = Path(output_dir) / instance_full_classname(self) 42 | records: MutableMapping[str, Recordable] = { 43 | "query_encoder": self.query_encoder, 44 | "code_encoder": self.code_encoder, 45 | } 46 | if self.pooler is not None: 47 | records["pooler"] = self.pooler 48 | 
return save_recordable_mapping(output_dir=d, records=records) 49 | 50 | @classmethod 51 | def load(cls, restore_dir: Union[Path, str]) -> Query1Code1: 52 | d = Path(restore_dir) / full_classname(cls) 53 | records = runtime_load_recordable_mapping(d) 54 | return cls(**records) # type:ignore[arg-type] 55 | 56 | def forward( 57 | self, 58 | languages: np.ndarray, 59 | query_tokens: np.ndarray, 60 | query_tokens_mask: np.ndarray, 61 | code_tokens: np.ndarray, 62 | code_tokens_mask: np.ndarray, 63 | ): 64 | # lang_id = str(languages[0].item()) 65 | query_seq_outputs = self.query_encoder(query_tokens, query_tokens_mask) # [B x S x H] 66 | code_seq_outputs = self.code_encoder(code_tokens, code_tokens_mask) # [B x S x H] 67 | 68 | if self.pooler is not None: 69 | return ( 70 | self.pooler(query_seq_outputs[0], query_tokens_mask), 71 | self.pooler(code_seq_outputs[0], code_tokens_mask), 72 | ) 73 | else: 74 | # use already pooled data (need to be pretrained as it uses 1st (CLS) token logit) 75 | return query_seq_outputs[1], code_seq_outputs[1] 76 | 77 | def encode_query(self, query_tokens: np.ndarray, query_tokens_mask: np.ndarray) -> np.ndarray: 78 | query_seq_outputs = self.query_encoder(query_tokens, query_tokens_mask) 79 | 80 | if self.pooler is not None: 81 | return self.pooler(query_seq_outputs[0], query_tokens_mask) 82 | else: 83 | return query_seq_outputs[1] 84 | 85 | def encode_code(self, lang_id: int, code_tokens: np.ndarray, code_tokens_mask: np.ndarray) -> np.ndarray: 86 | code_seq_outputs = self.code_encoder(code_tokens, code_tokens_mask) 87 | if self.pooler is not None: 88 | return self.pooler(code_seq_outputs[0], code_tokens_mask) 89 | else: 90 | return code_seq_outputs[1] 91 | 92 | def tokenize_code(self, lang_id: int, code_tokens: np.ndarray, code_tokens_mask: np.ndarray) -> np.ndarray: 93 | code_seq_outputs = self.code_encoder(code_tokens, code_tokens_mask) 94 | if self.pooler is not None: 95 | return self.pooler(code_seq_outputs[0], code_tokens_mask) 96 | else: 97 | return code_seq_outputs[1] 98 | 99 | @classmethod 100 | def from_hocon(cls: Type[Query1Code1], config: ConfigTree) -> Query1Code1: 101 | """Load Query1Code1_CodeSearchModel from a config tree""" 102 | 103 | query_bert_config = BertConfig(**config["training.model.query_encoder"]) 104 | query_encoder = PreTrainedModelRecordable(BertModel(query_bert_config)) 105 | code_bert_config = BertConfig(**config["training.model.code_encoder"]) 106 | code_encoder = PreTrainedModelRecordable(BertModel(code_bert_config)) 107 | 108 | model = Query1Code1( 109 | query_encoder=query_encoder, 110 | code_encoder=code_encoder, 111 | pooler=MeanWeightedPooler(input_size=query_bert_config.hidden_size), 112 | ) 113 | 114 | return model 115 | -------------------------------------------------------------------------------- /codenets/codesearchnet/query_1_code_n/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/codesearchnet/query_1_code_n/__init__.py -------------------------------------------------------------------------------- /codenets/codesearchnet/query_code_siamese/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/codesearchnet/query_code_siamese/__init__.py -------------------------------------------------------------------------------- 
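Before the siamese variant below, a minimal self-contained sketch may help clarify how a dual-encoder of the Query1Code1 kind is typically used at inference time: each branch maps token ids to [B x T x D] embeddings, a mask-aware weighted pooling (in the spirit of MeanWeightedPooler) collapses them to one vector per sequence, and query/code vectors are ranked by cosine similarity. The toy encoder, vocabulary size and tensor shapes below are illustrative assumptions, not the repository's actual BERT branches or training context.

# Illustrative sketch only: a toy dual encoder with masked weighted pooling and
# cosine ranking, mirroring the Query1Code1 + MeanWeightedPooler pattern above.
import torch
from torch import nn

class ToyEncoder(nn.Module):
    """Stand-in for a BERT branch: embeds token ids into [B x T x D]."""
    def __init__(self, vocab_size: int = 1000, hidden_size: int = 64):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)

    def forward(self, token_ids: torch.Tensor) -> torch.Tensor:
        return self.embed(token_ids)

def weighted_pool(seq: torch.Tensor, mask: torch.Tensor, dense: nn.Linear, eps: float = 1e-8) -> torch.Tensor:
    """Mask-aware weighted mean over the token dimension, as in MeanWeightedPooler."""
    weights = torch.sigmoid(dense(seq)) * mask.unsqueeze(-1)                  # B x T x 1
    return (seq * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=eps)     # B x D

hidden = 64
query_encoder, code_encoder = ToyEncoder(hidden_size=hidden), ToyEncoder(hidden_size=hidden)
pooler_dense = nn.Linear(hidden, 1, bias=False)

# Fake batch: 2 queries and 3 candidate code snippets, already tokenized and padded.
query_ids, query_mask = torch.randint(0, 1000, (2, 16)), torch.ones(2, 16)
code_ids, code_mask = torch.randint(0, 1000, (3, 32)), torch.ones(3, 32)

with torch.no_grad():
    q = weighted_pool(query_encoder(query_ids), query_mask, pooler_dense)   # 2 x D
    c = weighted_pool(code_encoder(code_ids), code_mask, pooler_dense)      # 3 x D
    # Cosine similarity matrix between every query and every code snippet.
    scores = nn.functional.normalize(q, dim=-1) @ nn.functional.normalize(c, dim=-1).T
    best = scores.argmax(dim=-1)  # index of the best-scoring snippet per query
print(scores.shape, best)

The real pipeline follows the same shape: training_ctx.encode_query / encode_code produce the pooled vectors, and predictions.py replaces the dense similarity matrix with an Annoy index over the code embeddings.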
/codenets/codesearchnet/query_code_siamese/model.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | from typing import MutableMapping, Optional, Union, Type 5 | 6 | import numpy as np 7 | from loguru import logger 8 | from transformers import BertConfig, BertModel, AlbertConfig, AlbertModel 9 | 10 | from codenets.codesearchnet.poolers import MeanWeightedPooler 11 | from codenets.codesearchnet.huggingface.models import PreTrainedModelRecordable 12 | from codenets.recordable import ( 13 | Recordable, 14 | RecordableTorchModule, 15 | runtime_load_recordable_mapping, 16 | save_recordable_mapping, 17 | ) 18 | from codenets.utils import full_classname, instance_full_classname 19 | from pyhocon import ConfigTree 20 | 21 | 22 | class QueryCodeSiamese(RecordableTorchModule): 23 | """ 24 | A generic Pytorch Model with: 25 | - one single-branch query encoder 26 | - one single-branch code encoder 27 | - one optional pooler to pool output embeddings from any branch 28 | """ 29 | def __init__(self, encoder: RecordableTorchModule, pooler: Optional[RecordableTorchModule] = None): 30 | super(QueryCodeSiamese, self).__init__() 31 | self.encoder = encoder 32 | self.pooler = pooler 33 | 34 | def save(self, output_dir: Union[Path, str]) -> bool: 35 | d = Path(output_dir) / instance_full_classname(self) 36 | records: MutableMapping[str, Recordable] = {"encoder": self.encoder} 37 | if self.pooler is not None: 38 | records["pooler"] = self.pooler 39 | return save_recordable_mapping(output_dir=d, records=records) 40 | 41 | @classmethod 42 | def load(cls, restore_dir: Union[Path, str]) -> QueryCodeSiamese: 43 | d = Path(restore_dir) / full_classname(cls) 44 | records = runtime_load_recordable_mapping(d) 45 | return cls(**records) # type: ignore[arg-type] 46 | 47 | def forward( 48 | self, 49 | languages: np.ndarray, 50 | query_tokens: np.ndarray, 51 | query_tokens_mask: np.ndarray, 52 | code_tokens: np.ndarray, 53 | code_tokens_mask: np.ndarray, 54 | lang_weights: np.ndarray, 55 | ): 56 | # lang_id = str(languages[0].item()) 57 | query_seq_outputs = self.encoder(query_tokens, query_tokens_mask) # [B x S x H] 58 | code_seq_outputs = self.encoder(code_tokens, code_tokens_mask) # [B x S x H] 59 | if self.pooler is not None: 60 | return ( 61 | self.pooler(query_seq_outputs[0], query_tokens_mask), 62 | self.pooler(code_seq_outputs[0], code_tokens_mask), 63 | ) 64 | else: 65 | # use already pooled data (need to be pretrained as it uses 1st (CLS) token logit) 66 | return query_seq_outputs[1], code_seq_outputs[1] 67 | 68 | def encode_query(self, query_tokens: np.ndarray, query_tokens_mask: np.ndarray) -> np.ndarray: 69 | query_seq_outputs = self.encoder(query_tokens, query_tokens_mask) 70 | 71 | if self.pooler is not None: 72 | return self.pooler(query_seq_outputs[0], query_tokens_mask) 73 | else: 74 | return query_seq_outputs[1] 75 | 76 | def encode_code(self, lang_id: int, code_tokens: np.ndarray, code_tokens_mask: np.ndarray) -> np.ndarray: 77 | code_seq_outputs = self.encoder(code_tokens, code_tokens_mask) 78 | if self.pooler is not None: 79 | return self.pooler(code_seq_outputs[0], code_tokens_mask) 80 | else: 81 | return code_seq_outputs[1] 82 | 83 | def tokenize_code(self, lang_id: int, code_tokens: np.ndarray, code_tokens_mask: np.ndarray) -> np.ndarray: 84 | code_seq_outputs = self.encoder(code_tokens, code_tokens_mask) 85 | if self.pooler is not None: 86 | return self.pooler(code_seq_outputs[0], 
code_tokens_mask) 87 | else: 88 | return code_seq_outputs[1] 89 | 90 | @classmethod 91 | def from_hocon(cls: Type[QueryCodeSiamese], config: ConfigTree) -> QueryCodeSiamese: 92 | """Load Query1Code1_CodeSearchModel from a config tree""" 93 | 94 | if "training.model.encoder.type" in config: 95 | if config["training.model.encoder.type"] == "albert": 96 | logger.info("Creating QueryCodeSiamese with Albert encoder") 97 | albert_config = AlbertConfig(**config["training.model.encoder"]) 98 | encoder = PreTrainedModelRecordable(AlbertModel(albert_config)) 99 | elif config["training.model.encoder.type"] == "bert": 100 | logger.info("Creating QueryCodeSiamese with Bert encoder") 101 | bert_config = BertConfig(**config["training.model.encoder"]) 102 | encoder = PreTrainedModelRecordable(BertModel(bert_config)) 103 | else: 104 | # default is BERT now 105 | logger.info("Creating QueryCodeSiamese with Bert encoder") 106 | bert_config = BertConfig(**config["training.model.encoder"]) 107 | encoder = PreTrainedModelRecordable(BertModel(bert_config)) 108 | 109 | model = QueryCodeSiamese( 110 | encoder=encoder, pooler=MeanWeightedPooler(input_size=config["training.model.encoder.hidden_size"]) 111 | ) 112 | 113 | return model 114 | -------------------------------------------------------------------------------- /codenets/codesearchnet/sbert_build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Usage: 4 | eval.py [options] SAVE_FOLDER TRAIN_DATA_PATH VALID_DATA_PATH TEST_DATA_PATH 5 | eval.py [options] [SAVE_FOLDER] 6 | 7 | *_DATA_PATH arguments may either accept (1) directory filled with .jsonl.gz files that we use as data, 8 | or a (2) plain text file containing a list of such directories (used for multi-language training). 9 | 10 | In the case that you supply a (2) plain text file, all directory names must be separated by a newline. 11 | For example, if you want to read from multiple directories you might have a plain text file called 12 | data_dirs_train.txt with the below contents: 13 | 14 | > cat ~/src/data_dirs_train.txt 15 | azure://semanticcodesearch/pythondata/Processed_Data/jsonl/train 16 | azure://semanticcodesearch/csharpdata/split/csharpCrawl-train 17 | 18 | Options: 19 | -h --help Show this screen. 20 | --config FILE Specify HOCON config file. 21 | --debug Enable debug routines. 
[default: False] 22 | """ 23 | 24 | from typing import Dict, List 25 | from sentence_transformers import SentenceTransformer 26 | from dpu_utils.utils import run_and_debug 27 | from docopt import docopt 28 | from loguru import logger 29 | import itertools 30 | import os 31 | import pickle 32 | from torch.utils.data import DataLoader 33 | from pathlib import Path 34 | from pyhocon import ConfigFactory 35 | from torch import nn 36 | from torch import Tensor 37 | import torch 38 | import numpy as np 39 | import pandas as pd 40 | 41 | from tree_sitter import Language, Parser 42 | from codenets.codesearchnet.copied_code.utils import read_file_samples 43 | from sklearn.metrics.pairwise import pairwise_distances 44 | from codenets.codesearchnet.dataset_utils import BalancedBatchSchedulerSampler, DatasetType 45 | from codenets.codesearchnet.data import DatasetParams 46 | from codenets.codesearchnet.training_ctx import CodeSearchTrainingContext 47 | from codenets.codesearchnet.query_code_siamese.dataset import load_data_from_dirs 48 | 49 | """Evaluating SBert.""" 50 | 51 | 52 | def run(args, tag_in_vcs=False) -> None: 53 | # os.environ["WANDB_MODE"] = "dryrun" 54 | 55 | logger.debug("Building Training Context") 56 | conf_file = args["--config"] 57 | conf = ConfigFactory.parse_file(conf_file) 58 | 59 | logger.info(f"Restoring Training Context from config {conf_file}") 60 | training_ctx = CodeSearchTrainingContext.build_context_from_hocon(conf) 61 | 62 | # val_dataset = training_ctx.build_lang_dataset(DatasetType.VAL) 63 | # if val_dataset.collate_fn is not None: 64 | # val_dataloader = DataLoader( 65 | # dataset=val_dataset, 66 | # batch_size=conf["training.batch_size.val"], 67 | # sampler=BalancedBatchSchedulerSampler(dataset=val_dataset, batch_size=conf["training.batch_size.val"]), 68 | # collate_fn=val_dataset.collate_fn, 69 | # ) 70 | # else: 71 | # val_dataloader = DataLoader( 72 | # dataset=val_dataset, 73 | # batch_size=conf["training.batch_size.val"], 74 | # sampler=BalancedBatchSchedulerSampler(dataset=val_dataset, batch_size=conf["training.batch_size.val"]), 75 | # ) 76 | 77 | val_dataloader = training_ctx.build_lang_dataloader(DatasetType.VAL) 78 | logger.info(f"val_dataloader [{len(val_dataloader)} samples]") 79 | 80 | # train_dataloader = training_ctx.build_lang_dataloader(DatasetType.TRAIN) 81 | # logger.info(f"train_dataloader [{len(train_dataloader)} samples]") 82 | 83 | # df = pd.read_parquet("./pickles/train_qc_30k_embeddings.parquet") 84 | # print(df.info()) 85 | 86 | # z = df.iloc[0][0] 87 | # print("z", z.shape) 88 | from annoy import AnnoyIndex 89 | 90 | t = AnnoyIndex(768, "angular") 91 | # for index, row in df.iterrows(): 92 | # print(row.shape) 93 | # t.add_item(index, row[0]) 94 | # t.build(10) # 10 trees 95 | # t.save("./pickles/train_qc_30k_embeddings.ann") 96 | 97 | t.load("./pickles/val_qc_30k_embeddings.ann") 98 | 99 | # for i in range(0, 100): 100 | # print(i, 99, 1.0 - t.get_distance(i, 99)) 101 | 102 | for batch in val_dataloader: # itertools.islice(val_dataloader, 0, 1000): 103 | indices, languages, similarity, query_tokens, query_tokens_mask, code_tokens, code_tokens_mask, code_lang_weights = ( 104 | batch 105 | ) 106 | toks = [toks.cpu().numpy()[: len(mask[mask != 0])] for (toks, mask) in zip(query_tokens, query_tokens_mask)] 107 | toks = training_ctx.decode_query_tokens(toks) 108 | qs = [str((t, score)) for (t, score) in list(zip(toks, similarity))] 109 | for i, scores in enumerate(similarity): 110 | for j, s in enumerate(scores): 111 | if s > 0.5 and i != j: 
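# For each batch, this prints every pair of distinct samples whose precomputed
# similarity score exceeds 0.5 alongside their decoded query tokens, presumably to
# eyeball near-duplicate docstrings in the validation data.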
112 | print(s, toks[i], toks[j]) 113 | 114 | # print("query", "\n".join(qs)) 115 | 116 | # # print("query_tokens", query_tokens) 117 | # # 5 for removing " " 118 | # toks = [toks.cpu().numpy()[: len(mask[mask != 0])] for (toks, mask) in zip(query_tokens, query_tokens_mask)] 119 | # toks = training_ctx.decode_query_tokens(toks) 120 | # # print("toks", toks) 121 | # qs = [str((t, score)) for (t, score) in list(zip(toks, similarity))] 122 | # print("query", "\n".join(qs)) 123 | # print("-----------") 124 | 125 | # data_file = ( 126 | # "/home/mandubian/workspaces/tools/CodeSearchNet/resources/data/python/final/jsonl/valid/python_valid_0.jsonl.gz" 127 | # ) 128 | # filename = os.path.basename(data_file) 129 | # file_language = filename.split("_")[0] 130 | 131 | # samples = list(read_file_samples(data_file)) 132 | 133 | # sample0 = samples[0] 134 | # sample1 = samples[1] 135 | # logger.info(f"keys {sample0.keys()}") 136 | # logger.info(f"sample docstring {sample0['docstring_tokens']}") 137 | # query0 = " ".join(samples[0]["docstring_tokens"]) 138 | # logger.info(f"query0 {query0}") 139 | # query_embeddings0 = model.encode([query0]) 140 | # # logger.info(f"query_embeddings0 {query_embeddings0}") 141 | # query1 = " ".join(sample1["docstring_tokens"]) 142 | # query_embeddings1 = model.encode([query1]) 143 | 144 | # distances = pairwise_distances(query_embeddings0, query_embeddings1, metric="cosine") 145 | # logger.info(f"distances {distances}") 146 | 147 | # Language.build_library( 148 | # # Store the library in the `build` directory 149 | # "build/my-languages.so", 150 | # # Include one or more languages 151 | # [ 152 | # "vendor/tree-sitter-go", 153 | # "vendor/tree-sitter-java", 154 | # "vendor/tree-sitter-javascript", 155 | # "vendor/tree-sitter-python", 156 | # "vendor/tree-sitter-php", 157 | # "vendor/tree-sitter-ruby", 158 | # ], 159 | # ) 160 | # PY_LANGUAGE = Language("build/my-languages.so", "python") 161 | # parser = Parser() 162 | # parser.set_language(PY_LANGUAGE) 163 | # tree = parser.parse(bytes(samples[0]["code"], "utf8")) 164 | 165 | # logger.info(f"tree {tree}") 166 | 167 | 168 | if __name__ == "__main__": 169 | args = docopt(__doc__) 170 | run_and_debug(lambda: run(args), args["--debug"]) 171 | -------------------------------------------------------------------------------- /codenets/codesearchnet/tokenizer_build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Usage: 4 | tokenizers_huggingface_build.py [options] 5 | tokenizers_huggingface_build.py [options] 6 | 7 | Options: 8 | -h --help Show this screen. 9 | --config FILE Specify HOCON config file. [default: ./conf/default.conf] 10 | --debug Enable debug routines. 
[default: False] 11 | """ 12 | 13 | 14 | from docopt import docopt 15 | from loguru import logger 16 | import sys 17 | import torch 18 | from dpu_utils.utils import run_and_debug 19 | from pyhocon import ConfigFactory, ConfigTree 20 | 21 | from codenets.codesearchnet.training_ctx import CodeSearchTrainingContext 22 | from codenets.codesearchnet.tokenizer_recs import build_most_common_tokens 23 | 24 | print("Torch version", torch.__version__) 25 | 26 | logger.remove() 27 | logger.add(sys.stderr, level="DEBUG", colorize=True, backtrace=False) 28 | 29 | 30 | def run(args, tag_in_vcs=False) -> None: 31 | conf_file = args["--config"] 32 | logger.info(f"config file {conf_file}") 33 | 34 | conf: ConfigTree = ConfigFactory.parse_file(conf_file) 35 | logger.info(f"config {conf}") 36 | 37 | # logger.info(f"Build Training Context from config {conf_file}") 38 | # training_ctx = CodeSearchTrainingContext.build_context_from_hocon(conf) 39 | 40 | # training_ctx.build_tokenizers(from_dataset_type=DatasetType.TRAIN) 41 | 42 | logger.info(f"Reload Training Context from config {conf_file} with built tokenizers") 43 | training_ctx = CodeSearchTrainingContext.build_context_from_hocon(conf) 44 | 45 | txt = "python def toto():" 46 | logger.info(f"encoded {training_ctx.tokenize_code_sentences([txt])}") 47 | txt = "go function getCounts() { return 0 }" 48 | logger.info(f"encoded {training_ctx.tokenize_code_sentences([txt])}") 49 | 50 | most_commons = build_most_common_tokens( 51 | training_ctx.train_dirs, training_ctx.train_data_params, training_ctx.tokenizers_build_path, 52 | parallelize=False 53 | ) 54 | logger.info(f"most_commons {most_commons}") 55 | 56 | 57 | if __name__ == "__main__": 58 | args = docopt(__doc__) 59 | run_and_debug(lambda: run(args), args["--debug"]) 60 | -------------------------------------------------------------------------------- /codenets/codesearchnet/tokenizer_recs.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | 3 | from typing import Iterable, List, Optional, Tuple, Dict, cast 4 | import numpy as np 5 | import os 6 | from loguru import logger 7 | from pathlib import Path 8 | import pickle 9 | 10 | import time 11 | 12 | from pyhocon import ConfigTree 13 | from codenets.recordable import Recordable, RecordableMapping, DictRecordable 14 | from codenets.codesearchnet.data import DatasetParams 15 | from codenets.codesearchnet.copied_code.metadata import Metadata, append_metadata, build_tokenizer_metadata 16 | 17 | 18 | class TokenizerRecordable(Recordable): 19 | @abstractmethod 20 | def tokenize(self, text: str, **kwargs) -> List[str]: 21 | pass 22 | 23 | @abstractmethod 24 | def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]: 25 | pass 26 | 27 | @abstractmethod 28 | def unk_token(self) -> str: 29 | pass 30 | 31 | # @abstractmethod 32 | # def pad_token(self) -> str: 33 | # pass 34 | 35 | @abstractmethod 36 | def encode_sentence(self, sentence: str, max_length: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]: 37 | pass 38 | 39 | @abstractmethod 40 | def encode_sentences( 41 | self, sentences: List[str], max_length: Optional[int] = None 42 | ) -> Tuple[np.ndarray, np.ndarray]: 43 | pass 44 | 45 | @abstractmethod 46 | def encode_tokens( 47 | self, tokens: Iterable[List[str]], max_length: Optional[int] = None 48 | ) -> Tuple[np.ndarray, np.ndarray]: 49 | pass 50 | 51 | @abstractmethod 52 | def decode_sequence(self, tokens_sequence: List[int]) -> str: 53 | pass 54 | 55 | @abstractmethod 56 | def 
decode_sequences(self, tokens_sequences: Iterable[List[int]]) -> List[str]: 57 | pass 58 | 59 | @abstractmethod 60 | def add_special_tokens(self, special_tokens: List[str]) -> bool: 61 | pass 62 | 63 | 64 | def build_most_common_tokens( 65 | data_dirs: List[Path], 66 | data_params: DatasetParams, 67 | build_path: Path, 68 | max_files_per_dir: Optional[int] = None, 69 | parallelize: bool = True, 70 | ) -> Dict[str, List[Tuple[str, int]]]: 71 | 72 | start = time.time() 73 | 74 | logger.info(f"Build metadata for {data_dirs}") 75 | 76 | _, code_language_metadata_lists = build_tokenizer_metadata( 77 | data_dirs=data_dirs, 78 | max_files_per_dir=max_files_per_dir, 79 | parallelize=parallelize, 80 | use_subtokens=data_params.use_subtokens, 81 | mark_subtoken_end=data_params.mark_subtoken_end, 82 | ) 83 | 84 | logger.info("Merging metadata") 85 | 86 | # merge metadata if necessary 87 | per_code_language_metadata: Dict[str, Metadata] = {} 88 | for (language, raw_per_language_metadata) in code_language_metadata_lists.items(): 89 | logger.info(f"Build vocabulary for {language}") 90 | per_code_language_metadata[language] = append_metadata( 91 | "code", 92 | vocab_size=data_params.vocab_size, 93 | vocab_count_threshold=data_params.vocab_count_threshold, 94 | pct_bpe=data_params.pct_bpe, 95 | raw_metadata_list=raw_per_language_metadata, 96 | ) 97 | common_tokens: Dict[str, List[Tuple[str, int]]] = {} 98 | for (language, md) in per_code_language_metadata.items(): 99 | common_tokens[language] = md.common_tokens 100 | 101 | end = time.time() 102 | 103 | time_p = end - start 104 | logger.info(f"Most Common Tokens: {time_p} sec") 105 | 106 | pickle.dump(common_tokens, open("./checkpoints/tmp_common_tokens.p", "wb")) 107 | 108 | common_tokens_dict = DictRecordable(common_tokens) 109 | os.makedirs(build_path, exist_ok=True) 110 | records = RecordableMapping({"common_tokens": common_tokens_dict}) 111 | records.save(build_path) 112 | 113 | return common_tokens_dict 114 | 115 | 116 | def load_query_code_tokenizers_from_hocon(conf: ConfigTree) -> Optional[Tuple[TokenizerRecordable, RecordableMapping]]: 117 | build_path = Path(conf["tokenizers.build_path"]) 118 | 119 | if not os.path.exists(build_path): 120 | logger.error(f"Could find {build_path} where tokenizers should have been built and stored") 121 | return None 122 | 123 | records = RecordableMapping.load(build_path) 124 | if "query_tokenizer" in records and "code_tokenizers" in records: 125 | query_tokenizer = cast(TokenizerRecordable, records["query_tokenizer"]) 126 | code_tokenizers = cast(RecordableMapping, records["code_tokenizers"]) 127 | 128 | return query_tokenizer, code_tokenizers 129 | else: 130 | logger.error(f"Couldn't query_tokenizer/code_tokenizers recordables in path {build_path}") 131 | return None 132 | -------------------------------------------------------------------------------- /codenets/main.py: -------------------------------------------------------------------------------- 1 | """Dummy Main of the project.""" 2 | 3 | 4 | def main(): 5 | print("hello") 6 | 7 | 8 | if __name__ == "__main__": 9 | main() 10 | -------------------------------------------------------------------------------- /codenets/save.py: -------------------------------------------------------------------------------- 1 | """Utils to save Recordables in rotating mode""" 2 | 3 | import os 4 | from pathlib import Path 5 | import shutil 6 | from typing import Union, Type, TypeVar, Optional 7 | from codenets.recordable import Recordable 8 | 9 | 10 | def 
rotating_save_records(path: Union[Path, str], prefix: str, rec: Recordable, nb: int = 5) -> bool: 11 | root_path = Path(path) / prefix 12 | if not os.path.isdir(root_path): 13 | os.makedirs(root_path) 14 | 15 | paths = [] 16 | first_empty_path = None 17 | saved = True 18 | for i in range(nb): 19 | path_i = root_path / f"{prefix}_{i}" 20 | if not os.path.exists(path_i) and first_empty_path is None: 21 | first_empty_path = path_i 22 | os.makedirs(first_empty_path) 23 | paths.append(path_i) 24 | 25 | if first_empty_path is not None: 26 | saved = saved and rec.save(first_empty_path) 27 | else: 28 | first = paths[0] 29 | 30 | shutil.rmtree(first) 31 | for pth in paths[1:]: 32 | os.rename(pth, first) 33 | first = pth 34 | saved = saved and rec.save(paths[-1]) 35 | 36 | return saved 37 | 38 | 39 | def save_records_direct(path: Union[Path, str], rec: Recordable) -> bool: 40 | if not os.path.isdir(path): 41 | os.makedirs(path) 42 | 43 | return rec.save(path) 44 | 45 | 46 | def save_records_best(path: Union[Path, str], rec: Recordable, suffix: Optional[str] = None) -> bool: 47 | prefix = os.path.basename(path) 48 | if suffix is not None: 49 | best_path = Path(path) / f"{prefix}_best_{suffix}" 50 | else: 51 | best_path = Path(path) / f"{prefix}_best" 52 | if not os.path.isdir(best_path): 53 | os.makedirs(best_path) 54 | 55 | return rec.save(best_path) 56 | 57 | 58 | def save_records_last(output_dir: Union[Path, str], rec: Recordable) -> bool: 59 | return rotating_save_records(os.path.dirname(output_dir), os.path.basename(output_dir), rec) 60 | 61 | 62 | Recordable_T = TypeVar("Recordable_T", bound="Recordable") 63 | 64 | 65 | def rotating_recover_records( 66 | cls: Type[Recordable_T], path: Union[Path, str], prefix: str, nb: int = 5 67 | ) -> Optional[Recordable_T]: 68 | last_path = None 69 | for i in range(nb): 70 | path_i = Path(path) / prefix / f"{prefix}_{i}" 71 | if os.path.exists(path_i): 72 | last_path = path_i 73 | 74 | if last_path is not None: 75 | return cls.load(last_path) 76 | else: 77 | return None 78 | 79 | 80 | def recover_records_best( 81 | cls: Type[Recordable_T], recover_dir: Union[Path, str], nb: int = 5, *args, **kwargs 82 | ) -> Optional[Recordable_T]: 83 | prefix = os.path.basename(recover_dir) 84 | best_path = Path(recover_dir) / f"{prefix}_best" 85 | if best_path.exists(): 86 | return cls.load(best_path) 87 | else: 88 | return None 89 | 90 | 91 | def recover_records_direct( 92 | cls: Type[Recordable_T], recover_dir: Union[Path, str], *args, **kwargs 93 | ) -> Optional[Recordable_T]: 94 | p = Path(recover_dir) 95 | if p.exists(): 96 | return cls.load(p) 97 | else: 98 | return None 99 | 100 | 101 | def recover_records_last(cls: Type[Recordable_T], recover_dir: Union[Path, str]) -> Optional[Recordable_T]: 102 | return rotating_recover_records(cls, os.path.dirname(recover_dir), os.path.basename(recover_dir)) 103 | -------------------------------------------------------------------------------- /codenets/tensorboard_utils.py: -------------------------------------------------------------------------------- 1 | # Some 2 | 3 | from tensorboardX import SummaryWriter 4 | from pathlib import Path 5 | import datetime 6 | from loguru import logger 7 | from typing import Dict 8 | 9 | from tensorboard.backend.event_processing import event_accumulator 10 | 11 | 12 | def tensorboard_event_accumulator( 13 | file: str, 14 | loaded_scalars: int = 0, # load all scalars by default 15 | loaded_images: int = 4, # load 4 images by default 16 | loaded_compressed_histograms: int = 500, # load one 
histogram by default 17 | loaded_histograms: int = 1, # load one histogram by default 18 | loaded_audio: int = 4, # loads 4 audio by default 19 | ): 20 | """Read a Tensorboard event_accumulator from a file""" 21 | ea = event_accumulator.EventAccumulator( 22 | file, 23 | size_guidance={ # see below regarding this argument 24 | event_accumulator.COMPRESSED_HISTOGRAMS: loaded_compressed_histograms, 25 | event_accumulator.IMAGES: loaded_images, 26 | event_accumulator.AUDIO: loaded_audio, 27 | event_accumulator.SCALARS: loaded_scalars, 28 | event_accumulator.HISTOGRAMS: loaded_histograms, 29 | }, 30 | ) 31 | ea.Reload() 32 | return ea 33 | 34 | 35 | class Tensorboard: 36 | """ 37 | Tensorboard manager 38 | 39 | This manager is associated to a: 40 | 41 | - experiment 42 | - a unique ID for the current run (one experiment can be run many times) 43 | - groups of metrics (like "train" or "val") 44 | - sub-groups of metrics (like train/bash or val/epoch) 45 | """ 46 | 47 | def __init__(self, experiment_id, output_dir="./runs", unique_id=None, flush_secs=10): 48 | self.experiment_id = experiment_id 49 | self.output_dir = Path(output_dir) 50 | if unique_id is None: 51 | unique_id = datetime.datetime.now().isoformat(timespec="seconds") 52 | self.path = self.output_dir / f"{experiment_id}_{unique_id}" 53 | logger.debug(f"Writing TensorBoard events locally to {self.path}") 54 | self.writers: Dict[str, SummaryWriter] = {} 55 | self.flush_secs = flush_secs 56 | 57 | def _get_writer(self, group: str = "") -> SummaryWriter: 58 | if group not in self.writers: 59 | logger.debug(f"Adding group {group} to writers ({self.writers.keys()})") 60 | self.writers[group] = SummaryWriter(f"{str(self.path)}_{group}", flush_secs=self.flush_secs) 61 | return self.writers[group] 62 | 63 | def add_scalars(self, metrics: dict, global_step: int, group=None, sub_group="") -> None: 64 | for key, val in metrics.items(): 65 | cur_name = "/".join([sub_group, key]) 66 | self._get_writer(group).add_scalar(cur_name, val, global_step) 67 | -------------------------------------------------------------------------------- /codenets/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, List, Optional 2 | import os 3 | import re 4 | from dpu_utils.codeutils import split_identifier_into_parts 5 | 6 | # from dpu_utils.utils import Path 7 | from pathlib import Path 8 | import numpy as np 9 | import glob 10 | import base64 11 | from pickle import dumps, loads 12 | 13 | IDENTIFIER_TOKEN_REGEX = re.compile("[_a-zA-Z][_a-zA-Z0-9]*") 14 | 15 | 16 | def listdir_nohidden_gen(path): 17 | for f in os.listdir(path): 18 | if not f.startswith('.'): 19 | yield f 20 | 21 | 22 | def listdir_nohidden(path): 23 | return list(listdir_nohidden_gen(path)) 24 | 25 | 26 | def runtime_import(class_name: str): 27 | import importlib 28 | 29 | """ 30 | Runtime import from a string using "." to split module & class names 31 | 32 | Args: 33 | class_name (str): the class name to split according to "." and load dynamically modules & class 34 | 35 | Returns: 36 | Class: The imported class 37 | """ 38 | components = class_name.split(".") 39 | print(f">>> class_name {class_name}<<<<") 40 | mod = getattr(importlib.import_module(".".join(components[:-1])), components[-1]) 41 | return mod 42 | 43 | 44 | def full_classname(cls): 45 | """Return full class name with modules""" 46 | return cls.__module__ + "." + cls.__name__ 47 | 48 | 49 | def instance_full_classname(o): 50 | # o.__module__ + "." 
+ o.__class__.__qualname__ is an example in 51 | # this context of H.L. Mencken's "neat, plausible, and wrong." 52 | # Python makes no guarantees as to whether the __module__ special 53 | # attribute is defined, so we take a more circumspect approach. 54 | # Alas, the module name is explicitly excluded from __qualname__ 55 | # in Python 3. 56 | module = o.__class__.__module__ 57 | if module is None or module == str.__class__.__module__: 58 | return o.__class__.__name__ # Avoid reporting __builtin__ 59 | else: 60 | return module + "." + o.__class__.__name__ 61 | 62 | 63 | def _to_subtoken_stream(input_stream: Iterable[str], mark_subtoken_end: bool) -> Iterable[str]: 64 | """Generate chopped strings into sub-tokens strings (like snake-case)""" 65 | for token in input_stream: 66 | if IDENTIFIER_TOKEN_REGEX.match(token): 67 | yield from split_identifier_into_parts(token) 68 | if mark_subtoken_end: 69 | yield "" 70 | else: 71 | yield token 72 | 73 | 74 | def expand_data_path(data_path: str) -> List[Path]: 75 | """ 76 | Expand data path as a simple directory or if a file, searches for directories in the file 77 | 78 | Args: 79 | data_path: A path to either a file or a directory. If it's a file, we interpret it as a list of 80 | data directories. 81 | 82 | Returns: 83 | List of data directories (potentially just data_path) 84 | """ 85 | data_rpath = Path(data_path) 86 | 87 | if data_rpath.is_dir(): 88 | return [data_rpath] 89 | 90 | data_dirs: List[Path] = [] 91 | with open(data_rpath) as f: 92 | for fl in map(Path, f.read().splitlines()): 93 | if fl.is_absolute(): 94 | data_dirs.append(fl) 95 | else: 96 | data_dirs.append(data_rpath.parent / fl) 97 | 98 | # data_dirs.extend(map(Path)) 99 | return data_dirs 100 | 101 | 102 | def get_data_files_from_directory(data_dirs: List[Path], max_files_per_dir: Optional[int] = None) -> List[Path]: 103 | """Search all *.jsonl.gz files in a multiple paths and concatenate them""" 104 | files: List[Path] = [] 105 | for data_dir in data_dirs: 106 | dir_files = [Path(path) for path in glob.iglob(os.path.join(data_dir, "*.jsonl.gz"), recursive=True)] 107 | # dir_files = data_dir.get_filtered_files_in_dir("*.jsonl.gz") 108 | if max_files_per_dir: 109 | dir_files = sorted(dir_files)[: int(max_files_per_dir)] 110 | files += dir_files 111 | 112 | np.random.shuffle(np.array(files)) # This avoids having large_file_0, large_file_1, ... 
subsequences 113 | return files 114 | 115 | 116 | # Some streaming pickles (not used) 117 | 118 | 119 | def stream_dump(iterable_to_pickle, file_obj): 120 | """ 121 | Dump contents of an iterable iterable_to_pickle to file_obj, a file 122 | opened in write mode 123 | """ 124 | for elt in iterable_to_pickle: 125 | stream_dump_elt(elt, file_obj) 126 | 127 | 128 | def stream_dump_elt(elt_to_pickle, file_obj): 129 | """Dump one element to file_obj, a file opened in write mode""" 130 | pickled_elt = dumps(elt_to_pickle) 131 | encoded = base64.b64encode(pickled_elt) 132 | file_obj.write(encoded) 133 | 134 | # record separator is a blank line 135 | # (since pickled_elt as base64 encoded cannot contain its own newlines) 136 | file_obj.write(b"\n\n") 137 | 138 | 139 | def stream_load(file_obj): 140 | """ 141 | Load contents from file_obj, returning a generator that yields one 142 | element at a time 143 | """ 144 | cur_elt = [] 145 | for line in file_obj: 146 | if line == b"\n": 147 | encoded_elt = b"".join(cur_elt) 148 | try: 149 | pickled_elt = base64.b64decode(encoded_elt) 150 | elt = loads(pickled_elt) 151 | except EOFError: 152 | print("EOF found while unpickling data") 153 | print(pickled_elt) 154 | raise StopIteration 155 | cur_elt = [] 156 | yield elt 157 | else: 158 | cur_elt.append(line) 159 | -------------------------------------------------------------------------------- /conf/code_search_bert_2020_02_01_1500.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | training { 4 | name = "code_search_bert" 5 | iteration = "2020_02_01_15_00" 6 | 7 | model { 8 | type = "single_query_multi_code" 9 | query_encoder = ${bert} 10 | code_encoder = ${bert} 11 | } 12 | } -------------------------------------------------------------------------------- /conf/code_search_bert_2020_02_03_20_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | dataset { 4 | common_params { 5 | parallelize = false 6 | } 7 | } 8 | 9 | training { 10 | name = "code_search_bert" 11 | iteration = "2020_02_03_20_00" 12 | 13 | model { 14 | type = "single_query_multi_code" 15 | query_encoder = ${bert} 16 | code_encoder = ${bert} 17 | } 18 | } -------------------------------------------------------------------------------- /conf/code_search_bert_lg_2020_02_04_15_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | tokenizers { 4 | build_path = "./build_tokenizers/with_lang" 5 | } 6 | 7 | dataset { 8 | common_params { 9 | parallelize = false 10 | do_lowercase = true 11 | special_tokens = ["", ""] 12 | } 13 | } 14 | 15 | training { 16 | name = "code_search_bert" 17 | iteration = "2020_02_04_21_00" 18 | 19 | } -------------------------------------------------------------------------------- /conf/code_search_bert_lg_2020_02_04_21_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | tokenizers { 4 | build_path = "./build_tokenizers/with_lang" 5 | } 6 | 7 | dataset { 8 | common_params { 9 | parallelize = false 10 | do_lowercase = true 11 | special_tokens = ["", ""] 12 | } 13 | } 14 | 15 | training { 16 | name = "code_search_bert" 17 | iteration = "2020_02_04_21_00" 18 | } -------------------------------------------------------------------------------- /conf/code_search_bert_lg_2020_02_05_00_00.conf: 
-------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | # bert { 4 | # hidden_size = 256 5 | # vocab_size = ${common_vocab_size} 6 | # intermediate_size = 1024 7 | # num_hidden_layers = 6 8 | # num_attention_heads = 8 9 | # } 10 | 11 | tokenizers { 12 | build_path = "./build_tokenizers/with_lang" 13 | } 14 | 15 | dataset { 16 | common_params { 17 | parallelize = false 18 | do_lowercase = true 19 | special_tokens = ["", ""] 20 | } 21 | } 22 | 23 | training { 24 | name = "code_search_bert_lg" 25 | iteration = "2020_02_05_00_00" 26 | 27 | batch_size { 28 | train = 170 29 | val = 170 30 | test = 170 31 | } 32 | 33 | model { 34 | type = "single_query_single_code" 35 | output_size = 128 36 | query_encoder { 37 | hidden_size = ${training.model.output_size} 38 | vocab_size = ${common_vocab_size} 39 | intermediate_size = 512 40 | num_hidden_layers = 3 41 | num_attention_heads = 8 42 | } 43 | code_encoder { 44 | hidden_size = ${training.model.output_size} 45 | vocab_size = ${common_vocab_size} 46 | intermediate_size = 1024 47 | num_hidden_layers = 6 48 | num_attention_heads = 8 49 | } 50 | } 51 | 52 | } -------------------------------------------------------------------------------- /conf/code_search_bert_lg_2020_02_06_18_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | tokenizers { 4 | build_path = "./build_tokenizers/with_lang" 5 | } 6 | 7 | dataset { 8 | common_params { 9 | parallelize = false 10 | do_lowercase = true 11 | special_tokens = ["", ""] 12 | } 13 | } 14 | 15 | training { 16 | name = "code_search_bert_lg" 17 | iteration = "2020_02_06_18_00" 18 | 19 | batch_size { 20 | train = 200 21 | val = 200 22 | test = 200 23 | } 24 | 25 | model { 26 | type = "single_query_single_code" 27 | output_size = 64 28 | query_encoder { 29 | hidden_size = ${training.model.output_size} 30 | vocab_size = ${common_vocab_size} 31 | intermediate_size = 512 32 | num_hidden_layers = 3 33 | num_attention_heads = 8 34 | } 35 | code_encoder { 36 | hidden_size = ${training.model.output_size} 37 | vocab_size = ${common_vocab_size} 38 | intermediate_size = 1024 39 | num_hidden_layers = 6 40 | num_attention_heads = 8 41 | } 42 | } 43 | 44 | } -------------------------------------------------------------------------------- /conf/code_search_bert_lg_2020_02_06_22_30.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | tokenizers { 4 | build_path = "./build_tokenizers/with_lang" 5 | } 6 | 7 | dataset { 8 | common_params { 9 | parallelize = false 10 | do_lowercase = true 11 | special_tokens = ["", ""] 12 | } 13 | } 14 | 15 | training { 16 | name = "code_search_bert_lg" 17 | iteration = "2020_02_06_22_30" 18 | 19 | batch_size { 20 | train = 170 21 | val = 170 22 | test = 170 23 | } 24 | 25 | model { 26 | type = "single_query_single_code" 27 | output_size = 256 28 | query_encoder { 29 | hidden_size = ${training.model.output_size} 30 | vocab_size = ${common_vocab_size} 31 | intermediate_size = 512 32 | num_hidden_layers = 3 33 | num_attention_heads = 8 34 | } 35 | code_encoder { 36 | hidden_size = ${training.model.output_size} 37 | vocab_size = ${common_vocab_size} 38 | intermediate_size = 1024 39 | num_hidden_layers = 6 40 | num_attention_heads = 8 41 | } 42 | } 43 | 44 | } -------------------------------------------------------------------------------- /conf/code_search_bert_lg_2020_02_07_10_00.conf: 
-------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | tokenizers { 4 | build_path = "./build_tokenizers/with_lang" 5 | } 6 | 7 | dataset { 8 | common_params { 9 | parallelize = false 10 | do_lowercase = true 11 | special_tokens = ["", ""] 12 | } 13 | } 14 | 15 | training { 16 | short_circuit = True 17 | name = "code_search_bert_lg" 18 | iteration = "2020_02_07_10_00" 19 | 20 | model { 21 | training_ctx_class = "codenets.codesearchnet.single_branch_ctx.SingleBranchTrainingContext" 22 | output_size = 256 23 | query_encoder { 24 | hidden_size = ${training.model.output_size} 25 | vocab_size = ${common_vocab_size} 26 | intermediate_size = 512 27 | num_hidden_layers = 3 28 | num_attention_heads = 8 29 | } 30 | code_encoder { 31 | hidden_size = ${training.model.output_size} 32 | vocab_size = ${common_vocab_size} 33 | intermediate_size = 1024 34 | num_hidden_layers = 6 35 | num_attention_heads = 8 36 | } 37 | } 38 | 39 | batch_size { 40 | train = 170 41 | val = 170 42 | test = 170 43 | } 44 | 45 | device = "cpu" 46 | wandb = false 47 | tensorboard = false 48 | 49 | } -------------------------------------------------------------------------------- /conf/code_search_bert_query_1_code_1_2020_02_10_11_00 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | tokenizers { 4 | build_path = "./build_tokenizers/with_lang_query_1_code_1" 5 | } 6 | 7 | dataset { 8 | common_params { 9 | parallelize = false 10 | do_lowercase = true 11 | special_tokens = ["", ""] 12 | } 13 | } 14 | 15 | training { 16 | short_circuit = False 17 | name = "code_search_bert_query_1_code_1" 18 | iteration = "2020_02_10_11_00" 19 | tokenizer_type = "query_1_code_1" 20 | model { 21 | training_ctx_class = "codenets.codesearchnet.single_branch_ctx.SingleBranchTrainingContext" 22 | output_size = 128 23 | query_encoder { 24 | hidden_size = ${training.model.output_size} 25 | vocab_size = ${common_vocab_size} 26 | intermediate_size = 512 27 | num_hidden_layers = 3 28 | num_attention_heads = 8 29 | } 30 | code_encoder { 31 | hidden_size = ${training.model.output_size} 32 | vocab_size = ${common_vocab_size} 33 | intermediate_size = 1024 34 | num_hidden_layers = 6 35 | num_attention_heads = 8 36 | } 37 | } 38 | 39 | batch_size { 40 | train = 170 41 | val = 170 42 | test = 170 43 | } 44 | 45 | device = "cuda" 46 | wandb = true 47 | tensorboard = true 48 | 49 | } -------------------------------------------------------------------------------- /conf/code_search_bert_query_1_code_1_2020_02_10_11_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | tokenizers { 4 | build_path = "./build_tokenizers/with_lang_query_1_code_1" 5 | } 6 | 7 | dataset { 8 | common_params { 9 | parallelize = false 10 | do_lowercase = true 11 | special_tokens = ["", ""] 12 | } 13 | } 14 | 15 | training { 16 | short_circuit = False 17 | name = "code_search_bert_query_1_code_1" 18 | iteration = "2020_02_10_11_00" 19 | tokenizer_type = "query_1_code_1" 20 | model { 21 | training_ctx_class = "codenets.codesearchnet.single_branch_ctx.SingleBranchTrainingContext" 22 | output_size = 128 23 | query_encoder { 24 | hidden_size = ${training.model.output_size} 25 | vocab_size = ${common_vocab_size} 26 | intermediate_size = 512 27 | num_hidden_layers = 3 28 | num_attention_heads = 8 29 | } 30 | code_encoder { 31 | hidden_size = ${training.model.output_size} 32 | vocab_size = 
${common_vocab_size} 33 | intermediate_size = 1024 34 | num_hidden_layers = 6 35 | num_attention_heads = 8 36 | } 37 | } 38 | 39 | batch_size { 40 | train = 170 41 | val = 170 42 | test = 170 43 | } 44 | 45 | device = "cuda" 46 | wandb = true 47 | tensorboard = true 48 | 49 | } -------------------------------------------------------------------------------- /conf/code_search_bert_query_1_code_1_2020_02_11_22_00 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | tokenizers { 4 | build_path = "./build_tokenizers/with_lang_query_1_code_1" 5 | } 6 | 7 | dataset { 8 | common_params { 9 | parallelize = false 10 | do_lowercase = true 11 | special_tokens = ["", ""] 12 | } 13 | } 14 | 15 | training { 16 | short_circuit = false 17 | 18 | device = "cuda" 19 | wandb = true 20 | tensorboard = true 21 | 22 | name = "code_search_bert_query_1_code_1" 23 | iteration = "2020_02_11_22_00" 24 | tokenizer_type = "query_1_code_1" 25 | model { 26 | training_ctx_class = "codenets.codesearchnet.query_1_code_1.training_ctx.Query1Code1Ctx" 27 | output_size = 64 28 | query_encoder { 29 | hidden_size = ${training.model.output_size} 30 | vocab_size = ${common_vocab_size} 31 | intermediate_size = 512 32 | num_hidden_layers = 3 33 | num_attention_heads = 8 34 | } 35 | code_encoder { 36 | hidden_size = ${training.model.output_size} 37 | vocab_size = ${common_vocab_size} 38 | intermediate_size = 512 39 | num_hidden_layers = 6 40 | num_attention_heads = 8 41 | } 42 | } 43 | 44 | batch_size { 45 | train = 256 46 | val = 256 47 | test = 256 48 | } 49 | 50 | } -------------------------------------------------------------------------------- /conf/code_search_bert_query_1_code_1_2020_02_11_22_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | tokenizers { 4 | build_path = "./build_tokenizers/with_lang_query_1_code_1" 5 | } 6 | 7 | dataset { 8 | common_params { 9 | parallelize = false 10 | do_lowercase = true 11 | special_tokens = ["", ""] 12 | } 13 | } 14 | 15 | training { 16 | short_circuit = false 17 | 18 | device = "cuda" 19 | wandb = true 20 | tensorboard = true 21 | 22 | name = "code_search_bert_query_1_code_1" 23 | iteration = "2020_02_11_22_00" 24 | tokenizer_type = "query_1_code_1" 25 | model { 26 | training_ctx_class = "codenets.codesearchnet.query_1_code_1.training_ctx.Query1Code1Ctx" 27 | output_size = 64 28 | query_encoder { 29 | hidden_size = ${training.model.output_size} 30 | vocab_size = ${common_vocab_size} 31 | intermediate_size = 512 32 | num_hidden_layers = 3 33 | num_attention_heads = 8 34 | } 35 | code_encoder { 36 | hidden_size = ${training.model.output_size} 37 | vocab_size = ${common_vocab_size} 38 | intermediate_size = 512 39 | num_hidden_layers = 6 40 | num_attention_heads = 8 41 | } 42 | } 43 | 44 | batch_size { 45 | train = 256 46 | val = 256 47 | test = 256 48 | } 49 | 50 | } -------------------------------------------------------------------------------- /conf/code_search_bert_query_code_siamese_2020_02_12_00_00 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | 
special_tokens = ["", "", ""] 15 | } 16 | } 17 | 18 | training { 19 | short_circuit = false 20 | 21 | device = "cuda" 22 | wandb = true 23 | tensorboard = true 24 | 25 | name = "code_search_bert_query_code_siamese" 26 | iteration = "2020_02_12_00_00" 27 | tokenizer_type = "query_code_siamese" 28 | model { 29 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 30 | output_size = 512 31 | encoder { 32 | hidden_size = ${training.model.output_size} 33 | vocab_size = ${common_vocab_size} 34 | intermediate_size = 1024 35 | num_hidden_layers = 6 36 | num_attention_heads = 8 37 | } 38 | } 39 | 40 | batch_size { 41 | train = 128 42 | val = 128 43 | test = 128 44 | } 45 | 46 | } -------------------------------------------------------------------------------- /conf/code_search_bert_query_code_siamese_2020_02_12_00_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | } 16 | } 17 | 18 | training { 19 | short_circuit = false 20 | 21 | device = "cuda" 22 | wandb = true 23 | tensorboard = true 24 | 25 | name = "code_search_bert_query_code_siamese" 26 | iteration = "2020_02_12_00_00" 27 | tokenizer_type = "query_code_siamese" 28 | model { 29 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 30 | output_size = 512 31 | encoder { 32 | hidden_size = ${training.model.output_size} 33 | vocab_size = ${common_vocab_size} 34 | intermediate_size = 1024 35 | num_hidden_layers = 6 36 | num_attention_heads = 8 37 | } 38 | } 39 | 40 | batch_size { 41 | train = 128 42 | val = 128 43 | test = 128 44 | } 45 | 46 | } -------------------------------------------------------------------------------- /conf/code_search_bert_query_code_siamese_2020_02_14_16_00 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | } 16 | } 17 | 18 | training { 19 | short_circuit = false 20 | 21 | device = "cuda" 22 | wandb = true 23 | tensorboard = true 24 | 25 | name = "code_search_bert_query_code_siamese" 26 | iteration = "2020_02_14_16_00" 27 | tokenizer_type = "query_code_siamese" 28 | model { 29 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 30 | output_size = 72 31 | encoder { 32 | hidden_size = ${training.model.output_size} 33 | vocab_size = ${common_vocab_size} 34 | intermediate_size = 256 35 | num_hidden_layers = 12 36 | num_attention_heads = 12 37 | } 38 | } 39 | 40 | batch_size { 41 | train = 100 42 | val = 100 43 | test = 100 44 | } 45 | 46 | } -------------------------------------------------------------------------------- /conf/code_search_bert_query_code_siamese_2020_02_14_16_00.conf: 
-------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | } 16 | } 17 | 18 | training { 19 | short_circuit = false 20 | 21 | device = "cuda" 22 | wandb = true 23 | tensorboard = true 24 | 25 | name = "code_search_bert_query_code_siamese" 26 | iteration = "2020_02_14_16_00" 27 | tokenizer_type = "query_code_siamese" 28 | model { 29 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 30 | output_size = 72 31 | encoder { 32 | hidden_size = ${training.model.output_size} 33 | vocab_size = ${common_vocab_size} 34 | intermediate_size = 256 35 | num_hidden_layers = 12 36 | num_attention_heads = 12 37 | } 38 | } 39 | 40 | batch_size { 41 | train = 100 42 | val = 100 43 | test = 100 44 | } 45 | 46 | } -------------------------------------------------------------------------------- /conf/code_search_bert_query_code_siamese_2020_02_15_14_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | } 16 | } 17 | 18 | training { 19 | short_circuit = true 20 | 21 | device = "cuda" 22 | wandb = false 23 | tensorboard = false 24 | 25 | name = "code_search_siamese" 26 | iteration = "2020_02_15_14_00" 27 | tokenizer_type = "query_code_siamese" 28 | # Temporary because Rust tokenizers do not manage common tokens 29 | common_tokens_file = "./pickles/common_tokens_"${training.tokenizer_type}"_"${iteration}".p" 30 | 31 | model { 32 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 33 | output_size = 72 34 | encoder { 35 | hidden_size = ${training.model.output_size} 36 | vocab_size = ${common_vocab_size} 37 | intermediate_size = 256 38 | num_hidden_layers = 12 39 | num_attention_heads = 12 40 | } 41 | } 42 | 43 | batch_size { 44 | train = 100 45 | val = 100 46 | test = 100 47 | } 48 | 49 | } -------------------------------------------------------------------------------- /conf/default.conf: -------------------------------------------------------------------------------- 1 | 2 | lang_ids { 3 | php = 0 4 | python = 1 5 | ruby = 2 6 | java = 3 7 | go = 4 8 | javascript = 5 9 | } 10 | 11 | common_vocab_size = 10000 12 | 13 | bert { 14 | hidden_size = 128 15 | vocab_size = ${common_vocab_size} 16 | intermediate_size = 512 17 | num_hidden_layers = 3 18 | num_attention_heads = 8 19 | } 20 | 21 | tokenizers { 22 | type = "TOKENIZER_TYPE" 23 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 24 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 25 | } 26 | 27 | dataset { 28 | root_dir = ${HOME}"/workspaces/tools/CodeSearchNet/resources" 29 | common_params { 30 | fraction_using_func_name=0.1 31 | min_len_func_name_for_query=12 32 | use_subtokens=False 33 | mark_subtoken_end=False 34 | 
code_max_num_tokens=200 35 | query_max_num_tokens=30 36 | use_bpe=True 37 | vocab_size=${common_vocab_size} 38 | pct_bpe=0.5 39 | vocab_count_threshold=10 40 | lang_ids = ${lang_ids} 41 | do_lowercase = true 42 | special_tokens = [""] 43 | parallelize = true 44 | use_lang_weights = False 45 | } 46 | 47 | train { 48 | dirs = ${dataset.root_dir}"/data_dirs_train.txt" 49 | params = ${dataset.common_params} 50 | } 51 | 52 | val { 53 | dirs = ${dataset.root_dir}"/data_dirs_valid.txt" 54 | params = ${dataset.common_params} 55 | } 56 | 57 | test { 58 | dirs = ${dataset.root_dir}"/data_dirs_test.txt" 59 | params = ${dataset.common_params} 60 | } 61 | 62 | queries_file = ${dataset.root_dir}"/queries.csv" 63 | } 64 | 65 | 66 | training { 67 | # The name of current experiment (can have several runs) 68 | name = "EXPERIMENT_NAME" 69 | # The unique id of current run 70 | iteration = "UNIQUE_RUN_ID" 71 | # The ID used to identify the pre-built pickled files 72 | # using the tokenizer defined above 73 | tokenizer_type = "TOKENIZER_ID" 74 | 75 | # Set that to true to test your run without slow-loading train dataset 76 | short_circuit = false 77 | 78 | device = "cuda" 79 | # deactivate wandb & tensorboard 80 | wandb = true 81 | tensorboard = true 82 | 83 | model { 84 | # IMPORTANT: the class representing Training Context 85 | training_ctx_class = "codenets.codesearchnet.query_1_code_1.training_ctx.Query1Code1Ctx" 86 | output_size = 64 87 | query_encoder { 88 | hidden_size = ${training.model.output_size} 89 | vocab_size = ${common_vocab_size} 90 | intermediate_size = 512 91 | num_hidden_layers = 3 92 | num_attention_heads = 8 93 | } 94 | code_encoder { 95 | hidden_size = ${training.model.output_size} 96 | vocab_size = ${common_vocab_size} 97 | intermediate_size = 512 98 | num_hidden_layers = 6 99 | num_attention_heads = 8 100 | } 101 | } 102 | 103 | # Training Hyper-Parameters 104 | seed = 0 105 | lr = 0.0001 106 | max_grad_norm = 1.0 107 | min_log_interval = 50 108 | start_epoch = 0 109 | epochs = 10 110 | 111 | batch_size { 112 | train = 256 113 | val = 256 114 | test = 256 115 | } 116 | 117 | loss { 118 | type = "softmax_cross_entropy" 119 | margin = 1.0 120 | } 121 | 122 | # Paths 123 | pickle_path = "./pickles" 124 | output_dir = "./checkpoints" 125 | tensorboard_path = "./runs" 126 | 127 | } -------------------------------------------------------------------------------- /conf/qc_ast_2020_03_13.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens = false # to do later 27 | #query_embeddings="sbert" 28 | fraction_using_func_name=0.1 29 | use_ast = "tree-sitter" 30 | ast_added_nodes = { 31 | "php": {"prefix": ""}, 32 | "java": {"prefix": "class Toto {", "suffix": "}"} 33 | } 34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]} 35 | 36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ] 37 | } 38 | } 39 | 40 | training { 41 | short_circuit = true 42 
| 43 | device = "cuda" 44 | wandb = false 45 | tensorboard = false 46 | 47 | name = "qc_ast" 48 | iteration = "2020_03_15" 49 | tokenizer_type = ${tokenizers.type} 50 | 51 | model { 52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 53 | encoder { 54 | hidden_size = 32 55 | vocab_size = ${common_vocab_size} 56 | intermediate_size = 128 57 | num_hidden_layers = 3 58 | num_attention_heads = 8 59 | } 60 | } 61 | lr = 0.001 62 | 63 | loss { 64 | type = "lambda_loss" 65 | } 66 | 67 | batch_size { 68 | #train = 400 69 | #val = 400 70 | #test = 400 71 | train = 5 72 | val = 5 73 | test = 5 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /conf/qc_ast_2020_03_15 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=1024 26 | use_subtokens = false # to do later 27 | #query_embeddings="sbert" 28 | fraction_using_func_name=0.1 29 | use_ast = "tree-sitter" 30 | ast_added_nodes = { 31 | "php": {"prefix": ""}, 32 | "java": {"prefix": "class Toto {", "suffix": "}"} 33 | } 34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]} 35 | 36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ] 37 | } 38 | } 39 | 40 | training { 41 | short_circuit = true 42 | 43 | device = "cuda" 44 | wandb = false 45 | tensorboard = false 46 | 47 | name = "qc_ast" 48 | iteration = "2020_03_15" 49 | tokenizer_type = ${tokenizers.type}"_ast" 50 | 51 | model { 52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 53 | encoder { 54 | hidden_size = 64 55 | vocab_size = ${common_vocab_size} 56 | intermediate_size = 512 57 | num_hidden_layers = 3 58 | num_attention_heads = 8 59 | } 60 | } 61 | lr = 0.00001 62 | 63 | loss { 64 | type = "softmax_cross_entropy" 65 | } 66 | 67 | batch_size { 68 | train = 8 69 | val = 8 70 | test = 8 71 | #train = 5 72 | #val = 5 73 | #test = 5 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /conf/qc_ast_2020_03_15.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=1024 26 | use_subtokens = false # to do later 27 | #query_embeddings="sbert" 28 | fraction_using_func_name=0.1 29 | use_ast = "tree-sitter" 30 | ast_added_nodes = { 31 | "php": {"prefix": ""}, 32 | "java": 
{"prefix": "class Toto {", "suffix": "}"} 33 | } 34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]} 35 | 36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ] 37 | } 38 | } 39 | 40 | training { 41 | short_circuit = true 42 | 43 | device = "cuda" 44 | wandb = false 45 | tensorboard = false 46 | 47 | name = "qc_ast" 48 | iteration = "2020_03_15" 49 | tokenizer_type = ${tokenizers.type}"_ast" 50 | 51 | model { 52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 53 | encoder { 54 | hidden_size = 64 55 | vocab_size = ${common_vocab_size} 56 | intermediate_size = 512 57 | num_hidden_layers = 3 58 | num_attention_heads = 8 59 | } 60 | } 61 | lr = 0.00001 62 | 63 | loss { 64 | type = "softmax_cross_entropy" 65 | } 66 | 67 | batch_size { 68 | train = 8 69 | val = 8 70 | test = 8 71 | #train = 5 72 | #val = 5 73 | #test = 5 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /conf/qc_ast_2020_03_17.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=1024 26 | use_subtokens = false # to do later 27 | #query_embeddings="sbert" 28 | fraction_using_func_name=0.1 29 | use_ast = "tree-sitter" 30 | ast_added_nodes = { 31 | "php": {"prefix": ""}, 32 | "java": {"prefix": "class Toto {", "suffix": "}"} 33 | } 34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]} 35 | 36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ] 37 | } 38 | } 39 | 40 | training { 41 | short_circuit = true 42 | 43 | device = "cuda" 44 | wandb = false 45 | tensorboard = false 46 | 47 | name = "qc_ast" 48 | iteration = "2020_03_15" 49 | tokenizer_type = ${tokenizers.type}"_ast" 50 | 51 | model { 52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 53 | encoder { 54 | hidden_size = 64 55 | vocab_size = ${common_vocab_size} 56 | intermediate_size = 512 57 | num_hidden_layers = 3 58 | num_attention_heads = 8 59 | } 60 | } 61 | lr = 0.00001 62 | 63 | loss { 64 | type = "softmax_cross_entropy" 65 | } 66 | 67 | batch_size { 68 | train = 8 69 | val = 8 70 | test = 8 71 | #train = 5 72 | #val = 5 73 | #test = 5 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /conf/qc_ast_2020_03_18 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", 
""] 24 | use_lang_weights = True 25 | code_max_num_tokens = 512 26 | use_subtokens = false # to do later 27 | #query_embeddings="sbert" 28 | fraction_using_func_name=0.1 29 | use_ast = "tree-sitter" 30 | ast_added_nodes = { 31 | "php": {"prefix": ""}, 32 | "java": {"prefix": "class Toto {", "suffix": "}"} 33 | } 34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]} 35 | 36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ] 37 | } 38 | } 39 | 40 | training { 41 | short_circuit = false 42 | 43 | device = "cuda" 44 | wandb = true 45 | tensorboard = false 46 | 47 | name = "qc_ast" 48 | iteration = "2020_03_18" 49 | tokenizer_type = ${tokenizers.type}"_ast_512" 50 | 51 | model { 52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 53 | encoder { 54 | hidden_size = 128 55 | vocab_size = 30370 56 | intermediate_size = 512 57 | num_hidden_layers = 3 58 | num_attention_heads = 8 59 | } 60 | } 61 | lr = 0.0001 62 | 63 | loss { 64 | type = "softmax_cross_entropy" 65 | } 66 | 67 | batch_size { 68 | train = 85 69 | val = 85 70 | test = 85 71 | #train = 5 72 | #val = 5 73 | #test = 5 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /conf/qc_ast_2020_03_18.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens = 512 26 | use_subtokens = false # to do later 27 | #query_embeddings="sbert" 28 | fraction_using_func_name=0.1 29 | use_ast = "tree-sitter" 30 | ast_added_nodes = { 31 | "php": {"prefix": ""}, 32 | "java": {"prefix": "class Toto {", "suffix": "}"} 33 | } 34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]} 35 | 36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ] 37 | } 38 | } 39 | 40 | training { 41 | short_circuit = false 42 | 43 | device = "cuda" 44 | wandb = true 45 | tensorboard = false 46 | 47 | name = "qc_ast" 48 | iteration = "2020_03_18" 49 | tokenizer_type = ${tokenizers.type}"_ast_512" 50 | 51 | model { 52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 53 | encoder { 54 | hidden_size = 128 55 | vocab_size = 30370 56 | intermediate_size = 512 57 | num_hidden_layers = 3 58 | num_attention_heads = 8 59 | } 60 | } 61 | lr = 0.0001 62 | 63 | loss { 64 | type = "softmax_cross_entropy" 65 | } 66 | 67 | batch_size { 68 | train = 85 69 | val = 85 70 | test = 85 71 | #train = 5 72 | #val = 5 73 | #test = 5 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /conf/qc_ast_2020_03_19.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 
10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens = 256 26 | use_subtokens = false # to do later 27 | #query_embeddings="sbert" 28 | fraction_using_func_name=0.1 29 | use_ast = "tree-sitter" 30 | ast_added_nodes = { 31 | "php": {"prefix": ""}, 32 | "java": {"prefix": "class Toto {", "suffix": "}"} 33 | } 34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]} 35 | 36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ] 37 | } 38 | } 39 | 40 | training { 41 | short_circuit = false 42 | 43 | device = "cuda" 44 | wandb = true 45 | tensorboard = false 46 | 47 | name = "qc_ast" 48 | iteration = "2020_03_19" 49 | tokenizer_type = ${tokenizers.type}"_ast_256" 50 | 51 | model { 52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 53 | encoder { 54 | hidden_size = 64 55 | vocab_size = 30370 56 | intermediate_size = 768 57 | num_hidden_layers = 3 58 | num_attention_heads = 8 59 | } 60 | } 61 | lr = 0.0001 62 | 63 | loss { 64 | type = "softmax_cross_entropy" 65 | } 66 | 67 | batch_size { 68 | train = 256 69 | val = 256 70 | test = 256 71 | #train = 5 72 | #val = 5 73 | #test = 5 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /conf/qc_ce_2020_02_23_01_00 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | } 19 | } 20 | 21 | training { 22 | short_circuit = false 23 | 24 | device = "cuda" 25 | wandb = true 26 | tensorboard = true 27 | 28 | name = "qc_ce" 29 | iteration = "2020_02_23_01_00" 30 | tokenizer_type = ${tokenizers.type} 31 | 32 | model { 33 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 34 | encoder { 35 | hidden_size = 32 36 | vocab_size = ${common_vocab_size} 37 | intermediate_size = 256 38 | num_hidden_layers = 2 39 | num_attention_heads = 8 40 | } 41 | } 42 | 43 | loss { 44 | type = "softmax_cross_entropy" 45 | } 46 | 47 | batch_size { 48 | train = 768 49 | val = 768 50 | test = 768 51 | } 52 | 53 | } -------------------------------------------------------------------------------- /conf/qc_ce_2020_02_23_01_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | } 19 | } 20 | 21 | training { 22 | short_circuit = false 23 | 24 | device = "cuda" 25 | wandb = true 26 | tensorboard = true 27 | 28 | name = "qc_ce" 29 | iteration = 
"2020_02_23_01_00" 30 | tokenizer_type = ${tokenizers.type} 31 | 32 | model { 33 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 34 | encoder { 35 | hidden_size = 32 36 | vocab_size = ${common_vocab_size} 37 | intermediate_size = 256 38 | num_hidden_layers = 2 39 | num_attention_heads = 8 40 | } 41 | } 42 | 43 | loss { 44 | type = "softmax_cross_entropy" 45 | } 46 | 47 | batch_size { 48 | train = 768 49 | val = 768 50 | test = 768 51 | } 52 | 53 | } -------------------------------------------------------------------------------- /conf/qc_ce_long_seq_2020_02_24.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | code_max_num_tokens=400 # mainly for JS which is more verbose 19 | } 20 | } 21 | 22 | training { 23 | short_circuit = false 24 | 25 | device = "cuda" 26 | wandb = true 27 | tensorboard = true 28 | 29 | name = "qc_ce" 30 | iteration = "2020_02_24" 31 | tokenizer_type = ${tokenizers.type} 32 | 33 | model { 34 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 35 | encoder { 36 | hidden_size = 32 37 | vocab_size = ${common_vocab_size} 38 | intermediate_size = 256 39 | num_hidden_layers = 2 40 | num_attention_heads = 8 41 | } 42 | } 43 | 44 | loss { 45 | type = "softmax_cross_entropy" 46 | } 47 | 48 | batch_size { 49 | train = 768 50 | val = 768 51 | test = 768 52 | } 53 | 54 | } -------------------------------------------------------------------------------- /conf/qc_ce_sbert_2020_02_27 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | } 29 | } 30 | 31 | training { 32 | short_circuit = false 33 | 34 | device = "cuda" 35 | wandb = true 36 | tensorboard = true 37 | 38 | name = "qc_ce_sbert" 39 | iteration = "2020_02_27" 40 | tokenizer_type = ${tokenizers.type} 41 | 42 | model { 43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 44 | encoder { 45 | hidden_size = 32 46 | vocab_size = ${common_vocab_size} 47 | intermediate_size = 256 48 | num_hidden_layers = 2 49 | num_attention_heads = 8 50 | } 51 | } 52 | 53 | loss { 54 | type = "lambda_loss" 55 | } 56 | 57 | batch_size { 58 | train = 425 59 | val = 425 60 | test = 425 61 | } 62 | 63 | } -------------------------------------------------------------------------------- /conf/qc_ce_sbert_2020_02_27.conf: 
-------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | } 29 | } 30 | 31 | training { 32 | short_circuit = false 33 | 34 | device = "cuda" 35 | wandb = true 36 | tensorboard = true 37 | 38 | name = "qc_ce_sbert" 39 | iteration = "2020_02_27" 40 | tokenizer_type = ${tokenizers.type} 41 | 42 | model { 43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 44 | encoder { 45 | hidden_size = 32 46 | vocab_size = ${common_vocab_size} 47 | intermediate_size = 256 48 | num_hidden_layers = 2 49 | num_attention_heads = 8 50 | } 51 | } 52 | 53 | loss { 54 | type = "lambda_loss" 55 | } 56 | 57 | batch_size { 58 | train = 425 59 | val = 425 60 | test = 425 61 | } 62 | 63 | } -------------------------------------------------------------------------------- /conf/qc_ce_sbert_2020_02_28 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | } 29 | } 30 | 31 | training { 32 | short_circuit = false 33 | 34 | device = "cuda" 35 | wandb = true 36 | tensorboard = true 37 | 38 | name = "qc_ce_sbert" 39 | iteration = "2020_02_28" 40 | tokenizer_type = ${tokenizers.type} 41 | 42 | model { 43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 44 | encoder { 45 | hidden_size = 64 46 | vocab_size = ${common_vocab_size} 47 | intermediate_size = 512 48 | num_hidden_layers = 4 49 | num_attention_heads = 8 50 | } 51 | } 52 | lr = 0.0001 53 | 54 | loss { 55 | type = "lambda_loss" 56 | } 57 | 58 | batch_size { 59 | train = 300 60 | val = 300 61 | test = 300 62 | } 63 | 64 | } -------------------------------------------------------------------------------- /conf/qc_ce_sbert_2020_02_28.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize 
= false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | } 29 | } 30 | 31 | training { 32 | short_circuit = false 33 | 34 | device = "cuda" 35 | wandb = true 36 | tensorboard = true 37 | 38 | name = "qc_ce_sbert" 39 | iteration = "2020_02_28" 40 | tokenizer_type = ${tokenizers.type} 41 | 42 | model { 43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 44 | encoder { 45 | hidden_size = 64 46 | vocab_size = ${common_vocab_size} 47 | intermediate_size = 512 48 | num_hidden_layers = 4 49 | num_attention_heads = 8 50 | } 51 | } 52 | lr = 0.0001 53 | 54 | loss { 55 | type = "lambda_loss" 56 | } 57 | 58 | batch_size { 59 | train = 300 60 | val = 300 61 | test = 300 62 | } 63 | 64 | } -------------------------------------------------------------------------------- /conf/qc_ce_sbert_2020_02_29 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | } 29 | } 30 | 31 | training { 32 | short_circuit = true 33 | 34 | device = "cuda" 35 | wandb = false 36 | tensorboard = false 37 | 38 | name = "qc_ce_sbert" 39 | iteration = "2020_02_29" 40 | tokenizer_type = ${tokenizers.type} 41 | 42 | model { 43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 44 | encoder { 45 | hidden_size = 128 46 | vocab_size = ${common_vocab_size} 47 | intermediate_size = 512 48 | num_hidden_layers = 4 49 | num_attention_heads = 8 50 | } 51 | } 52 | lr = 0.0001 53 | 54 | loss { 55 | type = "lambda_loss" 56 | } 57 | 58 | batch_size { 59 | train = 275 60 | val = 275 61 | test = 275 62 | } 63 | 64 | } -------------------------------------------------------------------------------- /conf/qc_ce_sbert_2020_02_29.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | } 29 | } 30 | 31 | training { 32 | short_circuit = true 33 | 34 | device = "cuda" 35 | wandb = false 36 | tensorboard = false 37 | 38 | name = "qc_ce_sbert" 39 | iteration = "2020_02_29" 40 | tokenizer_type = ${tokenizers.type} 41 | 42 | model { 43 | training_ctx_class = 
"codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 44 | encoder { 45 | hidden_size = 128 46 | vocab_size = ${common_vocab_size} 47 | intermediate_size = 512 48 | num_hidden_layers = 4 49 | num_attention_heads = 8 50 | } 51 | } 52 | lr = 0.0001 53 | 54 | loss { 55 | type = "lambda_loss" 56 | } 57 | 58 | batch_size { 59 | train = 275 60 | val = 275 61 | test = 275 62 | } 63 | 64 | } -------------------------------------------------------------------------------- /conf/qc_ce_sbert_2020_03_01 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | } 29 | } 30 | 31 | training { 32 | short_circuit = true 33 | 34 | device = "cuda" 35 | wandb = false 36 | tensorboard = false 37 | 38 | name = "qc_ce_sbert" 39 | iteration = "2020_03_01" 40 | tokenizer_type = ${tokenizers.type} 41 | 42 | model { 43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 44 | encoder { 45 | hidden_size = 32 46 | vocab_size = ${common_vocab_size} 47 | intermediate_size = 256 48 | num_hidden_layers = 2 49 | num_attention_heads = 8 50 | } 51 | } 52 | lr = 0.0001 53 | 54 | loss { 55 | type = "lambda_loss" 56 | } 57 | 58 | batch_size { 59 | train = 400 60 | val = 400 61 | test = 400 62 | } 63 | 64 | } -------------------------------------------------------------------------------- /conf/qc_ce_sbert_2020_03_01.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | } 29 | } 30 | 31 | training { 32 | short_circuit = true 33 | 34 | device = "cuda" 35 | wandb = false 36 | tensorboard = false 37 | 38 | name = "qc_ce_sbert" 39 | iteration = "2020_03_01" 40 | tokenizer_type = ${tokenizers.type} 41 | 42 | model { 43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 44 | encoder { 45 | hidden_size = 32 46 | vocab_size = ${common_vocab_size} 47 | intermediate_size = 256 48 | num_hidden_layers = 2 49 | num_attention_heads = 8 50 | } 51 | } 52 | lr = 0.0001 53 | 54 | loss { 55 | type = "lambda_loss" 56 | } 57 | 58 | batch_size { 59 | train = 400 60 | val = 400 61 | test = 400 62 | } 63 | 64 | } -------------------------------------------------------------------------------- 
/conf/qc_ce_subtoken_2020_02_25 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k_subtoken" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | code_max_num_tokens=200 19 | use_subtokens=True 20 | } 21 | } 22 | 23 | training { 24 | short_circuit = false 25 | 26 | device = "cuda" 27 | wandb = true 28 | tensorboard = true 29 | 30 | name = "qc_ce_subtoken" 31 | iteration = "2020_02_25" 32 | tokenizer_type = ${tokenizers.type} 33 | 34 | model { 35 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 36 | encoder { 37 | hidden_size = 32 38 | vocab_size = ${common_vocab_size} 39 | intermediate_size = 256 40 | num_hidden_layers = 2 41 | num_attention_heads = 8 42 | } 43 | } 44 | 45 | loss { 46 | type = "softmax_cross_entropy" 47 | } 48 | 49 | batch_size { 50 | train = 768 51 | val = 768 52 | test = 768 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /conf/qc_ce_subtoken_2020_02_25.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k_subtoken" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | code_max_num_tokens=200 19 | use_subtokens=True 20 | } 21 | } 22 | 23 | training { 24 | short_circuit = false 25 | 26 | device = "cuda" 27 | wandb = true 28 | tensorboard = true 29 | 30 | name = "qc_ce_subtoken" 31 | iteration = "2020_02_25" 32 | tokenizer_type = ${tokenizers.type} 33 | 34 | model { 35 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 36 | encoder { 37 | hidden_size = 32 38 | vocab_size = ${common_vocab_size} 39 | intermediate_size = 256 40 | num_hidden_layers = 2 41 | num_attention_heads = 8 42 | } 43 | } 44 | 45 | loss { 46 | type = "softmax_cross_entropy" 47 | } 48 | 49 | batch_size { 50 | train = 768 51 | val = 768 52 | test = 768 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /conf/qc_ce_subtoken_larger_2020_02_25.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k_subtoken" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | code_max_num_tokens=200 19 | use_subtokens=True 20 | } 21 | } 22 | 23 | training { 24 | short_circuit = false 25 | 26 | device = "cuda" 27 | wandb = true 28 | tensorboard = true 29 | 30 | name = "qc_ce_subtoken_larger" 31 | 
iteration = "2020_02_26" 32 | tokenizer_type = ${tokenizers.type} 33 | 34 | model { 35 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 36 | encoder { 37 | hidden_size = 64 38 | vocab_size = ${common_vocab_size} 39 | intermediate_size = 512 40 | num_hidden_layers = 4 41 | num_attention_heads = 8 42 | } 43 | } 44 | 45 | loss { 46 | type = "softmax_cross_entropy" 47 | } 48 | 49 | batch_size { 50 | train = 350 51 | val = 350 52 | test = 350 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /conf/qc_ce_subtoken_larger_2020_02_26 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k_subtoken" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | code_max_num_tokens=200 19 | use_subtokens=True 20 | } 21 | } 22 | 23 | training { 24 | short_circuit = false 25 | 26 | device = "cuda" 27 | wandb = true 28 | tensorboard = true 29 | 30 | name = "qc_ce_subtoken_larger" 31 | iteration = "2020_02_26" 32 | tokenizer_type = ${tokenizers.type} 33 | 34 | model { 35 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 36 | encoder { 37 | hidden_size = 64 38 | vocab_size = ${common_vocab_size} 39 | intermediate_size = 512 40 | num_hidden_layers = 4 41 | num_attention_heads = 8 42 | } 43 | } 44 | 45 | loss { 46 | type = "softmax_cross_entropy" 47 | } 48 | 49 | batch_size { 50 | train = 350 51 | val = 350 52 | test = 350 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /conf/qc_ce_subtoken_larger_2020_02_26.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k_subtoken" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | code_max_num_tokens=200 19 | use_subtokens=True 20 | } 21 | } 22 | 23 | training { 24 | short_circuit = false 25 | 26 | device = "cuda" 27 | wandb = true 28 | tensorboard = true 29 | 30 | name = "qc_ce_subtoken_larger" 31 | iteration = "2020_02_26" 32 | tokenizer_type = ${tokenizers.type} 33 | 34 | model { 35 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 36 | encoder { 37 | hidden_size = 64 38 | vocab_size = ${common_vocab_size} 39 | intermediate_size = 512 40 | num_hidden_layers = 4 41 | num_attention_heads = 8 42 | } 43 | } 44 | 45 | loss { 46 | type = "softmax_cross_entropy" 47 | } 48 | 49 | batch_size { 50 | train = 350 51 | val = 350 52 | test = 350 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /conf/qc_lambda_2020_02_20_12_30 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | 
common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | } 19 | } 20 | 21 | training { 22 | short_circuit = false 23 | 24 | device = "cuda" 25 | wandb = true 26 | tensorboard = true 27 | 28 | name = "qc_lambda" 29 | iteration = "2020_02_20_12_30" 30 | tokenizer_type = ${tokenizers.type} 31 | 32 | model { 33 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 34 | encoder { 35 | hidden_size = 128 36 | vocab_size = ${common_vocab_size} 37 | intermediate_size = 512 38 | num_hidden_layers = 6 39 | num_attention_heads = 8 40 | } 41 | } 42 | 43 | loss { 44 | type = "lambda_loss" 45 | } 46 | 47 | batch_size { 48 | train = 220 49 | val = 220 50 | test = 220 51 | # train = 8 52 | # val = 8 53 | # test = 8 54 | } 55 | 56 | } -------------------------------------------------------------------------------- /conf/qc_lambda_2020_02_20_12_30.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | } 19 | } 20 | 21 | training { 22 | short_circuit = false 23 | 24 | device = "cuda" 25 | wandb = true 26 | tensorboard = true 27 | 28 | name = "qc_lambda" 29 | iteration = "2020_02_20_12_30" 30 | tokenizer_type = ${tokenizers.type} 31 | 32 | model { 33 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 34 | encoder { 35 | hidden_size = 128 36 | vocab_size = ${common_vocab_size} 37 | intermediate_size = 512 38 | num_hidden_layers = 6 39 | num_attention_heads = 8 40 | } 41 | } 42 | 43 | loss { 44 | type = "lambda_loss" 45 | } 46 | 47 | batch_size { 48 | train = 220 49 | val = 220 50 | test = 220 51 | # train = 8 52 | # val = 8 53 | # test = 8 54 | } 55 | 56 | } -------------------------------------------------------------------------------- /conf/qc_sbert_lambda_2020_03_02.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | } 29 | } 30 | 31 | training { 32 | short_circuit = false 33 | 34 | device = "cuda" 35 | wandb = true 36 | tensorboard = true 37 | 38 | name = "qc_sbert_lambda" 39 | iteration = "2020_03_02" 40 | tokenizer_type = ${tokenizers.type} 41 | 42 | 
model { 43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 44 | encoder { 45 | hidden_size = 32 46 | vocab_size = ${common_vocab_size} 47 | intermediate_size = 128 48 | num_hidden_layers = 2 49 | num_attention_heads = 8 50 | } 51 | } 52 | lr = 0.0001 53 | 54 | loss { 55 | type = "approx_ndcg_loss" 56 | } 57 | 58 | batch_size { 59 | train = 400 60 | val = 400 61 | test = 400 62 | } 63 | 64 | } -------------------------------------------------------------------------------- /conf/qc_sbert_lambda_2020_03_04 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | fraction_using_func_name=0.0 29 | } 30 | } 31 | 32 | training { 33 | short_circuit = false 34 | 35 | device = "cuda" 36 | wandb = true 37 | tensorboard = false 38 | 39 | name = "qc_sbert_lambda" 40 | iteration = "2020_03_04" 41 | tokenizer_type = ${tokenizers.type} 42 | 43 | model { 44 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 45 | encoder { 46 | hidden_size = 768 47 | vocab_size = ${common_vocab_size} 48 | intermediate_size = 2048 49 | num_hidden_layers = 3 50 | num_attention_heads = 8 51 | } 52 | } 53 | lr = 0.000001 54 | 55 | loss { 56 | type = "approx_ndcg_loss" 57 | } 58 | 59 | batch_size { 60 | train = 100 61 | val = 100 62 | test = 100 63 | } 64 | 65 | } -------------------------------------------------------------------------------- /conf/qc_sbert_lambda_2020_03_04.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | fraction_using_func_name=0.0 29 | } 30 | } 31 | 32 | training { 33 | short_circuit = false 34 | 35 | device = "cuda" 36 | wandb = true 37 | tensorboard = false 38 | 39 | name = "qc_sbert_lambda" 40 | iteration = "2020_03_04" 41 | tokenizer_type = ${tokenizers.type} 42 | 43 | model { 44 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 45 | encoder { 46 | hidden_size = 768 47 | vocab_size = ${common_vocab_size} 48 | intermediate_size = 2048 49 | num_hidden_layers = 3 50 | num_attention_heads = 8 51 | } 52 | } 53 | lr = 0.000001 54 | 55 | loss { 56 | type = "approx_ndcg_loss" 57 | } 58 | 59 | batch_size { 60 | train = 100 61 | val = 100 
62 | test = 100 63 | } 64 | 65 | } -------------------------------------------------------------------------------- /conf/qc_sbert_lambda_2020_03_05.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | fraction_using_func_name=0.0 29 | } 30 | } 31 | 32 | training { 33 | short_circuit = false 34 | 35 | device = "cuda" 36 | wandb = true 37 | tensorboard = false 38 | 39 | name = "qc_sbert_lambda" 40 | iteration = "2020_03_04" 41 | tokenizer_type = ${tokenizers.type} 42 | 43 | model { 44 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 45 | encoder { 46 | hidden_size = 768 47 | vocab_size = ${common_vocab_size} 48 | intermediate_size = 2048 49 | num_hidden_layers = 3 50 | num_attention_heads = 8 51 | } 52 | } 53 | lr = 0.000001 54 | 55 | loss { 56 | type = "approx_ndcg_loss" 57 | } 58 | 59 | batch_size { 60 | train = 100 61 | val = 100 62 | test = 100 63 | } 64 | 65 | } -------------------------------------------------------------------------------- /conf/qc_sbert_lambda_2020_03_07 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | #query_embeddings="sbert" 28 | fraction_using_func_name=0.1 29 | } 30 | } 31 | 32 | training { 33 | short_circuit = true 34 | 35 | device = "cuda" 36 | wandb = false 37 | tensorboard = false 38 | 39 | name = "qc_sbert_lambda" 40 | iteration = "2020_03_07" 41 | tokenizer_type = ${tokenizers.type} 42 | 43 | model { 44 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 45 | encoder { 46 | hidden_size = 32 47 | vocab_size = ${common_vocab_size} 48 | intermediate_size = 128 49 | num_hidden_layers = 3 50 | num_attention_heads = 8 51 | } 52 | } 53 | lr = 0.001 54 | 55 | loss { 56 | type = "lambda_loss" 57 | } 58 | 59 | batch_size { 60 | #train = 400 61 | #val = 400 62 | #test = 400 63 | train = 5 64 | val = 5 65 | test = 5 66 | } 67 | 68 | } -------------------------------------------------------------------------------- /conf/qc_sbert_lambda_2020_03_07.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | 
build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | #query_embeddings="sbert" 28 | fraction_using_func_name=0.1 29 | } 30 | } 31 | 32 | training { 33 | short_circuit = true 34 | 35 | device = "cuda" 36 | wandb = false 37 | tensorboard = false 38 | 39 | name = "qc_sbert_lambda" 40 | iteration = "2020_03_07" 41 | tokenizer_type = ${tokenizers.type} 42 | 43 | model { 44 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 45 | encoder { 46 | hidden_size = 32 47 | vocab_size = ${common_vocab_size} 48 | intermediate_size = 128 49 | num_hidden_layers = 3 50 | num_attention_heads = 8 51 | } 52 | } 53 | lr = 0.001 54 | 55 | loss { 56 | type = "lambda_loss" 57 | } 58 | 59 | batch_size { 60 | #train = 400 61 | #val = 400 62 | #test = 400 63 | train = 5 64 | val = 5 65 | test = 5 66 | } 67 | 68 | } -------------------------------------------------------------------------------- /conf/query_code_siamese_2020_02_15_14_00 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | use_lang_weights = True 16 | } 17 | } 18 | 19 | training { 20 | short_circuit = false 21 | 22 | device = "cuda" 23 | wandb = true 24 | tensorboard = true 25 | 26 | name = "query_code_siamese" 27 | iteration = "2020_02_15_14_00" 28 | tokenizer_type = "query_code_siamese" 29 | 30 | model { 31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 32 | output_size = 72 33 | encoder { 34 | hidden_size = ${training.model.output_size} 35 | vocab_size = ${common_vocab_size} 36 | intermediate_size = 256 37 | num_hidden_layers = 12 38 | num_attention_heads = 12 39 | } 40 | } 41 | 42 | batch_size { 43 | train = 100 44 | val = 100 45 | test = 100 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /conf/query_code_siamese_2020_02_15_14_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | use_lang_weights = True 16 | } 17 | } 18 | 19 | training { 20 | short_circuit = false 21 | 22 | device = "cuda" 23 | wandb = true 24 | tensorboard = true 25 | 26 | name = "query_code_siamese" 27 | iteration = "2020_02_15_14_00" 28 | tokenizer_type = "query_code_siamese" 29 | 30 | model { 31 | training_ctx_class = 
"codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 32 | output_size = 72 33 | encoder { 34 | hidden_size = ${training.model.output_size} 35 | vocab_size = ${common_vocab_size} 36 | intermediate_size = 256 37 | num_hidden_layers = 12 38 | num_attention_heads = 12 39 | } 40 | } 41 | 42 | batch_size { 43 | train = 100 44 | val = 100 45 | test = 100 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /conf/query_code_siamese_2020_02_17_21_30 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | use_lang_weights = True 16 | } 17 | } 18 | 19 | training { 20 | short_circuit = false 21 | 22 | device = "cuda" 23 | wandb = true 24 | tensorboard = true 25 | 26 | name = "query_code_siamese" 27 | iteration = "2020_02_17_21_30" 28 | tokenizer_type = "query_code_siamese" 29 | 30 | model { 31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 32 | output_size = 64 33 | encoder { 34 | hidden_size = ${training.model.output_size} 35 | vocab_size = ${common_vocab_size} 36 | intermediate_size = 256 37 | num_hidden_layers = 6 38 | num_attention_heads = 8 39 | } 40 | } 41 | 42 | batch_size { 43 | train = 290 44 | val = 290 45 | test = 290 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /conf/query_code_siamese_2020_02_17_21_30.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | use_lang_weights = True 16 | } 17 | } 18 | 19 | training { 20 | short_circuit = false 21 | 22 | device = "cuda" 23 | wandb = true 24 | tensorboard = true 25 | 26 | name = "query_code_siamese" 27 | iteration = "2020_02_17_21_30" 28 | tokenizer_type = "query_code_siamese" 29 | 30 | model { 31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 32 | output_size = 64 33 | encoder { 34 | hidden_size = ${training.model.output_size} 35 | vocab_size = ${common_vocab_size} 36 | intermediate_size = 256 37 | num_hidden_layers = 6 38 | num_attention_heads = 8 39 | } 40 | } 41 | 42 | batch_size { 43 | train = 290 44 | val = 290 45 | test = 290 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /conf/query_code_siamese_2020_02_18_13_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 
| special_tokens = ["", "", ""] 15 | use_lang_weights = True 16 | } 17 | } 18 | 19 | training { 20 | short_circuit = false 21 | 22 | device = "cuda" 23 | wandb = true 24 | tensorboard = true 25 | 26 | name = "query_code_siamese" 27 | iteration = "2020_02_17_21_30" 28 | tokenizer_type = "query_code_siamese" 29 | 30 | model { 31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 32 | output_size = 64 33 | encoder { 34 | hidden_size = ${training.model.output_size} 35 | vocab_size = ${common_vocab_size} 36 | intermediate_size = 256 37 | num_hidden_layers = 6 38 | num_attention_heads = 8 39 | } 40 | } 41 | 42 | batch_size { 43 | train = 290 44 | val = 290 45 | test = 290 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /conf/query_code_siamese_2020_02_19_13_00 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | use_lang_weights = True 16 | } 17 | } 18 | 19 | training { 20 | short_circuit = false 21 | 22 | device = "cuda" 23 | wandb = true 24 | tensorboard = true 25 | 26 | name = "query_code_siamese" 27 | iteration = "2020_02_19_13_00" 28 | tokenizer_type = "query_code_siamese" 29 | 30 | model { 31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 32 | output_size = 64 33 | encoder { 34 | hidden_size = ${training.model.output_size} 35 | vocab_size = ${common_vocab_size} 36 | intermediate_size = 256 37 | num_hidden_layers = 6 38 | num_attention_heads = 8 39 | } 40 | } 41 | 42 | loss { 43 | type = "lambda_loss" 44 | } 45 | 46 | batch_size { 47 | train = 256 48 | val = 256 49 | test = 256 50 | # train = 8 51 | # val = 8 52 | # test = 8 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /conf/query_code_siamese_2020_02_19_13_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | use_lang_weights = True 16 | } 17 | } 18 | 19 | training { 20 | short_circuit = false 21 | 22 | device = "cuda" 23 | wandb = true 24 | tensorboard = true 25 | 26 | name = "query_code_siamese" 27 | iteration = "2020_02_19_13_00" 28 | tokenizer_type = "query_code_siamese" 29 | 30 | model { 31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 32 | output_size = 64 33 | encoder { 34 | hidden_size = ${training.model.output_size} 35 | vocab_size = ${common_vocab_size} 36 | intermediate_size = 256 37 | num_hidden_layers = 6 38 | num_attention_heads = 8 39 | } 40 | } 41 | 42 | loss { 43 | type = "lambda_loss" 44 | } 45 | 46 | batch_size { 47 | train = 256 48 | val = 256 49 | test = 256 50 | # train = 8 51 | # val = 8 52 | # test = 8 53 | } 54 | 55 | } 
-------------------------------------------------------------------------------- /conf/query_code_siamese_albert_2020_02_18_08_30 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | use_lang_weights = True 16 | } 17 | } 18 | 19 | training { 20 | short_circuit = false 21 | 22 | device = "cuda" 23 | wandb = true 24 | tensorboard = true 25 | 26 | name = "query_code_siamese_albert" 27 | iteration = "2020_02_18_08_30" 28 | tokenizer_type = "query_code_siamese" 29 | 30 | model { 31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 32 | output_size = 128 33 | encoder { 34 | type = "albert" 35 | embedding_size = ${training.model.output_size} 36 | hidden_size = 512 37 | vocab_size = ${common_vocab_size} 38 | intermediate_size = 768 39 | num_hidden_layers = 8 40 | num_attention_heads = 8 41 | } 42 | } 43 | 44 | batch_size { 45 | train = 128 46 | val = 128 47 | test = 128 48 | } 49 | 50 | lr = 0.00001 51 | 52 | } -------------------------------------------------------------------------------- /conf/query_code_siamese_albert_2020_02_18_08_30.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | use_lang_weights = True 16 | } 17 | } 18 | 19 | training { 20 | short_circuit = false 21 | 22 | device = "cuda" 23 | wandb = true 24 | tensorboard = true 25 | 26 | name = "query_code_siamese_albert" 27 | iteration = "2020_02_18_08_30" 28 | tokenizer_type = "query_code_siamese" 29 | 30 | model { 31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 32 | output_size = 128 33 | encoder { 34 | type = "albert" 35 | embedding_size = ${training.model.output_size} 36 | hidden_size = 512 37 | vocab_size = ${common_vocab_size} 38 | intermediate_size = 768 39 | num_hidden_layers = 8 40 | num_attention_heads = 8 41 | } 42 | } 43 | 44 | batch_size { 45 | train = 128 46 | val = 128 47 | test = 128 48 | } 49 | 50 | lr = 0.00001 51 | 52 | } -------------------------------------------------------------------------------- /conf/query_code_siamese_albert_2020_02_18_14_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | use_lang_weights = True 16 | } 17 | } 18 | 19 | training { 20 | short_circuit = false 21 | 22 | device = "cuda" 23 | wandb = true 24 | tensorboard = true 25 | 26 | name = 
"query_code_siamese_albert" 27 | iteration = "2020_02_18_14_00" 28 | tokenizer_type = "query_code_siamese" 29 | 30 | model { 31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 32 | # output_size = 128 33 | encoder { 34 | type = "albert" 35 | embedding_size = 64 36 | hidden_size = 256 37 | vocab_size = ${common_vocab_size} 38 | intermediate_size = 512 39 | num_hidden_layers = 6 40 | num_attention_heads = 8 41 | } 42 | } 43 | 44 | batch_size { 45 | train = 240 46 | val = 240 47 | test = 240 48 | } 49 | 50 | lr = 0.00001 51 | 52 | } -------------------------------------------------------------------------------- /guide.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | Ce projet est un projet bac-à-sable technique sur lequel j'ai travaillé en 2020 et que j'utilisais pour tester certains points techniques à titre personnel. C'est une réécriture quasi-complète d'un projet de Microsoft pour le challenge CodeSearchNet (moteur de recherche de code multi-langage à partir de requêtes textuelles, sujet qui depuis a été largement poussé plus loin par github/microsoft avec Copilot avec des capacités avancées de génération de code). 4 | J'ai remis ce projet à jour pour cette présentation car je me suis aperçu que les API python ont beaucoup évolué depuis 2020 et le code n'était plus du tout compatible avec les versions actuelles. Cependant, le code ne tournera pas si vous le lancez car il a besoin de tokenizers qu'il faut construire manuellement et qui demandent pas mal de temps de calcul et de librairies natives (pour les AST de langages). 5 | 6 | ## Points techniques remarquables 7 | 8 | Mon but ici n'est pas de parler du fond ML/IA (mes résultats n'étaient pas très intéressants) mais plutôt du code et plus spécifiquement les points suivants: 9 | 10 | - Projet complet Python/ML avec gestion des dépendances (poetry), isolation dans un virtualenv, intégration dans VSCode (qui devenait le standard de dev en 2020) avec utilisation d'extensions: mypy, linters, license, tests (même si anecdotiques), etc... 11 | 12 | - [pyproject.toml](./pyproject.toml) 13 | - et pour info, la [license](./LICENSE) 14 | 15 | - Utilisation des configurations au format générique HOCON qui permet de gérer des configurations complexes avec des imports, des variables, des références etc... 16 | 17 | - [Configuration générique](./conf/default.conf) 18 | - [Configuration spécifique](./conf/query_code_siamese_2020_02_15_14_00.conf) 19 | 20 | - Exploration des limites du typage fort en Python avec des types génériques abstraits (pour tenter de simuler l'équivalent des "typeclasses" qu'on trouve dans les langages fonctionnels comme Haskell/Scala) et les "newtypes" pour "spécialiser" des types simples 21 | 22 | - [Type abstraits](./codenets/recordable.py#L22) 23 | - [Type génériques](./codenets/codesearchnet/training_ctx.py#L205-L220) 24 | - [Newtypes](./codenets/codesearchnet/training_ctx.py#L49-L68) 25 | 26 | - Evaluation de la compilation des types avc le moteur de compilation Mypy de Microsoft intégré dans VS Code. 27 | 28 | - [mypy.ini](./mypy.ini) 29 | 30 | - Etude de sauvegarde/restoration générique d'un contexte complet de projet IA (configuration + commit + modèle + tokenizer + dataset + etc...) pour une sauvegarde dans un point unique (sur un cloud de type AWS ou un serveur orienté ML de type MLFlow par exemple). 
31 | 32 | - [Generic Recordable](./codenets/recordable.py#L22) 33 | - [Recordable specialised for HOCON configuration](./codenets/recordable.py#L113) 34 | - [Recordable specialised for TorchModule models/tokenizers](./codenets/recordable.py#L248) 35 | - [Generic training context](./codenets/codesearchnet/training_ctx.py#L245) 36 | - [Training context specialised for a specific model](./codenets/codesearchnet/query_code_siamese/training_ctx.py#L40) 37 | 38 | - Assessment of how complex it is to rewrite Tensorflow code into PyTorch and the huggingface libraries. 39 | 40 | - Integration with WandB/TensorBoard for tracking training runs. 41 | 42 | and, more anecdotally: 43 | 44 | - A study of the results achievable with small transformers on a challenge of this kind; 45 | the results turned out to be quite disappointing, see 46 | 47 | - [README.md](./README.md) 48 | 49 | - Use of native Rust tokenizers through the Python interface of Huggingface tokenizers (which had just been released in 2020); a small training sketch is given in the appendix: 50 | 51 | - [tokenizer_recs.py](./codenets/codesearchnet/huggingface/tokenizer_recs.py#L102) 52 | 53 | - Use of language AST parsers (tree-sitter) to improve the performance of transformer-based models (I could not push these experiments very far for lack of GPU resources); see the parsing sketch in the appendix 54 | - [ast_build.py](./codenets/codesearchnet/ast_build.py#L189) 55 | 56 | ## Conclusion 57 | 58 | In the end, I would highlight the following points: 59 | 60 | - In my opinion, HOCON configurations are worthwhile for any software project in any language, because they can express complex configurations with variables/references while keeping the format simple. 61 | - Fully generic saving of a whole ML project, from the code down to the model and dataset, seems to me an important capability for backing up and versioning ML projects while keeping all resources together: code, configuration, model, tokenizer, dataset, etc. 62 | - Strong typing in Python has become a valuable tool that improves the overall robustness of the code and reduces the amount of unit tests needed. Mypy looks like a solid solution for checking types, even though many external dependencies that do not ship type information have to be filtered out. However, the heavy use of type unions in Python libraries can lead to rather indigestible type signatures. 63 | - Generic and abstract types do work, but they remain fairly tedious in Python and never quite feel like a native feature of the language (not to mention the runtime casts, which can raise performance concerns). It is better to stick to classic object-oriented patterns and avoid straying too far off the beaten path. 64 | - NewTypes remain of limited practical value from my point of view (in particular, arithmetic or concatenation operations on these types make them lose their specificity). 65 | 66 | If you have any questions, feel free to contact me.
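
## Appendix: illustrative sketches

The short code sketches below are illustrative only and are not taken from the repository. This first one shows how a HOCON file like the ones under `conf/` can be loaded with pyhocon (the library this project uses); the file path and keys mirror `conf/query_code_siamese_2020_02_15_14_00.conf` and `conf/default.conf`.

```python
# Minimal sketch: loading a HOCON configuration with pyhocon.
# Includes (include "./default.conf") and ${...} references are resolved at parse time.
from pyhocon import ConfigFactory

conf = ConfigFactory.parse_file("conf/query_code_siamese_2020_02_15_14_00.conf")

# ${common_vocab_size} and ${training.model.output_size} are already substituted here.
vocab_size = conf.get_int("training.model.encoder.vocab_size")
hidden_size = conf.get_int("training.model.encoder.hidden_size")
train_batch = conf.get_int("training.batch_size.train")
print(vocab_size, hidden_size, train_batch)
```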
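
The next sketch illustrates the typing ideas mentioned above: an abstract base class whose generic `load` returns the concrete subtype, plus a `NewType`. The names (`Saveable`, `DictSaveable`, `BatchSize`) are illustrative and do not reproduce the actual API of `codenets/recordable.py`.

```python
# Minimal, self-contained sketch of the typing ideas discussed above.
# Illustrative names only: this is not the actual API of codenets/recordable.py.
import json
from abc import ABC, abstractmethod
from pathlib import Path
from typing import NewType, Type, TypeVar

BatchSize = NewType("BatchSize", int)  # a "specialised" int, distinguished by mypy

T = TypeVar("T", bound="Saveable")


class Saveable(ABC):
    """Anything that can be saved to and restored from a directory."""

    @abstractmethod
    def save(self, output_dir: Path) -> bool:
        ...

    @classmethod
    @abstractmethod
    def load(cls: Type[T], restore_dir: Path) -> T:
        ...


class DictSaveable(Saveable, dict):
    """A plain dict that knows how to persist itself as JSON."""

    def save(self, output_dir: Path) -> bool:
        output_dir.mkdir(parents=True, exist_ok=True)
        (output_dir / "data.json").write_text(json.dumps(self))
        return True

    @classmethod
    def load(cls, restore_dir: Path) -> "DictSaveable":
        return cls(json.loads((restore_dir / "data.json").read_text()))


train_batch = BatchSize(256)  # usable wherever an int is expected, yet kept distinct by mypy
```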
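
For the Rust-backed tokenizers, a minimal training sketch with the huggingface `tokenizers` library could look like the following; the token file path, vocabulary size and special tokens are placeholders, not the ones actually used to build this project's tokenizers.

```python
# Minimal sketch: training and using a Rust-backed BPE tokenizer
# with the huggingface `tokenizers` library. Paths, sizes and special tokens are placeholders.
from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer(lowercase=True)
tokenizer.train(
    files=["./build_tokenizers/token_files/python.txt"],  # hypothetical token file
    vocab_size=30000,
    special_tokens=["<pad>", "<unk>", "<mask>"],  # illustrative special tokens
)

encoded = tokenizer.encode("def add(a, b): return a + b")
print(encoded.tokens)
print(encoded.ids)
```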
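
Finally, a sketch of parsing a code snippet into an AST with py-tree-sitter (0.20.x API, as pinned in `pyproject.toml`). It assumes the language grammars were compiled beforehand into a shared library, which is precisely the slow, native-tooling step mentioned in the introduction; the paths are hypothetical.

```python
# Minimal sketch: parsing source code into an AST with tree-sitter (py-tree-sitter 0.20.x).
# Assumes the grammars were compiled into build/my-languages.so beforehand, e.g. with:
#   Language.build_library("build/my-languages.so", ["vendor/tree-sitter-python"])
from tree_sitter import Language, Parser

PY_LANGUAGE = Language("build/my-languages.so", "python")

parser = Parser()
parser.set_language(PY_LANGUAGE)

tree = parser.parse(b"def add(a, b):\n    return a + b\n")
print(tree.root_node.sexp())  # S-expression view of the syntax tree
```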
67 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | """Main of the project.""" 2 | # import numpy as np 3 | # import os 4 | 5 | 6 | # def get_os_env(): 7 | # print(os.getcwd()) 8 | # print(os.uname()) 9 | 10 | 11 | # def main(): 12 | # # Do some os stuff 13 | # get_os_env() 14 | # # Do some numpy stuff 15 | # A = np.ones(3)*1 16 | # B = np.ones(3)*2 17 | # C = np.ones(3)*3 18 | # res = np.add(A,B,out=B) 19 | # res2 = np.divide(A,2,out=A) 20 | # res3 = np.negative(A,out=A) 21 | # res4 = np.multiply(A,B,out=A) 22 | 23 | # print(res) 24 | # print(f"this is the result 2 {res2}") 25 | # print(np.zeros(shape=(2, 3))) 26 | 27 | 28 | # if __name__ == "__main__": 29 | # main() 30 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.10 3 | ; mypy_path = ./src 4 | namespace_packages = True 5 | ; ignore_missing_imports = True 6 | ; follow_imports = normal 7 | no_deprecation_warning=True 8 | 9 | [mypy-torch.*] 10 | ignore_missing_imports = True 11 | 12 | [mypy-loguru.*] 13 | ignore_missing_imports = True 14 | 15 | [mypy-torch.optim.*] 16 | ignore_missing_imports = True 17 | 18 | [mypy-dpu_utils] 19 | ignore_missing_imports = True 20 | 21 | [mypy-dpu_utils.*] 22 | ignore_missing_imports = True 23 | 24 | [mypy-toolz] 25 | ignore_missing_imports = True 26 | 27 | [mypy-numpy] 28 | ignore_missing_imports = True 29 | 30 | [mypy-tensorflow.*] 31 | ignore_missing_imports = True 32 | 33 | [mypy-pyhocon.*] 34 | ignore_missing_imports = True 35 | 36 | [mypy-transformers.*] 37 | ignore_missing_imports = True 38 | 39 | [mypy-tensorboard.*] 40 | ignore_missing_imports = True 41 | 42 | [mypy-tensorboardX.*] 43 | ignore_missing_imports = True 44 | 45 | [mypy-pathos.*] 46 | ignore_missing_imports = True 47 | 48 | [mypy-docopt] 49 | ignore_missing_imports = True 50 | 51 | [mypy-pandas] 52 | ignore_missing_imports = True 53 | 54 | [mypy-tqdm] 55 | ignore_missing_imports = True 56 | 57 | [mypy-pygments.*] 58 | ignore_missing_imports = True 59 | 60 | [mypy-scipy.*] 61 | ignore_missing_imports = True 62 | 63 | [mypy-annoy] 64 | ignore_missing_imports = True 65 | 66 | [mypy-wandb] 67 | ignore_missing_imports = True 68 | 69 | [mypy-wandb.*] 70 | ignore_missing_imports = True 71 | 72 | [mypy-sklearn.*] 73 | ignore_missing_imports = True 74 | 75 | [mypy-matplotlib.*] 76 | ignore_missing_imports = True 77 | 78 | [mypy-tokenizers.*] 79 | ignore_missing_imports = True 80 | 81 | [mypy-sentence_transformers.*] 82 | ignore_missing_imports = True 83 | 84 | [mypy-tree_sitter.*] 85 | ignore_missing_imports = True 86 | -------------------------------------------------------------------------------- /pylama.ini: -------------------------------------------------------------------------------- 1 | [pylama] 2 | ;format = pylint 3 | skip = .tox/*,.env/*,.venv/*,.vscode/* 4 | ;linters = mccabe,pep257,pydocstyle,pep8,pycodestyle,pyflakes,pylint,isort,radon,eradicate 5 | linters = mccabe,pydocstyle,pycodestyle,pyflakes 6 | ;ignore = F0401,C0111,E731 7 | ignore = C0413,D212,D211,D203,R0903,C0330,D104,C0111,E1101,W0221,D406,D413,D407,W293,C901,D202,W291,D103,D100,D101,D107,D102,D400,E1102,C0103,C0411,R0913,R0914,R1719,W0212,C0412,R0902,W0102,E501,R0915,C0301,W0703,R1705,R0904,R0912,E203,W0640,R0911,R0201,D205,D415,W292,W503 8 | 9 | [pylama:*/__init__.py] 10 | ignore = 
W0611,W0401 11 | 12 | [pylama:tests/*.py] 13 | ignore = D104,D100 14 | 15 | [pylama:pycodestyle] 16 | max_line_length = 120 17 | 18 | [pylama:pylint] 19 | max_line_length = 120 20 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["poetry>=0.12"] 3 | build-backend = "poetry.masonry.api" 4 | 5 | [tool.poetry] 6 | name = "codenets" 7 | version = "0.2.0" 8 | description = "code & neural nets." 9 | authors = ["Voitot Pascal"] 10 | readme = "README.md" 11 | 12 | # packages = [ 13 | # { include = "src/**/*.py" }, 14 | # ] 15 | 16 | [tool.poetry.dependencies] 17 | python = "^3.10" 18 | numpy = "^1.24" 19 | torch = "^2.0.0" 20 | pandas = "^2.0.0" 21 | #tokenizers = "^0.2.1" 22 | transformers = "^4.27.0" 23 | loguru = "^0.6" 24 | docopt = "^0.6" 25 | dpu-utils = "^0.6" 26 | wandb = "^0.14" 27 | pathos = "^0.3" 28 | pyhocon = "^0.3.60" 29 | annoy = "^1.17" 30 | #tables = "^3.6.1" 31 | sentence_transformers = "^2.2" 32 | tree_sitter = "^0.20" 33 | # tree-sitter = { file = "../../tools/py-tree-sitter/tree_sitter-0.1.0_mandubian-cp37-cp37m-linux_x86_64.whl" } 34 | #pyarrow = "*" 35 | fastparquet = "^2023.2" 36 | # apex = "*" 37 | 38 | [tool.poetry.dev-dependencies] 39 | black = "*" 40 | pylama = "*" 41 | pytest = "*" 42 | mypy = "^1.1" 43 | jupyterlab = "*" 44 | matplotlib = "*" 45 | rope = "*" 46 | codecov = "*" 47 | pytest-cov = "*" 48 | pylint = "*" 49 | tensorboard = "*" 50 | tensorboardX = "*" 51 | 52 | 53 | [tool.black] 54 | line-length = 88 55 | exclude = ''' 56 | /( 57 | \.git 58 | | \.mypy_cache 59 | | \.tox 60 | | \.venv 61 | | \.pytest_cache 62 | | dist 63 | | build 64 | | docs 65 | )/ 66 | ''' 67 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | annoy==1.16.3 2 | azure-common==1.1.24 3 | azure-nspkg==3.0.2 4 | azure-storage==0.36.0 5 | boto3==1.12.2 6 | botocore==1.15.2 7 | certifi==2019.11.28 8 | cffi==1.14.0 9 | chardet==3.0.4 10 | click==7.0 11 | colorama==0.4.3; sys_platform == "win32" 12 | configparser==4.0.2 13 | cryptography==2.8 14 | dill==0.3.1.1 15 | docker-pycreds==0.4.0 16 | docopt==0.6.2 17 | docutils==0.15.2 18 | dpu-utils==0.2.8 19 | gitdb2==3.0.2 20 | gitpython==3.0.8 21 | gql==0.2.0 22 | graphql-core==1.1 23 | idna==2.8 24 | jmespath==0.9.4 25 | joblib==0.14.1 26 | loguru==0.3.2 27 | multiprocess==0.70.9 28 | numexpr==2.7.1 29 | numpy==1.18.1 30 | nvidia-ml-py3==7.352.0 31 | pandas==0.25.3 32 | pathos==0.2.5 33 | pathtools==0.1.2 34 | pox==0.2.7 35 | ppft==1.6.6.1 36 | promise==2.3 37 | psutil==5.7.0 38 | pycparser==2.19 39 | pyhocon==0.3.54 40 | pyparsing==2.4.6 41 | python-dateutil==2.8.1 42 | pytz==2019.3 43 | pyyaml==5.3 44 | regex==2020.2.18 45 | requests==2.22.0 46 | s3transfer==0.3.3 47 | sacremoses==0.0.38 48 | sentencepiece==0.1.85 49 | sentry-sdk==0.14.1 50 | setsimilaritysearch==0.1.7 51 | shortuuid==0.5.0 52 | six==1.14.0 53 | smmap2==2.0.5 54 | subprocess32==3.5.4 55 | tables==3.6.1 56 | tokenizers==0.2.1 57 | torch==1.4.0 58 | tqdm==4.43.0 59 | transformers==2.3.0 60 | urllib3==1.25.8 61 | wandb==0.8.27 62 | watchdog==0.10.2 63 | win32-setctime==1.0.1; sys_platform == "win32" 64 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/test/__init__.py -------------------------------------------------------------------------------- /test/conf/default.conf: -------------------------------------------------------------------------------- 1 | 2 | lang_ids { 3 | php = 0 4 | python = 1 5 | ruby = 2 6 | java = 3 7 | go = 4 8 | javascript = 5 9 | } 10 | 11 | common_vocab_size = 10000 12 | 13 | bert { 14 | hidden_size = 128 15 | vocab_size = ${common_vocab_size} 16 | intermediate_size = 512 17 | num_hidden_layers = 3 18 | num_attention_heads = 8 19 | } 20 | 21 | tokenizers { 22 | type = "TOKENIZER_TYPE" 23 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 24 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 25 | } 26 | 27 | dataset { 28 | root_dir = ${HOME}"/workspaces/tools/CodeSearchNet/src" 29 | common_params { 30 | fraction_using_func_name=0.1 31 | min_len_func_name_for_query=12 32 | use_subtokens=False 33 | mark_subtoken_end=False 34 | code_max_num_tokens=200 35 | query_max_num_tokens=30 36 | use_bpe=True 37 | vocab_size=${common_vocab_size} 38 | pct_bpe=0.5 39 | vocab_count_threshold=10 40 | lang_ids = ${lang_ids} 41 | do_lowercase = true 42 | special_tokens = [""] 43 | parallelize = true 44 | use_lang_weights = False 45 | } 46 | 47 | train { 48 | dirs = ${dataset.root_dir}"/data_dirs_train.txt" 49 | params = ${dataset.common_params} 50 | } 51 | 52 | val { 53 | dirs = ${dataset.root_dir}"/data_dirs_valid.txt" 54 | params = ${dataset.common_params} 55 | } 56 | 57 | test { 58 | dirs = ${dataset.root_dir}"/data_dirs_test.txt" 59 | params = ${dataset.common_params} 60 | } 61 | 62 | queries_file = ${dataset.root_dir}"/queries.csv" 63 | } 64 | 65 | 66 | training { 67 | # The name of current experiment (can have several runs) 68 | name = "EXPERIMENT_NAME" 69 | # The unique id of current run 70 | iteration = "UNIQUE_RUN_ID" 71 | # The ID used to identify the pre-built pickled files 72 | # using the tokenizer defined above 73 | tokenizer_type = "TOKENIZER_ID" 74 | 75 | # Set that to true to test your run without slow-loading train dataset 76 | short_circuit = false 77 | 78 | device = "cuda" 79 | # deactivate wandb & tensorboard 80 | wandb = true 81 | tensorboard = true 82 | 83 | model { 84 | # IMPORTANT: the class representing Training Context 85 | training_ctx_class = "codenets.codesearchnet.query_1_code_1.training_ctx.Query1Code1Ctx" 86 | output_size = 64 87 | query_encoder { 88 | hidden_size = ${training.model.output_size} 89 | vocab_size = ${common_vocab_size} 90 | intermediate_size = 512 91 | num_hidden_layers = 3 92 | num_attention_heads = 8 93 | } 94 | code_encoder { 95 | hidden_size = ${training.model.output_size} 96 | vocab_size = ${common_vocab_size} 97 | intermediate_size = 512 98 | num_hidden_layers = 6 99 | num_attention_heads = 8 100 | } 101 | } 102 | 103 | # Training Hyper-Parameters 104 | seed = 0 105 | lr = 0.0001 106 | max_grad_norm = 1.0 107 | min_log_interval = 50 108 | start_epoch = 0 109 | epochs = 10 110 | 111 | batch_size { 112 | train = 256 113 | val = 256 114 | test = 256 115 | } 116 | 117 | loss { 118 | type = "softmax_cross_entropy" 119 | margin = 1.0 120 | } 121 | 122 | # Paths 123 | pickle_path = "./pickles" 124 | output_dir = "./checkpoints" 125 | tensorboard_path = "./runs" 126 | 127 | } -------------------------------------------------------------------------------- /test/conf/test.conf: -------------------------------------------------------------------------------- 1 | 
include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | } 19 | } 20 | 21 | training { 22 | short_circuit = false 23 | 24 | device = "cpu" 25 | wandb = false 26 | tensorboard = false 27 | 28 | name = "test" 29 | iteration = "2020_02_23_01_00" 30 | tokenizer_type = ${tokenizers.type} 31 | 32 | model { 33 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 34 | encoder { 35 | hidden_size = 32 36 | vocab_size = ${common_vocab_size} 37 | intermediate_size = 256 38 | num_hidden_layers = 2 39 | num_attention_heads = 8 40 | } 41 | } 42 | 43 | loss { 44 | type = "softmax_cross_entropy" 45 | } 46 | 47 | batch_size { 48 | train = 768 49 | val = 768 50 | test = 768 51 | } 52 | 53 | } -------------------------------------------------------------------------------- /test/test_recordable.py: -------------------------------------------------------------------------------- 1 | 2 | from pathlib import Path 3 | from typing import cast 4 | from codenets.recordable import DictRecordable 5 | import os 6 | import shutil 7 | import pytest 8 | from pyhocon import ConfigFactory 9 | 10 | from codenets.codesearchnet.training_ctx import CodeSearchTrainingContext 11 | from codenets.codesearchnet.query_code_siamese.training_ctx import QueryCodeSiameseCtx 12 | 13 | test_dir = Path("./tmp-test") 14 | cfg = Path("./test/conf/test.conf") 15 | 16 | 17 | @pytest.fixture(autouse=True) 18 | def run_before_and_after_tests(tmpdir): 19 | """Fixture to execute asserts before and after a test is run""" 20 | # Setup: fill with any logic you want 21 | os.mkdir(test_dir) 22 | 23 | yield # this is where the testing happens 24 | 25 | # Teardown : fill with any logic you want 26 | shutil.rmtree(test_dir) 27 | 28 | 29 | def test_dict_recordable(): 30 | d = DictRecordable({ 31 | 'toto': 1, 32 | 'tata': "titi", 33 | "tutu": 1.2345 34 | }) 35 | 36 | assert d.save(test_dir / "d") 37 | d2 = DictRecordable.load(test_dir / "d") 38 | assert d == d2 39 | 40 | 41 | def test_fullconf_recordable(): 42 | training_ctx = CodeSearchTrainingContext.build_context_from_hocon(ConfigFactory.parse_file(cfg)) 43 | assert training_ctx.save(test_dir / "f") 44 | 45 | training_ctx_2 = QueryCodeSiameseCtx.load(test_dir / "f") 46 | print("keys", training_ctx.keys(), training_ctx_2.keys()) 47 | assert training_ctx.keys() == training_ctx_2.keys() 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /wandb/settings: -------------------------------------------------------------------------------- 1 | [default] 2 | project = codenets 3 | 4 | --------------------------------------------------------------------------------