├── .env
├── .gitignore
├── .python-version
├── .vscode
│   └── settings.json
├── LICENSE
├── README.md
├── codenets
│   ├── __init__.py
│   ├── codesearchnet
│   │   ├── __init__.py
│   │   ├── ast_build.py
│   │   ├── code_ast
│   │   │   ├── __init__.py
│   │   │   └── ast_utils.py
│   │   ├── copied_code
│   │   │   ├── __init__.py
│   │   │   ├── bpevocabulary.py
│   │   │   ├── metadata.py
│   │   │   └── utils.py
│   │   ├── data.py
│   │   ├── dataset_main.py
│   │   ├── dataset_utils.py
│   │   ├── eval.py
│   │   ├── huggingface
│   │   │   ├── __init__.py
│   │   │   ├── models.py
│   │   │   └── tokenizer_recs.py
│   │   ├── notebooks
│   │   │   ├── codesearchnet_distrib.ipynb
│   │   │   └── predictions.ipynb
│   │   ├── poolers.py
│   │   ├── predictions.py
│   │   ├── query_1_code_1
│   │   │   ├── __init__.py
│   │   │   ├── dataset.py
│   │   │   ├── model.py
│   │   │   └── training_ctx.py
│   │   ├── query_1_code_n
│   │   │   ├── __init__.py
│   │   │   ├── dataset.py
│   │   │   ├── model.py
│   │   │   └── training_ctx.py
│   │   ├── query_code_siamese
│   │   │   ├── __init__.py
│   │   │   ├── dataset.py
│   │   │   ├── model.py
│   │   │   └── training_ctx.py
│   │   ├── sbert_build.py
│   │   ├── tokenizer_build.py
│   │   ├── tokenizer_recs.py
│   │   ├── train.py
│   │   └── training_ctx.py
│   ├── losses.py
│   ├── main.py
│   ├── recordable.py
│   ├── save.py
│   ├── tensorboard_utils.py
│   └── utils.py
├── conf
│   ├── code_search_bert_2020_02_01_1500.conf
│   ├── code_search_bert_2020_02_03_20_00.conf
│   ├── code_search_bert_lg_2020_02_04_15_00.conf
│   ├── code_search_bert_lg_2020_02_04_21_00.conf
│   ├── code_search_bert_lg_2020_02_05_00_00.conf
│   ├── code_search_bert_lg_2020_02_06_18_00.conf
│   ├── code_search_bert_lg_2020_02_06_22_30.conf
│   ├── code_search_bert_lg_2020_02_07_10_00.conf
│   ├── code_search_bert_query_1_code_1_2020_02_10_11_00 copy.conf
│   ├── code_search_bert_query_1_code_1_2020_02_10_11_00.conf
│   ├── code_search_bert_query_1_code_1_2020_02_11_22_00 copy.conf
│   ├── code_search_bert_query_1_code_1_2020_02_11_22_00.conf
│   ├── code_search_bert_query_code_siamese_2020_02_12_00_00 copy.conf
│   ├── code_search_bert_query_code_siamese_2020_02_12_00_00.conf
│   ├── code_search_bert_query_code_siamese_2020_02_14_16_00 copy.conf
│   ├── code_search_bert_query_code_siamese_2020_02_14_16_00.conf
│   ├── code_search_bert_query_code_siamese_2020_02_15_14_00.conf
│   ├── default.conf
│   ├── qc_ast_2020_03_13.conf
│   ├── qc_ast_2020_03_15 copy.conf
│   ├── qc_ast_2020_03_15.conf
│   ├── qc_ast_2020_03_17.conf
│   ├── qc_ast_2020_03_18 copy.conf
│   ├── qc_ast_2020_03_18.conf
│   ├── qc_ast_2020_03_19.conf
│   ├── qc_ce_2020_02_23_01_00 copy.conf
│   ├── qc_ce_2020_02_23_01_00.conf
│   ├── qc_ce_long_seq_2020_02_24.conf
│   ├── qc_ce_sbert_2020_02_27 copy.conf
│   ├── qc_ce_sbert_2020_02_27.conf
│   ├── qc_ce_sbert_2020_02_28 copy.conf
│   ├── qc_ce_sbert_2020_02_28.conf
│   ├── qc_ce_sbert_2020_02_29 copy.conf
│   ├── qc_ce_sbert_2020_02_29.conf
│   ├── qc_ce_sbert_2020_03_01 copy.conf
│   ├── qc_ce_sbert_2020_03_01.conf
│   ├── qc_ce_subtoken_2020_02_25 copy.conf
│   ├── qc_ce_subtoken_2020_02_25.conf
│   ├── qc_ce_subtoken_larger_2020_02_25.conf
│   ├── qc_ce_subtoken_larger_2020_02_26 copy.conf
│   ├── qc_ce_subtoken_larger_2020_02_26.conf
│   ├── qc_lambda_2020_02_20_12_30 copy.conf
│   ├── qc_lambda_2020_02_20_12_30.conf
│   ├── qc_sbert_lambda_2020_03_02.conf
│   ├── qc_sbert_lambda_2020_03_04 copy.conf
│   ├── qc_sbert_lambda_2020_03_04.conf
│   ├── qc_sbert_lambda_2020_03_05.conf
│   ├── qc_sbert_lambda_2020_03_07 copy.conf
│   ├── qc_sbert_lambda_2020_03_07.conf
│   ├── query_code_siamese_2020_02_15_14_00 copy.conf
│   ├── query_code_siamese_2020_02_15_14_00.conf
│   ├── query_code_siamese_2020_02_17_21_30 copy.conf
│   ├── query_code_siamese_2020_02_17_21_30.conf
│   ├── query_code_siamese_2020_02_18_13_00.conf
│   ├── query_code_siamese_2020_02_19_13_00 copy.conf
│   ├── query_code_siamese_2020_02_19_13_00.conf
│   ├── query_code_siamese_albert_2020_02_18_08_30 copy.conf
│   ├── query_code_siamese_albert_2020_02_18_08_30.conf
│   └── query_code_siamese_albert_2020_02_18_14_00.conf
├── guide.md
├── main.py
├── model_predictions.csv
├── mypy.ini
├── pylama.ini
├── pyproject.toml
├── requirements.txt
├── test
│   ├── __init__.py
│   ├── conf
│   │   ├── default.conf
│   │   └── test.conf
│   └── test_recordable.py
└── wandb
    └── settings
/.env:
--------------------------------------------------------------------------------
1 | PYTHONPATH=./codenets
2 |
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # ts
2 | **/node_modules/
3 | /webroot/scripts/*.js
4 |
5 | # vim
6 | **/*.swp
7 |
8 | # python
9 | **/*.pyc
10 | **/__pycache__/
11 |
12 | # jupyter
13 | **/.ipynb_checkpoints/
14 |
15 | # data
16 | resources/
17 | !resources/README.md
18 | !tests/data/
19 | # *.csv
20 | !model_predictions.csv
21 |
22 | # environment
23 | *.ftpconfig
24 |
25 | .idea
26 | /src/wandb/run-*
27 | /src/wandb/debug.log
28 | *.html
29 |
30 | .mypy_cache
31 | *.lock
32 |
33 | wandb
34 | checkpoints
35 | pickles
36 | runs
37 | vendor
38 | build
39 | build_tokenizers
40 | codenets.egg-info/
41 |
--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.10.10
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | // "python.pythonPath": "/Users/Pascal/Library/Caches/pypoetry/virtualenvs/codenets-O5WbUhkp-py3.7/bin/python",
3 | "python.linting.lintOnSave": true,
4 | "python.linting.pylintEnabled": false,
5 | "python.linting.pylamaEnabled": true,
6 | "python.linting.mypyEnabled": true,
7 | "python.formatting.provider": "black",
8 | "python.formatting.blackArgs": ["--line-length", "120"],
9 | "[python]": {
10 | "editor.formatOnSave": true,
11 | "editor.formatOnSaveTimeout": 2000,
12 | "editor.rulers": [120]
13 | },
14 | "autoDocstring.docstringFormat": "google",
15 | "git.ignoreLimitWarning": true,
16 | "python.testing.pytestArgs": ["test"],
17 | "python.testing.unittestEnabled": false,
18 | "python.testing.pytestEnabled": true
19 | // "mypy.executable": "/Users/Pascal/Library/Caches/pypoetry/virtualenvs/codenets-O5WbUhkp-py3.7/bin/mypyls",
20 | // "mypy.targets": [
21 | // "./src"
22 | // ],
23 | }
24 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2020 Pascal Voitot
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | http://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
--------------------------------------------------------------------------------
/codenets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/__init__.py
--------------------------------------------------------------------------------
/codenets/codesearchnet/__init__.py:
--------------------------------------------------------------------------------
1 | from codenets.codesearchnet.query_1_code_1 import model, training_ctx
2 |
3 | import codenets.codesearchnet.query_1_code_1.training_ctx as single_branch_ctx
4 |
5 | # single_branch_model = model
6 | import codenets.codesearchnet.query_1_code_1.model as single_branch_model
7 |
--------------------------------------------------------------------------------
/codenets/codesearchnet/ast_build.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Usage:
 4 |     ast_build.py [options] SAVE_FOLDER TRAIN_DATA_PATH VALID_DATA_PATH TEST_DATA_PATH
 5 |     ast_build.py [options] [SAVE_FOLDER]
6 |
7 | *_DATA_PATH arguments may either accept (1) directory filled with .jsonl.gz files that we use as data,
8 | or a (2) plain text file containing a list of such directories (used for multi-language training).
9 |
10 | In the case that you supply a (2) plain text file, all directory names must be separated by a newline.
11 | For example, if you want to read from multiple directories you might have a plain text file called
12 | data_dirs_train.txt with the below contents:
13 |
14 | > cat ~/src/data_dirs_train.txt
15 | azure://semanticcodesearch/pythondata/Processed_Data/jsonl/train
16 | azure://semanticcodesearch/csharpdata/split/csharpCrawl-train
17 |
18 | Options:
19 | -h --help Show this screen.
20 | --config FILE Specify HOCON config file.
21 | --debug Enable debug routines. [default: False]
22 | """
23 |
24 | from dpu_utils.utils import run_and_debug
25 | from docopt import docopt
26 | from loguru import logger
27 | import os
28 | from pyhocon import ConfigFactory
29 |
30 |
31 | from codenets.codesearchnet.training_ctx import CodeSearchTrainingContext
32 | from codenets.codesearchnet.code_ast.ast_utils import build_language_ast
33 |
34 | """Evaluating SBert."""
35 |
36 |
37 | def run(args, tag_in_vcs=False) -> None:
38 | os.environ["WANDB_MODE"] = "dryrun"
39 |
40 | logger.debug("Building Training Context")
41 | conf_file = args["--config"]
42 | conf = ConfigFactory.parse_file(conf_file)
43 |
44 | logger.info(f"Restoring Training Context from config {conf_file}")
45 | training_ctx = CodeSearchTrainingContext.build_context_from_hocon(conf)
46 |
47 | # dirs = [Path("/home/mandubian/workspaces/tools/CodeSearchNet/resources/data/ruby/final/jsonl/valid/")]
48 | # build_language_ast("val", training_ctx.val_dirs, training_ctx.pickle_path, training_ctx.val_data_params)
49 | # build_language_ast("train", training_ctx.train_dirs, training_ctx.pickle_path, training_ctx.train_data_params)
50 | build_language_ast("test", training_ctx.test_dirs, training_ctx.pickle_path, training_ctx.test_data_params)
51 |
52 | # Language.build_library(
53 | # # Store the library in the `build` directory
54 | # "build/my-languages.so",
55 | # # Include one or more languages
56 | # [
57 | # "vendor/tree-sitter-go",
58 | # "vendor/tree-sitter-java",
59 | # "vendor/tree-sitter-javascript",
60 | # "vendor/tree-sitter-python",
61 | # "vendor/tree-sitter-php",
62 | # "vendor/tree-sitter-ruby",
63 | # ],
64 | # )
65 |
66 | # parser = Parser()
67 |
68 | # code_php = """
69 | # hasAuthentication($repositoryName)) {
73 | # $auth = $this->getAuthentication($repositoryName);
74 | # if ($auth['username'] === $username && $auth['password'] === $password) {
75 | # return;
76 | # }
77 |
78 | # $this->writeError(
79 | # sprintf(
80 | # "Warning: You should avoid overwriting already defined auth settings for %s.",
81 | # $repositoryName
82 | # )
83 | # );
84 | # }
85 | # $this->setAuthentication($repositoryName, $username, $password);
86 | # }
87 | # ?>
88 | # """
89 | # PHP_LANGUAGE = Language("build/my-languages.so", "php")
90 | # parser.set_language(PHP_LANGUAGE)
91 | # tree = parser.parse(bytes(code_php, "utf8"))
92 | # cursor = tree.walk()
93 | # print(cursor.node.sexp())
94 |
95 | # skip_node_types = ["ERROR", ""]
96 | # all_tokens_php, special_tokens_php = breadth_first_path("php", code_php, cursor, skip_node_types=skip_node_types)
97 | # print("all_tokens_php", all_tokens_php)
98 | # print("special_tokens_php", special_tokens_php)
99 |
100 | # JAVA_LANGUAGE = Language("build/my-languages.so", "java")
101 | # # parser = Parser()
102 | # parser.set_language(JAVA_LANGUAGE)
103 | # code_java = """
104 | # class A {
105 | # public int b() {
106 | # int c = 5;
107 | # }
108 | # }
109 | # """
110 | # tree = parser.parse(bytes(code_java, "utf8"))
111 | # cursor = tree.walk()
112 | # print("code_java", code_java)
113 | # print(cursor.node.sexp())
114 | # all_tokens_java, special_tokens_java = breadth_first_path(code_java, cursor)
115 | # print("all_tokens_java", all_tokens_java)
116 | # print("special_tokens_java", special_tokens_java)
117 |
118 | # print("===================================================")
119 |
120 | # PY_LANGUAGE = Language("build/my-languages.so", "python")
121 | # parser.set_language(PY_LANGUAGE)
122 | # code_python = """
123 | # def foo():
124 | # if bar:
125 | # a: List[str] = ["toto", "tata"]
126 | # baz(a, b, 5)
127 | # """
128 | # tree = parser.parse(bytes(code_python, "utf8"))
129 | # cursor = tree.walk()
130 | # print("code_python", code_python)
131 | # print(cursor.node.sexp())
132 | # all_tokens_python, special_tokens_python = breadth_first_path(code_python, cursor)
133 | # print("all_tokeall_tokens_pythonns", all_tokens_python)
134 | # print("special_tokens_python", special_tokens_python)
135 |
136 | # special_tokens = special_tokens_python.union(special_tokens_java)
137 | # print("special_tokens", special_tokens)
138 | # training_ctx.tokenizer.vocab.add_special_tokens(list(special_tokens))
139 |
140 | # print("JAVA", training_ctx.tokenize_code_sentences([" ".join(all_tokens_java)], max_length=256))
141 | # print("PYTHON", training_ctx.tokenize_code_sentences([" ".join(all_tokens_python)], max_length=256))
142 |
143 |
144 | if __name__ == "__main__":
145 | args = docopt(__doc__)
146 | run_and_debug(lambda: run(args), args["--debug"])
147 |
--------------------------------------------------------------------------------
/codenets/codesearchnet/code_ast/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/codesearchnet/code_ast/__init__.py
--------------------------------------------------------------------------------
/codenets/codesearchnet/code_ast/ast_utils.py:
--------------------------------------------------------------------------------
1 | from loguru import logger
2 | import time
3 | from typing import Dict, List, Tuple, IO, Set, Optional
4 | from pathlib import Path
5 | from tree_sitter import Language, Parser, Node
6 | import os
7 | import json
8 | from pyhocon import ConfigTree
9 |
10 | from codenets.codesearchnet.data import DatasetParams
11 | from codenets.utils import get_data_files_from_directory
12 | from codenets.codesearchnet.copied_code.utils import read_file_samples
13 |
14 |
15 | class TreeSitterParser:
16 | def __init__(
17 | self,
18 | langs: List[str],
19 | added_nodes: Dict[str, Dict[str, str]],
20 | skip_node_types: Dict[str, List[str]],
21 | vendors_path: Path = Path("./vendor"),
22 | ):
23 | super(TreeSitterParser, self).__init__()
24 |
25 | vendors = []
26 | self.added_nodes = added_nodes
27 | self.skip_node_types = skip_node_types
28 | for lang in langs:
29 | vendors.append(vendors_path / f"tree-sitter-{lang}")
30 | if lang not in added_nodes:
31 | self.added_nodes[lang] = ConfigTree([("prefix", ""), ("suffix", "")])
32 | if lang not in skip_node_types:
33 | self.skip_node_types[lang] = []
34 |
35 | Language.build_library(
36 | # Store the library in the `build` directory
37 | "build/my-languages.so",
38 | # Include one or more languages
39 | vendors,
40 | )
41 |
42 | self.parser = Parser()
43 |
44 | def repr_field_node(
45 | self, code, node, field: Optional[str] = None, skip_node_types: List[str] = []
46 | ) -> Tuple[List[str], Set[str], bool]:
47 | skip_sub_nodes = False
48 | special_tokens: Set[str] = set()
49 | rpr: List[str]
50 | if field:
51 | rpr = ["", field]
52 | special_tokens.add("")
53 | else:
54 | rpr = []
55 |
56 | if node.is_named:
57 | # no child, serialize it here
58 | if len(node.children) == 0:
59 | if node.type in skip_node_types:
60 | rpr.extend([f"{node.type}", ""])
61 | special_tokens.add("")
62 | else:
63 | rpr.extend([f"<{node.type}>", code[node.start_byte : node.end_byte], ""])
64 | special_tokens.update([f"<{node.type}>", ""])
65 |
66 | else:
67 | if node.type not in skip_node_types:
68 | rpr.extend([f"<{node.type}>", ""])
69 | special_tokens.update([f"<{node.type}>", ""])
70 | else:
71 | skip_sub_nodes = True
72 | else:
73 | if node.type not in skip_node_types:
74 | rpr.extend([f"{node.type}", ""])
75 | special_tokens.add("")
76 | else:
77 | skip_sub_nodes = True
78 |
79 | return rpr, special_tokens, skip_sub_nodes
80 |
81 | def repr_level(self, code, cursor, skip_node_types: List[str] = []):
82 | nodes: List[Node] = []
83 | all_tokens: List[str] = []
84 | special_tokens: Set[str] = set()
85 |
86 | if cursor.goto_first_child():
87 | field = cursor.current_field_name()
88 | toks, specs, skip = self.repr_field_node(code, cursor.node, field, skip_node_types=skip_node_types)
89 | all_tokens.extend(toks)
90 | special_tokens.update(specs)
91 | if not skip:
92 | nodes.append(cursor.node)
93 |
94 | while cursor.goto_next_sibling():
95 | field = cursor.current_field_name()
96 | toks, specs, skip = self.repr_field_node(code, cursor.node, field, skip_node_types=skip_node_types)
97 | all_tokens.extend(toks)
98 | special_tokens.update(specs)
99 | if not skip:
100 | nodes.append(cursor.node)
101 |
102 | all_tokens.append("")
103 | special_tokens.add("")
104 | return all_tokens, special_tokens, nodes
105 |
106 | def breadth_first_path(self, lang, code, cursor, skip_node_types: List[str] = []) -> Tuple[List[str], Set[str]]:
107 | all_tokens = [f"<{lang}>"]
108 | special_tokens = set([f"<{lang}>"])
109 | all_tokens_1, special_tokens_1, skip = self.repr_field_node(code, cursor.node, skip_node_types=skip_node_types)
110 | all_tokens.extend(all_tokens_1)
111 | special_tokens.update(special_tokens_1)
112 |
113 | if not skip:
114 | all_tokens_lvl, special_tokens_lvl, nodes = self.repr_level(code, cursor, skip_node_types=skip_node_types)
115 | all_tokens.extend(all_tokens_lvl)
116 | special_tokens.update(special_tokens_lvl)
117 |
118 | while len(nodes) > 0:
119 | node = nodes.pop(0)
120 | cursor = node.walk()
121 | all_tokens_2, special_tokens_2, nodes_2 = self.repr_level(code, cursor, skip_node_types=skip_node_types)
122 | all_tokens.extend(all_tokens_2)
123 | special_tokens.update(special_tokens_2)
124 | nodes.extend(nodes_2)
125 | all_tokens.append("")
126 | special_tokens.add("")
127 | return all_tokens, special_tokens
128 |
129 | def breadth_first_path_light(
130 | self, lang, code, cursor, skip_node_types: List[str] = [], max_tokens: Optional[int] = None
131 | ) -> List[str]:
132 | all_tokens = [f"<{lang}>"]
133 | all_tokens_1, special_tokens_1, skip = self.repr_field_node(code, cursor.node, skip_node_types=skip_node_types)
134 | all_tokens.extend(all_tokens_1)
135 |
136 | if not skip:
137 | all_tokens_lvl, special_tokens_lvl, nodes = self.repr_level(code, cursor, skip_node_types=skip_node_types)
138 | all_tokens.extend(all_tokens_lvl)
139 |
140 | while len(nodes) > 0:
141 | if max_tokens is not None and len(all_tokens) >= max_tokens:
142 | break
143 | node = nodes.pop(0)
144 | cursor = node.walk()
145 | all_tokens_2, special_tokens_2, nodes_2 = self.repr_level(code, cursor, skip_node_types=skip_node_types)
146 | all_tokens.extend(all_tokens_2)
147 | nodes.extend(nodes_2)
148 | if max_tokens is not None:
149 | all_tokens = all_tokens[: (max_tokens - 1)]
150 | all_tokens.append("")
151 | return all_tokens
152 |
153 | def parse_full(self, lang: str, code: str) -> Tuple[List[str], Set[str]]:
154 | LANGUAGE = Language("build/my-languages.so", lang)
155 | self.parser.set_language(LANGUAGE)
156 |
157 | code = f"{self.added_nodes[lang]['prefix']} {code} {self.added_nodes[lang]['suffix']}"
158 |
159 | tree = self.parser.parse(bytes(code, "utf8"))
160 | cursor = tree.walk()
161 |
162 | tokens, special_tokens = self.breadth_first_path(lang, code, cursor, skip_node_types=self.skip_node_types[lang])
163 | return tokens, special_tokens
164 |
165 | def parse(self, lang: str, code: str, max_tokens: Optional[int] = None) -> List[str]:
166 | LANGUAGE = Language("build/my-languages.so", lang)
167 | self.parser.set_language(LANGUAGE)
168 |
169 | code = f"{self.added_nodes[lang]['prefix']} {code} {self.added_nodes[lang]['suffix']}"
170 |
171 | tree = self.parser.parse(bytes(code, "utf8"))
172 | cursor = tree.walk()
173 |
174 | tokens = self.breadth_first_path_light(
175 | lang, code, cursor, skip_node_types=self.skip_node_types[lang], max_tokens=max_tokens
176 | )
177 | return tokens
178 |
179 |
180 | def load_special_tokens(data_params: DatasetParams):
181 | special_tokens: List[str] = []
182 | for f in data_params.ast_special_tokens_files:
183 | fp = open(f, "r")
184 | special_tokens.extend(json.load(fp))
185 |
186 | return special_tokens
187 |
188 |
189 | def build_language_ast(name: str, dirs: List[Path], pickle_path: Path, data_params: DatasetParams):
190 | start = time.time()
191 |
192 | if data_params.use_ast == "tree-sitter":
193 | parser = TreeSitterParser(
194 | langs=["go", "java", "javascript", "python", "php", "ruby"],
195 | added_nodes=data_params.ast_added_nodes,
196 | skip_node_types=data_params.ast_skip_node_types,
197 | )
198 |
199 | all_special_tokens: Set[str] = set()
200 |
201 | lengths: Dict[str, List[int]] = {"go": [], "java": [], "javascript": [], "python": [], "php": [], "ruby": []}
202 |
203 | for (idx, file_path) in enumerate(get_data_files_from_directory(dirs)):
204 | logger.info(f"Reading {file_path}")
205 | raw_samples = list(read_file_samples(file_path))
206 | for raw_sample in raw_samples:
207 | lang = raw_sample["language"]
208 | tokens, special_tokens = parser.parse_full(lang, raw_sample["code"])
209 |
210 | all_special_tokens.update(special_tokens)
211 |
212 | lengths[lang].append(len(tokens))
213 |
214 | end = time.time()
215 | logger.debug(f"all_special_tokens ({len(all_special_tokens)}) {all_special_tokens}")
216 |
217 | if not os.path.exists(pickle_path):
218 | os.makedirs(pickle_path)
219 |
220 | json_file = Path(pickle_path) / f"{name}_special_tokens.json"
221 | with open(json_file, "w") as f:
222 | json.dump(list(all_special_tokens), f)
223 |
224 | import statistics
225 |
226 | for lang, lgs in lengths.items():
227 | if len(lgs) > 0:
228 | max_lg = max(lgs)
229 | min_lg = min(lgs)
230 | mean_lg = statistics.mean(lgs)
231 | std_lg = statistics.stdev(lgs)
232 | logger.debug(f"{lang} [ min:{min_lg}, max:{max_lg}, mean:{mean_lg}, stddev:{std_lg} ]")
233 |
234 | time_p = end - start
235 | logger.info(f"Building AST took: {time_p} sec")
236 |
--------------------------------------------------------------------------------
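A minimal usage sketch for TreeSitterParser (not part of the repository); it assumes the tree-sitter grammars have already been cloned under ./vendor (e.g. vendor/tree-sitter-python):

    from pathlib import Path

    from codenets.codesearchnet.code_ast.ast_utils import TreeSitterParser

    # Builds build/my-languages.so from the vendored grammar, then parses one snippet.
    parser = TreeSitterParser(
        langs=["python"],
        added_nodes={},                         # no prefix/suffix wrapping around the code
        skip_node_types={"python": ["ERROR"]},  # drop parse-error nodes
        vendors_path=Path("./vendor"),
    )

    code = "def foo(a, b):\n    return a + b\n"
    tokens = parser.parse("python", code, max_tokens=128)  # linearized, breadth-first AST tokens
    print(tokens[:20])
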
/codenets/codesearchnet/copied_code/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/codesearchnet/copied_code/__init__.py
--------------------------------------------------------------------------------
/codenets/codesearchnet/copied_code/bpevocabulary.py:
--------------------------------------------------------------------------------
1 | # Code copied from https://github.com/github/CodeSearchNet for backward-compatible experimentations
2 |
3 | # Code adapted from https://github.com/soaxelbrooke/python-bpe/blob/master/bpe/encoder.py
4 | # MIT License (see repository)
5 |
6 |
7 | """
8 | An encoder which learns byte pair encodings for white-space separated text.
9 | Can tokenize, encode, and decode.
10 | """
11 | import typing
12 | from typing import Optional, Sized
13 | from collections import Counter
14 | # from dpu_utils.mlutils import Vocabulary
15 |
16 | try:
17 | from typing import Dict, Iterable, List, Iterator
18 | except ImportError:
19 | pass
20 |
21 |
22 | DEFAULT_EOW = "__eow"
23 | DEFAULT_SOW = "__sow"
24 | DEFAULT_UNK = "__unk"
25 | DEFAULT_PAD = "__pad"
26 |
27 | # pylint: disable= inherit-non-class
28 |
29 |
30 | class BpeVocabulary(Sized):
31 | """Encode white-space separated text using byte-pair encoding. See https://arxiv.org/abs/1508.07909 for details."""
32 |
33 | def __init__(
34 | self,
35 | vocab_size: int = 8192,
36 | pct_bpe: float = 0.2,
37 | ngram_min: int = 2,
38 | ngram_max: int = 8,
39 | required_tokens: Optional[Iterable[str]] = None,
40 | strict=True,
41 | EOW=DEFAULT_EOW,
42 | SOW=DEFAULT_SOW,
43 | UNK=DEFAULT_UNK,
44 | PAD=DEFAULT_PAD,
45 | ):
46 | if vocab_size < 1:
47 | raise ValueError("vocab size must be greater than 0.")
48 |
49 | self.EOW = EOW
50 | self.SOW = SOW
51 | self.eow_len = len(EOW)
52 | self.sow_len = len(SOW)
53 | self.UNK = UNK
54 | self.PAD = PAD
55 | self.required_tokens = list(set(required_tokens or []).union({self.UNK, self.PAD}))
56 | self.vocab_size = vocab_size
57 | self.pct_bpe = pct_bpe
58 | self.word_vocab_size = max([int(vocab_size * (1 - pct_bpe)), len(self.required_tokens or [])])
59 | self.bpe_vocab_size = vocab_size - self.word_vocab_size
60 | self.word_vocab = {} # type: Dict[str, int]
61 | self.bpe_vocab = {} # type: Dict[str, int]
62 | self.inverse_word_vocab = {} # type: Dict[int, str]
63 | self.inverse_bpe_vocab = {} # type: Dict[int, str]
64 | self.ngram_min = ngram_min
65 | self.ngram_max = ngram_max
66 | self.strict = strict
67 |
68 | def __len__(self):
69 | """Return vocab len"""
70 | return self.vocab_size
71 |
72 | def byte_pair_counts(self, words: Iterable[str]) -> Iterable[typing.Counter]:
73 | """
74 | Count space separated token character pairs:
 75 |         [('T h i s ', 4)] -> {'Th': 4, 'hi': 4, 'is': 4}
76 | """
77 | for token, count in self.count_tokens(words).items():
78 | bp_counts = Counter() # type: Counter
79 | sub_tokens = token.split(" ")
80 | joined_tokens = "".join(sub_tokens)
81 | token_offsets = [0]
82 | length = 0
83 | for ngram in sub_tokens:
84 | bp_counts[ngram] += count
85 | length += len(ngram)
86 | token_offsets += [length]
87 | for ngram_size in range(self.ngram_min, min(self.ngram_max, len(sub_tokens)) + 1):
88 | for i in range(len(sub_tokens) - ngram_size + 1):
89 | bp_counts[joined_tokens[token_offsets[i] : token_offsets[i + ngram_size]]] += count
90 |
91 | yield bp_counts
92 |
93 | def count_tokens(self, words: Iterable[str]) -> Dict[str, int]:
94 | """Count tokens into a BPE vocab"""
95 | token_counts = Counter(words)
96 | return {" ".join(token): count for token, count in token_counts.items()}
97 |
98 | def learn_word_vocab(self, word_counts: typing.Counter[str]) -> Dict[str, int]:
99 | """Build vocab from self.word_vocab_size most common tokens in provided sentences"""
100 | for token in set(self.required_tokens or []):
101 | word_counts[token] = int(2 ** 31)
102 | word_counts[self.PAD] = int(2 ** 32) # Make sure that PAD gets id=0
103 | sorted_word_counts = sorted(word_counts.items(), key=lambda p: -p[1])
104 | return {word: idx for idx, (word, count) in enumerate(sorted_word_counts[: self.word_vocab_size])}
105 |
106 | def learn_bpe_vocab(self, words: Iterable[str]) -> Dict[str, int]:
107 | """Learn a vocab of byte pair encodings"""
108 | vocab = Counter() # type: typing.Counter
109 | for token in {self.SOW, self.EOW}:
110 | vocab[token] = int(2 ** 63)
111 | for idx, byte_pair_count in enumerate(self.byte_pair_counts(words)):
112 | vocab.update(byte_pair_count)
113 | if (idx + 1) % 10000 == 0:
114 | self.trim_vocab(10 * self.bpe_vocab_size, vocab)
115 |
116 | sorted_bpe_counts = sorted(vocab.items(), key=lambda p: -p[1])[: self.bpe_vocab_size]
117 | return {bp: idx + self.word_vocab_size for idx, (bp, count) in enumerate(sorted_bpe_counts)}
118 |
119 | def fit(self, word_counts: typing.Counter[str]) -> None:
120 | """Learn vocab from text."""
121 |
122 | # First, learn word vocab
123 | self.word_vocab = self.learn_word_vocab(word_counts)
124 |
125 | remaining_words = Counter({word: count for word, count in word_counts.items() if word not in self.word_vocab})
126 | self.bpe_vocab = self.learn_bpe_vocab(remaining_words.elements())
127 |
128 | self.inverse_word_vocab = {idx: token for token, idx in self.word_vocab.items()}
129 | self.inverse_bpe_vocab = {idx: token for token, idx in self.bpe_vocab.items()}
130 |
131 | @staticmethod
132 | def get_unk() -> str:
133 | return DEFAULT_UNK
134 |
135 | @staticmethod
136 | def get_pad() -> str:
137 | return DEFAULT_PAD
138 |
139 | @staticmethod
140 | def trim_vocab(n: int, vocab: Dict[str, int]) -> None:
141 | """Delete all pairs below 10 * vocab size to prevent memory problems"""
142 | pair_counts = sorted(vocab.items(), key=lambda p: -p[1])
143 | pairs_to_trim = [pair for pair, count in pair_counts[n:]]
144 | for pair in pairs_to_trim:
145 | del vocab[pair]
146 |
147 | def subword_tokenize(self, word: str) -> List[str]:
148 | """Tokenize inside an unknown token using BPE"""
149 | end_idx = min([len(word), self.ngram_max])
150 | sw_tokens = [self.SOW]
151 | start_idx = 0
152 |
153 | while start_idx < len(word):
154 | subword = word[start_idx:end_idx]
155 | if subword in self.bpe_vocab:
156 | sw_tokens.append(subword)
157 | start_idx = end_idx
158 | end_idx = min([len(word), start_idx + self.ngram_max])
159 | elif len(subword) == 1:
160 | sw_tokens.append(self.UNK)
161 | start_idx = end_idx
162 | end_idx = min([len(word), start_idx + self.ngram_max])
163 | else:
164 | end_idx -= 1
165 |
166 | sw_tokens.append(self.EOW)
167 | return sw_tokens
168 |
169 | def tokenize(self, word_tokens: List[str]) -> List[str]:
170 | """Split a sentence into word and subword tokens"""
171 |
172 | tokens = []
173 | for word_token in word_tokens:
174 | if word_token in self.word_vocab:
175 | tokens.append(word_token)
176 | else:
177 | tokens.extend(self.subword_tokenize(word_token))
178 |
179 | return tokens
180 |
181 | def transform(self, sentences: Iterable[List[str]], reverse=False, fixed_length=None) -> Iterable[List[int]]:
182 | """Turn tokens into vocab idxs"""
183 | direction = -1 if reverse else 1
184 | for sentence in sentences:
185 | encoded = []
186 | tokens = list(self.tokenize(sentence))
187 | for token in tokens:
188 | if token in self.word_vocab:
189 | encoded.append(self.word_vocab[token])
190 | elif token in self.bpe_vocab:
191 | encoded.append(self.bpe_vocab[token])
192 | else:
193 | encoded.append(self.word_vocab[self.UNK])
194 |
195 | if fixed_length is not None:
196 | encoded = encoded[:fixed_length]
197 | while len(encoded) < fixed_length:
198 | encoded.append(self.word_vocab[self.PAD])
199 |
200 | yield encoded[::direction]
201 |
202 | def inverse_transform(self, rows: Iterable[List[int]]) -> Iterator[str]:
203 | """Turn token indexes back into space-joined text."""
204 | for row in rows:
205 | words = []
206 |
207 | rebuilding_word = False
208 | current_word = ""
209 | for idx in row:
210 | if self.inverse_bpe_vocab.get(idx) == self.SOW:
211 | if rebuilding_word and self.strict:
212 | raise ValueError("Encountered second SOW token before EOW.")
213 | rebuilding_word = True
214 |
215 | elif self.inverse_bpe_vocab.get(idx) == self.EOW:
216 | if not rebuilding_word and self.strict:
217 | raise ValueError("Encountered EOW without matching SOW.")
218 | rebuilding_word = False
219 | words.append(current_word)
220 | current_word = ""
221 |
222 | elif rebuilding_word and (idx in self.inverse_bpe_vocab):
223 | current_word += self.inverse_bpe_vocab[idx]
224 |
225 | elif rebuilding_word and (idx in self.inverse_word_vocab):
226 | current_word += self.inverse_word_vocab[idx]
227 |
228 | elif idx in self.inverse_word_vocab:
229 | words.append(self.inverse_word_vocab[idx])
230 |
231 | elif idx in self.inverse_bpe_vocab:
232 | if self.strict:
233 | raise ValueError("Found BPE index {} when not rebuilding word!".format(idx))
234 | else:
235 | words.append(self.inverse_bpe_vocab[idx])
236 |
237 | else:
238 | raise ValueError("Got index {} that was not in word or BPE vocabs!".format(idx))
239 |
240 | yield " ".join(w for w in words if w != "")
241 |
--------------------------------------------------------------------------------
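A self-contained sketch (not from the repository) of how BpeVocabulary is meant to be used: fit it on a token Counter, then encode and decode pre-tokenized sentences:

    from collections import Counter

    from codenets.codesearchnet.copied_code.bpevocabulary import BpeVocabulary

    word_counts = Counter(["def", "foo", "return", "foo", "bar", "barbaz"])
    vocab = BpeVocabulary(vocab_size=64, pct_bpe=0.5)
    vocab.fit(word_counts)

    # transform() expects pre-tokenized sentences and yields padded id sequences
    encoded = list(vocab.transform([["def", "barbaz"]], fixed_length=8))
    print(encoded)
    print(list(vocab.inverse_transform(encoded)))  # back to space-joined text
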
/codenets/codesearchnet/copied_code/metadata.py:
--------------------------------------------------------------------------------
1 | # Code partially copied and adapted from https://github.com/github/CodeSearchNet for backward-compatible experimentations
2 |
3 | from collections import defaultdict
4 |
5 | from typing import DefaultDict, Dict, Iterable, List, Optional, Tuple
6 |
7 | from dpu_utils.mlutils import Vocabulary
8 |
9 | from dpu_utils.utils import RichPath
10 |
11 | from codenets.codesearchnet.copied_code.bpevocabulary import BpeVocabulary
12 | from codenets.codesearchnet.copied_code.utils import run_jobs_in_parallel
13 |
14 | from dataclasses import dataclass
15 | from pathlib import Path
16 | from collections import Counter
17 | from dataclasses import field
18 | from enum import Enum
19 |
20 | from codenets.utils import _to_subtoken_stream, get_data_files_from_directory
21 |
22 |
23 | class QueryType(Enum):
24 | DOCSTRING = "docstring_as_query"
25 | FUNCTION_NAME = "func_name_as_query"
26 |
27 |
28 | @dataclass
29 | class Metadata:
30 | token_counter: Counter = field(default_factory=Counter)
31 | token_vocab: Optional[BpeVocabulary] = None
32 | common_tokens: List[Tuple[str, int]] = field(default_factory=list)
33 |
34 |
35 | def load_metadata_from_sample(
36 | data_to_load: Iterable[str], raw_metadata: Metadata, use_subtokens: bool = False, mark_subtoken_end: bool = False
37 | ) -> Metadata:
38 | if use_subtokens:
39 | data_to_load = _to_subtoken_stream(data_to_load, mark_subtoken_end=mark_subtoken_end)
40 | # raw_metadata["token_counter"].update(data_to_load)
41 | raw_metadata.token_counter.update(data_to_load)
42 | return raw_metadata
43 |
44 |
45 | def append_metadata(
46 | encoder_label: str,
47 | vocab_size: int,
48 | vocab_count_threshold: int,
49 | # use_bpe: bool,
50 | pct_bpe: float,
51 | raw_metadata_list: List[Metadata],
52 | ) -> Metadata:
53 | merged_token_counter: Counter = Counter()
54 | for raw_metadata in raw_metadata_list:
55 | # merged_token_counter += raw_metadata["token_counter"]
56 | merged_token_counter += raw_metadata.token_counter
57 |
58 | # if hyperparameters["%s_use_bpe" % encoder_label]:
59 | # token_vocabulary: Vocabulary
60 | # if use_bpe:
61 | token_vocabulary = BpeVocabulary(
62 | # vocab_size=hyperparameters["%s_token_vocab_size" % encoder_label],
63 | vocab_size=vocab_size,
64 | # pct_bpe=hyperparameters["%s_pct_bpe" % encoder_label],
65 | pct_bpe=pct_bpe,
66 | )
67 | token_vocabulary.fit(merged_token_counter)
68 | # else:
69 | # token_vocabulary = Vocabulary.create_vocabulary(
70 | # tokens=merged_token_counter,
71 | # # max_size=hyperparameters["%s_token_vocab_size" % encoder_label],
72 | # max_size=vocab_size,
73 | # # count_threshold=hyperparameters["%s_token_vocab_count_threshold" % encoder_label],
74 | # count_threshold=vocab_count_threshold,
75 | # )
76 |
77 | # final_metadata["token_vocab"] = token_vocabulary
78 | # Save the most common tokens for use in data augmentation:
79 | # final_metadata["common_tokens"] = merged_token_counter.most_common(50)
80 | final_metadata = Metadata(
81 | token_vocab=token_vocabulary,
82 | token_counter=merged_token_counter,
83 | common_tokens=merged_token_counter.most_common(50),
84 | )
85 | return final_metadata
86 |
87 |
88 | def build_tokenizer_metadata(
89 | data_dirs: List[Path],
90 | max_files_per_dir: Optional[int] = None,
91 | parallelize: bool = True,
92 | use_subtokens: bool = False,
93 | mark_subtoken_end: bool = False,
94 | ) -> Tuple[List[Metadata], Dict[str, List[Metadata]]]:
95 | raw_query_metadata_list = []
96 | raw_code_language_metadata_lists: DefaultDict[str, List] = defaultdict(list)
97 |
98 | def metadata_parser_fn(_, file_path: Path) -> Iterable[Tuple[Metadata, Dict[str, Metadata]]]:
99 | raw_query_metadata = Metadata()
100 | per_code_language_metadata: DefaultDict[str, Metadata] = defaultdict(Metadata)
101 |
102 | for raw_sample in RichPath.create(str(file_path)).read_by_file_suffix():
103 | sample_language = raw_sample["language"]
104 | per_code_language_metadata[sample_language] = load_metadata_from_sample(
105 | data_to_load=raw_sample["code_tokens"],
106 | raw_metadata=per_code_language_metadata[sample_language],
107 | use_subtokens=use_subtokens,
108 | mark_subtoken_end=mark_subtoken_end,
109 | )
110 |
111 | raw_query_metadata = load_metadata_from_sample(
112 | data_to_load=[d.lower() for d in raw_sample["docstring_tokens"]],
113 | raw_metadata=raw_query_metadata,
114 | use_subtokens=use_subtokens,
115 | mark_subtoken_end=mark_subtoken_end,
116 | )
117 | yield (raw_query_metadata, per_code_language_metadata)
118 |
119 | def received_result_callback(metadata_parser_result: Tuple[Metadata, Dict[str, Metadata]]):
120 | (raw_query_metadata, per_code_language_metadata) = metadata_parser_result
121 | raw_query_metadata_list.append(raw_query_metadata)
122 | for (metadata_language, raw_code_language_metadata) in per_code_language_metadata.items():
123 | raw_code_language_metadata_lists[metadata_language].append(raw_code_language_metadata)
124 |
125 | def finished_callback():
126 | pass
127 |
128 | if parallelize:
129 | run_jobs_in_parallel(
130 | get_data_files_from_directory(data_dirs, max_files_per_dir),
131 | metadata_parser_fn,
132 | received_result_callback,
133 | finished_callback,
134 | )
135 | else:
136 | for (idx, file) in enumerate(get_data_files_from_directory(data_dirs, max_files_per_dir)):
137 | for res in metadata_parser_fn(idx, file):
138 | received_result_callback(res)
139 |
140 | return raw_query_metadata_list, raw_code_language_metadata_lists
141 |
--------------------------------------------------------------------------------
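A hedged sketch (not in the repository) of the metadata pipeline defined above: scan a directory of .jsonl.gz files, then merge the per-file Metadata into a fitted BPE vocabulary. The data directory and hyperparameter values are illustrative assumptions:

    from pathlib import Path

    from codenets.codesearchnet.copied_code.metadata import append_metadata, build_tokenizer_metadata

    query_meta, code_meta_per_lang = build_tokenizer_metadata(
        data_dirs=[Path("resources/data/ruby/final/jsonl/train")],  # assumed CodeSearchNet layout
        max_files_per_dir=2,
        parallelize=False,  # single-process keeps the sketch simple
    )

    query_metadata = append_metadata(
        encoder_label="query",
        vocab_size=10000,
        vocab_count_threshold=10,
        pct_bpe=0.5,
        raw_metadata_list=query_meta,
    )
    print(query_metadata.common_tokens[:10])
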
/codenets/codesearchnet/copied_code/utils.py:
--------------------------------------------------------------------------------
1 | # Code copied from https://github.com/github/CodeSearchNet for backward-compatible experimentations
2 |
3 | import multiprocessing
4 | from typing import List, Iterable, Callable, TypeVar, Dict, Any, Union
5 | from dpu_utils.utils import RichPath
6 | from pathlib import Path
7 |
8 | JobType = TypeVar("JobType")
9 | ResultType = TypeVar("ResultType")
10 |
11 |
12 | def read_file_samples(file_path: Union[Path, str]) -> List[Dict[str, Any]]:
13 | return RichPath.create(str(file_path)).read_by_file_suffix()
14 |
15 |
16 | def __parallel_queue_worker(
17 | worker_id: int,
18 | job_queue: multiprocessing.Queue,
19 | result_queue: multiprocessing.Queue,
20 | worker_fn: Callable[[int, JobType], Iterable[ResultType]],
21 | ):
22 | while True:
23 | job = job_queue.get()
24 |
25 | # "None" is the signal for last job, put that back in for other workers and stop:
26 | if job is None:
27 | job_queue.put(job)
28 | break
29 |
30 | for result in worker_fn(worker_id, job):
31 | result_queue.put(result)
32 | result_queue.put(None)
33 |
34 |
35 | def run_jobs_in_parallel(
36 | all_jobs: List[JobType],
37 | worker_fn: Callable[[int, JobType], Iterable[ResultType]],
38 | received_result_callback: Callable[[ResultType], None],
39 | finished_callback: Callable[[], None],
40 | result_queue_size: int = 100,
41 | ) -> None:
42 | """
 43 |     Runs jobs in parallel and uses callbacks to collect results.
 44 |     :param all_jobs: Job descriptions; one at a time will be passed into worker_fn.
45 | :param worker_fn: Worker function receiving a job; many copies may run in parallel.
46 | Can yield results, which will be processed (one at a time) by received_result_callback.
47 | :param received_result_callback: Called when a result was produced by any worker. Only one will run at a time.
48 | :param finished_callback: Called when all jobs have been processed.
49 | """
50 | job_queue: multiprocessing.Queue = multiprocessing.Queue(len(all_jobs) + 1)
51 | for job in all_jobs:
52 | job_queue.put(job)
53 | job_queue.put(None) # Marker that we are done
54 |
55 | # This will hold the actual results:
56 | result_queue: multiprocessing.Queue = multiprocessing.Queue(result_queue_size)
57 |
58 | # Create workers:
59 | num_workers = multiprocessing.cpu_count() - 1
60 | workers = [
61 | multiprocessing.Process(target=__parallel_queue_worker, args=(worker_id, job_queue, result_queue, worker_fn))
62 | for worker_id in range(num_workers)
63 | ]
64 | for worker in workers:
65 | worker.start()
66 |
67 | num_workers_finished = 0
68 | while True:
69 | result = result_queue.get()
70 | if result is None:
71 | num_workers_finished += 1
72 | if num_workers_finished == len(workers):
73 | finished_callback()
74 | break
75 | else:
76 | received_result_callback(result)
77 |
78 | for worker in workers:
79 | worker.join()
80 |
--------------------------------------------------------------------------------
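A minimal, self-contained sketch (not part of the repository) of the callback protocol run_jobs_in_parallel expects: the worker yields results per job, and the parent process collects them through the callbacks:

    from codenets.codesearchnet.copied_code.utils import run_jobs_in_parallel

    def square_worker(worker_id, job):
        # Worker function: may yield any number of results per job.
        yield job * job

    results = []

    def on_result(res):
        results.append(res)  # runs in the parent process only

    def on_finished():
        print("all jobs done:", sorted(results))

    if __name__ == "__main__":
        run_jobs_in_parallel(
            all_jobs=list(range(10)),
            worker_fn=square_worker,
            received_result_callback=on_result,
            finished_callback=on_finished,
        )
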
/codenets/codesearchnet/data.py:
--------------------------------------------------------------------------------
1 | # This code is nearly 100% copied from original repo
2 |
 3 | from dataclasses import dataclass, fields as datafields, is_dataclass
4 | import numpy as np
5 |
6 | from typing import Dict, TypeVar, List
7 | from dataclasses import field
8 |
9 |
10 | @dataclass
11 | class DatasetParams:
12 | """Description of parameters of a CodeSearchnet dataset"""
13 |
14 | fraction_using_func_name: float
15 | min_len_func_name_for_query: int
16 | use_subtokens: bool
17 | mark_subtoken_end: bool
18 | code_max_num_tokens: int
19 | query_max_num_tokens: int
20 | use_bpe: bool
21 | vocab_size: int
22 | pct_bpe: float
23 | vocab_count_threshold: int
24 | lang_ids: Dict[str, int]
25 | do_lowercase: bool
26 | special_tokens: List[str]
27 | parallelize: bool
28 | use_lang_weights: bool = False # for backward compat
29 | query_random_token_frequency: float = 0.2
30 | query_embeddings: str = "none"
31 | use_ast: str = "none"
32 | ast_added_nodes: Dict[str, Dict[str, str]] = field(default_factory=dict)
33 | ast_skip_node_types: Dict[str, List[str]] = field(default_factory=dict)
34 | ast_special_tokens_files: List[str] = field(default_factory=list)
35 |
36 |
37 | T_InputFeatures = TypeVar("T_InputFeatures", bound="InputFeatures")
38 |
39 |
40 | @dataclass
41 | class InputFeatures:
42 | """Structure gathering query and code tokens/mask after passing through tokenizer"""
43 |
44 | language: int
45 | similarity: float
46 | query_tokens: np.ndarray
47 | query_tokens_mask: np.ndarray
48 |
49 | query_docstring_tokens: np.ndarray
50 | query_docstring_tokens_mask: np.ndarray
51 |
52 | code_tokens: np.ndarray
53 | code_tokens_mask: np.ndarray
54 |
55 |
56 | def dataclass_from_dict(klass, dikt):
 57 |     """Load any dataclass from a dict; non-dataclass fields are passed through unchanged"""
 58 |     fieldtypes = {f.name: f.type for f in datafields(klass)}
 59 |     return klass(**{f: dataclass_from_dict(fieldtypes[f], dikt[f]) if is_dataclass(fieldtypes[f]) else dikt[f] for f in dikt})
60 |
--------------------------------------------------------------------------------
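An illustrative sketch (the values are made up) of loading DatasetParams from a plain dict, which is what dataclass_from_dict is for:

    from codenets.codesearchnet.data import DatasetParams, dataclass_from_dict

    params_dict = {
        "fraction_using_func_name": 0.1,
        "min_len_func_name_for_query": 12,
        "use_subtokens": False,
        "mark_subtoken_end": False,
        "code_max_num_tokens": 200,
        "query_max_num_tokens": 30,
        "use_bpe": True,
        "vocab_size": 10000,
        "pct_bpe": 0.5,
        "vocab_count_threshold": 10,
        "lang_ids": {"python": 0, "java": 1},
        "do_lowercase": True,
        "special_tokens": [],
        "parallelize": False,
    }
    params = dataclass_from_dict(DatasetParams, params_dict)
    print(params.code_max_num_tokens, params.lang_ids)
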
/codenets/codesearchnet/dataset_main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Main to test dataset loading
4 |
5 | Usage:
6 | dataset_main.py [options]
7 | dataset_main.py [options]
8 |
9 | Options:
10 | -h --help Show this screen.
11 | --config FILE Specify HOCON config file. [default: ./conf/default.conf]
12 | --debug Enable debug routines. [default: False]
13 | """
14 |
15 |
16 | from docopt import docopt
17 | from loguru import logger
18 | import sys
19 | import torch
20 | import itertools
21 | from dpu_utils.utils import run_and_debug
22 | from pyhocon import ConfigFactory, ConfigTree
23 | from torch.utils.data import DataLoader
24 | from codenets.codesearchnet.dataset_utils import BalancedBatchSchedulerSampler, DatasetType
25 | from codenets.codesearchnet.training_ctx import CodeSearchTrainingContext
26 |
27 |
28 | print("Torch version", torch.__version__)
29 |
30 | logger.remove()
31 | logger.add(sys.stderr, level="DEBUG", colorize=True, backtrace=False)
32 |
33 |
34 | def run(args, tag_in_vcs=False) -> None:
35 | conf_file = args["--config"]
36 | logger.info(f"config file {conf_file}")
37 |
38 | conf: ConfigTree = ConfigFactory.parse_file(conf_file)
39 | logger.info(f"config {conf}")
40 | logger.info(f"Build Training Context from config {conf_file}")
41 | training_ctx = CodeSearchTrainingContext.build_context_from_hocon(conf)
42 |
43 | train_dataset = training_ctx.build_lang_dataset(DatasetType.TRAIN)
44 | train_dataloader = DataLoader(
45 | dataset=train_dataset,
46 | batch_size=conf["training.batch_size.train"],
47 | sampler=BalancedBatchSchedulerSampler(dataset=train_dataset, batch_size=conf["training.batch_size.train"]),
48 | )
 49 |     logger.info(f"train_dataloader [{len(train_dataloader)} batches]")
50 |
51 | for batch in itertools.islice(train_dataloader, 5):
52 | logger.info(f"batch {batch}")
53 |
54 | # val_dataset = training_ctx.build_lang_dataset(DatasetType.VAL)
55 | # val_dataloader = DataLoader(
56 | # dataset=val_dataset,
57 | # batch_size=conf["training.batch_size.val"],
58 | # sampler=BalancedBatchSchedulerSampler(dataset=val_dataset, batch_size=conf["training.batch_size.val"]),
59 | # )
60 | # logger.info(f"val_dataloader [{len(val_dataloader)} samples]")
61 |
62 | # for batch in itertools.islice(val_dataloader, 5):
63 | # logger.info(f"batch {batch}")
64 |
65 |
66 | if __name__ == "__main__":
67 | args = docopt(__doc__)
68 | run_and_debug(lambda: run(args), args["--debug"])
69 |
--------------------------------------------------------------------------------
/codenets/codesearchnet/eval.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Usage:
4 | eval.py [options] SAVE_FOLDER TRAIN_DATA_PATH VALID_DATA_PATH TEST_DATA_PATH
5 | eval.py [options] [SAVE_FOLDER]
6 |
7 | *_DATA_PATH arguments may either accept (1) directory filled with .jsonl.gz files that we use as data,
8 | or a (2) plain text file containing a list of such directories (used for multi-language training).
9 |
10 | In the case that you supply a (2) plain text file, all directory names must be separated by a newline.
11 | For example, if you want to read from multiple directories you might have a plain text file called
12 | data_dirs_train.txt with the below contents:
13 |
14 | > cat ~/src/data_dirs_train.txt
15 | azure://semanticcodesearch/pythondata/Processed_Data/jsonl/train
16 | azure://semanticcodesearch/csharpdata/split/csharpCrawl-train
17 |
18 | Options:
19 | -h --help Show this screen.
20 | --restore DIR specify restoration dir. [optional]
21 | --debug Enable debug routines. [default: False]
22 | """
23 |
24 | import os
25 | import torch
26 | from docopt import docopt
27 | from dpu_utils.utils import run_and_debug
28 | from loguru import logger
29 | from tqdm import tqdm
30 |
31 | from torch.utils.data import DataLoader
32 |
33 | # from codenets.codesearchnet.single_branch_ctx import SingleBranchTrainingContext
34 | from codenets.codesearchnet.dataset_utils import BalancedBatchSchedulerSampler, DatasetType
35 | from codenets.codesearchnet.training_ctx import (
36 | CodeSearchTrainingContext,
37 | compute_loss_mrr,
38 | TotalLoss,
39 | TotalMrr,
40 | TotalSize,
41 | BatchSize,
42 | BatchLoss,
43 | )
44 |
45 |
46 | def run(args, tag_in_vcs=False) -> None:
47 | os.environ["WANDB_MODE"] = "dryrun"
48 |
49 | logger.debug("Building Training Context")
50 | training_ctx: CodeSearchTrainingContext
51 | restore_dir = args["--restore"]
 52 |     logger.info(f"Restoring Training Context from directory {restore_dir}")
53 | training_ctx = CodeSearchTrainingContext.build_context_from_dir(restore_dir)
54 |
55 | # Build Val Dataloader
56 | # val_dataset = training_ctx.build_lang_dataset(DatasetType.VAL)
57 | # val_dataloader = DataLoader(
58 | # dataset=val_dataset,
59 | # batch_size=training_ctx.val_batch_size,
60 | # sampler=BalancedBatchSchedulerSampler(dataset=val_dataset, batch_size=training_ctx.val_batch_size),
61 | # )
62 | # logger.info(f"Built val_dataloader [Length:{len(val_dataloader)} x Batch:{training_ctx.val_batch_size}]")
63 |
64 | # Build Test Dataloader
65 | test_dataset = training_ctx.build_lang_dataset(DatasetType.TEST)
66 | test_dataloader = DataLoader(
67 | dataset=test_dataset,
 68 |         batch_size=training_ctx.test_batch_size,
69 | sampler=BalancedBatchSchedulerSampler(dataset=test_dataset, batch_size=training_ctx.test_batch_size),
70 | )
71 | logger.info(f"Built test_dataloader [Length:{len(test_dataloader)} x Batch:{training_ctx.test_batch_size}]")
72 |
73 | total_loss = TotalLoss(0.0)
74 | total_size = TotalSize(0)
75 | total_mrr = TotalMrr(0.0)
76 | training_ctx.eval_mode()
77 | with torch.no_grad():
78 | training_ctx.zero_grad()
79 | with tqdm(total=len(test_dataloader)) as t_batch:
80 | for batch_idx, batch in enumerate(test_dataloader):
81 | languages, similarity, query_tokens, query_tokens_mask, code_tokens, code_tokens_mask = [
82 | t.to(training_ctx.device) for t in batch
83 | ]
84 |
85 | batch_total_loss, similarity_scores = training_ctx.forward(batch, batch_idx)
86 |
87 | batch_size = BatchSize(batch[0].size()[0])
88 | batch_loss = BatchLoss(batch_total_loss.item())
89 | total_loss, avg_loss, total_mrr, avg_mrr, total_size = compute_loss_mrr(
90 | similarity_scores, batch_loss, batch_size, total_loss, total_mrr, total_size
91 | )
92 | # languages=languages,
93 | # query_tokens=query_tokens,
94 | # query_tokens_mask=query_tokens_mask,
95 | # code_tokens=code_tokens,
96 | # code_tokens_mask=code_tokens_mask,
97 | # )
98 | # batch_total_losses, similarity_scores = training_ctx.losses_scores_fn(
99 | # query_embedding, code_embedding, similarity
100 | # )
101 | # batch_total_loss = torch.mean(batch_total_losses)
102 |
103 | # nb_samples = batch[0].size()[0]
104 |
105 | # # compute MRR
106 | # # extract the logits from the diagonal of the matrix, which are the logits corresponding to the ground-truth
107 | # correct_scores = similarity_scores.diagonal()
108 | # # compute how many queries have bigger logits than the ground truth (the diagonal)
109 | # # the elements that are incorrectly ranked
110 | # compared_scores = similarity_scores.ge(correct_scores.unsqueeze(dim=-1)).float()
111 | # compared_scores_nb = torch.sum(compared_scores, dim=1)
112 | # per_sample_mrr = torch.div(1.0, compared_scores_nb)
113 | # per_batch_mrr = torch.sum(per_sample_mrr) / nb_samples
114 |
115 | # epoch_samples += nb_samples
116 | # epoch_loss += batch_total_loss.item() * nb_samples
117 | # loss = epoch_loss / max(1, epoch_samples)
118 |
119 | # mrr_sum += per_batch_mrr.item() * nb_samples
120 | # mrr = mrr_sum / max(1, epoch_samples)
121 |
122 | t_batch.set_postfix({f"loss": f"{batch_total_loss.item():10}"})
123 | t_batch.update(1)
124 |
125 | logger.info(
126 | f"total_loss:{total_loss}, avg_loss:{avg_loss}, total_mrr:{total_mrr}, avg_mrr:{avg_mrr}, total_size:{total_size}"
127 | )
128 |
129 |
130 | if __name__ == "__main__":
131 | args = docopt(__doc__)
132 | run_and_debug(lambda: run(args), args["--debug"])
133 |
--------------------------------------------------------------------------------
/codenets/codesearchnet/huggingface/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/codesearchnet/huggingface/__init__.py
--------------------------------------------------------------------------------
/codenets/codesearchnet/huggingface/models.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os
4 | from pathlib import Path
5 | from typing import Union, TypeVar, Type, Generic
6 | from loguru import logger
7 | from transformers import PreTrainedModel
8 |
9 | from codenets.recordable import RecordableTorchModule
10 | from codenets.utils import full_classname, instance_full_classname, runtime_import
11 |
12 |
13 | PretrainedRec_T = TypeVar("PretrainedRec_T", bound="PreTrainedModelRecordable")
14 | Pretrained_T = TypeVar("Pretrained_T", bound="PreTrainedModel")
15 |
16 |
17 | class PreTrainedModelRecordable(Generic[Pretrained_T], RecordableTorchModule):
18 | """
19 | Wrap any generic HuggingFace PreTrainedModel as a Recordable Torch module
20 | equipped with load/save
21 | """
22 |
23 | def __init__(self, model: Pretrained_T):
24 | super().__init__()
25 | self.model = model
26 |
27 | def save(self, output_dir: Union[Path, str]) -> bool:
28 | full_dir = Path(output_dir) / instance_full_classname(self) / instance_full_classname(self.model)
29 | logger.info(f"Saving HuggingFace model to {full_dir}")
30 | os.makedirs(full_dir, exist_ok=True)
31 | self.model.save_pretrained(full_dir)
32 | return True
33 |
34 | @classmethod
35 | def load(cls: Type[PretrainedRec_T], restore_dir: Union[Path, str]) -> PretrainedRec_T:
36 | full_dir = Path(restore_dir) / full_classname(cls)
37 | logger.info(f"Loading HuggingFace Pretrained model from {full_dir}")
38 | _, dirs, _ = list(os.walk(full_dir))[0]
39 | model_cls_name = dirs[0]
40 | logger.info(f"Loading HuggingFace {model_cls_name} model from {full_dir}/{model_cls_name}")
41 | klass = runtime_import(model_cls_name)
42 | assert issubclass(klass, PreTrainedModel)
43 |
44 | model = klass.from_pretrained(str(full_dir / model_cls_name))
45 |
46 | return cls(model)
47 |
48 | def forward(self, *args, **kwargs):
49 | return self.model.forward(*args, **kwargs)
--------------------------------------------------------------------------------
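A hedged sketch (not from the repository) of the save/load round trip PreTrainedModelRecordable provides; the checkpoint directory and model name are assumptions:

    from transformers import BertModel

    from codenets.codesearchnet.huggingface.models import PreTrainedModelRecordable

    rec = PreTrainedModelRecordable(BertModel.from_pretrained("bert-base-uncased"))
    rec.save("./checkpoints/demo")  # writes under <wrapper classname>/<model classname>

    restored = PreTrainedModelRecordable.load("./checkpoints/demo")
    print(type(restored.model).__name__)  # BertModel
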
/codenets/codesearchnet/huggingface/tokenizer_recs.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | from typing import Iterable, List, Optional, Tuple, Union, Dict, Callable, IO
3 | import numpy as np
4 | import os
5 | from loguru import logger
6 | from pathlib import Path
7 | from transformers import PreTrainedTokenizer, BertTokenizer
8 |
9 | from tokenizers import CharBPETokenizer, Encoding
10 |
11 | from codenets.recordable import instance_full_classname, full_classname
12 | from codenets.codesearchnet.data import DatasetParams
13 | from codenets.codesearchnet.tokenizer_recs import TokenizerRecordable
14 | from codenets.codesearchnet.copied_code.utils import read_file_samples
15 | from codenets.utils import get_data_files_from_directory
16 | from codenets.codesearchnet.training_ctx import default_sample_update
17 |
18 |
19 | class PreTrainedTokenizerRecordable(TokenizerRecordable):
20 | def __init__(self, vocab: PreTrainedTokenizer):
21 | self.vocab = vocab
22 |
23 | def tokenize(self, text: str, **kwargs) -> List[str]:
24 | return self.vocab.tokenize(text)
25 |
26 | def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
27 | return self.vocab.convert_tokens_to_ids(tokens)
28 |
29 | def unk_token(self) -> str:
30 | return self.vocab.unk_token()
31 |
32 | def encode_sentence(self, sentence: str, max_length: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]:
33 | encoded = self.vocab.encode_plus(
34 | sentence,
35 | max_length=max_length,
36 | pad_to_max_length=max_length is not None,
37 | return_token_type_ids=False,
38 | return_attention_mask=True,
39 | )
40 | token_ids = np.array(encoded["input_ids"])
41 | token_mask = np.array(encoded["attention_mask"])
42 | return token_ids, token_mask
43 |
44 | def encode_sentences(
45 | self, sentences: List[str], max_length: Optional[int] = None
46 | ) -> Tuple[np.ndarray, np.ndarray]:
47 | encoded = self.vocab.batch_encode_plus(
48 | sentences,
49 | max_length=max_length,
50 | pad_to_max_length=max_length is not None,
51 | return_token_type_ids=False,
52 | return_attention_mask=True,
53 | )
54 | token_ids = np.array(encoded["input_ids"])
55 | token_mask = np.array(encoded["attention_mask"])
56 | return (token_ids, token_mask)
57 |
58 | def encode_tokens(
59 | self, tokens: Iterable[List[str]], max_length: Optional[int] = None
60 | ) -> Tuple[np.ndarray, np.ndarray]:
61 | encoded = self.vocab(
62 | tokens,
63 | max_length=max_length,
64 | pad_to_max_length=max_length is not None,
65 | return_token_type_ids=False,
66 | return_attention_mask=True,
67 | )
68 | token_ids = np.array(encoded["input_ids"])
69 | token_mask = np.array(encoded["attention_mask"])
70 | return (token_ids, token_mask)
71 |
72 | def decode_sequence(self, tokens_sequence: List[int]) -> str:
73 | return self.vocab.decode(tokens_sequence)
74 |
75 | def decode_sequences(self, tokens_sequences: Iterable[List[int]]) -> List[str]:
76 | return self.vocab.decode_batch(tokens_sequences)
77 |
78 | def add_special_tokens(self, special_tokens: List[str]) -> bool:
79 | self.vocab.add_special_tokens(special_tokens)
80 | return True
81 |
82 |
83 | class BertTokenizerRecordable(PreTrainedTokenizerRecordable):
84 | def __init__(self, vocab: BertTokenizer):
85 | super(BertTokenizerRecordable, self).__init__(vocab)
86 |
87 | def save(self, output_dir: Union[Path, str]) -> bool:
88 | full_dir = Path(output_dir) / instance_full_classname(self)
89 | logger.debug(f"Saving BertTokenizerRecordable to {full_dir}")
90 | os.makedirs(full_dir, exist_ok=True)
91 | self.vocab.save_pretrained(full_dir)
92 | return True
93 |
94 | @classmethod
95 | def load(cls, restore_dir: Union[Path, str]) -> "BertTokenizerRecordable":
96 | full_dir = Path(restore_dir) / full_classname(cls)
97 | logger.debug(f"Loading BertTokenizerRecordable from {full_dir}")
98 | vocab = BertTokenizer.from_pretrained(str(full_dir))
99 | return BertTokenizerRecordable(vocab)
100 |
101 |
102 | class HuggingfaceBPETokenizerRecordable(TokenizerRecordable):
103 | def __init__(self, tokenizer: CharBPETokenizer):
104 | self.tokenizer = tokenizer
105 |
106 | def tokenize(self, text: str, **kwargs) -> List[str]:
107 | return self.tokenizer.encode(text).tokens
108 |
109 | def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
110 | return [self.tokenizer.token_to_id(tok) for tok in tokens]
111 |
112 | def unk_token(self) -> str:
113 |         # the underlying CharBPETokenizer does not expose its unk token, so return an empty string
114 |         return ""
115 |
116 | # def pad_token(self) -> str:
117 | # return self.vocab.pad_token()
118 |
119 | def encode_sentence(self, sentence: str, max_length: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]:
120 | enc: Encoding = self.tokenizer.encode(sentence)
121 | if max_length is not None:
122 | enc.truncate(max_length)
123 | enc.pad(max_length)
124 | return np.array(enc.ids), np.array(enc.attention_mask)
125 |
126 | def encode_sentences(
127 | self, sentences: List[str], max_length: Optional[int] = None
128 | ) -> Tuple[np.ndarray, np.ndarray]:
129 | encs = self.tokenizer.encode_batch(sentences)
130 | if max_length is not None:
131 | for enc in encs:
132 | enc.truncate(max_length)
133 | enc.pad(max_length)
134 | # tokens_ids = [np.array(enc.ids) for enc in encs]
135 | # attention_mask = [np.array(enc.attention_mask) for enc in encs]
136 | tokens_ids = [enc.ids for enc in encs]
137 | attention_mask = [enc.attention_mask for enc in encs]
138 | return (np.array(tokens_ids), np.array(attention_mask))
139 |
140 | def encode_tokens(
141 | self, tokens: Iterable[List[str]], max_length: Optional[int] = None
142 | ) -> Tuple[np.ndarray, np.ndarray]:
143 | # hack...
144 | sentences = [" ".join(toks) for toks in tokens]
145 | return self.encode_sentences(sentences, max_length)
146 |
147 | def decode_sequence(self, tokens_sequence: List[int]) -> str:
148 | return self.tokenizer.decode(tokens_sequence)
149 |
150 | def decode_sequences(self, tokens_sequences: Iterable[List[int]]) -> List[str]:
151 | return self.tokenizer.decode_batch(tokens_sequences)
152 |
153 | def save(self, output_dir: Union[Path, str]) -> bool:
154 | full_dir = Path(output_dir) / instance_full_classname(self)
155 | logger.debug(f"HuggingfaceBPETokenizerRecordable - Saving to {full_dir}")
156 | os.makedirs(full_dir, exist_ok=True)
157 |
158 | self.tokenizer._tokenizer.model.save(str(full_dir), name=str(instance_full_classname(self)))
159 | return True
160 |
161 | @classmethod
162 | def load(cls, restore_dir: Union[Path, str]) -> HuggingfaceBPETokenizerRecordable:
163 | full_dir = Path(restore_dir) / full_classname(cls)
164 | logger.debug(f"HuggingfaceBPETokenizerRecordable - Loading from {full_dir}")
165 | vocab = str(full_dir / f"{full_classname(cls)}-vocab.json")
166 | merges = str(full_dir / f"{full_classname(cls)}-merges.txt")
167 | tokenizer = CharBPETokenizer(
168 | vocab=vocab,
169 | merges=merges
170 | )
171 |
172 | return HuggingfaceBPETokenizerRecordable(tokenizer)
173 |
174 | def add_special_tokens(self, special_tokens: List[str]) -> bool:
175 | self.tokenizer.add_special_tokens(special_tokens)
176 | return True
177 |
178 |
179 | def build_huggingface_token_files(
180 | data_dirs: List[Path],
181 | data_params: DatasetParams,
182 | output_path: Union[Path, str],
183 | sample_update: Callable[[str, str, List[str]], str] = default_sample_update,
184 | ) -> Tuple[List[Path], Dict[str, Path]]:
185 | tokenizers_path = Path(output_path)
186 | os.makedirs(tokenizers_path, exist_ok=True)
187 | # build files of strings
188 | lang_ios: Dict[str, Tuple[IO[str], IO[str]]] = {}
189 |
190 | query_files: List[Path] = []
191 | lang_files: Dict[str, Path] = {}
192 | for (idx, file_path) in enumerate(get_data_files_from_directory(data_dirs)):
193 | logger.info(f"Reading {file_path}")
194 | for raw_sample in read_file_samples(file_path):
195 | lang = raw_sample["language"]
196 | if lang not in lang_ios:
197 | query_file = tokenizers_path / f"{lang}_query.txt"
198 | code_file = tokenizers_path / f"{lang}_code.txt"
199 | lang_ios[lang] = (open(query_file, "w"), open(code_file, "w"))
200 | query_files.append(query_file)
201 | lang_files[lang] = code_file
202 | lang_ios[lang][0].write(sample_update("query", lang, raw_sample["docstring_tokens"]))
203 | lang_ios[lang][1].write(sample_update("code", lang, raw_sample["code_tokens"]))
204 |
205 | return query_files, lang_files
206 |
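A minimal usage sketch for the classes above, assuming a CharBPETokenizer freshly trained on the per-language text files produced by build_huggingface_token_files; the file paths and vocab_size are illustrative, not values taken from the configs:

    from pathlib import Path
    from tokenizers import CharBPETokenizer

    # train a character-level BPE tokenizer on hypothetical query/code token files
    bpe = CharBPETokenizer()
    bpe.train(files=["./build_tokenizers/python_query.txt", "./build_tokenizers/python_code.txt"], vocab_size=10000)

    # wrap it so it can be saved and restored like any other Recordable
    rec = HuggingfaceBPETokenizerRecordable(bpe)
    token_ids, token_mask = rec.encode_sentence("def add(a, b): return a + b", max_length=32)
    rec.save(Path("./build_tokenizers/demo"))
    restored = HuggingfaceBPETokenizerRecordable.load(Path("./build_tokenizers/demo"))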
--------------------------------------------------------------------------------
/codenets/codesearchnet/poolers.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | from abc import abstractmethod
3 | from torch import nn
4 | import torch
5 | from codenets.recordable import RecordableTorchModule
6 |
7 |
8 | class EmbeddingPooler(RecordableTorchModule):
9 | """
10 |     Pool a sequence of token embeddings [B x T x D] into a single embedding per sequence [B x D]
11 |
12 | Args:
13 | seq_outputs (torch.tensor): [B x T x D] (B is batch, T is sequence size, D is embedding size)
14 | tokens_mask (torch.tensor): [B x T]
15 |
16 | Returns:
17 | tensor: [B x D]
18 | """
19 |
20 | @abstractmethod
21 | def forward(self, seq_outputs: torch.Tensor, tokens_mask: torch.Tensor) -> torch.Tensor:
22 | pass
23 |
24 |
25 | class MeanPooler(EmbeddingPooler):
26 | def __init__(self, input_size: int = 128, eps: float = 1e-8):
27 | super().__init__()
28 | self.dense = nn.Linear(input_size, 1, bias=False)
29 | self.activation = nn.Sigmoid()
30 | self.eps = eps
31 |
32 |     def forward(self, seq_outputs: torch.Tensor, tokens_mask: torch.Tensor) -> torch.Tensor:
33 |         # masked mean over the token dimension T
34 |         lg = torch.sum(tokens_mask, dim=-1)
35 |         mask = tokens_mask.unsqueeze(dim=-1)
36 |         seq_outputs_masked = seq_outputs * mask
37 |         seq_outputs_sum = torch.sum(seq_outputs_masked, dim=1)
38 |         output = seq_outputs_sum / lg.unsqueeze(dim=-1).clamp(min=self.eps)
39 |         return output
40 |
41 |
42 | class MeanWeightedPooler(EmbeddingPooler):
43 | def __init__(self, input_size: int = 512, eps: float = 1e-8): # default params required for module construction
44 | super().__init__()
45 | self.dense = nn.Linear(input_size, 1, bias=False)
46 | self.activation = nn.Sigmoid()
47 | self.eps = eps
48 |
49 | def forward(self, seq_outputs: torch.Tensor, tokens_mask: torch.Tensor) -> torch.Tensor:
50 | token_weights = self.activation(self.dense(seq_outputs)) # B x T x 1
51 | token_weights = token_weights * tokens_mask.unsqueeze(dim=-1) # B x T x 1
52 | # sum on the T dimension
53 | seq_weighted_sum = torch.sum(seq_outputs * token_weights, dim=1) # B x D
54 | output = seq_weighted_sum / torch.sum(token_weights, dim=1).clamp(min=self.eps)
55 | return output
56 |
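A short sketch of how these poolers consume encoder outputs, using random tensors shaped as documented in EmbeddingPooler ([B x T x D] sequences and a [B x T] mask); the sizes are arbitrary:

    import torch

    pooler = MeanWeightedPooler(input_size=128)
    seq_outputs = torch.randn(4, 30, 128)   # B x T x D, stand-in for encoder output
    tokens_mask = torch.ones(4, 30)         # B x T, 1 = real token, 0 = padding
    pooled = pooler(seq_outputs, tokens_mask)
    print(pooled.shape)                     # torch.Size([4, 128])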
--------------------------------------------------------------------------------
/codenets/codesearchnet/predictions.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Usage:
4 |     predictions.py [options] SAVE_FOLDER TRAIN_DATA_PATH VALID_DATA_PATH TEST_DATA_PATH
5 |     predictions.py [options] [SAVE_FOLDER]
6 |
7 | *_DATA_PATH arguments may be either (1) a directory filled with .jsonl.gz files that we use as data,
8 | or (2) a plain text file containing a list of such directories (used for multi-language training).
9 |
10 | If you supply a plain text file (2), the directory names must be separated by newlines.
11 | For example, if you want to read from multiple directories you might have a plain text file called
12 | data_dirs_train.txt with the below contents:
13 |
14 | > cat ~/src/data_dirs_train.txt
15 | azure://semanticcodesearch/pythondata/Processed_Data/jsonl/train
16 | azure://semanticcodesearch/csharpdata/split/csharpCrawl-train
17 |
18 | Options:
19 | -h --help Show this screen.
20 | --restore DIR specify restoration dir.
21 |     --wandb_run_id RUN_ID            Specify the W&B run to restore, as entity/project/name.
22 | --debug Enable debug routines. [default: False]
23 | """
24 |
25 | import os
26 | import sys
27 | from pathlib import Path
28 | from typing import Tuple
29 | import torch
30 | import numpy as np
31 | from docopt import docopt
32 | from dpu_utils.utils import run_and_debug
33 | from loguru import logger
34 | import pandas as pd
35 | from annoy import AnnoyIndex
36 | from tqdm import tqdm
37 | import shutil
38 | from wandb.apis import InternalApi
39 | import wandb
40 | from codenets.codesearchnet.training_ctx import CodeSearchTrainingContext
41 |
42 |
43 | def compute_code_encodings_from_defs(
44 | language: str, training_ctx: CodeSearchTrainingContext, lang_token: str, batch_length: int = 1024
45 | ) -> Tuple[pd.DataFrame, pd.DataFrame]:
46 | logger.info(f"Computing Encoding for language: {language}")
47 | lang_id = training_ctx.train_data_params.lang_ids[language]
48 | h5_file = (
49 | training_ctx.pickle_path
50 | / f"{language}_{training_ctx.training_full_name}_dedupe_definitions_v2_codes_encoded.h5"
51 | )
52 | root_data_path = Path(training_ctx.conf["dataset.root_dir"])
53 |
54 | def_file = root_data_path / f"data/{language}_dedupe_definitions_v2.pkl"
55 | definitions_df = pd.DataFrame(pd.read_pickle(open(def_file, "rb"), compression=None))
56 | cols_to_remove = list(definitions_df.columns.difference(["function_tokens", "identifier", "url"]))
57 | for col in cols_to_remove:
58 | del definitions_df[col]
59 | # definitions_df.drop(cols_to_remove, inplace=True, axis=1)
60 | logger.debug(f"definitions_df {definitions_df.columns}")
61 |
62 | if not os.path.exists(h5_file):
63 | logger.info(f"Building encodings of code from {def_file}")
64 |
65 | # function_tokens = definitions_df["function_tokens"]
66 | # add language and lang_token () to tokens
67 | definitions_df["function_tokens"] = definitions_df["function_tokens"].apply(
68 | lambda row: [language, lang_token] + row
69 | )
70 | function_tokens_batch = definitions_df["function_tokens"].groupby(
71 | np.arange(len(definitions_df["function_tokens"])) // batch_length
72 | )
73 |
74 | code_embeddings = []
75 | for g, df_batch in tqdm(function_tokens_batch):
76 | # logger.debug(f"df_batch {df_batch.values}")
77 | codes_encoded, codes_masks = training_ctx.tokenize_code_tokens(
78 | df_batch.values, max_length=training_ctx.conf["dataset.common_params.code_max_num_tokens"]
79 | )
80 |
81 | # codes_encoded_t = torch.tensor(codes_encoded, dtype=torch.long).to(training_ctx.device)
82 | # codes_masks_t = torch.tensor(codes_masks, dtype=torch.long).to(training_ctx.device)
83 |
84 | # logger.debug(f"codes_encoded_t {codes_encoded_t}")
85 | # logger.debug(f"codes_masks_t {codes_masks_t}")
86 |
87 | emb_df = pd.DataFrame(
88 | training_ctx.encode_code(
89 | lang_id=lang_id,
90 | code_tokens=codes_encoded,
91 | code_tokens_mask=codes_masks
92 | )
93 | # .cpu()
94 | # .numpy()
95 | )
96 | # logger.debug(f"codes_encoded_t:{codes_encoded_t.shape} codes_masks_t:{codes_masks_t.shape}")
97 | if g < 2:
98 | logger.debug(f"emb_df {emb_df.head()}")
99 | code_embeddings.append(emb_df)
100 |
101 | # free memory or it explodes on 32GB...
102 | del definitions_df["function_tokens"]
103 |
104 | code_embeddings_df = pd.concat(code_embeddings)
105 |
106 | logger.debug(f"code_embeddings_df {code_embeddings_df.head(20)}")
107 |
108 | code_embeddings_df.to_hdf(h5_file, key="code_embeddings_df", mode="w")
109 | return (code_embeddings_df, definitions_df)
110 | else:
111 | code_embeddings_df = pd.read_hdf(h5_file, key="code_embeddings_df")
112 | return (code_embeddings_df, definitions_df)
113 |
114 |
115 | def run(args, tag_in_vcs=False) -> None:
116 | args_wandb_run_id = args["--wandb_run_id"]
117 | if args_wandb_run_id is not None:
118 | entity, project, name = args_wandb_run_id.split("/")
119 | os.environ["WANDB_RUN_ID"] = name
120 | os.environ["WANDB_RESUME"] = "must"
121 |
122 | wandb_api = wandb.Api()
123 | # retrieve saved model from W&B for this run
124 | logger.info("Fetching run from W&B...")
125 | try:
126 | wandb_api.run(args_wandb_run_id)
127 | except wandb.CommError:
128 |             logger.error(f"ERROR: Problem querying W&B for wandb_run_id: {args_wandb_run_id}")
129 | sys.exit(1)
130 |
131 | else:
132 | os.environ["WANDB_MODE"] = "dryrun"
133 |
134 | logger.debug("Building Training Context")
135 | training_ctx: CodeSearchTrainingContext
136 | restore_dir = args["--restore"]
137 |     logger.info(f"Restoring Training Context from directory {restore_dir}")
138 | training_ctx = CodeSearchTrainingContext.build_context_from_dir(restore_dir)
139 |
140 | queries = pd.read_csv(training_ctx.queries_file)
141 | queries = list(map(lambda q: f" {q}", queries["query"].values))
142 | queries_tokens, queries_masks = training_ctx.tokenize_query_sentences(
143 | queries, max_length=training_ctx.conf["dataset.common_params.query_max_num_tokens"]
144 | )
145 | logger.info(f"queries: {queries}")
146 |
147 | training_ctx.eval_mode()
148 | with torch.no_grad():
149 | query_embeddings = (
150 | training_ctx.encode_query(
151 | query_tokens=queries_tokens,
152 | query_tokens_mask=queries_masks,
153 | )
154 | # .cpu()
155 | # .numpy()
156 | )
157 | logger.info(f"query_embeddings: {query_embeddings.shape}")
158 |
159 | topk = 100
160 | language_token = ""
161 | for lang_idx, language in enumerate(
162 | ("python", "go", "javascript", "java", "php", "ruby")
163 | # ("php", "ruby")
164 | ): # in enumerate(("python", "go", "javascript", "java", "php", "ruby")):
165 | predictions = []
166 | # (codes_encoded_df, codes_masks_df, definitions) = get_language_defs(language, training_ctx, language_token)
167 |
168 | code_embeddings, definitions = compute_code_encodings_from_defs(
169 | language, training_ctx, language_token, batch_length=512
170 | )
171 | logger.info(f"Building Annoy Index of length {len(code_embeddings.values[0])}")
172 | indices: AnnoyIndex = AnnoyIndex(len(code_embeddings.values[0]), "angular")
173 | # idx = 0
174 | for index, emb in enumerate(tqdm(code_embeddings.values)):
175 | indices.add_item(index, emb)
176 | indices.build(10)
177 |
178 | for i, (query, query_embedding) in enumerate(tqdm(zip(queries, query_embeddings))):
179 | idxs, distances = indices.get_nns_by_vector(query_embedding, topk, include_distances=True)
180 | for idx2, _ in zip(idxs, distances):
181 | predictions.append(
182 | (query, language, definitions.iloc[idx2]["identifier"], definitions.iloc[idx2]["url"])
183 | )
184 |
185 | logger.info(f"predictions {predictions[0]}")
186 |
187 | df = pd.DataFrame(predictions, columns=["query", "language", "identifier", "url"])
188 |         # clean the CSV fields: strip the leading space added to queries and stray punctuation from identifiers
189 | df["query"] = df["query"].str.replace(" ", "")
190 | df["identifier"] = df["identifier"].str.replace(",", "")
191 | df["identifier"] = df["identifier"].str.replace('"', "")
192 | df["identifier"] = df["identifier"].str.replace(";", "")
193 | df.to_csv(
194 | training_ctx.output_dir / f"model_predictions_{training_ctx.training_tokenizer_type}.csv",
195 | index=False,
196 | header=True if lang_idx == 0 else False,
197 | # mode="w" if lang_idx == 0 else "a",
198 | mode="a",
199 | )
200 | # Free memory
201 | del code_embeddings
202 | del definitions
203 | del predictions
204 |
205 | if args_wandb_run_id is not None:
206 | logger.info("Uploading predictions to W&B")
207 | # upload model predictions CSV file to W&B
208 |
209 | entity, project, name = args_wandb_run_id.split("/")
210 |
211 | # make sure the file is in our cwd, with the correct name
212 | predictions_csv = training_ctx.output_dir / f"model_predictions_{training_ctx.training_tokenizer_type}.csv"
213 | predictions_base_csv = "model_predictions.csv"
214 | shutil.copyfile(predictions_csv, predictions_base_csv)
215 |
216 | # Using internal wandb API. TODO: Update when available as a public API
217 | internal_api = InternalApi()
218 | internal_api.push([predictions_base_csv], run=name, entity=entity, project=project)
219 |
220 |
221 | if __name__ == "__main__":
222 | args = docopt(__doc__)
223 | run_and_debug(lambda: run(args), args["--debug"])
224 |
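The nearest-neighbour step in run() can be isolated as the sketch below, with synthetic embeddings standing in for the encoder output (the dimension and counts are arbitrary); it mirrors the AnnoyIndex calls used above:

    import numpy as np
    from annoy import AnnoyIndex

    dim = 128
    code_embeddings = np.random.rand(1000, dim).astype(np.float32)    # stand-in for code embeddings

    index = AnnoyIndex(dim, "angular")
    for i, emb in enumerate(code_embeddings):
        index.add_item(i, emb)
    index.build(10)  # 10 trees, as in run()

    query_embedding = np.random.rand(dim).astype(np.float32)          # stand-in for a query embedding
    idxs, distances = index.get_nns_by_vector(query_embedding, 100, include_distances=True)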
--------------------------------------------------------------------------------
/codenets/codesearchnet/query_1_code_1/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/codesearchnet/query_1_code_1/__init__.py
--------------------------------------------------------------------------------
/codenets/codesearchnet/query_1_code_1/model.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 | from typing import MutableMapping, Optional, Union, Type
5 |
6 | import numpy as np
7 | from transformers import BertConfig, BertModel
8 |
9 | from codenets.codesearchnet.poolers import MeanWeightedPooler
10 | from codenets.codesearchnet.huggingface.models import PreTrainedModelRecordable
11 | from codenets.recordable import (
12 | Recordable,
13 | RecordableTorchModule,
14 | runtime_load_recordable_mapping,
15 | save_recordable_mapping,
16 | )
17 | from codenets.utils import full_classname, instance_full_classname
18 | from pyhocon import ConfigTree
19 |
20 |
21 | class Query1Code1(RecordableTorchModule):
22 | """
23 | A generic Pytorch Model with:
24 | - one single-branch query encoder
25 | - one single-branch code encoder
26 | - one optional pooler to pool output embeddings from any branch
27 | """
28 |
29 | def __init__(
30 | self,
31 | query_encoder: RecordableTorchModule,
32 | code_encoder: RecordableTorchModule,
33 | pooler: Optional[RecordableTorchModule] = None,
34 | ):
35 | super(Query1Code1, self).__init__()
36 | self.code_encoder = code_encoder
37 | self.query_encoder = query_encoder
38 | self.pooler = pooler
39 |
40 | def save(self, output_dir: Union[Path, str]) -> bool:
41 | d = Path(output_dir) / instance_full_classname(self)
42 | records: MutableMapping[str, Recordable] = {
43 | "query_encoder": self.query_encoder,
44 | "code_encoder": self.code_encoder,
45 | }
46 | if self.pooler is not None:
47 | records["pooler"] = self.pooler
48 | return save_recordable_mapping(output_dir=d, records=records)
49 |
50 | @classmethod
51 | def load(cls, restore_dir: Union[Path, str]) -> Query1Code1:
52 | d = Path(restore_dir) / full_classname(cls)
53 | records = runtime_load_recordable_mapping(d)
54 | return cls(**records) # type:ignore[arg-type]
55 |
56 | def forward(
57 | self,
58 | languages: np.ndarray,
59 | query_tokens: np.ndarray,
60 | query_tokens_mask: np.ndarray,
61 | code_tokens: np.ndarray,
62 | code_tokens_mask: np.ndarray,
63 | ):
64 | # lang_id = str(languages[0].item())
65 | query_seq_outputs = self.query_encoder(query_tokens, query_tokens_mask) # [B x S x H]
66 | code_seq_outputs = self.code_encoder(code_tokens, code_tokens_mask) # [B x S x H]
67 |
68 | if self.pooler is not None:
69 | return (
70 | self.pooler(query_seq_outputs[0], query_tokens_mask),
71 | self.pooler(code_seq_outputs[0], code_tokens_mask),
72 | )
73 | else:
74 | # use already pooled data (need to be pretrained as it uses 1st (CLS) token logit)
75 | return query_seq_outputs[1], code_seq_outputs[1]
76 |
77 | def encode_query(self, query_tokens: np.ndarray, query_tokens_mask: np.ndarray) -> np.ndarray:
78 | query_seq_outputs = self.query_encoder(query_tokens, query_tokens_mask)
79 |
80 | if self.pooler is not None:
81 | return self.pooler(query_seq_outputs[0], query_tokens_mask)
82 | else:
83 | return query_seq_outputs[1]
84 |
85 | def encode_code(self, lang_id: int, code_tokens: np.ndarray, code_tokens_mask: np.ndarray) -> np.ndarray:
86 | code_seq_outputs = self.code_encoder(code_tokens, code_tokens_mask)
87 | if self.pooler is not None:
88 | return self.pooler(code_seq_outputs[0], code_tokens_mask)
89 | else:
90 | return code_seq_outputs[1]
91 |
92 | def tokenize_code(self, lang_id: int, code_tokens: np.ndarray, code_tokens_mask: np.ndarray) -> np.ndarray:
93 | code_seq_outputs = self.code_encoder(code_tokens, code_tokens_mask)
94 | if self.pooler is not None:
95 | return self.pooler(code_seq_outputs[0], code_tokens_mask)
96 | else:
97 | return code_seq_outputs[1]
98 |
99 | @classmethod
100 | def from_hocon(cls: Type[Query1Code1], config: ConfigTree) -> Query1Code1:
101 |         """Build a Query1Code1 model from a config tree"""
102 |
103 | query_bert_config = BertConfig(**config["training.model.query_encoder"])
104 | query_encoder = PreTrainedModelRecordable(BertModel(query_bert_config))
105 | code_bert_config = BertConfig(**config["training.model.code_encoder"])
106 | code_encoder = PreTrainedModelRecordable(BertModel(code_bert_config))
107 |
108 | model = Query1Code1(
109 | query_encoder=query_encoder,
110 | code_encoder=code_encoder,
111 | pooler=MeanWeightedPooler(input_size=query_bert_config.hidden_size),
112 | )
113 |
114 | return model
115 |
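A sketch of building the model from a HOCON fragment shaped like the conf/*.conf files; the encoder sizes below are illustrative only:

    from pyhocon import ConfigFactory

    conf = ConfigFactory.parse_string(
        """
        training.model {
          query_encoder { hidden_size = 128, vocab_size = 10000, intermediate_size = 512,
                          num_hidden_layers = 3, num_attention_heads = 8 }
          code_encoder  { hidden_size = 128, vocab_size = 10000, intermediate_size = 1024,
                          num_hidden_layers = 6, num_attention_heads = 8 }
        }
        """
    )
    model = Query1Code1.from_hocon(conf)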
--------------------------------------------------------------------------------
/codenets/codesearchnet/query_1_code_n/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/codesearchnet/query_1_code_n/__init__.py
--------------------------------------------------------------------------------
/codenets/codesearchnet/query_code_siamese/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/codesearchnet/query_code_siamese/__init__.py
--------------------------------------------------------------------------------
/codenets/codesearchnet/query_code_siamese/model.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 | from typing import MutableMapping, Optional, Union, Type
5 |
6 | import numpy as np
7 | from loguru import logger
8 | from transformers import BertConfig, BertModel, AlbertConfig, AlbertModel
9 |
10 | from codenets.codesearchnet.poolers import MeanWeightedPooler
11 | from codenets.codesearchnet.huggingface.models import PreTrainedModelRecordable
12 | from codenets.recordable import (
13 | Recordable,
14 | RecordableTorchModule,
15 | runtime_load_recordable_mapping,
16 | save_recordable_mapping,
17 | )
18 | from codenets.utils import full_classname, instance_full_classname
19 | from pyhocon import ConfigTree
20 |
21 |
22 | class QueryCodeSiamese(RecordableTorchModule):
23 |     """
24 |     A generic Pytorch Model with:
25 |     - one single encoder shared by the query and code branches (siamese)
26 |     - one optional pooler to pool output embeddings from the shared encoder
27 |     """
28 |
29 | def __init__(self, encoder: RecordableTorchModule, pooler: Optional[RecordableTorchModule] = None):
30 | super(QueryCodeSiamese, self).__init__()
31 | self.encoder = encoder
32 | self.pooler = pooler
33 |
34 | def save(self, output_dir: Union[Path, str]) -> bool:
35 | d = Path(output_dir) / instance_full_classname(self)
36 | records: MutableMapping[str, Recordable] = {"encoder": self.encoder}
37 | if self.pooler is not None:
38 | records["pooler"] = self.pooler
39 | return save_recordable_mapping(output_dir=d, records=records)
40 |
41 | @classmethod
42 | def load(cls, restore_dir: Union[Path, str]) -> QueryCodeSiamese:
43 | d = Path(restore_dir) / full_classname(cls)
44 | records = runtime_load_recordable_mapping(d)
45 | return cls(**records) # type: ignore[arg-type]
46 |
47 | def forward(
48 | self,
49 | languages: np.ndarray,
50 | query_tokens: np.ndarray,
51 | query_tokens_mask: np.ndarray,
52 | code_tokens: np.ndarray,
53 | code_tokens_mask: np.ndarray,
54 | lang_weights: np.ndarray,
55 | ):
56 | # lang_id = str(languages[0].item())
57 | query_seq_outputs = self.encoder(query_tokens, query_tokens_mask) # [B x S x H]
58 | code_seq_outputs = self.encoder(code_tokens, code_tokens_mask) # [B x S x H]
59 | if self.pooler is not None:
60 | return (
61 | self.pooler(query_seq_outputs[0], query_tokens_mask),
62 | self.pooler(code_seq_outputs[0], code_tokens_mask),
63 | )
64 | else:
65 | # use already pooled data (need to be pretrained as it uses 1st (CLS) token logit)
66 | return query_seq_outputs[1], code_seq_outputs[1]
67 |
68 | def encode_query(self, query_tokens: np.ndarray, query_tokens_mask: np.ndarray) -> np.ndarray:
69 | query_seq_outputs = self.encoder(query_tokens, query_tokens_mask)
70 |
71 | if self.pooler is not None:
72 | return self.pooler(query_seq_outputs[0], query_tokens_mask)
73 | else:
74 | return query_seq_outputs[1]
75 |
76 | def encode_code(self, lang_id: int, code_tokens: np.ndarray, code_tokens_mask: np.ndarray) -> np.ndarray:
77 | code_seq_outputs = self.encoder(code_tokens, code_tokens_mask)
78 | if self.pooler is not None:
79 | return self.pooler(code_seq_outputs[0], code_tokens_mask)
80 | else:
81 | return code_seq_outputs[1]
82 |
83 | def tokenize_code(self, lang_id: int, code_tokens: np.ndarray, code_tokens_mask: np.ndarray) -> np.ndarray:
84 | code_seq_outputs = self.encoder(code_tokens, code_tokens_mask)
85 | if self.pooler is not None:
86 | return self.pooler(code_seq_outputs[0], code_tokens_mask)
87 | else:
88 | return code_seq_outputs[1]
89 |
90 | @classmethod
91 | def from_hocon(cls: Type[QueryCodeSiamese], config: ConfigTree) -> QueryCodeSiamese:
92 |         """Build a QueryCodeSiamese model from a config tree"""
93 |
94 | if "training.model.encoder.type" in config:
95 | if config["training.model.encoder.type"] == "albert":
96 | logger.info("Creating QueryCodeSiamese with Albert encoder")
97 | albert_config = AlbertConfig(**config["training.model.encoder"])
98 | encoder = PreTrainedModelRecordable(AlbertModel(albert_config))
99 | elif config["training.model.encoder.type"] == "bert":
100 | logger.info("Creating QueryCodeSiamese with Bert encoder")
101 | bert_config = BertConfig(**config["training.model.encoder"])
102 | encoder = PreTrainedModelRecordable(BertModel(bert_config))
103 | else:
104 | # default is BERT now
105 | logger.info("Creating QueryCodeSiamese with Bert encoder")
106 | bert_config = BertConfig(**config["training.model.encoder"])
107 | encoder = PreTrainedModelRecordable(BertModel(bert_config))
108 |
109 | model = QueryCodeSiamese(
110 | encoder=encoder, pooler=MeanWeightedPooler(input_size=config["training.model.encoder.hidden_size"])
111 | )
112 |
113 | return model
114 |
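The siamese variant picks its encoder type from the config; a sketch with a hypothetical HOCON fragment selecting the BERT branch (sizes illustrative):

    from pyhocon import ConfigFactory

    conf = ConfigFactory.parse_string(
        """
        training.model.encoder {
          type = "bert"
          hidden_size = 128
          vocab_size = 10000
          intermediate_size = 512
          num_hidden_layers = 3
          num_attention_heads = 8
        }
        """
    )
    model = QueryCodeSiamese.from_hocon(conf)
    # the same encoder weights serve both branches:
    # query_emb = model.encode_query(query_tokens, query_tokens_mask)
    # code_emb = model.encode_code(lang_id, code_tokens, code_tokens_mask)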
--------------------------------------------------------------------------------
/codenets/codesearchnet/sbert_build.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Usage:
4 |     sbert_build.py [options] SAVE_FOLDER TRAIN_DATA_PATH VALID_DATA_PATH TEST_DATA_PATH
5 |     sbert_build.py [options] [SAVE_FOLDER]
6 |
7 | *_DATA_PATH arguments may be either (1) a directory filled with .jsonl.gz files that we use as data,
8 | or (2) a plain text file containing a list of such directories (used for multi-language training).
9 |
10 | If you supply a plain text file (2), the directory names must be separated by newlines.
11 | For example, if you want to read from multiple directories you might have a plain text file called
12 | data_dirs_train.txt with the below contents:
13 |
14 | > cat ~/src/data_dirs_train.txt
15 | azure://semanticcodesearch/pythondata/Processed_Data/jsonl/train
16 | azure://semanticcodesearch/csharpdata/split/csharpCrawl-train
17 |
18 | Options:
19 | -h --help Show this screen.
20 | --config FILE Specify HOCON config file.
21 | --debug Enable debug routines. [default: False]
22 | """
23 |
24 | from typing import Dict, List
25 | from sentence_transformers import SentenceTransformer
26 | from dpu_utils.utils import run_and_debug
27 | from docopt import docopt
28 | from loguru import logger
29 | import itertools
30 | import os
31 | import pickle
32 | from torch.utils.data import DataLoader
33 | from pathlib import Path
34 | from pyhocon import ConfigFactory
35 | from torch import nn
36 | from torch import Tensor
37 | import torch
38 | import numpy as np
39 | import pandas as pd
40 |
41 | from tree_sitter import Language, Parser
42 | from codenets.codesearchnet.copied_code.utils import read_file_samples
43 | from sklearn.metrics.pairwise import pairwise_distances
44 | from codenets.codesearchnet.dataset_utils import BalancedBatchSchedulerSampler, DatasetType
45 | from codenets.codesearchnet.data import DatasetParams
46 | from codenets.codesearchnet.training_ctx import CodeSearchTrainingContext
47 | from codenets.codesearchnet.query_code_siamese.dataset import load_data_from_dirs
48 |
49 | """Evaluating SBert."""
50 |
51 |
52 | def run(args, tag_in_vcs=False) -> None:
53 | # os.environ["WANDB_MODE"] = "dryrun"
54 |
55 | logger.debug("Building Training Context")
56 | conf_file = args["--config"]
57 | conf = ConfigFactory.parse_file(conf_file)
58 |
59 | logger.info(f"Restoring Training Context from config {conf_file}")
60 | training_ctx = CodeSearchTrainingContext.build_context_from_hocon(conf)
61 |
62 | # val_dataset = training_ctx.build_lang_dataset(DatasetType.VAL)
63 | # if val_dataset.collate_fn is not None:
64 | # val_dataloader = DataLoader(
65 | # dataset=val_dataset,
66 | # batch_size=conf["training.batch_size.val"],
67 | # sampler=BalancedBatchSchedulerSampler(dataset=val_dataset, batch_size=conf["training.batch_size.val"]),
68 | # collate_fn=val_dataset.collate_fn,
69 | # )
70 | # else:
71 | # val_dataloader = DataLoader(
72 | # dataset=val_dataset,
73 | # batch_size=conf["training.batch_size.val"],
74 | # sampler=BalancedBatchSchedulerSampler(dataset=val_dataset, batch_size=conf["training.batch_size.val"]),
75 | # )
76 |
77 | val_dataloader = training_ctx.build_lang_dataloader(DatasetType.VAL)
78 | logger.info(f"val_dataloader [{len(val_dataloader)} samples]")
79 |
80 | # train_dataloader = training_ctx.build_lang_dataloader(DatasetType.TRAIN)
81 | # logger.info(f"train_dataloader [{len(train_dataloader)} samples]")
82 |
83 | # df = pd.read_parquet("./pickles/train_qc_30k_embeddings.parquet")
84 | # print(df.info())
85 |
86 | # z = df.iloc[0][0]
87 | # print("z", z.shape)
88 | from annoy import AnnoyIndex
89 |
90 | t = AnnoyIndex(768, "angular")
91 | # for index, row in df.iterrows():
92 | # print(row.shape)
93 | # t.add_item(index, row[0])
94 | # t.build(10) # 10 trees
95 | # t.save("./pickles/train_qc_30k_embeddings.ann")
96 |
97 | t.load("./pickles/val_qc_30k_embeddings.ann")
98 |
99 | # for i in range(0, 100):
100 | # print(i, 99, 1.0 - t.get_distance(i, 99))
101 |
102 | for batch in val_dataloader: # itertools.islice(val_dataloader, 0, 1000):
103 | indices, languages, similarity, query_tokens, query_tokens_mask, code_tokens, code_tokens_mask, code_lang_weights = (
104 | batch
105 | )
106 | toks = [toks.cpu().numpy()[: len(mask[mask != 0])] for (toks, mask) in zip(query_tokens, query_tokens_mask)]
107 | toks = training_ctx.decode_query_tokens(toks)
108 | qs = [str((t, score)) for (t, score) in list(zip(toks, similarity))]
109 | for i, scores in enumerate(similarity):
110 | for j, s in enumerate(scores):
111 | if s > 0.5 and i != j:
112 | print(s, toks[i], toks[j])
113 |
114 | # print("query", "\n".join(qs))
115 |
116 | # # print("query_tokens", query_tokens)
117 | # # 5 for removing " "
118 | # toks = [toks.cpu().numpy()[: len(mask[mask != 0])] for (toks, mask) in zip(query_tokens, query_tokens_mask)]
119 | # toks = training_ctx.decode_query_tokens(toks)
120 | # # print("toks", toks)
121 | # qs = [str((t, score)) for (t, score) in list(zip(toks, similarity))]
122 | # print("query", "\n".join(qs))
123 | # print("-----------")
124 |
125 | # data_file = (
126 | # "/home/mandubian/workspaces/tools/CodeSearchNet/resources/data/python/final/jsonl/valid/python_valid_0.jsonl.gz"
127 | # )
128 | # filename = os.path.basename(data_file)
129 | # file_language = filename.split("_")[0]
130 |
131 | # samples = list(read_file_samples(data_file))
132 |
133 | # sample0 = samples[0]
134 | # sample1 = samples[1]
135 | # logger.info(f"keys {sample0.keys()}")
136 | # logger.info(f"sample docstring {sample0['docstring_tokens']}")
137 | # query0 = " ".join(samples[0]["docstring_tokens"])
138 | # logger.info(f"query0 {query0}")
139 | # query_embeddings0 = model.encode([query0])
140 | # # logger.info(f"query_embeddings0 {query_embeddings0}")
141 | # query1 = " ".join(sample1["docstring_tokens"])
142 | # query_embeddings1 = model.encode([query1])
143 |
144 | # distances = pairwise_distances(query_embeddings0, query_embeddings1, metric="cosine")
145 | # logger.info(f"distances {distances}")
146 |
147 | # Language.build_library(
148 | # # Store the library in the `build` directory
149 | # "build/my-languages.so",
150 | # # Include one or more languages
151 | # [
152 | # "vendor/tree-sitter-go",
153 | # "vendor/tree-sitter-java",
154 | # "vendor/tree-sitter-javascript",
155 | # "vendor/tree-sitter-python",
156 | # "vendor/tree-sitter-php",
157 | # "vendor/tree-sitter-ruby",
158 | # ],
159 | # )
160 | # PY_LANGUAGE = Language("build/my-languages.so", "python")
161 | # parser = Parser()
162 | # parser.set_language(PY_LANGUAGE)
163 | # tree = parser.parse(bytes(samples[0]["code"], "utf8"))
164 |
165 | # logger.info(f"tree {tree}")
166 |
167 |
168 | if __name__ == "__main__":
169 | args = docopt(__doc__)
170 | run_and_debug(lambda: run(args), args["--debug"])
171 |
--------------------------------------------------------------------------------
/codenets/codesearchnet/tokenizer_build.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Usage:
4 |     tokenizer_build.py [options]
6 |
7 | Options:
8 | -h --help Show this screen.
9 | --config FILE Specify HOCON config file. [default: ./conf/default.conf]
10 | --debug Enable debug routines. [default: False]
11 | """
12 |
13 |
14 | from docopt import docopt
15 | from loguru import logger
16 | import sys
17 | import torch
18 | from dpu_utils.utils import run_and_debug
19 | from pyhocon import ConfigFactory, ConfigTree
20 |
21 | from codenets.codesearchnet.training_ctx import CodeSearchTrainingContext
22 | from codenets.codesearchnet.tokenizer_recs import build_most_common_tokens
23 |
24 | print("Torch version", torch.__version__)
25 |
26 | logger.remove()
27 | logger.add(sys.stderr, level="DEBUG", colorize=True, backtrace=False)
28 |
29 |
30 | def run(args, tag_in_vcs=False) -> None:
31 | conf_file = args["--config"]
32 | logger.info(f"config file {conf_file}")
33 |
34 | conf: ConfigTree = ConfigFactory.parse_file(conf_file)
35 | logger.info(f"config {conf}")
36 |
37 | # logger.info(f"Build Training Context from config {conf_file}")
38 | # training_ctx = CodeSearchTrainingContext.build_context_from_hocon(conf)
39 |
40 | # training_ctx.build_tokenizers(from_dataset_type=DatasetType.TRAIN)
41 |
42 | logger.info(f"Reload Training Context from config {conf_file} with built tokenizers")
43 | training_ctx = CodeSearchTrainingContext.build_context_from_hocon(conf)
44 |
45 | txt = "python def toto():"
46 | logger.info(f"encoded {training_ctx.tokenize_code_sentences([txt])}")
47 | txt = "go function getCounts() { return 0 }"
48 | logger.info(f"encoded {training_ctx.tokenize_code_sentences([txt])}")
49 |
50 | most_commons = build_most_common_tokens(
51 | training_ctx.train_dirs, training_ctx.train_data_params, training_ctx.tokenizers_build_path,
52 | parallelize=False
53 | )
54 | logger.info(f"most_commons {most_commons}")
55 |
56 |
57 | if __name__ == "__main__":
58 | args = docopt(__doc__)
59 | run_and_debug(lambda: run(args), args["--debug"])
60 |
--------------------------------------------------------------------------------
/codenets/codesearchnet/tokenizer_recs.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 |
3 | from typing import Iterable, List, Optional, Tuple, Dict, cast
4 | import numpy as np
5 | import os
6 | from loguru import logger
7 | from pathlib import Path
8 | import pickle
9 |
10 | import time
11 |
12 | from pyhocon import ConfigTree
13 | from codenets.recordable import Recordable, RecordableMapping, DictRecordable
14 | from codenets.codesearchnet.data import DatasetParams
15 | from codenets.codesearchnet.copied_code.metadata import Metadata, append_metadata, build_tokenizer_metadata
16 |
17 |
18 | class TokenizerRecordable(Recordable):
19 | @abstractmethod
20 | def tokenize(self, text: str, **kwargs) -> List[str]:
21 | pass
22 |
23 | @abstractmethod
24 | def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
25 | pass
26 |
27 | @abstractmethod
28 | def unk_token(self) -> str:
29 | pass
30 |
31 | # @abstractmethod
32 | # def pad_token(self) -> str:
33 | # pass
34 |
35 | @abstractmethod
36 | def encode_sentence(self, sentence: str, max_length: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]:
37 | pass
38 |
39 | @abstractmethod
40 | def encode_sentences(
41 | self, sentences: List[str], max_length: Optional[int] = None
42 | ) -> Tuple[np.ndarray, np.ndarray]:
43 | pass
44 |
45 | @abstractmethod
46 | def encode_tokens(
47 | self, tokens: Iterable[List[str]], max_length: Optional[int] = None
48 | ) -> Tuple[np.ndarray, np.ndarray]:
49 | pass
50 |
51 | @abstractmethod
52 | def decode_sequence(self, tokens_sequence: List[int]) -> str:
53 | pass
54 |
55 | @abstractmethod
56 | def decode_sequences(self, tokens_sequences: Iterable[List[int]]) -> List[str]:
57 | pass
58 |
59 | @abstractmethod
60 | def add_special_tokens(self, special_tokens: List[str]) -> bool:
61 | pass
62 |
63 |
64 | def build_most_common_tokens(
65 | data_dirs: List[Path],
66 | data_params: DatasetParams,
67 | build_path: Path,
68 | max_files_per_dir: Optional[int] = None,
69 | parallelize: bool = True,
70 | ) -> Dict[str, List[Tuple[str, int]]]:
71 |
72 | start = time.time()
73 |
74 | logger.info(f"Build metadata for {data_dirs}")
75 |
76 | _, code_language_metadata_lists = build_tokenizer_metadata(
77 | data_dirs=data_dirs,
78 | max_files_per_dir=max_files_per_dir,
79 | parallelize=parallelize,
80 | use_subtokens=data_params.use_subtokens,
81 | mark_subtoken_end=data_params.mark_subtoken_end,
82 | )
83 |
84 | logger.info("Merging metadata")
85 |
86 | # merge metadata if necessary
87 | per_code_language_metadata: Dict[str, Metadata] = {}
88 | for (language, raw_per_language_metadata) in code_language_metadata_lists.items():
89 | logger.info(f"Build vocabulary for {language}")
90 | per_code_language_metadata[language] = append_metadata(
91 | "code",
92 | vocab_size=data_params.vocab_size,
93 | vocab_count_threshold=data_params.vocab_count_threshold,
94 | pct_bpe=data_params.pct_bpe,
95 | raw_metadata_list=raw_per_language_metadata,
96 | )
97 | common_tokens: Dict[str, List[Tuple[str, int]]] = {}
98 | for (language, md) in per_code_language_metadata.items():
99 | common_tokens[language] = md.common_tokens
100 |
101 | end = time.time()
102 |
103 | time_p = end - start
104 | logger.info(f"Most Common Tokens: {time_p} sec")
105 |
106 | pickle.dump(common_tokens, open("./checkpoints/tmp_common_tokens.p", "wb"))
107 |
108 | common_tokens_dict = DictRecordable(common_tokens)
109 | os.makedirs(build_path, exist_ok=True)
110 | records = RecordableMapping({"common_tokens": common_tokens_dict})
111 | records.save(build_path)
112 |
113 | return common_tokens_dict
114 |
115 |
116 | def load_query_code_tokenizers_from_hocon(conf: ConfigTree) -> Optional[Tuple[TokenizerRecordable, RecordableMapping]]:
117 | build_path = Path(conf["tokenizers.build_path"])
118 |
119 | if not os.path.exists(build_path):
120 |         logger.error(f"Could not find {build_path} where tokenizers should have been built and stored")
121 | return None
122 |
123 | records = RecordableMapping.load(build_path)
124 | if "query_tokenizer" in records and "code_tokenizers" in records:
125 | query_tokenizer = cast(TokenizerRecordable, records["query_tokenizer"])
126 | code_tokenizers = cast(RecordableMapping, records["code_tokenizers"])
127 |
128 | return query_tokenizer, code_tokenizers
129 | else:
130 |         logger.error(f"Couldn't find query_tokenizer/code_tokenizers recordables in path {build_path}")
131 | return None
132 |
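A sketch of restoring previously built tokenizers through load_query_code_tokenizers_from_hocon; the build path is a hypothetical example, and the function returns None when nothing was built there:

    from pyhocon import ConfigFactory

    conf = ConfigFactory.parse_string('tokenizers { build_path = "./build_tokenizers/with_lang" }')
    loaded = load_query_code_tokenizers_from_hocon(conf)
    if loaded is not None:
        query_tokenizer, code_tokenizers = loaded
        ids, mask = query_tokenizer.encode_sentence("parse a json file", max_length=30)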
--------------------------------------------------------------------------------
/codenets/main.py:
--------------------------------------------------------------------------------
1 | """Dummy Main of the project."""
2 |
3 |
4 | def main():
5 | print("hello")
6 |
7 |
8 | if __name__ == "__main__":
9 | main()
10 |
--------------------------------------------------------------------------------
/codenets/save.py:
--------------------------------------------------------------------------------
1 | """Utils to save Recordables in rotating mode"""
2 |
3 | import os
4 | from pathlib import Path
5 | import shutil
6 | from typing import Union, Type, TypeVar, Optional
7 | from codenets.recordable import Recordable
8 |
9 |
10 | def rotating_save_records(path: Union[Path, str], prefix: str, rec: Recordable, nb: int = 5) -> bool:
11 | root_path = Path(path) / prefix
12 | if not os.path.isdir(root_path):
13 | os.makedirs(root_path)
14 |
15 | paths = []
16 | first_empty_path = None
17 | saved = True
18 | for i in range(nb):
19 | path_i = root_path / f"{prefix}_{i}"
20 | if not os.path.exists(path_i) and first_empty_path is None:
21 | first_empty_path = path_i
22 | os.makedirs(first_empty_path)
23 | paths.append(path_i)
24 |
25 | if first_empty_path is not None:
26 | saved = saved and rec.save(first_empty_path)
27 | else:
28 | first = paths[0]
29 |
30 | shutil.rmtree(first)
31 | for pth in paths[1:]:
32 | os.rename(pth, first)
33 | first = pth
34 | saved = saved and rec.save(paths[-1])
35 |
36 | return saved
37 |
38 |
39 | def save_records_direct(path: Union[Path, str], rec: Recordable) -> bool:
40 | if not os.path.isdir(path):
41 | os.makedirs(path)
42 |
43 | return rec.save(path)
44 |
45 |
46 | def save_records_best(path: Union[Path, str], rec: Recordable, suffix: Optional[str] = None) -> bool:
47 | prefix = os.path.basename(path)
48 | if suffix is not None:
49 | best_path = Path(path) / f"{prefix}_best_{suffix}"
50 | else:
51 | best_path = Path(path) / f"{prefix}_best"
52 | if not os.path.isdir(best_path):
53 | os.makedirs(best_path)
54 |
55 | return rec.save(best_path)
56 |
57 |
58 | def save_records_last(output_dir: Union[Path, str], rec: Recordable) -> bool:
59 | return rotating_save_records(os.path.dirname(output_dir), os.path.basename(output_dir), rec)
60 |
61 |
62 | Recordable_T = TypeVar("Recordable_T", bound="Recordable")
63 |
64 |
65 | def rotating_recover_records(
66 | cls: Type[Recordable_T], path: Union[Path, str], prefix: str, nb: int = 5
67 | ) -> Optional[Recordable_T]:
68 | last_path = None
69 | for i in range(nb):
70 | path_i = Path(path) / prefix / f"{prefix}_{i}"
71 | if os.path.exists(path_i):
72 | last_path = path_i
73 |
74 | if last_path is not None:
75 | return cls.load(last_path)
76 | else:
77 | return None
78 |
79 |
80 | def recover_records_best(
81 | cls: Type[Recordable_T], recover_dir: Union[Path, str], nb: int = 5, *args, **kwargs
82 | ) -> Optional[Recordable_T]:
83 | prefix = os.path.basename(recover_dir)
84 | best_path = Path(recover_dir) / f"{prefix}_best"
85 | if best_path.exists():
86 | return cls.load(best_path)
87 | else:
88 | return None
89 |
90 |
91 | def recover_records_direct(
92 | cls: Type[Recordable_T], recover_dir: Union[Path, str], *args, **kwargs
93 | ) -> Optional[Recordable_T]:
94 | p = Path(recover_dir)
95 | if p.exists():
96 | return cls.load(p)
97 | else:
98 | return None
99 |
100 |
101 | def recover_records_last(cls: Type[Recordable_T], recover_dir: Union[Path, str]) -> Optional[Recordable_T]:
102 | return rotating_recover_records(cls, os.path.dirname(recover_dir), os.path.basename(recover_dir))
103 |
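A sketch of the rotating save / recover cycle with a DictRecordable from codenets.recordable, assuming it behaves like a plain dict once restored; the checkpoint path and prefix are illustrative:

    from codenets.recordable import DictRecordable

    state = DictRecordable({"epoch": 3, "best_mrr": 0.41})
    rotating_save_records("./checkpoints", "my_run", state, nb=5)   # fills my_run_0 .. my_run_4, then rotates

    restored = rotating_recover_records(DictRecordable, "./checkpoints", "my_run", nb=5)
    if restored is not None:
        print(restored["epoch"])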
--------------------------------------------------------------------------------
/codenets/tensorboard_utils.py:
--------------------------------------------------------------------------------
1 | # Some utilities to manage TensorBoard writers and read back event files
2 |
3 | from tensorboardX import SummaryWriter
4 | from pathlib import Path
5 | import datetime
6 | from loguru import logger
7 | from typing import Dict
8 |
9 | from tensorboard.backend.event_processing import event_accumulator
10 |
11 |
12 | def tensorboard_event_accumulator(
13 | file: str,
14 | loaded_scalars: int = 0, # load all scalars by default
15 | loaded_images: int = 4, # load 4 images by default
16 |     loaded_compressed_histograms: int = 500,  # load 500 compressed histograms by default
17 | loaded_histograms: int = 1, # load one histogram by default
18 | loaded_audio: int = 4, # loads 4 audio by default
19 | ):
20 | """Read a Tensorboard event_accumulator from a file"""
21 | ea = event_accumulator.EventAccumulator(
22 | file,
23 | size_guidance={ # see below regarding this argument
24 | event_accumulator.COMPRESSED_HISTOGRAMS: loaded_compressed_histograms,
25 | event_accumulator.IMAGES: loaded_images,
26 | event_accumulator.AUDIO: loaded_audio,
27 | event_accumulator.SCALARS: loaded_scalars,
28 | event_accumulator.HISTOGRAMS: loaded_histograms,
29 | },
30 | )
31 | ea.Reload()
32 | return ea
33 |
34 |
35 | class Tensorboard:
36 | """
37 | Tensorboard manager
38 |
39 | This manager is associated to a:
40 |
41 | - experiment
42 | - a unique ID for the current run (one experiment can be run many times)
43 | - groups of metrics (like "train" or "val")
44 | - sub-groups of metrics (like train/bash or val/epoch)
45 | """
46 |
47 | def __init__(self, experiment_id, output_dir="./runs", unique_id=None, flush_secs=10):
48 | self.experiment_id = experiment_id
49 | self.output_dir = Path(output_dir)
50 | if unique_id is None:
51 | unique_id = datetime.datetime.now().isoformat(timespec="seconds")
52 | self.path = self.output_dir / f"{experiment_id}_{unique_id}"
53 | logger.debug(f"Writing TensorBoard events locally to {self.path}")
54 | self.writers: Dict[str, SummaryWriter] = {}
55 | self.flush_secs = flush_secs
56 |
57 | def _get_writer(self, group: str = "") -> SummaryWriter:
58 | if group not in self.writers:
59 | logger.debug(f"Adding group {group} to writers ({self.writers.keys()})")
60 | self.writers[group] = SummaryWriter(f"{str(self.path)}_{group}", flush_secs=self.flush_secs)
61 | return self.writers[group]
62 |
63 | def add_scalars(self, metrics: dict, global_step: int, group=None, sub_group="") -> None:
64 | for key, val in metrics.items():
65 | cur_name = "/".join([sub_group, key])
66 | self._get_writer(group).add_scalar(cur_name, val, global_step)
67 |
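A sketch of the Tensorboard manager above: one SummaryWriter per metric group, with scalars filed under sub-groups; the experiment id and metric values are made up:

    tb = Tensorboard(experiment_id="qc_siamese", output_dir="./runs")
    tb.add_scalars({"loss": 0.42, "mrr": 0.31}, global_step=100, group="train", sub_group="batch")
    tb.add_scalars({"loss": 0.55, "mrr": 0.28}, global_step=1, group="val", sub_group="epoch")
    # events can later be read back with tensorboard_event_accumulator(<path to an events file>)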
--------------------------------------------------------------------------------
/codenets/utils.py:
--------------------------------------------------------------------------------
1 | from typing import Iterable, List, Optional
2 | import os
3 | import re
4 | from dpu_utils.codeutils import split_identifier_into_parts
5 |
6 | # from dpu_utils.utils import Path
7 | from pathlib import Path
8 | import numpy as np
9 | import glob
10 | import base64
11 | from pickle import dumps, loads
12 |
13 | IDENTIFIER_TOKEN_REGEX = re.compile("[_a-zA-Z][_a-zA-Z0-9]*")
14 |
15 |
16 | def listdir_nohidden_gen(path):
17 | for f in os.listdir(path):
18 | if not f.startswith('.'):
19 | yield f
20 |
21 |
22 | def listdir_nohidden(path):
23 | return list(listdir_nohidden_gen(path))
24 |
25 |
26 | def runtime_import(class_name: str):
27 |     """
28 |     Runtime import from a string using "." to split module & class names
29 |
30 |     Args:
31 |         class_name (str): the class name to split according to "." and load dynamically modules & class
32 |
33 |     Returns:
34 |         Class: The imported class
35 |     """
36 |     import importlib
37 |
38 |     components = class_name.split(".")
39 |     mod = getattr(importlib.import_module(".".join(components[:-1])), components[-1])
40 |     return mod
42 |
43 |
44 | def full_classname(cls):
45 | """Return full class name with modules"""
46 | return cls.__module__ + "." + cls.__name__
47 |
48 |
49 | def instance_full_classname(o):
50 | # o.__module__ + "." + o.__class__.__qualname__ is an example in
51 | # this context of H.L. Mencken's "neat, plausible, and wrong."
52 | # Python makes no guarantees as to whether the __module__ special
53 | # attribute is defined, so we take a more circumspect approach.
54 | # Alas, the module name is explicitly excluded from __qualname__
55 | # in Python 3.
56 | module = o.__class__.__module__
57 | if module is None or module == str.__class__.__module__:
58 | return o.__class__.__name__ # Avoid reporting __builtin__
59 | else:
60 | return module + "." + o.__class__.__name__
61 |
62 |
63 | def _to_subtoken_stream(input_stream: Iterable[str], mark_subtoken_end: bool) -> Iterable[str]:
64 | """Generate chopped strings into sub-tokens strings (like snake-case)"""
65 | for token in input_stream:
66 | if IDENTIFIER_TOKEN_REGEX.match(token):
67 | yield from split_identifier_into_parts(token)
68 | if mark_subtoken_end:
69 | yield ""
70 | else:
71 | yield token
72 |
73 |
74 | def expand_data_path(data_path: str) -> List[Path]:
75 | """
76 | Expand data path as a simple directory or if a file, searches for directories in the file
77 |
78 | Args:
79 | data_path: A path to either a file or a directory. If it's a file, we interpret it as a list of
80 | data directories.
81 |
82 | Returns:
83 | List of data directories (potentially just data_path)
84 | """
85 | data_rpath = Path(data_path)
86 |
87 | if data_rpath.is_dir():
88 | return [data_rpath]
89 |
90 | data_dirs: List[Path] = []
91 | with open(data_rpath) as f:
92 | for fl in map(Path, f.read().splitlines()):
93 | if fl.is_absolute():
94 | data_dirs.append(fl)
95 | else:
96 | data_dirs.append(data_rpath.parent / fl)
97 |
98 | # data_dirs.extend(map(Path))
99 | return data_dirs
100 |
101 |
102 | def get_data_files_from_directory(data_dirs: List[Path], max_files_per_dir: Optional[int] = None) -> List[Path]:
103 | """Search all *.jsonl.gz files in a multiple paths and concatenate them"""
104 | files: List[Path] = []
105 | for data_dir in data_dirs:
106 | dir_files = [Path(path) for path in glob.iglob(os.path.join(data_dir, "*.jsonl.gz"), recursive=True)]
107 | # dir_files = data_dir.get_filtered_files_in_dir("*.jsonl.gz")
108 | if max_files_per_dir:
109 | dir_files = sorted(dir_files)[: int(max_files_per_dir)]
110 | files += dir_files
111 |
112 |     np.random.shuffle(files)  # shuffle in place to avoid large_file_0, large_file_1, ... subsequences
113 | return files
114 |
115 |
116 | # Some streaming pickles (not used)
117 |
118 |
119 | def stream_dump(iterable_to_pickle, file_obj):
120 | """
121 | Dump contents of an iterable iterable_to_pickle to file_obj, a file
122 | opened in write mode
123 | """
124 | for elt in iterable_to_pickle:
125 | stream_dump_elt(elt, file_obj)
126 |
127 |
128 | def stream_dump_elt(elt_to_pickle, file_obj):
129 | """Dump one element to file_obj, a file opened in write mode"""
130 | pickled_elt = dumps(elt_to_pickle)
131 | encoded = base64.b64encode(pickled_elt)
132 | file_obj.write(encoded)
133 |
134 | # record separator is a blank line
135 | # (since pickled_elt as base64 encoded cannot contain its own newlines)
136 | file_obj.write(b"\n\n")
137 |
138 |
139 | def stream_load(file_obj):
140 | """
141 | Load contents from file_obj, returning a generator that yields one
142 | element at a time
143 | """
144 | cur_elt = []
145 | for line in file_obj:
146 | if line == b"\n":
147 | encoded_elt = b"".join(cur_elt)
148 | try:
149 | pickled_elt = base64.b64decode(encoded_elt)
150 | elt = loads(pickled_elt)
151 | except EOFError:
152 | print("EOF found while unpickling data")
153 | print(pickled_elt)
154 |                 return  # PEP 479: raising StopIteration inside a generator is an error
155 | cur_elt = []
156 | yield elt
157 | else:
158 | cur_elt.append(line)
159 |
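A sketch of the data discovery helpers; the path is hypothetical and may be either a directory of *.jsonl.gz files or a text file listing such directories (one per line):

    data_dirs = expand_data_path("./resources/data_dirs_train.txt")
    files = get_data_files_from_directory(data_dirs, max_files_per_dir=2)
    print(f"{len(files)} shuffled .jsonl.gz files")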
--------------------------------------------------------------------------------
/conf/code_search_bert_2020_02_01_1500.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | training {
4 | name = "code_search_bert"
5 | iteration = "2020_02_01_15_00"
6 |
7 | model {
8 | type = "single_query_multi_code"
9 | query_encoder = ${bert}
10 | code_encoder = ${bert}
11 | }
12 | }
--------------------------------------------------------------------------------
/conf/code_search_bert_2020_02_03_20_00.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | dataset {
4 | common_params {
5 | parallelize = false
6 | }
7 | }
8 |
9 | training {
10 | name = "code_search_bert"
11 | iteration = "2020_02_03_20_00"
12 |
13 | model {
14 | type = "single_query_multi_code"
15 | query_encoder = ${bert}
16 | code_encoder = ${bert}
17 | }
18 | }
--------------------------------------------------------------------------------
/conf/code_search_bert_lg_2020_02_04_15_00.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | tokenizers {
4 | build_path = "./build_tokenizers/with_lang"
5 | }
6 |
7 | dataset {
8 | common_params {
9 | parallelize = false
10 | do_lowercase = true
11 | special_tokens = ["", ""]
12 | }
13 | }
14 |
15 | training {
16 | name = "code_search_bert"
17 | iteration = "2020_02_04_21_00"
18 |
19 | }
--------------------------------------------------------------------------------
/conf/code_search_bert_lg_2020_02_04_21_00.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | tokenizers {
4 | build_path = "./build_tokenizers/with_lang"
5 | }
6 |
7 | dataset {
8 | common_params {
9 | parallelize = false
10 | do_lowercase = true
11 | special_tokens = ["", ""]
12 | }
13 | }
14 |
15 | training {
16 | name = "code_search_bert"
17 | iteration = "2020_02_04_21_00"
18 | }
--------------------------------------------------------------------------------
/conf/code_search_bert_lg_2020_02_05_00_00.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | # bert {
4 | # hidden_size = 256
5 | # vocab_size = ${common_vocab_size}
6 | # intermediate_size = 1024
7 | # num_hidden_layers = 6
8 | # num_attention_heads = 8
9 | # }
10 |
11 | tokenizers {
12 | build_path = "./build_tokenizers/with_lang"
13 | }
14 |
15 | dataset {
16 | common_params {
17 | parallelize = false
18 | do_lowercase = true
19 | special_tokens = ["", ""]
20 | }
21 | }
22 |
23 | training {
24 | name = "code_search_bert_lg"
25 | iteration = "2020_02_05_00_00"
26 |
27 | batch_size {
28 | train = 170
29 | val = 170
30 | test = 170
31 | }
32 |
33 | model {
34 | type = "single_query_single_code"
35 | output_size = 128
36 | query_encoder {
37 | hidden_size = ${training.model.output_size}
38 | vocab_size = ${common_vocab_size}
39 | intermediate_size = 512
40 | num_hidden_layers = 3
41 | num_attention_heads = 8
42 | }
43 | code_encoder {
44 | hidden_size = ${training.model.output_size}
45 | vocab_size = ${common_vocab_size}
46 | intermediate_size = 1024
47 | num_hidden_layers = 6
48 | num_attention_heads = 8
49 | }
50 | }
51 |
52 | }
--------------------------------------------------------------------------------
/conf/code_search_bert_lg_2020_02_06_18_00.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | tokenizers {
4 | build_path = "./build_tokenizers/with_lang"
5 | }
6 |
7 | dataset {
8 | common_params {
9 | parallelize = false
10 | do_lowercase = true
11 | special_tokens = ["", ""]
12 | }
13 | }
14 |
15 | training {
16 | name = "code_search_bert_lg"
17 | iteration = "2020_02_06_18_00"
18 |
19 | batch_size {
20 | train = 200
21 | val = 200
22 | test = 200
23 | }
24 |
25 | model {
26 | type = "single_query_single_code"
27 | output_size = 64
28 | query_encoder {
29 | hidden_size = ${training.model.output_size}
30 | vocab_size = ${common_vocab_size}
31 | intermediate_size = 512
32 | num_hidden_layers = 3
33 | num_attention_heads = 8
34 | }
35 | code_encoder {
36 | hidden_size = ${training.model.output_size}
37 | vocab_size = ${common_vocab_size}
38 | intermediate_size = 1024
39 | num_hidden_layers = 6
40 | num_attention_heads = 8
41 | }
42 | }
43 |
44 | }
--------------------------------------------------------------------------------
/conf/code_search_bert_lg_2020_02_06_22_30.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | tokenizers {
4 | build_path = "./build_tokenizers/with_lang"
5 | }
6 |
7 | dataset {
8 | common_params {
9 | parallelize = false
10 | do_lowercase = true
11 | special_tokens = ["", ""]
12 | }
13 | }
14 |
15 | training {
16 | name = "code_search_bert_lg"
17 | iteration = "2020_02_06_22_30"
18 |
19 | batch_size {
20 | train = 170
21 | val = 170
22 | test = 170
23 | }
24 |
25 | model {
26 | type = "single_query_single_code"
27 | output_size = 256
28 | query_encoder {
29 | hidden_size = ${training.model.output_size}
30 | vocab_size = ${common_vocab_size}
31 | intermediate_size = 512
32 | num_hidden_layers = 3
33 | num_attention_heads = 8
34 | }
35 | code_encoder {
36 | hidden_size = ${training.model.output_size}
37 | vocab_size = ${common_vocab_size}
38 | intermediate_size = 1024
39 | num_hidden_layers = 6
40 | num_attention_heads = 8
41 | }
42 | }
43 |
44 | }
--------------------------------------------------------------------------------
/conf/code_search_bert_lg_2020_02_07_10_00.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | tokenizers {
4 | build_path = "./build_tokenizers/with_lang"
5 | }
6 |
7 | dataset {
8 | common_params {
9 | parallelize = false
10 | do_lowercase = true
11 | special_tokens = ["", ""]
12 | }
13 | }
14 |
15 | training {
16 | short_circuit = True
17 | name = "code_search_bert_lg"
18 | iteration = "2020_02_07_10_00"
19 |
20 | model {
21 | training_ctx_class = "codenets.codesearchnet.single_branch_ctx.SingleBranchTrainingContext"
22 | output_size = 256
23 | query_encoder {
24 | hidden_size = ${training.model.output_size}
25 | vocab_size = ${common_vocab_size}
26 | intermediate_size = 512
27 | num_hidden_layers = 3
28 | num_attention_heads = 8
29 | }
30 | code_encoder {
31 | hidden_size = ${training.model.output_size}
32 | vocab_size = ${common_vocab_size}
33 | intermediate_size = 1024
34 | num_hidden_layers = 6
35 | num_attention_heads = 8
36 | }
37 | }
38 |
39 | batch_size {
40 | train = 170
41 | val = 170
42 | test = 170
43 | }
44 |
45 | device = "cpu"
46 | wandb = false
47 | tensorboard = false
48 |
49 | }
--------------------------------------------------------------------------------
/conf/code_search_bert_query_1_code_1_2020_02_10_11_00 copy.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | tokenizers {
4 | build_path = "./build_tokenizers/with_lang_query_1_code_1"
5 | }
6 |
7 | dataset {
8 | common_params {
9 | parallelize = false
10 | do_lowercase = true
11 | special_tokens = ["", ""]
12 | }
13 | }
14 |
15 | training {
16 | short_circuit = False
17 | name = "code_search_bert_query_1_code_1"
18 | iteration = "2020_02_10_11_00"
19 | tokenizer_type = "query_1_code_1"
20 | model {
21 | training_ctx_class = "codenets.codesearchnet.single_branch_ctx.SingleBranchTrainingContext"
22 | output_size = 128
23 | query_encoder {
24 | hidden_size = ${training.model.output_size}
25 | vocab_size = ${common_vocab_size}
26 | intermediate_size = 512
27 | num_hidden_layers = 3
28 | num_attention_heads = 8
29 | }
30 | code_encoder {
31 | hidden_size = ${training.model.output_size}
32 | vocab_size = ${common_vocab_size}
33 | intermediate_size = 1024
34 | num_hidden_layers = 6
35 | num_attention_heads = 8
36 | }
37 | }
38 |
39 | batch_size {
40 | train = 170
41 | val = 170
42 | test = 170
43 | }
44 |
45 | device = "cuda"
46 | wandb = true
47 | tensorboard = true
48 |
49 | }
--------------------------------------------------------------------------------
/conf/code_search_bert_query_1_code_1_2020_02_10_11_00.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | tokenizers {
4 | build_path = "./build_tokenizers/with_lang_query_1_code_1"
5 | }
6 |
7 | dataset {
8 | common_params {
9 | parallelize = false
10 | do_lowercase = true
11 | special_tokens = ["", ""]
12 | }
13 | }
14 |
15 | training {
16 | short_circuit = False
17 | name = "code_search_bert_query_1_code_1"
18 | iteration = "2020_02_10_11_00"
19 | tokenizer_type = "query_1_code_1"
20 | model {
21 | training_ctx_class = "codenets.codesearchnet.single_branch_ctx.SingleBranchTrainingContext"
22 | output_size = 128
23 | query_encoder {
24 | hidden_size = ${training.model.output_size}
25 | vocab_size = ${common_vocab_size}
26 | intermediate_size = 512
27 | num_hidden_layers = 3
28 | num_attention_heads = 8
29 | }
30 | code_encoder {
31 | hidden_size = ${training.model.output_size}
32 | vocab_size = ${common_vocab_size}
33 | intermediate_size = 1024
34 | num_hidden_layers = 6
35 | num_attention_heads = 8
36 | }
37 | }
38 |
39 | batch_size {
40 | train = 170
41 | val = 170
42 | test = 170
43 | }
44 |
45 | device = "cuda"
46 | wandb = true
47 | tensorboard = true
48 |
49 | }
--------------------------------------------------------------------------------
/conf/code_search_bert_query_1_code_1_2020_02_11_22_00 copy.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | tokenizers {
4 | build_path = "./build_tokenizers/with_lang_query_1_code_1"
5 | }
6 |
7 | dataset {
8 | common_params {
9 | parallelize = false
10 | do_lowercase = true
11 | special_tokens = ["", ""]
12 | }
13 | }
14 |
15 | training {
16 | short_circuit = false
17 |
18 | device = "cuda"
19 | wandb = true
20 | tensorboard = true
21 |
22 | name = "code_search_bert_query_1_code_1"
23 | iteration = "2020_02_11_22_00"
24 | tokenizer_type = "query_1_code_1"
25 | model {
26 | training_ctx_class = "codenets.codesearchnet.query_1_code_1.training_ctx.Query1Code1Ctx"
27 | output_size = 64
28 | query_encoder {
29 | hidden_size = ${training.model.output_size}
30 | vocab_size = ${common_vocab_size}
31 | intermediate_size = 512
32 | num_hidden_layers = 3
33 | num_attention_heads = 8
34 | }
35 | code_encoder {
36 | hidden_size = ${training.model.output_size}
37 | vocab_size = ${common_vocab_size}
38 | intermediate_size = 512
39 | num_hidden_layers = 6
40 | num_attention_heads = 8
41 | }
42 | }
43 |
44 | batch_size {
45 | train = 256
46 | val = 256
47 | test = 256
48 | }
49 |
50 | }
--------------------------------------------------------------------------------
/conf/code_search_bert_query_1_code_1_2020_02_11_22_00.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | tokenizers {
4 | build_path = "./build_tokenizers/with_lang_query_1_code_1"
5 | }
6 |
7 | dataset {
8 | common_params {
9 | parallelize = false
10 | do_lowercase = true
11 | special_tokens = ["", ""]
12 | }
13 | }
14 |
15 | training {
16 | short_circuit = false
17 |
18 | device = "cuda"
19 | wandb = true
20 | tensorboard = true
21 |
22 | name = "code_search_bert_query_1_code_1"
23 | iteration = "2020_02_11_22_00"
24 | tokenizer_type = "query_1_code_1"
25 | model {
26 | training_ctx_class = "codenets.codesearchnet.query_1_code_1.training_ctx.Query1Code1Ctx"
27 | output_size = 64
28 | query_encoder {
29 | hidden_size = ${training.model.output_size}
30 | vocab_size = ${common_vocab_size}
31 | intermediate_size = 512
32 | num_hidden_layers = 3
33 | num_attention_heads = 8
34 | }
35 | code_encoder {
36 | hidden_size = ${training.model.output_size}
37 | vocab_size = ${common_vocab_size}
38 | intermediate_size = 512
39 | num_hidden_layers = 6
40 | num_attention_heads = 8
41 | }
42 | }
43 |
44 | batch_size {
45 | train = 256
46 | val = 256
47 | test = 256
48 | }
49 |
50 | }
--------------------------------------------------------------------------------
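training_ctx_class is stored as a dotted import path. A hypothetical helper (not taken from the repo) that resolves such a string into the actual class could look like this:

    # Hypothetical helper: resolve a dotted path such as
    # "codenets.codesearchnet.query_1_code_1.training_ctx.Query1Code1Ctx"
    # into the class object it names.
    import importlib

    def load_class(dotted_path: str) -> type:
        module_path, class_name = dotted_path.rsplit(".", 1)
        module = importlib.import_module(module_path)
        return getattr(module, class_name)

With a helper like this, switching between Query1Code1Ctx and QueryCodeSiameseCtx is purely a matter of editing the conf file.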
/conf/code_search_bert_query_code_siamese_2020_02_12_00_00 copy.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 60000 # (5 lang + 1 query) * 10000

4 |
5 | tokenizers {
6 | build_path = "./build_tokenizers/with_lang_query_code_siamese"
7 | token_files = "./build_tokenizers/token_files_query_code_siamese"
8 | }
9 |
10 | dataset {
11 | common_params {
12 | parallelize = false
13 | do_lowercase = true
14 | special_tokens = ["", "", ""]
15 | }
16 | }
17 |
18 | training {
19 | short_circuit = false
20 |
21 | device = "cuda"
22 | wandb = true
23 | tensorboard = true
24 |
25 | name = "code_search_bert_query_code_siamese"
26 | iteration = "2020_02_12_00_00"
27 | tokenizer_type = "query_code_siamese"
28 | model {
29 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
30 | output_size = 512
31 | encoder {
32 | hidden_size = ${training.model.output_size}
33 | vocab_size = ${common_vocab_size}
34 | intermediate_size = 1024
35 | num_hidden_layers = 6
36 | num_attention_heads = 8
37 | }
38 | }
39 |
40 | batch_size {
41 | train = 128
42 | val = 128
43 | test = 128
44 | }
45 |
46 | }
--------------------------------------------------------------------------------
/conf/code_search_bert_query_code_siamese_2020_02_12_00_00.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 60000 # (5 lang + 1 query) * 10000
4 |
5 | tokenizers {
6 | build_path = "./build_tokenizers/with_lang_query_code_siamese"
7 | token_files = "./build_tokenizers/token_files_query_code_siamese"
8 | }
9 |
10 | dataset {
11 | common_params {
12 | parallelize = false
13 | do_lowercase = true
14 | special_tokens = ["", "", ""]
15 | }
16 | }
17 |
18 | training {
19 | short_circuit = false
20 |
21 | device = "cuda"
22 | wandb = true
23 | tensorboard = true
24 |
25 | name = "code_search_bert_query_code_siamese"
26 | iteration = "2020_02_12_00_00"
27 | tokenizer_type = "query_code_siamese"
28 | model {
29 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
30 | output_size = 512
31 | encoder {
32 | hidden_size = ${training.model.output_size}
33 | vocab_size = ${common_vocab_size}
34 | intermediate_size = 1024
35 | num_hidden_layers = 6
36 | num_attention_heads = 8
37 | }
38 | }
39 |
40 | batch_size {
41 | train = 128
42 | val = 128
43 | test = 128
44 | }
45 |
46 | }
--------------------------------------------------------------------------------
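In the siamese configuration a single encoder block replaces the query_encoder/code_encoder pair, so queries and code are embedded by the same weights. Assuming the default softmax_cross_entropy loss is computed over in-batch negatives (a common setup for this kind of retrieval training, stated here as an assumption), the scoring step can be sketched as:

    # Sketch of in-batch softmax cross-entropy for a siamese retrieval batch.
    # Assumption: query_emb[i] and code_emb[i] come from the same (query, code) pair.
    import torch
    import torch.nn.functional as F

    def in_batch_softmax_ce(query_emb: torch.Tensor, code_emb: torch.Tensor) -> torch.Tensor:
        # (batch, dim) x (dim, batch) -> (batch, batch) similarity matrix
        logits = query_emb @ code_emb.t()
        # diagonal entries are the positive pairs; every other column is a negative
        targets = torch.arange(logits.size(0), device=logits.device)
        return F.cross_entropy(logits, targets)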
/conf/code_search_bert_query_code_siamese_2020_02_14_16_00 copy.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 60000 # (5 lang + 1 query) * 10000
4 |
5 | tokenizers {
6 | build_path = "./build_tokenizers/with_lang_query_code_siamese"
7 | token_files = "./build_tokenizers/token_files_query_code_siamese"
8 | }
9 |
10 | dataset {
11 | common_params {
12 | parallelize = false
13 | do_lowercase = true
14 | special_tokens = ["", "", ""]
15 | }
16 | }
17 |
18 | training {
19 | short_circuit = false
20 |
21 | device = "cuda"
22 | wandb = true
23 | tensorboard = true
24 |
25 | name = "code_search_bert_query_code_siamese"
26 | iteration = "2020_02_14_16_00"
27 | tokenizer_type = "query_code_siamese"
28 | model {
29 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
30 | output_size = 72
31 | encoder {
32 | hidden_size = ${training.model.output_size}
33 | vocab_size = ${common_vocab_size}
34 | intermediate_size = 256
35 | num_hidden_layers = 12
36 | num_attention_heads = 12
37 | }
38 | }
39 |
40 | batch_size {
41 | train = 100
42 | val = 100
43 | test = 100
44 | }
45 |
46 | }
--------------------------------------------------------------------------------
/conf/code_search_bert_query_code_siamese_2020_02_14_16_00.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 60000 # (5 lang + 1 query) * 10000
4 |
5 | tokenizers {
6 | build_path = "./build_tokenizers/with_lang_query_code_siamese"
7 | token_files = "./build_tokenizers/token_files_query_code_siamese"
8 | }
9 |
10 | dataset {
11 | common_params {
12 | parallelize = false
13 | do_lowercase = true
14 | special_tokens = ["", "", ""]
15 | }
16 | }
17 |
18 | training {
19 | short_circuit = false
20 |
21 | device = "cuda"
22 | wandb = true
23 | tensorboard = true
24 |
25 | name = "code_search_bert_query_code_siamese"
26 | iteration = "2020_02_14_16_00"
27 | tokenizer_type = "query_code_siamese"
28 | model {
29 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
30 | output_size = 72
31 | encoder {
32 | hidden_size = ${training.model.output_size}
33 | vocab_size = ${common_vocab_size}
34 | intermediate_size = 256
35 | num_hidden_layers = 12
36 | num_attention_heads = 12
37 | }
38 | }
39 |
40 | batch_size {
41 | train = 100
42 | val = 100
43 | test = 100
44 | }
45 |
46 | }
--------------------------------------------------------------------------------
/conf/code_search_bert_query_code_siamese_2020_02_15_14_00.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 60000 # (5 lang + 1 query) * 10000
4 |
5 | tokenizers {
6 | build_path = "./build_tokenizers/with_lang_query_code_siamese"
7 | token_files = "./build_tokenizers/token_files_query_code_siamese"
8 | }
9 |
10 | dataset {
11 | common_params {
12 | parallelize = false
13 | do_lowercase = true
14 | special_tokens = ["", "", ""]
15 | }
16 | }
17 |
18 | training {
19 | short_circuit = true
20 |
21 | device = "cuda"
22 | wandb = false
23 | tensorboard = false
24 |
25 | name = "code_search_siamese"
26 | iteration = "2020_02_15_14_00"
27 | tokenizer_type = "query_code_siamese"
28 | # Temporary: the Rust tokenizers do not handle common tokens yet
29 | common_tokens_file = "./pickles/common_tokens_"${training.tokenizer_type}"_"${training.iteration}".p"
30 |
31 | model {
32 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
33 | output_size = 72
34 | encoder {
35 | hidden_size = ${training.model.output_size}
36 | vocab_size = ${common_vocab_size}
37 | intermediate_size = 256
38 | num_hidden_layers = 12
39 | num_attention_heads = 12
40 | }
41 | }
42 |
43 | batch_size {
44 | train = 100
45 | val = 100
46 | test = 100
47 | }
48 |
49 | }
--------------------------------------------------------------------------------
/conf/default.conf:
--------------------------------------------------------------------------------
1 |
2 | lang_ids {
3 | php = 0
4 | python = 1
5 | ruby = 2
6 | java = 3
7 | go = 4
8 | javascript = 5
9 | }
10 |
11 | common_vocab_size = 10000
12 |
13 | bert {
14 | hidden_size = 128
15 | vocab_size = ${common_vocab_size}
16 | intermediate_size = 512
17 | num_hidden_layers = 3
18 | num_attention_heads = 8
19 | }
20 |
21 | tokenizers {
22 | type = "TOKENIZER_TYPE"
23 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
24 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
25 | }
26 |
27 | dataset {
28 | root_dir = ${HOME}"/workspaces/tools/CodeSearchNet/resources"
29 | common_params {
30 | fraction_using_func_name=0.1
31 | min_len_func_name_for_query=12
32 | use_subtokens=False
33 | mark_subtoken_end=False
34 | code_max_num_tokens=200
35 | query_max_num_tokens=30
36 | use_bpe=True
37 | vocab_size=${common_vocab_size}
38 | pct_bpe=0.5
39 | vocab_count_threshold=10
40 | lang_ids = ${lang_ids}
41 | do_lowercase = true
42 | special_tokens = [""]
43 | parallelize = true
44 | use_lang_weights = False
45 | }
46 |
47 | train {
48 | dirs = ${dataset.root_dir}"/data_dirs_train.txt"
49 | params = ${dataset.common_params}
50 | }
51 |
52 | val {
53 | dirs = ${dataset.root_dir}"/data_dirs_valid.txt"
54 | params = ${dataset.common_params}
55 | }
56 |
57 | test {
58 | dirs = ${dataset.root_dir}"/data_dirs_test.txt"
59 | params = ${dataset.common_params}
60 | }
61 |
62 | queries_file = ${dataset.root_dir}"/queries.csv"
63 | }
64 |
65 |
66 | training {
67 | # The name of the current experiment (one experiment can have several runs)
68 | name = "EXPERIMENT_NAME"
69 | # The unique ID of the current run
70 | iteration = "UNIQUE_RUN_ID"
71 | # The ID used to identify the pre-built pickled files
72 | # using the tokenizer defined above
73 | tokenizer_type = "TOKENIZER_ID"
74 |
75 | # Set this to true to test your run without loading the (slow-to-load) train dataset
76 | short_circuit = false
77 |
78 | device = "cuda"
79 | # set these to false to deactivate wandb & tensorboard
80 | wandb = true
81 | tensorboard = true
82 |
83 | model {
84 | # IMPORTANT: the fully-qualified class implementing the training context
85 | training_ctx_class = "codenets.codesearchnet.query_1_code_1.training_ctx.Query1Code1Ctx"
86 | output_size = 64
87 | query_encoder {
88 | hidden_size = ${training.model.output_size}
89 | vocab_size = ${common_vocab_size}
90 | intermediate_size = 512
91 | num_hidden_layers = 3
92 | num_attention_heads = 8
93 | }
94 | code_encoder {
95 | hidden_size = ${training.model.output_size}
96 | vocab_size = ${common_vocab_size}
97 | intermediate_size = 512
98 | num_hidden_layers = 6
99 | num_attention_heads = 8
100 | }
101 | }
102 |
103 | # Training Hyper-Parameters
104 | seed = 0
105 | lr = 0.0001
106 | max_grad_norm = 1.0
107 | min_log_interval = 50
108 | start_epoch = 0
109 | epochs = 10
110 |
111 | batch_size {
112 | train = 256
113 | val = 256
114 | test = 256
115 | }
116 |
117 | loss {
118 | type = "softmax_cross_entropy"
119 | margin = 1.0
120 | }
121 |
122 | # Paths
123 | pickle_path = "./pickles"
124 | output_dir = "./checkpoints"
125 | tensorboard_path = "./runs"
126 |
127 | }
--------------------------------------------------------------------------------
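default.conf is the base layer every experiment file includes and then overrides; ${...} references such as ${common_vocab_size} or ${training.model.output_size} are resolved against the merged tree. Assuming the files are read with pyhocon (an assumption; the loading code is not part of this dump), the mechanics look like this:

    # Sketch of how an experiment conf resolves on top of default.conf with pyhocon.
    from pyhocon import ConfigFactory

    conf = ConfigFactory.parse_file("conf/code_search_bert_query_code_siamese_2020_02_12_00_00.conf")

    # include "./default.conf" is merged first, then the file's own blocks override it,
    # and substitutions are resolved on the merged tree:
    conf.get_string("training.name")                   # "code_search_bert_query_code_siamese"
    conf.get_int("training.model.encoder.vocab_size")  # 60000, via the common_vocab_size override
    conf.get_int("training.batch_size.train")          # 128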
/conf/qc_ast_2020_03_13.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens=200
26 | use_subtokens = false # to do later
27 | #query_embeddings="sbert"
28 | fraction_using_func_name=0.1
29 | use_ast = "tree-sitter"
30 | ast_added_nodes = {
31 | "php": {"prefix": ""},
32 | "java": {"prefix": "class Toto {", "suffix": "}"}
33 | }
34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]}
35 |
36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ]
37 | }
38 | }
39 |
40 | training {
41 | short_circuit = true
42 |
43 | device = "cuda"
44 | wandb = false
45 | tensorboard = false
46 |
47 | name = "qc_ast"
48 | iteration = "2020_03_15"
49 | tokenizer_type = ${tokenizers.type}
50 |
51 | model {
52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
53 | encoder {
54 | hidden_size = 32
55 | vocab_size = ${common_vocab_size}
56 | intermediate_size = 128
57 | num_hidden_layers = 3
58 | num_attention_heads = 8
59 | }
60 | }
61 | lr = 0.001
62 |
63 | loss {
64 | type = "lambda_loss"
65 | }
66 |
67 | batch_size {
68 | #train = 400
69 | #val = 400
70 | #test = 400
71 | train = 5
72 | val = 5
73 | test = 5
74 | }
75 |
76 | }
--------------------------------------------------------------------------------
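use_ast = "tree-sitter" together with ast_added_nodes and ast_skip_node_types suggests that code is parsed with tree-sitter, wrapped with a language-specific prefix/suffix so bare fragments parse cleanly, and that error nodes are dropped. A rough sketch under those assumptions (the grammar path and the helper below are hypothetical, not taken from the repo; classic py-tree-sitter API):

    # Rough sketch: parse a Java snippet with py-tree-sitter, mirroring
    # ast_added_nodes ("class Toto { ... }") and ast_skip_node_types (["ERROR"]).
    from tree_sitter import Language, Parser

    JAVA = Language("build/langs.so", "java")   # hypothetical path to compiled grammars
    parser = Parser()
    parser.set_language(JAVA)

    def ast_leaf_tokens(code: str, prefix: str = "class Toto {", suffix: str = "}",
                        skip_types: tuple = ("ERROR",)) -> list:
        wrapped = f"{prefix}{code}{suffix}".encode("utf8")
        tree = parser.parse(wrapped)
        tokens = []

        def walk(node):
            if node.type in skip_types:   # drop error nodes, as ast_skip_node_types does
                return
            if node.child_count == 0:     # leaf node: keep its source text
                tokens.append(wrapped[node.start_byte:node.end_byte].decode("utf8"))
            for child in node.children:
                walk(child)

        walk(tree.root_node)
        return tokens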
/conf/qc_ast_2020_03_15 copy.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens=1024
26 | use_subtokens = false # to do later
27 | #query_embeddings="sbert"
28 | fraction_using_func_name=0.1
29 | use_ast = "tree-sitter"
30 | ast_added_nodes = {
31 | "php": {"prefix": ""},
32 | "java": {"prefix": "class Toto {", "suffix": "}"}
33 | }
34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]}
35 |
36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ]
37 | }
38 | }
39 |
40 | training {
41 | short_circuit = true
42 |
43 | device = "cuda"
44 | wandb = false
45 | tensorboard = false
46 |
47 | name = "qc_ast"
48 | iteration = "2020_03_15"
49 | tokenizer_type = ${tokenizers.type}"_ast"
50 |
51 | model {
52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
53 | encoder {
54 | hidden_size = 64
55 | vocab_size = ${common_vocab_size}
56 | intermediate_size = 512
57 | num_hidden_layers = 3
58 | num_attention_heads = 8
59 | }
60 | }
61 | lr = 0.00001
62 |
63 | loss {
64 | type = "softmax_cross_entropy"
65 | }
66 |
67 | batch_size {
68 | train = 8
69 | val = 8
70 | test = 8
71 | #train = 5
72 | #val = 5
73 | #test = 5
74 | }
75 |
76 | }
--------------------------------------------------------------------------------
/conf/qc_ast_2020_03_15.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens=1024
26 | use_subtokens = false # to do later
27 | #query_embeddings="sbert"
28 | fraction_using_func_name=0.1
29 | use_ast = "tree-sitter"
30 | ast_added_nodes = {
31 | "php": {"prefix": ""},
32 | "java": {"prefix": "class Toto {", "suffix": "}"}
33 | }
34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]}
35 |
36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ]
37 | }
38 | }
39 |
40 | training {
41 | short_circuit = true
42 |
43 | device = "cuda"
44 | wandb = false
45 | tensorboard = false
46 |
47 | name = "qc_ast"
48 | iteration = "2020_03_15"
49 | tokenizer_type = ${tokenizers.type}"_ast"
50 |
51 | model {
52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
53 | encoder {
54 | hidden_size = 64
55 | vocab_size = ${common_vocab_size}
56 | intermediate_size = 512
57 | num_hidden_layers = 3
58 | num_attention_heads = 8
59 | }
60 | }
61 | lr = 0.00001
62 |
63 | loss {
64 | type = "softmax_cross_entropy"
65 | }
66 |
67 | batch_size {
68 | train = 8
69 | val = 8
70 | test = 8
71 | #train = 5
72 | #val = 5
73 | #test = 5
74 | }
75 |
76 | }
--------------------------------------------------------------------------------
/conf/qc_ast_2020_03_17.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens=1024
26 | use_subtokens = false # to do later
27 | #query_embeddings="sbert"
28 | fraction_using_func_name=0.1
29 | use_ast = "tree-sitter"
30 | ast_added_nodes = {
31 | "php": {"prefix": ""},
32 | "java": {"prefix": "class Toto {", "suffix": "}"}
33 | }
34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]}
35 |
36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ]
37 | }
38 | }
39 |
40 | training {
41 | short_circuit = true
42 |
43 | device = "cuda"
44 | wandb = false
45 | tensorboard = false
46 |
47 | name = "qc_ast"
48 | iteration = "2020_03_15"
49 | tokenizer_type = ${tokenizers.type}"_ast"
50 |
51 | model {
52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
53 | encoder {
54 | hidden_size = 64
55 | vocab_size = ${common_vocab_size}
56 | intermediate_size = 512
57 | num_hidden_layers = 3
58 | num_attention_heads = 8
59 | }
60 | }
61 | lr = 0.00001
62 |
63 | loss {
64 | type = "softmax_cross_entropy"
65 | }
66 |
67 | batch_size {
68 | train = 8
69 | val = 8
70 | test = 8
71 | #train = 5
72 | #val = 5
73 | #test = 5
74 | }
75 |
76 | }
--------------------------------------------------------------------------------
/conf/qc_ast_2020_03_18 copy.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens = 512
26 | use_subtokens = false # to do later
27 | #query_embeddings="sbert"
28 | fraction_using_func_name=0.1
29 | use_ast = "tree-sitter"
30 | ast_added_nodes = {
31 | "php": {"prefix": ""},
32 | "java": {"prefix": "class Toto {", "suffix": "}"}
33 | }
34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]}
35 |
36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ]
37 | }
38 | }
39 |
40 | training {
41 | short_circuit = false
42 |
43 | device = "cuda"
44 | wandb = true
45 | tensorboard = false
46 |
47 | name = "qc_ast"
48 | iteration = "2020_03_18"
49 | tokenizer_type = ${tokenizers.type}"_ast_512"
50 |
51 | model {
52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
53 | encoder {
54 | hidden_size = 128
55 | vocab_size = 30370
56 | intermediate_size = 512
57 | num_hidden_layers = 3
58 | num_attention_heads = 8
59 | }
60 | }
61 | lr = 0.0001
62 |
63 | loss {
64 | type = "softmax_cross_entropy"
65 | }
66 |
67 | batch_size {
68 | train = 85
69 | val = 85
70 | test = 85
71 | #train = 5
72 | #val = 5
73 | #test = 5
74 | }
75 |
76 | }
--------------------------------------------------------------------------------
/conf/qc_ast_2020_03_18.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens = 512
26 | use_subtokens = false # to do later
27 | #query_embeddings="sbert"
28 | fraction_using_func_name=0.1
29 | use_ast = "tree-sitter"
30 | ast_added_nodes = {
31 | "php": {"prefix": ""},
32 | "java": {"prefix": "class Toto {", "suffix": "}"}
33 | }
34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]}
35 |
36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ]
37 | }
38 | }
39 |
40 | training {
41 | short_circuit = false
42 |
43 | device = "cuda"
44 | wandb = true
45 | tensorboard = false
46 |
47 | name = "qc_ast"
48 | iteration = "2020_03_18"
49 | tokenizer_type = ${tokenizers.type}"_ast_512"
50 |
51 | model {
52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
53 | encoder {
54 | hidden_size = 128
55 | vocab_size = 30370
56 | intermediate_size = 512
57 | num_hidden_layers = 3
58 | num_attention_heads = 8
59 | }
60 | }
61 | lr = 0.0001
62 |
63 | loss {
64 | type = "softmax_cross_entropy"
65 | }
66 |
67 | batch_size {
68 | train = 85
69 | val = 85
70 | test = 85
71 | #train = 5
72 | #val = 5
73 | #test = 5
74 | }
75 |
76 | }
--------------------------------------------------------------------------------
/conf/qc_ast_2020_03_19.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens = 256
26 | use_subtokens = false # to do later
27 | #query_embeddings="sbert"
28 | fraction_using_func_name=0.1
29 | use_ast = "tree-sitter"
30 | ast_added_nodes = {
31 | "php": {"prefix": ""},
32 | "java": {"prefix": "class Toto {", "suffix": "}"}
33 | }
34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]}
35 |
36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ]
37 | }
38 | }
39 |
40 | training {
41 | short_circuit = false
42 |
43 | device = "cuda"
44 | wandb = true
45 | tensorboard = false
46 |
47 | name = "qc_ast"
48 | iteration = "2020_03_19"
49 | tokenizer_type = ${tokenizers.type}"_ast_256"
50 |
51 | model {
52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
53 | encoder {
54 | hidden_size = 64
55 | vocab_size = 30370
56 | intermediate_size = 768
57 | num_hidden_layers = 3
58 | num_attention_heads = 8
59 | }
60 | }
61 | lr = 0.0001
62 |
63 | loss {
64 | type = "softmax_cross_entropy"
65 | }
66 |
67 | batch_size {
68 | train = 256
69 | val = 256
70 | test = 256
71 | #train = 5
72 | #val = 5
73 | #test = 5
74 | }
75 |
76 | }
--------------------------------------------------------------------------------
/conf/qc_ce_2020_02_23_01_00 copy.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | dataset {
13 | common_params {
14 | parallelize = false
15 | do_lowercase = true
16 | special_tokens = ["", "", ""]
17 | use_lang_weights = True
18 | }
19 | }
20 |
21 | training {
22 | short_circuit = false
23 |
24 | device = "cuda"
25 | wandb = true
26 | tensorboard = true
27 |
28 | name = "qc_ce"
29 | iteration = "2020_02_23_01_00"
30 | tokenizer_type = ${tokenizers.type}
31 |
32 | model {
33 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
34 | encoder {
35 | hidden_size = 32
36 | vocab_size = ${common_vocab_size}
37 | intermediate_size = 256
38 | num_hidden_layers = 2
39 | num_attention_heads = 8
40 | }
41 | }
42 |
43 | loss {
44 | type = "softmax_cross_entropy"
45 | }
46 |
47 | batch_size {
48 | train = 768
49 | val = 768
50 | test = 768
51 | }
52 |
53 | }
--------------------------------------------------------------------------------
/conf/qc_ce_2020_02_23_01_00.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | dataset {
13 | common_params {
14 | parallelize = false
15 | do_lowercase = true
16 | special_tokens = ["", "", ""]
17 | use_lang_weights = True
18 | }
19 | }
20 |
21 | training {
22 | short_circuit = false
23 |
24 | device = "cuda"
25 | wandb = true
26 | tensorboard = true
27 |
28 | name = "qc_ce"
29 | iteration = "2020_02_23_01_00"
30 | tokenizer_type = ${tokenizers.type}
31 |
32 | model {
33 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
34 | encoder {
35 | hidden_size = 32
36 | vocab_size = ${common_vocab_size}
37 | intermediate_size = 256
38 | num_hidden_layers = 2
39 | num_attention_heads = 8
40 | }
41 | }
42 |
43 | loss {
44 | type = "softmax_cross_entropy"
45 | }
46 |
47 | batch_size {
48 | train = 768
49 | val = 768
50 | test = 768
51 | }
52 |
53 | }
--------------------------------------------------------------------------------
/conf/qc_ce_long_seq_2020_02_24.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | dataset {
13 | common_params {
14 | parallelize = false
15 | do_lowercase = true
16 | special_tokens = ["", "", ""]
17 | use_lang_weights = True
18 | code_max_num_tokens=400 # mainly for JS which is more verbose
19 | }
20 | }
21 |
22 | training {
23 | short_circuit = false
24 |
25 | device = "cuda"
26 | wandb = true
27 | tensorboard = true
28 |
29 | name = "qc_ce"
30 | iteration = "2020_02_24"
31 | tokenizer_type = ${tokenizers.type}
32 |
33 | model {
34 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
35 | encoder {
36 | hidden_size = 32
37 | vocab_size = ${common_vocab_size}
38 | intermediate_size = 256
39 | num_hidden_layers = 2
40 | num_attention_heads = 8
41 | }
42 | }
43 |
44 | loss {
45 | type = "softmax_cross_entropy"
46 | }
47 |
48 | batch_size {
49 | train = 768
50 | val = 768
51 | test = 768
52 | }
53 |
54 | }
--------------------------------------------------------------------------------
/conf/qc_ce_sbert_2020_02_27 copy.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens=200
26 | use_subtokens=True
27 | query_embeddings="sbert"
28 | }
29 | }
30 |
31 | training {
32 | short_circuit = false
33 |
34 | device = "cuda"
35 | wandb = true
36 | tensorboard = true
37 |
38 | name = "qc_ce_sbert"
39 | iteration = "2020_02_27"
40 | tokenizer_type = ${tokenizers.type}
41 |
42 | model {
43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
44 | encoder {
45 | hidden_size = 32
46 | vocab_size = ${common_vocab_size}
47 | intermediate_size = 256
48 | num_hidden_layers = 2
49 | num_attention_heads = 8
50 | }
51 | }
52 |
53 | loss {
54 | type = "lambda_loss"
55 | }
56 |
57 | batch_size {
58 | train = 425
59 | val = 425
60 | test = 425
61 | }
62 |
63 | }
--------------------------------------------------------------------------------
/conf/qc_ce_sbert_2020_02_27.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens=200
26 | use_subtokens=True
27 | query_embeddings="sbert"
28 | }
29 | }
30 |
31 | training {
32 | short_circuit = false
33 |
34 | device = "cuda"
35 | wandb = true
36 | tensorboard = true
37 |
38 | name = "qc_ce_sbert"
39 | iteration = "2020_02_27"
40 | tokenizer_type = ${tokenizers.type}
41 |
42 | model {
43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
44 | encoder {
45 | hidden_size = 32
46 | vocab_size = ${common_vocab_size}
47 | intermediate_size = 256
48 | num_hidden_layers = 2
49 | num_attention_heads = 8
50 | }
51 | }
52 |
53 | loss {
54 | type = "lambda_loss"
55 | }
56 |
57 | batch_size {
58 | train = 425
59 | val = 425
60 | test = 425
61 | }
62 |
63 | }
--------------------------------------------------------------------------------
/conf/qc_ce_sbert_2020_02_28 copy.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens=200
26 | use_subtokens=True
27 | query_embeddings="sbert"
28 | }
29 | }
30 |
31 | training {
32 | short_circuit = false
33 |
34 | device = "cuda"
35 | wandb = true
36 | tensorboard = true
37 |
38 | name = "qc_ce_sbert"
39 | iteration = "2020_02_28"
40 | tokenizer_type = ${tokenizers.type}
41 |
42 | model {
43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
44 | encoder {
45 | hidden_size = 64
46 | vocab_size = ${common_vocab_size}
47 | intermediate_size = 512
48 | num_hidden_layers = 4
49 | num_attention_heads = 8
50 | }
51 | }
52 | lr = 0.0001
53 |
54 | loss {
55 | type = "lambda_loss"
56 | }
57 |
58 | batch_size {
59 | train = 300
60 | val = 300
61 | test = 300
62 | }
63 |
64 | }
--------------------------------------------------------------------------------
/conf/qc_ce_sbert_2020_02_28.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens=200
26 | use_subtokens=True
27 | query_embeddings="sbert"
28 | }
29 | }
30 |
31 | training {
32 | short_circuit = false
33 |
34 | device = "cuda"
35 | wandb = true
36 | tensorboard = true
37 |
38 | name = "qc_ce_sbert"
39 | iteration = "2020_02_28"
40 | tokenizer_type = ${tokenizers.type}
41 |
42 | model {
43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
44 | encoder {
45 | hidden_size = 64
46 | vocab_size = ${common_vocab_size}
47 | intermediate_size = 512
48 | num_hidden_layers = 4
49 | num_attention_heads = 8
50 | }
51 | }
52 | lr = 0.0001
53 |
54 | loss {
55 | type = "lambda_loss"
56 | }
57 |
58 | batch_size {
59 | train = 300
60 | val = 300
61 | test = 300
62 | }
63 |
64 | }
--------------------------------------------------------------------------------
/conf/qc_ce_sbert_2020_02_29 copy.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens=200
26 | use_subtokens=True
27 | query_embeddings="sbert"
28 | }
29 | }
30 |
31 | training {
32 | short_circuit = true
33 |
34 | device = "cuda"
35 | wandb = false
36 | tensorboard = false
37 |
38 | name = "qc_ce_sbert"
39 | iteration = "2020_02_29"
40 | tokenizer_type = ${tokenizers.type}
41 |
42 | model {
43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
44 | encoder {
45 | hidden_size = 128
46 | vocab_size = ${common_vocab_size}
47 | intermediate_size = 512
48 | num_hidden_layers = 4
49 | num_attention_heads = 8
50 | }
51 | }
52 | lr = 0.0001
53 |
54 | loss {
55 | type = "lambda_loss"
56 | }
57 |
58 | batch_size {
59 | train = 275
60 | val = 275
61 | test = 275
62 | }
63 |
64 | }
--------------------------------------------------------------------------------
/conf/qc_ce_sbert_2020_02_29.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens=200
26 | use_subtokens=True
27 | query_embeddings="sbert"
28 | }
29 | }
30 |
31 | training {
32 | short_circuit = true
33 |
34 | device = "cuda"
35 | wandb = false
36 | tensorboard = false
37 |
38 | name = "qc_ce_sbert"
39 | iteration = "2020_02_29"
40 | tokenizer_type = ${tokenizers.type}
41 |
42 | model {
43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
44 | encoder {
45 | hidden_size = 128
46 | vocab_size = ${common_vocab_size}
47 | intermediate_size = 512
48 | num_hidden_layers = 4
49 | num_attention_heads = 8
50 | }
51 | }
52 | lr = 0.0001
53 |
54 | loss {
55 | type = "lambda_loss"
56 | }
57 |
58 | batch_size {
59 | train = 275
60 | val = 275
61 | test = 275
62 | }
63 |
64 | }
--------------------------------------------------------------------------------
/conf/qc_ce_sbert_2020_03_01 copy.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens=200
26 | use_subtokens=True
27 | query_embeddings="sbert"
28 | }
29 | }
30 |
31 | training {
32 | short_circuit = true
33 |
34 | device = "cuda"
35 | wandb = false
36 | tensorboard = false
37 |
38 | name = "qc_ce_sbert"
39 | iteration = "2020_03_01"
40 | tokenizer_type = ${tokenizers.type}
41 |
42 | model {
43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
44 | encoder {
45 | hidden_size = 32
46 | vocab_size = ${common_vocab_size}
47 | intermediate_size = 256
48 | num_hidden_layers = 2
49 | num_attention_heads = 8
50 | }
51 | }
52 | lr = 0.0001
53 |
54 | loss {
55 | type = "lambda_loss"
56 | }
57 |
58 | batch_size {
59 | train = 400
60 | val = 400
61 | test = 400
62 | }
63 |
64 | }
--------------------------------------------------------------------------------
/conf/qc_ce_sbert_2020_03_01.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens=200
26 | use_subtokens=True
27 | query_embeddings="sbert"
28 | }
29 | }
30 |
31 | training {
32 | short_circuit = true
33 |
34 | device = "cuda"
35 | wandb = false
36 | tensorboard = false
37 |
38 | name = "qc_ce_sbert"
39 | iteration = "2020_03_01"
40 | tokenizer_type = ${tokenizers.type}
41 |
42 | model {
43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
44 | encoder {
45 | hidden_size = 32
46 | vocab_size = ${common_vocab_size}
47 | intermediate_size = 256
48 | num_hidden_layers = 2
49 | num_attention_heads = 8
50 | }
51 | }
52 | lr = 0.0001
53 |
54 | loss {
55 | type = "lambda_loss"
56 | }
57 |
58 | batch_size {
59 | train = 400
60 | val = 400
61 | test = 400
62 | }
63 |
64 | }
--------------------------------------------------------------------------------
/conf/qc_ce_subtoken_2020_02_25 copy.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k_subtoken"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | dataset {
13 | common_params {
14 | parallelize = false
15 | do_lowercase = true
16 | special_tokens = ["", "", ""]
17 | use_lang_weights = True
18 | code_max_num_tokens=200
19 | use_subtokens=True
20 | }
21 | }
22 |
23 | training {
24 | short_circuit = false
25 |
26 | device = "cuda"
27 | wandb = true
28 | tensorboard = true
29 |
30 | name = "qc_ce_subtoken"
31 | iteration = "2020_02_25"
32 | tokenizer_type = ${tokenizers.type}
33 |
34 | model {
35 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
36 | encoder {
37 | hidden_size = 32
38 | vocab_size = ${common_vocab_size}
39 | intermediate_size = 256
40 | num_hidden_layers = 2
41 | num_attention_heads = 8
42 | }
43 | }
44 |
45 | loss {
46 | type = "softmax_cross_entropy"
47 | }
48 |
49 | batch_size {
50 | train = 768
51 | val = 768
52 | test = 768
53 | }
54 |
55 | }
--------------------------------------------------------------------------------
/conf/qc_ce_subtoken_2020_02_25.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k_subtoken"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | dataset {
13 | common_params {
14 | parallelize = false
15 | do_lowercase = true
16 | special_tokens = ["", "", ""]
17 | use_lang_weights = True
18 | code_max_num_tokens=200
19 | use_subtokens=True
20 | }
21 | }
22 |
23 | training {
24 | short_circuit = false
25 |
26 | device = "cuda"
27 | wandb = true
28 | tensorboard = true
29 |
30 | name = "qc_ce_subtoken"
31 | iteration = "2020_02_25"
32 | tokenizer_type = ${tokenizers.type}
33 |
34 | model {
35 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
36 | encoder {
37 | hidden_size = 32
38 | vocab_size = ${common_vocab_size}
39 | intermediate_size = 256
40 | num_hidden_layers = 2
41 | num_attention_heads = 8
42 | }
43 | }
44 |
45 | loss {
46 | type = "softmax_cross_entropy"
47 | }
48 |
49 | batch_size {
50 | train = 768
51 | val = 768
52 | test = 768
53 | }
54 |
55 | }
--------------------------------------------------------------------------------
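use_subtokens = True in these qc_ce_subtoken runs (together with mark_subtoken_end in default.conf) refers to splitting identifiers into sub-tokens before BPE. A hedged sketch of the usual camelCase / snake_case splitting (this is the common CodeSearchNet-style convention, not code lifted from the repo):

    # Hedged sketch of identifier sub-tokenization: split snake_case and camelCase,
    # then lowercase, e.g. "readCSVFile" -> ["read", "csv", "file"].
    # Requires Python 3.7+ (re.split on a zero-width pattern).
    import re

    _CAMEL = re.compile(r"(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])")

    def split_subtokens(token: str) -> list:
        parts = []
        for chunk in token.split("_"):
            if chunk:
                parts.extend(p for p in _CAMEL.split(chunk) if p)
        return [p.lower() for p in parts] or [token]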
/conf/qc_ce_subtoken_larger_2020_02_25.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k_subtoken"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | dataset {
13 | common_params {
14 | parallelize = false
15 | do_lowercase = true
16 | special_tokens = ["", "", ""]
17 | use_lang_weights = True
18 | code_max_num_tokens=200
19 | use_subtokens=True
20 | }
21 | }
22 |
23 | training {
24 | short_circuit = false
25 |
26 | device = "cuda"
27 | wandb = true
28 | tensorboard = true
29 |
30 | name = "qc_ce_subtoken_larger"
31 | iteration = "2020_02_26"
32 | tokenizer_type = ${tokenizers.type}
33 |
34 | model {
35 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
36 | encoder {
37 | hidden_size = 64
38 | vocab_size = ${common_vocab_size}
39 | intermediate_size = 512
40 | num_hidden_layers = 4
41 | num_attention_heads = 8
42 | }
43 | }
44 |
45 | loss {
46 | type = "softmax_cross_entropy"
47 | }
48 |
49 | batch_size {
50 | train = 350
51 | val = 350
52 | test = 350
53 | }
54 |
55 | }
--------------------------------------------------------------------------------
/conf/qc_ce_subtoken_larger_2020_02_26 copy.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k_subtoken"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | dataset {
13 | common_params {
14 | parallelize = false
15 | do_lowercase = true
16 | special_tokens = ["", "", ""]
17 | use_lang_weights = True
18 | code_max_num_tokens=200
19 | use_subtokens=True
20 | }
21 | }
22 |
23 | training {
24 | short_circuit = false
25 |
26 | device = "cuda"
27 | wandb = true
28 | tensorboard = true
29 |
30 | name = "qc_ce_subtoken_larger"
31 | iteration = "2020_02_26"
32 | tokenizer_type = ${tokenizers.type}
33 |
34 | model {
35 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
36 | encoder {
37 | hidden_size = 64
38 | vocab_size = ${common_vocab_size}
39 | intermediate_size = 512
40 | num_hidden_layers = 4
41 | num_attention_heads = 8
42 | }
43 | }
44 |
45 | loss {
46 | type = "softmax_cross_entropy"
47 | }
48 |
49 | batch_size {
50 | train = 350
51 | val = 350
52 | test = 350
53 | }
54 |
55 | }
--------------------------------------------------------------------------------
/conf/qc_ce_subtoken_larger_2020_02_26.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k_subtoken"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | dataset {
13 | common_params {
14 | parallelize = false
15 | do_lowercase = true
16 | special_tokens = ["", "", ""]
17 | use_lang_weights = True
18 | code_max_num_tokens=200
19 | use_subtokens=True
20 | }
21 | }
22 |
23 | training {
24 | short_circuit = false
25 |
26 | device = "cuda"
27 | wandb = true
28 | tensorboard = true
29 |
30 | name = "qc_ce_subtoken_larger"
31 | iteration = "2020_02_26"
32 | tokenizer_type = ${tokenizers.type}
33 |
34 | model {
35 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
36 | encoder {
37 | hidden_size = 64
38 | vocab_size = ${common_vocab_size}
39 | intermediate_size = 512
40 | num_hidden_layers = 4
41 | num_attention_heads = 8
42 | }
43 | }
44 |
45 | loss {
46 | type = "softmax_cross_entropy"
47 | }
48 |
49 | batch_size {
50 | train = 350
51 | val = 350
52 | test = 350
53 | }
54 |
55 | }
--------------------------------------------------------------------------------
/conf/qc_lambda_2020_02_20_12_30 copy.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | dataset {
13 | common_params {
14 | parallelize = false
15 | do_lowercase = true
16 | special_tokens = ["", "", ""]
17 | use_lang_weights = True
18 | }
19 | }
20 |
21 | training {
22 | short_circuit = false
23 |
24 | device = "cuda"
25 | wandb = true
26 | tensorboard = true
27 |
28 | name = "qc_lambda"
29 | iteration = "2020_02_20_12_30"
30 | tokenizer_type = ${tokenizers.type}
31 |
32 | model {
33 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
34 | encoder {
35 | hidden_size = 128
36 | vocab_size = ${common_vocab_size}
37 | intermediate_size = 512
38 | num_hidden_layers = 6
39 | num_attention_heads = 8
40 | }
41 | }
42 |
43 | loss {
44 | type = "lambda_loss"
45 | }
46 |
47 | batch_size {
48 | train = 220
49 | val = 220
50 | test = 220
51 | # train = 8
52 | # val = 8
53 | # test = 8
54 | }
55 |
56 | }
--------------------------------------------------------------------------------
/conf/qc_lambda_2020_02_20_12_30.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | dataset {
13 | common_params {
14 | parallelize = false
15 | do_lowercase = true
16 | special_tokens = ["", "", ""]
17 | use_lang_weights = True
18 | }
19 | }
20 |
21 | training {
22 | short_circuit = false
23 |
24 | device = "cuda"
25 | wandb = true
26 | tensorboard = true
27 |
28 | name = "qc_lambda"
29 | iteration = "2020_02_20_12_30"
30 | tokenizer_type = ${tokenizers.type}
31 |
32 | model {
33 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
34 | encoder {
35 | hidden_size = 128
36 | vocab_size = ${common_vocab_size}
37 | intermediate_size = 512
38 | num_hidden_layers = 6
39 | num_attention_heads = 8
40 | }
41 | }
42 |
43 | loss {
44 | type = "lambda_loss"
45 | }
46 |
47 | batch_size {
48 | train = 220
49 | val = 220
50 | test = 220
51 | # train = 8
52 | # val = 8
53 | # test = 8
54 | }
55 |
56 | }
--------------------------------------------------------------------------------
/conf/qc_sbert_lambda_2020_03_02.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # (5 lang + 1 query) * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens=200
26 | use_subtokens=True
27 | query_embeddings="sbert"
28 | }
29 | }
30 |
31 | training {
32 | short_circuit = false
33 |
34 | device = "cuda"
35 | wandb = true
36 | tensorboard = true
37 |
38 | name = "qc_sbert_lambda"
39 | iteration = "2020_03_02"
40 | tokenizer_type = ${tokenizers.type}
41 |
42 | model {
43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
44 | encoder {
45 | hidden_size = 32
46 | vocab_size = ${common_vocab_size}
47 | intermediate_size = 128
48 | num_hidden_layers = 2
49 | num_attention_heads = 8
50 | }
51 | }
52 | lr = 0.0001
53 |
54 | loss {
55 | type = "approx_ndcg_loss"
56 | }
57 |
58 | batch_size {
59 | train = 400
60 | val = 400
61 | test = 400
62 | }
63 |
64 | }
--------------------------------------------------------------------------------
/conf/qc_sbert_lambda_2020_03_04 copy.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens=200
26 | use_subtokens=True
27 | query_embeddings="sbert"
28 | fraction_using_func_name=0.0
29 | }
30 | }
31 |
32 | training {
33 | short_circuit = false
34 |
35 | device = "cuda"
36 | wandb = true
37 | tensorboard = false
38 |
39 | name = "qc_sbert_lambda"
40 | iteration = "2020_03_04"
41 | tokenizer_type = ${tokenizers.type}
42 |
43 | model {
44 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
45 | encoder {
46 | hidden_size = 768
47 | vocab_size = ${common_vocab_size}
48 | intermediate_size = 2048
49 | num_hidden_layers = 3
50 | num_attention_heads = 8
51 | }
52 | }
53 | lr = 0.000001
54 |
55 | loss {
56 | type = "approx_ndcg_loss"
57 | }
58 |
59 | batch_size {
60 | train = 100
61 | val = 100
62 | test = 100
63 | }
64 |
65 | }
--------------------------------------------------------------------------------
/conf/qc_sbert_lambda_2020_03_04.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens=200
26 | use_subtokens=True
27 | query_embeddings="sbert"
28 | fraction_using_func_name=0.0
29 | }
30 | }
31 |
32 | training {
33 | short_circuit = false
34 |
35 | device = "cuda"
36 | wandb = true
37 | tensorboard = false
38 |
39 | name = "qc_sbert_lambda"
40 | iteration = "2020_03_04"
41 | tokenizer_type = ${tokenizers.type}
42 |
43 | model {
44 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
45 | encoder {
46 | hidden_size = 768
47 | vocab_size = ${common_vocab_size}
48 | intermediate_size = 2048
49 | num_hidden_layers = 3
50 | num_attention_heads = 8
51 | }
52 | }
53 | lr = 0.000001
54 |
55 | loss {
56 | type = "approx_ndcg_loss"
57 | }
58 |
59 | batch_size {
60 | train = 100
61 | val = 100
62 | test = 100
63 | }
64 |
65 | }
--------------------------------------------------------------------------------
/conf/qc_sbert_lambda_2020_03_05.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens=200
26 | use_subtokens=True
27 | query_embeddings="sbert"
28 | fraction_using_func_name=0.0
29 | }
30 | }
31 |
32 | training {
33 | short_circuit = false
34 |
35 | device = "cuda"
36 | wandb = true
37 | tensorboard = false
38 |
39 | name = "qc_sbert_lambda"
40 | iteration = "2020_03_04"
41 | tokenizer_type = ${tokenizers.type}
42 |
43 | model {
44 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
45 | encoder {
46 | hidden_size = 768
47 | vocab_size = ${common_vocab_size}
48 | intermediate_size = 2048
49 | num_hidden_layers = 3
50 | num_attention_heads = 8
51 | }
52 | }
53 | lr = 0.000001
54 |
55 | loss {
56 | type = "approx_ndcg_loss"
57 | }
58 |
59 | batch_size {
60 | train = 100
61 | val = 100
62 | test = 100
63 | }
64 |
65 | }
--------------------------------------------------------------------------------
/conf/qc_sbert_lambda_2020_03_07 copy.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens=200
26 | use_subtokens=True
27 | #query_embeddings="sbert"
28 | fraction_using_func_name=0.1
29 | }
30 | }
31 |
32 | training {
33 | short_circuit = true
34 |
35 | device = "cuda"
36 | wandb = false
37 | tensorboard = false
38 |
39 | name = "qc_sbert_lambda"
40 | iteration = "2020_03_07"
41 | tokenizer_type = ${tokenizers.type}
42 |
43 | model {
44 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
45 | encoder {
46 | hidden_size = 32
47 | vocab_size = ${common_vocab_size}
48 | intermediate_size = 128
49 | num_hidden_layers = 3
50 | num_attention_heads = 8
51 | }
52 | }
53 | lr = 0.001
54 |
55 | loss {
56 | type = "lambda_loss"
57 | }
58 |
59 | batch_size {
60 | #train = 400
61 | #val = 400
62 | #test = 400
63 | train = 5
64 | val = 5
65 | test = 5
66 | }
67 |
68 | }
--------------------------------------------------------------------------------
/conf/qc_sbert_lambda_2020_03_07.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | embeddings {
13 | sbert {
14 | model="bert-base-nli-mean-tokens"
15 | pickle_path="./pickles"
16 | }
17 | }
18 |
19 | dataset {
20 | common_params {
21 | parallelize = false
22 | do_lowercase = true
23 | special_tokens = ["", "", ""]
24 | use_lang_weights = True
25 | code_max_num_tokens=200
26 | use_subtokens=True
27 | #query_embeddings="sbert"
28 | fraction_using_func_name=0.1
29 | }
30 | }
31 |
32 | training {
33 | short_circuit = true
34 |
35 | device = "cuda"
36 | wandb = false
37 | tensorboard = false
38 |
39 | name = "qc_sbert_lambda"
40 | iteration = "2020_03_07"
41 | tokenizer_type = ${tokenizers.type}
42 |
43 | model {
44 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
45 | encoder {
46 | hidden_size = 32
47 | vocab_size = ${common_vocab_size}
48 | intermediate_size = 128
49 | num_hidden_layers = 3
50 | num_attention_heads = 8
51 | }
52 | }
53 | lr = 0.001
54 |
55 | loss {
56 | type = "lambda_loss"
57 | }
58 |
59 | batch_size {
60 | #train = 400
61 | #val = 400
62 | #test = 400
63 | train = 5
64 | val = 5
65 | test = 5
66 | }
67 |
68 | }
--------------------------------------------------------------------------------
/conf/query_code_siamese_2020_02_15_14_00 copy.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000
4 |
5 | tokenizers {
6 | build_path = "./build_tokenizers/with_lang_query_code_siamese"
7 | token_files = "./build_tokenizers/token_files_query_code_siamese"
8 | }
9 |
10 | dataset {
11 | common_params {
12 | parallelize = false
13 | do_lowercase = true
14 | special_tokens = ["", "", ""]
15 | use_lang_weights = True
16 | }
17 | }
18 |
19 | training {
20 | short_circuit = false
21 |
22 | device = "cuda"
23 | wandb = true
24 | tensorboard = true
25 |
26 | name = "query_code_siamese"
27 | iteration = "2020_02_15_14_00"
28 | tokenizer_type = "query_code_siamese"
29 |
30 | model {
31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
32 | output_size = 72
33 | encoder {
34 | hidden_size = ${training.model.output_size}
35 | vocab_size = ${common_vocab_size}
36 | intermediate_size = 256
37 | num_hidden_layers = 12
38 | num_attention_heads = 12
39 | }
40 | }
41 |
42 | batch_size {
43 | train = 100
44 | val = 100
45 | test = 100
46 | }
47 |
48 | }
--------------------------------------------------------------------------------
/conf/query_code_siamese_2020_02_15_14_00.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000
4 |
5 | tokenizers {
6 | build_path = "./build_tokenizers/with_lang_query_code_siamese"
7 | token_files = "./build_tokenizers/token_files_query_code_siamese"
8 | }
9 |
10 | dataset {
11 | common_params {
12 | parallelize = false
13 | do_lowercase = true
14 | special_tokens = ["", "", ""]
15 | use_lang_weights = True
16 | }
17 | }
18 |
19 | training {
20 | short_circuit = false
21 |
22 | device = "cuda"
23 | wandb = true
24 | tensorboard = true
25 |
26 | name = "query_code_siamese"
27 | iteration = "2020_02_15_14_00"
28 | tokenizer_type = "query_code_siamese"
29 |
30 | model {
31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
32 | output_size = 72
33 | encoder {
34 | hidden_size = ${training.model.output_size}
35 | vocab_size = ${common_vocab_size}
36 | intermediate_size = 256
37 | num_hidden_layers = 12
38 | num_attention_heads = 12
39 | }
40 | }
41 |
42 | batch_size {
43 | train = 100
44 | val = 100
45 | test = 100
46 | }
47 |
48 | }
--------------------------------------------------------------------------------
/conf/query_code_siamese_2020_02_17_21_30 copy.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000
4 |
5 | tokenizers {
6 | build_path = "./build_tokenizers/with_lang_query_code_siamese"
7 | token_files = "./build_tokenizers/token_files_query_code_siamese"
8 | }
9 |
10 | dataset {
11 | common_params {
12 | parallelize = false
13 | do_lowercase = true
14 | special_tokens = ["", "", ""]
15 | use_lang_weights = True
16 | }
17 | }
18 |
19 | training {
20 | short_circuit = false
21 |
22 | device = "cuda"
23 | wandb = true
24 | tensorboard = true
25 |
26 | name = "query_code_siamese"
27 | iteration = "2020_02_17_21_30"
28 | tokenizer_type = "query_code_siamese"
29 |
30 | model {
31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
32 | output_size = 64
33 | encoder {
34 | hidden_size = ${training.model.output_size}
35 | vocab_size = ${common_vocab_size}
36 | intermediate_size = 256
37 | num_hidden_layers = 6
38 | num_attention_heads = 8
39 | }
40 | }
41 |
42 | batch_size {
43 | train = 290
44 | val = 290
45 | test = 290
46 | }
47 |
48 | }
--------------------------------------------------------------------------------
/conf/query_code_siamese_2020_02_17_21_30.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000
4 |
5 | tokenizers {
6 | build_path = "./build_tokenizers/with_lang_query_code_siamese"
7 | token_files = "./build_tokenizers/token_files_query_code_siamese"
8 | }
9 |
10 | dataset {
11 | common_params {
12 | parallelize = false
13 | do_lowercase = true
14 | special_tokens = ["", "", ""]
15 | use_lang_weights = True
16 | }
17 | }
18 |
19 | training {
20 | short_circuit = false
21 |
22 | device = "cuda"
23 | wandb = true
24 | tensorboard = true
25 |
26 | name = "query_code_siamese"
27 | iteration = "2020_02_17_21_30"
28 | tokenizer_type = "query_code_siamese"
29 |
30 | model {
31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
32 | output_size = 64
33 | encoder {
34 | hidden_size = ${training.model.output_size}
35 | vocab_size = ${common_vocab_size}
36 | intermediate_size = 256
37 | num_hidden_layers = 6
38 | num_attention_heads = 8
39 | }
40 | }
41 |
42 | batch_size {
43 | train = 290
44 | val = 290
45 | test = 290
46 | }
47 |
48 | }
--------------------------------------------------------------------------------
/conf/query_code_siamese_2020_02_18_13_00.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000
4 |
5 | tokenizers {
6 | build_path = "./build_tokenizers/with_lang_query_code_siamese"
7 | token_files = "./build_tokenizers/token_files_query_code_siamese"
8 | }
9 |
10 | dataset {
11 | common_params {
12 | parallelize = false
13 | do_lowercase = true
14 | special_tokens = ["", "", ""]
15 | use_lang_weights = True
16 | }
17 | }
18 |
19 | training {
20 | short_circuit = false
21 |
22 | device = "cuda"
23 | wandb = true
24 | tensorboard = true
25 |
26 | name = "query_code_siamese"
27 | iteration = "2020_02_17_21_30"
28 | tokenizer_type = "query_code_siamese"
29 |
30 | model {
31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
32 | output_size = 64
33 | encoder {
34 | hidden_size = ${training.model.output_size}
35 | vocab_size = ${common_vocab_size}
36 | intermediate_size = 256
37 | num_hidden_layers = 6
38 | num_attention_heads = 8
39 | }
40 | }
41 |
42 | batch_size {
43 | train = 290
44 | val = 290
45 | test = 290
46 | }
47 |
48 | }
--------------------------------------------------------------------------------
/conf/query_code_siamese_2020_02_19_13_00 copy.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000
4 |
5 | tokenizers {
6 | build_path = "./build_tokenizers/with_lang_query_code_siamese"
7 | token_files = "./build_tokenizers/token_files_query_code_siamese"
8 | }
9 |
10 | dataset {
11 | common_params {
12 | parallelize = false
13 | do_lowercase = true
14 | special_tokens = ["", "", ""]
15 | use_lang_weights = True
16 | }
17 | }
18 |
19 | training {
20 | short_circuit = false
21 |
22 | device = "cuda"
23 | wandb = true
24 | tensorboard = true
25 |
26 | name = "query_code_siamese"
27 | iteration = "2020_02_19_13_00"
28 | tokenizer_type = "query_code_siamese"
29 |
30 | model {
31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
32 | output_size = 64
33 | encoder {
34 | hidden_size = ${training.model.output_size}
35 | vocab_size = ${common_vocab_size}
36 | intermediate_size = 256
37 | num_hidden_layers = 6
38 | num_attention_heads = 8
39 | }
40 | }
41 |
42 | loss {
43 | type = "lambda_loss"
44 | }
45 |
46 | batch_size {
47 | train = 256
48 | val = 256
49 | test = 256
50 | # train = 8
51 | # val = 8
52 | # test = 8
53 | }
54 |
55 | }
--------------------------------------------------------------------------------
/conf/query_code_siamese_2020_02_19_13_00.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000
4 |
5 | tokenizers {
6 | build_path = "./build_tokenizers/with_lang_query_code_siamese"
7 | token_files = "./build_tokenizers/token_files_query_code_siamese"
8 | }
9 |
10 | dataset {
11 | common_params {
12 | parallelize = false
13 | do_lowercase = true
14 | special_tokens = ["", "", ""]
15 | use_lang_weights = True
16 | }
17 | }
18 |
19 | training {
20 | short_circuit = false
21 |
22 | device = "cuda"
23 | wandb = true
24 | tensorboard = true
25 |
26 | name = "query_code_siamese"
27 | iteration = "2020_02_19_13_00"
28 | tokenizer_type = "query_code_siamese"
29 |
30 | model {
31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
32 | output_size = 64
33 | encoder {
34 | hidden_size = ${training.model.output_size}
35 | vocab_size = ${common_vocab_size}
36 | intermediate_size = 256
37 | num_hidden_layers = 6
38 | num_attention_heads = 8
39 | }
40 | }
41 |
42 | loss {
43 | type = "lambda_loss"
44 | }
45 |
46 | batch_size {
47 | train = 256
48 | val = 256
49 | test = 256
50 | # train = 8
51 | # val = 8
52 | # test = 8
53 | }
54 |
55 | }
--------------------------------------------------------------------------------
/conf/query_code_siamese_albert_2020_02_18_08_30 copy.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000
4 |
5 | tokenizers {
6 | build_path = "./build_tokenizers/with_lang_query_code_siamese"
7 | token_files = "./build_tokenizers/token_files_query_code_siamese"
8 | }
9 |
10 | dataset {
11 | common_params {
12 | parallelize = false
13 | do_lowercase = true
14 | special_tokens = ["", "", ""]
15 | use_lang_weights = True
16 | }
17 | }
18 |
19 | training {
20 | short_circuit = false
21 |
22 | device = "cuda"
23 | wandb = true
24 | tensorboard = true
25 |
26 | name = "query_code_siamese_albert"
27 | iteration = "2020_02_18_08_30"
28 | tokenizer_type = "query_code_siamese"
29 |
30 | model {
31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
32 | output_size = 128
33 | encoder {
34 | type = "albert"
35 | embedding_size = ${training.model.output_size}
36 | hidden_size = 512
37 | vocab_size = ${common_vocab_size}
38 | intermediate_size = 768
39 | num_hidden_layers = 8
40 | num_attention_heads = 8
41 | }
42 | }
43 |
44 | batch_size {
45 | train = 128
46 | val = 128
47 | test = 128
48 | }
49 |
50 | lr = 0.00001
51 |
52 | }
--------------------------------------------------------------------------------
/conf/query_code_siamese_albert_2020_02_18_08_30.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000
4 |
5 | tokenizers {
6 | build_path = "./build_tokenizers/with_lang_query_code_siamese"
7 | token_files = "./build_tokenizers/token_files_query_code_siamese"
8 | }
9 |
10 | dataset {
11 | common_params {
12 | parallelize = false
13 | do_lowercase = true
14 | special_tokens = ["", "", ""]
15 | use_lang_weights = True
16 | }
17 | }
18 |
19 | training {
20 | short_circuit = false
21 |
22 | device = "cuda"
23 | wandb = true
24 | tensorboard = true
25 |
26 | name = "query_code_siamese_albert"
27 | iteration = "2020_02_18_08_30"
28 | tokenizer_type = "query_code_siamese"
29 |
30 | model {
31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
32 | output_size = 128
33 | encoder {
34 | type = "albert"
35 | embedding_size = ${training.model.output_size}
36 | hidden_size = 512
37 | vocab_size = ${common_vocab_size}
38 | intermediate_size = 768
39 | num_hidden_layers = 8
40 | num_attention_heads = 8
41 | }
42 | }
43 |
44 | batch_size {
45 | train = 128
46 | val = 128
47 | test = 128
48 | }
49 |
50 | lr = 0.00001
51 |
52 | }
--------------------------------------------------------------------------------
/conf/query_code_siamese_albert_2020_02_18_14_00.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000
4 |
5 | tokenizers {
6 | build_path = "./build_tokenizers/with_lang_query_code_siamese"
7 | token_files = "./build_tokenizers/token_files_query_code_siamese"
8 | }
9 |
10 | dataset {
11 | common_params {
12 | parallelize = false
13 | do_lowercase = true
14 | special_tokens = ["", "", ""]
15 | use_lang_weights = True
16 | }
17 | }
18 |
19 | training {
20 | short_circuit = false
21 |
22 | device = "cuda"
23 | wandb = true
24 | tensorboard = true
25 |
26 | name = "query_code_siamese_albert"
27 | iteration = "2020_02_18_14_00"
28 | tokenizer_type = "query_code_siamese"
29 |
30 | model {
31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
32 | # output_size = 128
33 | encoder {
34 | type = "albert"
35 | embedding_size = 64
36 | hidden_size = 256
37 | vocab_size = ${common_vocab_size}
38 | intermediate_size = 512
39 | num_hidden_layers = 6
40 | num_attention_heads = 8
41 | }
42 | }
43 |
44 | batch_size {
45 | train = 240
46 | val = 240
47 | test = 240
48 | }
49 |
50 | lr = 0.00001
51 |
52 | }
--------------------------------------------------------------------------------
/guide.md:
--------------------------------------------------------------------------------
1 | ## Introduction
2 |
3 | This project is a technical sandbox I worked on in 2020 and used to explore a few technical topics on a personal basis. It is a near-complete rewrite of a Microsoft project for the CodeSearchNet challenge (a multi-language code search engine driven by natural-language queries, a topic that github/microsoft has since pushed much further with Copilot and its advanced code-generation capabilities).
4 | I brought the project up to date for this presentation because the Python APIs have evolved a lot since 2020 and the code was no longer compatible with current versions. Note, however, that the code will not run if you launch it as-is: it needs tokenizers that have to be built manually, which requires a fair amount of compute time and native libraries (for the language ASTs).
5 |
6 | ## Notable technical points
7 |
8 | My goal here is not to discuss the ML/AI substance (my results were not very interesting) but rather the code itself, and more specifically the following points:
9 |
10 | - A complete Python/ML project with dependency management (poetry), isolation in a virtualenv, and VSCode integration (which was becoming the standard dev environment in 2020) using extensions: mypy, linters, license, tests (even if minimal), etc.
11 |
12 |   - [pyproject.toml](./pyproject.toml)
13 |   - and, for reference, the [license](./LICENSE)
14 |
15 | - Use of the generic HOCON configuration format, which handles complex configurations with includes, variables, references, etc. (see the pyhocon sketch after this list)
16 |
17 |   - [Generic configuration](./conf/default.conf)
18 |   - [Specific configuration](./conf/query_code_siamese_2020_02_15_14_00.conf)
19 |
20 | - Exploration of the limits of strong typing in Python, with abstract generic types (to try to simulate the equivalent of the "typeclasses" found in functional languages such as Haskell/Scala) and "newtypes" to "specialize" simple types (see the typing sketch after this list)
21 |
22 |   - [Abstract types](./codenets/recordable.py#L22)
23 |   - [Generic types](./codenets/codesearchnet/training_ctx.py#L205-L220)
24 |   - [Newtypes](./codenets/codesearchnet/training_ctx.py#L49-L68)
25 |
26 | - Evaluation of static type checking with the Mypy type checker integrated into VS Code.
27 |
28 |   - [mypy.ini](./mypy.ini)
29 |
30 | - A study of generic save/restore of a complete AI project context (configuration + commit + model + tokenizer + dataset + etc.) to a single storage point (for instance a cloud such as AWS, or an ML-oriented server such as MLFlow) (see the save/restore sketch after this list).
31 |
32 |   - [Generic Recordable](./codenets/recordable.py#L22)
33 |   - [Recordable specialized for HOCON configuration](./codenets/recordable.py#L113)
34 |   - [Recordable specialized for a TorchModule model/tokenizer](./codenets/recordable.py#L248)
35 |   - [Generic training context](./codenets/codesearchnet/training_ctx.py#L245)
36 |   - [Training context specialized for a specific model](./codenets/codesearchnet/query_code_siamese/training_ctx.py#L40)
37 |
38 | - Evaluation of the complexity of rewriting TensorFlow code to PyTorch and the huggingface libraries.
39 |
40 | - Integration with WandB/TensorBoard for tracking training runs (see the tracking sketch after this list).
41 |
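To make the HOCON point above concrete, here is a minimal pyhocon sketch. It is illustrative only: the keys mimic the files under `conf/`, but the snippet itself is not taken from the project.

```python
# Minimal pyhocon sketch: variables, references and string concatenation
# (keys mimic conf/*.conf but the snippet itself is illustrative).
from pyhocon import ConfigFactory

conf = ConfigFactory.parse_string(
    """
    common_vocab_size = 30000
    tokenizers {
      type = "qc_30k"
      build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
    }
    training {
      model { encoder { vocab_size = ${common_vocab_size} } }
    }
    """
)

# Substitutions are resolved at parse time.
print(conf.get_string("tokenizers.build_path"))           # ./build_tokenizers/with_lang_qc_30k
print(conf.get_int("training.model.encoder.vocab_size"))  # 30000
```

File-level `include "./default.conf"` works the same way with `ConfigFactory.parse_file`, which is how each experiment config above only overrides the keys it needs.
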
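The abstract-generic / newtype exploration can be summarized by a small self-contained sketch. The names (`Persistable`, `Vocabulary`, `VocabSize`) are hypothetical and only illustrate the pattern, not the project's real classes:

```python
# Illustrative sketch of the typing patterns discussed above (hypothetical names).
from abc import ABC, abstractmethod
from pathlib import Path
from typing import NewType, Type, TypeVar

# NewTypes "specialize" a plain int into distinct, non-interchangeable types for mypy.
VocabSize = NewType("VocabSize", int)
BatchSize = NewType("BatchSize", int)  # a second, incompatible specialization

T = TypeVar("T", bound="Persistable")

class Persistable(ABC):
    """Typeclass-like contract: a value can save itself, its class can reload it."""

    @abstractmethod
    def save(self, output_dir: Path) -> bool:
        ...

    @classmethod
    @abstractmethod
    def load(cls: Type[T], restore_dir: Path) -> T:
        ...

class Vocabulary(Persistable):
    def __init__(self, size: VocabSize) -> None:
        self.size = size

    def save(self, output_dir: Path) -> bool:
        output_dir.mkdir(parents=True, exist_ok=True)
        (output_dir / "vocab.txt").write_text(str(self.size))
        return True

    @classmethod
    def load(cls, restore_dir: Path) -> "Vocabulary":
        return cls(VocabSize(int((restore_dir / "vocab.txt").read_text())))

# Thanks to the bound TypeVar on `cls`, mypy infers `Vocabulary` (not `Persistable`) here:
# vocab = Vocabulary.load(Path("./checkpoints/vocab"))
```
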
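The single-storage-point idea can be sketched roughly as below (hypothetical names; the project's real `Recordable` classes are linked above): each component saves itself under its own subdirectory, so one call archives the whole experiment state.

```python
# Simplified sketch of single-point save of a whole training context (hypothetical names).
import json
from pathlib import Path
from typing import Dict, Protocol

class Saveable(Protocol):
    def save(self, output_dir: Path) -> bool: ...

class JsonRecordable:
    """Example component: a plain dict persisted as JSON."""

    def __init__(self, data: Dict[str, object]) -> None:
        self.data = data

    def save(self, output_dir: Path) -> bool:
        output_dir.mkdir(parents=True, exist_ok=True)
        (output_dir / "data.json").write_text(json.dumps(self.data))
        return True

def save_context(components: Dict[str, Saveable], root: Path) -> bool:
    """Save every component (config, model, tokenizer, ...) under its own subdirectory."""
    return all(c.save(root / name) for name, c in components.items())

# Usage: one call archives the full experiment state into a single directory tree.
ok = save_context(
    {"config": JsonRecordable({"lr": 1e-4}), "metadata": JsonRecordable({"epoch": 3})},
    Path("./checkpoints/run_2020_02_15"),
)
```
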
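Training-run tracking, as mentioned above, comes down to a few calls. A minimal sketch with `wandb` and `tensorboardX` (both in the dependencies), using a dummy loss series and wandb's offline mode so it runs without an account:

```python
# Minimal tracking sketch (dummy metrics; offline mode avoids needing a wandb account).
import wandb
from tensorboardX import SummaryWriter

wandb.init(project="codenets", name="qc_lambda_2020_02_20_12_30", mode="offline")
writer = SummaryWriter(log_dir="./runs/qc_lambda_2020_02_20_12_30")

for step, loss in enumerate([0.9, 0.7, 0.55]):  # stand-in for the real training loop
    wandb.log({"train/loss": loss}, step=step)
    writer.add_scalar("train/loss", loss, global_step=step)

writer.close()
wandb.finish()
```
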
42 | and, more anecdotally:
43 |
44 | - Study the results achievable with small transformers on this kind of challenge;
45 |   the results were very disappointing, cf.
46 |
47 |   - [README.md](./README.md)
48 |
49 | - Use of Huggingface's native Rust tokenizers through their Python bindings (which had just been released in 2020) (see the tokenizer sketch after this list):
50 |
51 |   - [tokenizer_recs.py](./codenets/codesearchnet/huggingface/tokenizer_recs.py#L102)
52 |
53 | - Use of language AST parsers (tree-sitter) to improve the performance of transformer-based models (I could not push these experiments very far for lack of GPU resources) (see the AST sketch after this list)
54 |   - [ast_build.py](./codenets/codesearchnet/ast_build.py#L189)
55 |
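As an illustration of the Rust `tokenizers` bindings mentioned above (the corpus path and sizes are placeholders, not the project's actual build script):

```python
# Minimal sketch of the huggingface `tokenizers` Rust bindings (placeholder corpus path).
from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer(lowercase=True)
tokenizer.train(
    files=["./build_tokenizers/token_files_qc_30k/corpus.txt"],  # placeholder
    vocab_size=30000,
    min_frequency=2,
)

encoding = tokenizer.encode("def add(a, b): return a + b")
print(encoding.tokens)  # sub-word tokens produced by the Rust tokenizer
print(encoding.ids)     # integer ids fed to the model
```
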
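And a sketch of AST parsing with py-tree-sitter, following the 0.20 API pinned in `pyproject.toml` (the grammar path is a placeholder and the shared-library build is a one-off step):

```python
# Sketch of AST parsing with py-tree-sitter ~0.20 (placeholder grammar path).
from tree_sitter import Language, Parser

# One-off: compile vendored grammar repositories into a shared library.
Language.build_library("build/langs.so", ["vendor/tree-sitter-python"])

PY_LANGUAGE = Language("build/langs.so", "python")
parser = Parser()
parser.set_language(PY_LANGUAGE)

tree = parser.parse(bytes("def add(a, b):\n    return a + b\n", "utf8"))

def walk(node, depth=0):
    """Print node types: roughly the structural signal used alongside the tokens."""
    print("  " * depth + node.type)
    for child in node.children:
        walk(child, depth + 1)

walk(tree.root_node)
```
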
56 | ## Conclusion
57 |
58 | In the end, I would highlight the following points:
59 |
60 | - HOCON configurations are, in my opinion, worthwhile for any software project in any language: they handle complex configurations with variables/references while the format itself stays simple.
61 | - Fully generic saving of an ML project, from the code down to the model and dataset, seems to me an important point for backing up and versioning ML projects, since it bundles all the resources together: code, configuration, model, tokenizer, dataset, etc.
62 | - Strong typing in Python has become a genuinely useful tool that improves the overall robustness of the code and reduces the amount of unit tests needed. Mypy looks like a robust solution for checking types, even though many external dependencies that ship no type information have to be filtered out. However, the heavy use of type unions in Python libraries can lead to fairly indigestible type signatures.
63 | - Generic and abstract types do work, but they remain fairly tedious in Python and never feel like a native feature of the language (not to mention runtime casts, which can hurt performance). It is better to stay within classic object-oriented patterns and avoid venturing too far off the beaten path.
64 | - NewTypes remain of limited practical use from my point of view (in particular, arithmetic or concatenation operations on these types lose the specialized type, as shown in the short example below).
65 |
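For instance, the NewType limitation mentioned above is easy to reproduce with mypy:

```python
# Arithmetic on a NewType is typed as the base type, so the specialization is lost.
from typing import NewType

NumTokens = NewType("NumTokens", int)

a = NumTokens(3)
b = NumTokens(4)
total = a + b                  # mypy infers plain `int`, not `NumTokens`
# total_bad: NumTokens = a + b # rejected by mypy: "int" is not "NumTokens"
total_ok = NumTokens(a + b)    # the result must be re-wrapped explicitly
```
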
66 | If you have any questions, feel free to contact me.
67 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | """Main of the project."""
2 | # import numpy as np
3 | # import os
4 |
5 |
6 | # def get_os_env():
7 | # print(os.getcwd())
8 | # print(os.uname())
9 |
10 |
11 | # def main():
12 | # # Do some os stuff
13 | # get_os_env()
14 | # # Do some numpy stuff
15 | # A = np.ones(3)*1
16 | # B = np.ones(3)*2
17 | # C = np.ones(3)*3
18 | # res = np.add(A,B,out=B)
19 | # res2 = np.divide(A,2,out=A)
20 | # res3 = np.negative(A,out=A)
21 | # res4 = np.multiply(A,B,out=A)
22 |
23 | # print(res)
24 | # print(f"this is the result 2 {res2}")
25 | # print(np.zeros(shape=(2, 3)))
26 |
27 |
28 | # if __name__ == "__main__":
29 | # main()
30 |
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | python_version = 3.10
3 | ; mypy_path = ./src
4 | namespace_packages = True
5 | ; ignore_missing_imports = True
6 | ; follow_imports = normal
7 | no_deprecation_warning=True
8 |
9 | [mypy-torch.*]
10 | ignore_missing_imports = True
11 |
12 | [mypy-loguru.*]
13 | ignore_missing_imports = True
14 |
15 | [mypy-torch.optim.*]
16 | ignore_missing_imports = True
17 |
18 | [mypy-dpu_utils]
19 | ignore_missing_imports = True
20 |
21 | [mypy-dpu_utils.*]
22 | ignore_missing_imports = True
23 |
24 | [mypy-toolz]
25 | ignore_missing_imports = True
26 |
27 | [mypy-numpy]
28 | ignore_missing_imports = True
29 |
30 | [mypy-tensorflow.*]
31 | ignore_missing_imports = True
32 |
33 | [mypy-pyhocon.*]
34 | ignore_missing_imports = True
35 |
36 | [mypy-transformers.*]
37 | ignore_missing_imports = True
38 |
39 | [mypy-tensorboard.*]
40 | ignore_missing_imports = True
41 |
42 | [mypy-tensorboardX.*]
43 | ignore_missing_imports = True
44 |
45 | [mypy-pathos.*]
46 | ignore_missing_imports = True
47 |
48 | [mypy-docopt]
49 | ignore_missing_imports = True
50 |
51 | [mypy-pandas]
52 | ignore_missing_imports = True
53 |
54 | [mypy-tqdm]
55 | ignore_missing_imports = True
56 |
57 | [mypy-pygments.*]
58 | ignore_missing_imports = True
59 |
60 | [mypy-scipy.*]
61 | ignore_missing_imports = True
62 |
63 | [mypy-annoy]
64 | ignore_missing_imports = True
65 |
66 | [mypy-wandb]
67 | ignore_missing_imports = True
68 |
69 | [mypy-wandb.*]
70 | ignore_missing_imports = True
71 |
72 | [mypy-sklearn.*]
73 | ignore_missing_imports = True
74 |
75 | [mypy-matplotlib.*]
76 | ignore_missing_imports = True
77 |
78 | [mypy-tokenizers.*]
79 | ignore_missing_imports = True
80 |
81 | [mypy-sentence_transformers.*]
82 | ignore_missing_imports = True
83 |
84 | [mypy-tree_sitter.*]
85 | ignore_missing_imports = True
86 |
--------------------------------------------------------------------------------
/pylama.ini:
--------------------------------------------------------------------------------
1 | [pylama]
2 | ;format = pylint
3 | skip = .tox/*,.env/*,.venv/*,.vscode/*
4 | ;linters = mccabe,pep257,pydocstyle,pep8,pycodestyle,pyflakes,pylint,isort,radon,eradicate
5 | linters = mccabe,pydocstyle,pycodestyle,pyflakes
6 | ;ignore = F0401,C0111,E731
7 | ignore = C0413,D212,D211,D203,R0903,C0330,D104,C0111,E1101,W0221,D406,D413,D407,W293,C901,D202,W291,D103,D100,D101,D107,D102,D400,E1102,C0103,C0411,R0913,R0914,R1719,W0212,C0412,R0902,W0102,E501,R0915,C0301,W0703,R1705,R0904,R0912,E203,W0640,R0911,R0201,D205,D415,W292,W503
8 |
9 | [pylama:*/__init__.py]
10 | ignore = W0611,W0401
11 |
12 | [pylama:tests/*.py]
13 | ignore = D104,D100
14 |
15 | [pylama:pycodestyle]
16 | max_line_length = 120
17 |
18 | [pylama:pylint]
19 | max_line_length = 120
20 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["poetry>=0.12"]
3 | build-backend = "poetry.masonry.api"
4 |
5 | [tool.poetry]
6 | name = "codenets"
7 | version = "0.2.0"
8 | description = "code & neural nets."
9 | authors = ["Voitot Pascal"]
10 | readme = "README.md"
11 |
12 | # packages = [
13 | # { include = "src/**/*.py" },
14 | # ]
15 |
16 | [tool.poetry.dependencies]
17 | python = "^3.10"
18 | numpy = "^1.24"
19 | torch = "^2.0.0"
20 | pandas = "^2.0.0"
21 | #tokenizers = "^0.2.1"
22 | transformers = "^4.27.0"
23 | loguru = "^0.6"
24 | docopt = "^0.6"
25 | dpu-utils = "^0.6"
26 | wandb = "^0.14"
27 | pathos = "^0.3"
28 | pyhocon = "^0.3.60"
29 | annoy = "^1.17"
30 | #tables = "^3.6.1"
31 | sentence_transformers = "^2.2"
32 | tree_sitter = "^0.20"
33 | # tree-sitter = { file = "../../tools/py-tree-sitter/tree_sitter-0.1.0_mandubian-cp37-cp37m-linux_x86_64.whl" }
34 | #pyarrow = "*"
35 | fastparquet = "^2023.2"
36 | # apex = "*"
37 |
38 | [tool.poetry.dev-dependencies]
39 | black = "*"
40 | pylama = "*"
41 | pytest = "*"
42 | mypy = "^1.1"
43 | jupyterlab = "*"
44 | matplotlib = "*"
45 | rope = "*"
46 | codecov = "*"
47 | pytest-cov = "*"
48 | pylint = "*"
49 | tensorboard = "*"
50 | tensorboardX = "*"
51 |
52 |
53 | [tool.black]
54 | line-length = 88
55 | exclude = '''
56 | /(
57 | \.git
58 | | \.mypy_cache
59 | | \.tox
60 | | \.venv
61 | | \.pytest_cache
62 | | dist
63 | | build
64 | | docs
65 | )/
66 | '''
67 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | annoy==1.16.3
2 | azure-common==1.1.24
3 | azure-nspkg==3.0.2
4 | azure-storage==0.36.0
5 | boto3==1.12.2
6 | botocore==1.15.2
7 | certifi==2019.11.28
8 | cffi==1.14.0
9 | chardet==3.0.4
10 | click==7.0
11 | colorama==0.4.3; sys_platform == "win32"
12 | configparser==4.0.2
13 | cryptography==2.8
14 | dill==0.3.1.1
15 | docker-pycreds==0.4.0
16 | docopt==0.6.2
17 | docutils==0.15.2
18 | dpu-utils==0.2.8
19 | gitdb2==3.0.2
20 | gitpython==3.0.8
21 | gql==0.2.0
22 | graphql-core==1.1
23 | idna==2.8
24 | jmespath==0.9.4
25 | joblib==0.14.1
26 | loguru==0.3.2
27 | multiprocess==0.70.9
28 | numexpr==2.7.1
29 | numpy==1.18.1
30 | nvidia-ml-py3==7.352.0
31 | pandas==0.25.3
32 | pathos==0.2.5
33 | pathtools==0.1.2
34 | pox==0.2.7
35 | ppft==1.6.6.1
36 | promise==2.3
37 | psutil==5.7.0
38 | pycparser==2.19
39 | pyhocon==0.3.54
40 | pyparsing==2.4.6
41 | python-dateutil==2.8.1
42 | pytz==2019.3
43 | pyyaml==5.3
44 | regex==2020.2.18
45 | requests==2.22.0
46 | s3transfer==0.3.3
47 | sacremoses==0.0.38
48 | sentencepiece==0.1.85
49 | sentry-sdk==0.14.1
50 | setsimilaritysearch==0.1.7
51 | shortuuid==0.5.0
52 | six==1.14.0
53 | smmap2==2.0.5
54 | subprocess32==3.5.4
55 | tables==3.6.1
56 | tokenizers==0.2.1
57 | torch==1.4.0
58 | tqdm==4.43.0
59 | transformers==2.3.0
60 | urllib3==1.25.8
61 | wandb==0.8.27
62 | watchdog==0.10.2
63 | win32-setctime==1.0.1; sys_platform == "win32"
64 |
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/test/__init__.py
--------------------------------------------------------------------------------
/test/conf/default.conf:
--------------------------------------------------------------------------------
1 |
2 | lang_ids {
3 | php = 0
4 | python = 1
5 | ruby = 2
6 | java = 3
7 | go = 4
8 | javascript = 5
9 | }
10 |
11 | common_vocab_size = 10000
12 |
13 | bert {
14 | hidden_size = 128
15 | vocab_size = ${common_vocab_size}
16 | intermediate_size = 512
17 | num_hidden_layers = 3
18 | num_attention_heads = 8
19 | }
20 |
21 | tokenizers {
22 | type = "TOKENIZER_TYPE"
23 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
24 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
25 | }
26 |
27 | dataset {
28 | root_dir = ${HOME}"/workspaces/tools/CodeSearchNet/src"
29 | common_params {
30 | fraction_using_func_name=0.1
31 | min_len_func_name_for_query=12
32 | use_subtokens=False
33 | mark_subtoken_end=False
34 | code_max_num_tokens=200
35 | query_max_num_tokens=30
36 | use_bpe=True
37 | vocab_size=${common_vocab_size}
38 | pct_bpe=0.5
39 | vocab_count_threshold=10
40 | lang_ids = ${lang_ids}
41 | do_lowercase = true
42 | special_tokens = [""]
43 | parallelize = true
44 | use_lang_weights = False
45 | }
46 |
47 | train {
48 | dirs = ${dataset.root_dir}"/data_dirs_train.txt"
49 | params = ${dataset.common_params}
50 | }
51 |
52 | val {
53 | dirs = ${dataset.root_dir}"/data_dirs_valid.txt"
54 | params = ${dataset.common_params}
55 | }
56 |
57 | test {
58 | dirs = ${dataset.root_dir}"/data_dirs_test.txt"
59 | params = ${dataset.common_params}
60 | }
61 |
62 | queries_file = ${dataset.root_dir}"/queries.csv"
63 | }
64 |
65 |
66 | training {
67 | # The name of current experiment (can have several runs)
68 | name = "EXPERIMENT_NAME"
69 | # The unique id of current run
70 | iteration = "UNIQUE_RUN_ID"
71 | # The ID used to identify the pre-built pickled files
72 | # using the tokenizer defined above
73 | tokenizer_type = "TOKENIZER_ID"
74 |
75 | # Set that to true to test your run without slow-loading train dataset
76 | short_circuit = false
77 |
78 | device = "cuda"
79 | # deactivate wandb & tensorboard
80 | wandb = true
81 | tensorboard = true
82 |
83 | model {
84 | # IMPORTANT: the class representing Training Context
85 | training_ctx_class = "codenets.codesearchnet.query_1_code_1.training_ctx.Query1Code1Ctx"
86 | output_size = 64
87 | query_encoder {
88 | hidden_size = ${training.model.output_size}
89 | vocab_size = ${common_vocab_size}
90 | intermediate_size = 512
91 | num_hidden_layers = 3
92 | num_attention_heads = 8
93 | }
94 | code_encoder {
95 | hidden_size = ${training.model.output_size}
96 | vocab_size = ${common_vocab_size}
97 | intermediate_size = 512
98 | num_hidden_layers = 6
99 | num_attention_heads = 8
100 | }
101 | }
102 |
103 | # Training Hyper-Parameters
104 | seed = 0
105 | lr = 0.0001
106 | max_grad_norm = 1.0
107 | min_log_interval = 50
108 | start_epoch = 0
109 | epochs = 10
110 |
111 | batch_size {
112 | train = 256
113 | val = 256
114 | test = 256
115 | }
116 |
117 | loss {
118 | type = "softmax_cross_entropy"
119 | margin = 1.0
120 | }
121 |
122 | # Paths
123 | pickle_path = "./pickles"
124 | output_dir = "./checkpoints"
125 | tensorboard_path = "./runs"
126 |
127 | }
--------------------------------------------------------------------------------
/test/conf/test.conf:
--------------------------------------------------------------------------------
1 | include "./default.conf"
2 |
3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000
4 |
5 |
6 | tokenizers {
7 | type = "qc_30k"
8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type}
9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type}
10 | }
11 |
12 | dataset {
13 | common_params {
14 | parallelize = false
15 | do_lowercase = true
16 | special_tokens = ["", "", ""]
17 | use_lang_weights = True
18 | }
19 | }
20 |
21 | training {
22 | short_circuit = false
23 |
24 | device = "cpu"
25 | wandb = false
26 | tensorboard = false
27 |
28 | name = "test"
29 | iteration = "2020_02_23_01_00"
30 | tokenizer_type = ${tokenizers.type}
31 |
32 | model {
33 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx"
34 | encoder {
35 | hidden_size = 32
36 | vocab_size = ${common_vocab_size}
37 | intermediate_size = 256
38 | num_hidden_layers = 2
39 | num_attention_heads = 8
40 | }
41 | }
42 |
43 | loss {
44 | type = "softmax_cross_entropy"
45 | }
46 |
47 | batch_size {
48 | train = 768
49 | val = 768
50 | test = 768
51 | }
52 |
53 | }
--------------------------------------------------------------------------------
/test/test_recordable.py:
--------------------------------------------------------------------------------
1 |
2 | from pathlib import Path
3 | from typing import cast
4 | from codenets.recordable import DictRecordable
5 | import os
6 | import shutil
7 | import pytest
8 | from pyhocon import ConfigFactory
9 |
10 | from codenets.codesearchnet.training_ctx import CodeSearchTrainingContext
11 | from codenets.codesearchnet.query_code_siamese.training_ctx import QueryCodeSiameseCtx
12 |
13 | test_dir = Path("./tmp-test")
14 | cfg = Path("./test/conf/test.conf")
15 |
16 |
17 | @pytest.fixture(autouse=True)
18 | def run_before_and_after_tests(tmpdir):
19 | """Fixture to execute asserts before and after a test is run"""
20 | # Setup: fill with any logic you want
21 | os.mkdir(test_dir)
22 |
23 | yield # this is where the testing happens
24 |
25 | # Teardown : fill with any logic you want
26 | shutil.rmtree(test_dir)
27 |
28 |
29 | def test_dict_recordable():
30 | d = DictRecordable({
31 | 'toto': 1,
32 | 'tata': "titi",
33 | "tutu": 1.2345
34 | })
35 |
36 | assert d.save(test_dir / "d")
37 | d2 = DictRecordable.load(test_dir / "d")
38 | assert d == d2
39 |
40 |
41 | def test_fullconf_recordable():
42 | training_ctx = CodeSearchTrainingContext.build_context_from_hocon(ConfigFactory.parse_file(cfg))
43 | assert training_ctx.save(test_dir / "f")
44 |
45 | training_ctx_2 = QueryCodeSiameseCtx.load(test_dir / "f")
46 | print("keys", training_ctx.keys(), training_ctx_2.keys())
47 | assert training_ctx.keys() == training_ctx_2.keys()
48 |
49 |
50 |
51 |
52 |
--------------------------------------------------------------------------------
/wandb/settings:
--------------------------------------------------------------------------------
1 | [default]
2 | project = codenets
3 |
4 |
--------------------------------------------------------------------------------