├── .env ├── .gitignore ├── .python-version ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── codenets ├── __init__.py ├── codesearchnet │ ├── __init__.py │ ├── ast_build.py │ ├── code_ast │ │ ├── __init__.py │ │ └── ast_utils.py │ ├── copied_code │ │ ├── __init__.py │ │ ├── bpevocabulary.py │ │ ├── metadata.py │ │ └── utils.py │ ├── data.py │ ├── dataset_main.py │ ├── dataset_utils.py │ ├── eval.py │ ├── huggingface │ │ ├── __init__.py │ │ ├── models.py │ │ └── tokenizer_recs.py │ ├── notebooks │ │ ├── codesearchnet_distrib.ipynb │ │ └── predictions.ipynb │ ├── poolers.py │ ├── predictions.py │ ├── query_1_code_1 │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── model.py │ │ └── training_ctx.py │ ├── query_1_code_n │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── model.py │ │ └── training_ctx.py │ ├── query_code_siamese │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── model.py │ │ └── training_ctx.py │ ├── sbert_build.py │ ├── tokenizer_build.py │ ├── tokenizer_recs.py │ ├── train.py │ └── training_ctx.py ├── losses.py ├── main.py ├── recordable.py ├── save.py ├── tensorboard_utils.py └── utils.py ├── conf ├── code_search_bert_2020_02_01_1500.conf ├── code_search_bert_2020_02_03_20_00.conf ├── code_search_bert_lg_2020_02_04_15_00.conf ├── code_search_bert_lg_2020_02_04_21_00.conf ├── code_search_bert_lg_2020_02_05_00_00.conf ├── code_search_bert_lg_2020_02_06_18_00.conf ├── code_search_bert_lg_2020_02_06_22_30.conf ├── code_search_bert_lg_2020_02_07_10_00.conf ├── code_search_bert_query_1_code_1_2020_02_10_11_00 copy.conf ├── code_search_bert_query_1_code_1_2020_02_10_11_00.conf ├── code_search_bert_query_1_code_1_2020_02_11_22_00 copy.conf ├── code_search_bert_query_1_code_1_2020_02_11_22_00.conf ├── code_search_bert_query_code_siamese_2020_02_12_00_00 copy.conf ├── code_search_bert_query_code_siamese_2020_02_12_00_00.conf ├── code_search_bert_query_code_siamese_2020_02_14_16_00 copy.conf ├── code_search_bert_query_code_siamese_2020_02_14_16_00.conf ├── code_search_bert_query_code_siamese_2020_02_15_14_00.conf ├── default.conf ├── qc_ast_2020_03_13.conf ├── qc_ast_2020_03_15 copy.conf ├── qc_ast_2020_03_15.conf ├── qc_ast_2020_03_17.conf ├── qc_ast_2020_03_18 copy.conf ├── qc_ast_2020_03_18.conf ├── qc_ast_2020_03_19.conf ├── qc_ce_2020_02_23_01_00 copy.conf ├── qc_ce_2020_02_23_01_00.conf ├── qc_ce_long_seq_2020_02_24.conf ├── qc_ce_sbert_2020_02_27 copy.conf ├── qc_ce_sbert_2020_02_27.conf ├── qc_ce_sbert_2020_02_28 copy.conf ├── qc_ce_sbert_2020_02_28.conf ├── qc_ce_sbert_2020_02_29 copy.conf ├── qc_ce_sbert_2020_02_29.conf ├── qc_ce_sbert_2020_03_01 copy.conf ├── qc_ce_sbert_2020_03_01.conf ├── qc_ce_subtoken_2020_02_25 copy.conf ├── qc_ce_subtoken_2020_02_25.conf ├── qc_ce_subtoken_larger_2020_02_25.conf ├── qc_ce_subtoken_larger_2020_02_26 copy.conf ├── qc_ce_subtoken_larger_2020_02_26.conf ├── qc_lambda_2020_02_20_12_30 copy.conf ├── qc_lambda_2020_02_20_12_30.conf ├── qc_sbert_lambda_2020_03_02.conf ├── qc_sbert_lambda_2020_03_04 copy.conf ├── qc_sbert_lambda_2020_03_04.conf ├── qc_sbert_lambda_2020_03_05.conf ├── qc_sbert_lambda_2020_03_07 copy.conf ├── qc_sbert_lambda_2020_03_07.conf ├── query_code_siamese_2020_02_15_14_00 copy.conf ├── query_code_siamese_2020_02_15_14_00.conf ├── query_code_siamese_2020_02_17_21_30 copy.conf ├── query_code_siamese_2020_02_17_21_30.conf ├── query_code_siamese_2020_02_18_13_00.conf ├── query_code_siamese_2020_02_19_13_00 copy.conf ├── query_code_siamese_2020_02_19_13_00.conf ├── query_code_siamese_albert_2020_02_18_08_30 copy.conf ├── 
query_code_siamese_albert_2020_02_18_08_30.conf └── query_code_siamese_albert_2020_02_18_14_00.conf ├── guide.md ├── main.py ├── model_predictions.csv ├── mypy.ini ├── pylama.ini ├── pyproject.toml ├── requirements.txt ├── test ├── __init__.py ├── conf │ ├── default.conf │ └── test.conf └── test_recordable.py └── wandb └── settings /.env: -------------------------------------------------------------------------------- 1 | PYTHONPATH=./codenets 2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ts 2 | **/node_modules/ 3 | /webroot/scripts/*.js 4 | 5 | # vim 6 | **/*.swp 7 | 8 | # python 9 | **/*.pyc 10 | **/__pycache__/ 11 | 12 | # jupyter 13 | **/.ipynb_checkpoints/ 14 | 15 | # data 16 | resources/ 17 | !resources/README.md 18 | !tests/data/ 19 | # *.csv 20 | !model_predictions.csv 21 | 22 | # environment 23 | *.ftpconfig 24 | 25 | .idea 26 | /src/wandb/run-* 27 | /src/wandb/debug.log 28 | *.html 29 | 30 | .mypy_cache 31 | *.lock 32 | 33 | wandb 34 | checkpoints 35 | pickles 36 | runs 37 | vendor 38 | build 39 | build_tokenizers 40 | codenets.egg-info/ 41 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.10.10 -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | // "python.pythonPath": "/Users/Pascal/Library/Caches/pypoetry/virtualenvs/codenets-O5WbUhkp-py3.7/bin/python", 3 | "python.linting.lintOnSave": true, 4 | "python.linting.pylintEnabled": false, 5 | "python.linting.pylamaEnabled": true, 6 | "python.linting.mypyEnabled": true, 7 | "python.formatting.provider": "black", 8 | "python.formatting.blackArgs": ["--line-length", "120"], 9 | "[python]": { 10 | "editor.formatOnSave": true, 11 | "editor.formatOnSaveTimeout": 2000, 12 | "editor.rulers": [120] 13 | }, 14 | "autoDocstring.docstringFormat": "google", 15 | "git.ignoreLimitWarning": true, 16 | "python.testing.pytestArgs": ["test"], 17 | "python.testing.unittestEnabled": false, 18 | "python.testing.pytestEnabled": true 19 | // "mypy.executable": "/Users/Pascal/Library/Caches/pypoetry/virtualenvs/codenets-O5WbUhkp-py3.7/bin/mypyls", 20 | // "mypy.targets": [ 21 | // "./src" 22 | // ], 23 | } 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 Pascal Voitot 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
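The conf/ directory listed in the tree above holds the HOCON experiment configurations consumed by the scripts under codenets/codesearchnet/. As a rough sketch (not a file of this repository, and assuming the package is importable from the repository root), loading one of them follows the pattern used by ast_build.py and dataset_main.py later in this dump; conf/default.conf is taken from the tree above.

from pyhocon import ConfigFactory
from codenets.codesearchnet.training_ctx import CodeSearchTrainingContext

# Parse a HOCON experiment file (conf/default.conf appears in the tree above)
conf = ConfigFactory.parse_file("conf/default.conf")
# Build the training context (tokenizer, datasets, model wiring) from the parsed config,
# mirroring the calls shown in ast_build.py and dataset_main.py below
training_ctx = CodeSearchTrainingContext.build_context_from_hocon(conf)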
-------------------------------------------------------------------------------- /codenets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/__init__.py -------------------------------------------------------------------------------- /codenets/codesearchnet/__init__.py: -------------------------------------------------------------------------------- 1 | from codenets.codesearchnet.query_1_code_1 import model, training_ctx 2 | 3 | import codenets.codesearchnet.query_1_code_1.training_ctx as single_branch_ctx 4 | 5 | # single_branch_model = model 6 | import codenets.codesearchnet.query_1_code_1.model as single_branch_model 7 | -------------------------------------------------------------------------------- /codenets/codesearchnet/ast_build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Usage: 4 | ast_build.py [options] SAVE_FOLDER TRAIN_DATA_PATH VALID_DATA_PATH TEST_DATA_PATH 5 | ast_build.py [options] [SAVE_FOLDER] 6 | 7 | *_DATA_PATH arguments may either accept (1) directory filled with .jsonl.gz files that we use as data, 8 | or a (2) plain text file containing a list of such directories (used for multi-language training). 9 | 10 | In the case that you supply a (2) plain text file, all directory names must be separated by a newline. 11 | For example, if you want to read from multiple directories you might have a plain text file called 12 | data_dirs_train.txt with the below contents: 13 | 14 | > cat ~/src/data_dirs_train.txt 15 | azure://semanticcodesearch/pythondata/Processed_Data/jsonl/train 16 | azure://semanticcodesearch/csharpdata/split/csharpCrawl-train 17 | 18 | Options: 19 | -h --help Show this screen. 20 | --config FILE Specify HOCON config file. 21 | --debug Enable debug routines. 
[default: False] 22 | """ 23 | 24 | from dpu_utils.utils import run_and_debug 25 | from docopt import docopt 26 | from loguru import logger 27 | import os 28 | from pyhocon import ConfigFactory 29 | 30 | 31 | from codenets.codesearchnet.training_ctx import CodeSearchTrainingContext 32 | from codenets.codesearchnet.code_ast.ast_utils import build_language_ast 33 | 34 | """Evaluating SBert.""" 35 | 36 | 37 | def run(args, tag_in_vcs=False) -> None: 38 | os.environ["WANDB_MODE"] = "dryrun" 39 | 40 | logger.debug("Building Training Context") 41 | conf_file = args["--config"] 42 | conf = ConfigFactory.parse_file(conf_file) 43 | 44 | logger.info(f"Restoring Training Context from config {conf_file}") 45 | training_ctx = CodeSearchTrainingContext.build_context_from_hocon(conf) 46 | 47 | # dirs = [Path("/home/mandubian/workspaces/tools/CodeSearchNet/resources/data/ruby/final/jsonl/valid/")] 48 | # build_language_ast("val", training_ctx.val_dirs, training_ctx.pickle_path, training_ctx.val_data_params) 49 | # build_language_ast("train", training_ctx.train_dirs, training_ctx.pickle_path, training_ctx.train_data_params) 50 | build_language_ast("test", training_ctx.test_dirs, training_ctx.pickle_path, training_ctx.test_data_params) 51 | 52 | # Language.build_library( 53 | # # Store the library in the `build` directory 54 | # "build/my-languages.so", 55 | # # Include one or more languages 56 | # [ 57 | # "vendor/tree-sitter-go", 58 | # "vendor/tree-sitter-java", 59 | # "vendor/tree-sitter-javascript", 60 | # "vendor/tree-sitter-python", 61 | # "vendor/tree-sitter-php", 62 | # "vendor/tree-sitter-ruby", 63 | # ], 64 | # ) 65 | 66 | # parser = Parser() 67 | 68 | # code_php = """ 69 | # hasAuthentication($repositoryName)) { 73 | # $auth = $this->getAuthentication($repositoryName); 74 | # if ($auth['username'] === $username && $auth['password'] === $password) { 75 | # return; 76 | # } 77 | 78 | # $this->writeError( 79 | # sprintf( 80 | # "Warning: You should avoid overwriting already defined auth settings for %s.", 81 | # $repositoryName 82 | # ) 83 | # ); 84 | # } 85 | # $this->setAuthentication($repositoryName, $username, $password); 86 | # } 87 | # ?> 88 | # """ 89 | # PHP_LANGUAGE = Language("build/my-languages.so", "php") 90 | # parser.set_language(PHP_LANGUAGE) 91 | # tree = parser.parse(bytes(code_php, "utf8")) 92 | # cursor = tree.walk() 93 | # print(cursor.node.sexp()) 94 | 95 | # skip_node_types = ["ERROR", ""] 96 | # all_tokens_php, special_tokens_php = breadth_first_path("php", code_php, cursor, skip_node_types=skip_node_types) 97 | # print("all_tokens_php", all_tokens_php) 98 | # print("special_tokens_php", special_tokens_php) 99 | 100 | # JAVA_LANGUAGE = Language("build/my-languages.so", "java") 101 | # # parser = Parser() 102 | # parser.set_language(JAVA_LANGUAGE) 103 | # code_java = """ 104 | # class A { 105 | # public int b() { 106 | # int c = 5; 107 | # } 108 | # } 109 | # """ 110 | # tree = parser.parse(bytes(code_java, "utf8")) 111 | # cursor = tree.walk() 112 | # print("code_java", code_java) 113 | # print(cursor.node.sexp()) 114 | # all_tokens_java, special_tokens_java = breadth_first_path(code_java, cursor) 115 | # print("all_tokens_java", all_tokens_java) 116 | # print("special_tokens_java", special_tokens_java) 117 | 118 | # print("===================================================") 119 | 120 | # PY_LANGUAGE = Language("build/my-languages.so", "python") 121 | # parser.set_language(PY_LANGUAGE) 122 | # code_python = """ 123 | # def foo(): 124 | # if bar: 125 | # a: List[str] = 
["toto", "tata"] 126 | # baz(a, b, 5) 127 | # """ 128 | # tree = parser.parse(bytes(code_python, "utf8")) 129 | # cursor = tree.walk() 130 | # print("code_python", code_python) 131 | # print(cursor.node.sexp()) 132 | # all_tokens_python, special_tokens_python = breadth_first_path(code_python, cursor) 133 | # print("all_tokeall_tokens_pythonns", all_tokens_python) 134 | # print("special_tokens_python", special_tokens_python) 135 | 136 | # special_tokens = special_tokens_python.union(special_tokens_java) 137 | # print("special_tokens", special_tokens) 138 | # training_ctx.tokenizer.vocab.add_special_tokens(list(special_tokens)) 139 | 140 | # print("JAVA", training_ctx.tokenize_code_sentences([" ".join(all_tokens_java)], max_length=256)) 141 | # print("PYTHON", training_ctx.tokenize_code_sentences([" ".join(all_tokens_python)], max_length=256)) 142 | 143 | 144 | if __name__ == "__main__": 145 | args = docopt(__doc__) 146 | run_and_debug(lambda: run(args), args["--debug"]) 147 | -------------------------------------------------------------------------------- /codenets/codesearchnet/code_ast/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/codesearchnet/code_ast/__init__.py -------------------------------------------------------------------------------- /codenets/codesearchnet/code_ast/ast_utils.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | import time 3 | from typing import Dict, List, Tuple, IO, Set, Optional 4 | from pathlib import Path 5 | from tree_sitter import Language, Parser, Node 6 | import os 7 | import json 8 | from pyhocon import ConfigTree 9 | 10 | from codenets.codesearchnet.data import DatasetParams 11 | from codenets.utils import get_data_files_from_directory 12 | from codenets.codesearchnet.copied_code.utils import read_file_samples 13 | 14 | 15 | class TreeSitterParser: 16 | def __init__( 17 | self, 18 | langs: List[str], 19 | added_nodes: Dict[str, Dict[str, str]], 20 | skip_node_types: Dict[str, List[str]], 21 | vendors_path: Path = Path("./vendor"), 22 | ): 23 | super(TreeSitterParser, self).__init__() 24 | 25 | vendors = [] 26 | self.added_nodes = added_nodes 27 | self.skip_node_types = skip_node_types 28 | for lang in langs: 29 | vendors.append(vendors_path / f"tree-sitter-{lang}") 30 | if lang not in added_nodes: 31 | self.added_nodes[lang] = ConfigTree([("prefix", ""), ("suffix", "")]) 32 | if lang not in skip_node_types: 33 | self.skip_node_types[lang] = [] 34 | 35 | Language.build_library( 36 | # Store the library in the `build` directory 37 | "build/my-languages.so", 38 | # Include one or more languages 39 | vendors, 40 | ) 41 | 42 | self.parser = Parser() 43 | 44 | def repr_field_node( 45 | self, code, node, field: Optional[str] = None, skip_node_types: List[str] = [] 46 | ) -> Tuple[List[str], Set[str], bool]: 47 | skip_sub_nodes = False 48 | special_tokens: Set[str] = set() 49 | rpr: List[str] 50 | if field: 51 | rpr = ["", field] 52 | special_tokens.add("") 53 | else: 54 | rpr = [] 55 | 56 | if node.is_named: 57 | # no child, serialize it here 58 | if len(node.children) == 0: 59 | if node.type in skip_node_types: 60 | rpr.extend([f"{node.type}", ""]) 61 | special_tokens.add("") 62 | else: 63 | rpr.extend([f"<{node.type}>", code[node.start_byte : node.end_byte], ""]) 64 | special_tokens.update([f"<{node.type}>", ""]) 65 | 66 | else: 67 | if 
node.type not in skip_node_types: 68 | rpr.extend([f"<{node.type}>", ""]) 69 | special_tokens.update([f"<{node.type}>", ""]) 70 | else: 71 | skip_sub_nodes = True 72 | else: 73 | if node.type not in skip_node_types: 74 | rpr.extend([f"{node.type}", ""]) 75 | special_tokens.add("") 76 | else: 77 | skip_sub_nodes = True 78 | 79 | return rpr, special_tokens, skip_sub_nodes 80 | 81 | def repr_level(self, code, cursor, skip_node_types: List[str] = []): 82 | nodes: List[Node] = [] 83 | all_tokens: List[str] = [] 84 | special_tokens: Set[str] = set() 85 | 86 | if cursor.goto_first_child(): 87 | field = cursor.current_field_name() 88 | toks, specs, skip = self.repr_field_node(code, cursor.node, field, skip_node_types=skip_node_types) 89 | all_tokens.extend(toks) 90 | special_tokens.update(specs) 91 | if not skip: 92 | nodes.append(cursor.node) 93 | 94 | while cursor.goto_next_sibling(): 95 | field = cursor.current_field_name() 96 | toks, specs, skip = self.repr_field_node(code, cursor.node, field, skip_node_types=skip_node_types) 97 | all_tokens.extend(toks) 98 | special_tokens.update(specs) 99 | if not skip: 100 | nodes.append(cursor.node) 101 | 102 | all_tokens.append("") 103 | special_tokens.add("") 104 | return all_tokens, special_tokens, nodes 105 | 106 | def breadth_first_path(self, lang, code, cursor, skip_node_types: List[str] = []) -> Tuple[List[str], Set[str]]: 107 | all_tokens = [f"<{lang}>"] 108 | special_tokens = set([f"<{lang}>"]) 109 | all_tokens_1, special_tokens_1, skip = self.repr_field_node(code, cursor.node, skip_node_types=skip_node_types) 110 | all_tokens.extend(all_tokens_1) 111 | special_tokens.update(special_tokens_1) 112 | 113 | if not skip: 114 | all_tokens_lvl, special_tokens_lvl, nodes = self.repr_level(code, cursor, skip_node_types=skip_node_types) 115 | all_tokens.extend(all_tokens_lvl) 116 | special_tokens.update(special_tokens_lvl) 117 | 118 | while len(nodes) > 0: 119 | node = nodes.pop(0) 120 | cursor = node.walk() 121 | all_tokens_2, special_tokens_2, nodes_2 = self.repr_level(code, cursor, skip_node_types=skip_node_types) 122 | all_tokens.extend(all_tokens_2) 123 | special_tokens.update(special_tokens_2) 124 | nodes.extend(nodes_2) 125 | all_tokens.append("") 126 | special_tokens.add("") 127 | return all_tokens, special_tokens 128 | 129 | def breadth_first_path_light( 130 | self, lang, code, cursor, skip_node_types: List[str] = [], max_tokens: Optional[int] = None 131 | ) -> List[str]: 132 | all_tokens = [f"<{lang}>"] 133 | all_tokens_1, special_tokens_1, skip = self.repr_field_node(code, cursor.node, skip_node_types=skip_node_types) 134 | all_tokens.extend(all_tokens_1) 135 | 136 | if not skip: 137 | all_tokens_lvl, special_tokens_lvl, nodes = self.repr_level(code, cursor, skip_node_types=skip_node_types) 138 | all_tokens.extend(all_tokens_lvl) 139 | 140 | while len(nodes) > 0: 141 | if max_tokens is not None and len(all_tokens) >= max_tokens: 142 | break 143 | node = nodes.pop(0) 144 | cursor = node.walk() 145 | all_tokens_2, special_tokens_2, nodes_2 = self.repr_level(code, cursor, skip_node_types=skip_node_types) 146 | all_tokens.extend(all_tokens_2) 147 | nodes.extend(nodes_2) 148 | if max_tokens is not None: 149 | all_tokens = all_tokens[: (max_tokens - 1)] 150 | all_tokens.append("") 151 | return all_tokens 152 | 153 | def parse_full(self, lang: str, code: str) -> Tuple[List[str], Set[str]]: 154 | LANGUAGE = Language("build/my-languages.so", lang) 155 | self.parser.set_language(LANGUAGE) 156 | 157 | code = f"{self.added_nodes[lang]['prefix']} {code} 
{self.added_nodes[lang]['suffix']}" 158 | 159 | tree = self.parser.parse(bytes(code, "utf8")) 160 | cursor = tree.walk() 161 | 162 | tokens, special_tokens = self.breadth_first_path(lang, code, cursor, skip_node_types=self.skip_node_types[lang]) 163 | return tokens, special_tokens 164 | 165 | def parse(self, lang: str, code: str, max_tokens: Optional[int] = None) -> List[str]: 166 | LANGUAGE = Language("build/my-languages.so", lang) 167 | self.parser.set_language(LANGUAGE) 168 | 169 | code = f"{self.added_nodes[lang]['prefix']} {code} {self.added_nodes[lang]['suffix']}" 170 | 171 | tree = self.parser.parse(bytes(code, "utf8")) 172 | cursor = tree.walk() 173 | 174 | tokens = self.breadth_first_path_light( 175 | lang, code, cursor, skip_node_types=self.skip_node_types[lang], max_tokens=max_tokens 176 | ) 177 | return tokens 178 | 179 | 180 | def load_special_tokens(data_params: DatasetParams): 181 | special_tokens: List[str] = [] 182 | for f in data_params.ast_special_tokens_files: 183 | fp = open(f, "r") 184 | special_tokens.extend(json.load(fp)) 185 | 186 | return special_tokens 187 | 188 | 189 | def build_language_ast(name: str, dirs: List[Path], pickle_path: Path, data_params: DatasetParams): 190 | start = time.time() 191 | 192 | if data_params.use_ast == "tree-sitter": 193 | parser = TreeSitterParser( 194 | langs=["go", "java", "javascript", "python", "php", "ruby"], 195 | added_nodes=data_params.ast_added_nodes, 196 | skip_node_types=data_params.ast_skip_node_types, 197 | ) 198 | 199 | all_special_tokens: Set[str] = set() 200 | 201 | lengths: Dict[str, List[int]] = {"go": [], "java": [], "javascript": [], "python": [], "php": [], "ruby": []} 202 | 203 | for (idx, file_path) in enumerate(get_data_files_from_directory(dirs)): 204 | logger.info(f"Reading {file_path}") 205 | raw_samples = list(read_file_samples(file_path)) 206 | for raw_sample in raw_samples: 207 | lang = raw_sample["language"] 208 | tokens, special_tokens = parser.parse_full(lang, raw_sample["code"]) 209 | 210 | all_special_tokens.update(special_tokens) 211 | 212 | lengths[lang].append(len(tokens)) 213 | 214 | end = time.time() 215 | logger.debug(f"all_special_tokens ({len(all_special_tokens)}) {all_special_tokens}") 216 | 217 | if not os.path.exists(pickle_path): 218 | os.makedirs(pickle_path) 219 | 220 | json_file = Path(pickle_path) / f"{name}_special_tokens.json" 221 | with open(json_file, "w") as f: 222 | json.dump(list(all_special_tokens), f) 223 | 224 | import statistics 225 | 226 | for lang, lgs in lengths.items(): 227 | if len(lgs) > 0: 228 | max_lg = max(lgs) 229 | min_lg = min(lgs) 230 | mean_lg = statistics.mean(lgs) 231 | std_lg = statistics.stdev(lgs) 232 | logger.debug(f"{lang} [ min:{min_lg}, max:{max_lg}, mean:{mean_lg}, stddev:{std_lg} ]") 233 | 234 | time_p = end - start 235 | logger.info(f"Building AST took: {time_p} sec") 236 | -------------------------------------------------------------------------------- /codenets/codesearchnet/copied_code/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/codesearchnet/copied_code/__init__.py -------------------------------------------------------------------------------- /codenets/codesearchnet/copied_code/bpevocabulary.py: -------------------------------------------------------------------------------- 1 | # Code copied from https://github.com/github/CodeSearchNet for backward-compatible experimentations 2 | 3 | # Code 
adapted from https://github.com/soaxelbrooke/python-bpe/blob/master/bpe/encoder.py 4 | # MIT License (see repository) 5 | 6 | 7 | """ 8 | An encoder which learns byte pair encodings for white-space separated text. 9 | Can tokenize, encode, and decode. 10 | """ 11 | import typing 12 | from typing import Optional, Sized 13 | from collections import Counter 14 | # from dpu_utils.mlutils import Vocabulary 15 | 16 | try: 17 | from typing import Dict, Iterable, List, Iterator 18 | except ImportError: 19 | pass 20 | 21 | 22 | DEFAULT_EOW = "__eow" 23 | DEFAULT_SOW = "__sow" 24 | DEFAULT_UNK = "__unk" 25 | DEFAULT_PAD = "__pad" 26 | 27 | # pylint: disable= inherit-non-class 28 | 29 | 30 | class BpeVocabulary(Sized): 31 | """Encode white-space separated text using byte-pair encoding. See https://arxiv.org/abs/1508.07909 for details.""" 32 | 33 | def __init__( 34 | self, 35 | vocab_size: int = 8192, 36 | pct_bpe: float = 0.2, 37 | ngram_min: int = 2, 38 | ngram_max: int = 8, 39 | required_tokens: Optional[Iterable[str]] = None, 40 | strict=True, 41 | EOW=DEFAULT_EOW, 42 | SOW=DEFAULT_SOW, 43 | UNK=DEFAULT_UNK, 44 | PAD=DEFAULT_PAD, 45 | ): 46 | if vocab_size < 1: 47 | raise ValueError("vocab size must be greater than 0.") 48 | 49 | self.EOW = EOW 50 | self.SOW = SOW 51 | self.eow_len = len(EOW) 52 | self.sow_len = len(SOW) 53 | self.UNK = UNK 54 | self.PAD = PAD 55 | self.required_tokens = list(set(required_tokens or []).union({self.UNK, self.PAD})) 56 | self.vocab_size = vocab_size 57 | self.pct_bpe = pct_bpe 58 | self.word_vocab_size = max([int(vocab_size * (1 - pct_bpe)), len(self.required_tokens or [])]) 59 | self.bpe_vocab_size = vocab_size - self.word_vocab_size 60 | self.word_vocab = {} # type: Dict[str, int] 61 | self.bpe_vocab = {} # type: Dict[str, int] 62 | self.inverse_word_vocab = {} # type: Dict[int, str] 63 | self.inverse_bpe_vocab = {} # type: Dict[int, str] 64 | self.ngram_min = ngram_min 65 | self.ngram_max = ngram_max 66 | self.strict = strict 67 | 68 | def __len__(self): 69 | """Return vocab len""" 70 | return self.vocab_size 71 | 72 | def byte_pair_counts(self, words: Iterable[str]) -> Iterable[typing.Counter]: 73 | """ 74 | Count space separated token character pairs: 75 | [('T h i s ', 4}] -> {'Th': 4, 'hi': 4, 'is': 4} 76 | """ 77 | for token, count in self.count_tokens(words).items(): 78 | bp_counts = Counter() # type: Counter 79 | sub_tokens = token.split(" ") 80 | joined_tokens = "".join(sub_tokens) 81 | token_offsets = [0] 82 | length = 0 83 | for ngram in sub_tokens: 84 | bp_counts[ngram] += count 85 | length += len(ngram) 86 | token_offsets += [length] 87 | for ngram_size in range(self.ngram_min, min(self.ngram_max, len(sub_tokens)) + 1): 88 | for i in range(len(sub_tokens) - ngram_size + 1): 89 | bp_counts[joined_tokens[token_offsets[i] : token_offsets[i + ngram_size]]] += count 90 | 91 | yield bp_counts 92 | 93 | def count_tokens(self, words: Iterable[str]) -> Dict[str, int]: 94 | """Count tokens into a BPE vocab""" 95 | token_counts = Counter(words) 96 | return {" ".join(token): count for token, count in token_counts.items()} 97 | 98 | def learn_word_vocab(self, word_counts: typing.Counter[str]) -> Dict[str, int]: 99 | """Build vocab from self.word_vocab_size most common tokens in provided sentences""" 100 | for token in set(self.required_tokens or []): 101 | word_counts[token] = int(2 ** 31) 102 | word_counts[self.PAD] = int(2 ** 32) # Make sure that PAD gets id=0 103 | sorted_word_counts = sorted(word_counts.items(), key=lambda p: -p[1]) 104 | return {word: idx for 
idx, (word, count) in enumerate(sorted_word_counts[: self.word_vocab_size])} 105 | 106 | def learn_bpe_vocab(self, words: Iterable[str]) -> Dict[str, int]: 107 | """Learn a vocab of byte pair encodings""" 108 | vocab = Counter() # type: typing.Counter 109 | for token in {self.SOW, self.EOW}: 110 | vocab[token] = int(2 ** 63) 111 | for idx, byte_pair_count in enumerate(self.byte_pair_counts(words)): 112 | vocab.update(byte_pair_count) 113 | if (idx + 1) % 10000 == 0: 114 | self.trim_vocab(10 * self.bpe_vocab_size, vocab) 115 | 116 | sorted_bpe_counts = sorted(vocab.items(), key=lambda p: -p[1])[: self.bpe_vocab_size] 117 | return {bp: idx + self.word_vocab_size for idx, (bp, count) in enumerate(sorted_bpe_counts)} 118 | 119 | def fit(self, word_counts: typing.Counter[str]) -> None: 120 | """Learn vocab from text.""" 121 | 122 | # First, learn word vocab 123 | self.word_vocab = self.learn_word_vocab(word_counts) 124 | 125 | remaining_words = Counter({word: count for word, count in word_counts.items() if word not in self.word_vocab}) 126 | self.bpe_vocab = self.learn_bpe_vocab(remaining_words.elements()) 127 | 128 | self.inverse_word_vocab = {idx: token for token, idx in self.word_vocab.items()} 129 | self.inverse_bpe_vocab = {idx: token for token, idx in self.bpe_vocab.items()} 130 | 131 | @staticmethod 132 | def get_unk() -> str: 133 | return DEFAULT_UNK 134 | 135 | @staticmethod 136 | def get_pad() -> str: 137 | return DEFAULT_PAD 138 | 139 | @staticmethod 140 | def trim_vocab(n: int, vocab: Dict[str, int]) -> None: 141 | """Delete all pairs below 10 * vocab size to prevent memory problems""" 142 | pair_counts = sorted(vocab.items(), key=lambda p: -p[1]) 143 | pairs_to_trim = [pair for pair, count in pair_counts[n:]] 144 | for pair in pairs_to_trim: 145 | del vocab[pair] 146 | 147 | def subword_tokenize(self, word: str) -> List[str]: 148 | """Tokenize inside an unknown token using BPE""" 149 | end_idx = min([len(word), self.ngram_max]) 150 | sw_tokens = [self.SOW] 151 | start_idx = 0 152 | 153 | while start_idx < len(word): 154 | subword = word[start_idx:end_idx] 155 | if subword in self.bpe_vocab: 156 | sw_tokens.append(subword) 157 | start_idx = end_idx 158 | end_idx = min([len(word), start_idx + self.ngram_max]) 159 | elif len(subword) == 1: 160 | sw_tokens.append(self.UNK) 161 | start_idx = end_idx 162 | end_idx = min([len(word), start_idx + self.ngram_max]) 163 | else: 164 | end_idx -= 1 165 | 166 | sw_tokens.append(self.EOW) 167 | return sw_tokens 168 | 169 | def tokenize(self, word_tokens: List[str]) -> List[str]: 170 | """Split a sentence into word and subword tokens""" 171 | 172 | tokens = [] 173 | for word_token in word_tokens: 174 | if word_token in self.word_vocab: 175 | tokens.append(word_token) 176 | else: 177 | tokens.extend(self.subword_tokenize(word_token)) 178 | 179 | return tokens 180 | 181 | def transform(self, sentences: Iterable[List[str]], reverse=False, fixed_length=None) -> Iterable[List[int]]: 182 | """Turn tokens into vocab idxs""" 183 | direction = -1 if reverse else 1 184 | for sentence in sentences: 185 | encoded = [] 186 | tokens = list(self.tokenize(sentence)) 187 | for token in tokens: 188 | if token in self.word_vocab: 189 | encoded.append(self.word_vocab[token]) 190 | elif token in self.bpe_vocab: 191 | encoded.append(self.bpe_vocab[token]) 192 | else: 193 | encoded.append(self.word_vocab[self.UNK]) 194 | 195 | if fixed_length is not None: 196 | encoded = encoded[:fixed_length] 197 | while len(encoded) < fixed_length: 198 | 
encoded.append(self.word_vocab[self.PAD]) 199 | 200 | yield encoded[::direction] 201 | 202 | def inverse_transform(self, rows: Iterable[List[int]]) -> Iterator[str]: 203 | """Turn token indexes back into space-joined text.""" 204 | for row in rows: 205 | words = [] 206 | 207 | rebuilding_word = False 208 | current_word = "" 209 | for idx in row: 210 | if self.inverse_bpe_vocab.get(idx) == self.SOW: 211 | if rebuilding_word and self.strict: 212 | raise ValueError("Encountered second SOW token before EOW.") 213 | rebuilding_word = True 214 | 215 | elif self.inverse_bpe_vocab.get(idx) == self.EOW: 216 | if not rebuilding_word and self.strict: 217 | raise ValueError("Encountered EOW without matching SOW.") 218 | rebuilding_word = False 219 | words.append(current_word) 220 | current_word = "" 221 | 222 | elif rebuilding_word and (idx in self.inverse_bpe_vocab): 223 | current_word += self.inverse_bpe_vocab[idx] 224 | 225 | elif rebuilding_word and (idx in self.inverse_word_vocab): 226 | current_word += self.inverse_word_vocab[idx] 227 | 228 | elif idx in self.inverse_word_vocab: 229 | words.append(self.inverse_word_vocab[idx]) 230 | 231 | elif idx in self.inverse_bpe_vocab: 232 | if self.strict: 233 | raise ValueError("Found BPE index {} when not rebuilding word!".format(idx)) 234 | else: 235 | words.append(self.inverse_bpe_vocab[idx]) 236 | 237 | else: 238 | raise ValueError("Got index {} that was not in word or BPE vocabs!".format(idx)) 239 | 240 | yield " ".join(w for w in words if w != "") 241 | -------------------------------------------------------------------------------- /codenets/codesearchnet/copied_code/metadata.py: -------------------------------------------------------------------------------- 1 | # Code partially copied and adapted from https://github.com/github/CodeSearchNet for backward-compatible experimentations 2 | 3 | from collections import defaultdict 4 | 5 | from typing import DefaultDict, Dict, Iterable, List, Optional, Tuple 6 | 7 | from dpu_utils.mlutils import Vocabulary 8 | 9 | from dpu_utils.utils import RichPath 10 | 11 | from codenets.codesearchnet.copied_code.bpevocabulary import BpeVocabulary 12 | from codenets.codesearchnet.copied_code.utils import run_jobs_in_parallel 13 | 14 | from dataclasses import dataclass 15 | from pathlib import Path 16 | from collections import Counter 17 | from dataclasses import field 18 | from enum import Enum 19 | 20 | from codenets.utils import _to_subtoken_stream, get_data_files_from_directory 21 | 22 | 23 | class QueryType(Enum): 24 | DOCSTRING = "docstring_as_query" 25 | FUNCTION_NAME = "func_name_as_query" 26 | 27 | 28 | @dataclass 29 | class Metadata: 30 | token_counter: Counter = field(default_factory=Counter) 31 | token_vocab: Optional[BpeVocabulary] = None 32 | common_tokens: List[Tuple[str, int]] = field(default_factory=list) 33 | 34 | 35 | def load_metadata_from_sample( 36 | data_to_load: Iterable[str], raw_metadata: Metadata, use_subtokens: bool = False, mark_subtoken_end: bool = False 37 | ) -> Metadata: 38 | if use_subtokens: 39 | data_to_load = _to_subtoken_stream(data_to_load, mark_subtoken_end=mark_subtoken_end) 40 | # raw_metadata["token_counter"].update(data_to_load) 41 | raw_metadata.token_counter.update(data_to_load) 42 | return raw_metadata 43 | 44 | 45 | def append_metadata( 46 | encoder_label: str, 47 | vocab_size: int, 48 | vocab_count_threshold: int, 49 | # use_bpe: bool, 50 | pct_bpe: float, 51 | raw_metadata_list: List[Metadata], 52 | ) -> Metadata: 53 | merged_token_counter: Counter = Counter() 54 | for 
raw_metadata in raw_metadata_list: 55 | # merged_token_counter += raw_metadata["token_counter"] 56 | merged_token_counter += raw_metadata.token_counter 57 | 58 | # if hyperparameters["%s_use_bpe" % encoder_label]: 59 | # token_vocabulary: Vocabulary 60 | # if use_bpe: 61 | token_vocabulary = BpeVocabulary( 62 | # vocab_size=hyperparameters["%s_token_vocab_size" % encoder_label], 63 | vocab_size=vocab_size, 64 | # pct_bpe=hyperparameters["%s_pct_bpe" % encoder_label], 65 | pct_bpe=pct_bpe, 66 | ) 67 | token_vocabulary.fit(merged_token_counter) 68 | # else: 69 | # token_vocabulary = Vocabulary.create_vocabulary( 70 | # tokens=merged_token_counter, 71 | # # max_size=hyperparameters["%s_token_vocab_size" % encoder_label], 72 | # max_size=vocab_size, 73 | # # count_threshold=hyperparameters["%s_token_vocab_count_threshold" % encoder_label], 74 | # count_threshold=vocab_count_threshold, 75 | # ) 76 | 77 | # final_metadata["token_vocab"] = token_vocabulary 78 | # Save the most common tokens for use in data augmentation: 79 | # final_metadata["common_tokens"] = merged_token_counter.most_common(50) 80 | final_metadata = Metadata( 81 | token_vocab=token_vocabulary, 82 | token_counter=merged_token_counter, 83 | common_tokens=merged_token_counter.most_common(50), 84 | ) 85 | return final_metadata 86 | 87 | 88 | def build_tokenizer_metadata( 89 | data_dirs: List[Path], 90 | max_files_per_dir: Optional[int] = None, 91 | parallelize: bool = True, 92 | use_subtokens: bool = False, 93 | mark_subtoken_end: bool = False, 94 | ) -> Tuple[List[Metadata], Dict[str, List[Metadata]]]: 95 | raw_query_metadata_list = [] 96 | raw_code_language_metadata_lists: DefaultDict[str, List] = defaultdict(list) 97 | 98 | def metadata_parser_fn(_, file_path: Path) -> Iterable[Tuple[Metadata, Dict[str, Metadata]]]: 99 | raw_query_metadata = Metadata() 100 | per_code_language_metadata: DefaultDict[str, Metadata] = defaultdict(Metadata) 101 | 102 | for raw_sample in RichPath.create(str(file_path)).read_by_file_suffix(): 103 | sample_language = raw_sample["language"] 104 | per_code_language_metadata[sample_language] = load_metadata_from_sample( 105 | data_to_load=raw_sample["code_tokens"], 106 | raw_metadata=per_code_language_metadata[sample_language], 107 | use_subtokens=use_subtokens, 108 | mark_subtoken_end=mark_subtoken_end, 109 | ) 110 | 111 | raw_query_metadata = load_metadata_from_sample( 112 | data_to_load=[d.lower() for d in raw_sample["docstring_tokens"]], 113 | raw_metadata=raw_query_metadata, 114 | use_subtokens=use_subtokens, 115 | mark_subtoken_end=mark_subtoken_end, 116 | ) 117 | yield (raw_query_metadata, per_code_language_metadata) 118 | 119 | def received_result_callback(metadata_parser_result: Tuple[Metadata, Dict[str, Metadata]]): 120 | (raw_query_metadata, per_code_language_metadata) = metadata_parser_result 121 | raw_query_metadata_list.append(raw_query_metadata) 122 | for (metadata_language, raw_code_language_metadata) in per_code_language_metadata.items(): 123 | raw_code_language_metadata_lists[metadata_language].append(raw_code_language_metadata) 124 | 125 | def finished_callback(): 126 | pass 127 | 128 | if parallelize: 129 | run_jobs_in_parallel( 130 | get_data_files_from_directory(data_dirs, max_files_per_dir), 131 | metadata_parser_fn, 132 | received_result_callback, 133 | finished_callback, 134 | ) 135 | else: 136 | for (idx, file) in enumerate(get_data_files_from_directory(data_dirs, max_files_per_dir)): 137 | for res in metadata_parser_fn(idx, file): 138 | received_result_callback(res) 139 | 140 | 
return raw_query_metadata_list, raw_code_language_metadata_lists 141 | -------------------------------------------------------------------------------- /codenets/codesearchnet/copied_code/utils.py: -------------------------------------------------------------------------------- 1 | # Code copied from https://github.com/github/CodeSearchNet for backward-compatible experimentations 2 | 3 | import multiprocessing 4 | from typing import List, Iterable, Callable, TypeVar, Dict, Any, Union 5 | from dpu_utils.utils import RichPath 6 | from pathlib import Path 7 | 8 | JobType = TypeVar("JobType") 9 | ResultType = TypeVar("ResultType") 10 | 11 | 12 | def read_file_samples(file_path: Union[Path, str]) -> List[Dict[str, Any]]: 13 | return RichPath.create(str(file_path)).read_by_file_suffix() 14 | 15 | 16 | def __parallel_queue_worker( 17 | worker_id: int, 18 | job_queue: multiprocessing.Queue, 19 | result_queue: multiprocessing.Queue, 20 | worker_fn: Callable[[int, JobType], Iterable[ResultType]], 21 | ): 22 | while True: 23 | job = job_queue.get() 24 | 25 | # "None" is the signal for last job, put that back in for other workers and stop: 26 | if job is None: 27 | job_queue.put(job) 28 | break 29 | 30 | for result in worker_fn(worker_id, job): 31 | result_queue.put(result) 32 | result_queue.put(None) 33 | 34 | 35 | def run_jobs_in_parallel( 36 | all_jobs: List[JobType], 37 | worker_fn: Callable[[int, JobType], Iterable[ResultType]], 38 | received_result_callback: Callable[[ResultType], None], 39 | finished_callback: Callable[[], None], 40 | result_queue_size: int = 100, 41 | ) -> None: 42 | """ 43 | Run jobs in parallel and uses callbacks to collect results. 44 | :param all_jobs: Job descriptions; one at a time will be parsed into worker_fn. 45 | :param worker_fn: Worker function receiving a job; many copies may run in parallel. 46 | Can yield results, which will be processed (one at a time) by received_result_callback. 47 | :param received_result_callback: Called when a result was produced by any worker. Only one will run at a time. 48 | :param finished_callback: Called when all jobs have been processed. 
49 | """ 50 | job_queue: multiprocessing.Queue = multiprocessing.Queue(len(all_jobs) + 1) 51 | for job in all_jobs: 52 | job_queue.put(job) 53 | job_queue.put(None) # Marker that we are done 54 | 55 | # This will hold the actual results: 56 | result_queue: multiprocessing.Queue = multiprocessing.Queue(result_queue_size) 57 | 58 | # Create workers: 59 | num_workers = multiprocessing.cpu_count() - 1 60 | workers = [ 61 | multiprocessing.Process(target=__parallel_queue_worker, args=(worker_id, job_queue, result_queue, worker_fn)) 62 | for worker_id in range(num_workers) 63 | ] 64 | for worker in workers: 65 | worker.start() 66 | 67 | num_workers_finished = 0 68 | while True: 69 | result = result_queue.get() 70 | if result is None: 71 | num_workers_finished += 1 72 | if num_workers_finished == len(workers): 73 | finished_callback() 74 | break 75 | else: 76 | received_result_callback(result) 77 | 78 | for worker in workers: 79 | worker.join() 80 | -------------------------------------------------------------------------------- /codenets/codesearchnet/data.py: -------------------------------------------------------------------------------- 1 | # This code is nearly 100% copied from original repo 2 | 3 | from dataclasses import dataclass, fields as datafields 4 | import numpy as np 5 | 6 | from typing import Dict, TypeVar, List 7 | from dataclasses import field 8 | 9 | 10 | @dataclass 11 | class DatasetParams: 12 | """Description of parameters of a CodeSearchnet dataset""" 13 | 14 | fraction_using_func_name: float 15 | min_len_func_name_for_query: int 16 | use_subtokens: bool 17 | mark_subtoken_end: bool 18 | code_max_num_tokens: int 19 | query_max_num_tokens: int 20 | use_bpe: bool 21 | vocab_size: int 22 | pct_bpe: float 23 | vocab_count_threshold: int 24 | lang_ids: Dict[str, int] 25 | do_lowercase: bool 26 | special_tokens: List[str] 27 | parallelize: bool 28 | use_lang_weights: bool = False # for backward compat 29 | query_random_token_frequency: float = 0.2 30 | query_embeddings: str = "none" 31 | use_ast: str = "none" 32 | ast_added_nodes: Dict[str, Dict[str, str]] = field(default_factory=dict) 33 | ast_skip_node_types: Dict[str, List[str]] = field(default_factory=dict) 34 | ast_special_tokens_files: List[str] = field(default_factory=list) 35 | 36 | 37 | T_InputFeatures = TypeVar("T_InputFeatures", bound="InputFeatures") 38 | 39 | 40 | @dataclass 41 | class InputFeatures: 42 | """Structure gathering query and code tokens/mask after passing through tokenizer""" 43 | 44 | language: int 45 | similarity: float 46 | query_tokens: np.ndarray 47 | query_tokens_mask: np.ndarray 48 | 49 | query_docstring_tokens: np.ndarray 50 | query_docstring_tokens_mask: np.ndarray 51 | 52 | code_tokens: np.ndarray 53 | code_tokens_mask: np.ndarray 54 | 55 | 56 | def dataclass_from_dict(klass, dikt): 57 | """Load any dataclass from a dict""" 58 | fieldtypes = {f.name: f.type for f in datafields(klass)} 59 | return klass(**{f: dataclass_from_dict(fieldtypes[f], dikt[f]) for f in dikt}) 60 | -------------------------------------------------------------------------------- /codenets/codesearchnet/dataset_main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Main to test dataset loading 4 | 5 | Usage: 6 | dataset_main.py [options] 7 | dataset_main.py [options] 8 | 9 | Options: 10 | -h --help Show this screen. 11 | --config FILE Specify HOCON config file. [default: ./conf/default.conf] 12 | --debug Enable debug routines. 
[default: False] 13 | """ 14 | 15 | 16 | from docopt import docopt 17 | from loguru import logger 18 | import sys 19 | import torch 20 | import itertools 21 | from dpu_utils.utils import run_and_debug 22 | from pyhocon import ConfigFactory, ConfigTree 23 | from torch.utils.data import DataLoader 24 | from codenets.codesearchnet.dataset_utils import BalancedBatchSchedulerSampler, DatasetType 25 | from codenets.codesearchnet.training_ctx import CodeSearchTrainingContext 26 | 27 | 28 | print("Torch version", torch.__version__) 29 | 30 | logger.remove() 31 | logger.add(sys.stderr, level="DEBUG", colorize=True, backtrace=False) 32 | 33 | 34 | def run(args, tag_in_vcs=False) -> None: 35 | conf_file = args["--config"] 36 | logger.info(f"config file {conf_file}") 37 | 38 | conf: ConfigTree = ConfigFactory.parse_file(conf_file) 39 | logger.info(f"config {conf}") 40 | logger.info(f"Build Training Context from config {conf_file}") 41 | training_ctx = CodeSearchTrainingContext.build_context_from_hocon(conf) 42 | 43 | train_dataset = training_ctx.build_lang_dataset(DatasetType.TRAIN) 44 | train_dataloader = DataLoader( 45 | dataset=train_dataset, 46 | batch_size=conf["training.batch_size.train"], 47 | sampler=BalancedBatchSchedulerSampler(dataset=train_dataset, batch_size=conf["training.batch_size.train"]), 48 | ) 49 | logger.info(f"train_dataloader [{len(train_dataloader)} samples]") 50 | 51 | for batch in itertools.islice(train_dataloader, 5): 52 | logger.info(f"batch {batch}") 53 | 54 | # val_dataset = training_ctx.build_lang_dataset(DatasetType.VAL) 55 | # val_dataloader = DataLoader( 56 | # dataset=val_dataset, 57 | # batch_size=conf["training.batch_size.val"], 58 | # sampler=BalancedBatchSchedulerSampler(dataset=val_dataset, batch_size=conf["training.batch_size.val"]), 59 | # ) 60 | # logger.info(f"val_dataloader [{len(val_dataloader)} samples]") 61 | 62 | # for batch in itertools.islice(val_dataloader, 5): 63 | # logger.info(f"batch {batch}") 64 | 65 | 66 | if __name__ == "__main__": 67 | args = docopt(__doc__) 68 | run_and_debug(lambda: run(args), args["--debug"]) 69 | -------------------------------------------------------------------------------- /codenets/codesearchnet/eval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Usage: 4 | eval.py [options] SAVE_FOLDER TRAIN_DATA_PATH VALID_DATA_PATH TEST_DATA_PATH 5 | eval.py [options] [SAVE_FOLDER] 6 | 7 | *_DATA_PATH arguments may either accept (1) directory filled with .jsonl.gz files that we use as data, 8 | or a (2) plain text file containing a list of such directories (used for multi-language training). 9 | 10 | In the case that you supply a (2) plain text file, all directory names must be separated by a newline. 11 | For example, if you want to read from multiple directories you might have a plain text file called 12 | data_dirs_train.txt with the below contents: 13 | 14 | > cat ~/src/data_dirs_train.txt 15 | azure://semanticcodesearch/pythondata/Processed_Data/jsonl/train 16 | azure://semanticcodesearch/csharpdata/split/csharpCrawl-train 17 | 18 | Options: 19 | -h --help Show this screen. 20 | --restore DIR specify restoration dir. [optional] 21 | --debug Enable debug routines. 
[default: False] 22 | """ 23 | 24 | import os 25 | import torch 26 | from docopt import docopt 27 | from dpu_utils.utils import run_and_debug 28 | from loguru import logger 29 | from tqdm import tqdm 30 | 31 | from torch.utils.data import DataLoader 32 | 33 | # from codenets.codesearchnet.single_branch_ctx import SingleBranchTrainingContext 34 | from codenets.codesearchnet.dataset_utils import BalancedBatchSchedulerSampler, DatasetType 35 | from codenets.codesearchnet.training_ctx import ( 36 | CodeSearchTrainingContext, 37 | compute_loss_mrr, 38 | TotalLoss, 39 | TotalMrr, 40 | TotalSize, 41 | BatchSize, 42 | BatchLoss, 43 | ) 44 | 45 | 46 | def run(args, tag_in_vcs=False) -> None: 47 | os.environ["WANDB_MODE"] = "dryrun" 48 | 49 | logger.debug("Building Training Context") 50 | training_ctx: CodeSearchTrainingContext 51 | restore_dir = args["--restore"] 52 | logger.info(f"Restoring Training Context from directory{restore_dir}") 53 | training_ctx = CodeSearchTrainingContext.build_context_from_dir(restore_dir) 54 | 55 | # Build Val Dataloader 56 | # val_dataset = training_ctx.build_lang_dataset(DatasetType.VAL) 57 | # val_dataloader = DataLoader( 58 | # dataset=val_dataset, 59 | # batch_size=training_ctx.val_batch_size, 60 | # sampler=BalancedBatchSchedulerSampler(dataset=val_dataset, batch_size=training_ctx.val_batch_size), 61 | # ) 62 | # logger.info(f"Built val_dataloader [Length:{len(val_dataloader)} x Batch:{training_ctx.val_batch_size}]") 63 | 64 | # Build Test Dataloader 65 | test_dataset = training_ctx.build_lang_dataset(DatasetType.TEST) 66 | test_dataloader = DataLoader( 67 | dataset=test_dataset, 68 | batch_size=training_ctx.val_batch_size, 69 | sampler=BalancedBatchSchedulerSampler(dataset=test_dataset, batch_size=training_ctx.test_batch_size), 70 | ) 71 | logger.info(f"Built test_dataloader [Length:{len(test_dataloader)} x Batch:{training_ctx.test_batch_size}]") 72 | 73 | total_loss = TotalLoss(0.0) 74 | total_size = TotalSize(0) 75 | total_mrr = TotalMrr(0.0) 76 | training_ctx.eval_mode() 77 | with torch.no_grad(): 78 | training_ctx.zero_grad() 79 | with tqdm(total=len(test_dataloader)) as t_batch: 80 | for batch_idx, batch in enumerate(test_dataloader): 81 | languages, similarity, query_tokens, query_tokens_mask, code_tokens, code_tokens_mask = [ 82 | t.to(training_ctx.device) for t in batch 83 | ] 84 | 85 | batch_total_loss, similarity_scores = training_ctx.forward(batch, batch_idx) 86 | 87 | batch_size = BatchSize(batch[0].size()[0]) 88 | batch_loss = BatchLoss(batch_total_loss.item()) 89 | total_loss, avg_loss, total_mrr, avg_mrr, total_size = compute_loss_mrr( 90 | similarity_scores, batch_loss, batch_size, total_loss, total_mrr, total_size 91 | ) 92 | # languages=languages, 93 | # query_tokens=query_tokens, 94 | # query_tokens_mask=query_tokens_mask, 95 | # code_tokens=code_tokens, 96 | # code_tokens_mask=code_tokens_mask, 97 | # ) 98 | # batch_total_losses, similarity_scores = training_ctx.losses_scores_fn( 99 | # query_embedding, code_embedding, similarity 100 | # ) 101 | # batch_total_loss = torch.mean(batch_total_losses) 102 | 103 | # nb_samples = batch[0].size()[0] 104 | 105 | # # compute MRR 106 | # # extract the logits from the diagonal of the matrix, which are the logits corresponding to the ground-truth 107 | # correct_scores = similarity_scores.diagonal() 108 | # # compute how many queries have bigger logits than the ground truth (the diagonal) 109 | # # the elements that are incorrectly ranked 110 | # compared_scores = 
similarity_scores.ge(correct_scores.unsqueeze(dim=-1)).float() 111 | # compared_scores_nb = torch.sum(compared_scores, dim=1) 112 | # per_sample_mrr = torch.div(1.0, compared_scores_nb) 113 | # per_batch_mrr = torch.sum(per_sample_mrr) / nb_samples 114 | 115 | # epoch_samples += nb_samples 116 | # epoch_loss += batch_total_loss.item() * nb_samples 117 | # loss = epoch_loss / max(1, epoch_samples) 118 | 119 | # mrr_sum += per_batch_mrr.item() * nb_samples 120 | # mrr = mrr_sum / max(1, epoch_samples) 121 | 122 | t_batch.set_postfix({f"loss": f"{batch_total_loss.item():10}"}) 123 | t_batch.update(1) 124 | 125 | logger.info( 126 | f"total_loss:{total_loss}, avg_loss:{avg_loss}, total_mrr:{total_mrr}, avg_mrr:{avg_mrr}, total_size:{total_size}" 127 | ) 128 | 129 | 130 | if __name__ == "__main__": 131 | args = docopt(__doc__) 132 | run_and_debug(lambda: run(args), args["--debug"]) 133 | -------------------------------------------------------------------------------- /codenets/codesearchnet/huggingface/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/codesearchnet/huggingface/__init__.py -------------------------------------------------------------------------------- /codenets/codesearchnet/huggingface/models.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from pathlib import Path 5 | from typing import Union, TypeVar, Type, Generic 6 | from loguru import logger 7 | from transformers import PreTrainedModel 8 | 9 | from codenets.recordable import RecordableTorchModule 10 | from codenets.utils import full_classname, instance_full_classname, runtime_import 11 | 12 | 13 | PretrainedRec_T = TypeVar("PretrainedRec_T", bound="PreTrainedModelRecordable") 14 | Pretrained_T = TypeVar("Pretrained_T", bound="PreTrainedModel") 15 | 16 | 17 | class PreTrainedModelRecordable(Generic[Pretrained_T], RecordableTorchModule): 18 | """ 19 | Wrap any generic HuggingFace PreTrainedModel as a Recordable Torch module 20 | equipped with load/save 21 | """ 22 | 23 | def __init__(self, model: Pretrained_T): 24 | super().__init__() 25 | self.model = model 26 | 27 | def save(self, output_dir: Union[Path, str]) -> bool: 28 | full_dir = Path(output_dir) / instance_full_classname(self) / instance_full_classname(self.model) 29 | logger.info(f"Saving HuggingFace model to {full_dir}") 30 | os.makedirs(full_dir, exist_ok=True) 31 | self.model.save_pretrained(full_dir) 32 | return True 33 | 34 | @classmethod 35 | def load(cls: Type[PretrainedRec_T], restore_dir: Union[Path, str]) -> PretrainedRec_T: 36 | full_dir = Path(restore_dir) / full_classname(cls) 37 | logger.info(f"Loading HuggingFace Pretrained model from {full_dir}") 38 | _, dirs, _ = list(os.walk(full_dir))[0] 39 | model_cls_name = dirs[0] 40 | logger.info(f"Loading HuggingFace {model_cls_name} model from {full_dir}/{model_cls_name}") 41 | klass = runtime_import(model_cls_name) 42 | assert issubclass(klass, PreTrainedModel) 43 | 44 | model = klass.from_pretrained(str(full_dir / model_cls_name)) 45 | 46 | return cls(model) 47 | 48 | def forward(self, *args, **kwargs): 49 | return self.model.forward(*args, **kwargs) -------------------------------------------------------------------------------- /codenets/codesearchnet/huggingface/tokenizer_recs.py: -------------------------------------------------------------------------------- 1 | 
from __future__ import annotations 2 | from typing import Iterable, List, Optional, Tuple, Union, Dict, Callable, IO 3 | import numpy as np 4 | import os 5 | from loguru import logger 6 | from pathlib import Path 7 | from transformers import PreTrainedTokenizer, BertTokenizer 8 | 9 | from tokenizers import CharBPETokenizer, Encoding 10 | 11 | from codenets.recordable import instance_full_classname, full_classname 12 | from codenets.codesearchnet.data import DatasetParams 13 | from codenets.codesearchnet.tokenizer_recs import TokenizerRecordable 14 | from codenets.codesearchnet.copied_code.utils import read_file_samples 15 | from codenets.utils import get_data_files_from_directory 16 | from codenets.codesearchnet.training_ctx import default_sample_update 17 | 18 | 19 | class PreTrainedTokenizerRecordable(TokenizerRecordable): 20 | def __init__(self, vocab: PreTrainedTokenizer): 21 | self.vocab = vocab 22 | 23 | def tokenize(self, text: str, **kwargs) -> List[str]: 24 | return self.vocab.tokenize(text) 25 | 26 | def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]: 27 | return self.vocab.convert_tokens_to_ids(tokens) 28 | 29 | def unk_token(self) -> str: 30 | return self.vocab.unk_token() 31 | 32 | def encode_sentence(self, sentence: str, max_length: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]: 33 | encoded = self.vocab.encode_plus( 34 | sentence, 35 | max_length=max_length, 36 | pad_to_max_length=max_length is not None, 37 | return_token_type_ids=False, 38 | return_attention_mask=True, 39 | ) 40 | token_ids = np.array(encoded["input_ids"]) 41 | token_mask = np.array(encoded["attention_mask"]) 42 | return token_ids, token_mask 43 | 44 | def encode_sentences( 45 | self, sentences: List[str], max_length: Optional[int] = None 46 | ) -> Tuple[np.ndarray, np.ndarray]: 47 | encoded = self.vocab.batch_encode_plus( 48 | sentences, 49 | max_length=max_length, 50 | pad_to_max_length=max_length is not None, 51 | return_token_type_ids=False, 52 | return_attention_mask=True, 53 | ) 54 | token_ids = np.array(encoded["input_ids"]) 55 | token_mask = np.array(encoded["attention_mask"]) 56 | return (token_ids, token_mask) 57 | 58 | def encode_tokens( 59 | self, tokens: Iterable[List[str]], max_length: Optional[int] = None 60 | ) -> Tuple[np.ndarray, np.ndarray]: 61 | encoded = self.vocab( 62 | tokens, 63 | max_length=max_length, 64 | pad_to_max_length=max_length is not None, 65 | return_token_type_ids=False, 66 | return_attention_mask=True, 67 | ) 68 | token_ids = np.array(encoded["input_ids"]) 69 | token_mask = np.array(encoded["attention_mask"]) 70 | return (token_ids, token_mask) 71 | 72 | def decode_sequence(self, tokens_sequence: List[int]) -> str: 73 | return self.vocab.decode(tokens_sequence) 74 | 75 | def decode_sequences(self, tokens_sequences: Iterable[List[int]]) -> List[str]: 76 | return self.vocab.decode_batch(tokens_sequences) 77 | 78 | def add_special_tokens(self, special_tokens: List[str]) -> bool: 79 | self.vocab.add_special_tokens(special_tokens) 80 | return True 81 | 82 | 83 | class BertTokenizerRecordable(PreTrainedTokenizerRecordable): 84 | def __init__(self, vocab: BertTokenizer): 85 | super(BertTokenizerRecordable, self).__init__(vocab) 86 | 87 | def save(self, output_dir: Union[Path, str]) -> bool: 88 | full_dir = Path(output_dir) / instance_full_classname(self) 89 | logger.debug(f"Saving BertTokenizerRecordable to {full_dir}") 90 | os.makedirs(full_dir, exist_ok=True) 91 | self.vocab.save_pretrained(full_dir) 92 | return True 93 | 94 | @classmethod 95 | def 
load(cls, restore_dir: Union[Path, str]) -> "BertTokenizerRecordable": 96 | full_dir = Path(restore_dir) / full_classname(cls) 97 | logger.debug(f"Loading BertTokenizerRecordable from {full_dir}") 98 | vocab = BertTokenizer.from_pretrained(str(full_dir)) 99 | return BertTokenizerRecordable(vocab) 100 | 101 | 102 | class HuggingfaceBPETokenizerRecordable(TokenizerRecordable): 103 | def __init__(self, tokenizer: CharBPETokenizer): 104 | self.tokenizer = tokenizer 105 | 106 | def tokenize(self, text: str, **kwargs) -> List[str]: 107 | return self.tokenizer.encode(text).tokens 108 | 109 | def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]: 110 | return [self.tokenizer.token_to_id(tok) for tok in tokens] 111 | 112 | def unk_token(self) -> str: 113 | # no access to that in 114 | return "" 115 | 116 | # def pad_token(self) -> str: 117 | # return self.vocab.pad_token() 118 | 119 | def encode_sentence(self, sentence: str, max_length: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]: 120 | enc: Encoding = self.tokenizer.encode(sentence) 121 | if max_length is not None: 122 | enc.truncate(max_length) 123 | enc.pad(max_length) 124 | return np.array(enc.ids), np.array(enc.attention_mask) 125 | 126 | def encode_sentences( 127 | self, sentences: List[str], max_length: Optional[int] = None 128 | ) -> Tuple[np.ndarray, np.ndarray]: 129 | encs = self.tokenizer.encode_batch(sentences) 130 | if max_length is not None: 131 | for enc in encs: 132 | enc.truncate(max_length) 133 | enc.pad(max_length) 134 | # tokens_ids = [np.array(enc.ids) for enc in encs] 135 | # attention_mask = [np.array(enc.attention_mask) for enc in encs] 136 | tokens_ids = [enc.ids for enc in encs] 137 | attention_mask = [enc.attention_mask for enc in encs] 138 | return (np.array(tokens_ids), np.array(attention_mask)) 139 | 140 | def encode_tokens( 141 | self, tokens: Iterable[List[str]], max_length: Optional[int] = None 142 | ) -> Tuple[np.ndarray, np.ndarray]: 143 | # hack... 
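# i.e. the token lists are re-joined with whitespace and re-encoded as plain sentences below, so the original token boundaries are not guaranteed to survive the tokenizer's own pre-tokenization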
144 | sentences = [" ".join(toks) for toks in tokens] 145 | return self.encode_sentences(sentences, max_length) 146 | 147 | def decode_sequence(self, tokens_sequence: List[int]) -> str: 148 | return self.tokenizer.decode(tokens_sequence) 149 | 150 | def decode_sequences(self, tokens_sequences: Iterable[List[int]]) -> List[str]: 151 | return self.tokenizer.decode_batch(tokens_sequences) 152 | 153 | def save(self, output_dir: Union[Path, str]) -> bool: 154 | full_dir = Path(output_dir) / instance_full_classname(self) 155 | logger.debug(f"HuggingfaceBPETokenizerRecordable - Saving to {full_dir}") 156 | os.makedirs(full_dir, exist_ok=True) 157 | 158 | self.tokenizer._tokenizer.model.save(str(full_dir), name=str(instance_full_classname(self))) 159 | return True 160 | 161 | @classmethod 162 | def load(cls, restore_dir: Union[Path, str]) -> HuggingfaceBPETokenizerRecordable: 163 | full_dir = Path(restore_dir) / full_classname(cls) 164 | logger.debug(f"HuggingfaceBPETokenizerRecordable - Loading from {full_dir}") 165 | vocab = str(full_dir / f"{full_classname(cls)}-vocab.json") 166 | merges = str(full_dir / f"{full_classname(cls)}-merges.txt") 167 | tokenizer = CharBPETokenizer( 168 | vocab=vocab, 169 | merges=merges 170 | ) 171 | 172 | return HuggingfaceBPETokenizerRecordable(tokenizer) 173 | 174 | def add_special_tokens(self, special_tokens: List[str]) -> bool: 175 | self.tokenizer.add_special_tokens(special_tokens) 176 | return True 177 | 178 | 179 | def build_huggingface_token_files( 180 | data_dirs: List[Path], 181 | data_params: DatasetParams, 182 | output_path: Union[Path, str], 183 | sample_update: Callable[[str, str, List[str]], str] = default_sample_update, 184 | ) -> Tuple[List[Path], Dict[str, Path]]: 185 | tokenizers_path = Path(output_path) 186 | os.makedirs(tokenizers_path, exist_ok=True) 187 | # build files of strings 188 | lang_ios: Dict[str, Tuple[IO[str], IO[str]]] = {} 189 | 190 | query_files: List[Path] = [] 191 | lang_files: Dict[str, Path] = {} 192 | for (idx, file_path) in enumerate(get_data_files_from_directory(data_dirs)): 193 | logger.info(f"Reading {file_path}") 194 | for raw_sample in read_file_samples(file_path): 195 | lang = raw_sample["language"] 196 | if lang not in lang_ios: 197 | query_file = tokenizers_path / f"{lang}_query.txt" 198 | code_file = tokenizers_path / f"{lang}_code.txt" 199 | lang_ios[lang] = (open(query_file, "w"), open(code_file, "w")) 200 | query_files.append(query_file) 201 | lang_files[lang] = code_file 202 | lang_ios[lang][0].write(sample_update("query", lang, raw_sample["docstring_tokens"])) 203 | lang_ios[lang][1].write(sample_update("code", lang, raw_sample["code_tokens"])) 204 | 205 | return query_files, lang_files 206 | -------------------------------------------------------------------------------- /codenets/codesearchnet/poolers.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from abc import abstractmethod 3 | from torch import nn 4 | import torch 5 | from codenets.recordable import RecordableTorchModule 6 | 7 | 8 | class EmbeddingPooler(RecordableTorchModule): 9 | """ 10 | Compute pooler 11 | 12 | Args: 13 | seq_outputs (torch.tensor): [B x T x D] (B is batch, T is sequence size, D is embedding size) 14 | tokens_mask (torch.tensor): [B x T] 15 | 16 | Returns: 17 | tensor: [B x D] 18 | """ 19 | 20 | @abstractmethod 21 | def forward(self, seq_outputs: torch.Tensor, tokens_mask: torch.Tensor) -> torch.Tensor: 22 | pass 23 | 24 | 25 | class 
MeanPooler(EmbeddingPooler): 26 | def __init__(self, input_size: int = 128, eps: float = 1e-8): 27 | super().__init__() 28 | self.dense = nn.Linear(input_size, 1, bias=False) 29 | self.activation = nn.Sigmoid() 30 | self.eps = eps 31 | 32 | def forward(self, seq_outputs: torch.Tensor, tokens_mask: torch.Tensor) -> torch.Tensor: 33 | # TO TEST 34 | lg = torch.sum(tokens_mask, dim=-1) 35 | mask = tokens_mask.unsqueeze(dim=-1) 36 | seq_outputs_masked = seq_outputs * mask 37 | seq_outputs_sum = torch.sum(seq_outputs_masked, dim=-1) 38 | output = seq_outputs_sum / lg.unsqueeze(dim=-1).clamp(self.eps) 39 | return output 40 | 41 | 42 | class MeanWeightedPooler(EmbeddingPooler): 43 | def __init__(self, input_size: int = 512, eps: float = 1e-8): # default params required for module construction 44 | super().__init__() 45 | self.dense = nn.Linear(input_size, 1, bias=False) 46 | self.activation = nn.Sigmoid() 47 | self.eps = eps 48 | 49 | def forward(self, seq_outputs: torch.Tensor, tokens_mask: torch.Tensor) -> torch.Tensor: 50 | token_weights = self.activation(self.dense(seq_outputs)) # B x T x 1 51 | token_weights = token_weights * tokens_mask.unsqueeze(dim=-1) # B x T x 1 52 | # sum on the T dimension 53 | seq_weighted_sum = torch.sum(seq_outputs * token_weights, dim=1) # B x D 54 | output = seq_weighted_sum / torch.sum(token_weights, dim=1).clamp(min=self.eps) 55 | return output 56 | -------------------------------------------------------------------------------- /codenets/codesearchnet/predictions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Usage: 4 | train.py [options] SAVE_FOLDER TRAIN_DATA_PATH VALID_DATA_PATH TEST_DATA_PATH 5 | train.py [options] [SAVE_FOLDER] 6 | 7 | *_DATA_PATH arguments may either accept (1) directory filled with .jsonl.gz files that we use as data, 8 | or a (2) plain text file containing a list of such directories (used for multi-language training). 9 | 10 | In the case that you supply a (2) plain text file, all directory names must be separated by a newline. 11 | For example, if you want to read from multiple directories you might have a plain text file called 12 | data_dirs_train.txt with the below contents: 13 | 14 | > cat ~/src/data_dirs_train.txt 15 | azure://semanticcodesearch/pythondata/Processed_Data/jsonl/train 16 | azure://semanticcodesearch/csharpdata/split/csharpCrawl-train 17 | 18 | Options: 19 | -h --help Show this screen. 20 | --restore DIR specify restoration dir. 21 | --wandb_run_id // Specify Wandb Run 22 | --debug Enable debug routines. 
[default: False] 23 | """ 24 | 25 | import os 26 | import sys 27 | from pathlib import Path 28 | from typing import Tuple 29 | import torch 30 | import numpy as np 31 | from docopt import docopt 32 | from dpu_utils.utils import run_and_debug 33 | from loguru import logger 34 | import pandas as pd 35 | from annoy import AnnoyIndex 36 | from tqdm import tqdm 37 | import shutil 38 | from wandb.apis import InternalApi 39 | import wandb 40 | from codenets.codesearchnet.training_ctx import CodeSearchTrainingContext 41 | 42 | 43 | def compute_code_encodings_from_defs( 44 | language: str, training_ctx: CodeSearchTrainingContext, lang_token: str, batch_length: int = 1024 45 | ) -> Tuple[pd.DataFrame, pd.DataFrame]: 46 | logger.info(f"Computing Encoding for language: {language}") 47 | lang_id = training_ctx.train_data_params.lang_ids[language] 48 | h5_file = ( 49 | training_ctx.pickle_path 50 | / f"{language}_{training_ctx.training_full_name}_dedupe_definitions_v2_codes_encoded.h5" 51 | ) 52 | root_data_path = Path(training_ctx.conf["dataset.root_dir"]) 53 | 54 | def_file = root_data_path / f"data/{language}_dedupe_definitions_v2.pkl" 55 | definitions_df = pd.DataFrame(pd.read_pickle(open(def_file, "rb"), compression=None)) 56 | cols_to_remove = list(definitions_df.columns.difference(["function_tokens", "identifier", "url"])) 57 | for col in cols_to_remove: 58 | del definitions_df[col] 59 | # definitions_df.drop(cols_to_remove, inplace=True, axis=1) 60 | logger.debug(f"definitions_df {definitions_df.columns}") 61 | 62 | if not os.path.exists(h5_file): 63 | logger.info(f"Building encodings of code from {def_file}") 64 | 65 | # function_tokens = definitions_df["function_tokens"] 66 | # add language and lang_token () to tokens 67 | definitions_df["function_tokens"] = definitions_df["function_tokens"].apply( 68 | lambda row: [language, lang_token] + row 69 | ) 70 | function_tokens_batch = definitions_df["function_tokens"].groupby( 71 | np.arange(len(definitions_df["function_tokens"])) // batch_length 72 | ) 73 | 74 | code_embeddings = [] 75 | for g, df_batch in tqdm(function_tokens_batch): 76 | # logger.debug(f"df_batch {df_batch.values}") 77 | codes_encoded, codes_masks = training_ctx.tokenize_code_tokens( 78 | df_batch.values, max_length=training_ctx.conf["dataset.common_params.code_max_num_tokens"] 79 | ) 80 | 81 | # codes_encoded_t = torch.tensor(codes_encoded, dtype=torch.long).to(training_ctx.device) 82 | # codes_masks_t = torch.tensor(codes_masks, dtype=torch.long).to(training_ctx.device) 83 | 84 | # logger.debug(f"codes_encoded_t {codes_encoded_t}") 85 | # logger.debug(f"codes_masks_t {codes_masks_t}") 86 | 87 | emb_df = pd.DataFrame( 88 | training_ctx.encode_code( 89 | lang_id=lang_id, 90 | code_tokens=codes_encoded, 91 | code_tokens_mask=codes_masks 92 | ) 93 | # .cpu() 94 | # .numpy() 95 | ) 96 | # logger.debug(f"codes_encoded_t:{codes_encoded_t.shape} codes_masks_t:{codes_masks_t.shape}") 97 | if g < 2: 98 | logger.debug(f"emb_df {emb_df.head()}") 99 | code_embeddings.append(emb_df) 100 | 101 | # free memory or it explodes on 32GB... 
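# Dropping the tokenized column once all batch embeddings are computed leaves only
# "identifier" and "url" in the definitions frame before the per-batch embedding
# frames are concatenated below, which keeps peak memory bounded.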
102 | del definitions_df["function_tokens"] 103 | 104 | code_embeddings_df = pd.concat(code_embeddings) 105 | 106 | logger.debug(f"code_embeddings_df {code_embeddings_df.head(20)}") 107 | 108 | code_embeddings_df.to_hdf(h5_file, key="code_embeddings_df", mode="w") 109 | return (code_embeddings_df, definitions_df) 110 | else: 111 | code_embeddings_df = pd.read_hdf(h5_file, key="code_embeddings_df") 112 | return (code_embeddings_df, definitions_df) 113 | 114 | 115 | def run(args, tag_in_vcs=False) -> None: 116 | args_wandb_run_id = args["--wandb_run_id"] 117 | if args_wandb_run_id is not None: 118 | entity, project, name = args_wandb_run_id.split("/") 119 | os.environ["WANDB_RUN_ID"] = name 120 | os.environ["WANDB_RESUME"] = "must" 121 | 122 | wandb_api = wandb.Api() 123 | # retrieve saved model from W&B for this run 124 | logger.info("Fetching run from W&B...") 125 | try: 126 | wandb_api.run(args_wandb_run_id) 127 | except wandb.CommError: 128 | logger.error(f"ERROR: Problem querying W&B for wandb_run_id: {args_wandb_run_id}", file=sys.stderr) 129 | sys.exit(1) 130 | 131 | else: 132 | os.environ["WANDB_MODE"] = "dryrun" 133 | 134 | logger.debug("Building Training Context") 135 | training_ctx: CodeSearchTrainingContext 136 | restore_dir = args["--restore"] 137 | logger.info(f"Restoring Training Context from directory{restore_dir}") 138 | training_ctx = CodeSearchTrainingContext.build_context_from_dir(restore_dir) 139 | 140 | queries = pd.read_csv(training_ctx.queries_file) 141 | queries = list(map(lambda q: f" {q}", queries["query"].values)) 142 | queries_tokens, queries_masks = training_ctx.tokenize_query_sentences( 143 | queries, max_length=training_ctx.conf["dataset.common_params.query_max_num_tokens"] 144 | ) 145 | logger.info(f"queries: {queries}") 146 | 147 | training_ctx.eval_mode() 148 | with torch.no_grad(): 149 | query_embeddings = ( 150 | training_ctx.encode_query( 151 | query_tokens=queries_tokens, 152 | query_tokens_mask=queries_masks, 153 | ) 154 | # .cpu() 155 | # .numpy() 156 | ) 157 | logger.info(f"query_embeddings: {query_embeddings.shape}") 158 | 159 | topk = 100 160 | language_token = "" 161 | for lang_idx, language in enumerate( 162 | ("python", "go", "javascript", "java", "php", "ruby") 163 | # ("php", "ruby") 164 | ): # in enumerate(("python", "go", "javascript", "java", "php", "ruby")): 165 | predictions = [] 166 | # (codes_encoded_df, codes_masks_df, definitions) = get_language_defs(language, training_ctx, language_token) 167 | 168 | code_embeddings, definitions = compute_code_encodings_from_defs( 169 | language, training_ctx, language_token, batch_length=512 170 | ) 171 | logger.info(f"Building Annoy Index of length {len(code_embeddings.values[0])}") 172 | indices: AnnoyIndex = AnnoyIndex(len(code_embeddings.values[0]), "angular") 173 | # idx = 0 174 | for index, emb in enumerate(tqdm(code_embeddings.values)): 175 | indices.add_item(index, emb) 176 | indices.build(10) 177 | 178 | for i, (query, query_embedding) in enumerate(tqdm(zip(queries, query_embeddings))): 179 | idxs, distances = indices.get_nns_by_vector(query_embedding, topk, include_distances=True) 180 | for idx2, _ in zip(idxs, distances): 181 | predictions.append( 182 | (query, language, definitions.iloc[idx2]["identifier"], definitions.iloc[idx2]["url"]) 183 | ) 184 | 185 | logger.info(f"predictions {predictions[0]}") 186 | 187 | df = pd.DataFrame(predictions, columns=["query", "language", "identifier", "url"]) 188 | # BUT WHY DOESNT IT WORK AS EXPECTED???? 
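# NOTE: pandas str.replace substitutes every occurrence, so replacing " " with ""
# strips all whitespace from the query text rather than only a leading marker token,
# which may be why the cleanup below does not behave as expected. If the intent is
# only to drop a leading marker, an anchored regex such as
# df["query"].str.replace(r"^<qry> ", "", regex=True)  # marker name hypothetical
# would touch just the prefix.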
189 | df["query"] = df["query"].str.replace(" ", "") 190 | df["identifier"] = df["identifier"].str.replace(",", "") 191 | df["identifier"] = df["identifier"].str.replace('"', "") 192 | df["identifier"] = df["identifier"].str.replace(";", "") 193 | df.to_csv( 194 | training_ctx.output_dir / f"model_predictions_{training_ctx.training_tokenizer_type}.csv", 195 | index=False, 196 | header=True if lang_idx == 0 else False, 197 | # mode="w" if lang_idx == 0 else "a", 198 | mode="a", 199 | ) 200 | # Free memory 201 | del code_embeddings 202 | del definitions 203 | del predictions 204 | 205 | if args_wandb_run_id is not None: 206 | logger.info("Uploading predictions to W&B") 207 | # upload model predictions CSV file to W&B 208 | 209 | entity, project, name = args_wandb_run_id.split("/") 210 | 211 | # make sure the file is in our cwd, with the correct name 212 | predictions_csv = training_ctx.output_dir / f"model_predictions_{training_ctx.training_tokenizer_type}.csv" 213 | predictions_base_csv = "model_predictions.csv" 214 | shutil.copyfile(predictions_csv, predictions_base_csv) 215 | 216 | # Using internal wandb API. TODO: Update when available as a public API 217 | internal_api = InternalApi() 218 | internal_api.push([predictions_base_csv], run=name, entity=entity, project=project) 219 | 220 | 221 | if __name__ == "__main__": 222 | args = docopt(__doc__) 223 | run_and_debug(lambda: run(args), args["--debug"]) 224 | -------------------------------------------------------------------------------- /codenets/codesearchnet/query_1_code_1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/codesearchnet/query_1_code_1/__init__.py -------------------------------------------------------------------------------- /codenets/codesearchnet/query_1_code_1/model.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | from typing import MutableMapping, Optional, Union, Type 5 | 6 | import numpy as np 7 | from transformers import BertConfig, BertModel 8 | 9 | from codenets.codesearchnet.poolers import MeanWeightedPooler 10 | from codenets.codesearchnet.huggingface.models import PreTrainedModelRecordable 11 | from codenets.recordable import ( 12 | Recordable, 13 | RecordableTorchModule, 14 | runtime_load_recordable_mapping, 15 | save_recordable_mapping, 16 | ) 17 | from codenets.utils import full_classname, instance_full_classname 18 | from pyhocon import ConfigTree 19 | 20 | 21 | class Query1Code1(RecordableTorchModule): 22 | """ 23 | A generic Pytorch Model with: 24 | - one single-branch query encoder 25 | - one single-branch code encoder 26 | - one optional pooler to pool output embeddings from any branch 27 | """ 28 | 29 | def __init__( 30 | self, 31 | query_encoder: RecordableTorchModule, 32 | code_encoder: RecordableTorchModule, 33 | pooler: Optional[RecordableTorchModule] = None, 34 | ): 35 | super(Query1Code1, self).__init__() 36 | self.code_encoder = code_encoder 37 | self.query_encoder = query_encoder 38 | self.pooler = pooler 39 | 40 | def save(self, output_dir: Union[Path, str]) -> bool: 41 | d = Path(output_dir) / instance_full_classname(self) 42 | records: MutableMapping[str, Recordable] = { 43 | "query_encoder": self.query_encoder, 44 | "code_encoder": self.code_encoder, 45 | } 46 | if self.pooler is not None: 47 | records["pooler"] = self.pooler 48 | 
return save_recordable_mapping(output_dir=d, records=records) 49 | 50 | @classmethod 51 | def load(cls, restore_dir: Union[Path, str]) -> Query1Code1: 52 | d = Path(restore_dir) / full_classname(cls) 53 | records = runtime_load_recordable_mapping(d) 54 | return cls(**records) # type:ignore[arg-type] 55 | 56 | def forward( 57 | self, 58 | languages: np.ndarray, 59 | query_tokens: np.ndarray, 60 | query_tokens_mask: np.ndarray, 61 | code_tokens: np.ndarray, 62 | code_tokens_mask: np.ndarray, 63 | ): 64 | # lang_id = str(languages[0].item()) 65 | query_seq_outputs = self.query_encoder(query_tokens, query_tokens_mask) # [B x S x H] 66 | code_seq_outputs = self.code_encoder(code_tokens, code_tokens_mask) # [B x S x H] 67 | 68 | if self.pooler is not None: 69 | return ( 70 | self.pooler(query_seq_outputs[0], query_tokens_mask), 71 | self.pooler(code_seq_outputs[0], code_tokens_mask), 72 | ) 73 | else: 74 | # use already pooled data (need to be pretrained as it uses 1st (CLS) token logit) 75 | return query_seq_outputs[1], code_seq_outputs[1] 76 | 77 | def encode_query(self, query_tokens: np.ndarray, query_tokens_mask: np.ndarray) -> np.ndarray: 78 | query_seq_outputs = self.query_encoder(query_tokens, query_tokens_mask) 79 | 80 | if self.pooler is not None: 81 | return self.pooler(query_seq_outputs[0], query_tokens_mask) 82 | else: 83 | return query_seq_outputs[1] 84 | 85 | def encode_code(self, lang_id: int, code_tokens: np.ndarray, code_tokens_mask: np.ndarray) -> np.ndarray: 86 | code_seq_outputs = self.code_encoder(code_tokens, code_tokens_mask) 87 | if self.pooler is not None: 88 | return self.pooler(code_seq_outputs[0], code_tokens_mask) 89 | else: 90 | return code_seq_outputs[1] 91 | 92 | def tokenize_code(self, lang_id: int, code_tokens: np.ndarray, code_tokens_mask: np.ndarray) -> np.ndarray: 93 | code_seq_outputs = self.code_encoder(code_tokens, code_tokens_mask) 94 | if self.pooler is not None: 95 | return self.pooler(code_seq_outputs[0], code_tokens_mask) 96 | else: 97 | return code_seq_outputs[1] 98 | 99 | @classmethod 100 | def from_hocon(cls: Type[Query1Code1], config: ConfigTree) -> Query1Code1: 101 | """Load Query1Code1_CodeSearchModel from a config tree""" 102 | 103 | query_bert_config = BertConfig(**config["training.model.query_encoder"]) 104 | query_encoder = PreTrainedModelRecordable(BertModel(query_bert_config)) 105 | code_bert_config = BertConfig(**config["training.model.code_encoder"]) 106 | code_encoder = PreTrainedModelRecordable(BertModel(code_bert_config)) 107 | 108 | model = Query1Code1( 109 | query_encoder=query_encoder, 110 | code_encoder=code_encoder, 111 | pooler=MeanWeightedPooler(input_size=query_bert_config.hidden_size), 112 | ) 113 | 114 | return model 115 | -------------------------------------------------------------------------------- /codenets/codesearchnet/query_1_code_n/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/codesearchnet/query_1_code_n/__init__.py -------------------------------------------------------------------------------- /codenets/codesearchnet/query_code_siamese/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/codenets/codesearchnet/query_code_siamese/__init__.py -------------------------------------------------------------------------------- 
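Before the siamese variant below, a minimal self-contained sketch may help clarify how a dual-encoder of the Query1Code1 kind is typically used at inference time: each branch maps token ids to [B x T x D] embeddings, a mask-aware weighted pooling (in the spirit of MeanWeightedPooler) collapses them to one vector per sequence, and query/code vectors are ranked by cosine similarity. The toy encoder, vocabulary size and tensor shapes below are illustrative assumptions, not the repository's actual BERT branches or training context.

# Illustrative sketch only: a toy dual encoder with masked weighted pooling and
# cosine ranking, mirroring the Query1Code1 + MeanWeightedPooler pattern above.
import torch
from torch import nn

class ToyEncoder(nn.Module):
    """Stand-in for a BERT branch: embeds token ids into [B x T x D]."""
    def __init__(self, vocab_size: int = 1000, hidden_size: int = 64):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)

    def forward(self, token_ids: torch.Tensor) -> torch.Tensor:
        return self.embed(token_ids)

def weighted_pool(seq: torch.Tensor, mask: torch.Tensor, dense: nn.Linear, eps: float = 1e-8) -> torch.Tensor:
    """Mask-aware weighted mean over the token dimension, as in MeanWeightedPooler."""
    weights = torch.sigmoid(dense(seq)) * mask.unsqueeze(-1)                  # B x T x 1
    return (seq * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=eps)     # B x D

hidden = 64
query_encoder, code_encoder = ToyEncoder(hidden_size=hidden), ToyEncoder(hidden_size=hidden)
pooler_dense = nn.Linear(hidden, 1, bias=False)

# Fake batch: 2 queries and 3 candidate code snippets, already tokenized and padded.
query_ids, query_mask = torch.randint(0, 1000, (2, 16)), torch.ones(2, 16)
code_ids, code_mask = torch.randint(0, 1000, (3, 32)), torch.ones(3, 32)

with torch.no_grad():
    q = weighted_pool(query_encoder(query_ids), query_mask, pooler_dense)   # 2 x D
    c = weighted_pool(code_encoder(code_ids), code_mask, pooler_dense)      # 3 x D
    # Cosine similarity matrix between every query and every code snippet.
    scores = nn.functional.normalize(q, dim=-1) @ nn.functional.normalize(c, dim=-1).T
    best = scores.argmax(dim=-1)  # index of the best-scoring snippet per query
print(scores.shape, best)

The real pipeline follows the same shape: training_ctx.encode_query / encode_code produce the pooled vectors, and predictions.py replaces the dense similarity matrix with an Annoy index over the code embeddings.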
/codenets/codesearchnet/query_code_siamese/model.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | from typing import MutableMapping, Optional, Union, Type 5 | 6 | import numpy as np 7 | from loguru import logger 8 | from transformers import BertConfig, BertModel, AlbertConfig, AlbertModel 9 | 10 | from codenets.codesearchnet.poolers import MeanWeightedPooler 11 | from codenets.codesearchnet.huggingface.models import PreTrainedModelRecordable 12 | from codenets.recordable import ( 13 | Recordable, 14 | RecordableTorchModule, 15 | runtime_load_recordable_mapping, 16 | save_recordable_mapping, 17 | ) 18 | from codenets.utils import full_classname, instance_full_classname 19 | from pyhocon import ConfigTree 20 | 21 | 22 | class QueryCodeSiamese(RecordableTorchModule): 23 | """ 24 | A generic Pytorch Model with: 25 | - one single-branch query encoder 26 | - one single-branch code encoder 27 | - one optional pooler to pool output embeddings from any branch 28 | """ 29 | def __init__(self, encoder: RecordableTorchModule, pooler: Optional[RecordableTorchModule] = None): 30 | super(QueryCodeSiamese, self).__init__() 31 | self.encoder = encoder 32 | self.pooler = pooler 33 | 34 | def save(self, output_dir: Union[Path, str]) -> bool: 35 | d = Path(output_dir) / instance_full_classname(self) 36 | records: MutableMapping[str, Recordable] = {"encoder": self.encoder} 37 | if self.pooler is not None: 38 | records["pooler"] = self.pooler 39 | return save_recordable_mapping(output_dir=d, records=records) 40 | 41 | @classmethod 42 | def load(cls, restore_dir: Union[Path, str]) -> QueryCodeSiamese: 43 | d = Path(restore_dir) / full_classname(cls) 44 | records = runtime_load_recordable_mapping(d) 45 | return cls(**records) # type: ignore[arg-type] 46 | 47 | def forward( 48 | self, 49 | languages: np.ndarray, 50 | query_tokens: np.ndarray, 51 | query_tokens_mask: np.ndarray, 52 | code_tokens: np.ndarray, 53 | code_tokens_mask: np.ndarray, 54 | lang_weights: np.ndarray, 55 | ): 56 | # lang_id = str(languages[0].item()) 57 | query_seq_outputs = self.encoder(query_tokens, query_tokens_mask) # [B x S x H] 58 | code_seq_outputs = self.encoder(code_tokens, code_tokens_mask) # [B x S x H] 59 | if self.pooler is not None: 60 | return ( 61 | self.pooler(query_seq_outputs[0], query_tokens_mask), 62 | self.pooler(code_seq_outputs[0], code_tokens_mask), 63 | ) 64 | else: 65 | # use already pooled data (need to be pretrained as it uses 1st (CLS) token logit) 66 | return query_seq_outputs[1], code_seq_outputs[1] 67 | 68 | def encode_query(self, query_tokens: np.ndarray, query_tokens_mask: np.ndarray) -> np.ndarray: 69 | query_seq_outputs = self.encoder(query_tokens, query_tokens_mask) 70 | 71 | if self.pooler is not None: 72 | return self.pooler(query_seq_outputs[0], query_tokens_mask) 73 | else: 74 | return query_seq_outputs[1] 75 | 76 | def encode_code(self, lang_id: int, code_tokens: np.ndarray, code_tokens_mask: np.ndarray) -> np.ndarray: 77 | code_seq_outputs = self.encoder(code_tokens, code_tokens_mask) 78 | if self.pooler is not None: 79 | return self.pooler(code_seq_outputs[0], code_tokens_mask) 80 | else: 81 | return code_seq_outputs[1] 82 | 83 | def tokenize_code(self, lang_id: int, code_tokens: np.ndarray, code_tokens_mask: np.ndarray) -> np.ndarray: 84 | code_seq_outputs = self.encoder(code_tokens, code_tokens_mask) 85 | if self.pooler is not None: 86 | return self.pooler(code_seq_outputs[0], 
code_tokens_mask) 87 | else: 88 | return code_seq_outputs[1] 89 | 90 | @classmethod 91 | def from_hocon(cls: Type[QueryCodeSiamese], config: ConfigTree) -> QueryCodeSiamese: 92 | """Load Query1Code1_CodeSearchModel from a config tree""" 93 | 94 | if "training.model.encoder.type" in config: 95 | if config["training.model.encoder.type"] == "albert": 96 | logger.info("Creating QueryCodeSiamese with Albert encoder") 97 | albert_config = AlbertConfig(**config["training.model.encoder"]) 98 | encoder = PreTrainedModelRecordable(AlbertModel(albert_config)) 99 | elif config["training.model.encoder.type"] == "bert": 100 | logger.info("Creating QueryCodeSiamese with Bert encoder") 101 | bert_config = BertConfig(**config["training.model.encoder"]) 102 | encoder = PreTrainedModelRecordable(BertModel(bert_config)) 103 | else: 104 | # default is BERT now 105 | logger.info("Creating QueryCodeSiamese with Bert encoder") 106 | bert_config = BertConfig(**config["training.model.encoder"]) 107 | encoder = PreTrainedModelRecordable(BertModel(bert_config)) 108 | 109 | model = QueryCodeSiamese( 110 | encoder=encoder, pooler=MeanWeightedPooler(input_size=config["training.model.encoder.hidden_size"]) 111 | ) 112 | 113 | return model 114 | -------------------------------------------------------------------------------- /codenets/codesearchnet/sbert_build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Usage: 4 | eval.py [options] SAVE_FOLDER TRAIN_DATA_PATH VALID_DATA_PATH TEST_DATA_PATH 5 | eval.py [options] [SAVE_FOLDER] 6 | 7 | *_DATA_PATH arguments may either accept (1) directory filled with .jsonl.gz files that we use as data, 8 | or a (2) plain text file containing a list of such directories (used for multi-language training). 9 | 10 | In the case that you supply a (2) plain text file, all directory names must be separated by a newline. 11 | For example, if you want to read from multiple directories you might have a plain text file called 12 | data_dirs_train.txt with the below contents: 13 | 14 | > cat ~/src/data_dirs_train.txt 15 | azure://semanticcodesearch/pythondata/Processed_Data/jsonl/train 16 | azure://semanticcodesearch/csharpdata/split/csharpCrawl-train 17 | 18 | Options: 19 | -h --help Show this screen. 20 | --config FILE Specify HOCON config file. 21 | --debug Enable debug routines. 
[default: False] 22 | """ 23 | 24 | from typing import Dict, List 25 | from sentence_transformers import SentenceTransformer 26 | from dpu_utils.utils import run_and_debug 27 | from docopt import docopt 28 | from loguru import logger 29 | import itertools 30 | import os 31 | import pickle 32 | from torch.utils.data import DataLoader 33 | from pathlib import Path 34 | from pyhocon import ConfigFactory 35 | from torch import nn 36 | from torch import Tensor 37 | import torch 38 | import numpy as np 39 | import pandas as pd 40 | 41 | from tree_sitter import Language, Parser 42 | from codenets.codesearchnet.copied_code.utils import read_file_samples 43 | from sklearn.metrics.pairwise import pairwise_distances 44 | from codenets.codesearchnet.dataset_utils import BalancedBatchSchedulerSampler, DatasetType 45 | from codenets.codesearchnet.data import DatasetParams 46 | from codenets.codesearchnet.training_ctx import CodeSearchTrainingContext 47 | from codenets.codesearchnet.query_code_siamese.dataset import load_data_from_dirs 48 | 49 | """Evaluating SBert.""" 50 | 51 | 52 | def run(args, tag_in_vcs=False) -> None: 53 | # os.environ["WANDB_MODE"] = "dryrun" 54 | 55 | logger.debug("Building Training Context") 56 | conf_file = args["--config"] 57 | conf = ConfigFactory.parse_file(conf_file) 58 | 59 | logger.info(f"Restoring Training Context from config {conf_file}") 60 | training_ctx = CodeSearchTrainingContext.build_context_from_hocon(conf) 61 | 62 | # val_dataset = training_ctx.build_lang_dataset(DatasetType.VAL) 63 | # if val_dataset.collate_fn is not None: 64 | # val_dataloader = DataLoader( 65 | # dataset=val_dataset, 66 | # batch_size=conf["training.batch_size.val"], 67 | # sampler=BalancedBatchSchedulerSampler(dataset=val_dataset, batch_size=conf["training.batch_size.val"]), 68 | # collate_fn=val_dataset.collate_fn, 69 | # ) 70 | # else: 71 | # val_dataloader = DataLoader( 72 | # dataset=val_dataset, 73 | # batch_size=conf["training.batch_size.val"], 74 | # sampler=BalancedBatchSchedulerSampler(dataset=val_dataset, batch_size=conf["training.batch_size.val"]), 75 | # ) 76 | 77 | val_dataloader = training_ctx.build_lang_dataloader(DatasetType.VAL) 78 | logger.info(f"val_dataloader [{len(val_dataloader)} samples]") 79 | 80 | # train_dataloader = training_ctx.build_lang_dataloader(DatasetType.TRAIN) 81 | # logger.info(f"train_dataloader [{len(train_dataloader)} samples]") 82 | 83 | # df = pd.read_parquet("./pickles/train_qc_30k_embeddings.parquet") 84 | # print(df.info()) 85 | 86 | # z = df.iloc[0][0] 87 | # print("z", z.shape) 88 | from annoy import AnnoyIndex 89 | 90 | t = AnnoyIndex(768, "angular") 91 | # for index, row in df.iterrows(): 92 | # print(row.shape) 93 | # t.add_item(index, row[0]) 94 | # t.build(10) # 10 trees 95 | # t.save("./pickles/train_qc_30k_embeddings.ann") 96 | 97 | t.load("./pickles/val_qc_30k_embeddings.ann") 98 | 99 | # for i in range(0, 100): 100 | # print(i, 99, 1.0 - t.get_distance(i, 99)) 101 | 102 | for batch in val_dataloader: # itertools.islice(val_dataloader, 0, 1000): 103 | indices, languages, similarity, query_tokens, query_tokens_mask, code_tokens, code_tokens_mask, code_lang_weights = ( 104 | batch 105 | ) 106 | toks = [toks.cpu().numpy()[: len(mask[mask != 0])] for (toks, mask) in zip(query_tokens, query_tokens_mask)] 107 | toks = training_ctx.decode_query_tokens(toks) 108 | qs = [str((t, score)) for (t, score) in list(zip(toks, similarity))] 109 | for i, scores in enumerate(similarity): 110 | for j, s in enumerate(scores): 111 | if s > 0.5 and i != j: 
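# For each batch, this prints every pair of distinct samples whose precomputed
# similarity score exceeds 0.5 alongside their decoded query tokens, presumably to
# eyeball near-duplicate docstrings in the validation data.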
112 | print(s, toks[i], toks[j]) 113 | 114 | # print("query", "\n".join(qs)) 115 | 116 | # # print("query_tokens", query_tokens) 117 | # # 5 for removing " " 118 | # toks = [toks.cpu().numpy()[: len(mask[mask != 0])] for (toks, mask) in zip(query_tokens, query_tokens_mask)] 119 | # toks = training_ctx.decode_query_tokens(toks) 120 | # # print("toks", toks) 121 | # qs = [str((t, score)) for (t, score) in list(zip(toks, similarity))] 122 | # print("query", "\n".join(qs)) 123 | # print("-----------") 124 | 125 | # data_file = ( 126 | # "/home/mandubian/workspaces/tools/CodeSearchNet/resources/data/python/final/jsonl/valid/python_valid_0.jsonl.gz" 127 | # ) 128 | # filename = os.path.basename(data_file) 129 | # file_language = filename.split("_")[0] 130 | 131 | # samples = list(read_file_samples(data_file)) 132 | 133 | # sample0 = samples[0] 134 | # sample1 = samples[1] 135 | # logger.info(f"keys {sample0.keys()}") 136 | # logger.info(f"sample docstring {sample0['docstring_tokens']}") 137 | # query0 = " ".join(samples[0]["docstring_tokens"]) 138 | # logger.info(f"query0 {query0}") 139 | # query_embeddings0 = model.encode([query0]) 140 | # # logger.info(f"query_embeddings0 {query_embeddings0}") 141 | # query1 = " ".join(sample1["docstring_tokens"]) 142 | # query_embeddings1 = model.encode([query1]) 143 | 144 | # distances = pairwise_distances(query_embeddings0, query_embeddings1, metric="cosine") 145 | # logger.info(f"distances {distances}") 146 | 147 | # Language.build_library( 148 | # # Store the library in the `build` directory 149 | # "build/my-languages.so", 150 | # # Include one or more languages 151 | # [ 152 | # "vendor/tree-sitter-go", 153 | # "vendor/tree-sitter-java", 154 | # "vendor/tree-sitter-javascript", 155 | # "vendor/tree-sitter-python", 156 | # "vendor/tree-sitter-php", 157 | # "vendor/tree-sitter-ruby", 158 | # ], 159 | # ) 160 | # PY_LANGUAGE = Language("build/my-languages.so", "python") 161 | # parser = Parser() 162 | # parser.set_language(PY_LANGUAGE) 163 | # tree = parser.parse(bytes(samples[0]["code"], "utf8")) 164 | 165 | # logger.info(f"tree {tree}") 166 | 167 | 168 | if __name__ == "__main__": 169 | args = docopt(__doc__) 170 | run_and_debug(lambda: run(args), args["--debug"]) 171 | -------------------------------------------------------------------------------- /codenets/codesearchnet/tokenizer_build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Usage: 4 | tokenizers_huggingface_build.py [options] 5 | tokenizers_huggingface_build.py [options] 6 | 7 | Options: 8 | -h --help Show this screen. 9 | --config FILE Specify HOCON config file. [default: ./conf/default.conf] 10 | --debug Enable debug routines. 
[default: False] 11 | """ 12 | 13 | 14 | from docopt import docopt 15 | from loguru import logger 16 | import sys 17 | import torch 18 | from dpu_utils.utils import run_and_debug 19 | from pyhocon import ConfigFactory, ConfigTree 20 | 21 | from codenets.codesearchnet.training_ctx import CodeSearchTrainingContext 22 | from codenets.codesearchnet.tokenizer_recs import build_most_common_tokens 23 | 24 | print("Torch version", torch.__version__) 25 | 26 | logger.remove() 27 | logger.add(sys.stderr, level="DEBUG", colorize=True, backtrace=False) 28 | 29 | 30 | def run(args, tag_in_vcs=False) -> None: 31 | conf_file = args["--config"] 32 | logger.info(f"config file {conf_file}") 33 | 34 | conf: ConfigTree = ConfigFactory.parse_file(conf_file) 35 | logger.info(f"config {conf}") 36 | 37 | # logger.info(f"Build Training Context from config {conf_file}") 38 | # training_ctx = CodeSearchTrainingContext.build_context_from_hocon(conf) 39 | 40 | # training_ctx.build_tokenizers(from_dataset_type=DatasetType.TRAIN) 41 | 42 | logger.info(f"Reload Training Context from config {conf_file} with built tokenizers") 43 | training_ctx = CodeSearchTrainingContext.build_context_from_hocon(conf) 44 | 45 | txt = "python def toto():" 46 | logger.info(f"encoded {training_ctx.tokenize_code_sentences([txt])}") 47 | txt = "go function getCounts() { return 0 }" 48 | logger.info(f"encoded {training_ctx.tokenize_code_sentences([txt])}") 49 | 50 | most_commons = build_most_common_tokens( 51 | training_ctx.train_dirs, training_ctx.train_data_params, training_ctx.tokenizers_build_path, 52 | parallelize=False 53 | ) 54 | logger.info(f"most_commons {most_commons}") 55 | 56 | 57 | if __name__ == "__main__": 58 | args = docopt(__doc__) 59 | run_and_debug(lambda: run(args), args["--debug"]) 60 | -------------------------------------------------------------------------------- /codenets/codesearchnet/tokenizer_recs.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | 3 | from typing import Iterable, List, Optional, Tuple, Dict, cast 4 | import numpy as np 5 | import os 6 | from loguru import logger 7 | from pathlib import Path 8 | import pickle 9 | 10 | import time 11 | 12 | from pyhocon import ConfigTree 13 | from codenets.recordable import Recordable, RecordableMapping, DictRecordable 14 | from codenets.codesearchnet.data import DatasetParams 15 | from codenets.codesearchnet.copied_code.metadata import Metadata, append_metadata, build_tokenizer_metadata 16 | 17 | 18 | class TokenizerRecordable(Recordable): 19 | @abstractmethod 20 | def tokenize(self, text: str, **kwargs) -> List[str]: 21 | pass 22 | 23 | @abstractmethod 24 | def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]: 25 | pass 26 | 27 | @abstractmethod 28 | def unk_token(self) -> str: 29 | pass 30 | 31 | # @abstractmethod 32 | # def pad_token(self) -> str: 33 | # pass 34 | 35 | @abstractmethod 36 | def encode_sentence(self, sentence: str, max_length: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]: 37 | pass 38 | 39 | @abstractmethod 40 | def encode_sentences( 41 | self, sentences: List[str], max_length: Optional[int] = None 42 | ) -> Tuple[np.ndarray, np.ndarray]: 43 | pass 44 | 45 | @abstractmethod 46 | def encode_tokens( 47 | self, tokens: Iterable[List[str]], max_length: Optional[int] = None 48 | ) -> Tuple[np.ndarray, np.ndarray]: 49 | pass 50 | 51 | @abstractmethod 52 | def decode_sequence(self, tokens_sequence: List[int]) -> str: 53 | pass 54 | 55 | @abstractmethod 56 | def 
decode_sequences(self, tokens_sequences: Iterable[List[int]]) -> List[str]: 57 | pass 58 | 59 | @abstractmethod 60 | def add_special_tokens(self, special_tokens: List[str]) -> bool: 61 | pass 62 | 63 | 64 | def build_most_common_tokens( 65 | data_dirs: List[Path], 66 | data_params: DatasetParams, 67 | build_path: Path, 68 | max_files_per_dir: Optional[int] = None, 69 | parallelize: bool = True, 70 | ) -> Dict[str, List[Tuple[str, int]]]: 71 | 72 | start = time.time() 73 | 74 | logger.info(f"Build metadata for {data_dirs}") 75 | 76 | _, code_language_metadata_lists = build_tokenizer_metadata( 77 | data_dirs=data_dirs, 78 | max_files_per_dir=max_files_per_dir, 79 | parallelize=parallelize, 80 | use_subtokens=data_params.use_subtokens, 81 | mark_subtoken_end=data_params.mark_subtoken_end, 82 | ) 83 | 84 | logger.info("Merging metadata") 85 | 86 | # merge metadata if necessary 87 | per_code_language_metadata: Dict[str, Metadata] = {} 88 | for (language, raw_per_language_metadata) in code_language_metadata_lists.items(): 89 | logger.info(f"Build vocabulary for {language}") 90 | per_code_language_metadata[language] = append_metadata( 91 | "code", 92 | vocab_size=data_params.vocab_size, 93 | vocab_count_threshold=data_params.vocab_count_threshold, 94 | pct_bpe=data_params.pct_bpe, 95 | raw_metadata_list=raw_per_language_metadata, 96 | ) 97 | common_tokens: Dict[str, List[Tuple[str, int]]] = {} 98 | for (language, md) in per_code_language_metadata.items(): 99 | common_tokens[language] = md.common_tokens 100 | 101 | end = time.time() 102 | 103 | time_p = end - start 104 | logger.info(f"Most Common Tokens: {time_p} sec") 105 | 106 | pickle.dump(common_tokens, open("./checkpoints/tmp_common_tokens.p", "wb")) 107 | 108 | common_tokens_dict = DictRecordable(common_tokens) 109 | os.makedirs(build_path, exist_ok=True) 110 | records = RecordableMapping({"common_tokens": common_tokens_dict}) 111 | records.save(build_path) 112 | 113 | return common_tokens_dict 114 | 115 | 116 | def load_query_code_tokenizers_from_hocon(conf: ConfigTree) -> Optional[Tuple[TokenizerRecordable, RecordableMapping]]: 117 | build_path = Path(conf["tokenizers.build_path"]) 118 | 119 | if not os.path.exists(build_path): 120 | logger.error(f"Could find {build_path} where tokenizers should have been built and stored") 121 | return None 122 | 123 | records = RecordableMapping.load(build_path) 124 | if "query_tokenizer" in records and "code_tokenizers" in records: 125 | query_tokenizer = cast(TokenizerRecordable, records["query_tokenizer"]) 126 | code_tokenizers = cast(RecordableMapping, records["code_tokenizers"]) 127 | 128 | return query_tokenizer, code_tokenizers 129 | else: 130 | logger.error(f"Couldn't query_tokenizer/code_tokenizers recordables in path {build_path}") 131 | return None 132 | -------------------------------------------------------------------------------- /codenets/main.py: -------------------------------------------------------------------------------- 1 | """Dummy Main of the project.""" 2 | 3 | 4 | def main(): 5 | print("hello") 6 | 7 | 8 | if __name__ == "__main__": 9 | main() 10 | -------------------------------------------------------------------------------- /codenets/save.py: -------------------------------------------------------------------------------- 1 | """Utils to save Recordables in rotating mode""" 2 | 3 | import os 4 | from pathlib import Path 5 | import shutil 6 | from typing import Union, Type, TypeVar, Optional 7 | from codenets.recordable import Recordable 8 | 9 | 10 | def 
rotating_save_records(path: Union[Path, str], prefix: str, rec: Recordable, nb: int = 5) -> bool: 11 | root_path = Path(path) / prefix 12 | if not os.path.isdir(root_path): 13 | os.makedirs(root_path) 14 | 15 | paths = [] 16 | first_empty_path = None 17 | saved = True 18 | for i in range(nb): 19 | path_i = root_path / f"{prefix}_{i}" 20 | if not os.path.exists(path_i) and first_empty_path is None: 21 | first_empty_path = path_i 22 | os.makedirs(first_empty_path) 23 | paths.append(path_i) 24 | 25 | if first_empty_path is not None: 26 | saved = saved and rec.save(first_empty_path) 27 | else: 28 | first = paths[0] 29 | 30 | shutil.rmtree(first) 31 | for pth in paths[1:]: 32 | os.rename(pth, first) 33 | first = pth 34 | saved = saved and rec.save(paths[-1]) 35 | 36 | return saved 37 | 38 | 39 | def save_records_direct(path: Union[Path, str], rec: Recordable) -> bool: 40 | if not os.path.isdir(path): 41 | os.makedirs(path) 42 | 43 | return rec.save(path) 44 | 45 | 46 | def save_records_best(path: Union[Path, str], rec: Recordable, suffix: Optional[str] = None) -> bool: 47 | prefix = os.path.basename(path) 48 | if suffix is not None: 49 | best_path = Path(path) / f"{prefix}_best_{suffix}" 50 | else: 51 | best_path = Path(path) / f"{prefix}_best" 52 | if not os.path.isdir(best_path): 53 | os.makedirs(best_path) 54 | 55 | return rec.save(best_path) 56 | 57 | 58 | def save_records_last(output_dir: Union[Path, str], rec: Recordable) -> bool: 59 | return rotating_save_records(os.path.dirname(output_dir), os.path.basename(output_dir), rec) 60 | 61 | 62 | Recordable_T = TypeVar("Recordable_T", bound="Recordable") 63 | 64 | 65 | def rotating_recover_records( 66 | cls: Type[Recordable_T], path: Union[Path, str], prefix: str, nb: int = 5 67 | ) -> Optional[Recordable_T]: 68 | last_path = None 69 | for i in range(nb): 70 | path_i = Path(path) / prefix / f"{prefix}_{i}" 71 | if os.path.exists(path_i): 72 | last_path = path_i 73 | 74 | if last_path is not None: 75 | return cls.load(last_path) 76 | else: 77 | return None 78 | 79 | 80 | def recover_records_best( 81 | cls: Type[Recordable_T], recover_dir: Union[Path, str], nb: int = 5, *args, **kwargs 82 | ) -> Optional[Recordable_T]: 83 | prefix = os.path.basename(recover_dir) 84 | best_path = Path(recover_dir) / f"{prefix}_best" 85 | if best_path.exists(): 86 | return cls.load(best_path) 87 | else: 88 | return None 89 | 90 | 91 | def recover_records_direct( 92 | cls: Type[Recordable_T], recover_dir: Union[Path, str], *args, **kwargs 93 | ) -> Optional[Recordable_T]: 94 | p = Path(recover_dir) 95 | if p.exists(): 96 | return cls.load(p) 97 | else: 98 | return None 99 | 100 | 101 | def recover_records_last(cls: Type[Recordable_T], recover_dir: Union[Path, str]) -> Optional[Recordable_T]: 102 | return rotating_recover_records(cls, os.path.dirname(recover_dir), os.path.basename(recover_dir)) 103 | -------------------------------------------------------------------------------- /codenets/tensorboard_utils.py: -------------------------------------------------------------------------------- 1 | # Some 2 | 3 | from tensorboardX import SummaryWriter 4 | from pathlib import Path 5 | import datetime 6 | from loguru import logger 7 | from typing import Dict 8 | 9 | from tensorboard.backend.event_processing import event_accumulator 10 | 11 | 12 | def tensorboard_event_accumulator( 13 | file: str, 14 | loaded_scalars: int = 0, # load all scalars by default 15 | loaded_images: int = 4, # load 4 images by default 16 | loaded_compressed_histograms: int = 500, # load one 
histogram by default 17 | loaded_histograms: int = 1, # load one histogram by default 18 | loaded_audio: int = 4, # loads 4 audio by default 19 | ): 20 | """Read a Tensorboard event_accumulator from a file""" 21 | ea = event_accumulator.EventAccumulator( 22 | file, 23 | size_guidance={ # see below regarding this argument 24 | event_accumulator.COMPRESSED_HISTOGRAMS: loaded_compressed_histograms, 25 | event_accumulator.IMAGES: loaded_images, 26 | event_accumulator.AUDIO: loaded_audio, 27 | event_accumulator.SCALARS: loaded_scalars, 28 | event_accumulator.HISTOGRAMS: loaded_histograms, 29 | }, 30 | ) 31 | ea.Reload() 32 | return ea 33 | 34 | 35 | class Tensorboard: 36 | """ 37 | Tensorboard manager 38 | 39 | This manager is associated to a: 40 | 41 | - experiment 42 | - a unique ID for the current run (one experiment can be run many times) 43 | - groups of metrics (like "train" or "val") 44 | - sub-groups of metrics (like train/bash or val/epoch) 45 | """ 46 | 47 | def __init__(self, experiment_id, output_dir="./runs", unique_id=None, flush_secs=10): 48 | self.experiment_id = experiment_id 49 | self.output_dir = Path(output_dir) 50 | if unique_id is None: 51 | unique_id = datetime.datetime.now().isoformat(timespec="seconds") 52 | self.path = self.output_dir / f"{experiment_id}_{unique_id}" 53 | logger.debug(f"Writing TensorBoard events locally to {self.path}") 54 | self.writers: Dict[str, SummaryWriter] = {} 55 | self.flush_secs = flush_secs 56 | 57 | def _get_writer(self, group: str = "") -> SummaryWriter: 58 | if group not in self.writers: 59 | logger.debug(f"Adding group {group} to writers ({self.writers.keys()})") 60 | self.writers[group] = SummaryWriter(f"{str(self.path)}_{group}", flush_secs=self.flush_secs) 61 | return self.writers[group] 62 | 63 | def add_scalars(self, metrics: dict, global_step: int, group=None, sub_group="") -> None: 64 | for key, val in metrics.items(): 65 | cur_name = "/".join([sub_group, key]) 66 | self._get_writer(group).add_scalar(cur_name, val, global_step) 67 | -------------------------------------------------------------------------------- /codenets/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, List, Optional 2 | import os 3 | import re 4 | from dpu_utils.codeutils import split_identifier_into_parts 5 | 6 | # from dpu_utils.utils import Path 7 | from pathlib import Path 8 | import numpy as np 9 | import glob 10 | import base64 11 | from pickle import dumps, loads 12 | 13 | IDENTIFIER_TOKEN_REGEX = re.compile("[_a-zA-Z][_a-zA-Z0-9]*") 14 | 15 | 16 | def listdir_nohidden_gen(path): 17 | for f in os.listdir(path): 18 | if not f.startswith('.'): 19 | yield f 20 | 21 | 22 | def listdir_nohidden(path): 23 | return list(listdir_nohidden_gen(path)) 24 | 25 | 26 | def runtime_import(class_name: str): 27 | import importlib 28 | 29 | """ 30 | Runtime import from a string using "." to split module & class names 31 | 32 | Args: 33 | class_name (str): the class name to split according to "." and load dynamically modules & class 34 | 35 | Returns: 36 | Class: The imported class 37 | """ 38 | components = class_name.split(".") 39 | print(f">>> class_name {class_name}<<<<") 40 | mod = getattr(importlib.import_module(".".join(components[:-1])), components[-1]) 41 | return mod 42 | 43 | 44 | def full_classname(cls): 45 | """Return full class name with modules""" 46 | return cls.__module__ + "." + cls.__name__ 47 | 48 | 49 | def instance_full_classname(o): 50 | # o.__module__ + "." 
+ o.__class__.__qualname__ is an example in 51 | # this context of H.L. Mencken's "neat, plausible, and wrong." 52 | # Python makes no guarantees as to whether the __module__ special 53 | # attribute is defined, so we take a more circumspect approach. 54 | # Alas, the module name is explicitly excluded from __qualname__ 55 | # in Python 3. 56 | module = o.__class__.__module__ 57 | if module is None or module == str.__class__.__module__: 58 | return o.__class__.__name__ # Avoid reporting __builtin__ 59 | else: 60 | return module + "." + o.__class__.__name__ 61 | 62 | 63 | def _to_subtoken_stream(input_stream: Iterable[str], mark_subtoken_end: bool) -> Iterable[str]: 64 | """Generate chopped strings into sub-tokens strings (like snake-case)""" 65 | for token in input_stream: 66 | if IDENTIFIER_TOKEN_REGEX.match(token): 67 | yield from split_identifier_into_parts(token) 68 | if mark_subtoken_end: 69 | yield "" 70 | else: 71 | yield token 72 | 73 | 74 | def expand_data_path(data_path: str) -> List[Path]: 75 | """ 76 | Expand data path as a simple directory or if a file, searches for directories in the file 77 | 78 | Args: 79 | data_path: A path to either a file or a directory. If it's a file, we interpret it as a list of 80 | data directories. 81 | 82 | Returns: 83 | List of data directories (potentially just data_path) 84 | """ 85 | data_rpath = Path(data_path) 86 | 87 | if data_rpath.is_dir(): 88 | return [data_rpath] 89 | 90 | data_dirs: List[Path] = [] 91 | with open(data_rpath) as f: 92 | for fl in map(Path, f.read().splitlines()): 93 | if fl.is_absolute(): 94 | data_dirs.append(fl) 95 | else: 96 | data_dirs.append(data_rpath.parent / fl) 97 | 98 | # data_dirs.extend(map(Path)) 99 | return data_dirs 100 | 101 | 102 | def get_data_files_from_directory(data_dirs: List[Path], max_files_per_dir: Optional[int] = None) -> List[Path]: 103 | """Search all *.jsonl.gz files in a multiple paths and concatenate them""" 104 | files: List[Path] = [] 105 | for data_dir in data_dirs: 106 | dir_files = [Path(path) for path in glob.iglob(os.path.join(data_dir, "*.jsonl.gz"), recursive=True)] 107 | # dir_files = data_dir.get_filtered_files_in_dir("*.jsonl.gz") 108 | if max_files_per_dir: 109 | dir_files = sorted(dir_files)[: int(max_files_per_dir)] 110 | files += dir_files 111 | 112 | np.random.shuffle(np.array(files)) # This avoids having large_file_0, large_file_1, ... 
subsequences 113 | return files 114 | 115 | 116 | # Some streaming pickles (not used) 117 | 118 | 119 | def stream_dump(iterable_to_pickle, file_obj): 120 | """ 121 | Dump contents of an iterable iterable_to_pickle to file_obj, a file 122 | opened in write mode 123 | """ 124 | for elt in iterable_to_pickle: 125 | stream_dump_elt(elt, file_obj) 126 | 127 | 128 | def stream_dump_elt(elt_to_pickle, file_obj): 129 | """Dump one element to file_obj, a file opened in write mode""" 130 | pickled_elt = dumps(elt_to_pickle) 131 | encoded = base64.b64encode(pickled_elt) 132 | file_obj.write(encoded) 133 | 134 | # record separator is a blank line 135 | # (since pickled_elt as base64 encoded cannot contain its own newlines) 136 | file_obj.write(b"\n\n") 137 | 138 | 139 | def stream_load(file_obj): 140 | """ 141 | Load contents from file_obj, returning a generator that yields one 142 | element at a time 143 | """ 144 | cur_elt = [] 145 | for line in file_obj: 146 | if line == b"\n": 147 | encoded_elt = b"".join(cur_elt) 148 | try: 149 | pickled_elt = base64.b64decode(encoded_elt) 150 | elt = loads(pickled_elt) 151 | except EOFError: 152 | print("EOF found while unpickling data") 153 | print(pickled_elt) 154 | raise StopIteration 155 | cur_elt = [] 156 | yield elt 157 | else: 158 | cur_elt.append(line) 159 | -------------------------------------------------------------------------------- /conf/code_search_bert_2020_02_01_1500.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | training { 4 | name = "code_search_bert" 5 | iteration = "2020_02_01_15_00" 6 | 7 | model { 8 | type = "single_query_multi_code" 9 | query_encoder = ${bert} 10 | code_encoder = ${bert} 11 | } 12 | } -------------------------------------------------------------------------------- /conf/code_search_bert_2020_02_03_20_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | dataset { 4 | common_params { 5 | parallelize = false 6 | } 7 | } 8 | 9 | training { 10 | name = "code_search_bert" 11 | iteration = "2020_02_03_20_00" 12 | 13 | model { 14 | type = "single_query_multi_code" 15 | query_encoder = ${bert} 16 | code_encoder = ${bert} 17 | } 18 | } -------------------------------------------------------------------------------- /conf/code_search_bert_lg_2020_02_04_15_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | tokenizers { 4 | build_path = "./build_tokenizers/with_lang" 5 | } 6 | 7 | dataset { 8 | common_params { 9 | parallelize = false 10 | do_lowercase = true 11 | special_tokens = ["", ""] 12 | } 13 | } 14 | 15 | training { 16 | name = "code_search_bert" 17 | iteration = "2020_02_04_21_00" 18 | 19 | } -------------------------------------------------------------------------------- /conf/code_search_bert_lg_2020_02_04_21_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | tokenizers { 4 | build_path = "./build_tokenizers/with_lang" 5 | } 6 | 7 | dataset { 8 | common_params { 9 | parallelize = false 10 | do_lowercase = true 11 | special_tokens = ["", ""] 12 | } 13 | } 14 | 15 | training { 16 | name = "code_search_bert" 17 | iteration = "2020_02_04_21_00" 18 | } -------------------------------------------------------------------------------- /conf/code_search_bert_lg_2020_02_05_00_00.conf: 
-------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | # bert { 4 | # hidden_size = 256 5 | # vocab_size = ${common_vocab_size} 6 | # intermediate_size = 1024 7 | # num_hidden_layers = 6 8 | # num_attention_heads = 8 9 | # } 10 | 11 | tokenizers { 12 | build_path = "./build_tokenizers/with_lang" 13 | } 14 | 15 | dataset { 16 | common_params { 17 | parallelize = false 18 | do_lowercase = true 19 | special_tokens = ["", ""] 20 | } 21 | } 22 | 23 | training { 24 | name = "code_search_bert_lg" 25 | iteration = "2020_02_05_00_00" 26 | 27 | batch_size { 28 | train = 170 29 | val = 170 30 | test = 170 31 | } 32 | 33 | model { 34 | type = "single_query_single_code" 35 | output_size = 128 36 | query_encoder { 37 | hidden_size = ${training.model.output_size} 38 | vocab_size = ${common_vocab_size} 39 | intermediate_size = 512 40 | num_hidden_layers = 3 41 | num_attention_heads = 8 42 | } 43 | code_encoder { 44 | hidden_size = ${training.model.output_size} 45 | vocab_size = ${common_vocab_size} 46 | intermediate_size = 1024 47 | num_hidden_layers = 6 48 | num_attention_heads = 8 49 | } 50 | } 51 | 52 | } -------------------------------------------------------------------------------- /conf/code_search_bert_lg_2020_02_06_18_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | tokenizers { 4 | build_path = "./build_tokenizers/with_lang" 5 | } 6 | 7 | dataset { 8 | common_params { 9 | parallelize = false 10 | do_lowercase = true 11 | special_tokens = ["", ""] 12 | } 13 | } 14 | 15 | training { 16 | name = "code_search_bert_lg" 17 | iteration = "2020_02_06_18_00" 18 | 19 | batch_size { 20 | train = 200 21 | val = 200 22 | test = 200 23 | } 24 | 25 | model { 26 | type = "single_query_single_code" 27 | output_size = 64 28 | query_encoder { 29 | hidden_size = ${training.model.output_size} 30 | vocab_size = ${common_vocab_size} 31 | intermediate_size = 512 32 | num_hidden_layers = 3 33 | num_attention_heads = 8 34 | } 35 | code_encoder { 36 | hidden_size = ${training.model.output_size} 37 | vocab_size = ${common_vocab_size} 38 | intermediate_size = 1024 39 | num_hidden_layers = 6 40 | num_attention_heads = 8 41 | } 42 | } 43 | 44 | } -------------------------------------------------------------------------------- /conf/code_search_bert_lg_2020_02_06_22_30.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | tokenizers { 4 | build_path = "./build_tokenizers/with_lang" 5 | } 6 | 7 | dataset { 8 | common_params { 9 | parallelize = false 10 | do_lowercase = true 11 | special_tokens = ["", ""] 12 | } 13 | } 14 | 15 | training { 16 | name = "code_search_bert_lg" 17 | iteration = "2020_02_06_22_30" 18 | 19 | batch_size { 20 | train = 170 21 | val = 170 22 | test = 170 23 | } 24 | 25 | model { 26 | type = "single_query_single_code" 27 | output_size = 256 28 | query_encoder { 29 | hidden_size = ${training.model.output_size} 30 | vocab_size = ${common_vocab_size} 31 | intermediate_size = 512 32 | num_hidden_layers = 3 33 | num_attention_heads = 8 34 | } 35 | code_encoder { 36 | hidden_size = ${training.model.output_size} 37 | vocab_size = ${common_vocab_size} 38 | intermediate_size = 1024 39 | num_hidden_layers = 6 40 | num_attention_heads = 8 41 | } 42 | } 43 | 44 | } -------------------------------------------------------------------------------- /conf/code_search_bert_lg_2020_02_07_10_00.conf: 
-------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | tokenizers { 4 | build_path = "./build_tokenizers/with_lang" 5 | } 6 | 7 | dataset { 8 | common_params { 9 | parallelize = false 10 | do_lowercase = true 11 | special_tokens = ["", ""] 12 | } 13 | } 14 | 15 | training { 16 | short_circuit = True 17 | name = "code_search_bert_lg" 18 | iteration = "2020_02_07_10_00" 19 | 20 | model { 21 | training_ctx_class = "codenets.codesearchnet.single_branch_ctx.SingleBranchTrainingContext" 22 | output_size = 256 23 | query_encoder { 24 | hidden_size = ${training.model.output_size} 25 | vocab_size = ${common_vocab_size} 26 | intermediate_size = 512 27 | num_hidden_layers = 3 28 | num_attention_heads = 8 29 | } 30 | code_encoder { 31 | hidden_size = ${training.model.output_size} 32 | vocab_size = ${common_vocab_size} 33 | intermediate_size = 1024 34 | num_hidden_layers = 6 35 | num_attention_heads = 8 36 | } 37 | } 38 | 39 | batch_size { 40 | train = 170 41 | val = 170 42 | test = 170 43 | } 44 | 45 | device = "cpu" 46 | wandb = false 47 | tensorboard = false 48 | 49 | } -------------------------------------------------------------------------------- /conf/code_search_bert_query_1_code_1_2020_02_10_11_00 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | tokenizers { 4 | build_path = "./build_tokenizers/with_lang_query_1_code_1" 5 | } 6 | 7 | dataset { 8 | common_params { 9 | parallelize = false 10 | do_lowercase = true 11 | special_tokens = ["", ""] 12 | } 13 | } 14 | 15 | training { 16 | short_circuit = False 17 | name = "code_search_bert_query_1_code_1" 18 | iteration = "2020_02_10_11_00" 19 | tokenizer_type = "query_1_code_1" 20 | model { 21 | training_ctx_class = "codenets.codesearchnet.single_branch_ctx.SingleBranchTrainingContext" 22 | output_size = 128 23 | query_encoder { 24 | hidden_size = ${training.model.output_size} 25 | vocab_size = ${common_vocab_size} 26 | intermediate_size = 512 27 | num_hidden_layers = 3 28 | num_attention_heads = 8 29 | } 30 | code_encoder { 31 | hidden_size = ${training.model.output_size} 32 | vocab_size = ${common_vocab_size} 33 | intermediate_size = 1024 34 | num_hidden_layers = 6 35 | num_attention_heads = 8 36 | } 37 | } 38 | 39 | batch_size { 40 | train = 170 41 | val = 170 42 | test = 170 43 | } 44 | 45 | device = "cuda" 46 | wandb = true 47 | tensorboard = true 48 | 49 | } -------------------------------------------------------------------------------- /conf/code_search_bert_query_1_code_1_2020_02_10_11_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | tokenizers { 4 | build_path = "./build_tokenizers/with_lang_query_1_code_1" 5 | } 6 | 7 | dataset { 8 | common_params { 9 | parallelize = false 10 | do_lowercase = true 11 | special_tokens = ["", ""] 12 | } 13 | } 14 | 15 | training { 16 | short_circuit = False 17 | name = "code_search_bert_query_1_code_1" 18 | iteration = "2020_02_10_11_00" 19 | tokenizer_type = "query_1_code_1" 20 | model { 21 | training_ctx_class = "codenets.codesearchnet.single_branch_ctx.SingleBranchTrainingContext" 22 | output_size = 128 23 | query_encoder { 24 | hidden_size = ${training.model.output_size} 25 | vocab_size = ${common_vocab_size} 26 | intermediate_size = 512 27 | num_hidden_layers = 3 28 | num_attention_heads = 8 29 | } 30 | code_encoder { 31 | hidden_size = ${training.model.output_size} 32 | vocab_size = 
${common_vocab_size} 33 | intermediate_size = 1024 34 | num_hidden_layers = 6 35 | num_attention_heads = 8 36 | } 37 | } 38 | 39 | batch_size { 40 | train = 170 41 | val = 170 42 | test = 170 43 | } 44 | 45 | device = "cuda" 46 | wandb = true 47 | tensorboard = true 48 | 49 | } -------------------------------------------------------------------------------- /conf/code_search_bert_query_1_code_1_2020_02_11_22_00 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | tokenizers { 4 | build_path = "./build_tokenizers/with_lang_query_1_code_1" 5 | } 6 | 7 | dataset { 8 | common_params { 9 | parallelize = false 10 | do_lowercase = true 11 | special_tokens = ["", ""] 12 | } 13 | } 14 | 15 | training { 16 | short_circuit = false 17 | 18 | device = "cuda" 19 | wandb = true 20 | tensorboard = true 21 | 22 | name = "code_search_bert_query_1_code_1" 23 | iteration = "2020_02_11_22_00" 24 | tokenizer_type = "query_1_code_1" 25 | model { 26 | training_ctx_class = "codenets.codesearchnet.query_1_code_1.training_ctx.Query1Code1Ctx" 27 | output_size = 64 28 | query_encoder { 29 | hidden_size = ${training.model.output_size} 30 | vocab_size = ${common_vocab_size} 31 | intermediate_size = 512 32 | num_hidden_layers = 3 33 | num_attention_heads = 8 34 | } 35 | code_encoder { 36 | hidden_size = ${training.model.output_size} 37 | vocab_size = ${common_vocab_size} 38 | intermediate_size = 512 39 | num_hidden_layers = 6 40 | num_attention_heads = 8 41 | } 42 | } 43 | 44 | batch_size { 45 | train = 256 46 | val = 256 47 | test = 256 48 | } 49 | 50 | } -------------------------------------------------------------------------------- /conf/code_search_bert_query_1_code_1_2020_02_11_22_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | tokenizers { 4 | build_path = "./build_tokenizers/with_lang_query_1_code_1" 5 | } 6 | 7 | dataset { 8 | common_params { 9 | parallelize = false 10 | do_lowercase = true 11 | special_tokens = ["", ""] 12 | } 13 | } 14 | 15 | training { 16 | short_circuit = false 17 | 18 | device = "cuda" 19 | wandb = true 20 | tensorboard = true 21 | 22 | name = "code_search_bert_query_1_code_1" 23 | iteration = "2020_02_11_22_00" 24 | tokenizer_type = "query_1_code_1" 25 | model { 26 | training_ctx_class = "codenets.codesearchnet.query_1_code_1.training_ctx.Query1Code1Ctx" 27 | output_size = 64 28 | query_encoder { 29 | hidden_size = ${training.model.output_size} 30 | vocab_size = ${common_vocab_size} 31 | intermediate_size = 512 32 | num_hidden_layers = 3 33 | num_attention_heads = 8 34 | } 35 | code_encoder { 36 | hidden_size = ${training.model.output_size} 37 | vocab_size = ${common_vocab_size} 38 | intermediate_size = 512 39 | num_hidden_layers = 6 40 | num_attention_heads = 8 41 | } 42 | } 43 | 44 | batch_size { 45 | train = 256 46 | val = 256 47 | test = 256 48 | } 49 | 50 | } -------------------------------------------------------------------------------- /conf/code_search_bert_query_code_siamese_2020_02_12_00_00 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | 
special_tokens = ["", "", ""] 15 | } 16 | } 17 | 18 | training { 19 | short_circuit = false 20 | 21 | device = "cuda" 22 | wandb = true 23 | tensorboard = true 24 | 25 | name = "code_search_bert_query_code_siamese" 26 | iteration = "2020_02_12_00_00" 27 | tokenizer_type = "query_code_siamese" 28 | model { 29 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 30 | output_size = 512 31 | encoder { 32 | hidden_size = ${training.model.output_size} 33 | vocab_size = ${common_vocab_size} 34 | intermediate_size = 1024 35 | num_hidden_layers = 6 36 | num_attention_heads = 8 37 | } 38 | } 39 | 40 | batch_size { 41 | train = 128 42 | val = 128 43 | test = 128 44 | } 45 | 46 | } -------------------------------------------------------------------------------- /conf/code_search_bert_query_code_siamese_2020_02_12_00_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | } 16 | } 17 | 18 | training { 19 | short_circuit = false 20 | 21 | device = "cuda" 22 | wandb = true 23 | tensorboard = true 24 | 25 | name = "code_search_bert_query_code_siamese" 26 | iteration = "2020_02_12_00_00" 27 | tokenizer_type = "query_code_siamese" 28 | model { 29 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 30 | output_size = 512 31 | encoder { 32 | hidden_size = ${training.model.output_size} 33 | vocab_size = ${common_vocab_size} 34 | intermediate_size = 1024 35 | num_hidden_layers = 6 36 | num_attention_heads = 8 37 | } 38 | } 39 | 40 | batch_size { 41 | train = 128 42 | val = 128 43 | test = 128 44 | } 45 | 46 | } -------------------------------------------------------------------------------- /conf/code_search_bert_query_code_siamese_2020_02_14_16_00 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | } 16 | } 17 | 18 | training { 19 | short_circuit = false 20 | 21 | device = "cuda" 22 | wandb = true 23 | tensorboard = true 24 | 25 | name = "code_search_bert_query_code_siamese" 26 | iteration = "2020_02_14_16_00" 27 | tokenizer_type = "query_code_siamese" 28 | model { 29 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 30 | output_size = 72 31 | encoder { 32 | hidden_size = ${training.model.output_size} 33 | vocab_size = ${common_vocab_size} 34 | intermediate_size = 256 35 | num_hidden_layers = 12 36 | num_attention_heads = 12 37 | } 38 | } 39 | 40 | batch_size { 41 | train = 100 42 | val = 100 43 | test = 100 44 | } 45 | 46 | } -------------------------------------------------------------------------------- /conf/code_search_bert_query_code_siamese_2020_02_14_16_00.conf: 
-------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | } 16 | } 17 | 18 | training { 19 | short_circuit = false 20 | 21 | device = "cuda" 22 | wandb = true 23 | tensorboard = true 24 | 25 | name = "code_search_bert_query_code_siamese" 26 | iteration = "2020_02_14_16_00" 27 | tokenizer_type = "query_code_siamese" 28 | model { 29 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 30 | output_size = 72 31 | encoder { 32 | hidden_size = ${training.model.output_size} 33 | vocab_size = ${common_vocab_size} 34 | intermediate_size = 256 35 | num_hidden_layers = 12 36 | num_attention_heads = 12 37 | } 38 | } 39 | 40 | batch_size { 41 | train = 100 42 | val = 100 43 | test = 100 44 | } 45 | 46 | } -------------------------------------------------------------------------------- /conf/code_search_bert_query_code_siamese_2020_02_15_14_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | } 16 | } 17 | 18 | training { 19 | short_circuit = true 20 | 21 | device = "cuda" 22 | wandb = false 23 | tensorboard = false 24 | 25 | name = "code_search_siamese" 26 | iteration = "2020_02_15_14_00" 27 | tokenizer_type = "query_code_siamese" 28 | # Temporary because Rust tokenizers do not manage common tokens 29 | common_tokens_file = "./pickles/common_tokens_"${training.tokenizer_type}"_"${iteration}".p" 30 | 31 | model { 32 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 33 | output_size = 72 34 | encoder { 35 | hidden_size = ${training.model.output_size} 36 | vocab_size = ${common_vocab_size} 37 | intermediate_size = 256 38 | num_hidden_layers = 12 39 | num_attention_heads = 12 40 | } 41 | } 42 | 43 | batch_size { 44 | train = 100 45 | val = 100 46 | test = 100 47 | } 48 | 49 | } -------------------------------------------------------------------------------- /conf/default.conf: -------------------------------------------------------------------------------- 1 | 2 | lang_ids { 3 | php = 0 4 | python = 1 5 | ruby = 2 6 | java = 3 7 | go = 4 8 | javascript = 5 9 | } 10 | 11 | common_vocab_size = 10000 12 | 13 | bert { 14 | hidden_size = 128 15 | vocab_size = ${common_vocab_size} 16 | intermediate_size = 512 17 | num_hidden_layers = 3 18 | num_attention_heads = 8 19 | } 20 | 21 | tokenizers { 22 | type = "TOKENIZER_TYPE" 23 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 24 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 25 | } 26 | 27 | dataset { 28 | root_dir = ${HOME}"/workspaces/tools/CodeSearchNet/resources" 29 | common_params { 30 | fraction_using_func_name=0.1 31 | min_len_func_name_for_query=12 32 | use_subtokens=False 33 | mark_subtoken_end=False 34 | 
code_max_num_tokens=200 35 | query_max_num_tokens=30 36 | use_bpe=True 37 | vocab_size=${common_vocab_size} 38 | pct_bpe=0.5 39 | vocab_count_threshold=10 40 | lang_ids = ${lang_ids} 41 | do_lowercase = true 42 | special_tokens = [""] 43 | parallelize = true 44 | use_lang_weights = False 45 | } 46 | 47 | train { 48 | dirs = ${dataset.root_dir}"/data_dirs_train.txt" 49 | params = ${dataset.common_params} 50 | } 51 | 52 | val { 53 | dirs = ${dataset.root_dir}"/data_dirs_valid.txt" 54 | params = ${dataset.common_params} 55 | } 56 | 57 | test { 58 | dirs = ${dataset.root_dir}"/data_dirs_test.txt" 59 | params = ${dataset.common_params} 60 | } 61 | 62 | queries_file = ${dataset.root_dir}"/queries.csv" 63 | } 64 | 65 | 66 | training { 67 | # The name of current experiment (can have several runs) 68 | name = "EXPERIMENT_NAME" 69 | # The unique id of current run 70 | iteration = "UNIQUE_RUN_ID" 71 | # The ID used to identify the pre-built pickled files 72 | # using the tokenizer defined above 73 | tokenizer_type = "TOKENIZER_ID" 74 | 75 | # Set that to true to test your run without slow-loading train dataset 76 | short_circuit = false 77 | 78 | device = "cuda" 79 | # deactivate wandb & tensorboard 80 | wandb = true 81 | tensorboard = true 82 | 83 | model { 84 | # IMPORTANT: the class representing Training Context 85 | training_ctx_class = "codenets.codesearchnet.query_1_code_1.training_ctx.Query1Code1Ctx" 86 | output_size = 64 87 | query_encoder { 88 | hidden_size = ${training.model.output_size} 89 | vocab_size = ${common_vocab_size} 90 | intermediate_size = 512 91 | num_hidden_layers = 3 92 | num_attention_heads = 8 93 | } 94 | code_encoder { 95 | hidden_size = ${training.model.output_size} 96 | vocab_size = ${common_vocab_size} 97 | intermediate_size = 512 98 | num_hidden_layers = 6 99 | num_attention_heads = 8 100 | } 101 | } 102 | 103 | # Training Hyper-Parameters 104 | seed = 0 105 | lr = 0.0001 106 | max_grad_norm = 1.0 107 | min_log_interval = 50 108 | start_epoch = 0 109 | epochs = 10 110 | 111 | batch_size { 112 | train = 256 113 | val = 256 114 | test = 256 115 | } 116 | 117 | loss { 118 | type = "softmax_cross_entropy" 119 | margin = 1.0 120 | } 121 | 122 | # Paths 123 | pickle_path = "./pickles" 124 | output_dir = "./checkpoints" 125 | tensorboard_path = "./runs" 126 | 127 | } -------------------------------------------------------------------------------- /conf/qc_ast_2020_03_13.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens = false # to do later 27 | #query_embeddings="sbert" 28 | fraction_using_func_name=0.1 29 | use_ast = "tree-sitter" 30 | ast_added_nodes = { 31 | "php": {"prefix": ""}, 32 | "java": {"prefix": "class Toto {", "suffix": "}"} 33 | } 34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]} 35 | 36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ] 37 | } 38 | } 39 | 40 | training { 41 | short_circuit = true 42 
| 43 | device = "cuda" 44 | wandb = false 45 | tensorboard = false 46 | 47 | name = "qc_ast" 48 | iteration = "2020_03_15" 49 | tokenizer_type = ${tokenizers.type} 50 | 51 | model { 52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 53 | encoder { 54 | hidden_size = 32 55 | vocab_size = ${common_vocab_size} 56 | intermediate_size = 128 57 | num_hidden_layers = 3 58 | num_attention_heads = 8 59 | } 60 | } 61 | lr = 0.001 62 | 63 | loss { 64 | type = "lambda_loss" 65 | } 66 | 67 | batch_size { 68 | #train = 400 69 | #val = 400 70 | #test = 400 71 | train = 5 72 | val = 5 73 | test = 5 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /conf/qc_ast_2020_03_15 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=1024 26 | use_subtokens = false # to do later 27 | #query_embeddings="sbert" 28 | fraction_using_func_name=0.1 29 | use_ast = "tree-sitter" 30 | ast_added_nodes = { 31 | "php": {"prefix": ""}, 32 | "java": {"prefix": "class Toto {", "suffix": "}"} 33 | } 34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]} 35 | 36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ] 37 | } 38 | } 39 | 40 | training { 41 | short_circuit = true 42 | 43 | device = "cuda" 44 | wandb = false 45 | tensorboard = false 46 | 47 | name = "qc_ast" 48 | iteration = "2020_03_15" 49 | tokenizer_type = ${tokenizers.type}"_ast" 50 | 51 | model { 52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 53 | encoder { 54 | hidden_size = 64 55 | vocab_size = ${common_vocab_size} 56 | intermediate_size = 512 57 | num_hidden_layers = 3 58 | num_attention_heads = 8 59 | } 60 | } 61 | lr = 0.00001 62 | 63 | loss { 64 | type = "softmax_cross_entropy" 65 | } 66 | 67 | batch_size { 68 | train = 8 69 | val = 8 70 | test = 8 71 | #train = 5 72 | #val = 5 73 | #test = 5 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /conf/qc_ast_2020_03_15.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=1024 26 | use_subtokens = false # to do later 27 | #query_embeddings="sbert" 28 | fraction_using_func_name=0.1 29 | use_ast = "tree-sitter" 30 | ast_added_nodes = { 31 | "php": {"prefix": ""}, 32 | "java": 
{"prefix": "class Toto {", "suffix": "}"} 33 | } 34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]} 35 | 36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ] 37 | } 38 | } 39 | 40 | training { 41 | short_circuit = true 42 | 43 | device = "cuda" 44 | wandb = false 45 | tensorboard = false 46 | 47 | name = "qc_ast" 48 | iteration = "2020_03_15" 49 | tokenizer_type = ${tokenizers.type}"_ast" 50 | 51 | model { 52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 53 | encoder { 54 | hidden_size = 64 55 | vocab_size = ${common_vocab_size} 56 | intermediate_size = 512 57 | num_hidden_layers = 3 58 | num_attention_heads = 8 59 | } 60 | } 61 | lr = 0.00001 62 | 63 | loss { 64 | type = "softmax_cross_entropy" 65 | } 66 | 67 | batch_size { 68 | train = 8 69 | val = 8 70 | test = 8 71 | #train = 5 72 | #val = 5 73 | #test = 5 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /conf/qc_ast_2020_03_17.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=1024 26 | use_subtokens = false # to do later 27 | #query_embeddings="sbert" 28 | fraction_using_func_name=0.1 29 | use_ast = "tree-sitter" 30 | ast_added_nodes = { 31 | "php": {"prefix": ""}, 32 | "java": {"prefix": "class Toto {", "suffix": "}"} 33 | } 34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]} 35 | 36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ] 37 | } 38 | } 39 | 40 | training { 41 | short_circuit = true 42 | 43 | device = "cuda" 44 | wandb = false 45 | tensorboard = false 46 | 47 | name = "qc_ast" 48 | iteration = "2020_03_15" 49 | tokenizer_type = ${tokenizers.type}"_ast" 50 | 51 | model { 52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 53 | encoder { 54 | hidden_size = 64 55 | vocab_size = ${common_vocab_size} 56 | intermediate_size = 512 57 | num_hidden_layers = 3 58 | num_attention_heads = 8 59 | } 60 | } 61 | lr = 0.00001 62 | 63 | loss { 64 | type = "softmax_cross_entropy" 65 | } 66 | 67 | batch_size { 68 | train = 8 69 | val = 8 70 | test = 8 71 | #train = 5 72 | #val = 5 73 | #test = 5 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /conf/qc_ast_2020_03_18 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", 
""] 24 | use_lang_weights = True 25 | code_max_num_tokens = 512 26 | use_subtokens = false # to do later 27 | #query_embeddings="sbert" 28 | fraction_using_func_name=0.1 29 | use_ast = "tree-sitter" 30 | ast_added_nodes = { 31 | "php": {"prefix": ""}, 32 | "java": {"prefix": "class Toto {", "suffix": "}"} 33 | } 34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]} 35 | 36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ] 37 | } 38 | } 39 | 40 | training { 41 | short_circuit = false 42 | 43 | device = "cuda" 44 | wandb = true 45 | tensorboard = false 46 | 47 | name = "qc_ast" 48 | iteration = "2020_03_18" 49 | tokenizer_type = ${tokenizers.type}"_ast_512" 50 | 51 | model { 52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 53 | encoder { 54 | hidden_size = 128 55 | vocab_size = 30370 56 | intermediate_size = 512 57 | num_hidden_layers = 3 58 | num_attention_heads = 8 59 | } 60 | } 61 | lr = 0.0001 62 | 63 | loss { 64 | type = "softmax_cross_entropy" 65 | } 66 | 67 | batch_size { 68 | train = 85 69 | val = 85 70 | test = 85 71 | #train = 5 72 | #val = 5 73 | #test = 5 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /conf/qc_ast_2020_03_18.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens = 512 26 | use_subtokens = false # to do later 27 | #query_embeddings="sbert" 28 | fraction_using_func_name=0.1 29 | use_ast = "tree-sitter" 30 | ast_added_nodes = { 31 | "php": {"prefix": ""}, 32 | "java": {"prefix": "class Toto {", "suffix": "}"} 33 | } 34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]} 35 | 36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ] 37 | } 38 | } 39 | 40 | training { 41 | short_circuit = false 42 | 43 | device = "cuda" 44 | wandb = true 45 | tensorboard = false 46 | 47 | name = "qc_ast" 48 | iteration = "2020_03_18" 49 | tokenizer_type = ${tokenizers.type}"_ast_512" 50 | 51 | model { 52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 53 | encoder { 54 | hidden_size = 128 55 | vocab_size = 30370 56 | intermediate_size = 512 57 | num_hidden_layers = 3 58 | num_attention_heads = 8 59 | } 60 | } 61 | lr = 0.0001 62 | 63 | loss { 64 | type = "softmax_cross_entropy" 65 | } 66 | 67 | batch_size { 68 | train = 85 69 | val = 85 70 | test = 85 71 | #train = 5 72 | #val = 5 73 | #test = 5 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /conf/qc_ast_2020_03_19.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 
10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens = 256 26 | use_subtokens = false # to do later 27 | #query_embeddings="sbert" 28 | fraction_using_func_name=0.1 29 | use_ast = "tree-sitter" 30 | ast_added_nodes = { 31 | "php": {"prefix": ""}, 32 | "java": {"prefix": "class Toto {", "suffix": "}"} 33 | } 34 | ast_skip_node_types = {"php": ["ERROR", ""], "java": ["ERROR"]} 35 | 36 | ast_special_tokens_files = [ "./pickles/test_special_tokens.json" ] 37 | } 38 | } 39 | 40 | training { 41 | short_circuit = false 42 | 43 | device = "cuda" 44 | wandb = true 45 | tensorboard = false 46 | 47 | name = "qc_ast" 48 | iteration = "2020_03_19" 49 | tokenizer_type = ${tokenizers.type}"_ast_256" 50 | 51 | model { 52 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 53 | encoder { 54 | hidden_size = 64 55 | vocab_size = 30370 56 | intermediate_size = 768 57 | num_hidden_layers = 3 58 | num_attention_heads = 8 59 | } 60 | } 61 | lr = 0.0001 62 | 63 | loss { 64 | type = "softmax_cross_entropy" 65 | } 66 | 67 | batch_size { 68 | train = 256 69 | val = 256 70 | test = 256 71 | #train = 5 72 | #val = 5 73 | #test = 5 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /conf/qc_ce_2020_02_23_01_00 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | } 19 | } 20 | 21 | training { 22 | short_circuit = false 23 | 24 | device = "cuda" 25 | wandb = true 26 | tensorboard = true 27 | 28 | name = "qc_ce" 29 | iteration = "2020_02_23_01_00" 30 | tokenizer_type = ${tokenizers.type} 31 | 32 | model { 33 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 34 | encoder { 35 | hidden_size = 32 36 | vocab_size = ${common_vocab_size} 37 | intermediate_size = 256 38 | num_hidden_layers = 2 39 | num_attention_heads = 8 40 | } 41 | } 42 | 43 | loss { 44 | type = "softmax_cross_entropy" 45 | } 46 | 47 | batch_size { 48 | train = 768 49 | val = 768 50 | test = 768 51 | } 52 | 53 | } -------------------------------------------------------------------------------- /conf/qc_ce_2020_02_23_01_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | } 19 | } 20 | 21 | training { 22 | short_circuit = false 23 | 24 | device = "cuda" 25 | wandb = true 26 | tensorboard = true 27 | 28 | name = "qc_ce" 29 | iteration = 
"2020_02_23_01_00" 30 | tokenizer_type = ${tokenizers.type} 31 | 32 | model { 33 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 34 | encoder { 35 | hidden_size = 32 36 | vocab_size = ${common_vocab_size} 37 | intermediate_size = 256 38 | num_hidden_layers = 2 39 | num_attention_heads = 8 40 | } 41 | } 42 | 43 | loss { 44 | type = "softmax_cross_entropy" 45 | } 46 | 47 | batch_size { 48 | train = 768 49 | val = 768 50 | test = 768 51 | } 52 | 53 | } -------------------------------------------------------------------------------- /conf/qc_ce_long_seq_2020_02_24.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | code_max_num_tokens=400 # mainly for JS which is more verbose 19 | } 20 | } 21 | 22 | training { 23 | short_circuit = false 24 | 25 | device = "cuda" 26 | wandb = true 27 | tensorboard = true 28 | 29 | name = "qc_ce" 30 | iteration = "2020_02_24" 31 | tokenizer_type = ${tokenizers.type} 32 | 33 | model { 34 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 35 | encoder { 36 | hidden_size = 32 37 | vocab_size = ${common_vocab_size} 38 | intermediate_size = 256 39 | num_hidden_layers = 2 40 | num_attention_heads = 8 41 | } 42 | } 43 | 44 | loss { 45 | type = "softmax_cross_entropy" 46 | } 47 | 48 | batch_size { 49 | train = 768 50 | val = 768 51 | test = 768 52 | } 53 | 54 | } -------------------------------------------------------------------------------- /conf/qc_ce_sbert_2020_02_27 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | } 29 | } 30 | 31 | training { 32 | short_circuit = false 33 | 34 | device = "cuda" 35 | wandb = true 36 | tensorboard = true 37 | 38 | name = "qc_ce_sbert" 39 | iteration = "2020_02_27" 40 | tokenizer_type = ${tokenizers.type} 41 | 42 | model { 43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 44 | encoder { 45 | hidden_size = 32 46 | vocab_size = ${common_vocab_size} 47 | intermediate_size = 256 48 | num_hidden_layers = 2 49 | num_attention_heads = 8 50 | } 51 | } 52 | 53 | loss { 54 | type = "lambda_loss" 55 | } 56 | 57 | batch_size { 58 | train = 425 59 | val = 425 60 | test = 425 61 | } 62 | 63 | } -------------------------------------------------------------------------------- /conf/qc_ce_sbert_2020_02_27.conf: 
-------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | } 29 | } 30 | 31 | training { 32 | short_circuit = false 33 | 34 | device = "cuda" 35 | wandb = true 36 | tensorboard = true 37 | 38 | name = "qc_ce_sbert" 39 | iteration = "2020_02_27" 40 | tokenizer_type = ${tokenizers.type} 41 | 42 | model { 43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 44 | encoder { 45 | hidden_size = 32 46 | vocab_size = ${common_vocab_size} 47 | intermediate_size = 256 48 | num_hidden_layers = 2 49 | num_attention_heads = 8 50 | } 51 | } 52 | 53 | loss { 54 | type = "lambda_loss" 55 | } 56 | 57 | batch_size { 58 | train = 425 59 | val = 425 60 | test = 425 61 | } 62 | 63 | } -------------------------------------------------------------------------------- /conf/qc_ce_sbert_2020_02_28 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | } 29 | } 30 | 31 | training { 32 | short_circuit = false 33 | 34 | device = "cuda" 35 | wandb = true 36 | tensorboard = true 37 | 38 | name = "qc_ce_sbert" 39 | iteration = "2020_02_28" 40 | tokenizer_type = ${tokenizers.type} 41 | 42 | model { 43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 44 | encoder { 45 | hidden_size = 64 46 | vocab_size = ${common_vocab_size} 47 | intermediate_size = 512 48 | num_hidden_layers = 4 49 | num_attention_heads = 8 50 | } 51 | } 52 | lr = 0.0001 53 | 54 | loss { 55 | type = "lambda_loss" 56 | } 57 | 58 | batch_size { 59 | train = 300 60 | val = 300 61 | test = 300 62 | } 63 | 64 | } -------------------------------------------------------------------------------- /conf/qc_ce_sbert_2020_02_28.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize 
= false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | } 29 | } 30 | 31 | training { 32 | short_circuit = false 33 | 34 | device = "cuda" 35 | wandb = true 36 | tensorboard = true 37 | 38 | name = "qc_ce_sbert" 39 | iteration = "2020_02_28" 40 | tokenizer_type = ${tokenizers.type} 41 | 42 | model { 43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 44 | encoder { 45 | hidden_size = 64 46 | vocab_size = ${common_vocab_size} 47 | intermediate_size = 512 48 | num_hidden_layers = 4 49 | num_attention_heads = 8 50 | } 51 | } 52 | lr = 0.0001 53 | 54 | loss { 55 | type = "lambda_loss" 56 | } 57 | 58 | batch_size { 59 | train = 300 60 | val = 300 61 | test = 300 62 | } 63 | 64 | } -------------------------------------------------------------------------------- /conf/qc_ce_sbert_2020_02_29 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | } 29 | } 30 | 31 | training { 32 | short_circuit = true 33 | 34 | device = "cuda" 35 | wandb = false 36 | tensorboard = false 37 | 38 | name = "qc_ce_sbert" 39 | iteration = "2020_02_29" 40 | tokenizer_type = ${tokenizers.type} 41 | 42 | model { 43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 44 | encoder { 45 | hidden_size = 128 46 | vocab_size = ${common_vocab_size} 47 | intermediate_size = 512 48 | num_hidden_layers = 4 49 | num_attention_heads = 8 50 | } 51 | } 52 | lr = 0.0001 53 | 54 | loss { 55 | type = "lambda_loss" 56 | } 57 | 58 | batch_size { 59 | train = 275 60 | val = 275 61 | test = 275 62 | } 63 | 64 | } -------------------------------------------------------------------------------- /conf/qc_ce_sbert_2020_02_29.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | } 29 | } 30 | 31 | training { 32 | short_circuit = true 33 | 34 | device = "cuda" 35 | wandb = false 36 | tensorboard = false 37 | 38 | name = "qc_ce_sbert" 39 | iteration = "2020_02_29" 40 | tokenizer_type = ${tokenizers.type} 41 | 42 | model { 43 | training_ctx_class = 
"codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 44 | encoder { 45 | hidden_size = 128 46 | vocab_size = ${common_vocab_size} 47 | intermediate_size = 512 48 | num_hidden_layers = 4 49 | num_attention_heads = 8 50 | } 51 | } 52 | lr = 0.0001 53 | 54 | loss { 55 | type = "lambda_loss" 56 | } 57 | 58 | batch_size { 59 | train = 275 60 | val = 275 61 | test = 275 62 | } 63 | 64 | } -------------------------------------------------------------------------------- /conf/qc_ce_sbert_2020_03_01 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | } 29 | } 30 | 31 | training { 32 | short_circuit = true 33 | 34 | device = "cuda" 35 | wandb = false 36 | tensorboard = false 37 | 38 | name = "qc_ce_sbert" 39 | iteration = "2020_03_01" 40 | tokenizer_type = ${tokenizers.type} 41 | 42 | model { 43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 44 | encoder { 45 | hidden_size = 32 46 | vocab_size = ${common_vocab_size} 47 | intermediate_size = 256 48 | num_hidden_layers = 2 49 | num_attention_heads = 8 50 | } 51 | } 52 | lr = 0.0001 53 | 54 | loss { 55 | type = "lambda_loss" 56 | } 57 | 58 | batch_size { 59 | train = 400 60 | val = 400 61 | test = 400 62 | } 63 | 64 | } -------------------------------------------------------------------------------- /conf/qc_ce_sbert_2020_03_01.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | } 29 | } 30 | 31 | training { 32 | short_circuit = true 33 | 34 | device = "cuda" 35 | wandb = false 36 | tensorboard = false 37 | 38 | name = "qc_ce_sbert" 39 | iteration = "2020_03_01" 40 | tokenizer_type = ${tokenizers.type} 41 | 42 | model { 43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 44 | encoder { 45 | hidden_size = 32 46 | vocab_size = ${common_vocab_size} 47 | intermediate_size = 256 48 | num_hidden_layers = 2 49 | num_attention_heads = 8 50 | } 51 | } 52 | lr = 0.0001 53 | 54 | loss { 55 | type = "lambda_loss" 56 | } 57 | 58 | batch_size { 59 | train = 400 60 | val = 400 61 | test = 400 62 | } 63 | 64 | } -------------------------------------------------------------------------------- 
/conf/qc_ce_subtoken_2020_02_25 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k_subtoken" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | code_max_num_tokens=200 19 | use_subtokens=True 20 | } 21 | } 22 | 23 | training { 24 | short_circuit = false 25 | 26 | device = "cuda" 27 | wandb = true 28 | tensorboard = true 29 | 30 | name = "qc_ce_subtoken" 31 | iteration = "2020_02_25" 32 | tokenizer_type = ${tokenizers.type} 33 | 34 | model { 35 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 36 | encoder { 37 | hidden_size = 32 38 | vocab_size = ${common_vocab_size} 39 | intermediate_size = 256 40 | num_hidden_layers = 2 41 | num_attention_heads = 8 42 | } 43 | } 44 | 45 | loss { 46 | type = "softmax_cross_entropy" 47 | } 48 | 49 | batch_size { 50 | train = 768 51 | val = 768 52 | test = 768 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /conf/qc_ce_subtoken_2020_02_25.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k_subtoken" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | code_max_num_tokens=200 19 | use_subtokens=True 20 | } 21 | } 22 | 23 | training { 24 | short_circuit = false 25 | 26 | device = "cuda" 27 | wandb = true 28 | tensorboard = true 29 | 30 | name = "qc_ce_subtoken" 31 | iteration = "2020_02_25" 32 | tokenizer_type = ${tokenizers.type} 33 | 34 | model { 35 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 36 | encoder { 37 | hidden_size = 32 38 | vocab_size = ${common_vocab_size} 39 | intermediate_size = 256 40 | num_hidden_layers = 2 41 | num_attention_heads = 8 42 | } 43 | } 44 | 45 | loss { 46 | type = "softmax_cross_entropy" 47 | } 48 | 49 | batch_size { 50 | train = 768 51 | val = 768 52 | test = 768 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /conf/qc_ce_subtoken_larger_2020_02_25.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k_subtoken" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | code_max_num_tokens=200 19 | use_subtokens=True 20 | } 21 | } 22 | 23 | training { 24 | short_circuit = false 25 | 26 | device = "cuda" 27 | wandb = true 28 | tensorboard = true 29 | 30 | name = "qc_ce_subtoken_larger" 31 | 
iteration = "2020_02_26" 32 | tokenizer_type = ${tokenizers.type} 33 | 34 | model { 35 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 36 | encoder { 37 | hidden_size = 64 38 | vocab_size = ${common_vocab_size} 39 | intermediate_size = 512 40 | num_hidden_layers = 4 41 | num_attention_heads = 8 42 | } 43 | } 44 | 45 | loss { 46 | type = "softmax_cross_entropy" 47 | } 48 | 49 | batch_size { 50 | train = 350 51 | val = 350 52 | test = 350 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /conf/qc_ce_subtoken_larger_2020_02_26 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k_subtoken" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | code_max_num_tokens=200 19 | use_subtokens=True 20 | } 21 | } 22 | 23 | training { 24 | short_circuit = false 25 | 26 | device = "cuda" 27 | wandb = true 28 | tensorboard = true 29 | 30 | name = "qc_ce_subtoken_larger" 31 | iteration = "2020_02_26" 32 | tokenizer_type = ${tokenizers.type} 33 | 34 | model { 35 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 36 | encoder { 37 | hidden_size = 64 38 | vocab_size = ${common_vocab_size} 39 | intermediate_size = 512 40 | num_hidden_layers = 4 41 | num_attention_heads = 8 42 | } 43 | } 44 | 45 | loss { 46 | type = "softmax_cross_entropy" 47 | } 48 | 49 | batch_size { 50 | train = 350 51 | val = 350 52 | test = 350 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /conf/qc_ce_subtoken_larger_2020_02_26.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k_subtoken" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | code_max_num_tokens=200 19 | use_subtokens=True 20 | } 21 | } 22 | 23 | training { 24 | short_circuit = false 25 | 26 | device = "cuda" 27 | wandb = true 28 | tensorboard = true 29 | 30 | name = "qc_ce_subtoken_larger" 31 | iteration = "2020_02_26" 32 | tokenizer_type = ${tokenizers.type} 33 | 34 | model { 35 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 36 | encoder { 37 | hidden_size = 64 38 | vocab_size = ${common_vocab_size} 39 | intermediate_size = 512 40 | num_hidden_layers = 4 41 | num_attention_heads = 8 42 | } 43 | } 44 | 45 | loss { 46 | type = "softmax_cross_entropy" 47 | } 48 | 49 | batch_size { 50 | train = 350 51 | val = 350 52 | test = 350 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /conf/qc_lambda_2020_02_20_12_30 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | 
common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | } 19 | } 20 | 21 | training { 22 | short_circuit = false 23 | 24 | device = "cuda" 25 | wandb = true 26 | tensorboard = true 27 | 28 | name = "qc_lambda" 29 | iteration = "2020_02_20_12_30" 30 | tokenizer_type = ${tokenizers.type} 31 | 32 | model { 33 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 34 | encoder { 35 | hidden_size = 128 36 | vocab_size = ${common_vocab_size} 37 | intermediate_size = 512 38 | num_hidden_layers = 6 39 | num_attention_heads = 8 40 | } 41 | } 42 | 43 | loss { 44 | type = "lambda_loss" 45 | } 46 | 47 | batch_size { 48 | train = 220 49 | val = 220 50 | test = 220 51 | # train = 8 52 | # val = 8 53 | # test = 8 54 | } 55 | 56 | } -------------------------------------------------------------------------------- /conf/qc_lambda_2020_02_20_12_30.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | } 19 | } 20 | 21 | training { 22 | short_circuit = false 23 | 24 | device = "cuda" 25 | wandb = true 26 | tensorboard = true 27 | 28 | name = "qc_lambda" 29 | iteration = "2020_02_20_12_30" 30 | tokenizer_type = ${tokenizers.type} 31 | 32 | model { 33 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 34 | encoder { 35 | hidden_size = 128 36 | vocab_size = ${common_vocab_size} 37 | intermediate_size = 512 38 | num_hidden_layers = 6 39 | num_attention_heads = 8 40 | } 41 | } 42 | 43 | loss { 44 | type = "lambda_loss" 45 | } 46 | 47 | batch_size { 48 | train = 220 49 | val = 220 50 | test = 220 51 | # train = 8 52 | # val = 8 53 | # test = 8 54 | } 55 | 56 | } -------------------------------------------------------------------------------- /conf/qc_sbert_lambda_2020_03_02.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | } 29 | } 30 | 31 | training { 32 | short_circuit = false 33 | 34 | device = "cuda" 35 | wandb = true 36 | tensorboard = true 37 | 38 | name = "qc_sbert_lambda" 39 | iteration = "2020_03_02" 40 | tokenizer_type = ${tokenizers.type} 41 | 42 | 
model { 43 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 44 | encoder { 45 | hidden_size = 32 46 | vocab_size = ${common_vocab_size} 47 | intermediate_size = 128 48 | num_hidden_layers = 2 49 | num_attention_heads = 8 50 | } 51 | } 52 | lr = 0.0001 53 | 54 | loss { 55 | type = "approx_ndcg_loss" 56 | } 57 | 58 | batch_size { 59 | train = 400 60 | val = 400 61 | test = 400 62 | } 63 | 64 | } -------------------------------------------------------------------------------- /conf/qc_sbert_lambda_2020_03_04 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | fraction_using_func_name=0.0 29 | } 30 | } 31 | 32 | training { 33 | short_circuit = false 34 | 35 | device = "cuda" 36 | wandb = true 37 | tensorboard = false 38 | 39 | name = "qc_sbert_lambda" 40 | iteration = "2020_03_04" 41 | tokenizer_type = ${tokenizers.type} 42 | 43 | model { 44 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 45 | encoder { 46 | hidden_size = 768 47 | vocab_size = ${common_vocab_size} 48 | intermediate_size = 2048 49 | num_hidden_layers = 3 50 | num_attention_heads = 8 51 | } 52 | } 53 | lr = 0.000001 54 | 55 | loss { 56 | type = "approx_ndcg_loss" 57 | } 58 | 59 | batch_size { 60 | train = 100 61 | val = 100 62 | test = 100 63 | } 64 | 65 | } -------------------------------------------------------------------------------- /conf/qc_sbert_lambda_2020_03_04.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | fraction_using_func_name=0.0 29 | } 30 | } 31 | 32 | training { 33 | short_circuit = false 34 | 35 | device = "cuda" 36 | wandb = true 37 | tensorboard = false 38 | 39 | name = "qc_sbert_lambda" 40 | iteration = "2020_03_04" 41 | tokenizer_type = ${tokenizers.type} 42 | 43 | model { 44 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 45 | encoder { 46 | hidden_size = 768 47 | vocab_size = ${common_vocab_size} 48 | intermediate_size = 2048 49 | num_hidden_layers = 3 50 | num_attention_heads = 8 51 | } 52 | } 53 | lr = 0.000001 54 | 55 | loss { 56 | type = "approx_ndcg_loss" 57 | } 58 | 59 | batch_size { 60 | train = 100 61 | val = 100 
62 | test = 100 63 | } 64 | 65 | } -------------------------------------------------------------------------------- /conf/qc_sbert_lambda_2020_03_05.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | query_embeddings="sbert" 28 | fraction_using_func_name=0.0 29 | } 30 | } 31 | 32 | training { 33 | short_circuit = false 34 | 35 | device = "cuda" 36 | wandb = true 37 | tensorboard = false 38 | 39 | name = "qc_sbert_lambda" 40 | iteration = "2020_03_04" 41 | tokenizer_type = ${tokenizers.type} 42 | 43 | model { 44 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 45 | encoder { 46 | hidden_size = 768 47 | vocab_size = ${common_vocab_size} 48 | intermediate_size = 2048 49 | num_hidden_layers = 3 50 | num_attention_heads = 8 51 | } 52 | } 53 | lr = 0.000001 54 | 55 | loss { 56 | type = "approx_ndcg_loss" 57 | } 58 | 59 | batch_size { 60 | train = 100 61 | val = 100 62 | test = 100 63 | } 64 | 65 | } -------------------------------------------------------------------------------- /conf/qc_sbert_lambda_2020_03_07 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | #query_embeddings="sbert" 28 | fraction_using_func_name=0.1 29 | } 30 | } 31 | 32 | training { 33 | short_circuit = true 34 | 35 | device = "cuda" 36 | wandb = false 37 | tensorboard = false 38 | 39 | name = "qc_sbert_lambda" 40 | iteration = "2020_03_07" 41 | tokenizer_type = ${tokenizers.type} 42 | 43 | model { 44 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 45 | encoder { 46 | hidden_size = 32 47 | vocab_size = ${common_vocab_size} 48 | intermediate_size = 128 49 | num_hidden_layers = 3 50 | num_attention_heads = 8 51 | } 52 | } 53 | lr = 0.001 54 | 55 | loss { 56 | type = "lambda_loss" 57 | } 58 | 59 | batch_size { 60 | #train = 400 61 | #val = 400 62 | #test = 400 63 | train = 5 64 | val = 5 65 | test = 5 66 | } 67 | 68 | } -------------------------------------------------------------------------------- /conf/qc_sbert_lambda_2020_03_07.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | 
build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | embeddings { 13 | sbert { 14 | model="bert-base-nli-mean-tokens" 15 | pickle_path="./pickles" 16 | } 17 | } 18 | 19 | dataset { 20 | common_params { 21 | parallelize = false 22 | do_lowercase = true 23 | special_tokens = ["", "", ""] 24 | use_lang_weights = True 25 | code_max_num_tokens=200 26 | use_subtokens=True 27 | #query_embeddings="sbert" 28 | fraction_using_func_name=0.1 29 | } 30 | } 31 | 32 | training { 33 | short_circuit = true 34 | 35 | device = "cuda" 36 | wandb = false 37 | tensorboard = false 38 | 39 | name = "qc_sbert_lambda" 40 | iteration = "2020_03_07" 41 | tokenizer_type = ${tokenizers.type} 42 | 43 | model { 44 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 45 | encoder { 46 | hidden_size = 32 47 | vocab_size = ${common_vocab_size} 48 | intermediate_size = 128 49 | num_hidden_layers = 3 50 | num_attention_heads = 8 51 | } 52 | } 53 | lr = 0.001 54 | 55 | loss { 56 | type = "lambda_loss" 57 | } 58 | 59 | batch_size { 60 | #train = 400 61 | #val = 400 62 | #test = 400 63 | train = 5 64 | val = 5 65 | test = 5 66 | } 67 | 68 | } -------------------------------------------------------------------------------- /conf/query_code_siamese_2020_02_15_14_00 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | use_lang_weights = True 16 | } 17 | } 18 | 19 | training { 20 | short_circuit = false 21 | 22 | device = "cuda" 23 | wandb = true 24 | tensorboard = true 25 | 26 | name = "query_code_siamese" 27 | iteration = "2020_02_15_14_00" 28 | tokenizer_type = "query_code_siamese" 29 | 30 | model { 31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 32 | output_size = 72 33 | encoder { 34 | hidden_size = ${training.model.output_size} 35 | vocab_size = ${common_vocab_size} 36 | intermediate_size = 256 37 | num_hidden_layers = 12 38 | num_attention_heads = 12 39 | } 40 | } 41 | 42 | batch_size { 43 | train = 100 44 | val = 100 45 | test = 100 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /conf/query_code_siamese_2020_02_15_14_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | use_lang_weights = True 16 | } 17 | } 18 | 19 | training { 20 | short_circuit = false 21 | 22 | device = "cuda" 23 | wandb = true 24 | tensorboard = true 25 | 26 | name = "query_code_siamese" 27 | iteration = "2020_02_15_14_00" 28 | tokenizer_type = "query_code_siamese" 29 | 30 | model { 31 | training_ctx_class = 
"codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 32 | output_size = 72 33 | encoder { 34 | hidden_size = ${training.model.output_size} 35 | vocab_size = ${common_vocab_size} 36 | intermediate_size = 256 37 | num_hidden_layers = 12 38 | num_attention_heads = 12 39 | } 40 | } 41 | 42 | batch_size { 43 | train = 100 44 | val = 100 45 | test = 100 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /conf/query_code_siamese_2020_02_17_21_30 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | use_lang_weights = True 16 | } 17 | } 18 | 19 | training { 20 | short_circuit = false 21 | 22 | device = "cuda" 23 | wandb = true 24 | tensorboard = true 25 | 26 | name = "query_code_siamese" 27 | iteration = "2020_02_17_21_30" 28 | tokenizer_type = "query_code_siamese" 29 | 30 | model { 31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 32 | output_size = 64 33 | encoder { 34 | hidden_size = ${training.model.output_size} 35 | vocab_size = ${common_vocab_size} 36 | intermediate_size = 256 37 | num_hidden_layers = 6 38 | num_attention_heads = 8 39 | } 40 | } 41 | 42 | batch_size { 43 | train = 290 44 | val = 290 45 | test = 290 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /conf/query_code_siamese_2020_02_17_21_30.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | use_lang_weights = True 16 | } 17 | } 18 | 19 | training { 20 | short_circuit = false 21 | 22 | device = "cuda" 23 | wandb = true 24 | tensorboard = true 25 | 26 | name = "query_code_siamese" 27 | iteration = "2020_02_17_21_30" 28 | tokenizer_type = "query_code_siamese" 29 | 30 | model { 31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 32 | output_size = 64 33 | encoder { 34 | hidden_size = ${training.model.output_size} 35 | vocab_size = ${common_vocab_size} 36 | intermediate_size = 256 37 | num_hidden_layers = 6 38 | num_attention_heads = 8 39 | } 40 | } 41 | 42 | batch_size { 43 | train = 290 44 | val = 290 45 | test = 290 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /conf/query_code_siamese_2020_02_18_13_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 
| special_tokens = ["", "", ""] 15 | use_lang_weights = True 16 | } 17 | } 18 | 19 | training { 20 | short_circuit = false 21 | 22 | device = "cuda" 23 | wandb = true 24 | tensorboard = true 25 | 26 | name = "query_code_siamese" 27 | iteration = "2020_02_17_21_30" 28 | tokenizer_type = "query_code_siamese" 29 | 30 | model { 31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 32 | output_size = 64 33 | encoder { 34 | hidden_size = ${training.model.output_size} 35 | vocab_size = ${common_vocab_size} 36 | intermediate_size = 256 37 | num_hidden_layers = 6 38 | num_attention_heads = 8 39 | } 40 | } 41 | 42 | batch_size { 43 | train = 290 44 | val = 290 45 | test = 290 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /conf/query_code_siamese_2020_02_19_13_00 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | use_lang_weights = True 16 | } 17 | } 18 | 19 | training { 20 | short_circuit = false 21 | 22 | device = "cuda" 23 | wandb = true 24 | tensorboard = true 25 | 26 | name = "query_code_siamese" 27 | iteration = "2020_02_19_13_00" 28 | tokenizer_type = "query_code_siamese" 29 | 30 | model { 31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 32 | output_size = 64 33 | encoder { 34 | hidden_size = ${training.model.output_size} 35 | vocab_size = ${common_vocab_size} 36 | intermediate_size = 256 37 | num_hidden_layers = 6 38 | num_attention_heads = 8 39 | } 40 | } 41 | 42 | loss { 43 | type = "lambda_loss" 44 | } 45 | 46 | batch_size { 47 | train = 256 48 | val = 256 49 | test = 256 50 | # train = 8 51 | # val = 8 52 | # test = 8 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /conf/query_code_siamese_2020_02_19_13_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | use_lang_weights = True 16 | } 17 | } 18 | 19 | training { 20 | short_circuit = false 21 | 22 | device = "cuda" 23 | wandb = true 24 | tensorboard = true 25 | 26 | name = "query_code_siamese" 27 | iteration = "2020_02_19_13_00" 28 | tokenizer_type = "query_code_siamese" 29 | 30 | model { 31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 32 | output_size = 64 33 | encoder { 34 | hidden_size = ${training.model.output_size} 35 | vocab_size = ${common_vocab_size} 36 | intermediate_size = 256 37 | num_hidden_layers = 6 38 | num_attention_heads = 8 39 | } 40 | } 41 | 42 | loss { 43 | type = "lambda_loss" 44 | } 45 | 46 | batch_size { 47 | train = 256 48 | val = 256 49 | test = 256 50 | # train = 8 51 | # val = 8 52 | # test = 8 53 | } 54 | 55 | } 
-------------------------------------------------------------------------------- /conf/query_code_siamese_albert_2020_02_18_08_30 copy.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | use_lang_weights = True 16 | } 17 | } 18 | 19 | training { 20 | short_circuit = false 21 | 22 | device = "cuda" 23 | wandb = true 24 | tensorboard = true 25 | 26 | name = "query_code_siamese_albert" 27 | iteration = "2020_02_18_08_30" 28 | tokenizer_type = "query_code_siamese" 29 | 30 | model { 31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 32 | output_size = 128 33 | encoder { 34 | type = "albert" 35 | embedding_size = ${training.model.output_size} 36 | hidden_size = 512 37 | vocab_size = ${common_vocab_size} 38 | intermediate_size = 768 39 | num_hidden_layers = 8 40 | num_attention_heads = 8 41 | } 42 | } 43 | 44 | batch_size { 45 | train = 128 46 | val = 128 47 | test = 128 48 | } 49 | 50 | lr = 0.00001 51 | 52 | } -------------------------------------------------------------------------------- /conf/query_code_siamese_albert_2020_02_18_08_30.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | use_lang_weights = True 16 | } 17 | } 18 | 19 | training { 20 | short_circuit = false 21 | 22 | device = "cuda" 23 | wandb = true 24 | tensorboard = true 25 | 26 | name = "query_code_siamese_albert" 27 | iteration = "2020_02_18_08_30" 28 | tokenizer_type = "query_code_siamese" 29 | 30 | model { 31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 32 | output_size = 128 33 | encoder { 34 | type = "albert" 35 | embedding_size = ${training.model.output_size} 36 | hidden_size = 512 37 | vocab_size = ${common_vocab_size} 38 | intermediate_size = 768 39 | num_hidden_layers = 8 40 | num_attention_heads = 8 41 | } 42 | } 43 | 44 | batch_size { 45 | train = 128 46 | val = 128 47 | test = 128 48 | } 49 | 50 | lr = 0.00001 51 | 52 | } -------------------------------------------------------------------------------- /conf/query_code_siamese_albert_2020_02_18_14_00.conf: -------------------------------------------------------------------------------- 1 | include "./default.conf" 2 | 3 | common_vocab_size = 60000 # 5 lang + 1 query * 10000 4 | 5 | tokenizers { 6 | build_path = "./build_tokenizers/with_lang_query_code_siamese" 7 | token_files = "./build_tokenizers/token_files_query_code_siamese" 8 | } 9 | 10 | dataset { 11 | common_params { 12 | parallelize = false 13 | do_lowercase = true 14 | special_tokens = ["", "", ""] 15 | use_lang_weights = True 16 | } 17 | } 18 | 19 | training { 20 | short_circuit = false 21 | 22 | device = "cuda" 23 | wandb = true 24 | tensorboard = true 25 | 26 | name = 
"query_code_siamese_albert" 27 | iteration = "2020_02_18_14_00" 28 | tokenizer_type = "query_code_siamese" 29 | 30 | model { 31 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 32 | # output_size = 128 33 | encoder { 34 | type = "albert" 35 | embedding_size = 64 36 | hidden_size = 256 37 | vocab_size = ${common_vocab_size} 38 | intermediate_size = 512 39 | num_hidden_layers = 6 40 | num_attention_heads = 8 41 | } 42 | } 43 | 44 | batch_size { 45 | train = 240 46 | val = 240 47 | test = 240 48 | } 49 | 50 | lr = 0.00001 51 | 52 | } -------------------------------------------------------------------------------- /guide.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | Ce projet est un projet bac-à-sable technique sur lequel j'ai travaillé en 2020 et que j'utilisais pour tester certains points techniques à titre personnel. C'est une réécriture quasi-complète d'un projet de Microsoft pour le challenge CodeSearchNet (moteur de recherche de code multi-langage à partir de requêtes textuelles, sujet qui depuis a été largement poussé plus loin par github/microsoft avec Copilot avec des capacités avancées de génération de code). 4 | J'ai remis ce projet à jour pour cette présentation car je me suis aperçu que les API python ont beaucoup évolué depuis 2020 et le code n'était plus du tout compatible avec les versions actuelles. Cependant, le code ne tournera pas si vous le lancez car il a besoin de tokenizers qu'il faut construire manuellement et qui demandent pas mal de temps de calcul et de librairies natives (pour les AST de langages). 5 | 6 | ## Points techniques remarquables 7 | 8 | Mon but ici n'est pas de parler du fond ML/IA (mes résultats n'étaient pas très intéressants) mais plutôt du code et plus spécifiquement les points suivants: 9 | 10 | - Projet complet Python/ML avec gestion des dépendances (poetry), isolation dans un virtualenv, intégration dans VSCode (qui devenait le standard de dev en 2020) avec utilisation d'extensions: mypy, linters, license, tests (même si anecdotiques), etc... 11 | 12 | - [pyproject.toml](./pyproject.toml) 13 | - et pour info, la [license](./LICENSE) 14 | 15 | - Utilisation des configurations au format générique HOCON qui permet de gérer des configurations complexes avec des imports, des variables, des références etc... 16 | 17 | - [Configuration générique](./conf/default.conf) 18 | - [Configuration spécifique](./conf/query_code_siamese_2020_02_15_14_00.conf) 19 | 20 | - Exploration des limites du typage fort en Python avec des types génériques abstraits (pour tenter de simuler l'équivalent des "typeclasses" qu'on trouve dans les langages fonctionnels comme Haskell/Scala) et les "newtypes" pour "spécialiser" des types simples 21 | 22 | - [Type abstraits](./codenets/recordable.py#L22) 23 | - [Type génériques](./codenets/codesearchnet/training_ctx.py#L205-L220) 24 | - [Newtypes](./codenets/codesearchnet/training_ctx.py#L49-L68) 25 | 26 | - Evaluation de la compilation des types avc le moteur de compilation Mypy de Microsoft intégré dans VS Code. 27 | 28 | - [mypy.ini](./mypy.ini) 29 | 30 | - Etude de sauvegarde/restoration générique d'un contexte complet de projet IA (configuration + commit + modèle + tokenizer + dataset + etc...) pour une sauvegarde dans un point unique (sur un cloud de type AWS ou un serveur orienté ML de type MLFlow par exemple). 
31 | 32 | - [Generic Recordable](./codenets/recordable.py#L22) 33 | - [Recordable specialised for HOCON configuration](./codenets/recordable.py#L113) 34 | - [Recordable specialised for TorchModule models/tokenizers](./codenets/recordable.py#L248) 35 | - [Generic training context](./codenets/codesearchnet/training_ctx.py#L245) 36 | - [Training context specialised for a specific model](./codenets/codesearchnet/query_code_siamese/training_ctx.py#L40) 37 | 38 | - Assessment of how complex it is to rewrite Tensorflow code into PyTorch and the huggingface libraries. 39 | 40 | - Integration with WandB/TensorBoard for tracking training runs. 41 | 42 | and, more anecdotally: 43 | 44 | - A study of the results achievable with small transformers on a challenge of this kind; 45 | the results turned out to be quite disappointing, see 46 | 47 | - [README.md](./README.md) 48 | 49 | - Use of native Rust tokenizers through the Python interface of Huggingface tokenizers (which had just been released in 2020); a small training sketch is given in the appendix: 50 | 51 | - [tokenizer_recs.py](./codenets/codesearchnet/huggingface/tokenizer_recs.py#L102) 52 | 53 | - Use of language AST parsers (tree-sitter) to improve the performance of transformer-based models (I could not push these experiments very far for lack of GPU resources); see the parsing sketch in the appendix 54 | - [ast_build.py](./codenets/codesearchnet/ast_build.py#L189) 55 | 56 | ## Conclusion 57 | 58 | In the end, I would highlight the following points: 59 | 60 | - In my opinion, HOCON configurations are worthwhile for any software project in any language, because they can express complex configurations with variables/references while keeping the format simple. 61 | - Fully generic saving of a whole ML project, from the code down to the model and dataset, seems to me an important capability for backing up and versioning ML projects while keeping all resources together: code, configuration, model, tokenizer, dataset, etc. 62 | - Strong typing in Python has become a valuable tool that improves the overall robustness of the code and reduces the amount of unit tests needed. Mypy looks like a solid solution for checking types, even though many external dependencies that do not ship type information have to be filtered out. However, the heavy use of type unions in Python libraries can lead to rather indigestible type signatures. 63 | - Generic and abstract types do work, but they remain fairly tedious in Python and never quite feel like a native feature of the language (not to mention the runtime casts, which can raise performance concerns). It is better to stick to classic object-oriented patterns and avoid straying too far off the beaten path. 64 | - NewTypes remain of limited practical value from my point of view (in particular, arithmetic or concatenation operations on these types make them lose their specificity). 65 | 66 | If you have any questions, feel free to contact me.
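
## Appendix: illustrative sketches

The short code sketches below are illustrative only and are not taken from the repository. This first one shows how a HOCON file like the ones under `conf/` can be loaded with pyhocon (the library this project uses); the file path and keys mirror `conf/query_code_siamese_2020_02_15_14_00.conf` and `conf/default.conf`.

```python
# Minimal sketch: loading a HOCON configuration with pyhocon.
# Includes (include "./default.conf") and ${...} references are resolved at parse time.
from pyhocon import ConfigFactory

conf = ConfigFactory.parse_file("conf/query_code_siamese_2020_02_15_14_00.conf")

# ${common_vocab_size} and ${training.model.output_size} are already substituted here.
vocab_size = conf.get_int("training.model.encoder.vocab_size")
hidden_size = conf.get_int("training.model.encoder.hidden_size")
train_batch = conf.get_int("training.batch_size.train")
print(vocab_size, hidden_size, train_batch)
```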
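
The next sketch illustrates the typing ideas mentioned above: an abstract base class whose generic `load` returns the concrete subtype, plus a `NewType`. The names (`Saveable`, `DictSaveable`, `BatchSize`) are illustrative and do not reproduce the actual API of `codenets/recordable.py`.

```python
# Minimal, self-contained sketch of the typing ideas discussed above.
# Illustrative names only: this is not the actual API of codenets/recordable.py.
import json
from abc import ABC, abstractmethod
from pathlib import Path
from typing import NewType, Type, TypeVar

BatchSize = NewType("BatchSize", int)  # a "specialised" int, distinguished by mypy

T = TypeVar("T", bound="Saveable")


class Saveable(ABC):
    """Anything that can be saved to and restored from a directory."""

    @abstractmethod
    def save(self, output_dir: Path) -> bool:
        ...

    @classmethod
    @abstractmethod
    def load(cls: Type[T], restore_dir: Path) -> T:
        ...


class DictSaveable(Saveable, dict):
    """A plain dict that knows how to persist itself as JSON."""

    def save(self, output_dir: Path) -> bool:
        output_dir.mkdir(parents=True, exist_ok=True)
        (output_dir / "data.json").write_text(json.dumps(self))
        return True

    @classmethod
    def load(cls, restore_dir: Path) -> "DictSaveable":
        return cls(json.loads((restore_dir / "data.json").read_text()))


train_batch = BatchSize(256)  # usable wherever an int is expected, yet kept distinct by mypy
```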
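
For the Rust-backed tokenizers, a minimal training sketch with the huggingface `tokenizers` library could look like the following; the token file path, vocabulary size and special tokens are placeholders, not the ones actually used to build this project's tokenizers.

```python
# Minimal sketch: training and using a Rust-backed BPE tokenizer
# with the huggingface `tokenizers` library. Paths, sizes and special tokens are placeholders.
from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer(lowercase=True)
tokenizer.train(
    files=["./build_tokenizers/token_files/python.txt"],  # hypothetical token file
    vocab_size=30000,
    special_tokens=["<pad>", "<unk>", "<mask>"],  # illustrative special tokens
)

encoded = tokenizer.encode("def add(a, b): return a + b")
print(encoded.tokens)
print(encoded.ids)
```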
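
Finally, a sketch of parsing a code snippet into an AST with py-tree-sitter (0.20.x API, as pinned in `pyproject.toml`). It assumes the language grammars were compiled beforehand into a shared library, which is precisely the slow, native-tooling step mentioned in the introduction; the paths are hypothetical.

```python
# Minimal sketch: parsing source code into an AST with tree-sitter (py-tree-sitter 0.20.x).
# Assumes the grammars were compiled into build/my-languages.so beforehand, e.g. with:
#   Language.build_library("build/my-languages.so", ["vendor/tree-sitter-python"])
from tree_sitter import Language, Parser

PY_LANGUAGE = Language("build/my-languages.so", "python")

parser = Parser()
parser.set_language(PY_LANGUAGE)

tree = parser.parse(b"def add(a, b):\n    return a + b\n")
print(tree.root_node.sexp())  # S-expression view of the syntax tree
```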
67 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | """Main of the project.""" 2 | # import numpy as np 3 | # import os 4 | 5 | 6 | # def get_os_env(): 7 | # print(os.getcwd()) 8 | # print(os.uname()) 9 | 10 | 11 | # def main(): 12 | # # Do some os stuff 13 | # get_os_env() 14 | # # Do some numpy stuff 15 | # A = np.ones(3)*1 16 | # B = np.ones(3)*2 17 | # C = np.ones(3)*3 18 | # res = np.add(A,B,out=B) 19 | # res2 = np.divide(A,2,out=A) 20 | # res3 = np.negative(A,out=A) 21 | # res4 = np.multiply(A,B,out=A) 22 | 23 | # print(res) 24 | # print(f"this is the result 2 {res2}") 25 | # print(np.zeros(shape=(2, 3))) 26 | 27 | 28 | # if __name__ == "__main__": 29 | # main() 30 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.10 3 | ; mypy_path = ./src 4 | namespace_packages = True 5 | ; ignore_missing_imports = True 6 | ; follow_imports = normal 7 | no_deprecation_warning=True 8 | 9 | [mypy-torch.*] 10 | ignore_missing_imports = True 11 | 12 | [mypy-loguru.*] 13 | ignore_missing_imports = True 14 | 15 | [mypy-torch.optim.*] 16 | ignore_missing_imports = True 17 | 18 | [mypy-dpu_utils] 19 | ignore_missing_imports = True 20 | 21 | [mypy-dpu_utils.*] 22 | ignore_missing_imports = True 23 | 24 | [mypy-toolz] 25 | ignore_missing_imports = True 26 | 27 | [mypy-numpy] 28 | ignore_missing_imports = True 29 | 30 | [mypy-tensorflow.*] 31 | ignore_missing_imports = True 32 | 33 | [mypy-pyhocon.*] 34 | ignore_missing_imports = True 35 | 36 | [mypy-transformers.*] 37 | ignore_missing_imports = True 38 | 39 | [mypy-tensorboard.*] 40 | ignore_missing_imports = True 41 | 42 | [mypy-tensorboardX.*] 43 | ignore_missing_imports = True 44 | 45 | [mypy-pathos.*] 46 | ignore_missing_imports = True 47 | 48 | [mypy-docopt] 49 | ignore_missing_imports = True 50 | 51 | [mypy-pandas] 52 | ignore_missing_imports = True 53 | 54 | [mypy-tqdm] 55 | ignore_missing_imports = True 56 | 57 | [mypy-pygments.*] 58 | ignore_missing_imports = True 59 | 60 | [mypy-scipy.*] 61 | ignore_missing_imports = True 62 | 63 | [mypy-annoy] 64 | ignore_missing_imports = True 65 | 66 | [mypy-wandb] 67 | ignore_missing_imports = True 68 | 69 | [mypy-wandb.*] 70 | ignore_missing_imports = True 71 | 72 | [mypy-sklearn.*] 73 | ignore_missing_imports = True 74 | 75 | [mypy-matplotlib.*] 76 | ignore_missing_imports = True 77 | 78 | [mypy-tokenizers.*] 79 | ignore_missing_imports = True 80 | 81 | [mypy-sentence_transformers.*] 82 | ignore_missing_imports = True 83 | 84 | [mypy-tree_sitter.*] 85 | ignore_missing_imports = True 86 | -------------------------------------------------------------------------------- /pylama.ini: -------------------------------------------------------------------------------- 1 | [pylama] 2 | ;format = pylint 3 | skip = .tox/*,.env/*,.venv/*,.vscode/* 4 | ;linters = mccabe,pep257,pydocstyle,pep8,pycodestyle,pyflakes,pylint,isort,radon,eradicate 5 | linters = mccabe,pydocstyle,pycodestyle,pyflakes 6 | ;ignore = F0401,C0111,E731 7 | ignore = C0413,D212,D211,D203,R0903,C0330,D104,C0111,E1101,W0221,D406,D413,D407,W293,C901,D202,W291,D103,D100,D101,D107,D102,D400,E1102,C0103,C0411,R0913,R0914,R1719,W0212,C0412,R0902,W0102,E501,R0915,C0301,W0703,R1705,R0904,R0912,E203,W0640,R0911,R0201,D205,D415,W292,W503 8 | 9 | [pylama:*/__init__.py] 10 | ignore = 
W0611,W0401 11 | 12 | [pylama:tests/*.py] 13 | ignore = D104,D100 14 | 15 | [pylama:pycodestyle] 16 | max_line_length = 120 17 | 18 | [pylama:pylint] 19 | max_line_length = 120 20 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["poetry>=0.12"] 3 | build-backend = "poetry.masonry.api" 4 | 5 | [tool.poetry] 6 | name = "codenets" 7 | version = "0.2.0" 8 | description = "code & neural nets." 9 | authors = ["Voitot Pascal"] 10 | readme = "README.md" 11 | 12 | # packages = [ 13 | # { include = "src/**/*.py" }, 14 | # ] 15 | 16 | [tool.poetry.dependencies] 17 | python = "^3.10" 18 | numpy = "^1.24" 19 | torch = "^2.0.0" 20 | pandas = "^2.0.0" 21 | #tokenizers = "^0.2.1" 22 | transformers = "^4.27.0" 23 | loguru = "^0.6" 24 | docopt = "^0.6" 25 | dpu-utils = "^0.6" 26 | wandb = "^0.14" 27 | pathos = "^0.3" 28 | pyhocon = "^0.3.60" 29 | annoy = "^1.17" 30 | #tables = "^3.6.1" 31 | sentence_transformers = "^2.2" 32 | tree_sitter = "^0.20" 33 | # tree-sitter = { file = "../../tools/py-tree-sitter/tree_sitter-0.1.0_mandubian-cp37-cp37m-linux_x86_64.whl" } 34 | #pyarrow = "*" 35 | fastparquet = "^2023.2" 36 | # apex = "*" 37 | 38 | [tool.poetry.dev-dependencies] 39 | black = "*" 40 | pylama = "*" 41 | pytest = "*" 42 | mypy = "^1.1" 43 | jupyterlab = "*" 44 | matplotlib = "*" 45 | rope = "*" 46 | codecov = "*" 47 | pytest-cov = "*" 48 | pylint = "*" 49 | tensorboard = "*" 50 | tensorboardX = "*" 51 | 52 | 53 | [tool.black] 54 | line-length = 88 55 | exclude = ''' 56 | /( 57 | \.git 58 | | \.mypy_cache 59 | | \.tox 60 | | \.venv 61 | | \.pytest_cache 62 | | dist 63 | | build 64 | | docs 65 | )/ 66 | ''' 67 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | annoy==1.16.3 2 | azure-common==1.1.24 3 | azure-nspkg==3.0.2 4 | azure-storage==0.36.0 5 | boto3==1.12.2 6 | botocore==1.15.2 7 | certifi==2019.11.28 8 | cffi==1.14.0 9 | chardet==3.0.4 10 | click==7.0 11 | colorama==0.4.3; sys_platform == "win32" 12 | configparser==4.0.2 13 | cryptography==2.8 14 | dill==0.3.1.1 15 | docker-pycreds==0.4.0 16 | docopt==0.6.2 17 | docutils==0.15.2 18 | dpu-utils==0.2.8 19 | gitdb2==3.0.2 20 | gitpython==3.0.8 21 | gql==0.2.0 22 | graphql-core==1.1 23 | idna==2.8 24 | jmespath==0.9.4 25 | joblib==0.14.1 26 | loguru==0.3.2 27 | multiprocess==0.70.9 28 | numexpr==2.7.1 29 | numpy==1.18.1 30 | nvidia-ml-py3==7.352.0 31 | pandas==0.25.3 32 | pathos==0.2.5 33 | pathtools==0.1.2 34 | pox==0.2.7 35 | ppft==1.6.6.1 36 | promise==2.3 37 | psutil==5.7.0 38 | pycparser==2.19 39 | pyhocon==0.3.54 40 | pyparsing==2.4.6 41 | python-dateutil==2.8.1 42 | pytz==2019.3 43 | pyyaml==5.3 44 | regex==2020.2.18 45 | requests==2.22.0 46 | s3transfer==0.3.3 47 | sacremoses==0.0.38 48 | sentencepiece==0.1.85 49 | sentry-sdk==0.14.1 50 | setsimilaritysearch==0.1.7 51 | shortuuid==0.5.0 52 | six==1.14.0 53 | smmap2==2.0.5 54 | subprocess32==3.5.4 55 | tables==3.6.1 56 | tokenizers==0.2.1 57 | torch==1.4.0 58 | tqdm==4.43.0 59 | transformers==2.3.0 60 | urllib3==1.25.8 61 | wandb==0.8.27 62 | watchdog==0.10.2 63 | win32-setctime==1.0.1; sys_platform == "win32" 64 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mandubian/codenets/f24b92d9208a62725c2d321366873be715179356/test/__init__.py -------------------------------------------------------------------------------- /test/conf/default.conf: -------------------------------------------------------------------------------- 1 | 2 | lang_ids { 3 | php = 0 4 | python = 1 5 | ruby = 2 6 | java = 3 7 | go = 4 8 | javascript = 5 9 | } 10 | 11 | common_vocab_size = 10000 12 | 13 | bert { 14 | hidden_size = 128 15 | vocab_size = ${common_vocab_size} 16 | intermediate_size = 512 17 | num_hidden_layers = 3 18 | num_attention_heads = 8 19 | } 20 | 21 | tokenizers { 22 | type = "TOKENIZER_TYPE" 23 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 24 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 25 | } 26 | 27 | dataset { 28 | root_dir = ${HOME}"/workspaces/tools/CodeSearchNet/src" 29 | common_params { 30 | fraction_using_func_name=0.1 31 | min_len_func_name_for_query=12 32 | use_subtokens=False 33 | mark_subtoken_end=False 34 | code_max_num_tokens=200 35 | query_max_num_tokens=30 36 | use_bpe=True 37 | vocab_size=${common_vocab_size} 38 | pct_bpe=0.5 39 | vocab_count_threshold=10 40 | lang_ids = ${lang_ids} 41 | do_lowercase = true 42 | special_tokens = [""] 43 | parallelize = true 44 | use_lang_weights = False 45 | } 46 | 47 | train { 48 | dirs = ${dataset.root_dir}"/data_dirs_train.txt" 49 | params = ${dataset.common_params} 50 | } 51 | 52 | val { 53 | dirs = ${dataset.root_dir}"/data_dirs_valid.txt" 54 | params = ${dataset.common_params} 55 | } 56 | 57 | test { 58 | dirs = ${dataset.root_dir}"/data_dirs_test.txt" 59 | params = ${dataset.common_params} 60 | } 61 | 62 | queries_file = ${dataset.root_dir}"/queries.csv" 63 | } 64 | 65 | 66 | training { 67 | # The name of current experiment (can have several runs) 68 | name = "EXPERIMENT_NAME" 69 | # The unique id of current run 70 | iteration = "UNIQUE_RUN_ID" 71 | # The ID used to identify the pre-built pickled files 72 | # using the tokenizer defined above 73 | tokenizer_type = "TOKENIZER_ID" 74 | 75 | # Set that to true to test your run without slow-loading train dataset 76 | short_circuit = false 77 | 78 | device = "cuda" 79 | # deactivate wandb & tensorboard 80 | wandb = true 81 | tensorboard = true 82 | 83 | model { 84 | # IMPORTANT: the class representing Training Context 85 | training_ctx_class = "codenets.codesearchnet.query_1_code_1.training_ctx.Query1Code1Ctx" 86 | output_size = 64 87 | query_encoder { 88 | hidden_size = ${training.model.output_size} 89 | vocab_size = ${common_vocab_size} 90 | intermediate_size = 512 91 | num_hidden_layers = 3 92 | num_attention_heads = 8 93 | } 94 | code_encoder { 95 | hidden_size = ${training.model.output_size} 96 | vocab_size = ${common_vocab_size} 97 | intermediate_size = 512 98 | num_hidden_layers = 6 99 | num_attention_heads = 8 100 | } 101 | } 102 | 103 | # Training Hyper-Parameters 104 | seed = 0 105 | lr = 0.0001 106 | max_grad_norm = 1.0 107 | min_log_interval = 50 108 | start_epoch = 0 109 | epochs = 10 110 | 111 | batch_size { 112 | train = 256 113 | val = 256 114 | test = 256 115 | } 116 | 117 | loss { 118 | type = "softmax_cross_entropy" 119 | margin = 1.0 120 | } 121 | 122 | # Paths 123 | pickle_path = "./pickles" 124 | output_dir = "./checkpoints" 125 | tensorboard_path = "./runs" 126 | 127 | } -------------------------------------------------------------------------------- /test/conf/test.conf: -------------------------------------------------------------------------------- 1 | 
include "./default.conf" 2 | 3 | common_vocab_size = 30000 # 5 lang + 1 query * 5000 4 | 5 | 6 | tokenizers { 7 | type = "qc_30k" 8 | build_path = "./build_tokenizers/with_lang_"${tokenizers.type} 9 | token_files = "./build_tokenizers/token_files_"${tokenizers.type} 10 | } 11 | 12 | dataset { 13 | common_params { 14 | parallelize = false 15 | do_lowercase = true 16 | special_tokens = ["", "", ""] 17 | use_lang_weights = True 18 | } 19 | } 20 | 21 | training { 22 | short_circuit = false 23 | 24 | device = "cpu" 25 | wandb = false 26 | tensorboard = false 27 | 28 | name = "test" 29 | iteration = "2020_02_23_01_00" 30 | tokenizer_type = ${tokenizers.type} 31 | 32 | model { 33 | training_ctx_class = "codenets.codesearchnet.query_code_siamese.training_ctx.QueryCodeSiameseCtx" 34 | encoder { 35 | hidden_size = 32 36 | vocab_size = ${common_vocab_size} 37 | intermediate_size = 256 38 | num_hidden_layers = 2 39 | num_attention_heads = 8 40 | } 41 | } 42 | 43 | loss { 44 | type = "softmax_cross_entropy" 45 | } 46 | 47 | batch_size { 48 | train = 768 49 | val = 768 50 | test = 768 51 | } 52 | 53 | } -------------------------------------------------------------------------------- /test/test_recordable.py: -------------------------------------------------------------------------------- 1 | 2 | from pathlib import Path 3 | from typing import cast 4 | from codenets.recordable import DictRecordable 5 | import os 6 | import shutil 7 | import pytest 8 | from pyhocon import ConfigFactory 9 | 10 | from codenets.codesearchnet.training_ctx import CodeSearchTrainingContext 11 | from codenets.codesearchnet.query_code_siamese.training_ctx import QueryCodeSiameseCtx 12 | 13 | test_dir = Path("./tmp-test") 14 | cfg = Path("./test/conf/test.conf") 15 | 16 | 17 | @pytest.fixture(autouse=True) 18 | def run_before_and_after_tests(tmpdir): 19 | """Fixture to execute asserts before and after a test is run""" 20 | # Setup: fill with any logic you want 21 | os.mkdir(test_dir) 22 | 23 | yield # this is where the testing happens 24 | 25 | # Teardown : fill with any logic you want 26 | shutil.rmtree(test_dir) 27 | 28 | 29 | def test_dict_recordable(): 30 | d = DictRecordable({ 31 | 'toto': 1, 32 | 'tata': "titi", 33 | "tutu": 1.2345 34 | }) 35 | 36 | assert d.save(test_dir / "d") 37 | d2 = DictRecordable.load(test_dir / "d") 38 | assert d == d2 39 | 40 | 41 | def test_fullconf_recordable(): 42 | training_ctx = CodeSearchTrainingContext.build_context_from_hocon(ConfigFactory.parse_file(cfg)) 43 | assert training_ctx.save(test_dir / "f") 44 | 45 | training_ctx_2 = QueryCodeSiameseCtx.load(test_dir / "f") 46 | print("keys", training_ctx.keys(), training_ctx_2.keys()) 47 | assert training_ctx.keys() == training_ctx_2.keys() 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /wandb/settings: -------------------------------------------------------------------------------- 1 | [default] 2 | project = codenets 3 | 4 | --------------------------------------------------------------------------------