├── CodeBLEU ├── __init__.py ├── parser │ ├── my-languages.so │ ├── __pycache__ │ │ ├── DFG.cpython-37.pyc │ │ ├── DFG.cpython-38.pyc │ │ ├── utils.cpython-37.pyc │ │ ├── utils.cpython-38.pyc │ │ ├── __init__.cpython-37.pyc │ │ └── __init__.cpython-38.pyc │ ├── __init__.py │ ├── build.sh │ ├── build.py │ └── utils.py ├── __pycache__ │ ├── bleu2.cpython-38.pyc │ ├── utils.cpython-38.pyc │ ├── __init__.cpython-38.pyc │ ├── calc_code_bleu.cpython-38.pyc │ ├── dataflow_match.cpython-38.pyc │ ├── syntax_match.cpython-38.pyc │ └── weighted_ngram_match.cpython-38.pyc ├── readme.txt ├── keywords │ ├── c.txt │ ├── python.txt │ ├── javascript.txt │ ├── java.txt │ ├── php.txt │ ├── cpp.txt │ ├── cs.txt │ └── c_sharp.txt ├── run.py ├── syntax_match.py ├── calc_code_bleu.py ├── utils.py └── dataflow_match.py ├── compiler ├── test.exe ├── vendor │ ├── phpstan │ │ └── phpstan │ │ │ ├── conf │ │ │ └── bleedingEdge.neon │ │ │ ├── phpstan.phar │ │ │ ├── phpstan │ │ │ ├── composer.json │ │ │ ├── phpstan.phar.asc │ │ │ ├── LICENSE │ │ │ ├── bootstrap.php │ │ │ └── README.md │ ├── composer │ │ ├── autoload_psr4.php │ │ ├── autoload_namespaces.php │ │ ├── autoload_files.php │ │ ├── autoload_classmap.php │ │ ├── autoload_static.php │ │ ├── LICENSE │ │ ├── installed.php │ │ ├── installed.json │ │ └── autoload_real.php │ ├── autoload.php │ └── bin │ │ ├── phpstan │ │ └── phpstan.phar ├── __pycache__ │ ├── test.cpython-38.pyc │ ├── compilers.cpython-36.pyc │ ├── compilers.cpython-38.pyc │ ├── terminal_compiler.cpython-36.pyc │ └── terminal_compiler.cpython-38.pyc ├── test.php ├── test.py ├── test.cpp ├── test.java ├── test.cs ├── test.c ├── compilers.py └── terminal_compiler.py ├── images ├── PPOCoder_v4.gif └── ppocoder_overview.jpg ├── parser ├── my-languages.so ├── __pycache__ │ ├── DFG.cpython-37.pyc │ ├── DFG.cpython-38.pyc │ ├── DFG.cpython-39.pyc │ ├── utils.cpython-37.pyc │ ├── utils.cpython-38.pyc │ ├── utils.cpython-39.pyc │ ├── __init__.cpython-37.pyc │ ├── __init__.cpython-38.pyc │ └── __init__.cpython-39.pyc ├── __init__.py └── utils.py ├── code_prepro └── lang_processors │ ├── my-languages.so │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── __init__.cpython-38.pyc │ ├── c_processor.cpython-36.pyc │ ├── c_processor.cpython-38.pyc │ ├── cs_processor.cpython-36.pyc │ ├── cpp_processor.cpython-36.pyc │ ├── cpp_processor.cpython-38.pyc │ ├── java_processor.cpython-36.pyc │ ├── java_processor.cpython-38.pyc │ ├── lang_processor.cpython-36.pyc │ ├── lang_processor.cpython-38.pyc │ ├── php_processor.cpython-36.pyc │ ├── php_processor.cpython-38.pyc │ ├── csharp_processor.cpython-36.pyc │ ├── csharp_processor.cpython-38.pyc │ ├── python1_processor.cpython-36.pyc │ ├── python_processor.cpython-36.pyc │ ├── python_processor.cpython-38.pyc │ ├── javascript_processor.cpython-36.pyc │ ├── javascript_processor.cpython-38.pyc │ ├── tokenization_utils.cpython-36.pyc │ ├── tokenization_utils.cpython-38.pyc │ ├── tree_sitter_processor.cpython-36.pyc │ └── tree_sitter_processor.cpython-38.pyc │ ├── lang_processor.py │ ├── __init__.py │ ├── tokenization_utils.py │ ├── php_processor.py │ ├── csharp_processor.py │ ├── java_processor.py │ ├── javascript_processor.py │ ├── tree_sitter_processor.py │ ├── c_processor.py │ └── cpp_processor.py ├── data └── xlcost │ ├── README.md │ └── Java-C++ │ ├── test-C++-map.jsonl │ ├── test-Java-map.jsonl │ ├── val-C++-map.jsonl │ └── val-Java-map.jsonl ├── run.sh ├── LICENSE ├── model.py ├── bleu.py ├── requirements.txt ├── reward.py ├── compile_rl_experiments.py └── README.md 
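The tree above shows how the pieces of PPOCoder fit together: CodeBLEU/ scores a generated program against a reference (syntax, dataflow, and weighted n-gram match), compiler/ checks whether a hypothesis actually compiles in each target language, code_prepro/lang_processors/ tokenizes and detokenizes every supported language, and model.py, reward.py, and run.sh wire these signals into the PPO fine-tuning loop. The sketch below is only an illustration of how the two feedback signals dumped later in this listing (compile_prog in compiler/compilers.py and calc_code_bleu as called in CodeBLEU/run.py) might be combined; toy_reward, the file paths, and the path-based imports are assumptions made for the example, and the repository's real reward shaping lives in reward.py.

import sys
sys.path.append("compiler"); sys.path.append("CodeBLEU")  # assumed repo layout, as in the tree above
from compilers import compile_prog            # returns (stderr, stdout) of the language's compiler/interpreter
from calc_code_bleu import calc_code_bleu     # returns a CodeBLEU score, as called in CodeBLEU/run.py

def toy_reward(hyp_file, ref_file, compiler_lang, codebleu_lang):
    # Compile-time signal: an empty stderr is treated as a successful compile.
    err, _ = compile_prog(hyp_file, compiler_lang)        # e.g. 'CPP', 'Java', 'Py'
    compiles = 1.0 if err.strip() == "" else 0.0
    # Similarity signal against the reference translation.
    score = calc_code_bleu(ref_file, hyp_file, codebleu_lang, "CodeBLEU/keywords/")  # e.g. 'cpp'
    return compiles + score                               # illustrative combination, not reward.py

# Hypothetical usage: toy_reward("compiler/test.cpp", "test.gold", "CPP", "cpp")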
/CodeBLEU/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.1" 2 | -------------------------------------------------------------------------------- /compiler/test.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/compiler/test.exe -------------------------------------------------------------------------------- /images/PPOCoder_v4.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/images/PPOCoder_v4.gif -------------------------------------------------------------------------------- /parser/my-languages.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/parser/my-languages.so -------------------------------------------------------------------------------- /compiler/vendor/phpstan/phpstan/conf/bleedingEdge.neon: -------------------------------------------------------------------------------- 1 | includes: 2 | - phar://phpstan.phar/conf/bleedingEdge.neon 3 | -------------------------------------------------------------------------------- /images/ppocoder_overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/images/ppocoder_overview.jpg -------------------------------------------------------------------------------- /CodeBLEU/parser/my-languages.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/CodeBLEU/parser/my-languages.so -------------------------------------------------------------------------------- /parser/__pycache__/DFG.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/parser/__pycache__/DFG.cpython-37.pyc -------------------------------------------------------------------------------- /parser/__pycache__/DFG.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/parser/__pycache__/DFG.cpython-38.pyc -------------------------------------------------------------------------------- /parser/__pycache__/DFG.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/parser/__pycache__/DFG.cpython-39.pyc -------------------------------------------------------------------------------- /compiler/__pycache__/test.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/compiler/__pycache__/test.cpython-38.pyc -------------------------------------------------------------------------------- /parser/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/parser/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /parser/__pycache__/utils.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/parser/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /parser/__pycache__/utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/parser/__pycache__/utils.cpython-39.pyc -------------------------------------------------------------------------------- /CodeBLEU/__pycache__/bleu2.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/CodeBLEU/__pycache__/bleu2.cpython-38.pyc -------------------------------------------------------------------------------- /CodeBLEU/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/CodeBLEU/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /parser/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/parser/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /parser/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/parser/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /parser/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/parser/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /CodeBLEU/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/CodeBLEU/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /CodeBLEU/readme.txt: -------------------------------------------------------------------------------- 1 | python calc_code_bleu.py --refs reference_files --hyp candidate_file --language java ( or c_sharp) --params 0.25,0.25,0.25,0.25(default) -------------------------------------------------------------------------------- /code_prepro/lang_processors/my-languages.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/my-languages.so -------------------------------------------------------------------------------- /compiler/__pycache__/compilers.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/compiler/__pycache__/compilers.cpython-36.pyc -------------------------------------------------------------------------------- /compiler/__pycache__/compilers.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/compiler/__pycache__/compilers.cpython-38.pyc -------------------------------------------------------------------------------- /compiler/vendor/phpstan/phpstan/phpstan.phar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/compiler/vendor/phpstan/phpstan/phpstan.phar -------------------------------------------------------------------------------- /CodeBLEU/parser/__pycache__/DFG.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/CodeBLEU/parser/__pycache__/DFG.cpython-37.pyc -------------------------------------------------------------------------------- /CodeBLEU/parser/__pycache__/DFG.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/CodeBLEU/parser/__pycache__/DFG.cpython-38.pyc -------------------------------------------------------------------------------- /CodeBLEU/__pycache__/calc_code_bleu.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/CodeBLEU/__pycache__/calc_code_bleu.cpython-38.pyc -------------------------------------------------------------------------------- /CodeBLEU/__pycache__/dataflow_match.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/CodeBLEU/__pycache__/dataflow_match.cpython-38.pyc -------------------------------------------------------------------------------- /CodeBLEU/__pycache__/syntax_match.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/CodeBLEU/__pycache__/syntax_match.cpython-38.pyc -------------------------------------------------------------------------------- /CodeBLEU/parser/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/CodeBLEU/parser/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /CodeBLEU/parser/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/CodeBLEU/parser/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /compiler/test.php: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CodeBLEU/parser/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/CodeBLEU/parser/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /CodeBLEU/parser/__pycache__/__init__.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/CodeBLEU/parser/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /compiler/__pycache__/terminal_compiler.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/compiler/__pycache__/terminal_compiler.cpython-36.pyc -------------------------------------------------------------------------------- /compiler/__pycache__/terminal_compiler.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/compiler/__pycache__/terminal_compiler.cpython-38.pyc -------------------------------------------------------------------------------- /CodeBLEU/__pycache__/weighted_ngram_match.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/CodeBLEU/__pycache__/weighted_ngram_match.cpython-38.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/c_processor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/c_processor.cpython-36.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/c_processor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/c_processor.cpython-38.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/cs_processor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/cs_processor.cpython-36.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/cpp_processor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/cpp_processor.cpython-36.pyc -------------------------------------------------------------------------------- 
/code_prepro/lang_processors/__pycache__/cpp_processor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/cpp_processor.cpython-38.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/java_processor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/java_processor.cpython-36.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/java_processor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/java_processor.cpython-38.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/lang_processor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/lang_processor.cpython-36.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/lang_processor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/lang_processor.cpython-38.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/php_processor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/php_processor.cpython-36.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/php_processor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/php_processor.cpython-38.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/csharp_processor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/csharp_processor.cpython-36.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/csharp_processor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/csharp_processor.cpython-38.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/python1_processor.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/python1_processor.cpython-36.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/python_processor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/python_processor.cpython-36.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/python_processor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/python_processor.cpython-38.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/javascript_processor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/javascript_processor.cpython-36.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/javascript_processor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/javascript_processor.cpython-38.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/tokenization_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/tokenization_utils.cpython-36.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/tokenization_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/tokenization_utils.cpython-38.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/tree_sitter_processor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/tree_sitter_processor.cpython-36.pyc -------------------------------------------------------------------------------- /code_prepro/lang_processors/__pycache__/tree_sitter_processor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reddy-lab-code-research/PPOCoder/HEAD/code_prepro/lang_processors/__pycache__/tree_sitter_processor.cpython-38.pyc -------------------------------------------------------------------------------- /compiler/vendor/phpstan/phpstan/phpstan: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env php 2 | $vendorDir . 
'/phpstan/phpstan/bootstrap.php', 10 | ); 11 | -------------------------------------------------------------------------------- /compiler/vendor/composer/autoload_classmap.php: -------------------------------------------------------------------------------- 1 | $vendorDir . '/composer/InstalledVersions.php', 10 | ); 11 | -------------------------------------------------------------------------------- /compiler/test.py: -------------------------------------------------------------------------------- 1 | class Solution ( object ) : 2 | def numberOfArithmeticSlices ( self , A ) : 3 | def numberOfArithmeticSlices2 ( self , A ) : 4 | result = 0 5 | for i in xrange ( 2 , len ( A ) ) : 6 | if A [ i ] - A [ i - 1 ] == A [ i - 1 ] - A [ i - 2 ] : 7 | result += i 8 | return result 9 | -------------------------------------------------------------------------------- /parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import (remove_comments_and_docstrings, 2 | tree_to_token_index, 3 | tree_to_token_nodes, 4 | index_to_code_token, 5 | tree_to_variable_index, 6 | detokenize_code) 7 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp -------------------------------------------------------------------------------- /CodeBLEU/keywords/c.txt: -------------------------------------------------------------------------------- 1 | auto 2 | else 3 | long 4 | switch 5 | break 6 | enum 7 | register 8 | typedef 9 | case 10 | extern 11 | return 12 | union 13 | char 14 | float 15 | short 16 | unsigned 17 | const 18 | for 19 | signed 20 | void 21 | continue 22 | goto 23 | sizeof 24 | volatile 25 | default 26 | if 27 | static 28 | while 29 | do 30 | int 31 | struct 32 | _Packed 33 | double -------------------------------------------------------------------------------- /CodeBLEU/parser/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | 4 | from .utils import (remove_comments_and_docstrings, 5 | tree_to_token_index, 6 | index_to_code_token, 7 | tree_to_variable_index) 8 | from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp -------------------------------------------------------------------------------- /CodeBLEU/keywords/python.txt: -------------------------------------------------------------------------------- 1 | False 2 | None 3 | True 4 | and 5 | as 6 | assert 7 | async 8 | await 9 | break 10 | class 11 | continue 12 | def 13 | del 14 | elif 15 | else 16 | except 17 | finally 18 | for 19 | from 20 | global 21 | if 22 | import 23 | in 24 | is 25 | lambda 26 | nonlocal 27 | not 28 | or 29 | pass 30 | raise 31 | return 32 | try 33 | while 34 | with 35 | yield 36 | -------------------------------------------------------------------------------- /CodeBLEU/parser/build.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/tree-sitter/tree-sitter-go 2 | git clone https://github.com/tree-sitter/tree-sitter-javascript 3 | git clone https://github.com/tree-sitter/tree-sitter-python 4 | git clone https://github.com/tree-sitter/tree-sitter-ruby 5 | git clone https://github.com/tree-sitter/tree-sitter-php 6 | git clone https://github.com/tree-sitter/tree-sitter-java 7 | git clone https://github.com/tree-sitter/tree-sitter-c-sharp 8 | python build.py 9 | -------------------------------------------------------------------------------- /compiler/vendor/phpstan/phpstan/composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "phpstan/phpstan", 3 | "description": "PHPStan - PHP Static Analysis Tool", 4 | "license": ["MIT"], 5 | "require": { 6 | "php": "^7.1|^8.0" 7 | }, 8 | "conflict": { 9 | "phpstan/phpstan-shim": "*" 10 | }, 11 | "bin": [ 12 | "phpstan", 13 | "phpstan.phar" 14 | ], 15 | "extra": { 16 | "branch-alias": { 17 | "dev-master": "1.4-dev" 18 | } 19 | }, 20 | "autoload": { 21 | "files": ["bootstrap.php"] 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /CodeBLEU/keywords/javascript.txt: -------------------------------------------------------------------------------- 1 | await 2 | break 3 | case 4 | catch 5 | class 6 | const 7 | continue 8 | debugger 9 | default 10 | delete 11 | do 12 | else 13 | enum 14 | export 15 | extends 16 | false 17 | finally 18 | for 19 | function 20 | if 21 | implements 22 | import 23 | in 24 | instanceof 25 | interface 26 | let 27 | new 28 | null 29 | package 30 | private 31 | protected 32 | public 33 | return 34 | super 35 | switch 36 | static 37 | this 38 | throw 39 | try 40 | True 41 | typeof 42 | var 43 | void 44 | while 45 | with 46 | yield -------------------------------------------------------------------------------- /CodeBLEU/parser/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 
3 | 4 | from tree_sitter import Language, Parser 5 | 6 | Language.build_library( 7 | # Store the library in the `build` directory 8 | 'my-languages.so', 9 | 10 | # Include one or more languages 11 | [ 12 | 'tree-sitter-go', 13 | 'tree-sitter-javascript', 14 | 'tree-sitter-python', 15 | 'tree-sitter-php', 16 | 'tree-sitter-java', 17 | 'tree-sitter-ruby', 18 | 'tree-sitter-c-sharp', 19 | ] 20 | ) 21 | 22 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | # Example of a run for Java-C++ Translation 2 | python rl_run.py \ 3 | --run 1 \ 4 | --l1 java \ 5 | --l2 cpp \ 6 | --asp 5 \ 7 | --ns 10 \ 8 | --data_path PPOCodder/data/ \ 9 | --output_path PPOCodder/saved_models/ \ 10 | --load_model_path PPOCodder/baselines/saved_models/java-cpp/pytorch_model.bin \ 11 | --baseline_out_dir /PPOCodder/baselines/saved_models/java-cpp/ \ 12 | --max_source_length 400 \ 13 | --max_target_length 400 \ 14 | --train_batch_size 32 \ 15 | --test_batch_size 48 \ 16 | --lr 1e-6 \ 17 | --kl_coef 0.1 \ 18 | --kl_target 1 \ 19 | --vf_coef 1e-3 20 | -------------------------------------------------------------------------------- /CodeBLEU/keywords/java.txt: -------------------------------------------------------------------------------- 1 | abstract 2 | assert 3 | boolean 4 | break 5 | byte 6 | case 7 | catch 8 | char 9 | class 10 | const 11 | continue 12 | default 13 | do 14 | double 15 | else 16 | enum 17 | extends 18 | final 19 | finally 20 | float 21 | for 22 | goto 23 | if 24 | implements 25 | import 26 | instanceof 27 | int 28 | interface 29 | long 30 | native 31 | new 32 | package 33 | private 34 | protected 35 | public 36 | return 37 | short 38 | static 39 | strictfp 40 | super 41 | switch 42 | synchronized 43 | this 44 | throw 45 | throws 46 | transient 47 | try 48 | void 49 | volatile 50 | while 51 | -------------------------------------------------------------------------------- /compiler/test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | using namespace std ; 9 | class Solution { 10 | public : int maxSteps ( int n ) { 11 | int result = min ( n , max ( n , max ( n ) ) ) ; 12 | for ( int i = 1 ; 13 | i > 0 ; 14 | ++ i ) { 15 | if ( i % j == 0 ) { 16 | result = result - ( i / j ) ; 17 | break ; 18 | } 19 | else if ( ( i % j == 0 ) ) { 20 | result = result - ( i / j ) ; 21 | break ; 22 | } 23 | } 24 | return result ; 25 | } 26 | }; 27 | int main ( ) { 28 | } 29 | }; 30 | } 31 | -------------------------------------------------------------------------------- /CodeBLEU/keywords/php.txt: -------------------------------------------------------------------------------- 1 | __halt_compiler 2 | abstract 3 | and 4 | array 5 | as 6 | break 7 | callable 8 | case 9 | catch 10 | class 11 | clone 12 | const 13 | continue 14 | declare 15 | default 16 | die 17 | do 18 | echo 19 | else 20 | elseif 21 | empty 22 | enddeclare 23 | endfor 24 | endforeach 25 | endif 26 | endswitch 27 | endwhile 28 | eval 29 | exit 30 | extends 31 | final 32 | for 33 | foreach 34 | function 35 | global 36 | goto 37 | if 38 | implements 39 | include 40 | include_once 41 | instanceof 42 | insteadof 43 | interface 44 | isset 45 | list 46 | namespace 47 | new 48 | or 49 | print 50 | private 51 | protected 52 | public 53 | require 54 | require_once 55 | return 56 | static 57 | switch 58 | throw 59 | 
trait 60 | try 61 | unset 62 | use 63 | var 64 | while 65 | xor -------------------------------------------------------------------------------- /compiler/vendor/composer/autoload_static.php: -------------------------------------------------------------------------------- 1 | __DIR__ . '/..' . '/phpstan/phpstan/bootstrap.php', 11 | ); 12 | 13 | public static $classMap = array ( 14 | 'Composer\\InstalledVersions' => __DIR__ . '/..' . '/composer/InstalledVersions.php', 15 | ); 16 | 17 | public static function getInitializer(ClassLoader $loader) 18 | { 19 | return \Closure::bind(function () use ($loader) { 20 | $loader->classMap = ComposerStaticInit93012ad143650ef6581a183b8b883f71::$classMap; 21 | 22 | }, null, ClassLoader::class); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /compiler/vendor/phpstan/phpstan/phpstan.phar.asc: -------------------------------------------------------------------------------- 1 | -----BEGIN PGP SIGNATURE----- 2 | 3 | iQIzBAABCgAdFiEE0yaA1ZV9xxFr4pwUzxoQjQ565yAFAmIiDjsACgkQzxoQjQ56 4 | 5yCkOw/8DAIkYvPVfGdsBUGCVYSrnLY7mrmIlFYxMeI+KVXR2yS7rIgn4SLqLGc2 5 | rDCoEj8gFztEank90KuFR1D7UUwGs5N0OkqdJmmF2Sb5T2sOzvB2E3Zna9mVTXuG 6 | fSRSsb1SCBH4esZ570BnZGIMdmejUDBlTwQ3vQ0VD4SJ0prTdwyYbl0JyvwZ5rSB 7 | 1Y9z/u9qIOKohQEWsp3S4AoYh/fqFKZZd/JDk0Ou7i1Fw9TGb2YrPlI9777D5cxu 8 | tZo0Ajt24oo+g3zWSxJtcNFz58GnCTHkJEoSDibFuvfq7uH7q6gOigUmWeME5UKx 9 | eqc3oQW1OQawylCmdr9PrXeGmUwchfhtH8PtPFJRgGOn3D9Hv1LMoBP8S9K3hljz 10 | vFLgv1GNXKbajrpxWXaR316J72LH+PreYaT76843ip4XxlkpBYeiH9lRDxAanK8p 11 | EYUgZkkWsOSIkzqlsJxHBKPP6lmA+zK8GTWiV7NCr9qg+JJECX6JH3K02F/ngn2B 12 | GVlxiEaVBccSWP+rCRilfv9sXEDfzWfe0dxft5CgTcKiFQKNeJ5oVT5t/amCH8bz 13 | dIj15gsto/2yyQ1LWZDPYh/xZ/J8kE7ab7DCpINHWknRjs0Sr6wqFoRTZ0leUEH/ 14 | ddmSKb3rwa9o/VyyHBn8ZgHadf8gRPGd8vK1fXiMcoRKIe4doc0= 15 | =aFxl 16 | -----END PGP SIGNATURE----- 17 | -------------------------------------------------------------------------------- /compiler/test.java: -------------------------------------------------------------------------------- 1 | import java . io . * ; 2 | class GFG { 3 | static int countSubarray ( int arr [ ] , int K , int N ) { 4 | if ( K % 2 != 0 ) return 0 ; 5 | if ( N < K ) return 0 ; 6 | int start = 0 ; 7 | int i = 0 ; 8 | int count = 0 ; 9 | int currXor = arr [ i ] ; 10 | i ++ ; 11 | while ( i < K ) { 12 | currXor ^= arr [ i ] ; 13 | i ++ ; 14 | } 15 | if ( currXor == 0 ) count ++ ; 16 | currXor ^= arr [ start ] ; 17 | start ++ ; 18 | while ( i < N ) { 19 | currXor ^= arr [ i ] ; 20 | i ++ ; 21 | if ( currXor == 0 ) count ++ ; 22 | currXor ^= arr [ start ] ; 23 | start ++ ; 24 | } 25 | return count ; 26 | } 27 | public static void main ( String args [ ] ) { 28 | int arr [ ] = { 29 | 2 , 4 , 4 , 2 , 2 , 4 }; 30 | int K = 4 ; 31 | int N = arr . length ; 32 | System . out . 
print ( countSubarray ( arr , K , N ) ) ; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /CodeBLEU/keywords/cpp.txt: -------------------------------------------------------------------------------- 1 | auto 2 | const 3 | double 4 | float 5 | int 6 | short 7 | struct 8 | unsigned 9 | break 10 | continue 11 | else 12 | for 13 | long 14 | signed 15 | switch 16 | void 17 | case 18 | default 19 | enum 20 | goto 21 | register 22 | sizeof 23 | typedef 24 | volatile 25 | char 26 | do 27 | extern 28 | if 29 | return 30 | static 31 | union 32 | while 33 | asm 34 | dynamic_cast 35 | namespace 36 | reinterpret_cast 37 | bool 38 | explicit 39 | new 40 | static_cast 41 | typeid 42 | catch 43 | false 44 | try 45 | operator 46 | template 47 | typename 48 | class 49 | friend 50 | private 51 | this 52 | using 53 | const_cast 54 | inline 55 | public 56 | throw 57 | virtual 58 | delete 59 | mutable 60 | protected 61 | true 62 | wchar_t 63 | and 64 | bitand 65 | compl 66 | not_eq 67 | or_eq 68 | xor_eq 69 | and_eq 70 | bitor 71 | not 72 | or 73 | xor 74 | cin 75 | endl 76 | INT_MIN 77 | iomanip 78 | main 79 | npos 80 | std 81 | cout 82 | include 83 | INT_MAX 84 | iostream 85 | MAX_RAND 86 | NULL 87 | string 88 | -------------------------------------------------------------------------------- /compiler/test.cs: -------------------------------------------------------------------------------- 1 | using System ; 2 | using System . Collections . Generic ; 3 | public class MinSum { 4 | static List < int > minSqrNum ( int n ) { 5 | int [ ] arr = new int [ n + 1 ] ; 6 | int [ ] sqrNum = new int [ n + 1 ] ; 7 | List < int > v = new List < int > ( ) ; 8 | for ( int i = 0 ; 9 | i <= n ; 10 | i ++ ) { 11 | arr [ i ] = arr [ i - 1 ] + 1 ; 12 | sqrNum [ i ] = 1 ; 13 | int k = 1 ; 14 | while ( k * k <= i ) { 15 | if ( arr [ i ] > arr [ i - k * k ] + 1 ) { 16 | arr [ i ] = arr [ i - k * k ] + 1 ; 17 | sqrNum [ i ] = k * k ; 18 | } 19 | k ++ ; 20 | } 21 | } 22 | while ( n > 0 ) { 23 | v . Add ( sqrNum [ n ] ) ; 24 | n -= sqrNum [ n ] ; 25 | } 26 | return v ; 27 | } 28 | static public void Main ( String [ ] args ) { 29 | int n = 10 ; 30 | v = minSqrNum ( n ) ; 31 | for ( int i = 0 ; 32 | i < v . Count ; 33 | i ++ ) Console . Write ( v [ i ] ) ; 34 | if ( i < v . Count - 1 ) Console . Write ( "_+_" ) ; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 reddy-lab-code-research 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /compiler/vendor/composer/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Copyright (c) Nils Adermann, Jordi Boggiano 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is furnished 9 | to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. 21 | 22 | -------------------------------------------------------------------------------- /compiler/vendor/phpstan/phpstan/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Ondřej Mirtes 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /compiler/test.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | int flip ( int arr [ ] , int i ) { 4 | int temp ; 5 | temp = arr [ start ] ; 6 | arr [ start ] = arr [ i ] ; 7 | arr [ i ] = temp ; 8 | start ++ ; 9 | i -- ; 10 | } 11 | int findMax ( int arr [ ] , int n ) { 12 | int mi = 0 ; 13 | for ( int i = 0 ; 14 | i < n ; 15 | i ++ ) { 16 | if ( arr [ i ] > arr [ mi ] ) mi = i ; 17 | else mi = i ; 18 | } 19 | return mi ; 20 | } 21 | int pancakeSort ( int arr [ ] , int n ) { 22 | int curr_size = n ; 23 | while ( curr_size > 1 ) { 24 | int mi = findMax ( arr , curr_size ) ; 25 | if ( mi != curr_size - 1 ) flip ( arr , mi ) ; 26 | flip ( arr , curr_size - 1 ) ; 27 | curr_size -- ; 28 | } 29 | } 30 | } 31 | void printArray ( int arr [ ] , int n ) { 32 | int i , temp , n ) ; 33 | printf ( "%d" , temp ) ; 34 | printf ( "Sorted_Array_" ) ; 35 | printArray ( arr , n ) ; 36 | } 37 | int main ( ) { 38 | int arr [ ] = { 39 | 23 , 10 , 20 , 11 , 12 , 6 , 7 }; 40 | int n = sizeof ( arr ) / sizeof ( arr [ 0 ] ) ; 41 | pancakeSort ( arr , n ) ; 42 | printf ( "Sorted_Array_" ) ; 43 | printArray ( arr , n ) ; 44 | return 0 ; 45 | } 46 | -------------------------------------------------------------------------------- /compiler/vendor/composer/installed.php: -------------------------------------------------------------------------------- 1 | array( 3 | 'pretty_version' => 'dev-main', 4 | 'version' => 'dev-main', 5 | 'type' => 'library', 6 | 'install_path' => __DIR__ . '/../../', 7 | 'aliases' => array(), 8 | 'reference' => 'cc9ad26af93a6c73c53bc3f6150bc25499e489be', 9 | 'name' => '__root__', 10 | 'dev' => true, 11 | ), 12 | 'versions' => array( 13 | '__root__' => array( 14 | 'pretty_version' => 'dev-main', 15 | 'version' => 'dev-main', 16 | 'type' => 'library', 17 | 'install_path' => __DIR__ . '/../../', 18 | 'aliases' => array(), 19 | 'reference' => 'cc9ad26af93a6c73c53bc3f6150bc25499e489be', 20 | 'dev_requirement' => false, 21 | ), 22 | 'phpstan/phpstan' => array( 23 | 'pretty_version' => '1.4.8', 24 | 'version' => '1.4.8.0', 25 | 'type' => 'library', 26 | 'install_path' => __DIR__ . 
'/../phpstan/phpstan', 27 | 'aliases' => array(), 28 | 'reference' => '2a6d6704b17c4db6190cc3104056c0aad740cb15', 29 | 'dev_requirement' => true, 30 | ), 31 | ), 32 | ); 33 | -------------------------------------------------------------------------------- /CodeBLEU/keywords/cs.txt: -------------------------------------------------------------------------------- 1 | abstract 2 | as 3 | base 4 | bool 5 | break 6 | byte 7 | case 8 | catch 9 | char 10 | checked 11 | class 12 | const 13 | continue 14 | decimal 15 | default 16 | delegate 17 | do 18 | double 19 | else 20 | enum 21 | event 22 | explicit 23 | extern 24 | false 25 | finally 26 | fixed 27 | float 28 | for 29 | foreach 30 | goto 31 | if 32 | implicit 33 | in 34 | int 35 | interface 36 | internal 37 | is 38 | lock 39 | long 40 | namespace 41 | new 42 | null 43 | object 44 | operator 45 | out 46 | override 47 | params 48 | private 49 | protected 50 | public 51 | readonly 52 | ref 53 | return 54 | sbyte 55 | sealed 56 | short 57 | sizeof 58 | stackalloc 59 | static 60 | string 61 | struct 62 | switch 63 | this 64 | throw 65 | true 66 | try 67 | typeof 68 | uint 69 | ulong 70 | unchecked 71 | unsafe 72 | ushort 73 | using 74 | virtual 75 | void 76 | volatile 77 | while 78 | add 79 | alias 80 | ascending 81 | async 82 | await 83 | by 84 | descending 85 | dynamic 86 | equals 87 | from 88 | get 89 | global 90 | group 91 | into 92 | join 93 | let 94 | nameof 95 | notnull 96 | on 97 | orderby 98 | partial 99 | remove 100 | select 101 | set 102 | unmanaged 103 | value 104 | var 105 | when 106 | where 107 | yield 108 | -------------------------------------------------------------------------------- /CodeBLEU/keywords/c_sharp.txt: -------------------------------------------------------------------------------- 1 | abstract 2 | as 3 | base 4 | bool 5 | break 6 | byte 7 | case 8 | catch 9 | char 10 | checked 11 | class 12 | const 13 | continue 14 | decimal 15 | default 16 | delegate 17 | do 18 | double 19 | else 20 | enum 21 | event 22 | explicit 23 | extern 24 | false 25 | finally 26 | fixed 27 | float 28 | for 29 | foreach 30 | goto 31 | if 32 | implicit 33 | in 34 | int 35 | interface 36 | internal 37 | is 38 | lock 39 | long 40 | namespace 41 | new 42 | null 43 | object 44 | operator 45 | out 46 | override 47 | params 48 | private 49 | protected 50 | public 51 | readonly 52 | ref 53 | return 54 | sbyte 55 | sealed 56 | short 57 | sizeof 58 | stackalloc 59 | static 60 | string 61 | struct 62 | switch 63 | this 64 | throw 65 | true 66 | try 67 | typeof 68 | uint 69 | ulong 70 | unchecked 71 | unsafe 72 | ushort 73 | using 74 | virtual 75 | void 76 | volatile 77 | while 78 | add 79 | alias 80 | ascending 81 | async 82 | await 83 | by 84 | descending 85 | dynamic 86 | equals 87 | from 88 | get 89 | global 90 | group 91 | into 92 | join 93 | let 94 | nameof 95 | notnull 96 | on 97 | orderby 98 | partial 99 | remove 100 | select 101 | set 102 | unmanaged 103 | value 104 | var 105 | when 106 | where 107 | yield 108 | -------------------------------------------------------------------------------- /CodeBLEU/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import syntax_match 3 | import dataflow_match 4 | from calc_code_bleu import calc_code_bleu 5 | 6 | l1 = 'cpp' 7 | l2 = 'python' 8 | direct_path = "/home/grads/parshinshojaee/trl_code/trl_code/" 9 | output_dir = direct_path + 'saved_models/codet5/saved_models/'+l1+'-'+l2 10 | class Args(): 11 | def __init__(self): 12 | self.max_source_length = 
320#400 13 | self.max_target_length = 320#400 14 | self.train_batch_size = 16 15 | self.output_dir = output_dir 16 | self.reward_id = 2 17 | self.run = 4 18 | self.loss_W = 10 19 | self.lr = 1e-6 20 | self.kl_coef = 1 21 | self.reward_W = 0.01 22 | self.epoch = 3 23 | if not(os.path.exists(self.output_dir)): 24 | os.makedirs(self.output_dir) 25 | args = Args() 26 | path = args.output_dir + '/codet5_ppo' + '_reward%d'%(args.reward_id) + '_bs%d'%(args.train_batch_size) + '_in-len%d'%(args.max_source_length) + '_out-len%d'%(args.max_target_length) +'_r%d/'%(args.run) 27 | references = os.path.join(path, "test.gold_ep%d"%(args.epoch) ) 28 | hypothesis = os.path.join(path, "test.model_ep%d"%(args.epoch) ) 29 | lang = l2 30 | keywords_dir = 'CodeBLEU/keywords/' 31 | 32 | codebleu_score = calc_code_bleu(references, hypothesis, lang, keywords_dir) 33 | breakpoint() 34 | -------------------------------------------------------------------------------- /code_prepro/lang_processors/lang_processor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | from abc import ABC 8 | 9 | 10 | class LangProcessor(ABC): 11 | processors = {} 12 | 13 | @classmethod 14 | def __init_subclass__(cls): 15 | super().__init_subclass__() 16 | assert ( 17 | len(cls.__name__.lower().split("processor")) == 2 18 | and cls.__name__.lower().split("processor")[1] == "" 19 | ), "language processors class name should be that format : YourlanguageProcessor" 20 | cls.processors[cls.__name__.lower().split("processor")[0]] = cls 21 | 22 | def tokenize_code(self, code, keep_comments=False, process_strings=True): 23 | raise NotImplementedError 24 | 25 | def detokenize_code(self, code): 26 | raise NotImplementedError 27 | 28 | def obfuscate_code(self, code): 29 | raise NotImplementedError 30 | 31 | def extract_functions(self, code): 32 | raise NotImplementedError 33 | 34 | def extract_function_name(self, function): 35 | raise NotImplementedError 36 | 37 | def extract_arguments(self, function): 38 | raise NotImplementedError 39 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch.nn.functional as F 3 | import torch 4 | import sys 5 | 6 | sys.path.append('../') 7 | sys.path.append('../../') 8 | from transformers import T5ForConditionalGeneration 9 | 10 | 11 | class CodeT5HeadWithValueModel(nn.Module): 12 | def __init__(self): 13 | super().__init__() 14 | self.model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base') 15 | self.first_dropout = nn.Dropout(0.1) 16 | self.summary = nn.Linear(self.model.model_dim, 1) 17 | 18 | def load_base_model(self, load_model_path): 19 | self.model.load_state_dict(torch.load(load_model_path)) 20 | 21 | def forward(self, input_ids, attention_mask=None, labels=None, decoder_attention_mask=None): 22 | outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, 23 | decoder_attention_mask=decoder_attention_mask, output_hidden_states=True) 24 | hidden_states = outputs.decoder_hidden_states[-1] 25 | value = self.summary(self.first_dropout(hidden_states)).squeeze(-1) 26 | outputs = (outputs.logits, outputs, value) 27 | return outputs 28 | 29 | 30 | def 
respond_to_batch(model, source_ids, attention_mask, max_target_length=400, top_k=5, top_p=1.0): 31 | 32 | preds = model.model.generate(source_ids, attention_mask=attention_mask, do_sample=True, top_k=top_k, top_p=top_p, 33 | max_length=max_target_length) 34 | # preds = model.module.model.generate(source_ids, attention_mask=attention_mask, do_sample=True, top_k=top_k, top_p=top_p, 35 | # max_length=max_target_length) 36 | return preds 37 | -------------------------------------------------------------------------------- /compiler/vendor/phpstan/phpstan/bootstrap.php: -------------------------------------------------------------------------------- 1 | loadClass($class); 32 | 33 | return; 34 | } 35 | if (strpos($class, 'PHPStan\\') !== 0 || strpos($class, 'PHPStan\\PhpDocParser\\') === 0) { 36 | return; 37 | } 38 | 39 | if (!in_array('phar', stream_get_wrappers(), true)) { 40 | throw new \Exception('Phar wrapper is not registered. Please review your php.ini settings.'); 41 | } 42 | 43 | $filename = str_replace('\\', DIRECTORY_SEPARATOR, $class); 44 | if (strpos($class, 'PHPStan\\BetterReflection\\') === 0) { 45 | $filename = substr($filename, strlen('PHPStan\\BetterReflection\\')); 46 | $filepath = 'phar://' . __DIR__ . '/phpstan.phar/vendor/ondrejmirtes/better-reflection/src/' . $filename . '.php'; 47 | } else { 48 | $filename = substr($filename, strlen('PHPStan\\')); 49 | $filepath = 'phar://' . __DIR__ . '/phpstan.phar/src/' . $filename . '.php'; 50 | } 51 | 52 | if (!file_exists($filepath)) { 53 | return; 54 | } 55 | 56 | require $filepath; 57 | } 58 | } 59 | 60 | spl_autoload_register([PharAutoloader::class, 'loadClass']); 61 | -------------------------------------------------------------------------------- /code_prepro/lang_processors/__init__.py: -------------------------------------------------------------------------------- 1 | import code_prepro.lang_processors.cpp_processor 2 | import code_prepro.lang_processors.java_processor 3 | import code_prepro.lang_processors.python_processor 4 | import code_prepro.lang_processors.csharp_processor 5 | import code_prepro.lang_processors.c_processor 6 | import code_prepro.lang_processors.php_processor 7 | import code_prepro.lang_processors.javascript_processor 8 | from code_prepro.lang_processors.lang_processor import LangProcessor 9 | 10 | def get_detokenizer(lang): 11 | processor = LangProcessor.processors[lang](root_folder=so_path) 12 | tokenizer = processor.detokenize_code 13 | return tokenizer 14 | 15 | def get_tokenizer(lang): 16 | processor = LangProcessor.processors[lang](root_folder=so_path) 17 | tokenizer = processor.tokenize_code 18 | return tokenizer 19 | 20 | so_path = "./code_prepro/lang_processors/" 21 | lang_py = 'python' 22 | lang_java = 'java' 23 | lang_cs = 'csharp' 24 | lang_cpp = 'cpp' 25 | lang_c = 'c' 26 | lang_php = 'php' 27 | lang_js = 'javascript' 28 | 29 | file_extensions = {"Java": ".java", "C++": ".cpp", "C": ".c", "Python": ".py","Javascript": ".js", 30 | "PHP":".php", "C#":".cs"} 31 | lang_lower = {"Java": "java", "C++": "cpp", "C": "c", "Python": "python","Javascript": "javascript", 32 | "PHP":"php", "C#":"csharp"} 33 | lang_upper = {"java": "Java", "cpp": "C++", "c": "C", "python": "Python","javascript": "Javascript", 34 | "php":"PHP", "csharp":"C#"} 35 | tags = ['train', 'val', 'test'] 36 | 37 | 38 | py_tokenizer = get_tokenizer(lang_py) 39 | cs_tokenizer = get_tokenizer(lang_cs) 40 | java_tokenizer = get_tokenizer(lang_java) 41 | cpp_tokenizer = get_tokenizer(lang_cpp) 42 | js_tokenizer = get_tokenizer(lang_js) 
43 | c_tokenizer = get_tokenizer(lang_c) 44 | # php_tokenizer = get_tokenizer(lang_php) 45 | php_tokenizer = c_tokenizer 46 | 47 | py_detokenizer = get_detokenizer(lang_py) 48 | cs_detokenizer = get_detokenizer(lang_cs) 49 | java_detokenizer = get_detokenizer(lang_java) 50 | cpp_detokenizer = get_detokenizer(lang_cpp) 51 | js_detokenizer = get_detokenizer(lang_js) 52 | c_detokenizer = get_detokenizer(lang_c) 53 | # php_tokenizer = get_detokenizer(lang_php) 54 | php_detokenizer = c_detokenizer 55 | 56 | file_tokenizers = {"Java": java_tokenizer, "C++": cpp_tokenizer, "C": c_tokenizer, "Python": py_tokenizer, 57 | "Javascript": js_tokenizer, "PHP": php_tokenizer, "C#": cs_tokenizer} 58 | file_detokenizers = {"Java": java_detokenizer, "C++": cpp_detokenizer, "C": c_detokenizer, "Python": py_detokenizer, 59 | "Javascript": js_detokenizer, "PHP": php_detokenizer, "C#": cs_detokenizer} 60 | -------------------------------------------------------------------------------- /compiler/vendor/composer/installed.json: -------------------------------------------------------------------------------- 1 | { 2 | "packages": [ 3 | { 4 | "name": "phpstan/phpstan", 5 | "version": "1.4.8", 6 | "version_normalized": "1.4.8.0", 7 | "source": { 8 | "type": "git", 9 | "url": "https://github.com/phpstan/phpstan.git", 10 | "reference": "2a6d6704b17c4db6190cc3104056c0aad740cb15" 11 | }, 12 | "dist": { 13 | "type": "zip", 14 | "url": "https://api.github.com/repos/phpstan/phpstan/zipball/2a6d6704b17c4db6190cc3104056c0aad740cb15", 15 | "reference": "2a6d6704b17c4db6190cc3104056c0aad740cb15", 16 | "shasum": "" 17 | }, 18 | "require": { 19 | "php": "^7.1|^8.0" 20 | }, 21 | "conflict": { 22 | "phpstan/phpstan-shim": "*" 23 | }, 24 | "time": "2022-03-04T13:03:56+00:00", 25 | "bin": [ 26 | "phpstan", 27 | "phpstan.phar" 28 | ], 29 | "type": "library", 30 | "extra": { 31 | "branch-alias": { 32 | "dev-master": "1.4-dev" 33 | } 34 | }, 35 | "installation-source": "dist", 36 | "autoload": { 37 | "files": [ 38 | "bootstrap.php" 39 | ] 40 | }, 41 | "notification-url": "https://packagist.org/downloads/", 42 | "license": [ 43 | "MIT" 44 | ], 45 | "description": "PHPStan - PHP Static Analysis Tool", 46 | "support": { 47 | "issues": "https://github.com/phpstan/phpstan/issues", 48 | "source": "https://github.com/phpstan/phpstan/tree/1.4.8" 49 | }, 50 | "funding": [ 51 | { 52 | "url": "https://github.com/ondrejmirtes", 53 | "type": "github" 54 | }, 55 | { 56 | "url": "https://github.com/phpstan", 57 | "type": "github" 58 | }, 59 | { 60 | "url": "https://www.patreon.com/phpstan", 61 | "type": "patreon" 62 | }, 63 | { 64 | "url": "https://tidelift.com/funding/github/packagist/phpstan/phpstan", 65 | "type": "tidelift" 66 | } 67 | ], 68 | "install-path": "../phpstan/phpstan" 69 | } 70 | ], 71 | "dev": true, 72 | "dev-package-names": [ 73 | "phpstan/phpstan" 74 | ] 75 | } 76 | -------------------------------------------------------------------------------- /compiler/vendor/composer/autoload_real.php: -------------------------------------------------------------------------------- 1 | = 50600 && !defined('HHVM_VERSION') && (!function_exists('zend_loader_file_encoded') || !zend_loader_file_encoded()); 30 | if ($useStaticLoader) { 31 | require __DIR__ . '/autoload_static.php'; 32 | 33 | call_user_func(\Composer\Autoload\ComposerStaticInit93012ad143650ef6581a183b8b883f71::getInitializer($loader)); 34 | } else { 35 | $map = require __DIR__ . 
'/autoload_namespaces.php'; 36 | foreach ($map as $namespace => $path) { 37 | $loader->set($namespace, $path); 38 | } 39 | 40 | $map = require __DIR__ . '/autoload_psr4.php'; 41 | foreach ($map as $namespace => $path) { 42 | $loader->setPsr4($namespace, $path); 43 | } 44 | 45 | $classMap = require __DIR__ . '/autoload_classmap.php'; 46 | if ($classMap) { 47 | $loader->addClassMap($classMap); 48 | } 49 | } 50 | 51 | $loader->register(true); 52 | 53 | if ($useStaticLoader) { 54 | $includeFiles = Composer\Autoload\ComposerStaticInit93012ad143650ef6581a183b8b883f71::$files; 55 | } else { 56 | $includeFiles = require __DIR__ . '/autoload_files.php'; 57 | } 58 | foreach ($includeFiles as $fileIdentifier => $file) { 59 | composerRequire93012ad143650ef6581a183b8b883f71($fileIdentifier, $file); 60 | } 61 | 62 | return $loader; 63 | } 64 | } 65 | 66 | /** 67 | * @param string $fileIdentifier 68 | * @param string $file 69 | * @return void 70 | */ 71 | function composerRequire93012ad143650ef6581a183b8b883f71($fileIdentifier, $file) 72 | { 73 | if (empty($GLOBALS['__composer_autoload_files'][$fileIdentifier])) { 74 | $GLOBALS['__composer_autoload_files'][$fileIdentifier] = true; 75 | 76 | require $file; 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /compiler/compilers.py: -------------------------------------------------------------------------------- 1 | from subprocess import Popen, PIPE 2 | import os.path, subprocess 3 | import os 4 | import shutil 5 | import re 6 | import json 7 | from tqdm import tqdm 8 | import chardet 9 | import jsonlines 10 | import tempfile as tfile 11 | import json 12 | import threading 13 | import time 14 | 15 | 16 | def compile_prog(filepath, lang): 17 | ''' 18 | filepath: path of the file you would like to compile 19 | lang: prog. 
language; 'Py', 'Java', 'CPP', 'C', 'PHP', 'JS', 'CS' 20 | Dependencies: 21 | Java: Java Development kit (JDK) (https://www.oracle.com/java/technologies/downloads/) 22 | JS: Node.js (https://nodejs.org/en/download/) 23 | CS: Install mono library (brew install mono) (http://www.mono-project.com/Mono:OSX) 24 | ''' 25 | if lang=='Py': 26 | cmd = 'python3 -m py_compile '+filepath 27 | #cmd = 'pylint -E ' + filepath 28 | elif lang=='Java': 29 | cmd = 'javac '+filepath 30 | elif lang=='CPP' or lang == 'C': 31 | cmd = 'g++ -std=c++11 '+ filepath 32 | # elif lang=='C': 33 | # cmd = 'gcc '+filepath 34 | elif lang=='PHP': 35 | # cmd = "/home/aneesh/MuST-CoST/vendor/bin/phpstan analyse -l 5 --no-progress " + filepath 36 | cmd = 'php -l ' + filepath 37 | #cmd = 'php -l -d display_errors=on' + filepath 38 | elif lang=='JS': 39 | cmd = 'node '+filepath 40 | elif lang=='CS': 41 | cmd = 'mcs '+filepath 42 | #cmd = 'csc '+filepath 43 | else: 44 | print('invalid argument') 45 | return 46 | proc = subprocess.Popen(cmd, stdout=PIPE, stderr=PIPE,shell=True) 47 | error = [i.decode('utf-8') for i in proc.stderr.readlines()] 48 | err = '\n'.join(error) 49 | output = [i.decode('utf-8') for i in proc.stdout.readlines()] 50 | op = '\n'.join(output) 51 | return err, op 52 | 53 | def remove_comments(string, lang): 54 | if lang == 'Python': 55 | pattern = "('''[\s\S]*''')|(''[\s\S]*''')" 56 | string = re.sub(pattern, '', string) 57 | return re.sub(r'(?m)^ *#.*\n?', '', string) 58 | else: 59 | pattern = '\/\*[\s\S]*\*\/' 60 | pattern2 = '[^:]//.*|/\\*((?!=*/)(?s:.))+\\*/' 61 | string = re.sub(pattern, '', string) 62 | string = re.sub(pattern2, '', string) 63 | return string 64 | 65 | 66 | def php_compiler(code_str): 67 | prefix = '''" 0: 37 | possible_matches_by_order[order-1] += possible_matches 38 | 39 | precisions = [0] * max_order 40 | for i in range(0, max_order): 41 | if smooth: 42 | precisions[i] = ((matches_by_order[i] + 1.) / 43 | (possible_matches_by_order[i] + 1.)) 44 | else: 45 | if possible_matches_by_order[i] > 0: 46 | precisions[i] = (float(matches_by_order[i]) / 47 | possible_matches_by_order[i]) 48 | else: 49 | precisions[i] = 0.0 50 | 51 | if min(precisions) > 0: 52 | p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions) 53 | geo_mean = math.exp(p_log_sum) 54 | else: 55 | geo_mean = 0 56 | 57 | ratio = float(translation_length) / reference_length 58 | 59 | if ratio > 1.0: 60 | bp = 1. 61 | else: 62 | bp = math.exp(1 - 1. 
/ ratio) 63 | 64 | bleu = geo_mean * bp 65 | 66 | return (bleu, precisions, bp, ratio, translation_length, reference_length) 67 | 68 | 69 | def _bleu(ref_file, trans_file, subword_option=None): 70 | max_order = 4 71 | smooth = True 72 | ref_files = [ref_file] 73 | reference_text = [] 74 | for reference_filename in ref_files: 75 | with open(reference_filename) as fh: 76 | reference_text.append(fh.readlines()) 77 | per_segment_references = [] 78 | for references in zip(*reference_text): 79 | reference_list = [] 80 | for reference in references: 81 | reference_list.append(reference.strip().split()) 82 | per_segment_references.append(reference_list) 83 | translations = [] 84 | with open(trans_file) as fh: 85 | for line in fh: 86 | translations.append(line.strip().split()) 87 | bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth) 88 | return round(100 * bleu_score,2) -------------------------------------------------------------------------------- /code_prepro/lang_processors/tokenization_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | import re 8 | #from sacrebleu import tokenize_v14_international 9 | import sacrebleu 10 | 11 | # IMPORTED 12 | NEWLINE_TOKEN = "NEWLINE_TOKEN" 13 | 14 | 15 | # IMPORTED 16 | class ind_iter(object): 17 | def __init__(self, len): 18 | self.i = 0 19 | self.len = len 20 | 21 | def next(self): 22 | self.i += 1 23 | if self.i > (self.len - 1): 24 | raise StopIteration 25 | 26 | def prev(self): 27 | self.i -= 1 28 | if self.i < 0: 29 | raise StopIteration 30 | 31 | 32 | # IMPORTED 33 | def process_string(tok, char2tok, tok2char, is_comment, do_whole_processing=True): 34 | if not (do_whole_processing or is_comment): 35 | return tok.replace("\n", "\\n").replace("\r", "") 36 | 37 | if is_comment: 38 | tok = re.sub(" +", " ", tok) 39 | tok = re.sub(r"(.)\1\1\1\1+", r"\1\1\1\1\1", tok) 40 | if len(re.sub(r"\W", "", tok)) < 2: 41 | return "" 42 | tok = replace_general_string_tok(tok) 43 | tok = replace_tokens(tok, char2tok) 44 | if tok.strip().startswith("STOKEN00"): 45 | if " STRNEWLINE " in tok: 46 | tok = tok.replace(" STRNEWLINE ", " ENDCOM", 1) 47 | else: 48 | tok += " ENDCOM" 49 | if not do_whole_processing: 50 | tok = replace_tokens( 51 | tok, {f" {key} ": value for key, value in tok2char.items()} 52 | ) 53 | tok = ( 54 | tok.replace(" ▁ ", " ") 55 | .replace(" TABSYMBOL ", "\t") 56 | .replace("\\r", "") 57 | .replace(" STRNEWLINE ", "\\n") 58 | ) 59 | return tok 60 | 61 | tok = re.sub(" +", " ", tok) 62 | tok = sacrebleu.tokenize_v14_international(tok) 63 | tok = re.sub(" +", " ", tok) 64 | tok = tok.replace("\r", "") 65 | for special_token, char in tok2char.items(): 66 | tok = tok.replace(special_token, char) 67 | return tok 68 | 69 | 70 | def tokenize_string(s: str): 71 | return process_string( 72 | s, char2tok=dict(), tok2char=dict(), is_comment=False, do_whole_processing=True 73 | ).split(" ") 74 | 75 | 76 | def detokenize_string(s): 77 | assert isinstance(s, str) or isinstance(s, list) 78 | if isinstance(s, list): 79 | s = " ".join(s) 80 | return s.replace(" ", "").replace("▁", " ") 81 | 82 | 83 | # IMPORTED 84 | def replace_tokens(tok, dictionary): 85 | for char, special_token in dictionary.items(): 86 | tok = tok.replace(char, special_token) 87 | return tok 88 
| 89 | 90 | # IMPORTED 91 | def replace_general_string_tok(tok): 92 | return ( 93 | tok.replace(" ", " ▁ ") 94 | .replace("\n", " STRNEWLINE ") 95 | .replace("\t", " TABSYMBOL ") 96 | ) 97 | 98 | 99 | # IMPORTED 100 | def indent_lines(lines): 101 | prefix = "" 102 | for i, line in enumerate(lines): 103 | line = line.strip() 104 | if re.match("CB_COLON|CB_COMA|CB_", line): 105 | prefix = prefix[2:] 106 | line = prefix + line 107 | elif line.endswith("OB_"): 108 | line = prefix + line 109 | prefix += " " 110 | else: 111 | line = prefix + line 112 | lines[i] = line 113 | untok_s = "\n".join(lines) 114 | return untok_s 115 | -------------------------------------------------------------------------------- /compiler/vendor/bin/phpstan: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env php 2 | realpath = realpath($opened_path) ?: $opened_path; 34 | $opened_path = $this->realpath; 35 | $this->handle = fopen($this->realpath, $mode); 36 | $this->position = 0; 37 | 38 | return (bool) $this->handle; 39 | } 40 | 41 | public function stream_read($count) 42 | { 43 | $data = fread($this->handle, $count); 44 | 45 | if ($this->position === 0) { 46 | $data = preg_replace('{^#!.*\r?\n}', '', $data); 47 | } 48 | 49 | $this->position += strlen($data); 50 | 51 | return $data; 52 | } 53 | 54 | public function stream_cast($castAs) 55 | { 56 | return $this->handle; 57 | } 58 | 59 | public function stream_close() 60 | { 61 | fclose($this->handle); 62 | } 63 | 64 | public function stream_lock($operation) 65 | { 66 | return $operation ? flock($this->handle, $operation) : true; 67 | } 68 | 69 | public function stream_seek($offset, $whence) 70 | { 71 | if (0 === fseek($this->handle, $offset, $whence)) { 72 | $this->position = ftell($this->handle); 73 | return true; 74 | } 75 | 76 | return false; 77 | } 78 | 79 | public function stream_tell() 80 | { 81 | return $this->position; 82 | } 83 | 84 | public function stream_eof() 85 | { 86 | return feof($this->handle); 87 | } 88 | 89 | public function stream_stat() 90 | { 91 | return array(); 92 | } 93 | 94 | public function stream_set_option($option, $arg1, $arg2) 95 | { 96 | return true; 97 | } 98 | 99 | public function url_stat($path, $flags) 100 | { 101 | $path = substr($path, 17); 102 | if (file_exists($path)) { 103 | return stat($path); 104 | } 105 | 106 | return false; 107 | } 108 | } 109 | } 110 | 111 | if (function_exists('stream_wrapper_register') && stream_wrapper_register('phpvfscomposer', 'Composer\BinProxyWrapper')) { 112 | include("phpvfscomposer://" . __DIR__ . '/..'.'/phpstan/phpstan/phpstan'); 113 | exit(0); 114 | } 115 | } 116 | 117 | include __DIR__ . 
'/..'.'/phpstan/phpstan/phpstan'; 118 | -------------------------------------------------------------------------------- /compiler/vendor/bin/phpstan.phar: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env php 2 | realpath = realpath($opened_path) ?: $opened_path; 34 | $opened_path = $this->realpath; 35 | $this->handle = fopen($this->realpath, $mode); 36 | $this->position = 0; 37 | 38 | return (bool) $this->handle; 39 | } 40 | 41 | public function stream_read($count) 42 | { 43 | $data = fread($this->handle, $count); 44 | 45 | if ($this->position === 0) { 46 | $data = preg_replace('{^#!.*\r?\n}', '', $data); 47 | } 48 | 49 | $this->position += strlen($data); 50 | 51 | return $data; 52 | } 53 | 54 | public function stream_cast($castAs) 55 | { 56 | return $this->handle; 57 | } 58 | 59 | public function stream_close() 60 | { 61 | fclose($this->handle); 62 | } 63 | 64 | public function stream_lock($operation) 65 | { 66 | return $operation ? flock($this->handle, $operation) : true; 67 | } 68 | 69 | public function stream_seek($offset, $whence) 70 | { 71 | if (0 === fseek($this->handle, $offset, $whence)) { 72 | $this->position = ftell($this->handle); 73 | return true; 74 | } 75 | 76 | return false; 77 | } 78 | 79 | public function stream_tell() 80 | { 81 | return $this->position; 82 | } 83 | 84 | public function stream_eof() 85 | { 86 | return feof($this->handle); 87 | } 88 | 89 | public function stream_stat() 90 | { 91 | return array(); 92 | } 93 | 94 | public function stream_set_option($option, $arg1, $arg2) 95 | { 96 | return true; 97 | } 98 | 99 | public function url_stat($path, $flags) 100 | { 101 | $path = substr($path, 17); 102 | if (file_exists($path)) { 103 | return stat($path); 104 | } 105 | 106 | return false; 107 | } 108 | } 109 | } 110 | 111 | if (function_exists('stream_wrapper_register') && stream_wrapper_register('phpvfscomposer', 'Composer\BinProxyWrapper')) { 112 | include("phpvfscomposer://" . __DIR__ . '/..'.'/phpstan/phpstan/phpstan.phar'); 113 | exit(0); 114 | } 115 | } 116 | 117 | include __DIR__ . '/..'.'/phpstan/phpstan/phpstan.phar'; 118 | -------------------------------------------------------------------------------- /CodeBLEU/parser/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | import re 5 | from io import StringIO 6 | import tokenize 7 | def remove_comments_and_docstrings(source,lang): 8 | if lang in ['python']: 9 | """ 10 | Returns 'source' minus comments and docstrings. 
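        For Python input, the tokenize module drops COMMENT tokens and string tokens that
        start a logical line (likely docstrings); for other languages a regex strips // and
        /* ... */ comments while leaving string literals intact. Blank lines are removed
        from the result in both branches.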
11 | """ 12 | io_obj = StringIO(source) 13 | out = "" 14 | prev_toktype = tokenize.INDENT 15 | last_lineno = -1 16 | last_col = 0 17 | for tok in tokenize.generate_tokens(io_obj.readline): 18 | token_type = tok[0] 19 | token_string = tok[1] 20 | start_line, start_col = tok[2] 21 | end_line, end_col = tok[3] 22 | ltext = tok[4] 23 | if start_line > last_lineno: 24 | last_col = 0 25 | if start_col > last_col: 26 | out += (" " * (start_col - last_col)) 27 | # Remove comments: 28 | if token_type == tokenize.COMMENT: 29 | pass 30 | # This series of conditionals removes docstrings: 31 | elif token_type == tokenize.STRING: 32 | if prev_toktype != tokenize.INDENT: 33 | # This is likely a docstring; double-check we're not inside an operator: 34 | if prev_toktype != tokenize.NEWLINE: 35 | if start_col > 0: 36 | out += token_string 37 | else: 38 | out += token_string 39 | prev_toktype = token_type 40 | last_col = end_col 41 | last_lineno = end_line 42 | temp=[] 43 | for x in out.split('\n'): 44 | if x.strip()!="": 45 | temp.append(x) 46 | return '\n'.join(temp) 47 | elif lang in ['ruby']: 48 | return source 49 | else: 50 | def replacer(match): 51 | s = match.group(0) 52 | if s.startswith('/'): 53 | return " " # note: a space and not an empty string 54 | else: 55 | return s 56 | pattern = re.compile( 57 | r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', 58 | re.DOTALL | re.MULTILINE 59 | ) 60 | temp=[] 61 | for x in re.sub(pattern, replacer, source).split('\n'): 62 | if x.strip()!="": 63 | temp.append(x) 64 | return '\n'.join(temp) 65 | 66 | def tree_to_token_index(root_node): 67 | if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': 68 | return [(root_node.start_point,root_node.end_point)] 69 | else: 70 | code_tokens=[] 71 | for child in root_node.children: 72 | code_tokens+=tree_to_token_index(child) 73 | return code_tokens 74 | 75 | def tree_to_variable_index(root_node,index_to_code): 76 | if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': 77 | index=(root_node.start_point,root_node.end_point) 78 | _,code=index_to_code[index] 79 | if root_node.type!=code: 80 | return [(root_node.start_point,root_node.end_point)] 81 | else: 82 | return [] 83 | else: 84 | code_tokens=[] 85 | for child in root_node.children: 86 | code_tokens+=tree_to_variable_index(child,index_to_code) 87 | return code_tokens 88 | 89 | def index_to_code_token(index,code): 90 | start_point=index[0] 91 | end_point=index[1] 92 | if start_point[0]==end_point[0]: 93 | s=code[start_point[0]][start_point[1]:end_point[1]] 94 | else: 95 | s="" 96 | s+=code[start_point[0]][start_point[1]:] 97 | for i in range(start_point[0]+1,end_point[0]): 98 | s+=code[i] 99 | s+=code[end_point[0]][:end_point[1]] 100 | return s 101 | -------------------------------------------------------------------------------- /CodeBLEU/utils.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: Utility functions 2 | # 3 | # Copyright (C) 2001-2020 NLTK Project 4 | # Author: Steven Bird 5 | # URL: 6 | # For license information, see LICENSE.TXT 7 | 8 | from itertools import chain 9 | 10 | def pad_sequence( 11 | sequence, 12 | n, 13 | pad_left=False, 14 | pad_right=False, 15 | left_pad_symbol=None, 16 | right_pad_symbol=None, 17 | ): 18 | """ 19 | Returns a padded sequence of items before ngram extraction. 
20 | >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='', right_pad_symbol='')) 21 | ['', 1, 2, 3, 4, 5, ''] 22 | >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='')) 23 | ['', 1, 2, 3, 4, 5] 24 | >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='')) 25 | [1, 2, 3, 4, 5, ''] 26 | :param sequence: the source data to be padded 27 | :type sequence: sequence or iter 28 | :param n: the degree of the ngrams 29 | :type n: int 30 | :param pad_left: whether the ngrams should be left-padded 31 | :type pad_left: bool 32 | :param pad_right: whether the ngrams should be right-padded 33 | :type pad_right: bool 34 | :param left_pad_symbol: the symbol to use for left padding (default is None) 35 | :type left_pad_symbol: any 36 | :param right_pad_symbol: the symbol to use for right padding (default is None) 37 | :type right_pad_symbol: any 38 | :rtype: sequence or iter 39 | """ 40 | sequence = iter(sequence) 41 | if pad_left: 42 | sequence = chain((left_pad_symbol,) * (n - 1), sequence) 43 | if pad_right: 44 | sequence = chain(sequence, (right_pad_symbol,) * (n - 1)) 45 | return sequence 46 | 47 | 48 | # add a flag to pad the sequence so we get peripheral ngrams? 49 | 50 | 51 | def ngrams( 52 | sequence, 53 | n, 54 | pad_left=False, 55 | pad_right=False, 56 | left_pad_symbol=None, 57 | right_pad_symbol=None, 58 | ): 59 | """ 60 | Return the ngrams generated from a sequence of items, as an iterator. 61 | For example: 62 | >>> from nltk.util import ngrams 63 | >>> list(ngrams([1,2,3,4,5], 3)) 64 | [(1, 2, 3), (2, 3, 4), (3, 4, 5)] 65 | Wrap with list for a list version of this function. Set pad_left 66 | or pad_right to true in order to get additional ngrams: 67 | >>> list(ngrams([1,2,3,4,5], 2, pad_right=True)) 68 | [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)] 69 | >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='')) 70 | [(1, 2), (2, 3), (3, 4), (4, 5), (5, '')] 71 | >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='')) 72 | [('', 1), (1, 2), (2, 3), (3, 4), (4, 5)] 73 | >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='', right_pad_symbol='')) 74 | [('', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '')] 75 | :param sequence: the source data to be converted into ngrams 76 | :type sequence: sequence or iter 77 | :param n: the degree of the ngrams 78 | :type n: int 79 | :param pad_left: whether the ngrams should be left-padded 80 | :type pad_left: bool 81 | :param pad_right: whether the ngrams should be right-padded 82 | :type pad_right: bool 83 | :param left_pad_symbol: the symbol to use for left padding (default is None) 84 | :type left_pad_symbol: any 85 | :param right_pad_symbol: the symbol to use for right padding (default is None) 86 | :type right_pad_symbol: any 87 | :rtype: sequence or iter 88 | """ 89 | sequence = pad_sequence( 90 | sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol 91 | ) 92 | 93 | history = [] 94 | while n > 1: 95 | # PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator 96 | try: 97 | next_item = next(sequence) 98 | except StopIteration: 99 | # no more data, terminate the generator 100 | return 101 | history.append(next_item) 102 | n -= 1 103 | for item in sequence: 104 | history.append(item) 105 | yield tuple(history) 106 | del history[0] -------------------------------------------------------------------------------- /CodeBLEU/dataflow_match.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | from parser import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp 5 | from parser import (remove_comments_and_docstrings, 6 | tree_to_token_index, 7 | index_to_code_token, 8 | tree_to_variable_index) 9 | from tree_sitter import Language, Parser 10 | import pdb 11 | 12 | dfg_function={ 13 | 'python':DFG_python, 14 | 'java':DFG_java, 15 | 'ruby':DFG_ruby, 16 | 'go':DFG_go, 17 | 'php':DFG_php, 18 | 'javascript':DFG_javascript, 19 | 'c_sharp':DFG_csharp, 20 | 'c':DFG_csharp, 21 | 'cpp':DFG_csharp 22 | } 23 | 24 | def calc_dataflow_match(references, candidate, lang): 25 | return corpus_dataflow_match([references], [candidate], lang) 26 | 27 | def corpus_dataflow_match(references, candidates, lang): 28 | LANGUAGE = Language('parser/my-languages.so', lang) 29 | parser = Parser() 30 | parser.set_language(LANGUAGE) 31 | parser = [parser,dfg_function[lang]] 32 | match_count = 0 33 | total_count = 0 34 | 35 | for i in range(len(candidates)): 36 | references_sample = references[i] 37 | candidate = candidates[i] 38 | for reference in references_sample: 39 | try: 40 | candidate=remove_comments_and_docstrings(candidate,'java') 41 | except: 42 | pass 43 | try: 44 | reference=remove_comments_and_docstrings(reference,'java') 45 | except: 46 | pass 47 | 48 | cand_dfg = get_data_flow(candidate, parser) 49 | ref_dfg = get_data_flow(reference, parser) 50 | 51 | normalized_cand_dfg = normalize_dataflow(cand_dfg) 52 | normalized_ref_dfg = normalize_dataflow(ref_dfg) 53 | 54 | if len(normalized_ref_dfg) > 0: 55 | total_count += len(normalized_ref_dfg) 56 | for dataflow in normalized_ref_dfg: 57 | if dataflow in normalized_cand_dfg: 58 | match_count += 1 59 | normalized_cand_dfg.remove(dataflow) 60 | if total_count == 0: 61 | print("WARNING: There is no reference data-flows extracted from the whole corpus, and the data-flow match score degenerates to 0. 
Please consider ignoring this score.") 62 | return 0 63 | score = match_count / total_count 64 | return score 65 | 66 | def get_data_flow(code, parser): 67 | try: 68 | tree = parser[0].parse(bytes(code,'utf8')) 69 | root_node = tree.root_node 70 | tokens_index=tree_to_token_index(root_node) 71 | code=code.split('\n') 72 | code_tokens=[index_to_code_token(x,code) for x in tokens_index] 73 | index_to_code={} 74 | for idx,(index,code) in enumerate(zip(tokens_index,code_tokens)): 75 | index_to_code[index]=(idx,code) 76 | try: 77 | DFG,_=parser[1](root_node,index_to_code,{}) 78 | except: 79 | DFG=[] 80 | DFG=sorted(DFG,key=lambda x:x[1]) 81 | indexs=set() 82 | for d in DFG: 83 | if len(d[-1])!=0: 84 | indexs.add(d[1]) 85 | for x in d[-1]: 86 | indexs.add(x) 87 | new_DFG=[] 88 | for d in DFG: 89 | if d[1] in indexs: 90 | new_DFG.append(d) 91 | codes=code_tokens 92 | dfg=new_DFG 93 | except: 94 | codes=code.split() 95 | dfg=[] 96 | #merge nodes 97 | dic={} 98 | for d in dfg: 99 | if d[1] not in dic: 100 | dic[d[1]]=d 101 | else: 102 | dic[d[1]]=(d[0],d[1],d[2],list(set(dic[d[1]][3]+d[3])),list(set(dic[d[1]][4]+d[4]))) 103 | DFG=[] 104 | for d in dic: 105 | DFG.append(dic[d]) 106 | dfg=DFG 107 | return dfg 108 | 109 | def normalize_dataflow_item(dataflow_item): 110 | var_name = dataflow_item[0] 111 | var_pos = dataflow_item[1] 112 | relationship = dataflow_item[2] 113 | par_vars_name_list = dataflow_item[3] 114 | par_vars_pos_list = dataflow_item[4] 115 | 116 | var_names = list(set(par_vars_name_list+[var_name])) 117 | norm_names = {} 118 | for i in range(len(var_names)): 119 | norm_names[var_names[i]] = 'var_'+str(i) 120 | 121 | norm_var_name = norm_names[var_name] 122 | relationship = dataflow_item[2] 123 | norm_par_vars_name_list = [norm_names[x] for x in par_vars_name_list] 124 | 125 | return (norm_var_name, relationship, norm_par_vars_name_list) 126 | 127 | def normalize_dataflow(dataflow): 128 | var_dict = {} 129 | i = 0 130 | normalized_dataflow = [] 131 | for item in dataflow: 132 | var_name = item[0] 133 | relationship = item[2] 134 | par_vars_name_list = item[3] 135 | for name in par_vars_name_list: 136 | if name not in var_dict: 137 | var_dict[name] = 'var_'+str(i) 138 | i += 1 139 | if var_name not in var_dict: 140 | var_dict[var_name] = 'var_'+str(i) 141 | i+= 1 142 | normalized_dataflow.append((var_dict[var_name], relationship, [var_dict[x] for x in par_vars_name_list])) 143 | return normalized_dataflow 144 | 145 | -------------------------------------------------------------------------------- /compiler/terminal_compiler.py: -------------------------------------------------------------------------------- 1 | from .compilers import compile_prog 2 | import tempfile as tfile 3 | import re 4 | import os 5 | 6 | class TerminalCompiler: 7 | 8 | def __init__(self, language): 9 | 10 | self.lang = language 11 | 12 | self.lang2ext = { 13 | 'Python' : '.py', 14 | 'C' : '.c', 15 | 'Java': '.java', 16 | 'PHP': '.php', 17 | 'C++': '.cpp', 18 | 'C#': '.cs' 19 | } 20 | 21 | self.lang2compiler = { 22 | 'Python' : 'Py', 23 | 'C' : 'C', 24 | 'Java': 'Java', 25 | 'PHP': 'PHP', 26 | 'C++': 'CPP', 27 | 'C#': 'CS' 28 | } 29 | 30 | def remove_special_tokens(self, code_string): 31 | lines = code_string.split("NEW_LINE") 32 | lines = [item.strip() for item in lines] 33 | 34 | curr_indent = 0 35 | new_lines = [] 36 | for line in lines: 37 | indent_count = line.count('INDENT') 38 | dedent_count = line.count('DEDENT') 39 | curr_indent += indent_count - dedent_count 40 | wo_indent = re.sub('INDENT\s?', '', line) 
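            # These counters rebuild block structure from the flattened token stream used in
            # the XLCoST-style data: NEW_LINE splits lines, each INDENT bumps curr_indent and
            # each DEDENT lowers it, the markers are then stripped, and every line is re-emitted
            # prefixed with '\t' * curr_indent (e.g. "if x : NEW_LINE INDENT return x NEW_LINE DEDENT"
            # turns into "if x :" followed by a tab-indented "return x").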
41 | wo_dedent = re.sub('DEDENT\s?', '', wo_indent) 42 | new_lines.append('\t'*curr_indent + wo_dedent) 43 | return ("\n").join(new_lines) 44 | 45 | def remove_newline(self, code_string): 46 | return re.sub('NEW_LINE\s?', '\n', code_string) 47 | 48 | def process_php_string(self, code_string): 49 | 50 | 51 | if code_string.startswith('< ? php'): 52 | code_string = code_string.replace('< ? php', "'): 59 | code_string = code_string[:-3] + '?>' 60 | code_string = code_string.replace('$ ', '$') 61 | return code_string 62 | 63 | 64 | def compile_code_string(self, code_string, print_error = False): 65 | 66 | if self.lang == 'Python': 67 | #code_string = self.remove_special_tokens(code_string) 68 | pass 69 | else: 70 | code_string = self.remove_newline(code_string) 71 | 72 | if self.lang == 'PHP': 73 | code_string = self.process_php_string(code_string) 74 | elif self.lang == "Java": 75 | code_string = code_string.replace("public class", "class") 76 | 77 | # fd, path = tfile.mkstemp(suffix=self.lang2ext[self.lang]) #can use anything 78 | # try: 79 | # with os.fdopen(fd, 'w') as tmpo: 80 | # # do stuff with temp file 81 | # tmpo.write(code_string) 82 | # tmpo.flush() 83 | # print(path) 84 | # error, output = compile_prog(path, self.lang2compiler[self.lang]) 85 | # finally: 86 | # os.remove(path) 87 | 88 | with tfile.NamedTemporaryFile(mode="w+",suffix=self.lang2ext[self.lang], delete=True, encoding = 'utf-8') as tf: 89 | tf.write(code_string) 90 | tf.flush() 91 | file_path=tf.name 92 | error, output = compile_prog(file_path, self.lang2compiler[self.lang]) 93 | 94 | # compiler_path = '/home/grads/parshinshojaee/trl_code/trl_code/rl_code_repo/compiler' 95 | # with open(compiler_path + "/test"+self.lang2ext[self.lang], "w+", encoding = 'utf-8') as tf: 96 | # tf.write(code_string) 97 | 98 | # file_path= compiler_path + "/test"+self.lang2ext[self.lang] 99 | # error, output = compile_prog(file_path, self.lang2compiler[self.lang]) 100 | 101 | if print_error: 102 | print("Error: ", error) 103 | 104 | if self.lang == "PHP": 105 | if "Errors parsing" in output: 106 | return error, output, False 107 | 108 | elif "No syntax errors" in output: 109 | return error, output, True 110 | # if "[ERROR]" in output: 111 | # return error, output, False 112 | # elif "[OK] No errors" in output: 113 | # return error, output, True 114 | 115 | if error: 116 | return error, output, False 117 | else: 118 | return error, output, True 119 | 120 | def compile_code_file(self, file_path, print_error = False): 121 | 122 | # if self.lang == 'Python': 123 | # code_string = self.remove_special_tokens(code_string) 124 | # else: 125 | # code_string = self.remove_newline(code_string) 126 | 127 | # if self.lang == 'PHP': 128 | # code_string = self.process_php_string(code_string) 129 | 130 | # with tfile.NamedTemporaryFile(mode="w+",suffix=self.lang2ext[self.lang], delete=True) as tf: 131 | 132 | # tf.write(code_string) 133 | # tf.flush() 134 | # file_path=tf.name 135 | 136 | error, output = compile_prog(file_path, self.lang2compiler[self.lang]) 137 | 138 | if print_error: 139 | print("Error: ", error) 140 | 141 | if error: 142 | return error, output, False 143 | else: 144 | return error, output, True 145 | 146 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==1.3.0 2 | alabaster==0.7.12 3 | anyio==2.2.0 4 | appdirs==1.4.4 5 | argh==0.26.2 6 | argon2-cffi==20.1.0 7 | asn1crypto==1.4.0 8 | 
astroid==2.5.2 9 | astropy==4.2.1 10 | async-generator==1.10 11 | atomicwrites==1.4.0 12 | attrs==20.3.0 13 | Babel==2.9.0 14 | backcall==0.2.0 15 | backports.functools-lru-cache==1.6.4 16 | backports.shutil-get-terminal-size==1.0.0 17 | backports.tempfile==1.0 18 | backports.weakref==1.0.post1 19 | beautifulsoup4==4.9.3 20 | bitarray==2.1.0 21 | bkcharts==0.2 22 | black==19.10b0 23 | bleach==3.3.0 24 | bokeh==2.3.2 25 | boto==2.49.0 26 | Bottleneck==1.3.2 27 | brotlipy==0.7.0 28 | cachetools==5.2.0 29 | certifi==2020.12.5 30 | cffi==1.14.5 31 | chardet==4.0.0 32 | click==7.1.2 33 | cloudpickle==1.6.0 34 | clyent==1.2.1 35 | colorama==0.4.4 36 | contextlib2==0.6.0.post1 37 | cryptography==3.4.7 38 | cycler==0.10.0 39 | Cython==0.29.23 40 | cytoolz==0.11.0 41 | dask==2021.4.0 42 | decorator==5.0.6 43 | defusedxml==0.7.1 44 | diff-match-patch==20200713 45 | distributed==2021.4.1 46 | docformatter==1.5.0 47 | docker-pycreds==0.4.0 48 | docutils==0.17.1 49 | entrypoints==0.3 50 | et-xmlfile==1.0.1 51 | fastcache==1.1.0 52 | filelock==3.0.12 53 | Flask==1.1.2 54 | fsspec==0.9.0 55 | future==0.18.2 56 | gdown==4.5.1 57 | gevent==21.1.2 58 | gitdb==4.0.9 59 | GitPython==3.1.27 60 | glob2==0.7 61 | greenlet==1.0.0 62 | h5py==2.10.0 63 | HeapDict==1.0.1 64 | html5lib==1.1 65 | huggingface-hub==0.9.1 66 | idna==2.10 67 | imageio==2.9.0 68 | imagesize==1.2.0 69 | importlib-metadata==3.10.0 70 | iniconfig==1.1.1 71 | intervaltree==3.1.0 72 | ipykernel==5.3.4 73 | ipython==7.22.0 74 | ipython-genutils==0.2.0 75 | ipywidgets==7.6.3 76 | isort==5.8.0 77 | itsdangerous==1.1.0 78 | jdcal==1.4.1 79 | jedi==0.17.2 80 | jeepney==0.6.0 81 | Jinja2==2.11.3 82 | joblib==1.0.1 83 | json-lines==0.5.0 84 | json5==0.9.5 85 | jsonlines==3.1.0 86 | jsonschema==3.2.0 87 | jupyter==1.0.0 88 | jupyter-client==6.1.12 89 | jupyter-console==6.4.0 90 | jupyter-core==4.7.1 91 | jupyter-packaging==0.7.12 92 | jupyter-server==1.4.1 93 | jupyterlab==3.0.14 94 | jupyterlab-pygments==0.1.2 95 | jupyterlab-server==2.4.0 96 | jupyterlab-widgets==1.0.0 97 | keyring==22.3.0 98 | kiwisolver==1.3.1 99 | lazy-object-proxy==1.6.0 100 | libarchive-c==2.9 101 | llvmlite==0.36.0 102 | locket==0.2.1 103 | lxml==4.6.3 104 | MarkupSafe==1.1.1 105 | matplotlib==3.3.4 106 | mccabe==0.6.1 107 | mistune==0.8.4 108 | mkl-fft==1.3.0 109 | mkl-random==1.2.2 110 | mkl-service==2.4.0 111 | mock==4.0.3 112 | more-itertools==8.7.0 113 | mpmath==1.2.1 114 | msgpack==1.0.2 115 | multipledispatch==0.6.0 116 | mypy-extensions==0.4.3 117 | nbclassic==0.2.6 118 | nbclient==0.5.3 119 | nbconvert==6.0.7 120 | nbformat==5.1.3 121 | nest-asyncio==1.5.1 122 | networkx==2.5 123 | nltk==3.6.1 124 | nose==1.3.7 125 | notebook==6.3.0 126 | numba==0.53.1 127 | numexpr==2.7.3 128 | numpy==1.22.3 129 | numpydoc==1.1.0 130 | nvidia-ml-py==11.495.46 131 | olefile==0.46 132 | openpyxl==3.0.7 133 | packaging==20.9 134 | pandas==1.2.4 135 | pandocfilters==1.4.3 136 | parso==0.7.0 137 | partd==1.2.0 138 | path==15.1.2 139 | pathlib2==2.3.5 140 | pathspec==0.7.0 141 | pathtools==0.1.2 142 | patsy==0.5.1 143 | pep8==1.7.1 144 | pep8radius==0.9.2 145 | pexpect==4.8.0 146 | pickleshare==0.7.5 147 | Pillow==8.2.0 148 | pkginfo==1.7.0 149 | pluggy==0.13.1 150 | ply==3.11 151 | portalocker==2.6.0 152 | prometheus-client==0.10.1 153 | promise==2.3 154 | prompt-toolkit==3.0.17 155 | protobuf==3.20.2 156 | psutil==5.8.0 157 | ptyprocess==0.7.0 158 | py==1.10.0 159 | pycodestyle==2.6.0 160 | pycosat==0.6.3 161 | pycparser==2.20 162 | pycurl==7.43.0.6 163 | pydocstyle==6.0.0 164 | 
pyerfa==1.7.3 165 | Pygments==2.8.1 166 | pylint==2.7.4 167 | pyls-black==0.4.6 168 | pyls-spyder==0.3.2 169 | pyOpenSSL==20.0.1 170 | pyparsing==2.4.7 171 | pyrsistent==0.17.3 172 | PySocks==1.7.1 173 | pytest==6.2.3 174 | pytz==2021.1 175 | PyWavelets==1.1.1 176 | pyxdg==0.27 177 | PyYAML==5.4.1 178 | pyzmq==20.0.0 179 | QDarkStyle==2.8.1 180 | QtAwesome==1.0.2 181 | qtconsole==5.0.3 182 | QtPy==1.9.0 183 | regex==2021.4.4 184 | requests==2.25.1 185 | rope==0.18.0 186 | Rtree==0.9.7 187 | sacrebleu==2.3.1 188 | scikit-image==0.18.1 189 | scikit-learn==0.24.1 190 | scipy==1.6.2 191 | seaborn==0.11.1 192 | SecretStorage==3.3.1 193 | Send2Trash==1.5.0 194 | sentry-sdk==1.9.8 195 | setproctitle==1.3.2 196 | shortuuid==1.0.9 197 | simplegeneric==0.8.1 198 | singledispatch==3.6.0 199 | sip==5.0.1 200 | six==1.15.0 201 | smmap==5.0.0 202 | sniffio==1.2.0 203 | snowballstemmer==2.1.0 204 | sortedcollections==2.1.0 205 | sortedcontainers==2.3.0 206 | soupsieve==2.2.1 207 | Sphinx==4.0.1 208 | sphinxcontrib-applehelp==1.0.2 209 | sphinxcontrib-devhelp==1.0.2 210 | sphinxcontrib-htmlhelp==1.0.3 211 | sphinxcontrib-jsmath===1.0.1 212 | sphinxcontrib-qthelp==1.0.3 213 | sphinxcontrib-serializinghtml==1.1.4 214 | sphinxcontrib-websupport==1.2.4 215 | spyder==4.2.5 216 | spyder-kernels==1.10.2 217 | SQLAlchemy==1.4.15 218 | statsmodels==0.12.2 219 | sympy==1.8 220 | tables==3.6.1 221 | tabulate==0.9.0 222 | tblib==1.7.0 223 | termcolor==1.1.0 224 | terminado==0.9.4 225 | testpath==0.4.4 226 | textdistance==4.2.1 227 | threadpoolctl==2.1.0 228 | three-merge==0.1.1 229 | tifffile==2020.10.1 230 | tokenizers==0.12.1 231 | toml== 0.10.2 232 | tomli==2.0.1 233 | toolz==0.11.1 234 | torch==1.12.1 235 | tornado==6.1 236 | tqdm==4.59.0 237 | traitlets==5.0.5 238 | transformers==4.22.1 239 | tree-sitter==0.20.1 240 | typed-ast==1.4.2 241 | typing-extensions==3.7.4.3 242 | ujson==4.0.2 243 | unicodecsv==0.14.1 244 | untokenize==0.1.1 245 | urllib3==1.26.12 246 | wandb==0.13.3 247 | watchdog==1.0.2 248 | wcwidth==0.2.5 249 | webencodings==0.5.1 250 | Werkzeug==1.0.1 251 | widgetsnbextension==3.5.1 252 | wrapt==1.12.1 253 | wurlitzer==2.1.0 254 | xlrd==2.0.1 255 | XlsxWriter==1.3.8 256 | xlwt==1.3.0 257 | xmltodict==0.12.0 258 | yapf==0.31.0 259 | zict==2.0.0 260 | zipp==3.4.1 261 | zope.event==4.5.0 262 | zope.interface==5.3.0 263 | -------------------------------------------------------------------------------- /code_prepro/lang_processors/php_processor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | from code_prepro.lang_processors.tree_sitter_processor import ( 8 | TreeSitterLangProcessor, 9 | ) 10 | from code_prepro.lang_processors.tokenization_utils import ( 11 | ind_iter, 12 | NEWLINE_TOKEN, 13 | ) 14 | import re 15 | 16 | PHP_TOKEN2CHAR = { 17 | "STOKEN00": "//", 18 | "STOKEN01": "/*", 19 | "STOKEN02": "*/", 20 | "STOKEN03": "/**", 21 | "STOKEN04": "**/", 22 | "STOKEN05": '"""', 23 | "STOKEN06": "\\n", 24 | "STOKEN07": "\\r", 25 | "STOKEN08": ";", 26 | "STOKEN09": "{", 27 | "STOKEN10": "}", 28 | "STOKEN11": r"\'", 29 | "STOKEN12": r"\"", 30 | "STOKEN13": r"\\", 31 | } 32 | PHP_CHAR2TOKEN = {value: " " + key + " " for key, value in PHP_TOKEN2CHAR.items()} 33 | 34 | 35 | class PhpProcessor(TreeSitterLangProcessor): 36 | def __init__(self, root_folder): 37 | super().__init__( 38 | language="php", 39 | ast_nodes_type_string=["comment", "string_literal", "character_literal"], 40 | stokens_to_chars=PHP_TOKEN2CHAR, 41 | chars_to_stokens=PHP_CHAR2TOKEN, 42 | root_folder=root_folder, 43 | ) 44 | 45 | def extract_functions(self, tokenized_code): 46 | """Extract functions from tokenized Java code""" 47 | if isinstance(tokenized_code, str): 48 | tokens = tokenized_code.split() 49 | else: 50 | assert isinstance(tokenized_code, list) 51 | tokens = tokenized_code 52 | i = ind_iter(len(tokens)) 53 | functions_standalone = [] 54 | functions_class = [] 55 | try: 56 | token = tokens[i.i] 57 | except KeyboardInterrupt: 58 | raise 59 | except: 60 | return [], [] 61 | while True: 62 | try: 63 | # detect function 64 | tokens_no_newline = [] 65 | index = i.i 66 | while index < len(tokens) and len(tokens_no_newline) < 3: 67 | index += 1 68 | if tokens[index].startswith(NEWLINE_TOKEN): 69 | continue 70 | tokens_no_newline.append(tokens[index]) 71 | 72 | if token == ")" and ( 73 | tokens_no_newline[0] == "{" 74 | or ( 75 | tokens_no_newline[0] == "throws" and tokens_no_newline[2] == "{" 76 | ) 77 | ): 78 | # go previous until the start of function 79 | while token not in [";", "}", "{", "*/", "ENDCOM"]: 80 | i.prev() 81 | token = tokens[i.i] 82 | 83 | if token == "*/": 84 | while token != "/*": 85 | i.prev() 86 | token = tokens[i.i] 87 | function = [token] 88 | while token != "*/": 89 | i.next() 90 | token = tokens[i.i] 91 | function.append(token) 92 | elif token == "ENDCOM": 93 | while token != "//": 94 | i.prev() 95 | token = tokens[i.i] 96 | function = [token] 97 | while token != "ENDCOM": 98 | i.next() 99 | token = tokens[i.i] 100 | function.append(token) 101 | else: 102 | i.next() 103 | token = tokens[i.i] 104 | function = [token] 105 | 106 | while token != "{": 107 | i.next() 108 | token = tokens[i.i] 109 | function.append(token) 110 | if token == "{": 111 | number_indent = 1 112 | while not (token == "}" and number_indent == 0): 113 | try: 114 | i.next() 115 | token = tokens[i.i] 116 | if token == "{": 117 | number_indent += 1 118 | elif token == "}": 119 | number_indent -= 1 120 | function.append(token) 121 | except StopIteration: 122 | break 123 | if "static" in function[0 : function.index("{")]: 124 | functions_standalone.append( 125 | self.remove_annotation(" ".join(function)) 126 | ) 127 | else: 128 | functions_class.append( 129 | self.remove_annotation(" ".join(function)) 130 | ) 131 | i.next() 132 | token = tokens[i.i] 133 | except KeyboardInterrupt: 134 | raise 135 | except: 136 | break 137 | return functions_standalone, functions_class 138 | 139 | def remove_annotation(self, function): 140 | return re.sub( 141 | "^(@ (Override|Deprecated|SuppressWarnings) (\( .* \) )?)*", "", 
function 142 | ) 143 | 144 | def get_function_name(self, function): 145 | return self.get_first_token_before_first_parenthesis(function) 146 | 147 | def extract_arguments(self, function): 148 | return self.extract_arguments_using_parentheses(function) 149 | -------------------------------------------------------------------------------- /code_prepro/lang_processors/csharp_processor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | from code_prepro.lang_processors.tree_sitter_processor import ( 8 | TreeSitterLangProcessor, 9 | ) 10 | from code_prepro.lang_processors.tokenization_utils import ( 11 | ind_iter, 12 | NEWLINE_TOKEN, 13 | ) 14 | import re 15 | 16 | CS_TOKEN2CHAR = { 17 | "STOKEN00": "//", 18 | "STOKEN01": "/*", 19 | "STOKEN02": "*/", 20 | "STOKEN03": "/**", 21 | "STOKEN04": "**/", 22 | "STOKEN05": '"""', 23 | "STOKEN06": "\\n", 24 | "STOKEN07": "\\r", 25 | "STOKEN08": ";", 26 | "STOKEN09": "{", 27 | "STOKEN10": "}", 28 | "STOKEN11": r"\'", 29 | "STOKEN12": r"\"", 30 | "STOKEN13": r"\\", 31 | } 32 | CS_CHAR2TOKEN = {value: " " + key + " " for key, value in CS_TOKEN2CHAR.items()} 33 | 34 | 35 | class CsharpProcessor(TreeSitterLangProcessor): 36 | def __init__(self, root_folder): 37 | super().__init__( 38 | language="c_sharp", 39 | ast_nodes_type_string=["comment", "string_literal", "character_literal"], 40 | stokens_to_chars=CS_TOKEN2CHAR, 41 | chars_to_stokens=CS_CHAR2TOKEN, 42 | root_folder=root_folder, 43 | ) 44 | 45 | def extract_functions(self, tokenized_code): 46 | """Extract functions from tokenized Java code""" 47 | if isinstance(tokenized_code, str): 48 | tokens = tokenized_code.split() 49 | else: 50 | assert isinstance(tokenized_code, list) 51 | tokens = tokenized_code 52 | i = ind_iter(len(tokens)) 53 | functions_standalone = [] 54 | functions_class = [] 55 | try: 56 | token = tokens[i.i] 57 | except KeyboardInterrupt: 58 | raise 59 | except: 60 | return [], [] 61 | while True: 62 | try: 63 | # detect function 64 | tokens_no_newline = [] 65 | index = i.i 66 | while index < len(tokens) and len(tokens_no_newline) < 3: 67 | index += 1 68 | if tokens[index].startswith(NEWLINE_TOKEN): 69 | continue 70 | tokens_no_newline.append(tokens[index]) 71 | 72 | if token == ")" and ( 73 | tokens_no_newline[0] == "{" 74 | or ( 75 | tokens_no_newline[0] == "throws" and tokens_no_newline[2] == "{" 76 | ) 77 | ): 78 | # go previous until the start of function 79 | while token not in [";", "}", "{", "*/", "ENDCOM"]: 80 | i.prev() 81 | token = tokens[i.i] 82 | 83 | if token == "*/": 84 | while token != "/*": 85 | i.prev() 86 | token = tokens[i.i] 87 | function = [token] 88 | while token != "*/": 89 | i.next() 90 | token = tokens[i.i] 91 | function.append(token) 92 | elif token == "ENDCOM": 93 | while token != "//": 94 | i.prev() 95 | token = tokens[i.i] 96 | function = [token] 97 | while token != "ENDCOM": 98 | i.next() 99 | token = tokens[i.i] 100 | function.append(token) 101 | else: 102 | i.next() 103 | token = tokens[i.i] 104 | function = [token] 105 | 106 | while token != "{": 107 | i.next() 108 | token = tokens[i.i] 109 | function.append(token) 110 | if token == "{": 111 | number_indent = 1 112 | while not (token == "}" and number_indent == 0): 113 | try: 114 | i.next() 115 | token = tokens[i.i] 116 | if token == "{": 117 | 
number_indent += 1 118 | elif token == "}": 119 | number_indent -= 1 120 | function.append(token) 121 | except StopIteration: 122 | break 123 | if "static" in function[0 : function.index("{")]: 124 | functions_standalone.append( 125 | self.remove_annotation(" ".join(function)) 126 | ) 127 | else: 128 | functions_class.append( 129 | self.remove_annotation(" ".join(function)) 130 | ) 131 | i.next() 132 | token = tokens[i.i] 133 | except KeyboardInterrupt: 134 | raise 135 | except: 136 | break 137 | return functions_standalone, functions_class 138 | 139 | def remove_annotation(self, function): 140 | return re.sub( 141 | "^(@ (Override|Deprecated|SuppressWarnings) (\( .* \) )?)*", "", function 142 | ) 143 | 144 | def get_function_name(self, function): 145 | return self.get_first_token_before_first_parenthesis(function) 146 | 147 | def extract_arguments(self, function): 148 | return self.extract_arguments_using_parentheses(function) 149 | -------------------------------------------------------------------------------- /code_prepro/lang_processors/java_processor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | from code_prepro.lang_processors.tree_sitter_processor import ( 8 | TreeSitterLangProcessor, 9 | ) 10 | 11 | from code_prepro.lang_processors.tokenization_utils import ( 12 | ind_iter, 13 | NEWLINE_TOKEN, 14 | ) 15 | import re 16 | 17 | JAVA_TOKEN2CHAR = { 18 | "STOKEN00": "//", 19 | "STOKEN01": "/*", 20 | "STOKEN02": "*/", 21 | "STOKEN03": "/**", 22 | "STOKEN04": "**/", 23 | "STOKEN05": '"""', 24 | "STOKEN06": "\\n", 25 | "STOKEN07": "\\r", 26 | "STOKEN08": ";", 27 | "STOKEN09": "{", 28 | "STOKEN10": "}", 29 | "STOKEN11": r"\'", 30 | "STOKEN12": r"\"", 31 | "STOKEN13": r"\\", 32 | } 33 | JAVA_CHAR2TOKEN = {value: " " + key + " " for key, value in JAVA_TOKEN2CHAR.items()} 34 | 35 | 36 | class JavaProcessor(TreeSitterLangProcessor): 37 | def __init__(self, root_folder): 38 | super().__init__( 39 | language="java", 40 | ast_nodes_type_string=["comment", "string_literal", "character_literal"], 41 | stokens_to_chars=JAVA_TOKEN2CHAR, 42 | chars_to_stokens=JAVA_CHAR2TOKEN, 43 | root_folder=root_folder, 44 | ) 45 | 46 | def extract_functions(self, tokenized_code): 47 | """Extract functions from tokenized Java code""" 48 | if isinstance(tokenized_code, str): 49 | tokens = tokenized_code.split() 50 | else: 51 | assert isinstance(tokenized_code, list) 52 | tokens = tokenized_code 53 | i = ind_iter(len(tokens)) 54 | functions_standalone = [] 55 | functions_class = [] 56 | try: 57 | token = tokens[i.i] 58 | except KeyboardInterrupt: 59 | raise 60 | except: 61 | return [], [] 62 | while True: 63 | try: 64 | # detect function 65 | tokens_no_newline = [] 66 | index = i.i 67 | while index < len(tokens) and len(tokens_no_newline) < 3: 68 | index += 1 69 | if tokens[index].startswith(NEWLINE_TOKEN): 70 | continue 71 | tokens_no_newline.append(tokens[index]) 72 | 73 | if token == ")" and ( 74 | tokens_no_newline[0] == "{" 75 | or ( 76 | tokens_no_newline[0] == "throws" and tokens_no_newline[2] == "{" 77 | ) 78 | ): 79 | # go previous until the start of function 80 | while token not in [";", "}", "{", "*/", "ENDCOM"]: 81 | i.prev() 82 | token = tokens[i.i] 83 | 84 | if token == "*/": 85 | while token != "/*": 86 | i.prev() 87 | token = tokens[i.i] 88 | 
function = [token] 89 | while token != "*/": 90 | i.next() 91 | token = tokens[i.i] 92 | function.append(token) 93 | elif token == "ENDCOM": 94 | while token != "//": 95 | i.prev() 96 | token = tokens[i.i] 97 | function = [token] 98 | while token != "ENDCOM": 99 | i.next() 100 | token = tokens[i.i] 101 | function.append(token) 102 | else: 103 | i.next() 104 | token = tokens[i.i] 105 | function = [token] 106 | 107 | while token != "{": 108 | i.next() 109 | token = tokens[i.i] 110 | function.append(token) 111 | if token == "{": 112 | number_indent = 1 113 | while not (token == "}" and number_indent == 0): 114 | try: 115 | i.next() 116 | token = tokens[i.i] 117 | if token == "{": 118 | number_indent += 1 119 | elif token == "}": 120 | number_indent -= 1 121 | function.append(token) 122 | except StopIteration: 123 | break 124 | if "static" in function[0 : function.index("{")]: 125 | functions_standalone.append( 126 | self.remove_annotation(" ".join(function)) 127 | ) 128 | else: 129 | functions_class.append( 130 | self.remove_annotation(" ".join(function)) 131 | ) 132 | i.next() 133 | token = tokens[i.i] 134 | except KeyboardInterrupt: 135 | raise 136 | except: 137 | break 138 | return functions_standalone, functions_class 139 | 140 | def remove_annotation(self, function): 141 | return re.sub( 142 | "^(@ (Override|Deprecated|SuppressWarnings) (\( .* \) )?)*", "", function 143 | ) 144 | 145 | def get_function_name(self, function): 146 | return self.get_first_token_before_first_parenthesis(function) 147 | 148 | def extract_arguments(self, function): 149 | return self.extract_arguments_using_parentheses(function) 150 | -------------------------------------------------------------------------------- /code_prepro/lang_processors/javascript_processor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | from code_prepro.lang_processors.tree_sitter_processor import ( 8 | TreeSitterLangProcessor, 9 | ) 10 | 11 | from code_prepro.lang_processors.tokenization_utils import ( 12 | ind_iter, 13 | NEWLINE_TOKEN, 14 | ) 15 | import re 16 | 17 | JAVASCRIPT_TOKEN2CHAR = { 18 | "STOKEN00": "//", 19 | "STOKEN01": "/*", 20 | "STOKEN02": "*/", 21 | "STOKEN03": "/**", 22 | "STOKEN04": "**/", 23 | "STOKEN05": '"""', 24 | "STOKEN06": "\\n", 25 | "STOKEN07": "\\r", 26 | "STOKEN08": ";", 27 | "STOKEN09": "{", 28 | "STOKEN10": "}", 29 | "STOKEN11": r"\'", 30 | "STOKEN12": r"\"", 31 | "STOKEN13": r"\\", 32 | } 33 | JAVASCRIPT_CHAR2TOKEN = {value: " " + key + " " for key, value in JAVASCRIPT_TOKEN2CHAR.items()} 34 | 35 | 36 | class JavascriptProcessor(TreeSitterLangProcessor): 37 | def __init__(self, root_folder): 38 | super().__init__( 39 | language="javascript", 40 | ast_nodes_type_string=["comment", "string_literal", "character_literal"], 41 | stokens_to_chars=JAVASCRIPT_TOKEN2CHAR, 42 | chars_to_stokens=JAVASCRIPT_CHAR2TOKEN, 43 | root_folder=root_folder, 44 | ) 45 | 46 | def extract_functions(self, tokenized_code): 47 | """Extract functions from tokenized Java code""" 48 | if isinstance(tokenized_code, str): 49 | tokens = tokenized_code.split() 50 | else: 51 | assert isinstance(tokenized_code, list) 52 | tokens = tokenized_code 53 | i = ind_iter(len(tokens)) 54 | functions_standalone = [] 55 | functions_class = [] 56 | try: 57 | token = tokens[i.i] 58 | except KeyboardInterrupt: 59 | raise 60 | except: 61 | return [], [] 62 | while True: 63 | try: 64 | # detect function 65 | tokens_no_newline = [] 66 | index = i.i 67 | while index < len(tokens) and len(tokens_no_newline) < 3: 68 | index += 1 69 | if tokens[index].startswith(NEWLINE_TOKEN): 70 | continue 71 | tokens_no_newline.append(tokens[index]) 72 | 73 | if token == ")" and ( 74 | tokens_no_newline[0] == "{" 75 | or ( 76 | tokens_no_newline[0] == "throws" and tokens_no_newline[2] == "{" 77 | ) 78 | ): 79 | # go previous until the start of function 80 | while token not in [";", "}", "{", "*/", "ENDCOM"]: 81 | i.prev() 82 | token = tokens[i.i] 83 | 84 | if token == "*/": 85 | while token != "/*": 86 | i.prev() 87 | token = tokens[i.i] 88 | function = [token] 89 | while token != "*/": 90 | i.next() 91 | token = tokens[i.i] 92 | function.append(token) 93 | elif token == "ENDCOM": 94 | while token != "//": 95 | i.prev() 96 | token = tokens[i.i] 97 | function = [token] 98 | while token != "ENDCOM": 99 | i.next() 100 | token = tokens[i.i] 101 | function.append(token) 102 | else: 103 | i.next() 104 | token = tokens[i.i] 105 | function = [token] 106 | 107 | while token != "{": 108 | i.next() 109 | token = tokens[i.i] 110 | function.append(token) 111 | if token == "{": 112 | number_indent = 1 113 | while not (token == "}" and number_indent == 0): 114 | try: 115 | i.next() 116 | token = tokens[i.i] 117 | if token == "{": 118 | number_indent += 1 119 | elif token == "}": 120 | number_indent -= 1 121 | function.append(token) 122 | except StopIteration: 123 | break 124 | if "static" in function[0 : function.index("{")]: 125 | functions_standalone.append( 126 | self.remove_annotation(" ".join(function)) 127 | ) 128 | else: 129 | functions_class.append( 130 | self.remove_annotation(" ".join(function)) 131 | ) 132 | i.next() 133 | token = tokens[i.i] 134 | except KeyboardInterrupt: 135 | raise 136 | except: 137 | break 138 | return functions_standalone, functions_class 139 | 140 | def remove_annotation(self, function): 141 | return re.sub( 142 | "^(@ 
(Override|Deprecated|SuppressWarnings) (\( .* \) )?)*", "", function 143 | ) 144 | 145 | def get_function_name(self, function): 146 | return self.get_first_token_before_first_parenthesis(function) 147 | 148 | def extract_arguments(self, function): 149 | return self.extract_arguments_using_parentheses(function) 150 | -------------------------------------------------------------------------------- /reward.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tree_sitter import Language, Parser 3 | import re 4 | import torch 5 | from code_prepro.lang_processors import * 6 | from compiler.terminal_compiler import TerminalCompiler 7 | import sys 8 | from parser import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp 9 | sys.path.insert(0, '/home/grads/parshinshojaee/trl_code/trl_code/rl_code_repo/CodeBLEU/') 10 | from calc_code_bleu import calc_code_bleu 11 | 12 | 13 | 14 | 15 | 16 | code_tokenizers = {"java": java_tokenizer, "cpp": cpp_tokenizer, "c": c_tokenizer, "python": py_tokenizer, 17 | "javascript": js_tokenizer, "php": php_tokenizer, "c_sharp": cs_tokenizer} 18 | code_detokenizers = {"java": java_detokenizer, "cpp": cpp_detokenizer, "c": c_detokenizer, "python": py_detokenizer, 19 | "javascript": js_detokenizer, "php": php_detokenizer, "c_sharp": cs_detokenizer} 20 | 21 | lang2compiler = { 22 | "python": TerminalCompiler("Python"), 23 | "java": TerminalCompiler("Java"), 24 | "cpp": TerminalCompiler("C++"), 25 | "c_sharp": TerminalCompiler("C#"), 26 | "c": TerminalCompiler("C"), 27 | "php": TerminalCompiler("PHP"), 28 | } 29 | 30 | dfg_function={ 31 | 'python':DFG_python, 32 | 'java':DFG_java, 33 | 'php':DFG_php, 34 | 'javascript':DFG_javascript, 35 | 'c_sharp':DFG_csharp, 36 | 'c':DFG_csharp, 37 | 'cpp':DFG_csharp,} 38 | parsers={} 39 | for lang in dfg_function: 40 | LANGUAGE = Language('parser/my-languages.so', lang) 41 | parser = Parser() 42 | parser.set_language(LANGUAGE) 43 | parsers[lang]= parser 44 | 45 | def remove_special_tokens(code_string): 46 | lines = code_string.split("NEW_LINE") 47 | lines = [item.strip() for item in lines] 48 | 49 | curr_indent = 0 50 | new_lines = [] 51 | for line in lines: 52 | indent_count = line.count('INDENT') 53 | dedent_count = line.count('DEDENT') 54 | curr_indent += indent_count - dedent_count 55 | wo_indent = re.sub('INDENT\s?', '', line) 56 | wo_dedent = re.sub('DEDENT\s?', '', wo_indent) 57 | new_lines.append('\t'*curr_indent + wo_dedent) 58 | return ("\n").join(new_lines) 59 | 60 | def dfs_parse_tree(node, level, count_list, verbose = False): 61 | if verbose: 62 | if node.type == 'ERROR': 63 | print (level, '-'*(level*2), colored(node.type, 'red')) 64 | else: 65 | print (level, '-'*(level*2), node.type) 66 | if node.type == 'ERROR': 67 | count_list[0]+=1 68 | else: 69 | count_list[1]+=1 70 | for child in node.children: 71 | dfs_parse_tree(child, level+1, count_list, verbose) 72 | return 73 | 74 | def tree_sitter_full_compile(code, lang='python', verbose = False): 75 | root=parsers[lang].parse(bytes(code, 'utf-8')).root_node 76 | count_list = [0, 0] 77 | dfs_parse_tree(root, 0, count_list, verbose) 78 | return count_list 79 | 80 | 81 | def get_reward(lang, code_ids=None,code_ref_ids=None,gold_ids=None, tokenizer=None): 82 | code_ids = np.array(code_ids.cpu()) 83 | eos_positions = [] 84 | max_len = code_ids.shape[1] 85 | for id in code_ids: 86 | if tokenizer.eos_token_id in id: 87 | eos_positions.append((id==tokenizer.eos_token_id).argmax()) 88 | else: 89 | 
eos_positions.append(max_len) 90 | 91 | codes = [tokenizer.decode(id[:eos_pos], skip_special_tokens=True, clean_up_tokenization_spaces=False) \ 92 | for id,eos_pos in zip(code_ids, eos_positions)] 93 | codes_ref = [tokenizer.decode(id[:eos_pos], skip_special_tokens=True, clean_up_tokenization_spaces=False) \ 94 | for id,eos_pos in zip(code_ref_ids, eos_positions)] 95 | codes_gold = [tokenizer.decode(id[:eos_pos], skip_special_tokens=True, clean_up_tokenization_spaces=False) \ 96 | for id,eos_pos in zip(gold_ids, eos_positions)] 97 | 98 | codes = [code_detokenizers[lang](code) for code in codes] 99 | 100 | compilation = [lang2compiler[lang].compile_code_string(code) for code in codes] 101 | 102 | codes = [remove_special_tokens(code) for code in codes] 103 | codes_ref = [remove_special_tokens(code) for code in codes_ref] 104 | codes_gold = [remove_special_tokens(code) for code in codes_gold] 105 | error_node_counts = [tree_sitter_full_compile(code,lang) for code in codes] 106 | error_node_counts_ref = [tree_sitter_full_compile(code,lang) for code in codes_ref] 107 | error_node_counts_gold = [tree_sitter_full_compile(code,lang) for code in codes_gold] 108 | num_errors = [i[0] for i in error_node_counts] 109 | num_errors_ref = [i[0] for i in error_node_counts_ref] 110 | num_errors_gold = [i[0] for i in error_node_counts_gold] 111 | num_nodes = [i[1] for i in error_node_counts] 112 | num_nodes_ref = [i[1] for i in error_node_counts_ref] 113 | num_nodes_gold = [i[1] for i in error_node_counts_gold] 114 | 115 | keywords_dir = 'CodeBLEU/keywords/' 116 | # ast_match = calc_code_bleu([codes_gold], codes, lang, keywords_dir)[2] 117 | # dfg_match = calc_code_bleu([codes_gold], codes, lang, keywords_dir)[3] 118 | 119 | rewards = np.zeros_like(code_ids, dtype=np.float) 120 | ast_match_batch = 0 121 | dfg_match_batch = 0 122 | compile_batch = 0 123 | for i in range(len(rewards)): 124 | _, _, did_compile = compilation[i] 125 | reward = 1 if did_compile else -1 126 | 127 | ast_match = calc_code_bleu([[codes_gold[i]]], [codes[i]], lang, keywords_dir)[2] 128 | dfg_match = calc_code_bleu([[codes_gold[i]]], [codes[i]], lang, keywords_dir)[3] 129 | 130 | rewards[i, min(eos_positions[i],max_len-1)] = reward + ast_match + dfg_match 131 | compile_batch += reward 132 | ast_match_batch += ast_match 133 | dfg_match_batch += dfg_match 134 | 135 | mean_rate = compile_batch/len(codes) 136 | mean_ast_match = ast_match_batch/len(codes) 137 | mean_dfg_match = dfg_match_batch/len(codes) 138 | return torch.Tensor(rewards),mean_rate,mean_ast_match,mean_dfg_match, num_errors, num_errors_ref, num_nodes, num_nodes_ref 139 | 140 | -------------------------------------------------------------------------------- /compiler/vendor/phpstan/phpstan/README.md: -------------------------------------------------------------------------------- 1 |

PHPStan - PHP Static Analysis Tool 4 | PHPStan (logo) 8 | Badges: Build Status, Latest Stable Version, Total Downloads, License, PHPStan Enabled 13 |

14 | 15 | ------ 16 | 17 | PHPStan focuses on finding errors in your code without actually running it. It catches whole classes of bugs 18 | even before you write tests for the code. It moves PHP closer to compiled languages in the sense that the correctness of each line of the code 19 | can be checked before you run the actual line. 20 | 21 | **[Read more about PHPStan in an introductory article »](https://phpstan.org/blog/find-bugs-in-your-code-without-writing-tests)** 22 | 23 | **[Try out PHPStan on the on-line playground! »](https://phpstan.org/)** 24 | 25 | ## Sponsors 26 | 27 | TheCodingMachine 28 |     29 | Private Packagist 30 |
31 | Musement 32 |     33 | Blackfire.io 34 |
35 | iO 36 |     37 | TicketSwap 38 |
39 | ShipMonk 40 |     41 | Togetter 42 |
43 | RightCapital 44 |     45 | ContentKing 46 |
47 | ZOL 48 |     49 | Stepan Kocourek 50 |
51 | Shopware 52 |     53 | Craft CMS 54 |
55 | Worksome 56 |     57 | campoint AG 58 | 59 | [**You can now sponsor my open-source work on PHPStan through GitHub Sponsors.**](https://github.com/sponsors/ondrejmirtes) 60 | 61 | Does GitHub already have your 💳? Do you use PHPStan to find 🐛 before they reach production? [Send a couple of 💸 a month my way too.](https://github.com/sponsors/ondrejmirtes) Thank you! 62 | 63 | One-time donations [through PayPal](https://paypal.me/phpstan) are also accepted. To request an invoice, [contact me](mailto:ondrej@mirtes.cz) through e-mail. 64 | 65 | ## Documentation 66 | 67 | All the documentation lives on the [phpstan.org website](https://phpstan.org/): 68 | 69 | * [Getting Started & User Guide](https://phpstan.org/user-guide/getting-started) 70 | * [Config Reference](https://phpstan.org/config-reference) 71 | * [PHPDocs Basics](https://phpstan.org/writing-php-code/phpdocs-basics) & [PHPDoc Types](https://phpstan.org/writing-php-code/phpdoc-types) 72 | * [Extension Library](https://phpstan.org/user-guide/extension-library) 73 | * [Developing Extensions](https://phpstan.org/developing-extensions/extension-types) 74 | 75 | ## PHPStan Pro 76 | 77 | PHPStan Pro is a paid add-on on top of open-source PHPStan Static Analysis Tool with these premium features: 78 | 79 | * Web UI for browsing found errors, you can click and open your editor of choice on the offending line. 80 | * Continuous analysis (watch mode): scans changed files in the background, refreshes the UI automatically. 81 | * Interactive fixer: lets you choose the right fix for found errors :blush: 82 | 83 | Try it on PHPStan 0.12.45 or later by running it with the `--pro` option. You can create an account either by following the on-screen instructions, or by visiting [account.phpstan.com](https://account.phpstan.com/). 84 | 85 | After 30-day free trial period it costs 7 EUR for individuals monthly, 70 EUR for teams (up to 25 members). By paying for PHPStan Pro, you're supporting the development of open-source PHPStan. 86 | 87 | You can read more about it on [PHPStan's website](https://phpstan.org/blog/introducing-phpstan-pro). 88 | 89 | ## Code of Conduct 90 | 91 | This project adheres to a [Contributor Code of Conduct](https://github.com/phpstan/phpstan/blob/master/CODE_OF_CONDUCT.md). By participating in this project and its community, you are expected to uphold this code. 92 | 93 | ## Contributing 94 | 95 | Any contributions are welcome. PHPStan's source code open to pull requests lives at [`phpstan/phpstan-src`](https://github.com/phpstan/phpstan-src). 
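In this repository, the vendored PHPStan under `compiler/` is presumably what the PHP `TerminalCompiler` wraps, since PHP has no ahead-of-time compilation step to check. The snippet below is a minimal, illustrative sketch of such a check from Python — not the repo's exact implementation; the binary path `compiler/vendor/bin/phpstan` and the `"[OK] No errors"` / `"[ERROR]"` output markers are assumptions mirroring what `compile_rl_experiments.py` looks for.

```python
# Illustrative sketch: run PHPStan on a PHP snippet and report whether it passes.
# Assumptions: the bundled binary at compiler/vendor/bin/phpstan, and the
# "[OK] No errors"/"[ERROR]" strings that compile_rl_experiments.py also checks.
import os
import subprocess
import tempfile

def php_static_check(code_string, phpstan_bin="compiler/vendor/bin/phpstan"):
    """Return True if PHPStan reports no errors for the given PHP snippet."""
    with tempfile.NamedTemporaryFile("w", suffix=".php", delete=False) as f:
        f.write(code_string)
        tmp_path = f.name
    try:
        proc = subprocess.run(
            [phpstan_bin, "analyse", tmp_path, "--no-progress"],
            capture_output=True, text=True,
        )
        output = proc.stdout + proc.stderr
        return "[OK] No errors" in output and "[ERROR]" not in output
    finally:
        os.unlink(tmp_path)

# Example: php_static_check("<?php echo strlen('abc');")
```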
96 | -------------------------------------------------------------------------------- /compile_rl_experiments.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | from code_prepro.lang_processors import * 3 | from compiler.terminal_compiler import TerminalCompiler 4 | import os 5 | import json 6 | import argparse 7 | import torch 8 | 9 | 10 | lang2compiler = { 11 | "Python": TerminalCompiler('Python'), 12 | "C++": TerminalCompiler('C++'), 13 | "C": TerminalCompiler('C'), 14 | "C#": TerminalCompiler('C#'), 15 | "PHP": TerminalCompiler('PHP'), 16 | "Java": TerminalCompiler('Java') 17 | } 18 | 19 | ext2lang = { 20 | "py" : "Python", 21 | "cpp": "C++", 22 | "java": "Java" 23 | } 24 | 25 | file_tokenizers = {"Java": java_tokenizer, "C++": cpp_tokenizer, "C": c_tokenizer, "Python": py_tokenizer, 26 | "Javascript": js_tokenizer, "PHP": php_tokenizer, "C#": cs_tokenizer} 27 | file_detokenizers = {"Java": java_detokenizer, "C++": cpp_detokenizer, "C": c_detokenizer, "Python": py_detokenizer, 28 | "Javascript": js_detokenizer, "PHP": php_detokenizer, "C#": cs_detokenizer} 29 | 30 | 31 | experiment2lang = { 32 | "python": "Python", 33 | "c": "C", 34 | "cpp": "C++", 35 | "c_sharp": "C#", 36 | "java": "Java", 37 | "php": "PHP" 38 | } 39 | 40 | 41 | def read_hypotheses(hypo_path): 42 | hypo = [] 43 | with open(hypo_path, "r") as f: 44 | for line in f.readlines(): 45 | hypo.append(line.strip()) 46 | return hypo 47 | 48 | def write_summary(summary, path): 49 | with open(path, "w+") as f: 50 | for line in summary: 51 | f.write(json.dumps(line, ensure_ascii=False)) 52 | f.write("\n") 53 | 54 | 55 | 56 | parser = argparse.ArgumentParser() 57 | ## Required parameters 58 | parser.add_argument("--l1", default=None, type=str, 59 | help="source language") 60 | parser.add_argument("--l2", default=None, type=str, 61 | help="target language") 62 | parser.add_argument("--asp", default=2, type=int, 63 | help="action space") 64 | parser.add_argument("--ns", default=5, type=int, 65 | help="num syn samples") 66 | parser.add_argument("--data_path", default=None, type=str, 67 | help="data parent directory") 68 | parser.add_argument("--output_path", default=None, type=str, 69 | help="output directory") 70 | parser.add_argument("--load_model_path", default=None, type=str, 71 | help="path to load models") 72 | parser.add_argument("--baseline_output_path", default=None, type=str, 73 | help="path to load models") 74 | parser.add_argument("--run", default=1, type=int,help="run ID") 75 | args = parser.parse_args() 76 | args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 77 | 78 | 79 | 80 | data_parent_dir = args.data_path 81 | dir_dict = {'javascript':'Javascript', 'java':'Java', 'c_sharp':'C#', 'php':'PHP', 'python':'Python', 'c':'C', 'cpp':'C++'} 82 | end_dict = {'javascript':'js', 'java':'java', 'c_sharp':'cs', 'php':'php', 'python':'py', 'c':'c', 'cpp':'cpp'} 83 | l1, l2 = args.l1, args.l2 84 | data_dir = data_parent_dir + '/' + dir_dict[l1] + '-' + dir_dict[l2] + '/' 85 | template = data_dir+'train-XXX-YYY-tok.xxx,'+data_dir+'train-XXX-YYY-tok.yyy' 86 | template = template.replace('XXX', dir_dict[l1]).replace('YYY', dir_dict[l2]) 87 | if not(os.path.exists(data_dir)): 88 | data_dir = data_parent_dir + '/' + dir_dict[l2] + '-' + dir_dict[l1] + '/' 89 | template = data_dir+'train-XXX-YYY-tok.xxx,'+data_dir+'train-XXX-YYY-tok.yyy' 90 | template = template.replace('XXX', dir_dict[l2]).replace('YYY', dir_dict[l1]) 91 | train_filename = 
template.replace('xxx', end_dict[l1]).replace('yyy', end_dict[l2]) 92 | dev_filename = train_filename.replace('train', 'val') 93 | test_filename = train_filename.replace('train', 'test') 94 | baseline_output_dir = args.baseline_output_path + '/'+l1+'-'+l2+'/' 95 | load_model_path = args.load_model_path 96 | output_dir = args.output_path + '/'+l1+'-'+l2+'/' 97 | 98 | 99 | data_path = output_dir 100 | lang_pair = 'Java-C++' 101 | print(lang_pair,'- AS:', args.asp, '- NS: ', args.ns) 102 | all_experiments = ['test.model_ep0','test.model_ep1','test.model_ep2','test.model_ep3'] 103 | 104 | print(all_experiments) 105 | 106 | compilation_stats = {} 107 | #["python-php"]: 108 | for experiment in all_experiments: 109 | 110 | uncompiled_count = 0 111 | compiled_count = 0 112 | summary = [] 113 | 114 | print(experiment) 115 | 116 | #src, trg = experiment.split('-') 117 | src, trg = lang_pair.split('-') 118 | 119 | lang = trg 120 | src_lang = src 121 | #lang = experiment2lang[trg] 122 | #src_lang = experiment2lang[src] 123 | #hypo_path = os.path.join(data_path, experiment, hypothesis_filename) 124 | hypo_path = os.path.join(data_path, experiment) 125 | 126 | hypotheses = read_hypotheses(hypo_path) 127 | 128 | for i, code_string in enumerate(tqdm(hypotheses)): 129 | 130 | if lang != "PHP": 131 | code_string = file_detokenizers[lang](code_string) 132 | 133 | error, output, did_compile = lang2compiler[lang].compile_code_string(code_string) 134 | 135 | if lang == "PHP": 136 | if "[ERROR]" in output: 137 | did_compile = False 138 | elif "[OK] No errors" in output: 139 | did_compile = True 140 | 141 | if error or not did_compile: 142 | uncompiled_count+=1 143 | elif did_compile: 144 | compiled_count+=1 145 | 146 | line_item = { 147 | #"pid": lang2mapping[src_lang][i], 148 | "pid": i, 149 | "code_string": code_string, 150 | "did_compile": did_compile, 151 | "error": error, 152 | "output": output 153 | } 154 | summary.append(line_item) 155 | 156 | #summary_path = os.path.join(data_path, experiment, experiment+"-summary.jsonl") 157 | summary_path = os.path.join(data_path, experiment+"-summary.jsonl") 158 | write_summary(summary, summary_path) 159 | 160 | compilation_stats[experiment] = {"compilation_ratio":compiled_count/len(hypotheses), 161 | "compiled_count": compiled_count, 162 | "uncompiled_count": uncompiled_count} 163 | 164 | for key, value in compilation_stats.items(): 165 | print(f"{key}: {value}") 166 | with open(os.path.join(data_path, "aggregate_compilation_summary.jsonl"), "w+") as f: 167 | for key, value in compilation_stats.items(): 168 | f.write(json.dumps({key:value})) 169 | f.write("\n") 170 | 171 | 172 | 173 | 174 | -------------------------------------------------------------------------------- /parser/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from io import StringIO 3 | import tokenize 4 | def remove_comments_and_docstrings(source,lang): 5 | if lang in ['python']: 6 | """ 7 | Returns 'source' minus comments and docstrings. 
8 | """ 9 | io_obj = StringIO(source) 10 | out = "" 11 | prev_toktype = tokenize.INDENT 12 | last_lineno = -1 13 | last_col = 0 14 | for tok in tokenize.generate_tokens(io_obj.readline): 15 | token_type = tok[0] 16 | token_string = tok[1] 17 | start_line, start_col = tok[2] 18 | end_line, end_col = tok[3] 19 | ltext = tok[4] 20 | if start_line > last_lineno: 21 | last_col = 0 22 | if start_col > last_col: 23 | out += (" " * (start_col - last_col)) 24 | # Remove comments: 25 | if token_type == tokenize.COMMENT: 26 | pass 27 | # This series of conditionals removes docstrings: 28 | elif token_type == tokenize.STRING: 29 | if prev_toktype != tokenize.INDENT: 30 | # This is likely a docstring; double-check we're not inside an operator: 31 | if prev_toktype != tokenize.NEWLINE: 32 | if start_col > 0: 33 | out += token_string 34 | else: 35 | out += token_string 36 | prev_toktype = token_type 37 | last_col = end_col 38 | last_lineno = end_line 39 | temp=[] 40 | for x in out.split('\n'): 41 | if x.strip()!="": 42 | temp.append(x) 43 | 44 | code = '\n'.join(temp) 45 | pos = 0 46 | while pos >", ">>").replace("< <", "<<") 181 | return untok_s -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PPOCoder 2 | 3 | Official Implementation of [Execution-based Code Generation using Deep Reinforcement Learning](https://arxiv.org/abs/2301.13816) 4 | 5 | ## Overview 6 | The utilization of programming language (PL) models, pretrained on large-scale code corpora, as a means of automating software engineering processes has demonstrated considerable potential in streamlining various code generation tasks such as code completion, code translation, and program synthesis. However, current approaches mainly rely on supervised fine-tuning objectives borrowed from text generation, neglecting specific sequence-level features of code, including but not limited to compilability as well as syntactic and functional correctness. To address this limitation, we propose **PPOCoder**, a new framework for code generation that combines pretrained PL models with Proximal Policy Optimization (PPO) deep reinforcement learning and employs execution feedback as the external source of knowledge into the model optimization. PPOCoder is transferable across different code generation tasks and PLs. 7 | 8 | 14 |

15 | 16 |
17 | Overview of PPOCoder with its actor and critic models: the action is sampled from the policy given the source data $x$ (NL or PL), and a reward is then obtained for each action to guide and control policy updates. The reward function is composed of four elements: (a) compiler feedback; (b) a syntactic matching score based on ASTs; (c) a semantic matching score based on DFGs; and (d) a KL-divergence penalty between the active policy and the reference pretrained model. 18 | The critic model estimates the value from the obtained reward, and PPOCoder is optimized with PPO, which takes both value and policy optimization into account. 19 |
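To make the reward concrete, the per-sample computation in `reward.py` (`get_reward`) boils down to the sketch below: a ±1 compiler signal plus the AST- and DFG-matching scores from CodeBLEU, assigned at the sequence's end-of-sequence position, while the KL penalty is added separately inside the PPO loop. This is a simplified illustration, not a drop-in replacement for `get_reward`.

```python
# Simplified sketch of the per-sample reward used in reward.py (illustrative only).
# compiler: a TerminalCompiler for the target language;
# calc_code_bleu: CodeBLEU's scorer returning (ngram, weighted_ngram, ast_match, dfg_match).
def sample_reward(generated_code, gold_code, lang, compiler, calc_code_bleu,
                  keywords_dir="CodeBLEU/keywords/"):
    _, _, did_compile = compiler.compile_code_string(generated_code)
    compile_feedback = 1.0 if did_compile else -1.0       # (a) compiler feedback
    scores = calc_code_bleu([[gold_code]], [generated_code], lang, keywords_dir)
    ast_match, dfg_match = scores[2], scores[3]           # (b) syntactic, (c) semantic match
    # (d) the KL-divergence penalty is applied by the PPO trainer, not here
    return compile_feedback + ast_match + dfg_match
```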

20 | 21 | 22 | ## Environment Installation 23 | To run the code, install the dependencies in requirements.txt. 24 | ``` 25 | pip install -r requirements.txt 26 | ``` 27 | 28 | 29 | ## Datasets 30 | We finetune/evaluate models on the following major dataset benchmarks for different code generation tasks: 31 | 32 | * **CodeSearchNet (CSN)** is available [here](https://github.com/github/CodeSearchNet##data-details) 33 | * **XLCoST** is available [here](https://github.com/reddy-lab-code-research/XLCoST) 34 | * **APPS** is available [here](https://github.com/hendrycks/apps) 35 | * **MBPP** is available [here](https://github.com/google-research/google-research/tree/master/mbpp) 36 | 37 | We preprocess the data and construct input/output sequences in the same manner as outlined in the original benchmark papers. Unzip and place all benchmarks in the `data` folder (a sketch of the expected XLCoST path layout appears at the end of this README). 38 | 39 | 40 | ## Run 41 | We have created the `run.sh` script to execute PPO-based PL model fine-tuning using the compiler signal. To run the script for different code generation tasks, configure the following parameters: 42 | 43 | | **Parameters** | **Description** | **Example Values** | 44 | |:-----------------:|:--------------------------------------------------------------------------------------------------------:|:------------------------------:| 45 | | `l1` | Source Language | java | 46 | | `l2` | Target Language | cpp | 47 | | `asp` | Action Space Size | 5 | 48 | | `ns` | Number of Synthetic Samples | 10 | 49 | | `data_path` | Path to the original data samples | data/xlcost/java-cpp/ | 50 | | `output_path` | Path to save generations and outputs | saved_results/java-cpp/ | 51 | | `baseline_output_dir` | Path to the base finetuned CodeT5 (before RL) outputs | baselines/saved_models/java-cpp/ | 52 | | `load_model_path` | Path to the base finetuned CodeT5 model (before RL) for each downstream task | baselines/saved_models/java-cpp/pytorch_model.bin | 53 | | `max_source_length` | Maximum Source Length | 400 | 54 | | `max_target_length` | Maximum Target Length | 400 | 55 | | `train_batch_size` | Training Batch Size | 32 | 56 | | `test_batch_size` | Testing Batch Size | 48 | 57 | | `lr` | Learning Rate | 1e-6 | 58 | | `kl_coef` | Initial coefficient of the KL-divergence penalty in the reward | 0.1 | 59 | | `kl_target` | Target KL value that adaptively controls the KL coefficient | 1 | 60 | | `vf_coef` | Coefficient of the value function (vf) error in the PPO loss | 1e-3 | 61 | | `run` | Index of the run | 1 | 62 | 63 | 64 | Running `run.sh` saves the generated programs in a `.txt` file and the model weights at the end of each epoch. 65 | 66 | 88 | 89 | 90 | 91 | ## Citation 92 | If you find the paper or the repo useful, please cite it with 93 |
 94 | @article{shojaee2023ppocoder,
 95 |   title={Execution-based code generation using deep reinforcement learning},
 96 |   author={Shojaee, Parshin and Jain, Aneesh and Tipirneni, Sindhu and Reddy, Chandan K},
 97 |   journal={arXiv preprint arXiv:2301.13816},
 98 |   year={2023}
 99 | }
100 | 
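For reference, the XLCoST path layout that `run.sh` and `compile_rl_experiments.py` expect (see the Datasets section above) can be sketched as follows. The tokenized `train/val/test-*-tok.*` files come from the XLCoST release, and `data/xlcost` below merely stands in for whatever `data_path` points to — treat this as an illustration of the naming convention, not as part of the released code.

```python
# Sketch of how per-pair file paths are derived (mirrors the template logic in
# compile_rl_experiments.py); 'data/xlcost' stands in for the --data_path argument.
dir_dict = {"java": "Java", "cpp": "C++"}   # directory-name spelling per language
end_dict = {"java": "java", "cpp": "cpp"}   # file extension per language
l1, l2 = "java", "cpp"

data_dir = f"data/xlcost/{dir_dict[l1]}-{dir_dict[l2]}/"
train_src = f"{data_dir}train-{dir_dict[l1]}-{dir_dict[l2]}-tok.{end_dict[l1]}"
train_tgt = f"{data_dir}train-{dir_dict[l1]}-{dir_dict[l2]}-tok.{end_dict[l2]}"
val_src, test_src = train_src.replace("train", "val"), train_src.replace("train", "test")
# e.g. data/xlcost/Java-C++/train-Java-C++-tok.java paired with ...-tok.cpp
```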
101 | -------------------------------------------------------------------------------- /code_prepro/lang_processors/tree_sitter_processor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | from code_prepro.lang_processors.lang_processor import LangProcessor 8 | from code_prepro.lang_processors.tokenization_utils import ( 9 | process_string, 10 | replace_tokens, 11 | indent_lines, 12 | ) 13 | import re 14 | from tree_sitter import Language, Parser 15 | from pathlib import Path 16 | from logging import getLogger 17 | 18 | NEW_LINE = "NEW_LINE" 19 | 20 | logger = getLogger() 21 | COMMENT_TYPES = {"comment", "line_comment", "block_comment"} 22 | 23 | 24 | class TreeSitterLangProcessor(LangProcessor): 25 | def __init__( 26 | self, 27 | language, 28 | ast_nodes_type_string, 29 | stokens_to_chars, 30 | chars_to_stokens, 31 | root_folder, 32 | ): 33 | self.language = language 34 | self.ast_nodes_type_string = ast_nodes_type_string 35 | self.stokens_to_chars = stokens_to_chars 36 | self.chars_to_stokens = chars_to_stokens 37 | self.root_folder = Path(root_folder) 38 | self.root_folder.is_dir(), f"{self.root_folder} is not a directory." 39 | self.parser = None 40 | self.create_treesiter_parser() 41 | 42 | def create_treesiter_parser(self): 43 | if self.parser is None: 44 | # lib_path = self.root_folder.joinpath(f"{self.language}.so") 45 | lib_path = self.root_folder.joinpath("my-languages.so") 46 | repo_path = self.root_folder.joinpath(f"tree-sitter-{self.language}") 47 | # print(lib_path, repo_path) 48 | if not lib_path.exists(): 49 | assert repo_path.is_dir() 50 | Language.build_library( 51 | # Store the library in the `build` directory 52 | str(lib_path), 53 | # Include one or more languages 54 | [str(repo_path)], 55 | ) 56 | language = Language(lib_path, self.language) 57 | self.parser = Parser() 58 | self.parser.set_language(language) 59 | 60 | def tokenize_code(self, code, keep_comments=False, process_strings=True): 61 | tokenized_code = [] 62 | tokens, token_types = self.get_tokens_and_types(code) 63 | for token, token_type in zip(tokens, token_types): 64 | if token_type in COMMENT_TYPES and keep_comments is False: 65 | continue 66 | if token_type in self.ast_nodes_type_string: 67 | token = process_string( 68 | token, 69 | self.chars_to_stokens, 70 | self.stokens_to_chars, 71 | token_type in COMMENT_TYPES, 72 | process_strings, 73 | ) 74 | if len(token) > 0: 75 | if token_type not in self.ast_nodes_type_string: 76 | token = token.replace("\n", "NEW_LINE") 77 | token = token.replace("NEW_LINENEW_LINE", "NEW_LINE") 78 | tokenized_code.append(token) 79 | return tokenized_code 80 | 81 | def get_tokens_and_types(self, code): 82 | code = code.replace("\r", "") 83 | code = bytes(code, "utf8") 84 | tree = self.get_ast(code) 85 | tokens = [] 86 | tokens_type = [] 87 | self.dfs(code, tree.root_node, tokens, tokens_type) 88 | return tokens, tokens_type 89 | 90 | def get_ast(self, code): 91 | assert isinstance(code, str) or isinstance(code, bytes) 92 | if isinstance(code, str): 93 | code = bytes(code, "utf8") 94 | tree = self.parser.parse(code) 95 | return tree 96 | 97 | def dfs(self, code, node, tokens, tokens_type): 98 | if len(node.children) == 0 or node.type in self.ast_nodes_type_string: 99 | snippet = code[node.start_byte : node.end_byte] 100 | if 
isinstance(snippet, bytes): 101 | snippet = snippet.decode("utf8") 102 | if len(snippet) > 0: 103 | tokens.append(snippet) 104 | tokens_type.append(node.type) 105 | return 106 | for child in node.children: 107 | self.dfs(code, child, tokens, tokens_type) 108 | 109 | def detokenize_code(self, code): 110 | # TODO make this cleaner with tree sitter AST ? 111 | assert isinstance(code, str) or isinstance(code, list) 112 | if isinstance(code, list): 113 | code = " ".join(code) 114 | code = code.replace("ENDCOM", "\n") 115 | replaced_tokens = [] 116 | # call parser of the tokenizer to find comments and string and detokenize them correctly 117 | try: 118 | tokens, token_types = self.get_tokens_and_types(code) 119 | for token, token_type in zip(tokens, token_types): 120 | if token_type in self.ast_nodes_type_string: 121 | token_ = token.replace("STRNEWLINE", "\n").replace( 122 | "TABSYMBOL", "\t" 123 | ) 124 | token_ = ( 125 | replace_tokens(token_, self.chars_to_stokens) 126 | .replace(" ", "") 127 | .replace("▁", " ") 128 | ) 129 | if token_type in COMMENT_TYPES: 130 | token_ += "\n" 131 | replaced_tokens.append(token_) 132 | else: 133 | replaced_tokens.append(token) 134 | except KeyboardInterrupt: 135 | raise 136 | except: 137 | pass 138 | 139 | code = " ".join(replaced_tokens) 140 | code = code.replace("\n", "NEW_LINE") 141 | code = code.replace('} "', 'CB_ "') 142 | code = code.replace('" {', '" OB_') 143 | code = code.replace("*/ ", "*/ NEW_LINE") 144 | code = code.replace("} ;", "CB_COLON NEW_LINE") 145 | code = code.replace("} ,", "CB_COMA") 146 | code = code.replace("}", "CB_ NEW_LINE") 147 | code = code.replace("{", "OB_ NEW_LINE") 148 | code = code.replace(";", "; NEW_LINE") 149 | code = replace_tokens(code, self.stokens_to_chars) 150 | lines = re.split("NEW_LINE", code) 151 | 152 | untok_s = indent_lines(lines) 153 | untok_s = ( 154 | untok_s.replace("CB_COLON", "};") 155 | .replace("CB_COMA", "},") 156 | .replace("CB_", "}") 157 | .replace("OB_", "{") 158 | ) 159 | untok_s = untok_s.replace("> > >", ">>>").replace("<< <", "<<<") 160 | untok_s = untok_s.replace("> >", ">>").replace("< <", "<<") 161 | 162 | return untok_s 163 | 164 | def extract_arguments_using_parentheses(self, function): 165 | function = function.split(" ") 166 | types = [] 167 | names = [] 168 | par = 0 169 | arguments = [] 170 | function = function[function.index("(") :] 171 | for tok in function: 172 | if tok == "(": 173 | par += 1 174 | elif tok == ")": 175 | par -= 1 176 | arguments.append(tok) 177 | if par == 0: 178 | break 179 | arguments = " ".join(arguments[1:-1]) 180 | if arguments == "": 181 | return ["None"], ["None"] 182 | arguments = arguments.split(",") 183 | for arg in arguments: 184 | bracks = re.findall("\[ \]", arg) 185 | bracks = " ".join(bracks) 186 | arg = arg.replace(bracks, "") 187 | arg = arg.strip() 188 | arg = re.sub(" +", " ", arg) 189 | t = " ".join(arg.split(" ")[:-1] + [bracks]) 190 | n = arg.split(" ")[-1] 191 | types.append(t) 192 | names.append(n) 193 | return types, names 194 | 195 | def get_first_token_before_first_parenthesis(self, code): 196 | assert isinstance(code, str) or isinstance( 197 | code, list 198 | ), f"function is not the right type, should be str or list : {code}" 199 | if isinstance(code, str): 200 | code = code.split() 201 | return code[code.index("(") - 1] 202 | -------------------------------------------------------------------------------- /data/xlcost/Java-C++/test-C++-map.jsonl: -------------------------------------------------------------------------------- 1 | 
1057 2 | 1058 3 | 1134 4 | 118 5 | 1225 6 | 1255 7 | 1283 8 | 1387 9 | 142 10 | 1448 11 | 1472 12 | 1494 13 | 1513 14 | 1534 15 | 1535 16 | 1552 17 | 1557 18 | 1562 19 | 1608 20 | 1610 21 | 1614 22 | 1616 23 | 1634 24 | 1636 25 | 1656 26 | 1659 27 | 1683 28 | 1688 29 | 168 30 | 1693 31 | 1694 32 | 169 33 | 364 34 | 37 35 | 447 36 | 605 37 | 612 38 | 647 39 | 688 40 | 704 41 | 711 42 | 747 43 | 823 44 | 848 45 | 859 46 | 884 47 | 896 48 | 901 49 | 907 50 | 908 51 | 927 52 | 933 53 | 943 54 | 956 55 | 966 56 | 967 57 | 1703 58 | 1706 59 | 1708 60 | 1709 61 | 1843 62 | 1844 63 | 1853 64 | 1861 65 | 1862 66 | 1867 67 | 1870 68 | 1944 69 | 1949 70 | 1954 71 | 1976 72 | 1989 73 | 1994 74 | 2018 75 | 2022 76 | 2025 77 | 2028 78 | 2030 79 | 2032 80 | 2065 81 | 2066 82 | 2073 83 | 2079 84 | 2081 85 | 2090 86 | 2091 87 | 2096 88 | 2105 89 | 2114 90 | 2118 91 | 2123 92 | 2124 93 | 2127 94 | 2132 95 | 2137 96 | 2143 97 | 2147 98 | 2148 99 | 2149 100 | 2151 101 | 2168 102 | 2195 103 | 2196 104 | 2199 105 | 2226 106 | 2247 107 | 2259 108 | 2273 109 | 2286 110 | 2291 111 | 2292 112 | 2294 113 | 2311 114 | 2322 115 | 2349 116 | 2374 117 | 2381 118 | 2469 119 | 2477 120 | 2480 121 | 2519 122 | 2524 123 | 2527 124 | 2558 125 | 2573 126 | 2577 127 | 2578 128 | 2579 129 | 2584 130 | 2586 131 | 2594 132 | 2596 133 | 2600 134 | 2604 135 | 2616 136 | 2641 137 | 2652 138 | 2653 139 | 2673 140 | 2682 141 | 2689 142 | 2698 143 | 2719 144 | 2731 145 | 2762 146 | 2765 147 | 2779 148 | 2781 149 | 2782 150 | 2784 151 | 2797 152 | 2799 153 | 2800 154 | 2810 155 | 2829 156 | 2841 157 | 2883 158 | 2884 159 | 2887 160 | 2894 161 | 2912 162 | 2942 163 | 2946 164 | 2958 165 | 2978 166 | 2992 167 | 2995 168 | 2998 169 | 3010 170 | 3018 171 | 3038 172 | 3132 173 | 3140 174 | 3171 175 | 3208 176 | 3267 177 | 3277 178 | 3295 179 | 3303 180 | 3304 181 | 3343 182 | 3357 183 | 3358 184 | 3384 185 | 3423 186 | 3429 187 | 3448 188 | 3496 189 | 3534 190 | 3597 191 | 3604 192 | 3628 193 | 3665 194 | 3666 195 | 3702 196 | 3730 197 | 3737 198 | 3742 199 | 3757 200 | 3795 201 | 3798 202 | 3802 203 | 3868 204 | 3917 205 | 3918 206 | 3966 207 | 3968 208 | 3980 209 | 4061 210 | 4070 211 | 4107 212 | 4137 213 | 4150 214 | 4171 215 | 4191 216 | 4215 217 | 4216 218 | 4238 219 | 4241 220 | 4249 221 | 4272 222 | 4275 223 | 4358 224 | 4369 225 | 4378 226 | 4392 227 | 4412 228 | 4414 229 | 4420 230 | 4450 231 | 4468 232 | 4488 233 | 4490 234 | 4499 235 | 4524 236 | 4539 237 | 4561 238 | 4564 239 | 4570 240 | 4588 241 | 4589 242 | 4656 243 | 4687 244 | 4720 245 | 4800 246 | 4805 247 | 4819 248 | 4826 249 | 4882 250 | 4892 251 | 4895 252 | 4913 253 | 4921 254 | 4929 255 | 4966 256 | 4969 257 | 4974 258 | 4981 259 | 5011 260 | 5018 261 | 5032 262 | 5033 263 | 5054 264 | 5055 265 | 5057 266 | 5066 267 | 5075 268 | 5081 269 | 5083 270 | 5088 271 | 5091 272 | 5096 273 | 5118 274 | 5140 275 | 5192 276 | 5196 277 | 5216 278 | 5222 279 | 5228 280 | 5238 281 | 5268 282 | 5272 283 | 5273 284 | 5282 285 | 5287 286 | 5298 287 | 5299 288 | 5306 289 | 5309 290 | 5312 291 | 5336 292 | 5339 293 | 5384 294 | 5393 295 | 5481 296 | 5491 297 | 5493 298 | 5500 299 | 5538 300 | 5558 301 | 5562 302 | 5591 303 | 5597 304 | 5605 305 | 5610 306 | 5625 307 | 5627 308 | 5639 309 | 5664 310 | 5688 311 | 5699 312 | 5722 313 | 5929 314 | 5958 315 | 5959 316 | 5989 317 | 5998 318 | 6083 319 | 6084 320 | 6096 321 | 6100 322 | 6130 323 | 6131 324 | 6132 325 | 6138 326 | 6176 327 | 6182 328 | 6220 329 | 6243 330 | 6279 331 | 6301 332 | 6305 333 | 6312 334 | 6330 335 | 6362 336 | 6407 
337 | 6436 338 | 6458 339 | 6571 340 | 6628 341 | 6662 342 | 6736 343 | 6746 344 | 6761 345 | 6774 346 | 6826 347 | 6849 348 | 6850 349 | 6941 350 | 6943 351 | 7040 352 | 7123 353 | 7137 354 | 7148 355 | 7177 356 | 7181 357 | 7184 358 | 7214 359 | 7217 360 | 7234 361 | 7249 362 | 7263 363 | 7298 364 | 7354 365 | 7356 366 | 7378 367 | 7385 368 | 7393 369 | 7397 370 | 7420 371 | 7471 372 | 7514 373 | 7540 374 | 7572 375 | 7579 376 | 7641 377 | 7665 378 | 7678 379 | 7680 380 | 7694 381 | 7699 382 | 7712 383 | 7718 384 | 7728 385 | 7731 386 | 7768 387 | 7774 388 | 7787 389 | 7789 390 | 7809 391 | 7813 392 | 7837 393 | 7844 394 | 7855 395 | 7867 396 | 7873 397 | 7879 398 | 7882 399 | 7883 400 | 7894 401 | 7900 402 | 7909 403 | 7926 404 | 7942 405 | 7945 406 | 7967 407 | 7968 408 | 7977 409 | 7980 410 | 8033 411 | 8054 412 | 8116 413 | 8128 414 | 8151 415 | 8156 416 | 8176 417 | 8221 418 | 8240 419 | 8276 420 | 8289 421 | 8301 422 | 8323 423 | 8334 424 | 8374 425 | 8379 426 | 8398 427 | 8401 428 | 8408 429 | 8414 430 | 8428 431 | 8445 432 | 8473 433 | 8500 434 | 8626 435 | 8634 436 | 8822 437 | 8832 438 | 8863 439 | 8921 440 | 8938 441 | 8966 442 | 8986 443 | 9005 444 | 9061 445 | 9063 446 | 9064 447 | 9067 448 | 9087 449 | 9090 450 | 9141 451 | 9146 452 | 9158 453 | 9182 454 | 10017 455 | 10019 456 | 10045 457 | 10067 458 | 10069 459 | 10123 460 | 10127 461 | 10129 462 | 10133 463 | 10138 464 | 10155 465 | 10165 466 | 10212 467 | 10219 468 | 10230 469 | 10249 470 | 10253 471 | 10267 472 | 10285 473 | 10303 474 | 10320 475 | 10372 476 | 10375 477 | 10399 478 | 10406 479 | 10413 480 | 10440 481 | 10477 482 | 10546 483 | 10563 484 | 10578 485 | 10601 486 | 10669 487 | 10679 488 | 10680 489 | 10685 490 | 10688 491 | 10701 492 | 10707 493 | 10753 494 | 10755 495 | 10771 496 | 10776 497 | 10784 498 | 10786 499 | 10793 500 | 10797 501 | 10800 502 | 10802 503 | 10810 504 | 10823 505 | 10836 506 | 10840 507 | 10858 508 | 10874 509 | 10879 510 | 10882 511 | 10887 512 | 10888 513 | 10898 514 | 10899 515 | 10927 516 | 9194 517 | 9208 518 | 9220 519 | 9224 520 | 9251 521 | 9260 522 | 9266 523 | 9276 524 | 9285 525 | 9291 526 | 9306 527 | 9317 528 | 9335 529 | 9357 530 | 9364 531 | 9368 532 | 9380 533 | 9407 534 | 9510 535 | 9521 536 | 9528 537 | 9543 538 | 9547 539 | 9549 540 | 9573 541 | 9609 542 | 9612 543 | 9626 544 | 9640 545 | 9706 546 | 9718 547 | 9729 548 | 9731 549 | 9734 550 | 9738 551 | 9791 552 | 9812 553 | 9825 554 | 9847 555 | 9850 556 | 9853 557 | 9876 558 | 9880 559 | 9884 560 | 9893 561 | 9896 562 | 9922 563 | 9927 564 | 9929 565 | 9942 566 | 9954 567 | 9963 568 | 9979 569 | 10970 570 | 10978 571 | 11007 572 | 11013 573 | 11019 574 | 11024 575 | 11055 576 | 11109 577 | 11121 578 | 11128 579 | 11139 580 | 11149 581 | 11168 582 | 11173 583 | 11181 584 | 11182 585 | 11185 586 | 11216 587 | 11223 588 | 11227 589 | 11254 590 | 11319 591 | 11328 592 | 11339 593 | 11349 594 | 11352 595 | 11371 596 | 11383 597 | 11387 598 | 11389 599 | 11404 600 | 11417 601 | 11425 602 | 11461 603 | 11475 604 | 11488 605 | 11515 606 | 11530 607 | 11545 608 | 11584 609 | 11615 610 | 11630 611 | 11652 612 | 11664 613 | 11721 614 | 11727 615 | 11781 616 | 11866 617 | 11902 618 | 11915 619 | 11922 620 | 11968 621 | 11969 622 | 12011 623 | 12038 624 | 12077 625 | 12087 626 | 12144 627 | 12244 628 | 12264 629 | 12287 630 | 12288 631 | 12294 632 | 12308 633 | 12359 634 | 12364 635 | 12394 636 | 12408 637 | 12476 638 | 12507 639 | 12517 640 | 12585 641 | 12592 642 | 12683 643 | 12684 644 | 12778 645 | 12819 646 | 12913 647 
| 12932 648 | 10051 649 | 10186 650 | 10473 651 | 11337 652 | 11690 653 | 11823 654 | 11831 655 | 12405 656 | 12417 657 | 12639 658 | 12817 659 | 13033 660 | 13054 661 | 13116 662 | 13135 663 | 13140 664 | 13141 665 | 13179 666 | 13188 667 | 13222 668 | 13225 669 | 13275 670 | 13297 671 | 13415 672 | 13418 673 | 13429 674 | 13431 675 | 13432 676 | 13443 677 | 13471 678 | 13511 679 | 13513 680 | 13539 681 | 13544 682 | 13565 683 | 13575 684 | 13586 685 | 13589 686 | 13598 687 | 13630 688 | 13633 689 | 13643 690 | 13712 691 | 13727 692 | 13735 693 | 13776 694 | 13826 695 | 13858 696 | 13870 697 | 13885 698 | 13895 699 | 13915 700 | 13965 701 | 13976 702 | 13985 703 | 14020 704 | 3051 705 | 3944 706 | 4066 707 | 4067 708 | 4916 709 | 5766 710 | 6490 711 | 6682 712 | 6792 713 | 6837 714 | 6940 715 | 7070 716 | 7071 717 | 8070 718 | 8348 719 | 8366 720 | 8624 721 | 9311 722 | 2746 723 | 2915 724 | 2947 725 | 2965 726 | 3062 727 | 3063 728 | 3113 729 | 3129 730 | 3161 731 | 3245 732 | 3246 733 | 3255 734 | 3463 735 | 3491 736 | 3543 737 | 3603 738 | 3621 739 | 3633 740 | 3715 741 | 3760 742 | 3797 743 | 3850 744 | 3875 745 | 3930 746 | 3932 747 | 3961 748 | 3973 749 | 3986 750 | 4014 751 | 4033 752 | 4037 753 | 4162 754 | 4183 755 | 4185 756 | 4206 757 | 4471 758 | 4532 759 | 4543 760 | 4554 761 | 4567 762 | 4678 763 | 4692 764 | 4699 765 | 4745 766 | 4802 767 | 4960 768 | 4967 769 | 5128 770 | 5229 771 | 5349 772 | 5374 773 | 5432 774 | 5435 775 | 5458 776 | 5485 777 | 5503 778 | 5539 779 | 5623 780 | 5626 781 | 5630 782 | 5634 783 | 5684 784 | 5718 785 | 5774 786 | 5776 787 | 5870 788 | 5947 789 | 6145 790 | 6187 791 | 6207 792 | 6211 793 | -------------------------------------------------------------------------------- /data/xlcost/Java-C++/test-Java-map.jsonl: -------------------------------------------------------------------------------- 1 | 1057 2 | 1058 3 | 1134 4 | 118 5 | 1225 6 | 1255 7 | 1283 8 | 1387 9 | 142 10 | 1448 11 | 1472 12 | 1494 13 | 1513 14 | 1534 15 | 1535 16 | 1552 17 | 1557 18 | 1562 19 | 1608 20 | 1610 21 | 1614 22 | 1616 23 | 1634 24 | 1636 25 | 1656 26 | 1659 27 | 1683 28 | 1688 29 | 168 30 | 1693 31 | 1694 32 | 169 33 | 364 34 | 37 35 | 447 36 | 605 37 | 612 38 | 647 39 | 688 40 | 704 41 | 711 42 | 747 43 | 823 44 | 848 45 | 859 46 | 884 47 | 896 48 | 901 49 | 907 50 | 908 51 | 927 52 | 933 53 | 943 54 | 956 55 | 966 56 | 967 57 | 1703 58 | 1706 59 | 1708 60 | 1709 61 | 1843 62 | 1844 63 | 1853 64 | 1861 65 | 1862 66 | 1867 67 | 1870 68 | 1944 69 | 1949 70 | 1954 71 | 1976 72 | 1989 73 | 1994 74 | 2018 75 | 2022 76 | 2025 77 | 2028 78 | 2030 79 | 2032 80 | 2065 81 | 2066 82 | 2073 83 | 2079 84 | 2081 85 | 2090 86 | 2091 87 | 2096 88 | 2105 89 | 2114 90 | 2118 91 | 2123 92 | 2124 93 | 2127 94 | 2132 95 | 2137 96 | 2143 97 | 2147 98 | 2148 99 | 2149 100 | 2151 101 | 2168 102 | 2195 103 | 2196 104 | 2199 105 | 2226 106 | 2247 107 | 2259 108 | 2273 109 | 2286 110 | 2291 111 | 2292 112 | 2294 113 | 2311 114 | 2322 115 | 2349 116 | 2374 117 | 2381 118 | 2469 119 | 2477 120 | 2480 121 | 2519 122 | 2524 123 | 2527 124 | 2558 125 | 2573 126 | 2577 127 | 2578 128 | 2579 129 | 2584 130 | 2586 131 | 2594 132 | 2596 133 | 2600 134 | 2604 135 | 2616 136 | 2641 137 | 2652 138 | 2653 139 | 2673 140 | 2682 141 | 2689 142 | 2698 143 | 2719 144 | 2731 145 | 2762 146 | 2765 147 | 2779 148 | 2781 149 | 2782 150 | 2784 151 | 2797 152 | 2799 153 | 2800 154 | 2810 155 | 2829 156 | 2841 157 | 2883 158 | 2884 159 | 2887 160 | 2894 161 | 2912 162 | 2942 163 | 2946 164 | 2958 165 | 2978 166 | 
2992 167 | 2995 168 | 2998 169 | 3010 170 | 3018 171 | 3038 172 | 3132 173 | 3140 174 | 3171 175 | 3208 176 | 3267 177 | 3277 178 | 3295 179 | 3303 180 | 3304 181 | 3343 182 | 3357 183 | 3358 184 | 3384 185 | 3423 186 | 3429 187 | 3448 188 | 3496 189 | 3534 190 | 3597 191 | 3604 192 | 3628 193 | 3665 194 | 3666 195 | 3702 196 | 3730 197 | 3737 198 | 3742 199 | 3757 200 | 3795 201 | 3798 202 | 3802 203 | 3868 204 | 3917 205 | 3918 206 | 3966 207 | 3968 208 | 3980 209 | 4061 210 | 4070 211 | 4107 212 | 4137 213 | 4150 214 | 4171 215 | 4191 216 | 4215 217 | 4216 218 | 4238 219 | 4241 220 | 4249 221 | 4272 222 | 4275 223 | 4358 224 | 4369 225 | 4378 226 | 4392 227 | 4412 228 | 4414 229 | 4420 230 | 4450 231 | 4468 232 | 4488 233 | 4490 234 | 4499 235 | 4524 236 | 4539 237 | 4561 238 | 4564 239 | 4570 240 | 4588 241 | 4589 242 | 4656 243 | 4687 244 | 4720 245 | 4800 246 | 4805 247 | 4819 248 | 4826 249 | 4882 250 | 4892 251 | 4895 252 | 4913 253 | 4921 254 | 4929 255 | 4966 256 | 4969 257 | 4974 258 | 4981 259 | 5011 260 | 5018 261 | 5032 262 | 5033 263 | 5054 264 | 5055 265 | 5057 266 | 5066 267 | 5075 268 | 5081 269 | 5083 270 | 5088 271 | 5091 272 | 5096 273 | 5118 274 | 5140 275 | 5192 276 | 5196 277 | 5216 278 | 5222 279 | 5228 280 | 5238 281 | 5268 282 | 5272 283 | 5273 284 | 5282 285 | 5287 286 | 5298 287 | 5299 288 | 5306 289 | 5309 290 | 5312 291 | 5336 292 | 5339 293 | 5384 294 | 5393 295 | 5481 296 | 5491 297 | 5493 298 | 5500 299 | 5538 300 | 5558 301 | 5562 302 | 5591 303 | 5597 304 | 5605 305 | 5610 306 | 5625 307 | 5627 308 | 5639 309 | 5664 310 | 5688 311 | 5699 312 | 5722 313 | 5929 314 | 5958 315 | 5959 316 | 5989 317 | 5998 318 | 6083 319 | 6084 320 | 6096 321 | 6100 322 | 6130 323 | 6131 324 | 6132 325 | 6138 326 | 6176 327 | 6182 328 | 6220 329 | 6243 330 | 6279 331 | 6301 332 | 6305 333 | 6312 334 | 6330 335 | 6362 336 | 6407 337 | 6436 338 | 6458 339 | 6571 340 | 6628 341 | 6662 342 | 6736 343 | 6746 344 | 6761 345 | 6774 346 | 6826 347 | 6849 348 | 6850 349 | 6941 350 | 6943 351 | 7040 352 | 7123 353 | 7137 354 | 7148 355 | 7177 356 | 7181 357 | 7184 358 | 7214 359 | 7217 360 | 7234 361 | 7249 362 | 7263 363 | 7298 364 | 7354 365 | 7356 366 | 7378 367 | 7385 368 | 7393 369 | 7397 370 | 7420 371 | 7471 372 | 7514 373 | 7540 374 | 7572 375 | 7579 376 | 7641 377 | 7665 378 | 7678 379 | 7680 380 | 7694 381 | 7699 382 | 7712 383 | 7718 384 | 7728 385 | 7731 386 | 7768 387 | 7774 388 | 7787 389 | 7789 390 | 7809 391 | 7813 392 | 7837 393 | 7844 394 | 7855 395 | 7867 396 | 7873 397 | 7879 398 | 7882 399 | 7883 400 | 7894 401 | 7900 402 | 7909 403 | 7926 404 | 7942 405 | 7945 406 | 7967 407 | 7968 408 | 7977 409 | 7980 410 | 8033 411 | 8054 412 | 8116 413 | 8128 414 | 8151 415 | 8156 416 | 8176 417 | 8221 418 | 8240 419 | 8276 420 | 8289 421 | 8301 422 | 8323 423 | 8334 424 | 8374 425 | 8379 426 | 8398 427 | 8401 428 | 8408 429 | 8414 430 | 8428 431 | 8445 432 | 8473 433 | 8500 434 | 8626 435 | 8634 436 | 8822 437 | 8832 438 | 8863 439 | 8921 440 | 8938 441 | 8966 442 | 8986 443 | 9005 444 | 9061 445 | 9063 446 | 9064 447 | 9067 448 | 9087 449 | 9090 450 | 9141 451 | 9146 452 | 9158 453 | 9182 454 | 10017 455 | 10019 456 | 10045 457 | 10067 458 | 10069 459 | 10123 460 | 10127 461 | 10129 462 | 10133 463 | 10138 464 | 10155 465 | 10165 466 | 10212 467 | 10219 468 | 10230 469 | 10249 470 | 10253 471 | 10267 472 | 10285 473 | 10303 474 | 10320 475 | 10372 476 | 10375 477 | 10399 478 | 10406 479 | 10413 480 | 10440 481 | 10477 482 | 10546 483 | 10563 484 | 10578 485 | 10601 486 | 
10669 487 | 10679 488 | 10680 489 | 10685 490 | 10688 491 | 10701 492 | 10707 493 | 10753 494 | 10755 495 | 10771 496 | 10776 497 | 10784 498 | 10786 499 | 10793 500 | 10797 501 | 10800 502 | 10802 503 | 10810 504 | 10823 505 | 10836 506 | 10840 507 | 10858 508 | 10874 509 | 10879 510 | 10882 511 | 10887 512 | 10888 513 | 10898 514 | 10899 515 | 10927 516 | 9194 517 | 9208 518 | 9220 519 | 9224 520 | 9251 521 | 9260 522 | 9266 523 | 9276 524 | 9285 525 | 9291 526 | 9306 527 | 9317 528 | 9335 529 | 9357 530 | 9364 531 | 9368 532 | 9380 533 | 9407 534 | 9510 535 | 9521 536 | 9528 537 | 9543 538 | 9547 539 | 9549 540 | 9573 541 | 9609 542 | 9612 543 | 9626 544 | 9640 545 | 9706 546 | 9718 547 | 9729 548 | 9731 549 | 9734 550 | 9738 551 | 9791 552 | 9812 553 | 9825 554 | 9847 555 | 9850 556 | 9853 557 | 9876 558 | 9880 559 | 9884 560 | 9893 561 | 9896 562 | 9922 563 | 9927 564 | 9929 565 | 9942 566 | 9954 567 | 9963 568 | 9979 569 | 10970 570 | 10978 571 | 11007 572 | 11013 573 | 11019 574 | 11024 575 | 11055 576 | 11109 577 | 11121 578 | 11128 579 | 11139 580 | 11149 581 | 11168 582 | 11173 583 | 11181 584 | 11182 585 | 11185 586 | 11216 587 | 11223 588 | 11227 589 | 11254 590 | 11319 591 | 11328 592 | 11339 593 | 11349 594 | 11352 595 | 11371 596 | 11383 597 | 11387 598 | 11389 599 | 11404 600 | 11417 601 | 11425 602 | 11461 603 | 11475 604 | 11488 605 | 11515 606 | 11530 607 | 11545 608 | 11584 609 | 11615 610 | 11630 611 | 11652 612 | 11664 613 | 11721 614 | 11727 615 | 11781 616 | 11866 617 | 11902 618 | 11915 619 | 11922 620 | 11968 621 | 11969 622 | 12011 623 | 12038 624 | 12077 625 | 12087 626 | 12144 627 | 12244 628 | 12264 629 | 12287 630 | 12288 631 | 12294 632 | 12308 633 | 12359 634 | 12364 635 | 12394 636 | 12408 637 | 12476 638 | 12507 639 | 12517 640 | 12585 641 | 12592 642 | 12683 643 | 12684 644 | 12778 645 | 12819 646 | 12913 647 | 12932 648 | 10051 649 | 10186 650 | 10473 651 | 11337 652 | 11690 653 | 11823 654 | 11831 655 | 12405 656 | 12417 657 | 12639 658 | 12817 659 | 13033 660 | 13054 661 | 13116 662 | 13135 663 | 13140 664 | 13141 665 | 13179 666 | 13188 667 | 13222 668 | 13225 669 | 13275 670 | 13297 671 | 13415 672 | 13418 673 | 13429 674 | 13431 675 | 13432 676 | 13443 677 | 13471 678 | 13511 679 | 13513 680 | 13539 681 | 13544 682 | 13565 683 | 13575 684 | 13586 685 | 13589 686 | 13598 687 | 13630 688 | 13633 689 | 13643 690 | 13712 691 | 13727 692 | 13735 693 | 13776 694 | 13826 695 | 13858 696 | 13870 697 | 13885 698 | 13895 699 | 13915 700 | 13965 701 | 13976 702 | 13985 703 | 14020 704 | 3051 705 | 3944 706 | 4066 707 | 4067 708 | 4916 709 | 5766 710 | 6490 711 | 6682 712 | 6792 713 | 6837 714 | 6940 715 | 7070 716 | 7071 717 | 8070 718 | 8348 719 | 8366 720 | 8624 721 | 9311 722 | 2746 723 | 2915 724 | 2947 725 | 2965 726 | 3062 727 | 3063 728 | 3113 729 | 3129 730 | 3161 731 | 3245 732 | 3246 733 | 3255 734 | 3463 735 | 3491 736 | 3543 737 | 3603 738 | 3621 739 | 3633 740 | 3715 741 | 3760 742 | 3797 743 | 3850 744 | 3875 745 | 3930 746 | 3932 747 | 3961 748 | 3973 749 | 3986 750 | 4014 751 | 4033 752 | 4037 753 | 4162 754 | 4183 755 | 4185 756 | 4206 757 | 4471 758 | 4532 759 | 4543 760 | 4554 761 | 4567 762 | 4678 763 | 4692 764 | 4699 765 | 4745 766 | 4802 767 | 4960 768 | 4967 769 | 5128 770 | 5229 771 | 5349 772 | 5374 773 | 5432 774 | 5435 775 | 5458 776 | 5485 777 | 5503 778 | 5539 779 | 5623 780 | 5626 781 | 5630 782 | 5634 783 | 5684 784 | 5718 785 | 5774 786 | 5776 787 | 5870 788 | 5947 789 | 6145 790 | 6187 791 | 6207 792 | 6211 793 | 
-------------------------------------------------------------------------------- /data/xlcost/Java-C++/val-C++-map.jsonl: -------------------------------------------------------------------------------- 1 | 1010 2 | 1015 3 | 1074 4 | 1222 5 | 1239 6 | 1252 7 | 1268 8 | 1282 9 | 1288 10 | 1326 11 | 1366 12 | 1373 13 | 1379 14 | 1381 15 | 1385 16 | 1397 17 | 1428 18 | 1436 19 | 1437 20 | 1446 21 | 1447 22 | 1456 23 | 1480 24 | 1487 25 | 1489 26 | 148 27 | 1492 28 | 1495 29 | 1511 30 | 1518 31 | 1530 32 | 1531 33 | 1537 34 | 1576 35 | 1584 36 | 1596 37 | 1611 38 | 1637 39 | 1638 40 | 1682 41 | 1687 42 | 1691 43 | 187 44 | 221 45 | 22 46 | 241 47 | 472 48 | 494 49 | 694 50 | 69 51 | 725 52 | 758 53 | 808 54 | 81 55 | 821 56 | 824 57 | 836 58 | 840 59 | 906 60 | 913 61 | 918 62 | 921 63 | 960 64 | 961 65 | 1701 66 | 1702 67 | 1704 68 | 1705 69 | 1707 70 | 1710 71 | 1714 72 | 1715 73 | 1760 74 | 1767 75 | 1786 76 | 1788 77 | 1802 78 | 1803 79 | 1808 80 | 1818 81 | 1824 82 | 1854 83 | 1918 84 | 1920 85 | 1922 86 | 1926 87 | 1937 88 | 1939 89 | 1955 90 | 1978 91 | 1981 92 | 1996 93 | 1998 94 | 2000 95 | 2001 96 | 2011 97 | 2012 98 | 2026 99 | 2031 100 | 2035 101 | 2048 102 | 2052 103 | 2063 104 | 2076 105 | 2095 106 | 2097 107 | 2103 108 | 2108 109 | 2120 110 | 2125 111 | 2129 112 | 2130 113 | 2133 114 | 2142 115 | 2144 116 | 2174 117 | 2176 118 | 2190 119 | 2200 120 | 2258 121 | 2283 122 | 2288 123 | 2304 124 | 2308 125 | 2310 126 | 2315 127 | 2332 128 | 2367 129 | 2369 130 | 2409 131 | 2423 132 | 2472 133 | 2475 134 | 2481 135 | 2482 136 | 2500 137 | 2515 138 | 2626 139 | 2683 140 | 2691 141 | 2737 142 | 2739 143 | 2757 144 | 2795 145 | 2920 146 | 2922 147 | 2959 148 | 2983 149 | 2999 150 | 3019 151 | 3033 152 | 3048 153 | 3070 154 | 3073 155 | 3075 156 | 3078 157 | 3085 158 | 3100 159 | 3141 160 | 3142 161 | 3169 162 | 3183 163 | 3215 164 | 3216 165 | 3220 166 | 3252 167 | 3275 168 | 3289 169 | 3315 170 | 3341 171 | 3351 172 | 3380 173 | 3382 174 | 3391 175 | 3397 176 | 3401 177 | 3432 178 | 3433 179 | 3495 180 | 3500 181 | 3540 182 | 3545 183 | 3547 184 | 3576 185 | 3627 186 | 3630 187 | 3671 188 | 3691 189 | 3696 190 | 3711 191 | 3755 192 | 3758 193 | 3759 194 | 3761 195 | 3776 196 | 3785 197 | 3791 198 | 3849 199 | 3871 200 | 3879 201 | 3882 202 | 3899 203 | 3988 204 | 4020 205 | 4129 206 | 4204 207 | 4258 208 | 4263 209 | 4266 210 | 4276 211 | 4286 212 | 4308 213 | 4331 214 | 4385 215 | 4430 216 | 4449 217 | 4458 218 | 4482 219 | 4493 220 | 4501 221 | 4507 222 | 4510 223 | 4520 224 | 4551 225 | 4555 226 | 4661 227 | 4694 228 | 4777 229 | 4812 230 | 4873 231 | 4880 232 | 4887 233 | 4899 234 | 4903 235 | 4923 236 | 4956 237 | 4978 238 | 4991 239 | 4994 240 | 5002 241 | 5084 242 | 5110 243 | 5115 244 | 5121 245 | 5127 246 | 5133 247 | 5146 248 | 5148 249 | 5150 250 | 5157 251 | 5160 252 | 5170 253 | 5171 254 | 5175 255 | 5181 256 | 5197 257 | 5212 258 | 5217 259 | 5218 260 | 5220 261 | 5231 262 | 5236 263 | 5246 264 | 5258 265 | 5262 266 | 5292 267 | 5315 268 | 5317 269 | 5347 270 | 5363 271 | 5420 272 | 5448 273 | 5455 274 | 5463 275 | 5487 276 | 5496 277 | 5508 278 | 5522 279 | 5524 280 | 5529 281 | 5546 282 | 5552 283 | 5560 284 | 5585 285 | 5594 286 | 5644 287 | 5680 288 | 5738 289 | 5740 290 | 5749 291 | 5761 292 | 5762 293 | 5768 294 | 5779 295 | 5797 296 | 5879 297 | 5882 298 | 5889 299 | 5909 300 | 5957 301 | 5992 302 | 5995 303 | 6007 304 | 6022 305 | 6023 306 | 6026 307 | 6030 308 | 6041 309 | 6057 310 | 6082 311 | 6106 312 | 6126 313 | 6129 314 | 6140 315 | 6150 316 | 6168 317 | 
6190 318 | 6205 319 | 6219 320 | 6231 321 | 6249 322 | 6267 323 | 6296 324 | 6304 325 | 6307 326 | 6356 327 | 6358 328 | 6373 329 | 6417 330 | 6456 331 | 6503 332 | 6579 333 | 6661 334 | 6690 335 | 6713 336 | 6717 337 | 6727 338 | 6759 339 | 6813 340 | 6887 341 | 6888 342 | 6898 343 | 7050 344 | 7068 345 | 7097 346 | 7103 347 | 7117 348 | 7127 349 | 7145 350 | 7151 351 | 7185 352 | 7186 353 | 7218 354 | 7219 355 | 7222 356 | 7246 357 | 7275 358 | 7284 359 | 7301 360 | 7310 361 | 7328 362 | 7346 363 | 7349 364 | 7382 365 | 7401 366 | 7421 367 | 7430 368 | 7466 369 | 7515 370 | 7518 371 | 7521 372 | 7537 373 | 7555 374 | 7582 375 | 7597 376 | 7612 377 | 7659 378 | 7675 379 | 7679 380 | 7707 381 | 7720 382 | 7744 383 | 7748 384 | 7753 385 | 7817 386 | 7831 387 | 7833 388 | 7864 389 | 7868 390 | 7877 391 | 7892 392 | 7899 393 | 7903 394 | 7937 395 | 7961 396 | 8027 397 | 8069 398 | 8079 399 | 8095 400 | 8174 401 | 8181 402 | 8201 403 | 8209 404 | 8257 405 | 8267 406 | 8296 407 | 8352 408 | 8354 409 | 8363 410 | 8412 411 | 8472 412 | 8479 413 | 8488 414 | 8513 415 | 8661 416 | 8663 417 | 8666 418 | 8675 419 | 8676 420 | 8682 421 | 8708 422 | 8815 423 | 8817 424 | 8818 425 | 8827 426 | 8922 427 | 8925 428 | 8989 429 | 8991 430 | 8995 431 | 9009 432 | 9012 433 | 9025 434 | 9043 435 | 9044 436 | 9046 437 | 9069 438 | 9078 439 | 9080 440 | 9100 441 | 9117 442 | 9120 443 | 9129 444 | 9137 445 | 9138 446 | 9147 447 | 9154 448 | 9184 449 | 10000 450 | 10013 451 | 10016 452 | 10021 453 | 10026 454 | 10041 455 | 10050 456 | 10054 457 | 10078 458 | 10130 459 | 10184 460 | 10199 461 | 10220 462 | 10241 463 | 10272 464 | 10284 465 | 10295 466 | 10305 467 | 10311 468 | 10376 469 | 10393 470 | 10415 471 | 10441 472 | 10467 473 | 10475 474 | 10478 475 | 10486 476 | 10494 477 | 10498 478 | 10499 479 | 10507 480 | 10513 481 | 10520 482 | 10539 483 | 10614 484 | 10637 485 | 10646 486 | 10648 487 | 10649 488 | 10656 489 | 10663 490 | 10692 491 | 10716 492 | 10720 493 | 10723 494 | 10787 495 | 10791 496 | 10798 497 | 10805 498 | 10808 499 | 10815 500 | 10817 501 | 10824 502 | 10827 503 | 10834 504 | 10855 505 | 10865 506 | 10875 507 | 10889 508 | 10890 509 | 10896 510 | 10902 511 | 10933 512 | 9196 513 | 9226 514 | 9239 515 | 9263 516 | 9281 517 | 9290 518 | 9295 519 | 9312 520 | 9320 521 | 9358 522 | 9362 523 | 9379 524 | 9423 525 | 9466 526 | 9478 527 | 9481 528 | 9530 529 | 9550 530 | 9568 531 | 9572 532 | 9574 533 | 9576 534 | 9618 535 | 9627 536 | 9647 537 | 9658 538 | 9667 539 | 9683 540 | 9690 541 | 9692 542 | 9721 543 | 9740 544 | 9751 545 | 9761 546 | 9766 547 | 9769 548 | 9795 549 | 9796 550 | 9806 551 | 9828 552 | 9836 553 | 9842 554 | 9860 555 | 9871 556 | 9901 557 | 9952 558 | 9955 559 | 9998 560 | 10948 561 | 10952 562 | 10954 563 | 10956 564 | 10960 565 | 10990 566 | 11003 567 | 11028 568 | 11037 569 | 11047 570 | 11049 571 | 11062 572 | 11063 573 | 11075 574 | 11094 575 | 11101 576 | 11104 577 | 11120 578 | 11131 579 | 11136 580 | 11137 581 | 11138 582 | 11143 583 | 11148 584 | 11153 585 | 11156 586 | 11171 587 | 11180 588 | 11209 589 | 11210 590 | 11212 591 | 11221 592 | 11233 593 | 11236 594 | 11240 595 | 11284 596 | 11288 597 | 11331 598 | 11342 599 | 11358 600 | 11382 601 | 11393 602 | 11401 603 | 11420 604 | 11422 605 | 11451 606 | 11454 607 | 11463 608 | 11524 609 | 11538 610 | 11555 611 | 11613 612 | 11636 613 | 11639 614 | 11655 615 | 11688 616 | 11748 617 | 11792 618 | 11860 619 | 11874 620 | 11887 621 | 11929 622 | 11939 623 | 11977 624 | 12055 625 | 12068 626 | 12222 627 | 12225 628 | 
12260 629 | 12321 630 | 12353 631 | 12374 632 | 12455 633 | 12513 634 | 12518 635 | 12523 636 | 12538 637 | 12573 638 | 12593 639 | 12623 640 | 12672 641 | 12679 642 | 12695 643 | 12724 644 | 12766 645 | 12808 646 | 12814 647 | 12885 648 | 12902 649 | 12929 650 | 12930 651 | 12977 652 | 10032 653 | 10619 654 | 10961 655 | 11559 656 | 12375 657 | 12558 658 | 12693 659 | 12941 660 | 12963 661 | 13008 662 | 13112 663 | 13139 664 | 13162 665 | 13186 666 | 13202 667 | 13233 668 | 13279 669 | 13296 670 | 13342 671 | 13351 672 | 13359 673 | 13440 674 | 13464 675 | 13506 676 | 13514 677 | 13516 678 | 13535 679 | 13543 680 | 13581 681 | 13616 682 | 13621 683 | 13623 684 | 13634 685 | 13635 686 | 13651 687 | 13693 688 | 13705 689 | 13733 690 | 13743 691 | 13754 692 | 13769 693 | 13778 694 | 13788 695 | 13810 696 | 13825 697 | 13832 698 | 13835 699 | 13877 700 | 13902 701 | 13913 702 | 13931 703 | 13979 704 | 14040 705 | 3392 706 | 4243 707 | 4585 708 | 4718 709 | 4783 710 | 4855 711 | 5101 712 | 5547 713 | 5705 714 | 7075 715 | 7767 716 | 8539 717 | 8761 718 | 8853 719 | 8981 720 | 9323 721 | 9428 722 | 9457 723 | 9471 724 | 9488 725 | 9872 726 | 2721 727 | 2812 728 | 2814 729 | 2836 730 | 2856 731 | 2896 732 | 3029 733 | 3104 734 | 3158 735 | 3290 736 | 3310 737 | 3325 738 | 3417 739 | 3437 740 | 3449 741 | 3473 742 | 3480 743 | 3488 744 | 3531 745 | 3548 746 | 3581 747 | 3590 748 | 3605 749 | 3646 750 | 3668 751 | 3700 752 | 3770 753 | 3874 754 | 3908 755 | 3963 756 | 3964 757 | 4034 758 | 4049 759 | 4069 760 | 4151 761 | 4160 762 | 4343 763 | 4371 764 | 4502 765 | 4618 766 | 4696 767 | 4740 768 | 4857 769 | 5060 770 | 5167 771 | 5182 772 | 5277 773 | 5280 774 | 5343 775 | 5361 776 | 5379 777 | 5465 778 | 5492 779 | 5494 780 | 5569 781 | 5720 782 | 5734 783 | 5748 784 | 5943 785 | 5953 786 | 5974 787 | 5984 788 | 6036 789 | 6072 790 | 6090 791 | 6101 792 | 6108 793 | -------------------------------------------------------------------------------- /data/xlcost/Java-C++/val-Java-map.jsonl: -------------------------------------------------------------------------------- 1 | 1010 2 | 1015 3 | 1074 4 | 1222 5 | 1239 6 | 1252 7 | 1268 8 | 1282 9 | 1288 10 | 1326 11 | 1366 12 | 1373 13 | 1379 14 | 1381 15 | 1385 16 | 1397 17 | 1428 18 | 1436 19 | 1437 20 | 1446 21 | 1447 22 | 1456 23 | 1480 24 | 1487 25 | 1489 26 | 148 27 | 1492 28 | 1495 29 | 1511 30 | 1518 31 | 1530 32 | 1531 33 | 1537 34 | 1576 35 | 1584 36 | 1596 37 | 1611 38 | 1637 39 | 1638 40 | 1682 41 | 1687 42 | 1691 43 | 187 44 | 221 45 | 22 46 | 241 47 | 472 48 | 494 49 | 694 50 | 69 51 | 725 52 | 758 53 | 808 54 | 81 55 | 821 56 | 824 57 | 836 58 | 840 59 | 906 60 | 913 61 | 918 62 | 921 63 | 960 64 | 961 65 | 1701 66 | 1702 67 | 1704 68 | 1705 69 | 1707 70 | 1710 71 | 1714 72 | 1715 73 | 1760 74 | 1767 75 | 1786 76 | 1788 77 | 1802 78 | 1803 79 | 1808 80 | 1818 81 | 1824 82 | 1854 83 | 1918 84 | 1920 85 | 1922 86 | 1926 87 | 1937 88 | 1939 89 | 1955 90 | 1978 91 | 1981 92 | 1996 93 | 1998 94 | 2000 95 | 2001 96 | 2011 97 | 2012 98 | 2026 99 | 2031 100 | 2035 101 | 2048 102 | 2052 103 | 2063 104 | 2076 105 | 2095 106 | 2097 107 | 2103 108 | 2108 109 | 2120 110 | 2125 111 | 2129 112 | 2130 113 | 2133 114 | 2142 115 | 2144 116 | 2174 117 | 2176 118 | 2190 119 | 2200 120 | 2258 121 | 2283 122 | 2288 123 | 2304 124 | 2308 125 | 2310 126 | 2315 127 | 2332 128 | 2367 129 | 2369 130 | 2409 131 | 2423 132 | 2472 133 | 2475 134 | 2481 135 | 2482 136 | 2500 137 | 2515 138 | 2626 139 | 2683 140 | 2691 141 | 2737 142 | 2739 143 | 2757 144 | 2795 145 | 
2920 146 | 2922 147 | 2959 148 | 2983 149 | 2999 150 | 3019 151 | 3033 152 | 3048 153 | 3070 154 | 3073 155 | 3075 156 | 3078 157 | 3085 158 | 3100 159 | 3141 160 | 3142 161 | 3169 162 | 3183 163 | 3215 164 | 3216 165 | 3220 166 | 3252 167 | 3275 168 | 3289 169 | 3315 170 | 3341 171 | 3351 172 | 3380 173 | 3382 174 | 3391 175 | 3397 176 | 3401 177 | 3432 178 | 3433 179 | 3495 180 | 3500 181 | 3540 182 | 3545 183 | 3547 184 | 3576 185 | 3627 186 | 3630 187 | 3671 188 | 3691 189 | 3696 190 | 3711 191 | 3755 192 | 3758 193 | 3759 194 | 3761 195 | 3776 196 | 3785 197 | 3791 198 | 3849 199 | 3871 200 | 3879 201 | 3882 202 | 3899 203 | 3988 204 | 4020 205 | 4129 206 | 4204 207 | 4258 208 | 4263 209 | 4266 210 | 4276 211 | 4286 212 | 4308 213 | 4331 214 | 4385 215 | 4430 216 | 4449 217 | 4458 218 | 4482 219 | 4493 220 | 4501 221 | 4507 222 | 4510 223 | 4520 224 | 4551 225 | 4555 226 | 4661 227 | 4694 228 | 4777 229 | 4812 230 | 4873 231 | 4880 232 | 4887 233 | 4899 234 | 4903 235 | 4923 236 | 4956 237 | 4978 238 | 4991 239 | 4994 240 | 5002 241 | 5084 242 | 5110 243 | 5115 244 | 5121 245 | 5127 246 | 5133 247 | 5146 248 | 5148 249 | 5150 250 | 5157 251 | 5160 252 | 5170 253 | 5171 254 | 5175 255 | 5181 256 | 5197 257 | 5212 258 | 5217 259 | 5218 260 | 5220 261 | 5231 262 | 5236 263 | 5246 264 | 5258 265 | 5262 266 | 5292 267 | 5315 268 | 5317 269 | 5347 270 | 5363 271 | 5420 272 | 5448 273 | 5455 274 | 5463 275 | 5487 276 | 5496 277 | 5508 278 | 5522 279 | 5524 280 | 5529 281 | 5546 282 | 5552 283 | 5560 284 | 5585 285 | 5594 286 | 5644 287 | 5680 288 | 5738 289 | 5740 290 | 5749 291 | 5761 292 | 5762 293 | 5768 294 | 5779 295 | 5797 296 | 5879 297 | 5882 298 | 5889 299 | 5909 300 | 5957 301 | 5992 302 | 5995 303 | 6007 304 | 6022 305 | 6023 306 | 6026 307 | 6030 308 | 6041 309 | 6057 310 | 6082 311 | 6106 312 | 6126 313 | 6129 314 | 6140 315 | 6150 316 | 6168 317 | 6190 318 | 6205 319 | 6219 320 | 6231 321 | 6249 322 | 6267 323 | 6296 324 | 6304 325 | 6307 326 | 6356 327 | 6358 328 | 6373 329 | 6417 330 | 6456 331 | 6503 332 | 6579 333 | 6661 334 | 6690 335 | 6713 336 | 6717 337 | 6727 338 | 6759 339 | 6813 340 | 6887 341 | 6888 342 | 6898 343 | 7050 344 | 7068 345 | 7097 346 | 7103 347 | 7117 348 | 7127 349 | 7145 350 | 7151 351 | 7185 352 | 7186 353 | 7218 354 | 7219 355 | 7222 356 | 7246 357 | 7275 358 | 7284 359 | 7301 360 | 7310 361 | 7328 362 | 7346 363 | 7349 364 | 7382 365 | 7401 366 | 7421 367 | 7430 368 | 7466 369 | 7515 370 | 7518 371 | 7521 372 | 7537 373 | 7555 374 | 7582 375 | 7597 376 | 7612 377 | 7659 378 | 7675 379 | 7679 380 | 7707 381 | 7720 382 | 7744 383 | 7748 384 | 7753 385 | 7817 386 | 7831 387 | 7833 388 | 7864 389 | 7868 390 | 7877 391 | 7892 392 | 7899 393 | 7903 394 | 7937 395 | 7961 396 | 8027 397 | 8069 398 | 8079 399 | 8095 400 | 8174 401 | 8181 402 | 8201 403 | 8209 404 | 8257 405 | 8267 406 | 8296 407 | 8352 408 | 8354 409 | 8363 410 | 8412 411 | 8472 412 | 8479 413 | 8488 414 | 8513 415 | 8661 416 | 8663 417 | 8666 418 | 8675 419 | 8676 420 | 8682 421 | 8708 422 | 8815 423 | 8817 424 | 8818 425 | 8827 426 | 8922 427 | 8925 428 | 8989 429 | 8991 430 | 8995 431 | 9009 432 | 9012 433 | 9025 434 | 9043 435 | 9044 436 | 9046 437 | 9069 438 | 9078 439 | 9080 440 | 9100 441 | 9117 442 | 9120 443 | 9129 444 | 9137 445 | 9138 446 | 9147 447 | 9154 448 | 9184 449 | 10000 450 | 10013 451 | 10016 452 | 10021 453 | 10026 454 | 10041 455 | 10050 456 | 10054 457 | 10078 458 | 10130 459 | 10184 460 | 10199 461 | 10220 462 | 10241 463 | 10272 464 | 10284 465 | 10295 466 | 10305 
467 | 10311 468 | 10376 469 | 10393 470 | 10415 471 | 10441 472 | 10467 473 | 10475 474 | 10478 475 | 10486 476 | 10494 477 | 10498 478 | 10499 479 | 10507 480 | 10513 481 | 10520 482 | 10539 483 | 10614 484 | 10637 485 | 10646 486 | 10648 487 | 10649 488 | 10656 489 | 10663 490 | 10692 491 | 10716 492 | 10720 493 | 10723 494 | 10787 495 | 10791 496 | 10798 497 | 10805 498 | 10808 499 | 10815 500 | 10817 501 | 10824 502 | 10827 503 | 10834 504 | 10855 505 | 10865 506 | 10875 507 | 10889 508 | 10890 509 | 10896 510 | 10902 511 | 10933 512 | 9196 513 | 9226 514 | 9239 515 | 9263 516 | 9281 517 | 9290 518 | 9295 519 | 9312 520 | 9320 521 | 9358 522 | 9362 523 | 9379 524 | 9423 525 | 9466 526 | 9478 527 | 9481 528 | 9530 529 | 9550 530 | 9568 531 | 9572 532 | 9574 533 | 9576 534 | 9618 535 | 9627 536 | 9647 537 | 9658 538 | 9667 539 | 9683 540 | 9690 541 | 9692 542 | 9721 543 | 9740 544 | 9751 545 | 9761 546 | 9766 547 | 9769 548 | 9795 549 | 9796 550 | 9806 551 | 9828 552 | 9836 553 | 9842 554 | 9860 555 | 9871 556 | 9901 557 | 9952 558 | 9955 559 | 9998 560 | 10948 561 | 10952 562 | 10954 563 | 10956 564 | 10960 565 | 10990 566 | 11003 567 | 11028 568 | 11037 569 | 11047 570 | 11049 571 | 11062 572 | 11063 573 | 11075 574 | 11094 575 | 11101 576 | 11104 577 | 11120 578 | 11131 579 | 11136 580 | 11137 581 | 11138 582 | 11143 583 | 11148 584 | 11153 585 | 11156 586 | 11171 587 | 11180 588 | 11209 589 | 11210 590 | 11212 591 | 11221 592 | 11233 593 | 11236 594 | 11240 595 | 11284 596 | 11288 597 | 11331 598 | 11342 599 | 11358 600 | 11382 601 | 11393 602 | 11401 603 | 11420 604 | 11422 605 | 11451 606 | 11454 607 | 11463 608 | 11524 609 | 11538 610 | 11555 611 | 11613 612 | 11636 613 | 11639 614 | 11655 615 | 11688 616 | 11748 617 | 11792 618 | 11860 619 | 11874 620 | 11887 621 | 11929 622 | 11939 623 | 11977 624 | 12055 625 | 12068 626 | 12222 627 | 12225 628 | 12260 629 | 12321 630 | 12353 631 | 12374 632 | 12455 633 | 12513 634 | 12518 635 | 12523 636 | 12538 637 | 12573 638 | 12593 639 | 12623 640 | 12672 641 | 12679 642 | 12695 643 | 12724 644 | 12766 645 | 12808 646 | 12814 647 | 12885 648 | 12902 649 | 12929 650 | 12930 651 | 12977 652 | 10032 653 | 10619 654 | 10961 655 | 11559 656 | 12375 657 | 12558 658 | 12693 659 | 12941 660 | 12963 661 | 13008 662 | 13112 663 | 13139 664 | 13162 665 | 13186 666 | 13202 667 | 13233 668 | 13279 669 | 13296 670 | 13342 671 | 13351 672 | 13359 673 | 13440 674 | 13464 675 | 13506 676 | 13514 677 | 13516 678 | 13535 679 | 13543 680 | 13581 681 | 13616 682 | 13621 683 | 13623 684 | 13634 685 | 13635 686 | 13651 687 | 13693 688 | 13705 689 | 13733 690 | 13743 691 | 13754 692 | 13769 693 | 13778 694 | 13788 695 | 13810 696 | 13825 697 | 13832 698 | 13835 699 | 13877 700 | 13902 701 | 13913 702 | 13931 703 | 13979 704 | 14040 705 | 3392 706 | 4243 707 | 4585 708 | 4718 709 | 4783 710 | 4855 711 | 5101 712 | 5547 713 | 5705 714 | 7075 715 | 7767 716 | 8539 717 | 8761 718 | 8853 719 | 8981 720 | 9323 721 | 9428 722 | 9457 723 | 9471 724 | 9488 725 | 9872 726 | 2721 727 | 2812 728 | 2814 729 | 2836 730 | 2856 731 | 2896 732 | 3029 733 | 3104 734 | 3158 735 | 3290 736 | 3310 737 | 3325 738 | 3417 739 | 3437 740 | 3449 741 | 3473 742 | 3480 743 | 3488 744 | 3531 745 | 3548 746 | 3581 747 | 3590 748 | 3605 749 | 3646 750 | 3668 751 | 3700 752 | 3770 753 | 3874 754 | 3908 755 | 3963 756 | 3964 757 | 4034 758 | 4049 759 | 4069 760 | 4151 761 | 4160 762 | 4343 763 | 4371 764 | 4502 765 | 4618 766 | 4696 767 | 4740 768 | 4857 769 | 5060 770 | 5167 771 | 5182 772 | 
5277 773 | 5280 774 | 5343 775 | 5361 776 | 5379 777 | 5465 778 | 5492 779 | 5494 780 | 5569 781 | 5720 782 | 5734 783 | 5748 784 | 5943 785 | 5953 786 | 5974 787 | 5984 788 | 6036 789 | 6072 790 | 6090 791 | 6101 792 | 6108 793 | -------------------------------------------------------------------------------- /code_prepro/lang_processors/c_processor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | from code_prepro.lang_processors.tree_sitter_processor import ( 8 | TreeSitterLangProcessor, 9 | ) 10 | from code_prepro.lang_processors.java_processor import ( 11 | JAVA_TOKEN2CHAR, 12 | JAVA_CHAR2TOKEN, 13 | ) 14 | from code_prepro.lang_processors.tokenization_utils import ind_iter 15 | import re 16 | 17 | IDENTIFIERS = {"identifier", "field_identifier"} 18 | 19 | C_TOKEN2CHAR = JAVA_TOKEN2CHAR.copy() 20 | C_CHAR2TOKEN = JAVA_CHAR2TOKEN.copy() 21 | 22 | 23 | class CProcessor(TreeSitterLangProcessor): 24 | def __init__(self, root_folder): 25 | super().__init__( 26 | language="c", 27 | ast_nodes_type_string=["comment", "string_literal", "char_literal"], 28 | stokens_to_chars=C_TOKEN2CHAR, 29 | chars_to_stokens=C_CHAR2TOKEN, 30 | root_folder=root_folder, 31 | ) 32 | 33 | def get_function_name(self, function): 34 | return self.get_first_token_before_first_parenthesis(function) 35 | 36 | def extract_arguments(self, function): 37 | return self.extract_arguments_using_parentheses(function) 38 | 39 | def clean_hashtags_function(self, function): 40 | function = re.sub('[#][ ][i][n][c][l][u][d][e][ ]["].*?["]', "", function) 41 | function = re.sub("[#][ ][i][n][c][l][u][d][e][ ][<].*?[>]", "", function) 42 | function = re.sub("[#][ ][i][f][n][d][e][f][ ][^ ]*", "", function) 43 | function = re.sub("[#][ ][i][f][d][e][f][ ][^ ]*", "", function) 44 | function = re.sub( 45 | "[#][ ][d][e][f][i][n][e][ ][^ ]*[ ][(][ ].*?[ ][)][ ][(][ ].*[ ][)]", 46 | "", 47 | function, 48 | ) 49 | function = re.sub( 50 | "[#][ ][d][e][f][i][n][e][ ][^ ]*[ ][(][ ].*?[ ][)][ ][{][ ].*[ ][}]", 51 | "", 52 | function, 53 | ) 54 | function = re.sub( 55 | '[#][ ][d][e][f][i][n][e][ ][^ ]*[ ]([(][ ])?["].*?["]([ ][)])?', 56 | "", 57 | function, 58 | ) 59 | function = re.sub( 60 | "[#][ ][d][e][f][i][n][e][ ][^ ]*[ ]([(][ ])?\d*\.?\d*([ ][+-/*][ ]?\d*\.?\d*)?([ ][)])?", 61 | "", 62 | function, 63 | ) 64 | function = re.sub("[#][ ][d][e][f][i][n][e][ ][^ ]", "", function) 65 | function = re.sub( 66 | "[#][ ][i][f][ ][d][e][f][i][n][e][d][ ][(][ ].*?[ ][)]", "", function 67 | ) 68 | function = re.sub("[#][ ][i][f][ ][^ ]*", "", function) 69 | function = function.replace("# else", "") 70 | function = function.replace("# endif", "") 71 | function = function.strip() 72 | return function 73 | 74 | def extract_functions(self, code): 75 | """Extract functions from tokenized C code""" 76 | if isinstance(code, list): 77 | code = " ".join(code) 78 | else: 79 | assert isinstance(code, str) 80 | 81 | try: 82 | code = self.clean_hashtags_function(code) 83 | code = code.replace("ENDCOM", "\n").replace("▁", "SPACETOKEN") 84 | tokens, token_types = self.get_tokens_and_types(code) 85 | tokens = list(zip(tokens, token_types)) 86 | except KeyboardInterrupt: 87 | raise 88 | except: 89 | return [], [] 90 | i = ind_iter(len(tokens)) 91 | functions_standalone = [] 92 | functions_class = [] 93 | try: 94
| token, token_type = tokens[i.i] 95 | except: 96 | return [], [] 97 | while True: 98 | try: 99 | # detect function 100 | if token == ")" and ( 101 | (tokens[i.i + 1][0] == "{" and tokens[i.i + 2][0] != "}") 102 | or ( 103 | tokens[i.i + 1][0] == "throw" 104 | and tokens[i.i + 4][0] == "{" 105 | and tokens[i.i + 5][0] != "}" 106 | ) 107 | ): 108 | # go previous until the start of function 109 | while token not in {";", "}", "{"}: 110 | try: 111 | i.prev() 112 | except StopIteration: 113 | break 114 | token = tokens[i.i][0] 115 | # We are at the beginning of the function 116 | i.next() 117 | token, token_type = tokens[i.i] 118 | if token_type == "comment": 119 | token = token.strip() 120 | token += " ENDCOM" 121 | function = [token] 122 | token_types = [token_type] 123 | while token != "{": 124 | i.next() 125 | token, token_type = tokens[i.i] 126 | if token_type == "comment": 127 | token = token.strip() 128 | token += " ENDCOM" 129 | function.append(token) 130 | token_types.append(token_type) 131 | 132 | if token_types[function.index("(") - 1] not in IDENTIFIERS: 133 | continue 134 | if token_types[function.index("(") - 1] == "field_identifier": 135 | field_identifier = True 136 | else: 137 | field_identifier = False 138 | if token == "{": 139 | number_indent = 1 140 | while not (token == "}" and number_indent == 0): 141 | try: 142 | i.next() 143 | token, token_type = tokens[i.i] 144 | if token == "{": 145 | number_indent += 1 146 | elif token == "}": 147 | number_indent -= 1 148 | if token_type == "comment": 149 | token = token.strip() 150 | token += " ENDCOM" 151 | function.append(token) 152 | except StopIteration: 153 | break 154 | 155 | if ( 156 | "static" in function[0 : function.index("{")] 157 | or "::" not in function[0 : function.index("(")] 158 | and not field_identifier 159 | ): 160 | function = " ".join(function) 161 | function = re.sub( 162 | "[<][ ][D][O][C][U][M][E][N][T].*?[>] ", "", function 163 | ) 164 | function = self.clean_hashtags_function(function) 165 | function = function.strip() 166 | function = function.replace("\n", "ENDCOM").replace( 167 | "SPACETOKEN", "▁" 168 | ) 169 | if not re.sub( 170 | "[^ ]*[ ][(][ ]\w*([ ][,][ ]\w*)*[ ][)]", 171 | "", 172 | function[: function.index("{")], 173 | ).strip().startswith("{") and not function.startswith("#"): 174 | functions_standalone.append(function) 175 | else: 176 | function = " ".join(function) 177 | function = re.sub( 178 | "[<][ ][D][O][C][U][M][E][N][T].*?[>] ", "", function 179 | ) 180 | function = self.clean_hashtags_function(function) 181 | function = function.strip() 182 | function = function.replace("\n", "ENDCOM").replace( 183 | "SPACETOKEN", "▁" 184 | ) 185 | if not re.sub( 186 | "[^ ]*[ ][(][ ]\w*([ ][,][ ]\w*)*[ ][)]", 187 | "", 188 | function[: function.index("{")], 189 | ).strip().startswith("{") and not function.startswith("#"): 190 | functions_class.append(function) 191 | i.next() 192 | token = tokens[i.i][0] 193 | except KeyboardInterrupt: 194 | raise 195 | except: 196 | break 197 | 198 | return functions_standalone, functions_class 199 | -------------------------------------------------------------------------------- /code_prepro/lang_processors/cpp_processor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | from code_prepro.lang_processors.tree_sitter_processor import ( 8 | TreeSitterLangProcessor, 9 | ) 10 | from code_prepro.lang_processors.java_processor import ( 11 | JAVA_TOKEN2CHAR, 12 | JAVA_CHAR2TOKEN, 13 | ) 14 | from code_prepro.lang_processors.tokenization_utils import ind_iter 15 | import re 16 | 17 | IDENTIFIERS = {"identifier", "field_identifier"} 18 | 19 | CPP_TOKEN2CHAR = JAVA_TOKEN2CHAR.copy() 20 | CPP_CHAR2TOKEN = JAVA_CHAR2TOKEN.copy() 21 | 22 | 23 | class CppProcessor(TreeSitterLangProcessor): 24 | def __init__(self, root_folder): 25 | super().__init__( 26 | language="cpp", 27 | ast_nodes_type_string=["comment", "string_literal", "char_literal"], 28 | stokens_to_chars=CPP_TOKEN2CHAR, 29 | chars_to_stokens=CPP_CHAR2TOKEN, 30 | root_folder=root_folder, 31 | ) 32 | 33 | def get_function_name(self, function): 34 | return self.get_first_token_before_first_parenthesis(function) 35 | 36 | def extract_arguments(self, function): 37 | return self.extract_arguments_using_parentheses(function) 38 | 39 | def clean_hashtags_function(self, function): 40 | function = re.sub('[#][ ][i][n][c][l][u][d][e][ ]["].*?["]', "", function) 41 | function = re.sub("[#][ ][i][n][c][l][u][d][e][ ][<].*?[>]", "", function) 42 | function = re.sub("[#][ ][i][f][n][d][e][f][ ][^ ]*", "", function) 43 | function = re.sub("[#][ ][i][f][d][e][f][ ][^ ]*", "", function) 44 | function = re.sub( 45 | "[#][ ][d][e][f][i][n][e][ ][^ ]*[ ][(][ ].*?[ ][)][ ][(][ ].*[ ][)]", 46 | "", 47 | function, 48 | ) 49 | function = re.sub( 50 | "[#][ ][d][e][f][i][n][e][ ][^ ]*[ ][(][ ].*?[ ][)][ ][{][ ].*[ ][}]", 51 | "", 52 | function, 53 | ) 54 | function = re.sub( 55 | '[#][ ][d][e][f][i][n][e][ ][^ ]*[ ]([(][ ])?["].*?["]([ ][)])?', 56 | "", 57 | function, 58 | ) 59 | function = re.sub( 60 | "[#][ ][d][e][f][i][n][e][ ][^ ]*[ ]([(][ ])?\d*\.?\d*([ ][+-/*][ ]?\d*\.?\d*)?([ ][)])?", 61 | "", 62 | function, 63 | ) 64 | function = re.sub("[#][ ][d][e][f][i][n][e][ ][^ ]", "", function) 65 | function = re.sub( 66 | "[#][ ][i][f][ ][d][e][f][i][n][e][d][ ][(][ ].*?[ ][)]", "", function 67 | ) 68 | function = re.sub("[#][ ][i][f][ ][^ ]*", "", function) 69 | function = function.replace("# else", "") 70 | function = function.replace("# endif", "") 71 | function = function.strip() 72 | return function 73 | 74 | def extract_functions(self, code): 75 | """Extract functions from tokenized C++ code""" 76 | if isinstance(code, list): 77 | code = " ".join(code) 78 | else: 79 | assert isinstance(code, str) 80 | 81 | try: 82 | code = self.clean_hashtags_function(code) 83 | code = code.replace("ENDCOM", "\n").replace("▁", "SPACETOKEN") 84 | tokens, token_types = self.get_tokens_and_types(code) 85 | tokens = list(zip(tokens, token_types)) 86 | except KeyboardInterrupt: 87 | raise 88 | except: 89 | return [], [] 90 | i = ind_iter(len(tokens)) 91 | functions_standalone = [] 92 | functions_class = [] 93 | try: 94 | token, token_type = tokens[i.i] 95 | except: 96 | return [], [] 97 | while True: 98 | try: 99 | # detect function 100 | if token == ")" and ( 101 | (tokens[i.i + 1][0] == "{" and tokens[i.i + 2][0] != "}") 102 | or ( 103 | tokens[i.i + 1][0] == "throw" 104 | and tokens[i.i + 4][0] == "{" 105 | and tokens[i.i + 5][0] != "}" 106 | ) 107 | ): 108 | # go previous until the start of function 109 | while token not in {";", "}", "{"}: 110 | try: 111 | i.prev() 112 | except StopIteration: 113 | break 114 | token = tokens[i.i][0] 115 | # We are at the beginning of the function 116 | i.next() 117 | token, token_type = tokens[i.i] 118 | 
if token_type == "comment": 119 | token = token.strip() 120 | token += " ENDCOM" 121 | function = [token] 122 | token_types = [token_type] 123 | while token != "{": 124 | i.next() 125 | token, token_type = tokens[i.i] 126 | if token_type == "comment": 127 | token = token.strip() 128 | token += " ENDCOM" 129 | function.append(token) 130 | token_types.append(token_type) 131 | 132 | if token_types[function.index("(") - 1] not in IDENTIFIERS: 133 | continue 134 | if token_types[function.index("(") - 1] == "field_identifier": 135 | field_identifier = True 136 | else: 137 | field_identifier = False 138 | if token == "{": 139 | number_indent = 1 140 | while not (token == "}" and number_indent == 0): 141 | try: 142 | i.next() 143 | token, token_type = tokens[i.i] 144 | if token == "{": 145 | number_indent += 1 146 | elif token == "}": 147 | number_indent -= 1 148 | if token_type == "comment": 149 | token = token.strip() 150 | token += " ENDCOM" 151 | function.append(token) 152 | except StopIteration: 153 | break 154 | 155 | if ( 156 | "static" in function[0 : function.index("{")] 157 | or "::" not in function[0 : function.index("(")] 158 | and not field_identifier 159 | ): 160 | function = " ".join(function) 161 | function = re.sub( 162 | "[<][ ][D][O][C][U][M][E][N][T].*?[>] ", "", function 163 | ) 164 | function = self.clean_hashtags_function(function) 165 | function = function.strip() 166 | function = function.replace("\n", "ENDCOM").replace( 167 | "SPACETOKEN", "▁" 168 | ) 169 | if not re.sub( 170 | "[^ ]*[ ][(][ ]\w*([ ][,][ ]\w*)*[ ][)]", 171 | "", 172 | function[: function.index("{")], 173 | ).strip().startswith("{") and not function.startswith("#"): 174 | functions_standalone.append(function) 175 | else: 176 | function = " ".join(function) 177 | function = re.sub( 178 | "[<][ ][D][O][C][U][M][E][N][T].*?[>] ", "", function 179 | ) 180 | function = self.clean_hashtags_function(function) 181 | function = function.strip() 182 | function = function.replace("\n", "ENDCOM").replace( 183 | "SPACETOKEN", "▁" 184 | ) 185 | if not re.sub( 186 | "[^ ]*[ ][(][ ]\w*([ ][,][ ]\w*)*[ ][)]", 187 | "", 188 | function[: function.index("{")], 189 | ).strip().startswith("{") and not function.startswith("#"): 190 | functions_class.append(function) 191 | i.next() 192 | token = tokens[i.i][0] 193 | except KeyboardInterrupt: 194 | raise 195 | except: 196 | break 197 | 198 | return functions_standalone, functions_class 199 | --------------------------------------------------------------------------------
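The C and C++ processors above share the same tree-sitter based extraction logic and differ only in the grammar they load ("c" vs. "cpp"). A minimal usage sketch follows, assuming root_folder points at the directory holding the compiled tree-sitter grammar that TreeSitterLangProcessor loads; the path and the code snippet below are placeholders for illustration, not values taken from the repository.

from code_prepro.lang_processors.cpp_processor import CppProcessor

# Placeholder path: assumed to be the folder holding the compiled tree-sitter
# grammars (e.g. my-languages.so) that TreeSitterLangProcessor expects.
processor = CppProcessor(root_folder="code_prepro/lang_processors")

# extract_functions expects tokenized (space-separated) code, either as one
# string or as a list of tokens, and returns two lists of function strings.
snippet = "int add ( int a , int b ) { return a + b ; }"
standalone, class_methods = processor.extract_functions(snippet)

print(standalone)      # free functions
print(class_methods)   # functions that look like class members (e.g. contain "::")

Note that on input the parser cannot handle, extract_functions deliberately swallows the error and returns ([], []), so an empty result means "nothing extracted" rather than a raised exception.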