├── .gitignore ├── LICENSE ├── README.md ├── benchmark ├── README.md ├── runtime_all.png └── runtime_raise.png ├── code_tokenize ├── __init__.py ├── config.py ├── lang │ ├── __init__.py │ ├── base_visitors.py │ ├── go │ │ └── __init__.py │ ├── java │ │ └── __init__.py │ ├── js │ │ └── __init__.py │ ├── php │ │ └── __init__.py │ ├── python │ │ ├── __init__.py │ │ └── indent.py │ └── ruby │ │ └── __init__.py ├── tokenizer.py └── tokens.py ├── pyproject.toml ├── requirements.txt ├── resources ├── code_tokenize.png └── code_tokenize.svg ├── setup.cfg ├── setup.py └── tests ├── __init__.py └── test_tokenization.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Project specific ignore 132 | build/ 133 | 134 | data/ 135 | .DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2021-2022 Cedric Richter 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 5 | ------------------------------------------------ 6 | > Fast tokenization and structural analysis of 7 | any programming language in Python 8 | 9 | Programming Language Processing (PLP) brings the capabilities of modern NLP systems to the world of programming languages. 10 | To achieve high-performance PLP systems, existing methods often take advantage of the fully defined nature of programming languages. In particular, the syntactic structure can be exploited to gain knowledge about programs. 11 | 12 | **code.tokenize** provides easy access to the syntactic structure of a program. The tokenizer converts a program into a sequence of program tokens ready for further end-to-end processing. 13 | By relating each token to an AST node, it is possible to easily extend the program representation with further syntactic information. 14 | 15 | ## Installation 16 | The package is tested under Python 3. It can be installed via: 17 | ``` 18 | pip install code-tokenize 19 | ``` 20 | 21 | ## Usage 22 | code.tokenize can tokenize nearly any program code in a few lines of code: 23 | ```python 24 | import code_tokenize as ctok 25 | 26 | # Python 27 | ctok.tokenize( 28 | ''' 29 | def my_func(): 30 | print("Hello World") 31 | ''', 32 | lang = "python") 33 | 34 | # Output: [def, my_func, (, ), :, #INDENT#, ...] 35 | 36 | # Java 37 | ctok.tokenize( 38 | ''' 39 | public static void main(String[] args){ 40 | System.out.println("Hello World"); 41 | } 42 | ''', 43 | lang = "java", 44 | syntax_error = "ignore") 45 | 46 | # Output: [public, static, void, main, (, String, [, ], args, ), {, System, ...] 47 | 48 | # JavaScript 49 | ctok.tokenize( 50 | ''' 51 | alert("Hello World"); 52 | ''', 53 | lang = "javascript", 54 | syntax_error = "ignore") 55 | 56 | # Output: [alert, (, "Hello World", ), ;] 57 | 58 | 59 | ``` 60 | 61 | ## Supported languages 62 | code.tokenize employs [tree-sitter](https://tree-sitter.github.io/tree-sitter/) as a backend. Therefore, in principle, any language supported by tree-sitter is also 63 | supported by a tokenizer in code.tokenize. 64 | 65 | For some languages, this library supports additional 66 | features that are not directly supported by tree-sitter. 67 | Therefore, we distinguish between three language classes 68 | and support the following language identifiers: 69 | 70 | - `native`: python 71 | - `advanced`: java 72 | - `basic`: javascript, go, ruby, cpp, c, swift, rust, ... 73 | 74 | Languages in the `native` class support all features 75 | of this library and are extensively tested. `advanced` languages are tested but do not support the full feature set. Languages of the `basic` class are not tested and 76 | only support the feature set of the backend. They can still be used for tokenization and AST parsing. 77 |
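## Working with tokens
Tokens produced from the AST are backed by the tree-sitter leaf they were created from. The following is a minimal sketch of this part of the API; the attribute names (`type`, `ast_node`, `statement_head`) follow `code_tokenize/tokens.py`, and the output comments are illustrative:
```python
import code_tokenize as ctok

tokens = ctok.tokenize("def my_func():\n    bar()", lang = "python")

tok = tokens[1]               # the 'my_func' token
print(tok.type)               # type of the backing AST leaf, e.g. identifier
print(tok.ast_node)           # tree-sitter node this token was created from
print(tok.statement_head)     # first token of the enclosing statement: def
```
Special tokens such as `#INDENT#` or `#NEWLINE#` are not backed by an AST node and therefore do not provide these attributes.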
78 | ## How to contribute 79 | **Is your language not natively supported by code.tokenize, or does the tokenization seem incorrect?** Then change it! 80 | 81 | While code.tokenize is developed mainly as a helper library for internal research projects, we welcome pull requests of any sort (whether it is a new feature or a bug fix). 82 | 83 | **Want to help test more languages?** 84 | Our goal is to support as many languages as possible at a `native` level. However, languages at the `basic` level are completely untested. You can help by testing `basic` languages and reporting issues in the tokenization process! 85 | 86 | ## Release history 87 | * 0.2.0 88 | * Major API redesign! 89 | * CHANGE: AST parsing is now done by an external library: [code_ast](https://github.com/cedricrupb/code_ast) 90 | * CHANGE: Visitor pattern instead of custom tokenizer 91 | * CHANGE: Custom visitors for language-dependent tokenization 92 | * 0.1.0 93 | * The first proper release 94 | * CHANGE: Language-specific tokenizer configuration 95 | * CHANGE: Basic analyses of the program structure and token role 96 | * CHANGE: Documentation 97 | * 0.0.1 98 | * Work in progress 99 | 100 | ## Project Info 101 | The goal of this project is to provide developers in the 102 | programming language processing community with easy 103 | access to program tokenization and AST parsing. This is currently developed as a helper library for internal research projects. Therefore, it will only be updated 104 | as needed. 105 | 106 | Feel free to open an issue if anything unexpected 107 | happens. 108 | 109 | Distributed under the MIT license. See ``LICENSE`` for more information. 110 | 111 | This project was developed as part of our research related to: 112 | ```bibtex 113 | @inproceedings{richter2022tssb, 114 | title={TSSB-3M: Mining single statement bugs at massive scale}, 115 | author={Richter, Cedric and Wehrheim, Heike}, 116 | booktitle={MSR}, 117 | year={2022} 118 | } 119 | ``` 120 | 121 | We thank the developers of the [tree-sitter](https://tree-sitter.github.io/tree-sitter/) library. Without tree-sitter this project would not be possible. 122 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking 2 | 3 | In the following, we benchmark the runtime of **code.tokenize** for parsing Python functions. To obtain a realistic set of Python code for PLP, we employ 4 | the Python portion of the [CodeSearchNet](https://github.com/github/CodeSearchNet) corpus. The corpus includes more than 500K Python functions 5 | annotated for training. 6 | 7 | ## Environment 8 | We benchmark the following implementation: 9 | ```python 10 | import code_tokenize as ctok 11 | 12 | ctok.tokenize( 13 | source_code, 14 | lang = 'python', 15 | syntax_error = 'raise' 16 | ) 17 | ``` 18 | Therefore, we skip all instances that contain syntax errors. 19 | 20 | For benchmarking, we employ a MacBook Pro M1 with 8GB RAM. 21 | 22 | ## Results 23 | We start by plotting the mean runtime of the tokenizer in relation 24 | to the size of the Python function (in number of tokens). To determine the size of a program, we count the tokens in the pretokenized code. For brevity, we show results for functions below 1024 tokens (since this is the typical size of functions employed in PLP).
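For reference, the per-function runtimes could be collected with a loop of the following shape (a sketch; the exact measurement harness is not part of this repository):
```python
import time
import code_tokenize as ctok

def time_tokenize(source_code):
    # Returns the tokenization time in seconds, or None if the snippet does not parse
    try:
        start = time.perf_counter()
        ctok.tokenize(source_code, lang = 'python', syntax_error = 'raise')
        return time.perf_counter() - start
    except SyntaxError:
        return None  # instance contains syntax errors and is skipped
```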

27 | 28 |

30 | We observe that the time for tokenization scales linearly with the number of tokens in the Python function. Even large functions with up to 1024 tokens can be tokenized within 10ms. 31 | Note: The plot only shows runtimes for function implementations that are parsed without an error (Python 2 functions will likely produce an error). However, functions that raise an exception will also run in a similar time window. 32 | 33 | 34 | ## Complete set 35 | Below is the uncut version of the diagram. Even for large-scale functions with 36 | more than 25K tokens, the tokenizer does not take much longer than 100ms. 37 | 38 |

39 | 40 |

41 | -------------------------------------------------------------------------------- /benchmark/runtime_all.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cedricrupb/code_tokenize/6797bcf682edea672677bf3bce708d38f9d20dd0/benchmark/runtime_all.png -------------------------------------------------------------------------------- /benchmark/runtime_raise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cedricrupb/code_tokenize/6797bcf682edea672677bf3bce708d38f9d20dd0/benchmark/runtime_raise.png -------------------------------------------------------------------------------- /code_tokenize/__init__.py: -------------------------------------------------------------------------------- 1 | from code_ast import ASTParser, ASTVisitor 2 | 3 | from .tokenizer import tokenize_tree 4 | from .lang import load_from_lang_config 5 | 6 | import logging as logger 7 | 8 | # Main function -------------------------------- 9 | 10 | def tokenize(source_code, lang = "guess", **kwargs): 11 | """ 12 | Tokenizes source code of most programming languages quickly. 13 | 14 | Given source code as a string, this function quickly tokenizes 15 | the code into basic program tokens. 16 | The function uses tree-sitter as a backend. Therefore, this 17 | function not only supports most programming languages (see README) 18 | but also relates every token to an AST node. 19 | Tokens can be used to traverse the program AST. 20 | 21 | Parameters 22 | ---------- 23 | source_code : str 24 | Source code to be parsed as a string. Also 25 | supports parsing of incomplete source code 26 | snippets (by deactivating the syntax checker; see syntax_error) 27 | 28 | lang : [python, java, javascript, ...] 29 | String identifier of the programming language 30 | to be parsed. Supported are most programming languages 31 | including python, java and javascript (see README) 32 | Default: guess (guesses the language; currently not supported and raises an error) 33 | 34 | syntax_error : [raise, warn, ignore] 35 | Reaction to syntax errors in the code snippet. 36 | raise: raises a SyntaxError 37 | warn: prints a warning to console 38 | ignore: ignores syntax errors. Helpful for parsing code snippets. 39 | Default: raise 40 | 41 | visitors : list[Visitor] 42 | Optional list of visitors that should be executed during tokenization. 43 | Since code is tokenized by traversing the parsed AST, visitors 44 | can be used to run further AST-based analyses. 45 | 46 | Returns 47 | ------- 48 | TokenSequence 49 | A list of tokens representing the source code snippet. 50 | 51 | """ 52 | 53 | if len(source_code.strip()) == 0: raise ValueError("The code string is empty. 
Cannot tokenize anything empty: %s" % source_code) 54 | 55 | # If lang == guess, automatically determine the language 56 | if lang == "guess": lang = _lang_detect(source_code) 57 | 58 | logger.debug("Parses source code with parser for %s" % lang) 59 | 60 | # Setup config 61 | config = load_from_lang_config(lang, **kwargs) 62 | 63 | # Parse source tree 64 | parser = ASTParser(config.lang) 65 | tree, code = parser.parse(source_code) 66 | 67 | return tokenize_tree(config, tree.root_node, code, visitors = config.visitors) 68 | 69 | 70 | 71 | # Lang detect -------------------------------------- 72 | 73 | 74 | def _lang_detect(source_code): 75 | """Guesses the source code type using pygments""" 76 | raise NotImplementedError( 77 | "Guessing the language automatically is currently not implemented. Please specify a language with the lang keyword\n code_tokenize.tokenize(code, lang = your_lang)" 78 | ) 79 | 80 | -------------------------------------------------------------------------------- /code_tokenize/config.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | 4 | from .lang.base_visitors import LeafVisitor 5 | 6 | 7 | class TokenizationConfig: 8 | """Helper object to translate arguments of tokenize to config object""" 9 | 10 | def __init__(self, lang, **kwargs): 11 | self.lang = lang 12 | self.syntax_error = "raise" # Options: raise, warn, ignore 13 | 14 | self.indent_tokens = False # Whether to represent indentations and newlines (Helpful for script languages like Python) 15 | self.num_whitespaces_for_indent = 4 16 | 17 | # A list of all statement node defined in the language 18 | self.statement_types = [ 19 | "*_statement", "*_definition", "*_declaration" 20 | ] 21 | 22 | self.visitors = [LeafVisitor] # visitor classes which should be run during analysis 23 | 24 | self.update(kwargs) 25 | 26 | 27 | def update(self, kwargs): 28 | for k, v in kwargs.items(): 29 | 30 | if k not in self.__dict__: 31 | raise TypeError("TypeError: tokenize() got an unexpected keyword argument '%s'" % k) 32 | 33 | self.__dict__[k] = v 34 | 35 | def __repr__(self): 36 | 37 | elements = [] 38 | for k, v in self.__dict__.items(): 39 | if v is not None: 40 | elements.append("%s=%s" % (k, v)) 41 | 42 | return "Config(%s)" % ", ".join(elements) 43 | 44 | 45 | 46 | # From config ---------------------------------------------------------------- 47 | 48 | def load_from_config(config_path, **kwargs): 49 | """Load from a config file. 
Config options can still be overwritten with kwargs""" 50 | 51 | with open(config_path, "r") as config_file: 52 | config = json.load(config_file) 53 | config.update(kwargs) 54 | 55 | return TokenizationConfig(**config) 56 | 57 | -------------------------------------------------------------------------------- /code_tokenize/lang/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from ..config import TokenizationConfig 3 | 4 | from .python import create_tokenization_config as pytok_config 5 | from .java import create_tokenization_config as jvtok_config 6 | from .go import create_tokenization_config as gotok_config 7 | from .js import create_tokenization_config as jstok_config 8 | from .php import create_tokenization_config as phptok_config 9 | from .ruby import create_tokenization_config as rubytok_config 10 | 11 | 12 | def load_from_lang_config(lang, **kwargs): 13 | 14 | if lang == "python" : base_config = pytok_config() 15 | elif lang == "java" : base_config = jvtok_config() 16 | elif lang == "go" : base_config = gotok_config() 17 | elif lang == "javascript" : base_config = jstok_config() 18 | elif lang == "php" : base_config = phptok_config() 19 | elif lang == "ruby" : base_config = rubytok_config() 20 | else : base_config = TokenizationConfig(lang) 21 | 22 | base_config.update(kwargs) 23 | return base_config 24 | -------------------------------------------------------------------------------- /code_tokenize/lang/base_visitors.py: -------------------------------------------------------------------------------- 1 | from code_ast import ASTVisitor 2 | 3 | # Basic visitor ----------------------------------------------------------- 4 | 5 | class LeafVisitor(ASTVisitor): 6 | 7 | def __init__(self, node_handler): 8 | self.node_handler = node_handler 9 | 10 | def visit_string(self, node): 11 | self.node_handler(node) 12 | return False 13 | 14 | def visit(self, node): 15 | if node.child_count == 0: 16 | self.node_handler(node) 17 | return False -------------------------------------------------------------------------------- /code_tokenize/lang/go/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from ...config import TokenizationConfig 3 | from ...tokens import NewlineToken 4 | 5 | from ..base_visitors import LeafVisitor 6 | 7 | 8 | # Tokenization config ---------------------------------------------------------------- 9 | 10 | def create_tokenization_config(): 11 | return TokenizationConfig( 12 | lang = 'go', 13 | statement_types = ["*_statement", "*_declaration"], 14 | visitors = [GoLeafVisitor], 15 | indent_tokens = False 16 | ) 17 | 18 | # Custom leaf visitor ---------------------------------------------------------------- 19 | 20 | class GoLeafVisitor(LeafVisitor): 21 | 22 | def visit_interpreted_string_literal(self, node): 23 | self.node_handler(node) 24 | return False 25 | 26 | def visit(self, node): 27 | if node.type == "\n": 28 | self.node_handler.handle_token(NewlineToken(self.node_handler.config)) 29 | return False 30 | return super().visit(node) -------------------------------------------------------------------------------- /code_tokenize/lang/java/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from ...config import TokenizationConfig 3 | 4 | # Tokenization config ---------------------------------------------------------------- 5 | 6 | def create_tokenization_config(): 7 | return TokenizationConfig( 8 | lang = 'java', 9 | 
statement_types = ["*_statement", "*_definition", "*_declaration"], 10 | indent_tokens = False 11 | ) 12 | -------------------------------------------------------------------------------- /code_tokenize/lang/js/__init__.py: -------------------------------------------------------------------------------- 1 | from ...config import TokenizationConfig 2 | 3 | # Tokenization config ---------------------------------------------------------------- 4 | 5 | def create_tokenization_config(): 6 | return TokenizationConfig( 7 | lang = 'javascript', 8 | statement_types = ["*_statement", "*_declaration"], 9 | indent_tokens = False 10 | ) -------------------------------------------------------------------------------- /code_tokenize/lang/php/__init__.py: -------------------------------------------------------------------------------- 1 | from ...config import TokenizationConfig 2 | 3 | # Tokenization config ---------------------------------------------------------------- 4 | 5 | def create_tokenization_config(): 6 | return TokenizationConfig( 7 | lang = 'php', 8 | statement_types = ["*_statement"], 9 | indent_tokens = False 10 | ) -------------------------------------------------------------------------------- /code_tokenize/lang/python/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from ...config import TokenizationConfig 3 | 4 | from ..base_visitors import LeafVisitor 5 | from .indent import IndentVisitor 6 | 7 | 8 | # Tokenization config ---------------------------------------------------------------- 9 | 10 | def create_tokenization_config(): 11 | return TokenizationConfig( 12 | lang = "python", 13 | statement_types = ["*_statement", "*_definition"], 14 | visitors = [PythonLeafVisitor, IndentVisitor], 15 | indent_tokens = True 16 | ) 17 | 18 | # Custom leaf visitor ---------------------------------------------------------------- 19 | 20 | class PythonLeafVisitor(LeafVisitor): 21 | 22 | def visit_unary_operator(self, node): 23 | if node.children[-1].type == "integer": 24 | self.node_handler(node) 25 | return False -------------------------------------------------------------------------------- /code_tokenize/lang/python/indent.py: -------------------------------------------------------------------------------- 1 | """Hierarchical indentation independent of the concrete program formatting""" 2 | 3 | from code_ast.visitor import ASTVisitor 4 | from ...tokens import IndentToken, DedentToken, NewlineToken 5 | 6 | 7 | class IndentVisitor(ASTVisitor): 8 | 9 | def __init__(self, token_handler): 10 | super().__init__() 11 | self.config = token_handler.config 12 | self.handler = token_handler 13 | 14 | def visit_block(self, block): 15 | self.handler.handle_token(IndentToken(self.config)) 16 | 17 | def leave_block(self, block): 18 | self.handler.handle_token(DedentToken(self.config)) 19 | 20 | def leave_comment(self, comment): 21 | self.handler.handle_token(NewlineToken(self.config)) 22 | 23 | def leave(self, node): 24 | if not node.type.endswith('statement'): return 25 | self.handler.handle_token(NewlineToken(self.config)) -------------------------------------------------------------------------------- /code_tokenize/lang/ruby/__init__.py: -------------------------------------------------------------------------------- 1 | from ...config import TokenizationConfig 2 | 3 | # Tokenization config ---------------------------------------------------------------- 4 | 5 | def create_tokenization_config(): 6 | return TokenizationConfig( 7 | lang = 'ruby', 8 | 
statement_types = ["*_statement"], 9 | indent_tokens = True 10 | ) -------------------------------------------------------------------------------- /code_tokenize/tokenizer.py: -------------------------------------------------------------------------------- 1 | 2 | import logging as logger 3 | from code_ast.visitor import ASTVisitor, ResumingVisitorComposition 4 | 5 | from .tokens import ASTToken, TokenSequence 6 | 7 | 8 | # Interface ---------------------------------------------------------------- 9 | 10 | def tokenize_tree(config, code_tree, code_lines, visitors = None): 11 | """ 12 | Transforms an AST tree into a token sequence 13 | 14 | Function to analyse an AST tree, resulting 15 | in a token sequence. The parsing process 16 | is fully customizable and is guided by the given 17 | configuration. 18 | Tokenizers also support additional analyses 19 | of the AST tree and extensions to the token sequence. 20 | 21 | Parameters 22 | ---------- 23 | config : TokenizationConfig 24 | A configuration which is used to initialize the tokenizer 25 | 26 | code_tree: tree-sitter root node 27 | Root node of the program to be tokenized 28 | 29 | code_lines: list[str] 30 | Source lines of the program code to be tokenized. 31 | Has to be related to code_tree. Otherwise, behavior 32 | is undefined. 33 | 34 | Returns 35 | ------- 36 | TokenSequence 37 | A sequence of program tokens representing the given program 38 | 39 | """ 40 | return create_tokenizer(config)(code_tree, code_lines, visitors = visitors) 41 | 42 | 43 | # Tokenize ---------------------------------------------------------------- 44 | 45 | 46 | class Tokenizer: 47 | """ 48 | Basic tokenizer for parsing an AST 49 | 50 | The tokenizer parses a given AST into a token sequence. 51 | Each token represents an AST leaf. 52 | No further analyses or additions are performed. 
53 | """ 54 | 55 | def __init__(self, config): 56 | self.config = config 57 | self._visitor_factories = [] 58 | 59 | def append_visitor(self, visitor_factory): 60 | self._visitor_factories.append(visitor_factory) 61 | 62 | def _create_token_handler(self, code_lines): 63 | return TokenHandler(self.config, code_lines) 64 | 65 | def _create_tree_visitors(self, token_handler, visitors = None): 66 | visitors = visitors or [] 67 | visitors += self._visitor_factories 68 | 69 | visitors = [visitor_fn(token_handler) 70 | if callable(visitor_fn) 71 | else visitor_fn 72 | for visitor_fn in visitors] 73 | 74 | return ResumingVisitorComposition( 75 | ErrorVisitor(self.config), 76 | *visitors 77 | ) 78 | 79 | def __call__(self, code_tree, code_lines, visitors = None): 80 | token_handler = self._create_token_handler(code_lines) 81 | tree_visitor = self._create_tree_visitors(token_handler, visitors) 82 | 83 | # Run tree visitor 84 | tree_visitor.walk(code_tree) 85 | 86 | return token_handler.tokens() 87 | 88 | 89 | def create_tokenizer(config): 90 | """Function to create tokenizer based on configuration""" 91 | return Tokenizer(config) 92 | 93 | 94 | # Basic visitor ----------------------------------------------------------- 95 | 96 | 97 | class LeafVisitor(ASTVisitor): 98 | 99 | def __init__(self, node_handler): 100 | self.node_handler = node_handler 101 | 102 | def visit_string(self, node): 103 | self.node_handler(node) 104 | return False 105 | 106 | def visit(self, node): 107 | if node.child_count == 0: 108 | self.node_handler(node) 109 | return False 110 | 111 | 112 | class ErrorVisitor(ASTVisitor): 113 | 114 | def __init__(self, config): 115 | self.config = config 116 | 117 | def visit_ERROR(self, node): 118 | 119 | if self.config.syntax_error == "raise": 120 | raise_syntax_error(node) 121 | return 122 | 123 | if self.config.syntax_error == "warn": 124 | warn_syntax_error(node) 125 | return 126 | 127 | # Node handler ------------------------------------------------------------ 128 | 129 | class TokenHandler: 130 | 131 | def __init__(self, config, source_code): 132 | self.config = config 133 | self.source_code = source_code 134 | 135 | self._tokens = [] 136 | 137 | def tokens(self): 138 | result = TokenSequence(self._tokens) 139 | self._tokens = [] 140 | return result 141 | 142 | def handle_token(self, token): 143 | if token.type == "newline" and self._tokens[-1].type in ["indent", "dedent", "newline"]: 144 | return # TODO: Blocking double newlines seems to be general. Better solution? 145 | 146 | self._tokens.append(token) 147 | 148 | def __call__(self, node): 149 | self.handle_token( 150 | ASTToken(self.config, node, self.source_code) 151 | ) 152 | 153 | # Error handling ----------------------------------------------------------- 154 | 155 | def _construct_error_msg(node): 156 | 157 | start_line, start_char = node.start_point 158 | end_line, end_char = node.end_point 159 | 160 | position = "?" 161 | if start_line == end_line: 162 | position = "in line %d [pos. %d - %d]" % (start_line, start_char, end_char) 163 | else: 164 | position = "inbetween line %d (start: %d) to line %d (end: %d)" % (start_line, start_char, end_line, end_char) 165 | 166 | return "Problem while parsing given code snipet. 
Error occured %s" % position 167 | 168 | 169 | def warn_syntax_error(node): 170 | logger.warn(_construct_error_msg(node)) 171 | 172 | 173 | def raise_syntax_error(node): 174 | raise SyntaxError(_construct_error_msg(node)) 175 | -------------------------------------------------------------------------------- /code_tokenize/tokens.py: -------------------------------------------------------------------------------- 1 | from code_ast.parsers import match_span 2 | 3 | # Cache Properties --------------------------------------------------------- 4 | 5 | def cached_property(fnc): 6 | """Helper decorator for lazy computing properties""" 7 | name = fnc.__name__ 8 | 9 | def get_or_compute(self): 10 | cache_attr = getattr(self, "_%s" % name, None) 11 | if cache_attr is not None: return cache_attr 12 | 13 | if not hasattr(self, "_cache"): self._cache = {} 14 | 15 | if name not in self._cache: 16 | self._cache[name] = fnc(self) 17 | 18 | return self._cache[name] 19 | 20 | return property(get_or_compute) 21 | 22 | 23 | # Tokens ------------------------------------------------------------------- 24 | 25 | class Token: 26 | """ 27 | A token represents a single program entity of a given source code 28 | 29 | Attributes 30 | ---------- 31 | text : str 32 | text of program token inside the parsed source code 33 | 34 | type : str 35 | token type or role inside a program. 36 | Often it refers to the type of token, e.g. identifier. 37 | Dependent on the tokenization process can also 38 | refer to contextual roles like variable definitions. 39 | 40 | config : TokenizerConfig 41 | configuration used to parse this token 42 | 43 | root_sequence : TokenSequence 44 | back reference to the sequence containing this token 45 | Might be None (independent token). 46 | 47 | """ 48 | 49 | def __init__(self, config, text): 50 | """Representing a single program token""" 51 | self.config = config 52 | self._text = text 53 | self._type = "token" 54 | 55 | self.root_sequence = None 56 | 57 | @property 58 | def text(self): 59 | return self._text 60 | 61 | @property 62 | def type(self): 63 | return self._type 64 | 65 | def __repr__(self): 66 | return self.text 67 | 68 | 69 | class IndentToken(Token): 70 | """ 71 | Basic token to indicate an indentation 72 | 73 | Helpful for indentation based languages such as Python. 74 | 75 | """ 76 | 77 | def __init__(self, config, new_line_before = True): 78 | super().__init__(config, "#INDENT#") 79 | self.new_line_before = new_line_before 80 | self._type = "indent" 81 | 82 | 83 | class DedentToken(Token): 84 | """ 85 | Basic token to indicate an dedentation 86 | 87 | Helpful for indentation based languages such as Python. 88 | 89 | """ 90 | 91 | def __init__(self, config, new_line_before = True): 92 | super().__init__(config, "#DEDENT#") 93 | self.new_line_before = new_line_before 94 | self._type = "dedent" 95 | 96 | 97 | class NewlineToken(Token): 98 | """ 99 | Basic token to indicate a newline 100 | 101 | Helpful for indentation based languages such as Python. 102 | 103 | """ 104 | 105 | def __init__(self, config): 106 | super().__init__(config, "#NEWLINE#") 107 | self._type = "newline" 108 | 109 | 110 | # AST backed token ---------------------------------------------------------------- 111 | 112 | class ASTToken(Token): 113 | """ 114 | Tokens that are related to leaf nodes inside an AST 115 | 116 | Attributes 117 | ---------- 118 | text : str 119 | text of program token inside the parsed source code 120 | 121 | type : str 122 | token type or role inside a program. 
123 | Often it refers to the type of token, e.g. identifier. 124 | Dependent on the tokenization process can also 125 | refer to contextual roles like variable definitions. 126 | 127 | ast_node : node object 128 | node inside an AST that is used to create this token 129 | 130 | statement_head : Token 131 | token representing the head (first token) of a statement 132 | 133 | parent_head : Token 134 | token representing the head of a parent statement (if existent) 135 | 136 | config : TokenizerConfig 137 | configuration used to parse this token 138 | 139 | root_sequence : TokenSequence 140 | back reference to the sequence containing this token 141 | Might be None (independent token). 142 | 143 | """ 144 | 145 | def __init__(self, config, ast_node, source_lines): 146 | super().__init__(config, None) 147 | self.ast_node = ast_node 148 | self.source_lines = source_lines 149 | self.root_sequence = None 150 | self._type = None 151 | 152 | def _create_token(self, node): 153 | if self.root_sequence is not None: 154 | return self.root_sequence.get_token_by_node(node) 155 | return ASTToken(self.config, node, self.source_lines) 156 | 157 | # API methods -------------------------------- 158 | 159 | @cached_property 160 | def text(self): 161 | return match_span(self.ast_node, self.source_lines) 162 | 163 | @cached_property 164 | def type(self): 165 | return self.ast_node.type 166 | 167 | @cached_property 168 | def statement_head(self): 169 | """Returns the token representing the head of a statement""" 170 | 171 | statement_types = self.config.statement_types 172 | 173 | parent_node = parent_statement_node(statement_types, self.ast_node) 174 | if parent_node is None: raise ValueError("No statement could be identified!") 175 | 176 | # Identify first token that belongs to the statement 177 | current_left = parent_node 178 | while not is_token(current_left): 179 | current_left = current_left.children[0] 180 | 181 | return self._create_token(current_left) 182 | 183 | @cached_property 184 | def parent_head(self): 185 | """ 186 | Returns head of parent node if it exists. 187 | 188 | If the current token belongs to a top level statement, 189 | the function return None. 
190 | """ 191 | # For identifying statements 192 | statement_types = self.config.statement_types 193 | parent_node = parent_statement_node(statement_types, self.ast_node) 194 | if parent_node is None: raise ValueError("No statement could be identified!") 195 | 196 | grandparent_node = parent_statement_node(statement_types, parent_node) 197 | if grandparent_node is None: return None 198 | 199 | # Identify first token that belongs to the statement 200 | current_left = grandparent_node 201 | while not is_token(current_left): 202 | current_left = current_left.children[0] 203 | 204 | return self._create_token(current_left) 205 | 206 | 207 | 208 | class VarUseToken(ASTToken): 209 | """AST token representing a variable usage (name of variable)""" 210 | 211 | def __init__(self, config, ast_node, source_lines): 212 | super().__init__(config, ast_node, source_lines) 213 | self._type = "use_var" 214 | 215 | 216 | class VarDefToken(ASTToken): 217 | """AST token representing a variable definition (name of variable)""" 218 | 219 | def __init__(self, config, ast_node, source_lines): 220 | super().__init__(config, ast_node, source_lines) 221 | self._type = "def_var" 222 | 223 | 224 | 225 | # Token Collection ----------------------------------------------------- 226 | 227 | class TokenSequence(list): 228 | """ 229 | Sequence of tokens 230 | 231 | Represent a sequence of tokens. It acts 232 | as a list while backreferencing each token 233 | in this collection. 234 | 235 | """ 236 | 237 | def __init__(self, tokens): 238 | super().__init__(tokens) 239 | 240 | self._map_nodes = {} 241 | 242 | for tok in self: 243 | tok.root_sequence = self 244 | 245 | if hasattr(tok, "ast_node"): 246 | self._map_nodes[node_key(tok.ast_node)] = tok 247 | 248 | def get_token_by_node(self, node): 249 | """Maps a given leaf node back to a token in this sequence.""" 250 | return self._map_nodes[node_key(node)] 251 | 252 | def iterstmts(self): 253 | """Splits the token sequence into a sequence of statement tokens""" 254 | def _iter_stmts(): 255 | current_head = None 256 | stmt = [] 257 | 258 | for tok in self: 259 | tok_head = tok.statement_head if hasattr(tok, "statement_head") else current_head 260 | 261 | if tok_head != current_head: 262 | if len(stmt) > 0: yield stmt 263 | current_head = tok_head 264 | stmt = [] 265 | 266 | stmt.append(tok) 267 | 268 | if len(stmt) > 0: yield stmt 269 | 270 | return _iter_stmts() 271 | 272 | 273 | # Utils ---------------------------------------------------------------- 274 | 275 | def match_type(type_regex, type): 276 | # TODO Support general regex (Is this needed?) 
277 | 278 | star_count = type_regex.count("*") 279 | 280 | if star_count == 0: 281 | return type == type_regex 282 | 283 | if star_count == 1: 284 | if type_regex[0] == "*": 285 | return type.endswith(type_regex[1:]) 286 | if type_regex[-1] == "*": 287 | return type.startswith(type_regex[:-1]) 288 | 289 | raise ValueError("Unsupported type regex: %s" % type_regex) 290 | 291 | 292 | def is_token(node): 293 | return node.type == "string" or not node.children 294 | 295 | 296 | def node_key(node): 297 | return (node.type, node.start_point, node.end_point) 298 | 299 | 300 | def parent_statement_node(statement_types, node): 301 | 302 | def is_statement(type): 303 | return any(match_type(reg, type) for reg in statement_types) 304 | 305 | # Go up till we find a statement node 306 | parent_node = node.parent 307 | while parent_node is not None and not is_statement(parent_node.type): 308 | parent_node = parent_node.parent 309 | 310 | return parent_node 311 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "code_tokenize" 7 | version = "v0.2.1" 8 | description = "Fast program tokenization and structural analysis in Python" 9 | readme = "README.md" 10 | requires-python = ">= 3.8" 11 | license = { file = "LICENSE.txt" } 12 | keywords = ["code", "tokenization", "tokenize", "program", "language processing"] 13 | 14 | authors = [{name = "Cedric Richter", email = "cedricr.upb@gmail.com"}] 15 | maintainers = [{name = "Cedric Richter", email = "cedricr.upb@gmail.com"}] 16 | 17 | classifiers = [ 18 | "Development Status :: 3 - Alpha", 19 | "Intended Audience :: Developers", 20 | "Topic :: Software Development :: Build Tools", 21 | "License :: OSI Approved :: MIT License", 22 | "Programming Language :: Python :: 3", 23 | "Programming Language :: Python :: 3.6", 24 | "Programming Language :: Python :: 3.7", 25 | "Programming Language :: Python :: 3.8", 26 | "Programming Language :: Python :: 3.9", 27 | "Programming Language :: Python :: 3.10", 28 | "Programming Language :: Python :: 3.11", 29 | "Programming Language :: Python :: 3.12", 30 | "Programming Language :: Python :: 3.13", 31 | "Programming Language :: Python :: 3 :: Only", 32 | ] 33 | 34 | dependencies = ["tree_sitter", "GitPython", "requests", "code_ast"] 35 | 36 | [project.urls] 37 | "Homepage" = "https://github.com/cedricrupb/code_tokenize" 38 | "Bug Reports" = "https://github.com/cedricrupb/code_tokenize/issues" 39 | "Source" = "https://github.com/cedricrupb/code_tokenize" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tree_sitter==0.21.3 2 | requests>=2.32.0 3 | GitPython>=3.1.41 4 | code_ast>=0.1.1 -------------------------------------------------------------------------------- /resources/code_tokenize.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cedricrupb/code_tokenize/6797bcf682edea672677bf3bce708d38f9d20dd0/resources/code_tokenize.png -------------------------------------------------------------------------------- /resources/code_tokenize.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
[code_tokenize.svg: project logo showing the snippet "code . tokenize" with its tokens labelled identifier, dot, attribute and func_call]
-------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | ong_description_content_type = text/markdown -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", "r") as f: 4 | long_description = f.read() 5 | 6 | setup( 7 | name = 'code_tokenize', 8 | packages = find_packages(exclude=['tests']), 9 | version = '0.2.1', 10 | license='MIT', 11 | description = 'Fast program tokenization and structural analysis in Python', 12 | long_description = long_description, 13 | long_description_content_type="text/markdown", 14 | author = 'Cedric Richter', 15 | author_email = 'cedricr.upb@gmail.com', 16 | url = 'https://github.com/cedricrupb/code_tokenize', 17 | download_url = 'https://github.com/cedricrupb/code_tokenize/archive/refs/tags/v0.2.1.tar.gz', 18 | keywords = ['code', 'tokenization', 'tokenize', 'program', 'language processing'], 19 | install_requires=[ 20 | 'tree_sitter==0.21.3', 21 | 'GitPython>=3.1.41', 22 | 'requests>=2.32.0', 23 | 'code-ast>=0.1.1' 24 | ], 25 | classifiers=[ 26 | 'Development Status :: 3 - Alpha', 27 | 'Intended Audience :: Developers', 28 | 'Topic :: Software Development :: Build Tools', 29 | 'License :: OSI Approved :: MIT License', 30 | 'Programming Language :: Python :: 3', 31 | 'Programming Language :: Python :: 3.6', 32 | 'Programming Language :: Python :: 3.7', 33 | 'Programming Language :: Python :: 3.8', 34 | 'Programming Language :: Python :: 3.9', 35 | 'Programming Language :: Python :: 3.10', 36 | 'Programming Language :: Python :: 3.11', 37 | 'Programming Language :: Python :: 3.12', 38 | 'Programming Language :: Python :: 3.13', 39 | ], 40 | ) -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cedricrupb/code_tokenize/6797bcf682edea672677bf3bce708d38f9d20dd0/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_tokenization.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | import code_tokenize as ctok 4 | 5 | class PythonTokenizationTestCase(TestCase): 6 | 7 | def test_tokenize1(self): 8 | tokens = ctok.tokenize("def my_func():\n bar()", lang = "python") 9 | expected = ["def", "my_func", "(", ")", ":", "#INDENT#", "bar", "(", ")", "#NEWLINE#", "#DEDENT#"] 10 | self.assertEqual(expected, [str(t) for t in tokens]) 11 | 12 | def test_tokenize2(self): 13 | tokens = ctok.tokenize("def my_func(x):\n x = x + 1\n return x", lang = "python") 14 | expected = ["def", "my_func", "(", "x", ")", ":", "#INDENT#", "x", "=", "x", "+", "1", "#NEWLINE#", "return", "x", "#NEWLINE#", "#DEDENT#"] 15 | self.assertEqual(expected, [str(t) for t in tokens]) 16 | 17 | def test_error_handling(self): 18 | self.assertRaises(SyntaxError, ctok.tokenize, "def my_func(x):\n x = x + 1 return x", lang = "python") 19 | 20 | def test_error_handling2(self): 21 | tokens = ctok.tokenize("def my_func(x):\n x = x + 1 return x", lang = "python", syntax_error = "ignore") 22 | expected = ["def", "my_func", "(", "x", ")", ":", "x", "=", "x", "+", "1", 
"#INDENT#", "return", "x", "#NEWLINE#", "#DEDENT#"] 23 | self.assertEqual(expected, [str(t) for t in tokens]) 24 | 25 | 26 | 27 | class JavaTokenizationTestCase(TestCase): 28 | 29 | def test_tokenize1(self): 30 | tokens = ctok.tokenize("public class Test {\npublic void myFunc(){\n bar();\n}\n}", lang = "java") 31 | expected = ["public", "class", "Test", "{", "public", "void", "myFunc", "(", ")", "{", "bar", "(", ")", ";", "}", "}"] 32 | self.assertEqual(expected, [str(t) for t in tokens]) 33 | 34 | def test_tokenize2(self): 35 | tokens = ctok.tokenize("public class Test {\npublic int myFunc(int x){\n x = x + 1;\n return x;\n}\n}", lang = "java") 36 | expected = ["public", "class", "Test", "{", "public", "int", "myFunc", "(", "int", "x", ")", "{", "x", "=", "x", "+", "1", ";", "return", "x", ";", "}", "}"] 37 | self.assertEqual(expected, [str(t) for t in tokens]) 38 | 39 | def test_error_handling(self): 40 | self.assertRaises(SyntaxError, ctok.tokenize, "public int myFunc(int x){\n x = x + 1;\n return x;\n}", lang = "java") 41 | 42 | def test_error_handling2(self): 43 | tokens = ctok.tokenize("public int myFunc(int x){\n x = x + 1;\n return x;\n}", lang = "java", syntax_error = "ignore") 44 | expected = ["public", "int", "myFunc", "", "(", "int", "x", ")", "{", "x", "=", "x", "+", "1", ";", "return", "x", ";", "}"] 45 | self.assertEqual(expected, [str(t) for t in tokens]) 46 | 47 | 48 | class GoTokenizationTest(TestCase): 49 | 50 | def test_tokenize1(self): 51 | tokens = ctok.tokenize('func main(){\n tip1 := "test"\n}', lang = "go") 52 | expected = ["func", "main", "(", ")", "{", "tip1", ":=", '"test"', "#NEWLINE#", "}"] 53 | 54 | self.assertEqual(expected, [str(t) for t in tokens]) --------------------------------------------------------------------------------