├── .gitignore
├── LICENSE
├── README.md
├── benchmark
│   ├── README.md
│   ├── runtime_all.png
│   └── runtime_raise.png
├── code_tokenize
│   ├── __init__.py
│   ├── config.py
│   ├── lang
│   │   ├── __init__.py
│   │   ├── base_visitors.py
│   │   ├── go
│   │   │   └── __init__.py
│   │   ├── java
│   │   │   └── __init__.py
│   │   ├── js
│   │   │   └── __init__.py
│   │   ├── php
│   │   │   └── __init__.py
│   │   ├── python
│   │   │   ├── __init__.py
│   │   │   └── indent.py
│   │   └── ruby
│   │       └── __init__.py
│   ├── tokenizer.py
│   └── tokens.py
├── pyproject.toml
├── requirements.txt
├── resources
│   ├── code_tokenize.png
│   └── code_tokenize.svg
├── setup.cfg
├── setup.py
└── tests
    ├── __init__.py
    └── test_tokenization.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # Project specific ignore
132 | build/
133 |
134 | data/
135 | .DS_Store
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2021-2022 Cedric Richter
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | ![code.tokenize](resources/code_tokenize.png)
3 |
4 |
5 | ------------------------------------------------
6 | > Fast tokenization and structural analysis of
7 | > any programming language in Python
8 |
9 | Programming Language Processing (PLP) brings the capabilities of modern NLP systems to the world of programming languages.
10 | To achieve high-performance PLP systems, existing methods often take advantage of the fully defined nature of programming languages. In particular, the syntactic structure can be exploited to gain knowledge about programs.
11 |
12 | **code.tokenize** provides easy access to the syntactic structure of a program. The tokenizer converts a program into a sequence of program tokens ready for further end-to-end processing.
13 | By relating each token to an AST node, it is possible to extend the program representation easily with further syntactic information.
14 |
15 | ## Installation
16 | The package is tested under Python 3. It can be installed via:
17 | ```
18 | pip install code-tokenize
19 | ```
20 |
21 | ## Usage
22 | code.tokenize can tokenize nearly any program code in a few lines of code:
23 | ```python
24 | import code_tokenize as ctok
25 |
26 | # Python
27 | ctok.tokenize(
28 | '''
29 | def my_func():
30 | print("Hello World")
31 | ''',
32 | lang = "python")
33 |
34 | # Output: [def, my_func, (, ), :, #NEWLINE#, ...]
35 |
36 | # Java
37 | ctok.tokenize(
38 | '''
39 | public static void main(String[] args){
40 | System.out.println("Hello World");
41 | }
42 | ''',
43 | lang = "java",
44 | syntax_error = "ignore")
45 |
46 | # Output: [public, static, void, main, (, String, [, ], args, ), {, System, ...]
47 |
48 | # JavaScript
49 | ctok.tokenize(
50 | '''
51 | alert("Hello World");
52 | ''',
53 | lang = "javascript",
54 | syntax_error = "ignore")
55 |
56 | # Output: [alert, (, "Hello World", ), ;]
57 |
58 |
59 | ```
60 |
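Tokens produced from AST leaves are backed by the node they were created from. Besides their text, they expose, e.g., the node `type`, the underlying `ast_node` and the first token of their enclosing statement (`statement_head`). A minimal sketch:
```python
import code_tokenize as ctok

tokens = ctok.tokenize("def my_func():\n    bar()", lang = "python")

token = tokens[0]
print(token.text)            # def
print(token.type)            # type of the backing AST node
print(token.ast_node)        # tree-sitter node behind the token
print(token.statement_head)  # first token of the enclosing statement
```
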
61 | ## Supported languages
62 | code.tokenize employs [tree-sitter](https://tree-sitter.github.io/tree-sitter/) as a backend. Therefore, in principle, any language supported by tree-sitter is also
63 | supported by a tokenizer in code.tokenize.
64 |
65 | For some languages, this library supports additional
66 | features that are not directly supported by tree-sitter.
67 | Therefore, we distinguish between three language classes
68 | and support the following language identifiers:
69 |
70 | - `native`: python
71 | - `advanced`: java
72 | - `basic`: javascript, go, ruby, cpp, c, swift, rust, ...
73 |
74 | Languages in the `native` class support all features
75 | of this library and are extensively tested. `advanced` languages are tested but do not support the full feature set. Languages of the `basic` class are not tested and
76 | only support the feature set of the backend. They can still be used for tokenization and AST parsing.
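
For illustration, a `basic` language such as C can still be run through the tree-sitter backend (untested here, so results should be double-checked):
```python
import code_tokenize as ctok

# C is in the untested `basic` class; tokenization relies purely on the backend
ctok.tokenize(
    '''
    int add(int a, int b){ return a + b; }
    ''',
    lang = "c",
    syntax_error = "ignore")
```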
77 |
78 | ## How to contribute
79 | **Your language is not natively supported by code.tokenize or the tokenization seems to be incorrect?** Then change it!
80 |
81 | While code.tokenize is developed mainly as a helper library for internal research projects, we welcome pull requests of any sort (whether it is a new feature or a bug fix).
82 |
83 | **Want to help to test more languages?**
84 | Our goal is to support as many languages as possible at a `native` level. However, languages on the `basic` level are completely untested. You can help by testing `basic` languages and reporting issues in the tokenization process!
85 |
86 | ## Release history
87 | * 0.2.0
88 | * Major API redesign!
89 | * CHANGE: AST parsing is now done by an external library: [code_ast](https://github.com/cedricrupb/code_ast)
90 | * CHANGE: Visitor pattern instead of custom tokenizer
91 | * CHANGE: Custom visitors for language dependent tokenization
92 | * 0.1.0
93 | * The first proper release
94 | * CHANGE: Language specific tokenizer configuration
95 | * CHANGE: Basic analyses of the program structure and token role
96 | * CHANGE: Documentation
97 | * 0.0.1
98 | * Work in progress
99 |
100 | ## Project Info
101 | The goal of this project is to provide developers in the
102 | programming language processing community with easy
103 | access to program tokenization and AST parsing. This is currently developed as a helper library for internal research projects. Therefore, it will only be updated
104 | as needed.
105 |
106 | Feel free to open an issue if anything unexpected
107 | happens.
108 |
109 | Distributed under the MIT license. See ``LICENSE`` for more information.
110 |
111 | This project was developed as part of our research related to:
112 | ```bibtex
113 | @inproceedings{richter2022tssb,
114 | title={TSSB-3M: Mining single statement bugs at massive scale},
115 |   author={Cedric Richter and Heike Wehrheim},
116 | booktitle={MSR},
117 | year={2022}
118 | }
119 | ```
120 |
121 | We thank the developers of the [tree-sitter](https://tree-sitter.github.io/tree-sitter/) library. Without tree-sitter, this project would not be possible.
122 |
--------------------------------------------------------------------------------
/benchmark/README.md:
--------------------------------------------------------------------------------
1 | # Benchmarking
2 |
3 | In the following, we benchmark the runtime of **code.tokenize** for parsing Python functions. To obtain a realistic set of Python code for PLP, we employ
4 | the Python portion of the [CodeSearchNet](https://github.com/github/CodeSearchNet) corpus. The corpus includes more than 500K Python functions
5 | annotated for training.
6 |
7 | ## Environment
8 | We benchmark the following call:
9 | ```python
10 | import code_tokenize as ctok
11 |
12 | ctok.tokenize(
13 | source_code,
14 | lang = 'python',
15 | syntax_error = 'raise'
16 | )
17 | ```
18 | Therefore, we skip all instances that contain syntax errors.
19 |
20 | For benchmarking, we employ a MacBook Pro M1 with 8 GB of RAM.
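
The benchmark script itself is not part of this repository; a minimal sketch of a single measurement could look as follows (names are illustrative):
```python
import time
import code_tokenize as ctok

def time_tokenize(source_code):
    """Return the tokenization time in seconds, or None if the instance is skipped."""
    start = time.perf_counter()
    try:
        ctok.tokenize(source_code, lang = 'python', syntax_error = 'raise')
    except SyntaxError:
        return None  # instance contains a syntax error and is skipped
    return time.perf_counter() - start
```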
21 |
22 | ## Results
23 | We start by plotting the mean runtime of the tokenizer in relation
24 | to the size of the Python function (in number of tokens). For determining the size of a program, we count the tokens in the pretokenized code. For brevity, we show results for functions below 1024 tokens (since this is the typical size of functions employed in PLP).
25 |
26 |
27 | ![Mean tokenization runtime vs. function size](runtime_raise.png)
28 |
29 |
30 | We observe that the time for tokenization scales linearly with the number of tokens in the Python function. Even large functions with up to 1024 tokens can be tokenized within 10 ms.
31 | Note: The plot only shows runtimes for function implementations that are parsed without an error (Python 2 functions will likely produce an error). However, functions that raise an exception also run in a similar time window.
32 |
33 |
34 | ## Complete set
35 | Below is the uncut version of the diagram. Even for large-scale functions with
36 | more than 25K tokens, the tokenizer does not take much longer than 100 ms.
37 |
38 |
39 | ![Tokenization runtime for the complete set](runtime_all.png)
40 |
41 |
--------------------------------------------------------------------------------
/benchmark/runtime_all.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cedricrupb/code_tokenize/6797bcf682edea672677bf3bce708d38f9d20dd0/benchmark/runtime_all.png
--------------------------------------------------------------------------------
/benchmark/runtime_raise.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cedricrupb/code_tokenize/6797bcf682edea672677bf3bce708d38f9d20dd0/benchmark/runtime_raise.png
--------------------------------------------------------------------------------
/code_tokenize/__init__.py:
--------------------------------------------------------------------------------
1 | from code_ast import ASTParser, ASTVisitor
2 |
3 | from .tokenizer import tokenize_tree
4 | from .lang import load_from_lang_config
5 |
6 | import logging as logger
7 |
8 | # Main function --------------------------------
9 |
10 | def tokenize(source_code, lang = "guess", **kwargs):
11 | """
12 | Tokenizes source code of most programming languages quickly.
13 |
14 | Given source code as string, this function quickly tokenizes
15 | the code into basic program tokens.
16 | The function uses tree-sitter as a backend. Therefore, this
17 | function does not only support most programming languages (see README)
18 | but also relates every token to an AST node.
19 |     Tokens can also be used to traverse the program AST.
20 |
21 | Parameters
22 | ----------
23 | source_code : str
24 |         Source code to be parsed, given as a string. Also
25 | supports parsing of incomplete source code
26 | snippets (by deactivating the syntax checker; see syntax_error)
27 |
28 | lang : [python, java, javascript, ...]
29 | String identifier of the programming language
30 | to be parsed. Supported are most programming languages
31 | including python, java and javascript (see README)
32 |         Default: guess (guessing the language is currently not supported and raises an error)
33 |
34 | syntax_error : [raise, warn, ignore]
35 |         Reaction to a syntax error in the code snippet.
36 |         raise: raises a SyntaxError
37 |         warn: logs a warning
38 |         ignore: ignores syntax errors. Helpful for parsing incomplete code snippets.
39 | Default: raise
40 |
41 | visitors : list[Visitor]
42 | Optional list of visitors that should be executed during tokenization
43 | Since code is tokenized by traversing the parsed AST, visitors
44 | can be used to run further AST based analyses.
45 |
46 | Returns
47 | -------
48 | TokenSequence
49 | A list of tokens representing the source code snippet.
50 |
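    Examples
    --------
    A minimal call; the expected output mirrors tests/test_tokenization.py:

        tokens = tokenize("def my_func():\n    bar()", lang = "python")
        # [def, my_func, (, ), :, #INDENT#, bar, (, ), #NEWLINE#, #DEDENT#]
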
51 | """
52 |
53 | if len(source_code.strip()) == 0: raise ValueError("The code string is empty. Cannot tokenize anything empty: %s" % source_code)
54 |
55 | # If lang == guess, automatically determine the language
56 | if lang == "guess": lang = _lang_detect(source_code)
57 |
58 |     logger.debug("Parsing source code with parser for %s", lang)
59 |
60 | # Setup config
61 | config = load_from_lang_config(lang, **kwargs)
62 |
63 | # Parse source tree
64 | parser = ASTParser(config.lang)
65 | tree, code = parser.parse(source_code)
66 |
67 | return tokenize_tree(config, tree.root_node, code, visitors = config.visitors)
68 |
69 |
70 |
71 | # Lang detect --------------------------------------
72 |
73 |
74 | def _lang_detect(source_code):
75 | """Guesses the source code type using pygments"""
76 | raise NotImplementedError(
77 | "Guessing the language automatically is currently not implemented. Please specify a language with the lang keyword\n code_tokenize.tokenize(code, lang = your_lang)"
78 | )
79 |
80 |
--------------------------------------------------------------------------------
/code_tokenize/config.py:
--------------------------------------------------------------------------------
1 |
2 | import json
3 |
4 | from .lang.base_visitors import LeafVisitor
5 |
6 |
7 | class TokenizationConfig:
8 | """Helper object to translate arguments of tokenize to config object"""
9 |
10 | def __init__(self, lang, **kwargs):
11 | self.lang = lang
12 | self.syntax_error = "raise" # Options: raise, warn, ignore
13 |
14 | self.indent_tokens = False # Whether to represent indentations and newlines (Helpful for script languages like Python)
15 | self.num_whitespaces_for_indent = 4
16 |
17 |         # A list of all statement node types defined in the language
18 | self.statement_types = [
19 | "*_statement", "*_definition", "*_declaration"
20 | ]
21 |
22 | self.visitors = [LeafVisitor] # visitor classes which should be run during analysis
23 |
24 | self.update(kwargs)
25 |
26 |
27 | def update(self, kwargs):
28 | for k, v in kwargs.items():
29 |
30 | if k not in self.__dict__:
31 |                 raise TypeError("tokenize() got an unexpected keyword argument '%s'" % k)
32 |
33 | self.__dict__[k] = v
34 |
35 | def __repr__(self):
36 |
37 | elements = []
38 | for k, v in self.__dict__.items():
39 | if v is not None:
40 | elements.append("%s=%s" % (k, v))
41 |
42 | return "Config(%s)" % ", ".join(elements)
43 |
44 |
45 |
46 | # From config ----------------------------------------------------------------
47 |
48 | def load_from_config(config_path, **kwargs):
49 | """Load from a config file. Config options can still be overwritten with kwargs"""
50 |
51 | with open(config_path, "r") as config_file:
52 | config = json.load(config_file)
53 | config.update(kwargs)
54 |
55 | return TokenizationConfig(**config)
56 |
57 |
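# Example (illustrative): a JSON file whose keys mirror TokenizationConfig attributes,
# e.g. {"lang": "python", "syntax_error": "warn"}, can be loaded and selectively
# overwritten via keyword arguments:
#
#   config = load_from_config("my_tokenize_config.json", syntax_error = "ignore")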
--------------------------------------------------------------------------------
/code_tokenize/lang/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from ..config import TokenizationConfig
3 |
4 | from .python import create_tokenization_config as pytok_config
5 | from .java import create_tokenization_config as jvtok_config
6 | from .go import create_tokenization_config as gotok_config
7 | from .js import create_tokenization_config as jstok_config
8 | from .php import create_tokenization_config as phptok_config
9 | from .ruby import create_tokenization_config as rubytok_config
10 |
11 |
12 | def load_from_lang_config(lang, **kwargs):
13 |
14 | if lang == "python" : base_config = pytok_config()
15 | elif lang == "java" : base_config = jvtok_config()
16 | elif lang == "go" : base_config = gotok_config()
17 | elif lang == "javascript" : base_config = jstok_config()
18 | elif lang == "php" : base_config = phptok_config()
19 | elif lang == "ruby" : base_config = rubytok_config()
20 | else : base_config = TokenizationConfig(lang)
21 |
22 | base_config.update(kwargs)
23 | return base_config
24 |
--------------------------------------------------------------------------------
/code_tokenize/lang/base_visitors.py:
--------------------------------------------------------------------------------
1 | from code_ast import ASTVisitor
2 |
3 | # Basic visitor -----------------------------------------------------------
4 |
5 | class LeafVisitor(ASTVisitor):
6 |
7 | def __init__(self, node_handler):
8 | self.node_handler = node_handler
9 |
10 | def visit_string(self, node):
11 | self.node_handler(node)
12 | return False
13 |
14 | def visit(self, node):
15 | if node.child_count == 0:
16 | self.node_handler(node)
17 | return False
--------------------------------------------------------------------------------
/code_tokenize/lang/go/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from ...config import TokenizationConfig
3 | from ...tokens import NewlineToken
4 |
5 | from ..base_visitors import LeafVisitor
6 |
7 |
8 | # Tokenization config ----------------------------------------------------------------
9 |
10 | def create_tokenization_config():
11 | return TokenizationConfig(
12 | lang = 'go',
13 | statement_types = ["*_statement", "*_declaration"],
14 | visitors = [GoLeafVisitor],
15 | indent_tokens = False
16 | )
17 |
18 | # Custom leaf visitor ----------------------------------------------------------------
19 |
20 | class GoLeafVisitor(LeafVisitor):
21 |
22 | def visit_interpreted_string_literal(self, node):
23 | self.node_handler(node)
24 | return False
25 |
26 | def visit(self, node):
27 | if node.type == "\n":
28 | self.node_handler.handle_token(NewlineToken(self.node_handler.config))
29 | return False
30 | return super().visit(node)
--------------------------------------------------------------------------------
/code_tokenize/lang/java/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from ...config import TokenizationConfig
3 |
4 | # Tokenization config ----------------------------------------------------------------
5 |
6 | def create_tokenization_config():
7 | return TokenizationConfig(
8 | lang = 'java',
9 | statement_types = ["*_statement", "*_definition", "*_declaration"],
10 | indent_tokens = False
11 | )
12 |
--------------------------------------------------------------------------------
/code_tokenize/lang/js/__init__.py:
--------------------------------------------------------------------------------
1 | from ...config import TokenizationConfig
2 |
3 | # Tokenization config ----------------------------------------------------------------
4 |
5 | def create_tokenization_config():
6 | return TokenizationConfig(
7 | lang = 'javascript',
8 | statement_types = ["*_statement", "*_declaration"],
9 | indent_tokens = False
10 | )
--------------------------------------------------------------------------------
/code_tokenize/lang/php/__init__.py:
--------------------------------------------------------------------------------
1 | from ...config import TokenizationConfig
2 |
3 | # Tokenization config ----------------------------------------------------------------
4 |
5 | def create_tokenization_config():
6 | return TokenizationConfig(
7 | lang = 'php',
8 | statement_types = ["*_statement"],
9 | indent_tokens = False
10 | )
--------------------------------------------------------------------------------
/code_tokenize/lang/python/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from ...config import TokenizationConfig
3 |
4 | from ..base_visitors import LeafVisitor
5 | from .indent import IndentVisitor
6 |
7 |
8 | # Tokenization config ----------------------------------------------------------------
9 |
10 | def create_tokenization_config():
11 | return TokenizationConfig(
12 | lang = "python",
13 | statement_types = ["*_statement", "*_definition"],
14 | visitors = [PythonLeafVisitor, IndentVisitor],
15 | indent_tokens = True
16 | )
17 |
18 | # Custom leaf visitor ----------------------------------------------------------------
19 |
20 | class PythonLeafVisitor(LeafVisitor):
21 |
22 | def visit_unary_operator(self, node):
23 | if node.children[-1].type == "integer":
24 | self.node_handler(node)
25 | return False
--------------------------------------------------------------------------------
/code_tokenize/lang/python/indent.py:
--------------------------------------------------------------------------------
1 | """Hierarchical indentation independent of the concrete program formatting"""
2 |
3 | from code_ast.visitor import ASTVisitor
4 | from ...tokens import IndentToken, DedentToken, NewlineToken
5 |
6 |
7 | class IndentVisitor(ASTVisitor):
8 |
9 | def __init__(self, token_handler):
10 | super().__init__()
11 | self.config = token_handler.config
12 | self.handler = token_handler
13 |
14 | def visit_block(self, block):
15 | self.handler.handle_token(IndentToken(self.config))
16 |
17 | def leave_block(self, block):
18 | self.handler.handle_token(DedentToken(self.config))
19 |
20 | def leave_comment(self, comment):
21 | self.handler.handle_token(NewlineToken(self.config))
22 |
23 | def leave(self, node):
24 | if not node.type.endswith('statement'): return
25 | self.handler.handle_token(NewlineToken(self.config))
--------------------------------------------------------------------------------
/code_tokenize/lang/ruby/__init__.py:
--------------------------------------------------------------------------------
1 | from ...config import TokenizationConfig
2 |
3 | # Tokenization config ----------------------------------------------------------------
4 |
5 | def create_tokenization_config():
6 | return TokenizationConfig(
7 | lang = 'ruby',
8 | statement_types = ["*_statement"],
9 | indent_tokens = True
10 | )
--------------------------------------------------------------------------------
/code_tokenize/tokenizer.py:
--------------------------------------------------------------------------------
1 |
2 | import logging as logger
3 | from code_ast.visitor import ASTVisitor, ResumingVisitorComposition
4 |
5 | from .tokens import ASTToken, TokenSequence
6 |
7 |
8 | # Interface ----------------------------------------------------------------
9 |
10 | def tokenize_tree(config, code_tree, code_lines, visitors = None):
11 | """
12 | Transforms AST tree into token sequence
13 |
14 |     Function to analyse an AST tree, resulting
15 |     in a token sequence. The parsing process
16 |     is fully customizable and is guided by the given
17 |     configuration.
18 |     Tokenizers also support additional analyses
19 |     of the AST tree and extensions to the token sequence.
20 |
21 | Parameters
22 | ----------
23 | config : TokenizationConfig
24 |         A configuration which is used to initialize the tokenizer
25 |
26 | code_tree: tree-sitter root node
27 | Root node of the program to be tokenized
28 |
29 | code_lines: list[str]
30 | Source lines of the program code to be tokenized.
31 | Has to be related to code_tree. Otherwise, behavior
32 | is undefined.
33 |
34 | Returns
35 | -------
36 | TokenSequence
37 | A sequence of program tokens representing the given program
38 |
39 | """
40 | return create_tokenizer(config)(code_tree, code_lines, visitors = visitors)
41 |
42 |
43 | # Tokenize ----------------------------------------------------------------
44 |
45 |
46 | class Tokenizer:
47 | """
48 | Basic tokenizer for parsing AST
49 |
50 | The tokenizer parses a given AST into a token sequence.
51 |     Each token represents an AST leaf.
52 |     No further analyses or additions are performed.
53 | """
54 |
55 | def __init__(self, config):
56 | self.config = config
57 | self._visitor_factories = []
58 |
59 | def append_visitor(self, visitor_factory):
60 | self._visitor_factories.append(visitor_factory)
61 |
62 | def _create_token_handler(self, code_lines):
63 | return TokenHandler(self.config, code_lines)
64 |
65 | def _create_tree_visitors(self, token_handler, visitors = None):
66 | visitors = visitors or []
67 | visitors += self._visitor_factories
68 |
69 | visitors = [visitor_fn(token_handler)
70 | if callable(visitor_fn)
71 | else visitor_fn
72 | for visitor_fn in visitors]
73 |
74 | return ResumingVisitorComposition(
75 | ErrorVisitor(self.config),
76 | *visitors
77 | )
78 |
79 | def __call__(self, code_tree, code_lines, visitors = None):
80 | token_handler = self._create_token_handler(code_lines)
81 | tree_visitor = self._create_tree_visitors(token_handler, visitors)
82 |
83 | # Run tree visitor
84 | tree_visitor.walk(code_tree)
85 |
86 | return token_handler.tokens()
87 |
88 |
89 | def create_tokenizer(config):
90 | """Function to create tokenizer based on configuration"""
91 | return Tokenizer(config)
92 |
93 |
94 | # Basic visitor -----------------------------------------------------------
95 |
96 |
97 | class LeafVisitor(ASTVisitor):
98 |
99 | def __init__(self, node_handler):
100 | self.node_handler = node_handler
101 |
102 | def visit_string(self, node):
103 | self.node_handler(node)
104 | return False
105 |
106 | def visit(self, node):
107 | if node.child_count == 0:
108 | self.node_handler(node)
109 | return False
110 |
111 |
112 | class ErrorVisitor(ASTVisitor):
113 |
114 | def __init__(self, config):
115 | self.config = config
116 |
117 | def visit_ERROR(self, node):
118 |
119 | if self.config.syntax_error == "raise":
120 | raise_syntax_error(node)
121 | return
122 |
123 | if self.config.syntax_error == "warn":
124 | warn_syntax_error(node)
125 | return
126 |
127 | # Node handler ------------------------------------------------------------
128 |
129 | class TokenHandler:
130 |
131 | def __init__(self, config, source_code):
132 | self.config = config
133 | self.source_code = source_code
134 |
135 | self._tokens = []
136 |
137 | def tokens(self):
138 | result = TokenSequence(self._tokens)
139 | self._tokens = []
140 | return result
141 |
142 | def handle_token(self, token):
143 |         if token.type == "newline" and (not self._tokens or self._tokens[-1].type in ("indent", "dedent", "newline")):
144 | return # TODO: Blocking double newlines seems to be general. Better solution?
145 |
146 | self._tokens.append(token)
147 |
148 | def __call__(self, node):
149 | self.handle_token(
150 | ASTToken(self.config, node, self.source_code)
151 | )
152 |
153 | # Error handling -----------------------------------------------------------
154 |
155 | def _construct_error_msg(node):
156 |
157 | start_line, start_char = node.start_point
158 | end_line, end_char = node.end_point
159 |
160 | position = "?"
161 | if start_line == end_line:
162 | position = "in line %d [pos. %d - %d]" % (start_line, start_char, end_char)
163 | else:
164 |         position = "between line %d (start: %d) and line %d (end: %d)" % (start_line, start_char, end_line, end_char)
165 |
166 |     return "Problem while parsing the given code snippet. Error occurred %s" % position
167 |
168 |
169 | def warn_syntax_error(node):
170 |     logger.warning(_construct_error_msg(node))
171 |
172 |
173 | def raise_syntax_error(node):
174 | raise SyntaxError(_construct_error_msg(node))
175 |
--------------------------------------------------------------------------------
/code_tokenize/tokens.py:
--------------------------------------------------------------------------------
1 | from code_ast.parsers import match_span
2 |
3 | # Cache Properties ---------------------------------------------------------
4 |
5 | def cached_property(fnc):
6 | """Helper decorator for lazy computing properties"""
7 | name = fnc.__name__
8 |
9 | def get_or_compute(self):
10 | cache_attr = getattr(self, "_%s" % name, None)
11 | if cache_attr is not None: return cache_attr
12 |
13 | if not hasattr(self, "_cache"): self._cache = {}
14 |
15 | if name not in self._cache:
16 | self._cache[name] = fnc(self)
17 |
18 | return self._cache[name]
19 |
20 | return property(get_or_compute)
21 |
22 |
23 | # Tokens -------------------------------------------------------------------
24 |
25 | class Token:
26 | """
27 | A token represents a single program entity of a given source code
28 |
29 | Attributes
30 | ----------
31 | text : str
32 | text of program token inside the parsed source code
33 |
34 | type : str
35 | token type or role inside a program.
36 | Often it refers to the type of token, e.g. identifier.
37 | Dependent on the tokenization process can also
38 | refer to contextual roles like variable definitions.
39 |
40 | config : TokenizerConfig
41 | configuration used to parse this token
42 |
43 | root_sequence : TokenSequence
44 | back reference to the sequence containing this token
45 | Might be None (independent token).
46 |
47 | """
48 |
49 | def __init__(self, config, text):
50 | """Representing a single program token"""
51 | self.config = config
52 | self._text = text
53 | self._type = "token"
54 |
55 | self.root_sequence = None
56 |
57 | @property
58 | def text(self):
59 | return self._text
60 |
61 | @property
62 | def type(self):
63 | return self._type
64 |
65 | def __repr__(self):
66 | return self.text
67 |
68 |
69 | class IndentToken(Token):
70 | """
71 | Basic token to indicate an indentation
72 |
73 | Helpful for indentation based languages such as Python.
74 |
75 | """
76 |
77 | def __init__(self, config, new_line_before = True):
78 | super().__init__(config, "#INDENT#")
79 | self.new_line_before = new_line_before
80 | self._type = "indent"
81 |
82 |
83 | class DedentToken(Token):
84 | """
85 |     Basic token to indicate a dedentation
86 |
87 | Helpful for indentation based languages such as Python.
88 |
89 | """
90 |
91 | def __init__(self, config, new_line_before = True):
92 | super().__init__(config, "#DEDENT#")
93 | self.new_line_before = new_line_before
94 | self._type = "dedent"
95 |
96 |
97 | class NewlineToken(Token):
98 | """
99 | Basic token to indicate a newline
100 |
101 | Helpful for indentation based languages such as Python.
102 |
103 | """
104 |
105 | def __init__(self, config):
106 | super().__init__(config, "#NEWLINE#")
107 | self._type = "newline"
108 |
109 |
110 | # AST backed token ----------------------------------------------------------------
111 |
112 | class ASTToken(Token):
113 | """
114 | Tokens that are related to leaf nodes inside an AST
115 |
116 | Attributes
117 | ----------
118 | text : str
119 | text of program token inside the parsed source code
120 |
121 | type : str
122 | token type or role inside a program.
123 | Often it refers to the type of token, e.g. identifier.
124 |         Depending on the tokenization process, it can also
125 | refer to contextual roles like variable definitions.
126 |
127 | ast_node : node object
128 | node inside an AST that is used to create this token
129 |
130 | statement_head : Token
131 | token representing the head (first token) of a statement
132 |
133 | parent_head : Token
134 | token representing the head of a parent statement (if existent)
135 |
136 | config : TokenizerConfig
137 | configuration used to parse this token
138 |
139 | root_sequence : TokenSequence
140 | back reference to the sequence containing this token
141 | Might be None (independent token).
142 |
143 | """
144 |
145 | def __init__(self, config, ast_node, source_lines):
146 | super().__init__(config, None)
147 | self.ast_node = ast_node
148 | self.source_lines = source_lines
149 | self.root_sequence = None
150 | self._type = None
151 |
152 | def _create_token(self, node):
153 | if self.root_sequence is not None:
154 | return self.root_sequence.get_token_by_node(node)
155 | return ASTToken(self.config, node, self.source_lines)
156 |
157 | # API methods --------------------------------
158 |
159 | @cached_property
160 | def text(self):
161 | return match_span(self.ast_node, self.source_lines)
162 |
163 | @cached_property
164 | def type(self):
165 | return self.ast_node.type
166 |
167 | @cached_property
168 | def statement_head(self):
169 | """Returns the token representing the head of a statement"""
170 |
171 | statement_types = self.config.statement_types
172 |
173 | parent_node = parent_statement_node(statement_types, self.ast_node)
174 | if parent_node is None: raise ValueError("No statement could be identified!")
175 |
176 | # Identify first token that belongs to the statement
177 | current_left = parent_node
178 | while not is_token(current_left):
179 | current_left = current_left.children[0]
180 |
181 | return self._create_token(current_left)
182 |
183 | @cached_property
184 | def parent_head(self):
185 | """
186 | Returns head of parent node if it exists.
187 |
188 | If the current token belongs to a top level statement,
189 |         the function returns None.
190 | """
191 | # For identifying statements
192 | statement_types = self.config.statement_types
193 | parent_node = parent_statement_node(statement_types, self.ast_node)
194 | if parent_node is None: raise ValueError("No statement could be identified!")
195 |
196 | grandparent_node = parent_statement_node(statement_types, parent_node)
197 | if grandparent_node is None: return None
198 |
199 | # Identify first token that belongs to the statement
200 | current_left = grandparent_node
201 | while not is_token(current_left):
202 | current_left = current_left.children[0]
203 |
204 | return self._create_token(current_left)
205 |
206 |
207 |
208 | class VarUseToken(ASTToken):
209 | """AST token representing a variable usage (name of variable)"""
210 |
211 | def __init__(self, config, ast_node, source_lines):
212 | super().__init__(config, ast_node, source_lines)
213 | self._type = "use_var"
214 |
215 |
216 | class VarDefToken(ASTToken):
217 | """AST token representing a variable definition (name of variable)"""
218 |
219 | def __init__(self, config, ast_node, source_lines):
220 | super().__init__(config, ast_node, source_lines)
221 | self._type = "def_var"
222 |
223 |
224 |
225 | # Token Collection -----------------------------------------------------
226 |
227 | class TokenSequence(list):
228 | """
229 | Sequence of tokens
230 |
231 |     Represents a sequence of tokens. It acts
232 | as a list while backreferencing each token
233 | in this collection.
234 |
235 | """
236 |
237 | def __init__(self, tokens):
238 | super().__init__(tokens)
239 |
240 | self._map_nodes = {}
241 |
242 | for tok in self:
243 | tok.root_sequence = self
244 |
245 | if hasattr(tok, "ast_node"):
246 | self._map_nodes[node_key(tok.ast_node)] = tok
247 |
248 | def get_token_by_node(self, node):
249 | """Maps a given leaf node back to a token in this sequence."""
250 | return self._map_nodes[node_key(node)]
251 |
252 | def iterstmts(self):
253 | """Splits the token sequence into a sequence of statement tokens"""
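        # Illustration (Python config): "x = 1\ny = 2" tokenizes to
        # [x, =, 1, #NEWLINE#, y, =, 2, #NEWLINE#] and is split into
        # [[x, =, 1, #NEWLINE#], [y, =, 2, #NEWLINE#]]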
254 | def _iter_stmts():
255 | current_head = None
256 | stmt = []
257 |
258 | for tok in self:
259 | tok_head = tok.statement_head if hasattr(tok, "statement_head") else current_head
260 |
261 | if tok_head != current_head:
262 | if len(stmt) > 0: yield stmt
263 | current_head = tok_head
264 | stmt = []
265 |
266 | stmt.append(tok)
267 |
268 | if len(stmt) > 0: yield stmt
269 |
270 | return _iter_stmts()
271 |
272 |
273 | # Utils ----------------------------------------------------------------
274 |
275 | def match_type(type_regex, type):
276 | # TODO Support general regex (Is this needed?)
277 |
278 | star_count = type_regex.count("*")
279 |
280 | if star_count == 0:
281 | return type == type_regex
282 |
283 | if star_count == 1:
284 | if type_regex[0] == "*":
285 | return type.endswith(type_regex[1:])
286 | if type_regex[-1] == "*":
287 | return type.startswith(type_regex[:-1])
288 |
289 | raise ValueError("Unsupported type regex: %s" % type_regex)
290 |
291 |
292 | def is_token(node):
293 | return node.type == "string" or not node.children
294 |
295 |
296 | def node_key(node):
297 | return (node.type, node.start_point, node.end_point)
298 |
299 |
300 | def parent_statement_node(statement_types, node):
301 |
302 | def is_statement(type):
303 | return any(match_type(reg, type) for reg in statement_types)
304 |
305 | # Go up till we find a statement node
306 | parent_node = node.parent
307 | while parent_node is not None and not is_statement(parent_node.type):
308 | parent_node = parent_node.parent
309 |
310 | return parent_node
311 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "code_tokenize"
7 | version = "0.2.1"
8 | description = "Fast program tokenization and structural analysis in Python"
9 | readme = "README.md"
10 | requires-python = ">= 3.8"
11 | license = { file = "LICENSE" }
12 | keywords = ["code", "tokenization", "tokenize", "program", "language processing"]
13 |
14 | authors = [{name = "Cedric Richter", email = "cedricr.upb@gmail.com"}]
15 | maintainers = [{name = "Cedric Richter", email = "cedricr.upb@gmail.com"}]
16 |
17 | classifiers = [
18 | "Development Status :: 3 - Alpha",
19 | "Intended Audience :: Developers",
20 | "Topic :: Software Development :: Build Tools",
21 | "License :: OSI Approved :: MIT License",
22 | "Programming Language :: Python :: 3",
23 | "Programming Language :: Python :: 3.6",
24 | "Programming Language :: Python :: 3.7",
25 | "Programming Language :: Python :: 3.8",
26 | "Programming Language :: Python :: 3.9",
27 | "Programming Language :: Python :: 3.10",
28 | "Programming Language :: Python :: 3.11",
29 | "Programming Language :: Python :: 3.12",
30 | "Programming Language :: Python :: 3.13",
31 | "Programming Language :: Python :: 3 :: Only",
32 | ]
33 |
34 | dependencies = ["tree_sitter", "GitPython", "requests", "code_ast"]
35 |
36 | [project.urls]
37 | "Homepage" = "https://github.com/cedricrupb/code_tokenize"
38 | "Bug Reports" = "https://github.com/cedricrupb/code_tokenize/issues"
39 | "Source" = "https://github.com/cedricrupb/code_tokenize"
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tree_sitter==0.21.3
2 | requests>=2.32.0
3 | GitPython>=3.1.41
4 | code_ast>=0.1.1
--------------------------------------------------------------------------------
/resources/code_tokenize.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cedricrupb/code_tokenize/6797bcf682edea672677bf3bce708d38f9d20dd0/resources/code_tokenize.png
--------------------------------------------------------------------------------
/resources/code_tokenize.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | long_description_content_type = text/markdown
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | with open("README.md", "r") as f:
4 | long_description = f.read()
5 |
6 | setup(
7 | name = 'code_tokenize',
8 | packages = find_packages(exclude=['tests']),
9 | version = '0.2.1',
10 | license='MIT',
11 | description = 'Fast program tokenization and structural analysis in Python',
12 | long_description = long_description,
13 | long_description_content_type="text/markdown",
14 | author = 'Cedric Richter',
15 | author_email = 'cedricr.upb@gmail.com',
16 | url = 'https://github.com/cedricrupb/code_tokenize',
17 | download_url = 'https://github.com/cedricrupb/code_tokenize/archive/refs/tags/v0.2.1.tar.gz',
18 | keywords = ['code', 'tokenization', 'tokenize', 'program', 'language processing'],
19 | install_requires=[
20 | 'tree_sitter==0.21.3',
21 | 'GitPython>=3.1.41',
22 | 'requests>=2.32.0',
23 | 'code-ast>=0.1.1'
24 | ],
25 | classifiers=[
26 | 'Development Status :: 3 - Alpha',
27 | 'Intended Audience :: Developers',
28 | 'Topic :: Software Development :: Build Tools',
29 | 'License :: OSI Approved :: MIT License',
30 | 'Programming Language :: Python :: 3',
31 | 'Programming Language :: Python :: 3.6',
32 | 'Programming Language :: Python :: 3.7',
33 | 'Programming Language :: Python :: 3.8',
34 | 'Programming Language :: Python :: 3.9',
35 | 'Programming Language :: Python :: 3.10',
36 | 'Programming Language :: Python :: 3.11',
37 | 'Programming Language :: Python :: 3.12',
38 | 'Programming Language :: Python :: 3.13',
39 | ],
40 | )
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cedricrupb/code_tokenize/6797bcf682edea672677bf3bce708d38f9d20dd0/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_tokenization.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | import code_tokenize as ctok
4 |
5 | class PythonTokenizationTestCase(TestCase):
6 |
7 | def test_tokenize1(self):
8 | tokens = ctok.tokenize("def my_func():\n bar()", lang = "python")
9 | expected = ["def", "my_func", "(", ")", ":", "#INDENT#", "bar", "(", ")", "#NEWLINE#", "#DEDENT#"]
10 | self.assertEqual(expected, [str(t) for t in tokens])
11 |
12 | def test_tokenize2(self):
13 | tokens = ctok.tokenize("def my_func(x):\n x = x + 1\n return x", lang = "python")
14 | expected = ["def", "my_func", "(", "x", ")", ":", "#INDENT#", "x", "=", "x", "+", "1", "#NEWLINE#", "return", "x", "#NEWLINE#", "#DEDENT#"]
15 | self.assertEqual(expected, [str(t) for t in tokens])
16 |
17 | def test_error_handling(self):
18 | self.assertRaises(SyntaxError, ctok.tokenize, "def my_func(x):\n x = x + 1 return x", lang = "python")
19 |
20 | def test_error_handling2(self):
21 | tokens = ctok.tokenize("def my_func(x):\n x = x + 1 return x", lang = "python", syntax_error = "ignore")
22 | expected = ["def", "my_func", "(", "x", ")", ":", "x", "=", "x", "+", "1", "#INDENT#", "return", "x", "#NEWLINE#", "#DEDENT#"]
23 | self.assertEqual(expected, [str(t) for t in tokens])
24 |
25 |
26 |
27 | class JavaTokenizationTestCase(TestCase):
28 |
29 | def test_tokenize1(self):
30 | tokens = ctok.tokenize("public class Test {\npublic void myFunc(){\n bar();\n}\n}", lang = "java")
31 | expected = ["public", "class", "Test", "{", "public", "void", "myFunc", "(", ")", "{", "bar", "(", ")", ";", "}", "}"]
32 | self.assertEqual(expected, [str(t) for t in tokens])
33 |
34 | def test_tokenize2(self):
35 | tokens = ctok.tokenize("public class Test {\npublic int myFunc(int x){\n x = x + 1;\n return x;\n}\n}", lang = "java")
36 | expected = ["public", "class", "Test", "{", "public", "int", "myFunc", "(", "int", "x", ")", "{", "x", "=", "x", "+", "1", ";", "return", "x", ";", "}", "}"]
37 | self.assertEqual(expected, [str(t) for t in tokens])
38 |
39 | def test_error_handling(self):
40 | self.assertRaises(SyntaxError, ctok.tokenize, "public int myFunc(int x){\n x = x + 1;\n return x;\n}", lang = "java")
41 |
42 | def test_error_handling2(self):
43 | tokens = ctok.tokenize("public int myFunc(int x){\n x = x + 1;\n return x;\n}", lang = "java", syntax_error = "ignore")
44 | expected = ["public", "int", "myFunc", "", "(", "int", "x", ")", "{", "x", "=", "x", "+", "1", ";", "return", "x", ";", "}"]
45 | self.assertEqual(expected, [str(t) for t in tokens])
46 |
47 |
48 | class GoTokenizationTest(TestCase):
49 |
50 | def test_tokenize1(self):
51 | tokens = ctok.tokenize('func main(){\n tip1 := "test"\n}', lang = "go")
52 | expected = ["func", "main", "(", ")", "{", "tip1", ":=", '"test"', "#NEWLINE#", "}"]
53 |
54 | self.assertEqual(expected, [str(t) for t in tokens])
--------------------------------------------------------------------------------