├── .gitignore ├── LICENSE ├── README.md ├── benchmark ├── README.md ├── runtime_all.png └── runtime_raise.png ├── code_tokenize ├── __init__.py ├── config.py ├── lang │ ├── __init__.py │ ├── base_visitors.py │ ├── go │ │ └── __init__.py │ ├── java │ │ └── __init__.py │ ├── js │ │ └── __init__.py │ ├── php │ │ └── __init__.py │ ├── python │ │ ├── __init__.py │ │ └── indent.py │ └── ruby │ │ └── __init__.py ├── tokenizer.py └── tokens.py ├── pyproject.toml ├── requirements.txt ├── resources ├── code_tokenize.png └── code_tokenize.svg ├── setup.cfg ├── setup.py └── tests ├── __init__.py └── test_tokenization.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Project specific ignore 132 | build/ 133 | 134 | data/ 135 | .DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2021-2022 Cedric Richter 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 5 | ------------------------------------------------ 6 | > Fast tokenization and structural analysis of 7 | any programming language in Python 8 | 9 | Programming Language Processing (PLP) brings the capabilities of modern NLP systems to the world of programming languages. 10 | To achieve high-performance PLP systems, existing methods often take advantage of the fully defined nature of programming languages. In particular, the syntactic structure can be exploited to gain knowledge about programs. 11 | 12 | **code.tokenize** provides easy access to the syntactic structure of a program. The tokenizer converts a program into a sequence of program tokens ready for further end-to-end processing. 13 | By relating each token to an AST node, it is possible to easily extend the program representation with further syntactic information. 14 | 15 | ## Installation 16 | The package is tested under Python 3. It can be installed via: 17 | ``` 18 | pip install code-tokenize 19 | ``` 20 | 21 | ## Usage 22 | code.tokenize can tokenize nearly any program code in a few lines of code: 23 | ```python 24 | import code_tokenize as ctok 25 | 26 | # Python 27 | ctok.tokenize( 28 | ''' 29 | def my_func(): 30 | print("Hello World") 31 | ''', 32 | lang = "python") 33 | 34 | # Output: [def, my_func, (, ), :, #INDENT#, ...] 35 | 36 | # Java 37 | ctok.tokenize( 38 | ''' 39 | public static void main(String[] args){ 40 | System.out.println("Hello World"); 41 | } 42 | ''', 43 | lang = "java", 44 | syntax_error = "ignore") 45 | 46 | # Output: [public, static, void, main, (, String, [, ], args, ), {, System, ...] 47 | 48 | # JavaScript 49 | ctok.tokenize( 50 | ''' 51 | alert("Hello World"); 52 | ''', 53 | lang = "javascript", 54 | syntax_error = "ignore") 55 | 56 | # Output: [alert, (, "Hello World", ), ;] 57 | 58 | 59 | ``` 60 | 61 | ## Supported languages 62 | code.tokenize employs [tree-sitter](https://tree-sitter.github.io/tree-sitter/) as a backend. Therefore, in principle, any language supported by tree-sitter is also 63 | supported by a tokenizer in code.tokenize. 64 | 65 | For some languages, this library supports additional 66 | features that are not directly supported by tree-sitter. 67 | Therefore, we distinguish between three language classes 68 | and support the following language identifiers: 69 | 70 | - `native`: python 71 | - `advanced`: java 72 | - `basic`: javascript, go, ruby, cpp, c, swift, rust, ... 73 | 74 | Languages in the `native` class support all features 75 | of this library and are extensively tested. `advanced` languages are tested but do not support the full feature set. Languages of the `basic` class are not tested and 76 | only support the feature set of the backend. They can still be used for tokenization and AST parsing. 77 |
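## Working with tokens
Tokens produced from the AST are backed by the tree-sitter leaf they were created from. The following is a minimal sketch of this part of the API; the attribute names (`type`, `ast_node`, `statement_head`) follow `code_tokenize/tokens.py`, and the output comments are illustrative:
```python
import code_tokenize as ctok

tokens = ctok.tokenize("def my_func():\n    bar()", lang = "python")

tok = tokens[1]               # the 'my_func' token
print(tok.type)               # type of the backing AST leaf, e.g. identifier
print(tok.ast_node)           # tree-sitter node this token was created from
print(tok.statement_head)     # first token of the enclosing statement: def
```
Special tokens such as `#INDENT#` or `#NEWLINE#` are not backed by an AST node and therefore do not provide these attributes.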
78 | ## How to contribute 79 | **Is your language not natively supported by code.tokenize, or does the tokenization seem incorrect?** Then change it! 80 | 81 | While code.tokenize is developed mainly as a helper library for internal research projects, we welcome pull requests of any sort (whether it is a new feature or a bug fix). 82 | 83 | **Want to help test more languages?** 84 | Our goal is to support as many languages as possible at a `native` level. However, languages at the `basic` level are completely untested. You can help by testing `basic` languages and reporting issues in the tokenization process! 85 | 86 | ## Release history 87 | * 0.2.0 88 | * Major API redesign! 89 | * CHANGE: AST parsing is now done by an external library: [code_ast](https://github.com/cedricrupb/code_ast) 90 | * CHANGE: Visitor pattern instead of custom tokenizer 91 | * CHANGE: Custom visitors for language-dependent tokenization 92 | * 0.1.0 93 | * The first proper release 94 | * CHANGE: Language-specific tokenizer configuration 95 | * CHANGE: Basic analyses of the program structure and token role 96 | * CHANGE: Documentation 97 | * 0.0.1 98 | * Work in progress 99 | 100 | ## Project Info 101 | The goal of this project is to provide developers in the 102 | programming language processing community with easy 103 | access to program tokenization and AST parsing. This is currently developed as a helper library for internal research projects. Therefore, it will only be updated 104 | as needed. 105 | 106 | Feel free to open an issue if anything unexpected 107 | happens. 108 | 109 | Distributed under the MIT license. See ``LICENSE`` for more information. 110 | 111 | This project was developed as part of our research related to: 112 | ```bibtex 113 | @inproceedings{richter2022tssb, 114 | title={TSSB-3M: Mining single statement bugs at massive scale}, 115 | author={Richter, Cedric and Wehrheim, Heike}, 116 | booktitle={MSR}, 117 | year={2022} 118 | } 119 | ``` 120 | 121 | We thank the developers of the [tree-sitter](https://tree-sitter.github.io/tree-sitter/) library. Without tree-sitter this project would not be possible. 122 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking 2 | 3 | In the following, we benchmark the runtime of **code.tokenize** for parsing Python functions. To obtain a realistic set of Python code for PLP, we employ 4 | the Python portion of the [CodeSearchNet](https://github.com/github/CodeSearchNet) corpus. The corpus includes more than 500K Python functions 5 | annotated for training. 6 | 7 | ## Environment 8 | We benchmark the following implementation: 9 | ```python 10 | import code_tokenize as ctok 11 | 12 | ctok.tokenize( 13 | source_code, 14 | lang = 'python', 15 | syntax_error = 'raise' 16 | ) 17 | ``` 18 | Therefore, we skip all instances that contain syntax errors. 19 | 20 | For benchmarking, we employ a MacBook Pro M1 with 8GB RAM. 21 | 22 | ## Results 23 | We start by plotting the mean runtime of the tokenizer in relation 24 | to the size of the Python function (in number of tokens). To determine the size of a program, we count the tokens in the pretokenized code. For brevity, we show results for functions below 1024 tokens (since this is the typical size of functions employed in PLP).
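For reference, the per-function runtimes could be collected with a loop of the following shape (a sketch; the exact measurement harness is not part of this repository):
```python
import time
import code_tokenize as ctok

def time_tokenize(source_code):
    # Returns the tokenization time in seconds, or None if the snippet does not parse
    try:
        start = time.perf_counter()
        ctok.tokenize(source_code, lang = 'python', syntax_error = 'raise')
        return time.perf_counter() - start
    except SyntaxError:
        return None  # instance contains syntax errors and is skipped
```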

27 | 28 |

30 | We observe that the time for tokenization scales linearly with the number of tokens in the Python function. Even large functions with up to 1024 tokens can be tokenized within 10ms. 31 | Note: The plot only shows runtimes for function implementations that are parsed without an error (Python 2 functions will likely produce an error). However, functions that raise an exception will also run in a similar time window. 32 | 33 | 34 | ## Complete set 35 | Below is the uncut version of the diagram. Even for large-scale functions with 36 | more than 25K tokens, the tokenizer does not take much longer than 100ms. 37 | 38 |

39 | 40 |

41 | -------------------------------------------------------------------------------- /benchmark/runtime_all.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cedricrupb/code_tokenize/6797bcf682edea672677bf3bce708d38f9d20dd0/benchmark/runtime_all.png -------------------------------------------------------------------------------- /benchmark/runtime_raise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cedricrupb/code_tokenize/6797bcf682edea672677bf3bce708d38f9d20dd0/benchmark/runtime_raise.png -------------------------------------------------------------------------------- /code_tokenize/__init__.py: -------------------------------------------------------------------------------- 1 | from code_ast import ASTParser, ASTVisitor 2 | 3 | from .tokenizer import tokenize_tree 4 | from .lang import load_from_lang_config 5 | 6 | import logging as logger 7 | 8 | # Main function -------------------------------- 9 | 10 | def tokenize(source_code, lang = "guess", **kwargs): 11 | """ 12 | Tokenizes source code of most programming languages quickly. 13 | 14 | Given source code as a string, this function quickly tokenizes 15 | the code into basic program tokens. 16 | The function uses tree-sitter as a backend. Therefore, this 17 | function not only supports most programming languages (see README) 18 | but also relates every token to an AST node. 19 | Tokens can be used to traverse the program AST. 20 | 21 | Parameters 22 | ---------- 23 | source_code : str 24 | Source code to be parsed as a string. Also 25 | supports parsing of incomplete source code 26 | snippets (by deactivating the syntax checker; see syntax_error) 27 | 28 | lang : [python, java, javascript, ...] 29 | String identifier of the programming language 30 | to be parsed. Supported are most programming languages 31 | including python, java and javascript (see README) 32 | Default: guess (guesses the language; currently not supported and raises an error) 33 | 34 | syntax_error : [raise, warn, ignore] 35 | Reaction to syntax errors in the code snippet. 36 | raise: raises a SyntaxError 37 | warn: prints a warning to console 38 | ignore: ignores syntax errors. Helpful for parsing code snippets. 39 | Default: raise 40 | 41 | visitors : list[Visitor] 42 | Optional list of visitors that should be executed during tokenization. 43 | Since code is tokenized by traversing the parsed AST, visitors 44 | can be used to run further AST-based analyses. 45 | 46 | Returns 47 | ------- 48 | TokenSequence 49 | A list of tokens representing the source code snippet. 50 | 51 | """ 52 | 53 | if len(source_code.strip()) == 0: raise ValueError("The code string is empty. 
Cannot tokenize anything empty: %s" % source_code) 54 | 55 | # If lang == guess, automatically determine the language 56 | if lang == "guess": lang = _lang_detect(source_code) 57 | 58 | logger.debug("Parses source code with parser for %s" % lang) 59 | 60 | # Setup config 61 | config = load_from_lang_config(lang, **kwargs) 62 | 63 | # Parse source tree 64 | parser = ASTParser(config.lang) 65 | tree, code = parser.parse(source_code) 66 | 67 | return tokenize_tree(config, tree.root_node, code, visitors = config.visitors) 68 | 69 | 70 | 71 | # Lang detect -------------------------------------- 72 | 73 | 74 | def _lang_detect(source_code): 75 | """Guesses the source code type using pygments""" 76 | raise NotImplementedError( 77 | "Guessing the language automatically is currently not implemented. Please specify a language with the lang keyword\n code_tokenize.tokenize(code, lang = your_lang)" 78 | ) 79 | 80 | -------------------------------------------------------------------------------- /code_tokenize/config.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | 4 | from .lang.base_visitors import LeafVisitor 5 | 6 | 7 | class TokenizationConfig: 8 | """Helper object to translate arguments of tokenize to config object""" 9 | 10 | def __init__(self, lang, **kwargs): 11 | self.lang = lang 12 | self.syntax_error = "raise" # Options: raise, warn, ignore 13 | 14 | self.indent_tokens = False # Whether to represent indentations and newlines (Helpful for script languages like Python) 15 | self.num_whitespaces_for_indent = 4 16 | 17 | # A list of all statement node defined in the language 18 | self.statement_types = [ 19 | "*_statement", "*_definition", "*_declaration" 20 | ] 21 | 22 | self.visitors = [LeafVisitor] # visitor classes which should be run during analysis 23 | 24 | self.update(kwargs) 25 | 26 | 27 | def update(self, kwargs): 28 | for k, v in kwargs.items(): 29 | 30 | if k not in self.__dict__: 31 | raise TypeError("TypeError: tokenize() got an unexpected keyword argument '%s'" % k) 32 | 33 | self.__dict__[k] = v 34 | 35 | def __repr__(self): 36 | 37 | elements = [] 38 | for k, v in self.__dict__.items(): 39 | if v is not None: 40 | elements.append("%s=%s" % (k, v)) 41 | 42 | return "Config(%s)" % ", ".join(elements) 43 | 44 | 45 | 46 | # From config ---------------------------------------------------------------- 47 | 48 | def load_from_config(config_path, **kwargs): 49 | """Load from a config file. 
Config options can still be overwritten with kwargs""" 50 | 51 | with open(config_path, "r") as config_file: 52 | config = json.load(config_file) 53 | config.update(kwargs) 54 | 55 | return TokenizationConfig(**config) 56 | 57 | -------------------------------------------------------------------------------- /code_tokenize/lang/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from ..config import TokenizationConfig 3 | 4 | from .python import create_tokenization_config as pytok_config 5 | from .java import create_tokenization_config as jvtok_config 6 | from .go import create_tokenization_config as gotok_config 7 | from .js import create_tokenization_config as jstok_config 8 | from .php import create_tokenization_config as phptok_config 9 | from .ruby import create_tokenization_config as rubytok_config 10 | 11 | 12 | def load_from_lang_config(lang, **kwargs): 13 | 14 | if lang == "python" : base_config = pytok_config() 15 | elif lang == "java" : base_config = jvtok_config() 16 | elif lang == "go" : base_config = gotok_config() 17 | elif lang == "javascript" : base_config = jstok_config() 18 | elif lang == "php" : base_config = phptok_config() 19 | elif lang == "ruby" : base_config = rubytok_config() 20 | else : base_config = TokenizationConfig(lang) 21 | 22 | base_config.update(kwargs) 23 | return base_config 24 | -------------------------------------------------------------------------------- /code_tokenize/lang/base_visitors.py: -------------------------------------------------------------------------------- 1 | from code_ast import ASTVisitor 2 | 3 | # Basic visitor ----------------------------------------------------------- 4 | 5 | class LeafVisitor(ASTVisitor): 6 | 7 | def __init__(self, node_handler): 8 | self.node_handler = node_handler 9 | 10 | def visit_string(self, node): 11 | self.node_handler(node) 12 | return False 13 | 14 | def visit(self, node): 15 | if node.child_count == 0: 16 | self.node_handler(node) 17 | return False -------------------------------------------------------------------------------- /code_tokenize/lang/go/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from ...config import TokenizationConfig 3 | from ...tokens import NewlineToken 4 | 5 | from ..base_visitors import LeafVisitor 6 | 7 | 8 | # Tokenization config ---------------------------------------------------------------- 9 | 10 | def create_tokenization_config(): 11 | return TokenizationConfig( 12 | lang = 'go', 13 | statement_types = ["*_statement", "*_declaration"], 14 | visitors = [GoLeafVisitor], 15 | indent_tokens = False 16 | ) 17 | 18 | # Custom leaf visitor ---------------------------------------------------------------- 19 | 20 | class GoLeafVisitor(LeafVisitor): 21 | 22 | def visit_interpreted_string_literal(self, node): 23 | self.node_handler(node) 24 | return False 25 | 26 | def visit(self, node): 27 | if node.type == "\n": 28 | self.node_handler.handle_token(NewlineToken(self.node_handler.config)) 29 | return False 30 | return super().visit(node) -------------------------------------------------------------------------------- /code_tokenize/lang/java/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from ...config import TokenizationConfig 3 | 4 | # Tokenization config ---------------------------------------------------------------- 5 | 6 | def create_tokenization_config(): 7 | return TokenizationConfig( 8 | lang = 'java', 9 | 
statement_types = ["*_statement", "*_definition", "*_declaration"], 10 | indent_tokens = False 11 | ) 12 | -------------------------------------------------------------------------------- /code_tokenize/lang/js/__init__.py: -------------------------------------------------------------------------------- 1 | from ...config import TokenizationConfig 2 | 3 | # Tokenization config ---------------------------------------------------------------- 4 | 5 | def create_tokenization_config(): 6 | return TokenizationConfig( 7 | lang = 'javascript', 8 | statement_types = ["*_statement", "*_declaration"], 9 | indent_tokens = False 10 | ) -------------------------------------------------------------------------------- /code_tokenize/lang/php/__init__.py: -------------------------------------------------------------------------------- 1 | from ...config import TokenizationConfig 2 | 3 | # Tokenization config ---------------------------------------------------------------- 4 | 5 | def create_tokenization_config(): 6 | return TokenizationConfig( 7 | lang = 'php', 8 | statement_types = ["*_statement"], 9 | indent_tokens = False 10 | ) -------------------------------------------------------------------------------- /code_tokenize/lang/python/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from ...config import TokenizationConfig 3 | 4 | from ..base_visitors import LeafVisitor 5 | from .indent import IndentVisitor 6 | 7 | 8 | # Tokenization config ---------------------------------------------------------------- 9 | 10 | def create_tokenization_config(): 11 | return TokenizationConfig( 12 | lang = "python", 13 | statement_types = ["*_statement", "*_definition"], 14 | visitors = [PythonLeafVisitor, IndentVisitor], 15 | indent_tokens = True 16 | ) 17 | 18 | # Custom leaf visitor ---------------------------------------------------------------- 19 | 20 | class PythonLeafVisitor(LeafVisitor): 21 | 22 | def visit_unary_operator(self, node): 23 | if node.children[-1].type == "integer": 24 | self.node_handler(node) 25 | return False -------------------------------------------------------------------------------- /code_tokenize/lang/python/indent.py: -------------------------------------------------------------------------------- 1 | """Hierarchical indentation independent of the concrete program formatting""" 2 | 3 | from code_ast.visitor import ASTVisitor 4 | from ...tokens import IndentToken, DedentToken, NewlineToken 5 | 6 | 7 | class IndentVisitor(ASTVisitor): 8 | 9 | def __init__(self, token_handler): 10 | super().__init__() 11 | self.config = token_handler.config 12 | self.handler = token_handler 13 | 14 | def visit_block(self, block): 15 | self.handler.handle_token(IndentToken(self.config)) 16 | 17 | def leave_block(self, block): 18 | self.handler.handle_token(DedentToken(self.config)) 19 | 20 | def leave_comment(self, comment): 21 | self.handler.handle_token(NewlineToken(self.config)) 22 | 23 | def leave(self, node): 24 | if not node.type.endswith('statement'): return 25 | self.handler.handle_token(NewlineToken(self.config)) -------------------------------------------------------------------------------- /code_tokenize/lang/ruby/__init__.py: -------------------------------------------------------------------------------- 1 | from ...config import TokenizationConfig 2 | 3 | # Tokenization config ---------------------------------------------------------------- 4 | 5 | def create_tokenization_config(): 6 | return TokenizationConfig( 7 | lang = 'ruby', 8 | 
statement_types = ["*_statement"], 9 | indent_tokens = True 10 | ) -------------------------------------------------------------------------------- /code_tokenize/tokenizer.py: -------------------------------------------------------------------------------- 1 | 2 | import logging as logger 3 | from code_ast.visitor import ASTVisitor, ResumingVisitorComposition 4 | 5 | from .tokens import ASTToken, TokenSequence 6 | 7 | 8 | # Interface ---------------------------------------------------------------- 9 | 10 | def tokenize_tree(config, code_tree, code_lines, visitors = None): 11 | """ 12 | Transforms an AST tree into a token sequence 13 | 14 | Function to analyse an AST tree, resulting 15 | in a token sequence. The parsing process 16 | is fully customizable and is guided by the given 17 | configuration. 18 | Tokenizers also support additional analyses 19 | of the AST tree and extensions to the token sequence. 20 | 21 | Parameters 22 | ---------- 23 | config : TokenizationConfig 24 | A configuration which is used to initialize the tokenizer 25 | 26 | code_tree: tree-sitter root node 27 | Root node of the program to be tokenized 28 | 29 | code_lines: list[str] 30 | Source lines of the program code to be tokenized. 31 | Has to be related to code_tree. Otherwise, behavior 32 | is undefined. 33 | 34 | Returns 35 | ------- 36 | TokenSequence 37 | A sequence of program tokens representing the given program 38 | 39 | """ 40 | return create_tokenizer(config)(code_tree, code_lines, visitors = visitors) 41 | 42 | 43 | # Tokenize ---------------------------------------------------------------- 44 | 45 | 46 | class Tokenizer: 47 | """ 48 | Basic tokenizer for parsing an AST 49 | 50 | The tokenizer parses a given AST into a token sequence. 51 | Each token represents an AST leaf. 52 | No further analyses or additions are performed. 
53 | """ 54 | 55 | def __init__(self, config): 56 | self.config = config 57 | self._visitor_factories = [] 58 | 59 | def append_visitor(self, visitor_factory): 60 | self._visitor_factories.append(visitor_factory) 61 | 62 | def _create_token_handler(self, code_lines): 63 | return TokenHandler(self.config, code_lines) 64 | 65 | def _create_tree_visitors(self, token_handler, visitors = None): 66 | visitors = visitors or [] 67 | visitors += self._visitor_factories 68 | 69 | visitors = [visitor_fn(token_handler) 70 | if callable(visitor_fn) 71 | else visitor_fn 72 | for visitor_fn in visitors] 73 | 74 | return ResumingVisitorComposition( 75 | ErrorVisitor(self.config), 76 | *visitors 77 | ) 78 | 79 | def __call__(self, code_tree, code_lines, visitors = None): 80 | token_handler = self._create_token_handler(code_lines) 81 | tree_visitor = self._create_tree_visitors(token_handler, visitors) 82 | 83 | # Run tree visitor 84 | tree_visitor.walk(code_tree) 85 | 86 | return token_handler.tokens() 87 | 88 | 89 | def create_tokenizer(config): 90 | """Function to create tokenizer based on configuration""" 91 | return Tokenizer(config) 92 | 93 | 94 | # Basic visitor ----------------------------------------------------------- 95 | 96 | 97 | class LeafVisitor(ASTVisitor): 98 | 99 | def __init__(self, node_handler): 100 | self.node_handler = node_handler 101 | 102 | def visit_string(self, node): 103 | self.node_handler(node) 104 | return False 105 | 106 | def visit(self, node): 107 | if node.child_count == 0: 108 | self.node_handler(node) 109 | return False 110 | 111 | 112 | class ErrorVisitor(ASTVisitor): 113 | 114 | def __init__(self, config): 115 | self.config = config 116 | 117 | def visit_ERROR(self, node): 118 | 119 | if self.config.syntax_error == "raise": 120 | raise_syntax_error(node) 121 | return 122 | 123 | if self.config.syntax_error == "warn": 124 | warn_syntax_error(node) 125 | return 126 | 127 | # Node handler ------------------------------------------------------------ 128 | 129 | class TokenHandler: 130 | 131 | def __init__(self, config, source_code): 132 | self.config = config 133 | self.source_code = source_code 134 | 135 | self._tokens = [] 136 | 137 | def tokens(self): 138 | result = TokenSequence(self._tokens) 139 | self._tokens = [] 140 | return result 141 | 142 | def handle_token(self, token): 143 | if token.type == "newline" and self._tokens[-1].type in ["indent", "dedent", "newline"]: 144 | return # TODO: Blocking double newlines seems to be general. Better solution? 145 | 146 | self._tokens.append(token) 147 | 148 | def __call__(self, node): 149 | self.handle_token( 150 | ASTToken(self.config, node, self.source_code) 151 | ) 152 | 153 | # Error handling ----------------------------------------------------------- 154 | 155 | def _construct_error_msg(node): 156 | 157 | start_line, start_char = node.start_point 158 | end_line, end_char = node.end_point 159 | 160 | position = "?" 161 | if start_line == end_line: 162 | position = "in line %d [pos. %d - %d]" % (start_line, start_char, end_char) 163 | else: 164 | position = "inbetween line %d (start: %d) to line %d (end: %d)" % (start_line, start_char, end_line, end_char) 165 | 166 | return "Problem while parsing given code snipet. 
Error occured %s" % position 167 | 168 | 169 | def warn_syntax_error(node): 170 | logger.warn(_construct_error_msg(node)) 171 | 172 | 173 | def raise_syntax_error(node): 174 | raise SyntaxError(_construct_error_msg(node)) 175 | -------------------------------------------------------------------------------- /code_tokenize/tokens.py: -------------------------------------------------------------------------------- 1 | from code_ast.parsers import match_span 2 | 3 | # Cache Properties --------------------------------------------------------- 4 | 5 | def cached_property(fnc): 6 | """Helper decorator for lazy computing properties""" 7 | name = fnc.__name__ 8 | 9 | def get_or_compute(self): 10 | cache_attr = getattr(self, "_%s" % name, None) 11 | if cache_attr is not None: return cache_attr 12 | 13 | if not hasattr(self, "_cache"): self._cache = {} 14 | 15 | if name not in self._cache: 16 | self._cache[name] = fnc(self) 17 | 18 | return self._cache[name] 19 | 20 | return property(get_or_compute) 21 | 22 | 23 | # Tokens ------------------------------------------------------------------- 24 | 25 | class Token: 26 | """ 27 | A token represents a single program entity of a given source code 28 | 29 | Attributes 30 | ---------- 31 | text : str 32 | text of program token inside the parsed source code 33 | 34 | type : str 35 | token type or role inside a program. 36 | Often it refers to the type of token, e.g. identifier. 37 | Dependent on the tokenization process can also 38 | refer to contextual roles like variable definitions. 39 | 40 | config : TokenizerConfig 41 | configuration used to parse this token 42 | 43 | root_sequence : TokenSequence 44 | back reference to the sequence containing this token 45 | Might be None (independent token). 46 | 47 | """ 48 | 49 | def __init__(self, config, text): 50 | """Representing a single program token""" 51 | self.config = config 52 | self._text = text 53 | self._type = "token" 54 | 55 | self.root_sequence = None 56 | 57 | @property 58 | def text(self): 59 | return self._text 60 | 61 | @property 62 | def type(self): 63 | return self._type 64 | 65 | def __repr__(self): 66 | return self.text 67 | 68 | 69 | class IndentToken(Token): 70 | """ 71 | Basic token to indicate an indentation 72 | 73 | Helpful for indentation based languages such as Python. 74 | 75 | """ 76 | 77 | def __init__(self, config, new_line_before = True): 78 | super().__init__(config, "#INDENT#") 79 | self.new_line_before = new_line_before 80 | self._type = "indent" 81 | 82 | 83 | class DedentToken(Token): 84 | """ 85 | Basic token to indicate an dedentation 86 | 87 | Helpful for indentation based languages such as Python. 88 | 89 | """ 90 | 91 | def __init__(self, config, new_line_before = True): 92 | super().__init__(config, "#DEDENT#") 93 | self.new_line_before = new_line_before 94 | self._type = "dedent" 95 | 96 | 97 | class NewlineToken(Token): 98 | """ 99 | Basic token to indicate a newline 100 | 101 | Helpful for indentation based languages such as Python. 102 | 103 | """ 104 | 105 | def __init__(self, config): 106 | super().__init__(config, "#NEWLINE#") 107 | self._type = "newline" 108 | 109 | 110 | # AST backed token ---------------------------------------------------------------- 111 | 112 | class ASTToken(Token): 113 | """ 114 | Tokens that are related to leaf nodes inside an AST 115 | 116 | Attributes 117 | ---------- 118 | text : str 119 | text of program token inside the parsed source code 120 | 121 | type : str 122 | token type or role inside a program. 
123 | Often it refers to the type of token, e.g. identifier. 124 | Dependent on the tokenization process can also 125 | refer to contextual roles like variable definitions. 126 | 127 | ast_node : node object 128 | node inside an AST that is used to create this token 129 | 130 | statement_head : Token 131 | token representing the head (first token) of a statement 132 | 133 | parent_head : Token 134 | token representing the head of a parent statement (if existent) 135 | 136 | config : TokenizerConfig 137 | configuration used to parse this token 138 | 139 | root_sequence : TokenSequence 140 | back reference to the sequence containing this token 141 | Might be None (independent token). 142 | 143 | """ 144 | 145 | def __init__(self, config, ast_node, source_lines): 146 | super().__init__(config, None) 147 | self.ast_node = ast_node 148 | self.source_lines = source_lines 149 | self.root_sequence = None 150 | self._type = None 151 | 152 | def _create_token(self, node): 153 | if self.root_sequence is not None: 154 | return self.root_sequence.get_token_by_node(node) 155 | return ASTToken(self.config, node, self.source_lines) 156 | 157 | # API methods -------------------------------- 158 | 159 | @cached_property 160 | def text(self): 161 | return match_span(self.ast_node, self.source_lines) 162 | 163 | @cached_property 164 | def type(self): 165 | return self.ast_node.type 166 | 167 | @cached_property 168 | def statement_head(self): 169 | """Returns the token representing the head of a statement""" 170 | 171 | statement_types = self.config.statement_types 172 | 173 | parent_node = parent_statement_node(statement_types, self.ast_node) 174 | if parent_node is None: raise ValueError("No statement could be identified!") 175 | 176 | # Identify first token that belongs to the statement 177 | current_left = parent_node 178 | while not is_token(current_left): 179 | current_left = current_left.children[0] 180 | 181 | return self._create_token(current_left) 182 | 183 | @cached_property 184 | def parent_head(self): 185 | """ 186 | Returns head of parent node if it exists. 187 | 188 | If the current token belongs to a top level statement, 189 | the function return None. 
190 | """ 191 | # For identifying statements 192 | statement_types = self.config.statement_types 193 | parent_node = parent_statement_node(statement_types, self.ast_node) 194 | if parent_node is None: raise ValueError("No statement could be identified!") 195 | 196 | grandparent_node = parent_statement_node(statement_types, parent_node) 197 | if grandparent_node is None: return None 198 | 199 | # Identify first token that belongs to the statement 200 | current_left = grandparent_node 201 | while not is_token(current_left): 202 | current_left = current_left.children[0] 203 | 204 | return self._create_token(current_left) 205 | 206 | 207 | 208 | class VarUseToken(ASTToken): 209 | """AST token representing a variable usage (name of variable)""" 210 | 211 | def __init__(self, config, ast_node, source_lines): 212 | super().__init__(config, ast_node, source_lines) 213 | self._type = "use_var" 214 | 215 | 216 | class VarDefToken(ASTToken): 217 | """AST token representing a variable definition (name of variable)""" 218 | 219 | def __init__(self, config, ast_node, source_lines): 220 | super().__init__(config, ast_node, source_lines) 221 | self._type = "def_var" 222 | 223 | 224 | 225 | # Token Collection ----------------------------------------------------- 226 | 227 | class TokenSequence(list): 228 | """ 229 | Sequence of tokens 230 | 231 | Represent a sequence of tokens. It acts 232 | as a list while backreferencing each token 233 | in this collection. 234 | 235 | """ 236 | 237 | def __init__(self, tokens): 238 | super().__init__(tokens) 239 | 240 | self._map_nodes = {} 241 | 242 | for tok in self: 243 | tok.root_sequence = self 244 | 245 | if hasattr(tok, "ast_node"): 246 | self._map_nodes[node_key(tok.ast_node)] = tok 247 | 248 | def get_token_by_node(self, node): 249 | """Maps a given leaf node back to a token in this sequence.""" 250 | return self._map_nodes[node_key(node)] 251 | 252 | def iterstmts(self): 253 | """Splits the token sequence into a sequence of statement tokens""" 254 | def _iter_stmts(): 255 | current_head = None 256 | stmt = [] 257 | 258 | for tok in self: 259 | tok_head = tok.statement_head if hasattr(tok, "statement_head") else current_head 260 | 261 | if tok_head != current_head: 262 | if len(stmt) > 0: yield stmt 263 | current_head = tok_head 264 | stmt = [] 265 | 266 | stmt.append(tok) 267 | 268 | if len(stmt) > 0: yield stmt 269 | 270 | return _iter_stmts() 271 | 272 | 273 | # Utils ---------------------------------------------------------------- 274 | 275 | def match_type(type_regex, type): 276 | # TODO Support general regex (Is this needed?) 
277 | 278 | star_count = type_regex.count("*") 279 | 280 | if star_count == 0: 281 | return type == type_regex 282 | 283 | if star_count == 1: 284 | if type_regex[0] == "*": 285 | return type.endswith(type_regex[1:]) 286 | if type_regex[-1] == "*": 287 | return type.startswith(type_regex[:-1]) 288 | 289 | raise ValueError("Unsupported type regex: %s" % type_regex) 290 | 291 | 292 | def is_token(node): 293 | return node.type == "string" or not node.children 294 | 295 | 296 | def node_key(node): 297 | return (node.type, node.start_point, node.end_point) 298 | 299 | 300 | def parent_statement_node(statement_types, node): 301 | 302 | def is_statement(type): 303 | return any(match_type(reg, type) for reg in statement_types) 304 | 305 | # Go up till we find a statement node 306 | parent_node = node.parent 307 | while parent_node is not None and not is_statement(parent_node.type): 308 | parent_node = parent_node.parent 309 | 310 | return parent_node 311 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "code_tokenize" 7 | version = "v0.2.1" 8 | description = "Fast program tokenization and structural analysis in Python" 9 | readme = "README.md" 10 | requires-python = ">= 3.8" 11 | license = { file = "LICENSE.txt" } 12 | keywords = ["code", "tokenization", "tokenize", "program", "language processing"] 13 | 14 | authors = [{name = "Cedric Richter", email = "cedricr.upb@gmail.com"}] 15 | maintainers = [{name = "Cedric Richter", email = "cedricr.upb@gmail.com"}] 16 | 17 | classifiers = [ 18 | "Development Status :: 3 - Alpha", 19 | "Intended Audience :: Developers", 20 | "Topic :: Software Development :: Build Tools", 21 | "License :: OSI Approved :: MIT License", 22 | "Programming Language :: Python :: 3", 23 | "Programming Language :: Python :: 3.6", 24 | "Programming Language :: Python :: 3.7", 25 | "Programming Language :: Python :: 3.8", 26 | "Programming Language :: Python :: 3.9", 27 | "Programming Language :: Python :: 3.10", 28 | "Programming Language :: Python :: 3.11", 29 | "Programming Language :: Python :: 3.12", 30 | "Programming Language :: Python :: 3.13", 31 | "Programming Language :: Python :: 3 :: Only", 32 | ] 33 | 34 | dependencies = ["tree_sitter", "GitPython", "requests", "code_ast"] 35 | 36 | [project.urls] 37 | "Homepage" = "https://github.com/cedricrupb/code_tokenize" 38 | "Bug Reports" = "https://github.com/cedricrupb/code_tokenize/issues" 39 | "Source" = "https://github.com/cedricrupb/code_tokenize" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tree_sitter==0.21.3 2 | requests>=2.32.0 3 | GitPython>=3.1.41 4 | code_ast>=0.1.1 -------------------------------------------------------------------------------- /resources/code_tokenize.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cedricrupb/code_tokenize/6797bcf682edea672677bf3bce708d38f9d20dd0/resources/code_tokenize.png -------------------------------------------------------------------------------- /resources/code_tokenize.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
[code_tokenize.svg: project logo showing the snippet "code . tokenize" with its tokens labelled identifier, dot, attribute and func_call]
-------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | ong_description_content_type = text/markdown -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", "r") as f: 4 | long_description = f.read() 5 | 6 | setup( 7 | name = 'code_tokenize', 8 | packages = find_packages(exclude=['tests']), 9 | version = '0.2.1', 10 | license='MIT', 11 | description = 'Fast program tokenization and structural analysis in Python', 12 | long_description = long_description, 13 | long_description_content_type="text/markdown", 14 | author = 'Cedric Richter', 15 | author_email = 'cedricr.upb@gmail.com', 16 | url = 'https://github.com/cedricrupb/code_tokenize', 17 | download_url = 'https://github.com/cedricrupb/code_tokenize/archive/refs/tags/v0.2.1.tar.gz', 18 | keywords = ['code', 'tokenization', 'tokenize', 'program', 'language processing'], 19 | install_requires=[ 20 | 'tree_sitter==0.21.3', 21 | 'GitPython>=3.1.41', 22 | 'requests>=2.32.0', 23 | 'code-ast>=0.1.1' 24 | ], 25 | classifiers=[ 26 | 'Development Status :: 3 - Alpha', 27 | 'Intended Audience :: Developers', 28 | 'Topic :: Software Development :: Build Tools', 29 | 'License :: OSI Approved :: MIT License', 30 | 'Programming Language :: Python :: 3', 31 | 'Programming Language :: Python :: 3.6', 32 | 'Programming Language :: Python :: 3.7', 33 | 'Programming Language :: Python :: 3.8', 34 | 'Programming Language :: Python :: 3.9', 35 | 'Programming Language :: Python :: 3.10', 36 | 'Programming Language :: Python :: 3.11', 37 | 'Programming Language :: Python :: 3.12', 38 | 'Programming Language :: Python :: 3.13', 39 | ], 40 | ) -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cedricrupb/code_tokenize/6797bcf682edea672677bf3bce708d38f9d20dd0/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_tokenization.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | import code_tokenize as ctok 4 | 5 | class PythonTokenizationTestCase(TestCase): 6 | 7 | def test_tokenize1(self): 8 | tokens = ctok.tokenize("def my_func():\n bar()", lang = "python") 9 | expected = ["def", "my_func", "(", ")", ":", "#INDENT#", "bar", "(", ")", "#NEWLINE#", "#DEDENT#"] 10 | self.assertEqual(expected, [str(t) for t in tokens]) 11 | 12 | def test_tokenize2(self): 13 | tokens = ctok.tokenize("def my_func(x):\n x = x + 1\n return x", lang = "python") 14 | expected = ["def", "my_func", "(", "x", ")", ":", "#INDENT#", "x", "=", "x", "+", "1", "#NEWLINE#", "return", "x", "#NEWLINE#", "#DEDENT#"] 15 | self.assertEqual(expected, [str(t) for t in tokens]) 16 | 17 | def test_error_handling(self): 18 | self.assertRaises(SyntaxError, ctok.tokenize, "def my_func(x):\n x = x + 1 return x", lang = "python") 19 | 20 | def test_error_handling2(self): 21 | tokens = ctok.tokenize("def my_func(x):\n x = x + 1 return x", lang = "python", syntax_error = "ignore") 22 | expected = ["def", "my_func", "(", "x", ")", ":", "x", "=", "x", "+", "1", 
"#INDENT#", "return", "x", "#NEWLINE#", "#DEDENT#"] 23 | self.assertEqual(expected, [str(t) for t in tokens]) 24 | 25 | 26 | 27 | class JavaTokenizationTestCase(TestCase): 28 | 29 | def test_tokenize1(self): 30 | tokens = ctok.tokenize("public class Test {\npublic void myFunc(){\n bar();\n}\n}", lang = "java") 31 | expected = ["public", "class", "Test", "{", "public", "void", "myFunc", "(", ")", "{", "bar", "(", ")", ";", "}", "}"] 32 | self.assertEqual(expected, [str(t) for t in tokens]) 33 | 34 | def test_tokenize2(self): 35 | tokens = ctok.tokenize("public class Test {\npublic int myFunc(int x){\n x = x + 1;\n return x;\n}\n}", lang = "java") 36 | expected = ["public", "class", "Test", "{", "public", "int", "myFunc", "(", "int", "x", ")", "{", "x", "=", "x", "+", "1", ";", "return", "x", ";", "}", "}"] 37 | self.assertEqual(expected, [str(t) for t in tokens]) 38 | 39 | def test_error_handling(self): 40 | self.assertRaises(SyntaxError, ctok.tokenize, "public int myFunc(int x){\n x = x + 1;\n return x;\n}", lang = "java") 41 | 42 | def test_error_handling2(self): 43 | tokens = ctok.tokenize("public int myFunc(int x){\n x = x + 1;\n return x;\n}", lang = "java", syntax_error = "ignore") 44 | expected = ["public", "int", "myFunc", "", "(", "int", "x", ")", "{", "x", "=", "x", "+", "1", ";", "return", "x", ";", "}"] 45 | self.assertEqual(expected, [str(t) for t in tokens]) 46 | 47 | 48 | class GoTokenizationTest(TestCase): 49 | 50 | def test_tokenize1(self): 51 | tokens = ctok.tokenize('func main(){\n tip1 := "test"\n}', lang = "go") 52 | expected = ["func", "main", "(", ")", "{", "tip1", ":=", '"test"', "#NEWLINE#", "}"] 53 | 54 | self.assertEqual(expected, [str(t) for t in tokens]) --------------------------------------------------------------------------------