├── code_indexer_loop
│   ├── __init__.py
│   ├── utils.py
│   ├── test_api_dummy_sql.sql.txt
│   ├── constants.py
│   ├── test_api.py
│   ├── api.py
│   ├── code_splitter.py
│   └── test_api_dummy_file.py.txt
├── Makefile
├── .github
│   └── workflows
│       ├── stale.yaml
│       ├── gitleaks_pr.yaml
│       └── gitleaks_push.yaml
├── pyproject.toml
├── examples
│   └── basic_usage.ipynb
├── .gitignore
├── README.md
└── LICENSE

/code_indexer_loop/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.2.1"
2 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | all: clean build publish
2 | 
3 | build:
4 | 	flit build
5 | 
6 | clean:
7 | 	rm -rf dist
8 | 	rm -rf build
9 | 
10 | publish: build
11 | 	flit publish
12 | 
13 | .PHONY: build clean publish all
14 | 
--------------------------------------------------------------------------------
/code_indexer_loop/utils.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | 
3 | 
4 | def hash_md5(filename):
5 |     md5 = hashlib.md5()
6 |     with open(filename, "rb") as f:
7 |         for chunk in iter(lambda: f.read(4096), b""):
8 |             md5.update(chunk)
9 |     return md5.hexdigest()
10 | 
--------------------------------------------------------------------------------
/code_indexer_loop/test_api_dummy_sql.sql.txt:
--------------------------------------------------------------------------------
1 | SELECT
2 |     Books.BookID,
3 |     Books.Title,
4 |     Authors.AuthorName,
5 |     Orders.OrderDate
6 | FROM
7 |     Books
8 | JOIN
9 |     Authors ON Books.AuthorID = Authors.AuthorID
10 | JOIN
11 |     Orders ON Books.BookID = Orders.BookID
12 | WHERE
13 |     Orders.OrderDate >= '2022-01-01'
14 |     AND Orders.OrderDate <= '2022-12-31'
15 |     AND Authors.AuthorName LIKE '%John%'
16 | ORDER BY
17 |     Orders.OrderDate DESC;
18 | 
--------------------------------------------------------------------------------
/code_indexer_loop/constants.py:
--------------------------------------------------------------------------------
1 | EXTENSION_TO_TREE_SITTER_LANGUAGE = {
2 |     ".c": "c",
3 |     ".cc": "cpp",
4 |     ".cpp": "cpp",
5 |     ".cs": "c-sharp",
6 |     ".cxx": "cpp",
7 |     ".go": "go",
8 |     ".hs": "haskell",
9 |     ".java": "java",
10 |     ".jl": "julia",
11 |     ".js": "javascript",
12 |     ".jsx": "javascript",
13 |     ".php": "php",
14 |     ".py": "python",
15 |     ".rb": "ruby",
16 |     ".rs": "rust",
17 |     ".scala": "scala",
18 |     ".sql": "sql",
19 |     ".swift": "swift",
20 |     ".ts": "typescript",
21 |     ".tsx": "typescript",
22 | }
23 | 
--------------------------------------------------------------------------------
/.github/workflows/stale.yaml:
--------------------------------------------------------------------------------
1 | #####################################
2 | # DO NOT EDIT DIRECTLY.             #
3 | # This file is managed by Terraform #
4 | #####################################
5 | 
6 | name: 'Close stale PRs'
7 | on:
8 |   schedule:
9 |     - cron: '30 1 * * *'
10 | 
11 | jobs:
12 |   stale:
13 |     runs-on: ubuntu-latest
14 |     steps:
15 |       - uses: actions/stale@v8
16 |         with:
17 |           stale-pr-message: 'This PR is stale because it has been open for 30 days with no activity. Remove stale label or comment or this will be closed in 7 days.'
18 |           close-pr-message: 'This PR was closed because it has been stalled for 7 days with no activity.'
19 | days-before-pr-stale: 30 20 | days-before-pr-close: 7 21 | exempt-pr-labels: 'dependencies,security' 22 | -------------------------------------------------------------------------------- /.github/workflows/gitleaks_pr.yaml: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # DO NOT EDIT DIRECTLY. # 3 | # This file is managed by Terraform # 4 | ##################################### 5 | 6 | on: [pull_request] 7 | 8 | jobs: 9 | gitleaks: 10 | runs-on: ubuntu-latest 11 | name: Detect Secrets 12 | steps: 13 | - name: Checkout 14 | uses: actions/checkout@v3 15 | with: 16 | fetch-depth: 0 # Checkout full history to make .gitleaksignore work like it does locally 17 | 18 | - name: GitLeaks 19 | uses: gacts/gitleaks@v1 # Action page: 20 | 21 | - name: Add Failure Instructions to Pull Request 22 | if: ${{ failure() }} 23 | uses: thollander/actions-comment-pull-request@v1 # Action page: 24 | with: 25 | message: | 26 | :warning: A secret was detected :warning: 27 | Follow instructions in [Notion](https://www.notion.so/definitive-io/GitHub-Secret-Prevention-97986fd7ae9f45dd8703a1e42f7b07f8#027d1f9cd2544a0798505a1817dfe3df) to resolve. 28 | -------------------------------------------------------------------------------- /.github/workflows/gitleaks_push.yaml: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # DO NOT EDIT DIRECTLY. # 3 | # This file is managed by Terraform # 4 | ##################################### 5 | 6 | on: [push] 7 | 8 | jobs: 9 | gitleaks: 10 | runs-on: ubuntu-latest 11 | name: Detect Secrets 12 | steps: 13 | - name: Checkout 14 | uses: actions/checkout@v3 15 | with: 16 | fetch-depth: 0 # Checkout full history to make .gitleaksignore work like it does locally 17 | 18 | - name: GitLeaks 19 | uses: gacts/gitleaks@v1 # Action page: 20 | 21 | - name: Email security@definitive.io 22 | if: ${{ failure() && github.event.number == 0 }} # Only run for push events 23 | uses: licenseware/send-email-notification@v1 # Action page: 24 | with: 25 | api-key: ${{ secrets.SENDGRID_API_KEY }} 26 | subject: Secret detected in GitHub repository '${{ github.event.repository.name }}' 27 | from-email: GitLeaks GitHub Action 28 | to-email: security@definitive.io 29 | markdown-body: | 30 | Secret detected in ${{ github.event.repository.url }}. 
See details below:
31 | 
32 |             * action: ${{ github.event.repository.url }}/actions/runs/${{ github.run_id }}
33 |             * commit: ${{ github.event.head_commit.url }}
34 | 
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | build-backend = "flit_core.buildapi"
3 | requires = ["flit_core >=3.8.0,<4"]
4 | 
5 | [project]
6 | name = "code-indexer-loop"
7 | description = "Code Indexer Loop"
8 | authors = [
9 |     {name = "Rick Lamers", email = "rick@definitive.io"}
10 | ]
11 | dynamic = ["version"]
12 | readme = "README.md"
13 | requires-python = ">=3.9"
14 | 
15 | dependencies = [
16 |     "llama-index>=0.9.14,<0.10",
17 |     "chromadb>=0.4.8,<0.5",
18 |     "tree-sitter-languages>=1.7.0,<1.8",
19 |     "tree-sitter>=0.20.2,<0.21",
20 |     "tiktoken>=0.4.0,<0.5",
21 |     "langchain>=0.0.354,<0.1.0",
22 |     "watchdog>=2.3.1,<2.4",
23 |     "nltk>=3.8.1,<3.9",
24 | ]
25 | 
26 | [project.optional-dependencies]
27 | dev = [
28 |     "toml ~=0.10.2",
29 |     "black ~=23.3.0",
30 |     "isort ~=5.9.3",
31 |     "autoflake ~=2.2.0",
32 |     "ruff ~=0.0.284",
33 |     "pytest ~=7.4.1",
34 |     "flit >=3.8.0,<4",
35 | ]
36 | test = [
37 |     "pytest-cov ~=3.0.0",
38 | ]
39 | 
40 | [tool.black]
41 | line-length = 120
42 | 
43 | [tool.ruff]
44 | # Enable the pycodestyle (`E`) and Pyflakes (`F`) rules by default.
45 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or
46 | # McCabe complexity (`C901`) by default.
47 | select = ["E", "F"]
48 | ignore = []
49 | 
50 | # Allow autofix for all enabled rules (when `--fix` is provided).
51 | fixable = ["ALL"]
52 | unfixable = []
53 | 
54 | # Exclude a variety of commonly ignored directories.
55 | exclude = [
56 |     ".bzr",
57 |     ".direnv",
58 |     ".eggs",
59 |     ".git",
60 |     ".git-rewrite",
61 |     ".hg",
62 |     ".mypy_cache",
63 |     ".nox",
64 |     ".pants.d",
65 |     ".pytype",
66 |     ".ruff_cache",
67 |     ".svn",
68 |     ".tox",
69 |     ".venv",
70 |     "__pypackages__",
71 |     "_build",
72 |     "buck-out",
73 |     "build",
74 |     "dist",
75 |     "node_modules",
76 |     "venv",
77 | ]
78 | per-file-ignores = {}
79 | 
80 | # Same as Black.
81 | line-length = 120
82 | 
83 | # Allow unused variables when underscore-prefixed.
84 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 85 | 86 | # Assume Python 3.9 87 | target-version = "py39" -------------------------------------------------------------------------------- /examples/basic_usage.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from code_indexer_loop.api import CodeIndexer" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 3, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "indexer = CodeIndexer(src_dir=os.environ[\"CIL_SRC_DIR\"], watch=True)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 4, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "6" 39 | ] 40 | }, 41 | "execution_count": 4, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "indexer.index.vector_store.client.count()" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 5, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "query = \"pandas\"" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 6, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "import os\n", 69 | "\n", 70 | "import pandas as pd\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "print(indexer.query(query)[0:30])" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 7, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "llama_index.schema.NodeWithScore" 87 | ] 88 | }, 89 | "execution_count": 7, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "text_nodes = indexer.query_nodes(query)\n", 96 | "\n", 97 | "type(text_nodes[0])" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 8, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "dict_keys(['file', 'content'])" 109 | ] 110 | }, 111 | "execution_count": 8, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "files = indexer.query_documents(query)\n", 118 | "\n", 119 | "files[0].keys()" 120 | ] 121 | } 122 | ], 123 | "metadata": { 124 | "kernelspec": { 125 | "display_name": ".venv", 126 | "language": "python", 127 | "name": "python3" 128 | }, 129 | "language_info": { 130 | "codemirror_mode": { 131 | "name": "ipython", 132 | "version": 3 133 | }, 134 | "file_extension": ".py", 135 | "mimetype": "text/x-python", 136 | "name": "python", 137 | "nbconvert_exporter": "python", 138 | "pygments_lexer": "ipython3", 139 | "version": "3.9.2" 140 | }, 141 | "orig_nbformat": 4 142 | }, 143 | "nbformat": 4, 144 | "nbformat_minor": 2 145 | } 146 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | 
parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | .vscode/ -------------------------------------------------------------------------------- /code_indexer_loop/test_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from code_indexer_loop.code_splitter import (CodeSplitter, 6 | MaxChunkLengthExceededError, 7 | TokenCounter) 8 | 9 | THIS_FILE_DIR = os.path.dirname(os.path.realpath(__file__)) 10 | 11 | 12 | def create_code_splitter(language="python", target_chunk_tokens=5, max_chunk_tokens=200, enforce_max_chunk_tokens=True): 13 | return CodeSplitter( 14 | language=language, 15 | target_chunk_tokens=target_chunk_tokens, 16 | max_chunk_tokens=max_chunk_tokens, 17 | enforce_max_chunk_tokens=enforce_max_chunk_tokens, 18 | token_model="gpt-4", 19 | coalesce=50, 20 | ) 21 | 22 | 23 | def test_code_splitter_prefix_model(): 24 | CodeSplitter( 25 | language="python", 26 | target_chunk_tokens=10, 27 | max_chunk_tokens=10, 28 | enforce_max_chunk_tokens=True, 29 | token_model="gpt-4-32k-0613", 30 | coalesce=50, 31 | ) 32 | 33 | 34 | def test_code_splitter(): 35 | python_code_splitter = create_code_splitter() 36 | chunks = python_code_splitter.split_text( 37 | """def foo(): 38 | print("Hello, world!") 39 | 40 | print(1)""" 41 | ) 42 | assert chunks[0].startswith("def foo():") 43 | assert not chunks[0].endswith('")') 44 | 45 | 46 | def test_code_splitter_newlines(): 47 | python_code_splitter = create_code_splitter() 48 | chunks = python_code_splitter.split_text( 49 | """ 50 | def foo(): 51 | print("Hello, world!") 52 | 53 | print(1) 54 | 55 | """ 56 | ) 57 | assert chunks[0].startswith("\ndef foo():") 58 | assert not chunks[0].endswith('")') 59 | assert chunks[-1].endswith("\n\n") 60 | 61 | 62 | def test_code_splitter_raise(): 63 | python_code_splitter = create_code_splitter(max_chunk_tokens=5) 64 | with pytest.raises(MaxChunkLengthExceededError): 65 | python_code_splitter.split_text( 66 | """ 67 | def mostdefinitelynotlessthan5tokens(): 68 | pass 69 | """ 70 | ) 71 | 72 | 73 | def test_code_splitter_noraise(): 74 | python_code_splitter = create_code_splitter(max_chunk_tokens=5, enforce_max_chunk_tokens=False) 75 | python_code_splitter.split_text( 76 | """ 77 | def mostdefinitelynotlessthan5tokens(): 78 | pass 79 | """ 80 | ) 81 | 82 | 83 | def test_code_splitter_token_lengths(): 84 | tc = TokenCounter(default_model="gpt-4") 85 | max_chunk_tokens = 20 86 | python_code_splitter = create_code_splitter( 87 | max_chunk_tokens=max_chunk_tokens, target_chunk_tokens=max_chunk_tokens // 2 88 | ) 89 | source_code = """ 90 | def add(a, b): 91 | return a + b 92 | 93 | def subtract(a, b): 94 | return a - b 95 | 96 | add(1, 2) 97 | """ 98 | chunks = python_code_splitter.split_text(source_code) 99 | joined_chunks = "".join(chunks) 100 | assert source_code == joined_chunks 101 | 102 | chunk_lengths = [tc.count(chunk) for chunk in chunks] 103 | assert all([chunk_length <= max_chunk_tokens for chunk_length in chunk_lengths]) 104 | 105 | 106 | def test_long_file(): 107 | hard_file_path = os.path.join(THIS_FILE_DIR, "test_api_dummy_file.py.txt") 108 | with open(hard_file_path, "r") as f: 109 | source_code = f.read() 110 | 111 | python_code_splitter = create_code_splitter(target_chunk_tokens=1000, max_chunk_tokens=9000) 112 | chunks = python_code_splitter.split_text(source_code) 113 | joined_chunks = "".join(chunks) 114 | assert source_code == joined_chunks 
115 | 
116 | 
117 | def test_sql():
118 |     sql_file_path = os.path.join(THIS_FILE_DIR, "test_api_dummy_sql.sql.txt")
119 |     with open(sql_file_path, "r") as f:
120 |         source_code = f.read()
121 | 
122 |     sql_code_splitter = CodeSplitter(
123 |         language="sql",
124 |         target_chunk_tokens=10,
125 |         max_chunk_tokens=1000,
126 |         enforce_max_chunk_tokens=True,
127 |         token_model="gpt-4",
128 |         coalesce=50,
129 |     )
130 | 
131 |     chunks = sql_code_splitter.split_text(source_code)
132 |     joined_chunks = "".join(chunks)
133 |     assert source_code == joined_chunks
134 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Code Indexer Loop
2 | 
3 | [![PyPI version](https://badge.fury.io/py/code-indexer-loop.svg?v=2)](https://pypi.org/project/code-indexer-loop/)
4 | [![License](https://img.shields.io/github/license/definitive-io/code-indexer-loop?v=2)](LICENSE)
5 | [![Forks](https://img.shields.io/github/forks/definitive-io/code-indexer-loop?v=2)](https://github.com/definitive-io/code-indexer-loop/network)
6 | [![Stars](https://img.shields.io/github/stars/definitive-io/code-indexer-loop?v=2)](https://github.com/definitive-io/code-indexer-loop/stargazers)
7 | [![Twitter](https://img.shields.io/twitter/url/https/twitter.com?style=social&label=Follow%20%40DefinitiveIO)](https://twitter.com/definitiveio)
8 | [![Discord](https://dcbadge.vercel.app/api/server/CPJJfq87Vx?compact=true&style=flat)](https://discord.gg/CPJJfq87Vx)
9 | 
10 | 
11 | **Code Indexer Loop** is a Python library designed to index and retrieve code snippets.
12 | 
13 | It uses the indexing utilities of the **LlamaIndex** library and the multi-language **tree-sitter** library to parse code from many popular programming languages. **tiktoken** is used to right-size retrieval based on the number of tokens, and **LangChain** is used to obtain embeddings (defaults to **OpenAI**'s `text-embedding-ada-002`) and store them in an embedded **ChromaDB** vector database. **watchdog** is used to continuously update the index based on file system events.
14 | 
15 | Read the [launch blog post](https://www.definitive.io/blog/open-sourcing-code-indexer-loop) for more details about why we've built this!
16 | 
17 | ## Installation:
18 | Use `pip` to install Code Indexer Loop from PyPI.
19 | ```
20 | pip install code-indexer-loop
21 | ```
22 | 
23 | ## Usage:
24 | 1. Import the necessary modules:
25 | ```python
26 | from code_indexer_loop.api import CodeIndexer
27 | ```
28 | 2. Create a CodeIndexer object and have it watch for changes:
29 | ```python
30 | indexer = CodeIndexer(src_dir="path/to/code/", watch=True)
31 | ```
32 | 3. Use `.query` to perform a search query:
33 | ```python
34 | query = "pandas"
35 | print(indexer.query(query)[0:30])
36 | ```
37 | 
38 | Note: make sure the `OPENAI_API_KEY` environment variable is set. This is needed for generating the embeddings.
39 | 
40 | You can also use `indexer.query_nodes` to get the nodes of a query, or `indexer.query_documents` to retrieve the full source code files.
41 | 
42 | Note that if you edit any of the source code files in `src_dir`, they are efficiently re-indexed using `watchdog` and an `md5`-based caching mechanism. This results in up-to-date embeddings every time you query the index. See the sketch under "Change detection" below.
43 | 
44 | ## Examples
45 | Check out the [basic_usage](examples/basic_usage.ipynb) notebook for a quick overview of the API.
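46 | 
47 | ## Change detection
48 | Re-indexing is driven by content hashes: a file is only re-embedded when its md5 fingerprint changes. As a minimal sketch (the file path below is hypothetical):
49 | 
50 | ```python
51 | from code_indexer_loop.utils import hash_md5
52 | 
53 | # Unchanged files produce the same digest, so the indexer skips re-embedding them.
54 | print(hash_md5("path/to/code/example.py"))  # 32-character hex digest
55 | ```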
56 | 
57 | ## Token limits
58 | You can configure token limits for the chunks through the CodeIndexer constructor:
59 | 
60 | ```python
61 | indexer = CodeIndexer(
62 |     src_dir="path/to/code/", watch=True,
63 |     target_chunk_tokens=300,
64 |     max_chunk_tokens=1000,
65 |     enforce_max_chunk_tokens=False,
66 |     coalesce=50,
67 |     token_model="gpt-4",
68 | )
69 | ```
70 | 
71 | Note that you can choose whether `max_chunk_tokens` is enforced. If it is, an exception is raised whenever no semantic split can respect `max_chunk_tokens`.
72 | 
73 | The `coalesce` argument sets the token threshold below which smaller chunks are merged into a single chunk, to avoid producing many very small chunks. The unit for `coalesce` is also tokens.
74 | 
75 | ## tree-sitter
76 | Because `tree-sitter` is used for parsing, chunks are only broken at valid node-level positions in the source file. This avoids splitting up, e.g., function and class definitions.
77 | 
78 | ### Supported languages:
79 | C, C++, C#, Go, Haskell, Java, Julia, JavaScript, PHP, Python, Ruby, Rust, Scala, Swift, SQL, TypeScript
80 | 
81 | Note: we mainly test Python support. Use other languages at your own peril.
82 | 
83 | ## Contributing
84 | Pull requests are welcome. Please make sure to update tests as appropriate. Use the tools provided in the `dev` dependencies to maintain the code standard.
85 | 
86 | ### Tests
87 | Run the unit tests by invoking `pytest` in the root.
88 | 
89 | ## License
90 | Please see the LICENSE file provided with the source code.
91 | 
92 | ## Attribution
93 | We'd like to thank the Sweep AI team for publishing their ideas about code chunking. Read their blog posts about the topic [here](https://docs.sweep.dev/blogs/chunking-2m-files) and [here](https://docs.sweep.dev/blogs/chunking-improvements). The implementation in `code_indexer_loop` is modified from their original implementation, mainly to limit chunks based on tokens instead of characters and to achieve perfect document reconstruction (`"".join(chunks) == original_source_code`).
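94 | 
95 | As an illustration of that reconstruction property, here is a minimal sketch using `CodeSplitter` directly (parameter values are arbitrary):
96 | 
97 | ```python
98 | from code_indexer_loop.code_splitter import CodeSplitter
99 | 
100 | splitter = CodeSplitter(
101 |     language="python",
102 |     target_chunk_tokens=50,
103 |     max_chunk_tokens=200,
104 |     enforce_max_chunk_tokens=True,
105 |     coalesce=50,
106 |     token_model="gpt-4",
107 | )
108 | source = "def add(a, b):\n    return a + b\n"
109 | chunks = splitter.split_text(source)
110 | # Chunks concatenate back to the original source, byte for byte.
111 | assert "".join(chunks) == source
112 | ```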
113 | 
--------------------------------------------------------------------------------
/code_indexer_loop/api.py:
--------------------------------------------------------------------------------
1 | import atexit
2 | import os
3 | from pathlib import Path
4 | 
5 | import chromadb
6 | from langchain.embeddings.openai import OpenAIEmbeddings
7 | from llama_index import ServiceContext, VectorStoreIndex
8 | from llama_index.embeddings import LangchainEmbedding
9 | from llama_index.schema import NodeWithScore, TextNode
10 | from llama_index.vector_stores import ChromaVectorStore
11 | from watchdog.events import FileSystemEventHandler
12 | from watchdog.observers import Observer
13 | 
14 | from code_indexer_loop.code_splitter import CodeSplitter
15 | from code_indexer_loop.constants import EXTENSION_TO_TREE_SITTER_LANGUAGE
16 | from code_indexer_loop.utils import hash_md5
17 | 
18 | 
19 | class CodeIndexer:
20 |     src_dir: str
21 |     target_chunk_tokens: int
22 |     max_chunk_tokens: int
23 |     enforce_max_chunk_tokens: bool
24 |     token_model: str
25 |     index: VectorStoreIndex = None
26 | 
27 |     def __init__(
28 |         self,
29 |         src_dir: str,
30 |         target_chunk_tokens: int = 300,
31 |         max_chunk_tokens: int = 1000,
32 |         enforce_max_chunk_tokens: bool = False,
33 |         coalesce: int = 50,
34 |         token_model: str = "gpt-4",
35 |         watch: bool = False,
36 |     ):
37 |         self.src_dir = src_dir
38 |         self.target_chunk_tokens = target_chunk_tokens
39 |         self.max_chunk_tokens = max_chunk_tokens
40 |         self.enforce_max_chunk_tokens = enforce_max_chunk_tokens
41 |         self.coalesce = coalesce
42 |         self.token_model = token_model
43 |         # Per-instance state (mutable class attributes would be shared across instances)
44 |         self.code_splitters = {}
45 |         self.hash_cache = {}
46 |         self._create_index()
47 |         self.refresh_nodes()
48 | 
49 |         if watch:
50 |             self._start_watching()
51 |             atexit.register(self._stop_watching)
52 | 
53 |     def query(self, query: str, k=10) -> str:
54 |         return "\n".join(
55 |             [node_with_score.node.text for node_with_score in self.index.as_retriever(similarity_top_k=k).retrieve(query)]
56 |         )
57 | 
58 |     def query_nodes(self, query: str, k=10) -> list[NodeWithScore]:
59 |         return self.index.as_retriever(similarity_top_k=k).retrieve(query)
60 | 
61 |     def query_documents(self, query: str, k=10) -> list[dict[str, str]]:
62 |         nodes = self.index.as_retriever(similarity_top_k=k).retrieve(query)
63 |         files = [node_with_score.node.metadata["file"] for node_with_score in nodes]
64 |         # Deduplicate files, preserving order
65 |         files = list(dict.fromkeys(files))
66 |         # Read file contents
67 |         contents = []
68 |         for file in files:
69 |             with open(file, "r") as f:
70 |                 contents.append(
71 |                     {
72 |                         "file": file,
73 |                         "content": f.read(),
74 |                     }
75 |                 )
76 |         return contents
77 | 
78 |     def add_file(self, file: str):
79 |         ext = os.path.splitext(file)[1]
80 |         text_splitter = self._get_code_splitter(ext)
81 | 
82 |         calculated_hash = hash_md5(file)
83 |         if self.hash_cache.get(file) == calculated_hash:
84 |             # Skip file if it hasn't changed
85 |             return
86 |         self.hash_cache[file] = calculated_hash
87 | 
88 |         with open(file, "r") as f:
89 |             text = f.read()
90 |             nodes = [
91 |                 TextNode(
92 |                     text=chunk,
93 |                     metadata={
94 |                         "file": file,
95 |                     },
96 |                 )
97 |                 for chunk in text_splitter.split_text(text)
98 |             ]
99 | 
100 |         self._remove_old_nodes(file)
101 |         self._insert_nodes(nodes)
102 | 
103 |     def remove_file(self, file: str):
104 |         self._remove_old_nodes(file)
105 |         self.hash_cache.pop(file, None)
106 | 
107 |     def refresh_nodes(self):
108 |         files = self._find_files(self.src_dir, EXTENSION_TO_TREE_SITTER_LANGUAGE)
109 |         file_paths = {str(file) for file in files}  # hash_cache keys are strings
110 | 
111 |         # Clear any files that no longer exist
112 |         for file in list(self.hash_cache.keys()):
113 |             if file not in file_paths:
114 |                 del self.hash_cache[file]
115 |                 self._remove_old_nodes(file)
116 | 
117 |         # For each file, split into chunks and index
118 |         for file in files:
119 |             self.add_file(str(file))
120 | 
121 |     def _start_watching(self):
122 |         event_handler = CodeChangeHandler(self)
123 |         self.observer = Observer()
124 |         self.observer.schedule(event_handler, self.src_dir, recursive=True)
125 |         self.observer.start()
126 | 
127 |     def _stop_watching(self):
128 |         if hasattr(self, "observer"):
129 |             self.observer.stop()
130 |             self.observer.join()
131 | 
132 |     def _find_files(self, path, include_ext={}):
133 |         """
134 |         Recursively find all files in a given path.
135 | 
136 |         Parameters:
137 |         path (str): The root directory to start searching from.
138 |         include_ext (dict): A dictionary of file extensions to include
139 |         (keys are extensions including leading period if applicable).
140 | 
141 |         Returns:
142 |         set: A set of resolved paths for each file found.
143 |         """
144 |         found_files = []
145 | 
146 |         for root, _, files in os.walk(path):
147 |             for file in files:
148 |                 # Include the file only if its extension is supported
149 |                 file_ext = os.path.splitext(file)[1]
150 |                 if file_ext in include_ext:
151 |                     # Construct the full path of the file and append to list
152 |                     full_path = Path(os.path.join(root, file)).resolve()
153 |                     found_files.append(full_path)
154 | 
155 |         return set(found_files)
156 | 
157 |     def _get_code_splitter(self, ext) -> CodeSplitter:
158 |         if ext not in EXTENSION_TO_TREE_SITTER_LANGUAGE:
159 |             raise ValueError(f"Extension {ext} not supported.")
160 |         language = EXTENSION_TO_TREE_SITTER_LANGUAGE[ext]
161 |         if language not in self.code_splitters:
162 |             text_splitter = CodeSplitter(
163 |                 language=language,
164 |                 target_chunk_tokens=self.target_chunk_tokens,
165 |                 max_chunk_tokens=self.max_chunk_tokens,
166 |                 enforce_max_chunk_tokens=self.enforce_max_chunk_tokens,
167 |                 coalesce=self.coalesce,
168 |                 token_model=self.token_model,
169 |             )
170 |             self.code_splitters[ext] = text_splitter
171 | 
172 |         return self.code_splitters[ext]
173 | 
174 |     def _remove_old_nodes(self, file):
175 |         # Remove existing nodes for the same file
176 |         self.index.vector_store.client.delete(where={"file": file})
177 | 
178 |     def _insert_nodes(self, nodes):
179 |         self.index.insert_nodes(nodes)
180 | 
181 |     def _create_index(self) -> VectorStoreIndex:
182 |         # Create client and a new collection
183 |         chroma_client = chromadb.EphemeralClient()
184 |         chroma_collection = chroma_client.create_collection("code-index")
185 | 
186 |         # Define embedding function
187 |         embed_model = LangchainEmbedding(OpenAIEmbeddings())
188 | 
189 |         # Set up ChromaVectorStore and load in data
190 |         vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
191 |         service_context = ServiceContext.from_defaults(embed_model=embed_model)
192 |         index = VectorStoreIndex.from_vector_store(vector_store=vector_store, service_context=service_context)
193 | 
194 |         self.index = index
195 |         return index
196 | 
197 | 
198 | class CodeChangeHandler(FileSystemEventHandler):
199 |     def __init__(self, indexer: CodeIndexer):
200 |         self.indexer = indexer
201 | 
202 |     def on_modified(self, event):
203 |         if event.is_directory:
204 |             # Directory modifications shouldn't trigger a reindex
205 |             return
206 |         else:
207 |             # Update only if the changed file has a supported extension
208 |             ext = os.path.splitext(event.src_path)[1]
209 |             if ext in
EXTENSION_TO_TREE_SITTER_LANGUAGE: 210 | self.indexer.add_file(event.src_path) 211 | 212 | def on_created(self, event): 213 | if event.is_directory: 214 | self.indexer.refresh_nodes() 215 | else: 216 | # Update only if the changed file has a supported extension 217 | ext = os.path.splitext(event.src_path)[1] 218 | if ext in EXTENSION_TO_TREE_SITTER_LANGUAGE: 219 | self.indexer.add_file(event.src_path) 220 | 221 | def on_moved(self, event): 222 | self.indexer.refresh_nodes() 223 | 224 | def on_deleted(self, event): 225 | if event.is_directory: 226 | self.indexer.refresh_nodes() 227 | else: 228 | ext = os.path.splitext(event.src_path)[1] 229 | if ext in EXTENSION_TO_TREE_SITTER_LANGUAGE: 230 | self.indexer.remove_file(event.src_path) 231 | -------------------------------------------------------------------------------- /code_indexer_loop/code_splitter.py: -------------------------------------------------------------------------------- 1 | """Code Splitter. 2 | 3 | Implementation amalgamated from: 4 | https://docs.sweep.dev/blogs/chunking-improvements 5 | https://docs.sweep.dev/blogs/chunking-2m-files 6 | https://github.com/jerryjliu/llama_index/pull/7100 7 | 8 | """ 9 | 10 | import re 11 | from dataclasses import dataclass 12 | from typing import List, Optional, Union 13 | 14 | import tiktoken 15 | from tree_sitter import Node 16 | 17 | 18 | class MaxChunkLengthExceededError(Exception): 19 | pass 20 | 21 | 22 | @dataclass 23 | class Span: 24 | # Represents a slice of a string 25 | start: int = 0 26 | end: int = 0 27 | 28 | def __post_init__(self): 29 | # If end is None, set it to start 30 | if self.end is None: 31 | self.end = self.start 32 | 33 | def extract(self, s: bytes) -> bytes: 34 | # Grab the corresponding substring of string s by bytes 35 | return s[self.start : self.end] 36 | 37 | def extract_lines(self, s: str) -> str: 38 | lines = s.split("\n") 39 | selected_lines = lines[self.start : self.end] 40 | joined = "\n".join(selected_lines) 41 | # if selection doesn't extend to the last line, add the missing newline 42 | if self.end < len(lines): 43 | joined += "\n" 44 | return joined 45 | 46 | def __add__(self, other: Union["Span", int]) -> "Span": 47 | # e.g. Span(1, 2) + Span(2, 4) = Span(1, 4) (concatenation) 48 | # There are no safety checks: Span(a, b) + Span(c, d) = Span(a, d) 49 | # and there are no requirements for b = c. 50 | if isinstance(other, int): 51 | return Span(self.start + other, self.end + other) 52 | elif isinstance(other, Span): 53 | return Span(self.start, other.end) 54 | else: 55 | raise NotImplementedError() 56 | 57 | def __len__(self) -> int: 58 | # i.e. 
Span(a, b) = b - a
59 |         return self.end - self.start
60 | 
61 | 
62 | class TokenCounter:
63 |     default_model: str
64 |     initialized_models = {}
65 | 
66 |     def __init__(self, default_model: str):
67 |         self.default_model = default_model
68 | 
69 |     def count(self, text: str, model: Optional[str] = None):
70 |         if model is None:
71 |             model = self.default_model
72 | 
73 |         if model not in self.initialized_models:
74 |             try:
75 |                 self.initialized_models[model] = tiktoken.encoding_for_model(model)
76 |             except KeyError:
77 |                 raise KeyError(f"Model {model} not supported.")
78 | 
79 |         return len(self.initialized_models[model].encode(text, disallowed_special=()))
80 | 
81 |     def count_chunk(self, chunk: Span, source_code: bytes, model: Optional[str] = None):
82 |         return self.count(chunk.extract(source_code).decode("utf-8"), model)
83 | 
84 | 
85 | class CodeSplitter:
86 |     """Split code using an AST parser."""
87 | 
88 |     language: str
89 |     target_chunk_tokens: int
90 |     max_chunk_tokens: int
91 |     enforce_max_chunk_tokens: bool
92 |     coalesce: int
93 |     token_counter: TokenCounter
94 | 
95 |     def __init__(
96 |         self,
97 |         language: str,
98 |         target_chunk_tokens: int,
99 |         max_chunk_tokens: int,
100 |         enforce_max_chunk_tokens: bool,
101 |         coalesce: int,
102 |         token_model: str,
103 |     ):
104 |         self.token_counter = TokenCounter(default_model=token_model)
105 |         self.target_chunk_tokens = target_chunk_tokens
106 |         self.max_chunk_tokens = max_chunk_tokens
107 |         self.enforce_max_chunk_tokens = enforce_max_chunk_tokens
108 |         self.language = language
109 |         self.coalesce = coalesce
110 | 
111 |     @classmethod
112 |     def class_name(cls) -> str:
113 |         """Get class name."""
114 |         return "CodeSplitter"
115 | 
116 |     def chunk_tree(
117 |         self,
118 |         tree,
119 |         source_code: bytes,
120 |     ) -> list[Span]:
121 |         # 1. Recursively form chunks
122 |         def chunk_node(node: Node) -> list[Span]:
123 |             chunks: list[Span] = []
124 |             current_chunk: Span = Span(node.start_byte, node.start_byte)
125 |             node_children = node.children
126 |             for child in node_children:
127 |                 child_token_len = self.token_counter.count_chunk(Span(child.start_byte, child.end_byte), source_code)
128 |                 child_and_current_token_len = child_token_len + self.token_counter.count_chunk(
129 |                     current_chunk, source_code
130 |                 )
131 | 
132 |                 if child_token_len > self.target_chunk_tokens:
133 |                     if child_token_len > self.max_chunk_tokens and self.enforce_max_chunk_tokens:
134 |                         raise MaxChunkLengthExceededError(
135 |                             f"Chunk token length {child_token_len} exceeds maximum {self.max_chunk_tokens}."
136 |                         )
137 | 
138 |                     chunks.append(current_chunk)
139 |                     current_chunk = Span(child.end_byte, child.end_byte)
140 |                     chunks.extend(chunk_node(child))
141 |                 elif child_and_current_token_len > self.target_chunk_tokens:
142 |                     if child_and_current_token_len > self.max_chunk_tokens and self.enforce_max_chunk_tokens:
143 |                         raise MaxChunkLengthExceededError(
144 |                             f"Chunk token length {child_and_current_token_len}"
145 |                             f" exceeds maximum {self.max_chunk_tokens}."
146 | ) 147 | chunks.append(current_chunk) 148 | current_chunk = Span(child.start_byte, child.end_byte) 149 | else: 150 | current_chunk += Span(child.start_byte, child.end_byte) 151 | 152 | final_chunk_token_len = self.token_counter.count_chunk(current_chunk, source_code) 153 | if final_chunk_token_len > self.max_chunk_tokens and self.enforce_max_chunk_tokens: 154 | raise MaxChunkLengthExceededError( 155 | f"Chunk token length {final_chunk_token_len} exceeds maximum {self.max_chunk_tokens}." 156 | ) 157 | chunks.append(current_chunk) 158 | return chunks 159 | 160 | chunks = chunk_node(tree.root_node) 161 | 162 | # Filter empty chunks 163 | chunks = [chunk for chunk in chunks if len(chunk) > 0] 164 | 165 | # Early return if there is no chunk 166 | if len(chunks) == 0: 167 | return [] 168 | # Early return if there is only one chunk 169 | if len(chunks) < 2: 170 | return [Span(0, len(chunks[0]))] 171 | 172 | # Filling in the gaps 173 | # by aligning end of one chunk with start of next 174 | chunks[0].start = 0 175 | for prev, curr in zip(chunks[:-1], chunks[1:]): 176 | prev.end = curr.start 177 | curr.end = len(source_code) 178 | 179 | # Combining small chunks with bigger ones 180 | new_chunks = [] 181 | aggregated_chunk = Span(0, 0) 182 | aggregated_chunk_token_len = 0 183 | for chunk in chunks: 184 | # Check if the combined chunk exceeds target_chunk_tokens 185 | # Note, at this point no chunk exceeds max_chunk_tokens 186 | # if max_chunk_tokens is enforced. 187 | chunk_token_len = self.token_counter.count_chunk(chunk, source_code) 188 | if chunk_token_len > self.target_chunk_tokens: 189 | new_chunks.append(aggregated_chunk) 190 | new_chunks.append(chunk) 191 | aggregated_chunk = Span(chunk.end, chunk.end) 192 | aggregated_chunk_token_len = 0 193 | elif aggregated_chunk_token_len + chunk_token_len > self.target_chunk_tokens: 194 | new_chunks.append(aggregated_chunk) 195 | aggregated_chunk = Span(chunk.start, chunk.end) 196 | aggregated_chunk_token_len = chunk_token_len 197 | else: 198 | # Combined chunk does not exceed target_chunk_tokens 199 | # so we add the current chunk to the aggregated_chunk. 200 | # Note, there is no need to check whether the combined chunk 201 | # exceeds max_chunk_tokens because we have already checked. 
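202 |                 # Once the running aggregate exceeds the `coalesce` token
203 |                 # threshold it is flushed below, so runs of tiny chunks are
204 |                 # merged into a single chunk instead of being emitted one by one.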
205 |                 aggregated_chunk += chunk
206 |                 aggregated_chunk_token_len += chunk_token_len
207 |                 if aggregated_chunk_token_len > self.coalesce:
208 |                     new_chunks.append(aggregated_chunk)
209 |                     aggregated_chunk = Span(chunk.end, chunk.end)
210 |                     aggregated_chunk_token_len = 0
211 | 
212 |         if len(aggregated_chunk) > 0:
213 |             new_chunks.append(aggregated_chunk)
214 | 
215 |         # Convert byte-offset spans to line-number spans
216 |         line_chunks = [
217 |             Span(
218 |                 self.get_line_number(chunk.start, source_code),
219 |                 self.get_line_number(chunk.end, source_code),
220 |             )
221 |             for chunk in new_chunks
222 |         ]
223 | 
224 |         # Eliminating empty chunks
225 |         line_chunks = [chunk for chunk in line_chunks if len(chunk) > 0]
226 |         return line_chunks
227 | 
228 |     def split_and_keep_newline(self, byte_str):
229 |         return re.split(b"(?<=\n)", byte_str)
230 | 
231 |     def get_line_number(self, index: int, source_code: bytes) -> int:
232 |         total_chars = 0
233 |         for line_number, line in enumerate(self.split_and_keep_newline(source_code), start=1):
234 |             total_chars += len(line)
235 |             if total_chars > index:
236 |                 return line_number - 1
237 |         return line_number
238 | 
239 |     def split_text(self, text: str) -> List[str]:
240 |         """Split incoming code and return chunks using the AST."""
241 |         try:
242 |             import tree_sitter_languages
243 |         except ImportError:
244 |             raise ImportError("Please install tree_sitter_languages to use CodeSplitter.")
245 | 
246 |         try:
247 |             parser = tree_sitter_languages.get_parser(self.language)
248 |         except Exception as e:
249 |             print(
250 |                 f"Could not get parser for language {self.language}. Check "
251 |                 "https://github.com/grantjenks/py-tree-sitter-languages#license "
252 |                 "for a list of valid languages."
253 |             )
254 |             raise e
255 | 
256 |         tree = parser.parse(text.encode("utf-8"))
257 |         if not tree.root_node.children or tree.root_node.children[0].type != "ERROR":
258 |             line_spans = self.chunk_tree(tree, text.encode("utf-8"))
259 |             chunks = [line_span.extract_lines(text) for line_span in line_spans]
260 |             return chunks
261 |         else:
262 |             raise ValueError(f"Could not parse code with language {self.language}.")
263 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | 
2 |                                  Apache License
3 |                            Version 2.0, January 2004
4 |                         http://www.apache.org/licenses/
5 | 
6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 | 
8 |    1. Definitions.
9 | 
10 |       "License" shall mean the terms and conditions for use, reproduction,
11 |       and distribution as defined by Sections 1 through 9 of this document.
12 | 
13 |       "Licensor" shall mean the copyright owner or entity authorized by
14 |       the copyright owner that is granting the License.
15 | 
16 |       "Legal Entity" shall mean the union of the acting entity and all
17 |       other entities that control, are controlled by, or are under common
18 |       control with that entity. For the purposes of this definition,
19 |       "control" means (i) the power, direct or indirect, to cause the
20 |       direction or management of such entity, whether by contract or
21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 |       outstanding shares, or (iii) beneficial ownership of such entity.
23 | 
24 |       "You" (or "Your") shall mean an individual or Legal Entity
25 |       exercising permissions granted by this License.
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2023 Definitive Inc. 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
-------------------------------------------------------------------------------- /code_indexer_loop/test_api_dummy_file.py.txt: -------------------------------------------------------------------------------- 1 | """ 2 | Sourced from: cpython/Lib/ast.py (https://github.com/python/cpython) 3 | 4 | ast 5 | ~~~ 6 | 7 | The `ast` module helps Python applications to process trees of the Python 8 | abstract syntax grammar. The abstract syntax itself might change with 9 | each Python release; this module helps to find out programmatically what 10 | the current grammar looks like and allows modifications of it. 11 | 12 | An abstract syntax tree can be generated by passing `ast.PyCF_ONLY_AST` as 13 | a flag to the `compile()` builtin function or by using the `parse()` 14 | function from this module. The result will be a tree of objects whose 15 | classes all inherit from `ast.AST`. 16 | 17 | A modified abstract syntax tree can be compiled into a Python code object 18 | using the built-in `compile()` function. 19 | 20 | Additionally various helper functions are provided that make working with 21 | the trees simpler. The main intention of the helper functions and this 22 | module in general is to provide an easy to use interface for libraries 23 | that work tightly with the python syntax (template engines for example). 24 | 25 | 26 | :copyright: Copyright 2008 by Armin Ronacher. 27 | :license: Python License. 28 | """ 29 | import sys 30 | import re 31 | from _ast import * 32 | from contextlib import contextmanager, nullcontext 33 | from enum import IntEnum, auto, _simple_enum 34 | 35 | 36 | def parse(source, filename='', mode='exec', *, 37 | type_comments=False, feature_version=None, optimize=-1): 38 | """ 39 | Parse the source into an AST node. 40 | Equivalent to compile(source, filename, mode, PyCF_ONLY_AST). 41 | Pass type_comments=True to get back type comments where the syntax allows. 42 | """ 43 | flags = PyCF_ONLY_AST 44 | if optimize > 0: 45 | flags |= PyCF_OPTIMIZED_AST 46 | if type_comments: 47 | flags |= PyCF_TYPE_COMMENTS 48 | if feature_version is None: 49 | feature_version = -1 50 | elif isinstance(feature_version, tuple): 51 | major, minor = feature_version # Should be a 2-tuple. 52 | if major != 3: 53 | raise ValueError(f"Unsupported major version: {major}") 54 | feature_version = minor 55 | # Else it should be an int giving the minor version for 3.x. 56 | return compile(source, filename, mode, flags, 57 | _feature_version=feature_version, optimize=optimize) 58 | 59 | 60 | def literal_eval(node_or_string): 61 | """ 62 | Evaluate an expression node or a string containing only a Python 63 | expression. The string or node provided may only consist of the following 64 | Python literal structures: strings, bytes, numbers, tuples, lists, dicts, 65 | sets, booleans, and None. 66 | 67 | Caution: A complex expression can overflow the C stack and cause a crash. 
68 | """ 69 | if isinstance(node_or_string, str): 70 | node_or_string = parse(node_or_string.lstrip(" \t"), mode='eval') 71 | if isinstance(node_or_string, Expression): 72 | node_or_string = node_or_string.body 73 | def _raise_malformed_node(node): 74 | msg = "malformed node or string" 75 | if lno := getattr(node, 'lineno', None): 76 | msg += f' on line {lno}' 77 | raise ValueError(msg + f': {node!r}') 78 | def _convert_num(node): 79 | if not isinstance(node, Constant) or type(node.value) not in (int, float, complex): 80 | _raise_malformed_node(node) 81 | return node.value 82 | def _convert_signed_num(node): 83 | if isinstance(node, UnaryOp) and isinstance(node.op, (UAdd, USub)): 84 | operand = _convert_num(node.operand) 85 | if isinstance(node.op, UAdd): 86 | return + operand 87 | else: 88 | return - operand 89 | return _convert_num(node) 90 | def _convert(node): 91 | if isinstance(node, Constant): 92 | return node.value 93 | elif isinstance(node, Tuple): 94 | return tuple(map(_convert, node.elts)) 95 | elif isinstance(node, List): 96 | return list(map(_convert, node.elts)) 97 | elif isinstance(node, Set): 98 | return set(map(_convert, node.elts)) 99 | elif (isinstance(node, Call) and isinstance(node.func, Name) and 100 | node.func.id == 'set' and node.args == node.keywords == []): 101 | return set() 102 | elif isinstance(node, Dict): 103 | if len(node.keys) != len(node.values): 104 | _raise_malformed_node(node) 105 | return dict(zip(map(_convert, node.keys), 106 | map(_convert, node.values))) 107 | elif isinstance(node, BinOp) and isinstance(node.op, (Add, Sub)): 108 | left = _convert_signed_num(node.left) 109 | right = _convert_num(node.right) 110 | if isinstance(left, (int, float)) and isinstance(right, complex): 111 | if isinstance(node.op, Add): 112 | return left + right 113 | else: 114 | return left - right 115 | return _convert_signed_num(node) 116 | return _convert(node_or_string) 117 | 118 | 119 | def dump(node, annotate_fields=True, include_attributes=False, *, indent=None): 120 | """ 121 | Return a formatted dump of the tree in node. This is mainly useful for 122 | debugging purposes. If annotate_fields is true (by default), 123 | the returned string will show the names and the values for fields. 124 | If annotate_fields is false, the result string will be more compact by 125 | omitting unambiguous field names. Attributes such as line 126 | numbers and column offsets are not dumped by default. If this is wanted, 127 | include_attributes can be set to true. If indent is a non-negative 128 | integer or string, then the tree will be pretty-printed with that indent 129 | level. None (the default) selects the single line representation. 130 | """ 131 | def _format(node, level=0): 132 | if indent is not None: 133 | level += 1 134 | prefix = '\n' + indent * level 135 | sep = ',\n' + indent * level 136 | else: 137 | prefix = '' 138 | sep = ', ' 139 | if isinstance(node, AST): 140 | cls = type(node) 141 | args = [] 142 | allsimple = True 143 | keywords = annotate_fields 144 | for name in node._fields: 145 | try: 146 | value = getattr(node, name) 147 | except AttributeError: 148 | keywords = True 149 | continue 150 | if value is None and getattr(cls, name, ...) 
is None: 151 | keywords = True 152 | continue 153 | value, simple = _format(value, level) 154 | allsimple = allsimple and simple 155 | if keywords: 156 | args.append('%s=%s' % (name, value)) 157 | else: 158 | args.append(value) 159 | if include_attributes and node._attributes: 160 | for name in node._attributes: 161 | try: 162 | value = getattr(node, name) 163 | except AttributeError: 164 | continue 165 | if value is None and getattr(cls, name, ...) is None: 166 | continue 167 | value, simple = _format(value, level) 168 | allsimple = allsimple and simple 169 | args.append('%s=%s' % (name, value)) 170 | if allsimple and len(args) <= 3: 171 | return '%s(%s)' % (node.__class__.__name__, ', '.join(args)), not args 172 | return '%s(%s%s)' % (node.__class__.__name__, prefix, sep.join(args)), False 173 | elif isinstance(node, list): 174 | if not node: 175 | return '[]', True 176 | return '[%s%s]' % (prefix, sep.join(_format(x, level)[0] for x in node)), False 177 | return repr(node), True 178 | 179 | if not isinstance(node, AST): 180 | raise TypeError('expected AST, got %r' % node.__class__.__name__) 181 | if indent is not None and not isinstance(indent, str): 182 | indent = ' ' * indent 183 | return _format(node)[0] 184 | 185 | 186 | def copy_location(new_node, old_node): 187 | """ 188 | Copy source location (`lineno`, `col_offset`, `end_lineno`, and `end_col_offset` 189 | attributes) from *old_node* to *new_node* if possible, and return *new_node*. 190 | """ 191 | for attr in 'lineno', 'col_offset', 'end_lineno', 'end_col_offset': 192 | if attr in old_node._attributes and attr in new_node._attributes: 193 | value = getattr(old_node, attr, None) 194 | # end_lineno and end_col_offset are optional attributes, and they 195 | # should be copied whether the value is None or not. 196 | if value is not None or ( 197 | hasattr(old_node, attr) and attr.startswith("end_") 198 | ): 199 | setattr(new_node, attr, value) 200 | return new_node 201 | 202 | 203 | def fix_missing_locations(node): 204 | """ 205 | When you compile a node tree with compile(), the compiler expects lineno and 206 | col_offset attributes for every node that supports them. This is rather 207 | tedious to fill in for generated nodes, so this helper adds these attributes 208 | recursively where not already set, by setting them to the values of the 209 | parent node. It works recursively starting at *node*. 210 | """ 211 | def _fix(node, lineno, col_offset, end_lineno, end_col_offset): 212 | if 'lineno' in node._attributes: 213 | if not hasattr(node, 'lineno'): 214 | node.lineno = lineno 215 | else: 216 | lineno = node.lineno 217 | if 'end_lineno' in node._attributes: 218 | if getattr(node, 'end_lineno', None) is None: 219 | node.end_lineno = end_lineno 220 | else: 221 | end_lineno = node.end_lineno 222 | if 'col_offset' in node._attributes: 223 | if not hasattr(node, 'col_offset'): 224 | node.col_offset = col_offset 225 | else: 226 | col_offset = node.col_offset 227 | if 'end_col_offset' in node._attributes: 228 | if getattr(node, 'end_col_offset', None) is None: 229 | node.end_col_offset = end_col_offset 230 | else: 231 | end_col_offset = node.end_col_offset 232 | for child in iter_child_nodes(node): 233 | _fix(child, lineno, col_offset, end_lineno, end_col_offset) 234 | _fix(node, 1, 0, 1, 0) 235 | return node 236 | 237 | 238 | def increment_lineno(node, n=1): 239 | """ 240 | Increment the line number and end line number of each node in the tree 241 | starting at *node* by *n*. 
This is useful to "move code" to a different 242 | location in a file. 243 | """ 244 | for child in walk(node): 245 | # TypeIgnore is a special case where lineno is not an attribute 246 | # but rather a field of the node itself. 247 | if isinstance(child, TypeIgnore): 248 | child.lineno = getattr(child, 'lineno', 0) + n 249 | continue 250 | 251 | if 'lineno' in child._attributes: 252 | child.lineno = getattr(child, 'lineno', 0) + n 253 | if ( 254 | "end_lineno" in child._attributes 255 | and (end_lineno := getattr(child, "end_lineno", 0)) is not None 256 | ): 257 | child.end_lineno = end_lineno + n 258 | return node 259 | 260 | 261 | def iter_fields(node): 262 | """ 263 | Yield a tuple of ``(fieldname, value)`` for each field in ``node._fields`` 264 | that is present on *node*. 265 | """ 266 | for field in node._fields: 267 | try: 268 | yield field, getattr(node, field) 269 | except AttributeError: 270 | pass 271 | 272 | 273 | def iter_child_nodes(node): 274 | """ 275 | Yield all direct child nodes of *node*, that is, all fields that are nodes 276 | and all items of fields that are lists of nodes. 277 | """ 278 | for name, field in iter_fields(node): 279 | if isinstance(field, AST): 280 | yield field 281 | elif isinstance(field, list): 282 | for item in field: 283 | if isinstance(item, AST): 284 | yield item 285 | 286 | 287 | def get_docstring(node, clean=True): 288 | """ 289 | Return the docstring for the given node or None if no docstring can 290 | be found. If the node provided does not have docstrings a TypeError 291 | will be raised. 292 | 293 | If *clean* is `True`, all tabs are expanded to spaces and any whitespace 294 | that can be uniformly removed from the second line onwards is removed. 295 | """ 296 | if not isinstance(node, (AsyncFunctionDef, FunctionDef, ClassDef, Module)): 297 | raise TypeError("%r can't have docstrings" % node.__class__.__name__) 298 | if not(node.body and isinstance(node.body[0], Expr)): 299 | return None 300 | node = node.body[0].value 301 | if isinstance(node, Constant) and isinstance(node.value, str): 302 | text = node.value 303 | else: 304 | return None 305 | if clean: 306 | import inspect 307 | text = inspect.cleandoc(text) 308 | return text 309 | 310 | 311 | _line_pattern = re.compile(r"(.*?(?:\r\n|\n|\r|$))") 312 | def _splitlines_no_ff(source, maxlines=None): 313 | """Split a string into lines ignoring form feed and other chars. 314 | 315 | This mimics how the Python parser splits source code. 316 | """ 317 | lines = [] 318 | for lineno, match in enumerate(_line_pattern.finditer(source), 1): 319 | if maxlines is not None and lineno > maxlines: 320 | break 321 | lines.append(match[0]) 322 | return lines 323 | 324 | 325 | def _pad_whitespace(source): 326 | r"""Replace all chars except '\f\t' in a line with spaces.""" 327 | result = '' 328 | for c in source: 329 | if c in '\f\t': 330 | result += c 331 | else: 332 | result += ' ' 333 | return result 334 | 335 | 336 | def get_source_segment(source, node, *, padded=False): 337 | """Get source code segment of the *source* that generated *node*. 338 | 339 | If some location information (`lineno`, `end_lineno`, `col_offset`, 340 | or `end_col_offset`) is missing, return None. 341 | 342 | If *padded* is `True`, the first line of a multi-line statement will 343 | be padded with spaces to match its original position. 
344 | """ 345 | try: 346 | if node.end_lineno is None or node.end_col_offset is None: 347 | return None 348 | lineno = node.lineno - 1 349 | end_lineno = node.end_lineno - 1 350 | col_offset = node.col_offset 351 | end_col_offset = node.end_col_offset 352 | except AttributeError: 353 | return None 354 | 355 | lines = _splitlines_no_ff(source, maxlines=end_lineno+1) 356 | if end_lineno == lineno: 357 | return lines[lineno].encode()[col_offset:end_col_offset].decode() 358 | 359 | if padded: 360 | padding = _pad_whitespace(lines[lineno].encode()[:col_offset].decode()) 361 | else: 362 | padding = '' 363 | 364 | first = padding + lines[lineno].encode()[col_offset:].decode() 365 | last = lines[end_lineno].encode()[:end_col_offset].decode() 366 | lines = lines[lineno+1:end_lineno] 367 | 368 | lines.insert(0, first) 369 | lines.append(last) 370 | return ''.join(lines) 371 | 372 | 373 | def walk(node): 374 | """ 375 | Recursively yield all descendant nodes in the tree starting at *node* 376 | (including *node* itself), in no specified order. This is useful if you 377 | only want to modify nodes in place and don't care about the context. 378 | """ 379 | from collections import deque 380 | todo = deque([node]) 381 | while todo: 382 | node = todo.popleft() 383 | todo.extend(iter_child_nodes(node)) 384 | yield node 385 | 386 | 387 | class NodeVisitor(object): 388 | """ 389 | A node visitor base class that walks the abstract syntax tree and calls a 390 | visitor function for every node found. This function may return a value 391 | which is forwarded by the `visit` method. 392 | 393 | This class is meant to be subclassed, with the subclass adding visitor 394 | methods. 395 | 396 | Per default the visitor functions for the nodes are ``'visit_'`` + 397 | class name of the node. So a `TryFinally` node visit function would 398 | be `visit_TryFinally`. This behavior can be changed by overriding 399 | the `visit` method. If no visitor function exists for a node 400 | (return value `None`) the `generic_visit` visitor is used instead. 401 | 402 | Don't use the `NodeVisitor` if you want to apply changes to nodes during 403 | traversing. For this a special visitor exists (`NodeTransformer`) that 404 | allows modifications. 
405 | """ 406 | 407 | def visit(self, node): 408 | """Visit a node.""" 409 | method = 'visit_' + node.__class__.__name__ 410 | visitor = getattr(self, method, self.generic_visit) 411 | return visitor(node) 412 | 413 | def generic_visit(self, node): 414 | """Called if no explicit visitor function exists for a node.""" 415 | for field, value in iter_fields(node): 416 | if isinstance(value, list): 417 | for item in value: 418 | if isinstance(item, AST): 419 | self.visit(item) 420 | elif isinstance(value, AST): 421 | self.visit(value) 422 | 423 | def visit_Constant(self, node): 424 | value = node.value 425 | type_name = _const_node_type_names.get(type(value)) 426 | if type_name is None: 427 | for cls, name in _const_node_type_names.items(): 428 | if isinstance(value, cls): 429 | type_name = name 430 | break 431 | if type_name is not None: 432 | method = 'visit_' + type_name 433 | try: 434 | visitor = getattr(self, method) 435 | except AttributeError: 436 | pass 437 | else: 438 | import warnings 439 | warnings.warn(f"{method} is deprecated; add visit_Constant", 440 | DeprecationWarning, 2) 441 | return visitor(node) 442 | return self.generic_visit(node) 443 | 444 | 445 | class NodeTransformer(NodeVisitor): 446 | """ 447 | A :class:`NodeVisitor` subclass that walks the abstract syntax tree and 448 | allows modification of nodes. 449 | 450 | The `NodeTransformer` will walk the AST and use the return value of the 451 | visitor methods to replace or remove the old node. If the return value of 452 | the visitor method is ``None``, the node will be removed from its location, 453 | otherwise it is replaced with the return value. The return value may be the 454 | original node in which case no replacement takes place. 455 | 456 | Here is an example transformer that rewrites all occurrences of name lookups 457 | (``foo``) to ``data['foo']``:: 458 | 459 | class RewriteName(NodeTransformer): 460 | 461 | def visit_Name(self, node): 462 | return Subscript( 463 | value=Name(id='data', ctx=Load()), 464 | slice=Constant(value=node.id), 465 | ctx=node.ctx 466 | ) 467 | 468 | Keep in mind that if the node you're operating on has child nodes you must 469 | either transform the child nodes yourself or call the :meth:`generic_visit` 470 | method for the node first. 471 | 472 | For nodes that were part of a collection of statements (that applies to all 473 | statement nodes), the visitor may also return a list of nodes rather than 474 | just a single node. 
475 | 476 | Usually you use the transformer like this:: 477 | 478 | node = YourTransformer().visit(node) 479 | """ 480 | 481 | def generic_visit(self, node): 482 | for field, old_value in iter_fields(node): 483 | if isinstance(old_value, list): 484 | new_values = [] 485 | for value in old_value: 486 | if isinstance(value, AST): 487 | value = self.visit(value) 488 | if value is None: 489 | continue 490 | elif not isinstance(value, AST): 491 | new_values.extend(value) 492 | continue 493 | new_values.append(value) 494 | old_value[:] = new_values 495 | elif isinstance(old_value, AST): 496 | new_node = self.visit(old_value) 497 | if new_node is None: 498 | delattr(node, field) 499 | else: 500 | setattr(node, field, new_node) 501 | return node 502 | 503 | 504 | _DEPRECATED_VALUE_ALIAS_MESSAGE = ( 505 | "{name} is deprecated and will be removed in Python {remove}; use value instead" 506 | ) 507 | _DEPRECATED_CLASS_MESSAGE = ( 508 | "{name} is deprecated and will be removed in Python {remove}; " 509 | "use ast.Constant instead" 510 | ) 511 | 512 | 513 | # If the ast module is loaded more than once, only add deprecated methods once 514 | if not hasattr(Constant, 'n'): 515 | # The following code is for backward compatibility. 516 | # It will be removed in future. 517 | 518 | def _n_getter(self): 519 | """Deprecated. Use value instead.""" 520 | import warnings 521 | warnings._deprecated( 522 | "Attribute n", message=_DEPRECATED_VALUE_ALIAS_MESSAGE, remove=(3, 14) 523 | ) 524 | return self.value 525 | 526 | def _n_setter(self, value): 527 | import warnings 528 | warnings._deprecated( 529 | "Attribute n", message=_DEPRECATED_VALUE_ALIAS_MESSAGE, remove=(3, 14) 530 | ) 531 | self.value = value 532 | 533 | def _s_getter(self): 534 | """Deprecated. Use value instead.""" 535 | import warnings 536 | warnings._deprecated( 537 | "Attribute s", message=_DEPRECATED_VALUE_ALIAS_MESSAGE, remove=(3, 14) 538 | ) 539 | return self.value 540 | 541 | def _s_setter(self, value): 542 | import warnings 543 | warnings._deprecated( 544 | "Attribute s", message=_DEPRECATED_VALUE_ALIAS_MESSAGE, remove=(3, 14) 545 | ) 546 | self.value = value 547 | 548 | Constant.n = property(_n_getter, _n_setter) 549 | Constant.s = property(_s_getter, _s_setter) 550 | 551 | class _ABC(type): 552 | 553 | def __init__(cls, *args): 554 | cls.__doc__ = """Deprecated AST node class. 
Use ast.Constant instead""" 555 | 556 | def __instancecheck__(cls, inst): 557 | if cls in _const_types: 558 | import warnings 559 | warnings._deprecated( 560 | f"ast.{cls.__qualname__}", 561 | message=_DEPRECATED_CLASS_MESSAGE, 562 | remove=(3, 14) 563 | ) 564 | if not isinstance(inst, Constant): 565 | return False 566 | if cls in _const_types: 567 | try: 568 | value = inst.value 569 | except AttributeError: 570 | return False 571 | else: 572 | return ( 573 | isinstance(value, _const_types[cls]) and 574 | not isinstance(value, _const_types_not.get(cls, ())) 575 | ) 576 | return type.__instancecheck__(cls, inst) 577 | 578 | def _new(cls, *args, **kwargs): 579 | for key in kwargs: 580 | if key not in cls._fields: 581 | # arbitrary keyword arguments are accepted 582 | continue 583 | pos = cls._fields.index(key) 584 | if pos < len(args): 585 | raise TypeError(f"{cls.__name__} got multiple values for argument {key!r}") 586 | if cls in _const_types: 587 | import warnings 588 | warnings._deprecated( 589 | f"ast.{cls.__qualname__}", message=_DEPRECATED_CLASS_MESSAGE, remove=(3, 14) 590 | ) 591 | return Constant(*args, **kwargs) 592 | return Constant.__new__(cls, *args, **kwargs) 593 | 594 | class Num(Constant, metaclass=_ABC): 595 | _fields = ('n',) 596 | __new__ = _new 597 | 598 | class Str(Constant, metaclass=_ABC): 599 | _fields = ('s',) 600 | __new__ = _new 601 | 602 | class Bytes(Constant, metaclass=_ABC): 603 | _fields = ('s',) 604 | __new__ = _new 605 | 606 | class NameConstant(Constant, metaclass=_ABC): 607 | __new__ = _new 608 | 609 | class Ellipsis(Constant, metaclass=_ABC): 610 | _fields = () 611 | 612 | def __new__(cls, *args, **kwargs): 613 | if cls is _ast_Ellipsis: 614 | import warnings 615 | warnings._deprecated( 616 | "ast.Ellipsis", message=_DEPRECATED_CLASS_MESSAGE, remove=(3, 14) 617 | ) 618 | return Constant(..., *args, **kwargs) 619 | return Constant.__new__(cls, *args, **kwargs) 620 | 621 | # Keep another reference to Ellipsis in the global namespace 622 | # so it can be referenced in Ellipsis.__new__ 623 | # (The original "Ellipsis" name is removed from the global namespace later on) 624 | _ast_Ellipsis = Ellipsis 625 | 626 | _const_types = { 627 | Num: (int, float, complex), 628 | Str: (str,), 629 | Bytes: (bytes,), 630 | NameConstant: (type(None), bool), 631 | Ellipsis: (type(...),), 632 | } 633 | _const_types_not = { 634 | Num: (bool,), 635 | } 636 | 637 | _const_node_type_names = { 638 | bool: 'NameConstant', # should be before int 639 | type(None): 'NameConstant', 640 | int: 'Num', 641 | float: 'Num', 642 | complex: 'Num', 643 | str: 'Str', 644 | bytes: 'Bytes', 645 | type(...): 'Ellipsis', 646 | } 647 | 648 | class slice(AST): 649 | """Deprecated AST node class.""" 650 | 651 | class Index(slice): 652 | """Deprecated AST node class. Use the index value directly instead.""" 653 | def __new__(cls, value, **kwargs): 654 | return value 655 | 656 | class ExtSlice(slice): 657 | """Deprecated AST node class. Use ast.Tuple instead.""" 658 | def __new__(cls, dims=(), **kwargs): 659 | return Tuple(list(dims), Load(), **kwargs) 660 | 661 | # If the ast module is loaded more than once, only add deprecated methods once 662 | if not hasattr(Tuple, 'dims'): 663 | # The following code is for backward compatibility. 664 | # It will be removed in future. 665 | 666 | def _dims_getter(self): 667 | """Deprecated. 
Use elts instead.""" 668 | return self.elts 669 | 670 | def _dims_setter(self, value): 671 | self.elts = value 672 | 673 | Tuple.dims = property(_dims_getter, _dims_setter) 674 | 675 | class Suite(mod): 676 | """Deprecated AST node class. Unused in Python 3.""" 677 | 678 | class AugLoad(expr_context): 679 | """Deprecated AST node class. Unused in Python 3.""" 680 | 681 | class AugStore(expr_context): 682 | """Deprecated AST node class. Unused in Python 3.""" 683 | 684 | class Param(expr_context): 685 | """Deprecated AST node class. Unused in Python 3.""" 686 | 687 | 688 | # Large float and imaginary literals get turned into infinities in the AST. 689 | # We unparse those infinities to INFSTR. 690 | _INFSTR = "1e" + repr(sys.float_info.max_10_exp + 1) 691 | 692 | @_simple_enum(IntEnum) 693 | class _Precedence: 694 | """Precedence table that originated from python grammar.""" 695 | 696 | NAMED_EXPR = auto() # := 697 | TUPLE = auto() # , 698 | YIELD = auto() # 'yield', 'yield from' 699 | TEST = auto() # 'if'-'else', 'lambda' 700 | OR = auto() # 'or' 701 | AND = auto() # 'and' 702 | NOT = auto() # 'not' 703 | CMP = auto() # '<', '>', '==', '>=', '<=', '!=', 704 | # 'in', 'not in', 'is', 'is not' 705 | EXPR = auto() 706 | BOR = EXPR # '|' 707 | BXOR = auto() # '^' 708 | BAND = auto() # '&' 709 | SHIFT = auto() # '<<', '>>' 710 | ARITH = auto() # '+', '-' 711 | TERM = auto() # '*', '@', '/', '%', '//' 712 | FACTOR = auto() # unary '+', '-', '~' 713 | POWER = auto() # '**' 714 | AWAIT = auto() # 'await' 715 | ATOM = auto() 716 | 717 | def next(self): 718 | try: 719 | return self.__class__(self + 1) 720 | except ValueError: 721 | return self 722 | 723 | 724 | _SINGLE_QUOTES = ("'", '"') 725 | _MULTI_QUOTES = ('"""', "'''") 726 | _ALL_QUOTES = (*_SINGLE_QUOTES, *_MULTI_QUOTES) 727 | 728 | class _Unparser(NodeVisitor): 729 | """Methods in this class recursively traverse an AST and 730 | output source code for the abstract syntax; original formatting 731 | is disregarded.""" 732 | 733 | def __init__(self, *, _avoid_backslashes=False): 734 | self._source = [] 735 | self._precedences = {} 736 | self._type_ignores = {} 737 | self._indent = 0 738 | self._avoid_backslashes = _avoid_backslashes 739 | self._in_try_star = False 740 | 741 | def interleave(self, inter, f, seq): 742 | """Call f on each item in seq, calling inter() in between.""" 743 | seq = iter(seq) 744 | try: 745 | f(next(seq)) 746 | except StopIteration: 747 | pass 748 | else: 749 | for x in seq: 750 | inter() 751 | f(x) 752 | 753 | def items_view(self, traverser, items): 754 | """Traverse and separate the given *items* with a comma and append it to 755 | the buffer. 
If *items* is a single item sequence, a trailing comma 756 | will be added.""" 757 | if len(items) == 1: 758 | traverser(items[0]) 759 | self.write(",") 760 | else: 761 | self.interleave(lambda: self.write(", "), traverser, items) 762 | 763 | def maybe_newline(self): 764 | """Adds a newline if it isn't the start of generated source""" 765 | if self._source: 766 | self.write("\n") 767 | 768 | def fill(self, text=""): 769 | """Indent a piece of text and append it, according to the current 770 | indentation level""" 771 | self.maybe_newline() 772 | self.write(" " * self._indent + text) 773 | 774 | def write(self, *text): 775 | """Add new source parts""" 776 | self._source.extend(text) 777 | 778 | @contextmanager 779 | def buffered(self, buffer = None): 780 | if buffer is None: 781 | buffer = [] 782 | 783 | original_source = self._source 784 | self._source = buffer 785 | yield buffer 786 | self._source = original_source 787 | 788 | @contextmanager 789 | def block(self, *, extra = None): 790 | """A context manager for preparing the source for blocks. It adds 791 | the character':', increases the indentation on enter and decreases 792 | the indentation on exit. If *extra* is given, it will be directly 793 | appended after the colon character. 794 | """ 795 | self.write(":") 796 | if extra: 797 | self.write(extra) 798 | self._indent += 1 799 | yield 800 | self._indent -= 1 801 | 802 | @contextmanager 803 | def delimit(self, start, end): 804 | """A context manager for preparing the source for expressions. It adds 805 | *start* to the buffer and enters, after exit it adds *end*.""" 806 | 807 | self.write(start) 808 | yield 809 | self.write(end) 810 | 811 | def delimit_if(self, start, end, condition): 812 | if condition: 813 | return self.delimit(start, end) 814 | else: 815 | return nullcontext() 816 | 817 | def require_parens(self, precedence, node): 818 | """Shortcut to adding precedence related parens""" 819 | return self.delimit_if("(", ")", self.get_precedence(node) > precedence) 820 | 821 | def get_precedence(self, node): 822 | return self._precedences.get(node, _Precedence.TEST) 823 | 824 | def set_precedence(self, precedence, *nodes): 825 | for node in nodes: 826 | self._precedences[node] = precedence 827 | 828 | def get_raw_docstring(self, node): 829 | """If a docstring node is found in the body of the *node* parameter, 830 | return that docstring node, None otherwise. 
831 | 832 | Logic mirrored from ``_PyAST_GetDocString``.""" 833 | if not isinstance( 834 | node, (AsyncFunctionDef, FunctionDef, ClassDef, Module) 835 | ) or len(node.body) < 1: 836 | return None 837 | node = node.body[0] 838 | if not isinstance(node, Expr): 839 | return None 840 | node = node.value 841 | if isinstance(node, Constant) and isinstance(node.value, str): 842 | return node 843 | 844 | def get_type_comment(self, node): 845 | comment = self._type_ignores.get(node.lineno) or node.type_comment 846 | if comment is not None: 847 | return f" # type: {comment}" 848 | 849 | def traverse(self, node): 850 | if isinstance(node, list): 851 | for item in node: 852 | self.traverse(item) 853 | else: 854 | super().visit(node) 855 | 856 | # Note: as visit() resets the output text, do NOT rely on 857 | # NodeVisitor.generic_visit to handle any nodes (as it calls back in to 858 | # the subclass visit() method, which resets self._source to an empty list) 859 | def visit(self, node): 860 | """Outputs a source code string that, if converted back to an ast 861 | (using ast.parse) will generate an AST equivalent to *node*""" 862 | self._source = [] 863 | self.traverse(node) 864 | return "".join(self._source) 865 | 866 | def _write_docstring_and_traverse_body(self, node): 867 | if (docstring := self.get_raw_docstring(node)): 868 | self._write_docstring(docstring) 869 | self.traverse(node.body[1:]) 870 | else: 871 | self.traverse(node.body) 872 | 873 | def visit_Module(self, node): 874 | self._type_ignores = { 875 | ignore.lineno: f"ignore{ignore.tag}" 876 | for ignore in node.type_ignores 877 | } 878 | self._write_docstring_and_traverse_body(node) 879 | self._type_ignores.clear() 880 | 881 | def visit_FunctionType(self, node): 882 | with self.delimit("(", ")"): 883 | self.interleave( 884 | lambda: self.write(", "), self.traverse, node.argtypes 885 | ) 886 | 887 | self.write(" -> ") 888 | self.traverse(node.returns) 889 | 890 | def visit_Expr(self, node): 891 | self.fill() 892 | self.set_precedence(_Precedence.YIELD, node.value) 893 | self.traverse(node.value) 894 | 895 | def visit_NamedExpr(self, node): 896 | with self.require_parens(_Precedence.NAMED_EXPR, node): 897 | self.set_precedence(_Precedence.ATOM, node.target, node.value) 898 | self.traverse(node.target) 899 | self.write(" := ") 900 | self.traverse(node.value) 901 | 902 | def visit_Import(self, node): 903 | self.fill("import ") 904 | self.interleave(lambda: self.write(", "), self.traverse, node.names) 905 | 906 | def visit_ImportFrom(self, node): 907 | self.fill("from ") 908 | self.write("." 
* (node.level or 0)) 909 | if node.module: 910 | self.write(node.module) 911 | self.write(" import ") 912 | self.interleave(lambda: self.write(", "), self.traverse, node.names) 913 | 914 | def visit_Assign(self, node): 915 | self.fill() 916 | for target in node.targets: 917 | self.set_precedence(_Precedence.TUPLE, target) 918 | self.traverse(target) 919 | self.write(" = ") 920 | self.traverse(node.value) 921 | if type_comment := self.get_type_comment(node): 922 | self.write(type_comment) 923 | 924 | def visit_AugAssign(self, node): 925 | self.fill() 926 | self.traverse(node.target) 927 | self.write(" " + self.binop[node.op.__class__.__name__] + "= ") 928 | self.traverse(node.value) 929 | 930 | def visit_AnnAssign(self, node): 931 | self.fill() 932 | with self.delimit_if("(", ")", not node.simple and isinstance(node.target, Name)): 933 | self.traverse(node.target) 934 | self.write(": ") 935 | self.traverse(node.annotation) 936 | if node.value: 937 | self.write(" = ") 938 | self.traverse(node.value) 939 | 940 | def visit_Return(self, node): 941 | self.fill("return") 942 | if node.value: 943 | self.write(" ") 944 | self.traverse(node.value) 945 | 946 | def visit_Pass(self, node): 947 | self.fill("pass") 948 | 949 | def visit_Break(self, node): 950 | self.fill("break") 951 | 952 | def visit_Continue(self, node): 953 | self.fill("continue") 954 | 955 | def visit_Delete(self, node): 956 | self.fill("del ") 957 | self.interleave(lambda: self.write(", "), self.traverse, node.targets) 958 | 959 | def visit_Assert(self, node): 960 | self.fill("assert ") 961 | self.traverse(node.test) 962 | if node.msg: 963 | self.write(", ") 964 | self.traverse(node.msg) 965 | 966 | def visit_Global(self, node): 967 | self.fill("global ") 968 | self.interleave(lambda: self.write(", "), self.write, node.names) 969 | 970 | def visit_Nonlocal(self, node): 971 | self.fill("nonlocal ") 972 | self.interleave(lambda: self.write(", "), self.write, node.names) 973 | 974 | def visit_Await(self, node): 975 | with self.require_parens(_Precedence.AWAIT, node): 976 | self.write("await") 977 | if node.value: 978 | self.write(" ") 979 | self.set_precedence(_Precedence.ATOM, node.value) 980 | self.traverse(node.value) 981 | 982 | def visit_Yield(self, node): 983 | with self.require_parens(_Precedence.YIELD, node): 984 | self.write("yield") 985 | if node.value: 986 | self.write(" ") 987 | self.set_precedence(_Precedence.ATOM, node.value) 988 | self.traverse(node.value) 989 | 990 | def visit_YieldFrom(self, node): 991 | with self.require_parens(_Precedence.YIELD, node): 992 | self.write("yield from ") 993 | if not node.value: 994 | raise ValueError("Node can't be used without a value attribute.") 995 | self.set_precedence(_Precedence.ATOM, node.value) 996 | self.traverse(node.value) 997 | 998 | def visit_Raise(self, node): 999 | self.fill("raise") 1000 | if not node.exc: 1001 | if node.cause: 1002 | raise ValueError(f"Node can't use cause without an exception.") 1003 | return 1004 | self.write(" ") 1005 | self.traverse(node.exc) 1006 | if node.cause: 1007 | self.write(" from ") 1008 | self.traverse(node.cause) 1009 | 1010 | def do_visit_try(self, node): 1011 | self.fill("try") 1012 | with self.block(): 1013 | self.traverse(node.body) 1014 | for ex in node.handlers: 1015 | self.traverse(ex) 1016 | if node.orelse: 1017 | self.fill("else") 1018 | with self.block(): 1019 | self.traverse(node.orelse) 1020 | if node.finalbody: 1021 | self.fill("finally") 1022 | with self.block(): 1023 | self.traverse(node.finalbody) 1024 | 1025 | def 
visit_Try(self, node): 1026 | prev_in_try_star = self._in_try_star 1027 | try: 1028 | self._in_try_star = False 1029 | self.do_visit_try(node) 1030 | finally: 1031 | self._in_try_star = prev_in_try_star 1032 | 1033 | def visit_TryStar(self, node): 1034 | prev_in_try_star = self._in_try_star 1035 | try: 1036 | self._in_try_star = True 1037 | self.do_visit_try(node) 1038 | finally: 1039 | self._in_try_star = prev_in_try_star 1040 | 1041 | def visit_ExceptHandler(self, node): 1042 | self.fill("except*" if self._in_try_star else "except") 1043 | if node.type: 1044 | self.write(" ") 1045 | self.traverse(node.type) 1046 | if node.name: 1047 | self.write(" as ") 1048 | self.write(node.name) 1049 | with self.block(): 1050 | self.traverse(node.body) 1051 | 1052 | def visit_ClassDef(self, node): 1053 | self.maybe_newline() 1054 | for deco in node.decorator_list: 1055 | self.fill("@") 1056 | self.traverse(deco) 1057 | self.fill("class " + node.name) 1058 | if hasattr(node, "type_params"): 1059 | self._type_params_helper(node.type_params) 1060 | with self.delimit_if("(", ")", condition = node.bases or node.keywords): 1061 | comma = False 1062 | for e in node.bases: 1063 | if comma: 1064 | self.write(", ") 1065 | else: 1066 | comma = True 1067 | self.traverse(e) 1068 | for e in node.keywords: 1069 | if comma: 1070 | self.write(", ") 1071 | else: 1072 | comma = True 1073 | self.traverse(e) 1074 | 1075 | with self.block(): 1076 | self._write_docstring_and_traverse_body(node) 1077 | 1078 | def visit_FunctionDef(self, node): 1079 | self._function_helper(node, "def") 1080 | 1081 | def visit_AsyncFunctionDef(self, node): 1082 | self._function_helper(node, "async def") 1083 | 1084 | def _function_helper(self, node, fill_suffix): 1085 | self.maybe_newline() 1086 | for deco in node.decorator_list: 1087 | self.fill("@") 1088 | self.traverse(deco) 1089 | def_str = fill_suffix + " " + node.name 1090 | self.fill(def_str) 1091 | if hasattr(node, "type_params"): 1092 | self._type_params_helper(node.type_params) 1093 | with self.delimit("(", ")"): 1094 | self.traverse(node.args) 1095 | if node.returns: 1096 | self.write(" -> ") 1097 | self.traverse(node.returns) 1098 | with self.block(extra=self.get_type_comment(node)): 1099 | self._write_docstring_and_traverse_body(node) 1100 | 1101 | def _type_params_helper(self, type_params): 1102 | if type_params is not None and len(type_params) > 0: 1103 | with self.delimit("[", "]"): 1104 | self.interleave(lambda: self.write(", "), self.traverse, type_params) 1105 | 1106 | def visit_TypeVar(self, node): 1107 | self.write(node.name) 1108 | if node.bound: 1109 | self.write(": ") 1110 | self.traverse(node.bound) 1111 | 1112 | def visit_TypeVarTuple(self, node): 1113 | self.write("*" + node.name) 1114 | 1115 | def visit_ParamSpec(self, node): 1116 | self.write("**" + node.name) 1117 | 1118 | def visit_TypeAlias(self, node): 1119 | self.fill("type ") 1120 | self.traverse(node.name) 1121 | self._type_params_helper(node.type_params) 1122 | self.write(" = ") 1123 | self.traverse(node.value) 1124 | 1125 | def visit_For(self, node): 1126 | self._for_helper("for ", node) 1127 | 1128 | def visit_AsyncFor(self, node): 1129 | self._for_helper("async for ", node) 1130 | 1131 | def _for_helper(self, fill, node): 1132 | self.fill(fill) 1133 | self.set_precedence(_Precedence.TUPLE, node.target) 1134 | self.traverse(node.target) 1135 | self.write(" in ") 1136 | self.traverse(node.iter) 1137 | with self.block(extra=self.get_type_comment(node)): 1138 | self.traverse(node.body) 1139 | if node.orelse: 
1140 | self.fill("else") 1141 | with self.block(): 1142 | self.traverse(node.orelse) 1143 | 1144 | def visit_If(self, node): 1145 | self.fill("if ") 1146 | self.traverse(node.test) 1147 | with self.block(): 1148 | self.traverse(node.body) 1149 | # collapse nested ifs into equivalent elifs. 1150 | while node.orelse and len(node.orelse) == 1 and isinstance(node.orelse[0], If): 1151 | node = node.orelse[0] 1152 | self.fill("elif ") 1153 | self.traverse(node.test) 1154 | with self.block(): 1155 | self.traverse(node.body) 1156 | # final else 1157 | if node.orelse: 1158 | self.fill("else") 1159 | with self.block(): 1160 | self.traverse(node.orelse) 1161 | 1162 | def visit_While(self, node): 1163 | self.fill("while ") 1164 | self.traverse(node.test) 1165 | with self.block(): 1166 | self.traverse(node.body) 1167 | if node.orelse: 1168 | self.fill("else") 1169 | with self.block(): 1170 | self.traverse(node.orelse) 1171 | 1172 | def visit_With(self, node): 1173 | self.fill("with ") 1174 | self.interleave(lambda: self.write(", "), self.traverse, node.items) 1175 | with self.block(extra=self.get_type_comment(node)): 1176 | self.traverse(node.body) 1177 | 1178 | def visit_AsyncWith(self, node): 1179 | self.fill("async with ") 1180 | self.interleave(lambda: self.write(", "), self.traverse, node.items) 1181 | with self.block(extra=self.get_type_comment(node)): 1182 | self.traverse(node.body) 1183 | 1184 | def _str_literal_helper( 1185 | self, string, *, quote_types=_ALL_QUOTES, escape_special_whitespace=False 1186 | ): 1187 | """Helper for writing string literals, minimizing escapes. 1188 | Returns the tuple (string literal to write, possible quote types). 1189 | """ 1190 | def escape_char(c): 1191 | # \n and \t are non-printable, but we only escape them if 1192 | # escape_special_whitespace is True 1193 | if not escape_special_whitespace and c in "\n\t": 1194 | return c 1195 | # Always escape backslashes and other non-printable characters 1196 | if c == "\\" or not c.isprintable(): 1197 | return c.encode("unicode_escape").decode("ascii") 1198 | return c 1199 | 1200 | escaped_string = "".join(map(escape_char, string)) 1201 | possible_quotes = quote_types 1202 | if "\n" in escaped_string: 1203 | possible_quotes = [q for q in possible_quotes if q in _MULTI_QUOTES] 1204 | possible_quotes = [q for q in possible_quotes if q not in escaped_string] 1205 | if not possible_quotes: 1206 | # If there aren't any possible_quotes, fallback to using repr 1207 | # on the original string. Try to use a quote from quote_types, 1208 | # e.g., so that we use triple quotes for docstrings. 
1209 | string = repr(string) 1210 | quote = next((q for q in quote_types if string[0] in q), string[0]) 1211 | return string[1:-1], [quote] 1212 | if escaped_string: 1213 | # Sort so that we prefer '''"''' over """\"""" 1214 | possible_quotes.sort(key=lambda q: q[0] == escaped_string[-1]) 1215 | # If we're using triple quotes and we'd need to escape a final 1216 | # quote, escape it 1217 | if possible_quotes[0][0] == escaped_string[-1]: 1218 | assert len(possible_quotes[0]) == 3 1219 | escaped_string = escaped_string[:-1] + "\\" + escaped_string[-1] 1220 | return escaped_string, possible_quotes 1221 | 1222 | def _write_str_avoiding_backslashes(self, string, *, quote_types=_ALL_QUOTES): 1223 | """Write string literal value with a best effort attempt to avoid backslashes.""" 1224 | string, quote_types = self._str_literal_helper(string, quote_types=quote_types) 1225 | quote_type = quote_types[0] 1226 | self.write(f"{quote_type}{string}{quote_type}") 1227 | 1228 | def visit_JoinedStr(self, node): 1229 | self.write("f") 1230 | 1231 | fstring_parts = [] 1232 | for value in node.values: 1233 | with self.buffered() as buffer: 1234 | self._write_fstring_inner(value) 1235 | fstring_parts.append( 1236 | ("".join(buffer), isinstance(value, Constant)) 1237 | ) 1238 | 1239 | new_fstring_parts = [] 1240 | quote_types = list(_ALL_QUOTES) 1241 | for value, is_constant in fstring_parts: 1242 | if is_constant: 1243 | value, quote_types = self._str_literal_helper( 1244 | value, 1245 | quote_types=quote_types, 1246 | escape_special_whitespace=True, 1247 | ) 1248 | elif "\n" in value: 1249 | quote_types = [q for q in quote_types if q in _MULTI_QUOTES] 1250 | new_fstring_parts.append(value) 1251 | 1252 | value = "".join(new_fstring_parts) 1253 | quote_type = quote_types[0] 1254 | self.write(f"{quote_type}{value}{quote_type}") 1255 | 1256 | def _write_fstring_inner(self, node): 1257 | if isinstance(node, JoinedStr): 1258 | # for both the f-string itself, and format_spec 1259 | for value in node.values: 1260 | self._write_fstring_inner(value) 1261 | elif isinstance(node, Constant) and isinstance(node.value, str): 1262 | value = node.value.replace("{", "{{").replace("}", "}}") 1263 | self.write(value) 1264 | elif isinstance(node, FormattedValue): 1265 | self.visit_FormattedValue(node) 1266 | else: 1267 | raise ValueError(f"Unexpected node inside JoinedStr, {node!r}") 1268 | 1269 | def visit_FormattedValue(self, node): 1270 | def unparse_inner(inner): 1271 | unparser = type(self)() 1272 | unparser.set_precedence(_Precedence.TEST.next(), inner) 1273 | return unparser.visit(inner) 1274 | 1275 | with self.delimit("{", "}"): 1276 | expr = unparse_inner(node.value) 1277 | if expr.startswith("{"): 1278 | # Separate pair of opening brackets as "{ {" 1279 | self.write(" ") 1280 | self.write(expr) 1281 | if node.conversion != -1: 1282 | self.write(f"!{chr(node.conversion)}") 1283 | if node.format_spec: 1284 | self.write(":") 1285 | self._write_fstring_inner(node.format_spec) 1286 | 1287 | def visit_Name(self, node): 1288 | self.write(node.id) 1289 | 1290 | def _write_docstring(self, node): 1291 | self.fill() 1292 | if node.kind == "u": 1293 | self.write("u") 1294 | self._write_str_avoiding_backslashes(node.value, quote_types=_MULTI_QUOTES) 1295 | 1296 | def _write_constant(self, value): 1297 | if isinstance(value, (float, complex)): 1298 | # Substitute overflowing decimal literal for AST infinities, 1299 | # and inf - inf for NaNs. 
1300 | self.write( 1301 | repr(value) 1302 | .replace("inf", _INFSTR) 1303 | .replace("nan", f"({_INFSTR}-{_INFSTR})") 1304 | ) 1305 | elif self._avoid_backslashes and isinstance(value, str): 1306 | self._write_str_avoiding_backslashes(value) 1307 | else: 1308 | self.write(repr(value)) 1309 | 1310 | def visit_Constant(self, node): 1311 | value = node.value 1312 | if isinstance(value, tuple): 1313 | with self.delimit("(", ")"): 1314 | self.items_view(self._write_constant, value) 1315 | elif value is ...: 1316 | self.write("...") 1317 | else: 1318 | if node.kind == "u": 1319 | self.write("u") 1320 | self._write_constant(node.value) 1321 | 1322 | def visit_List(self, node): 1323 | with self.delimit("[", "]"): 1324 | self.interleave(lambda: self.write(", "), self.traverse, node.elts) 1325 | 1326 | def visit_ListComp(self, node): 1327 | with self.delimit("[", "]"): 1328 | self.traverse(node.elt) 1329 | for gen in node.generators: 1330 | self.traverse(gen) 1331 | 1332 | def visit_GeneratorExp(self, node): 1333 | with self.delimit("(", ")"): 1334 | self.traverse(node.elt) 1335 | for gen in node.generators: 1336 | self.traverse(gen) 1337 | 1338 | def visit_SetComp(self, node): 1339 | with self.delimit("{", "}"): 1340 | self.traverse(node.elt) 1341 | for gen in node.generators: 1342 | self.traverse(gen) 1343 | 1344 | def visit_DictComp(self, node): 1345 | with self.delimit("{", "}"): 1346 | self.traverse(node.key) 1347 | self.write(": ") 1348 | self.traverse(node.value) 1349 | for gen in node.generators: 1350 | self.traverse(gen) 1351 | 1352 | def visit_comprehension(self, node): 1353 | if node.is_async: 1354 | self.write(" async for ") 1355 | else: 1356 | self.write(" for ") 1357 | self.set_precedence(_Precedence.TUPLE, node.target) 1358 | self.traverse(node.target) 1359 | self.write(" in ") 1360 | self.set_precedence(_Precedence.TEST.next(), node.iter, *node.ifs) 1361 | self.traverse(node.iter) 1362 | for if_clause in node.ifs: 1363 | self.write(" if ") 1364 | self.traverse(if_clause) 1365 | 1366 | def visit_IfExp(self, node): 1367 | with self.require_parens(_Precedence.TEST, node): 1368 | self.set_precedence(_Precedence.TEST.next(), node.body, node.test) 1369 | self.traverse(node.body) 1370 | self.write(" if ") 1371 | self.traverse(node.test) 1372 | self.write(" else ") 1373 | self.set_precedence(_Precedence.TEST, node.orelse) 1374 | self.traverse(node.orelse) 1375 | 1376 | def visit_Set(self, node): 1377 | if node.elts: 1378 | with self.delimit("{", "}"): 1379 | self.interleave(lambda: self.write(", "), self.traverse, node.elts) 1380 | else: 1381 | # `{}` would be interpreted as a dictionary literal, and 1382 | # `set` might be shadowed. 
Thus: 1383 | self.write('{*()}') 1384 | 1385 | def visit_Dict(self, node): 1386 | def write_key_value_pair(k, v): 1387 | self.traverse(k) 1388 | self.write(": ") 1389 | self.traverse(v) 1390 | 1391 | def write_item(item): 1392 | k, v = item 1393 | if k is None: 1394 | # for dictionary unpacking operator in dicts {**{'y': 2}} 1395 | # see PEP 448 for details 1396 | self.write("**") 1397 | self.set_precedence(_Precedence.EXPR, v) 1398 | self.traverse(v) 1399 | else: 1400 | write_key_value_pair(k, v) 1401 | 1402 | with self.delimit("{", "}"): 1403 | self.interleave( 1404 | lambda: self.write(", "), write_item, zip(node.keys, node.values) 1405 | ) 1406 | 1407 | def visit_Tuple(self, node): 1408 | with self.delimit_if( 1409 | "(", 1410 | ")", 1411 | len(node.elts) == 0 or self.get_precedence(node) > _Precedence.TUPLE 1412 | ): 1413 | self.items_view(self.traverse, node.elts) 1414 | 1415 | unop = {"Invert": "~", "Not": "not", "UAdd": "+", "USub": "-"} 1416 | unop_precedence = { 1417 | "not": _Precedence.NOT, 1418 | "~": _Precedence.FACTOR, 1419 | "+": _Precedence.FACTOR, 1420 | "-": _Precedence.FACTOR, 1421 | } 1422 | 1423 | def visit_UnaryOp(self, node): 1424 | operator = self.unop[node.op.__class__.__name__] 1425 | operator_precedence = self.unop_precedence[operator] 1426 | with self.require_parens(operator_precedence, node): 1427 | self.write(operator) 1428 | # factor prefixes (+, -, ~) shouldn't be separated 1429 | # from the value they belong, (e.g: +1 instead of + 1) 1430 | if operator_precedence is not _Precedence.FACTOR: 1431 | self.write(" ") 1432 | self.set_precedence(operator_precedence, node.operand) 1433 | self.traverse(node.operand) 1434 | 1435 | binop = { 1436 | "Add": "+", 1437 | "Sub": "-", 1438 | "Mult": "*", 1439 | "MatMult": "@", 1440 | "Div": "/", 1441 | "Mod": "%", 1442 | "LShift": "<<", 1443 | "RShift": ">>", 1444 | "BitOr": "|", 1445 | "BitXor": "^", 1446 | "BitAnd": "&", 1447 | "FloorDiv": "//", 1448 | "Pow": "**", 1449 | } 1450 | 1451 | binop_precedence = { 1452 | "+": _Precedence.ARITH, 1453 | "-": _Precedence.ARITH, 1454 | "*": _Precedence.TERM, 1455 | "@": _Precedence.TERM, 1456 | "/": _Precedence.TERM, 1457 | "%": _Precedence.TERM, 1458 | "<<": _Precedence.SHIFT, 1459 | ">>": _Precedence.SHIFT, 1460 | "|": _Precedence.BOR, 1461 | "^": _Precedence.BXOR, 1462 | "&": _Precedence.BAND, 1463 | "//": _Precedence.TERM, 1464 | "**": _Precedence.POWER, 1465 | } 1466 | 1467 | binop_rassoc = frozenset(("**",)) 1468 | def visit_BinOp(self, node): 1469 | operator = self.binop[node.op.__class__.__name__] 1470 | operator_precedence = self.binop_precedence[operator] 1471 | with self.require_parens(operator_precedence, node): 1472 | if operator in self.binop_rassoc: 1473 | left_precedence = operator_precedence.next() 1474 | right_precedence = operator_precedence 1475 | else: 1476 | left_precedence = operator_precedence 1477 | right_precedence = operator_precedence.next() 1478 | 1479 | self.set_precedence(left_precedence, node.left) 1480 | self.traverse(node.left) 1481 | self.write(f" {operator} ") 1482 | self.set_precedence(right_precedence, node.right) 1483 | self.traverse(node.right) 1484 | 1485 | cmpops = { 1486 | "Eq": "==", 1487 | "NotEq": "!=", 1488 | "Lt": "<", 1489 | "LtE": "<=", 1490 | "Gt": ">", 1491 | "GtE": ">=", 1492 | "Is": "is", 1493 | "IsNot": "is not", 1494 | "In": "in", 1495 | "NotIn": "not in", 1496 | } 1497 | 1498 | def visit_Compare(self, node): 1499 | with self.require_parens(_Precedence.CMP, node): 1500 | self.set_precedence(_Precedence.CMP.next(), node.left, 
*node.comparators) 1501 | self.traverse(node.left) 1502 | for o, e in zip(node.ops, node.comparators): 1503 | self.write(" " + self.cmpops[o.__class__.__name__] + " ") 1504 | self.traverse(e) 1505 | 1506 | boolops = {"And": "and", "Or": "or"} 1507 | boolop_precedence = {"and": _Precedence.AND, "or": _Precedence.OR} 1508 | 1509 | def visit_BoolOp(self, node): 1510 | operator = self.boolops[node.op.__class__.__name__] 1511 | operator_precedence = self.boolop_precedence[operator] 1512 | 1513 | def increasing_level_traverse(node): 1514 | nonlocal operator_precedence 1515 | operator_precedence = operator_precedence.next() 1516 | self.set_precedence(operator_precedence, node) 1517 | self.traverse(node) 1518 | 1519 | with self.require_parens(operator_precedence, node): 1520 | s = f" {operator} " 1521 | self.interleave(lambda: self.write(s), increasing_level_traverse, node.values) 1522 | 1523 | def visit_Attribute(self, node): 1524 | self.set_precedence(_Precedence.ATOM, node.value) 1525 | self.traverse(node.value) 1526 | # Special case: 3.__abs__() is a syntax error, so if node.value 1527 | # is an integer literal then we need to either parenthesize 1528 | # it or add an extra space to get 3 .__abs__(). 1529 | if isinstance(node.value, Constant) and isinstance(node.value.value, int): 1530 | self.write(" ") 1531 | self.write(".") 1532 | self.write(node.attr) 1533 | 1534 | def visit_Call(self, node): 1535 | self.set_precedence(_Precedence.ATOM, node.func) 1536 | self.traverse(node.func) 1537 | with self.delimit("(", ")"): 1538 | comma = False 1539 | for e in node.args: 1540 | if comma: 1541 | self.write(", ") 1542 | else: 1543 | comma = True 1544 | self.traverse(e) 1545 | for e in node.keywords: 1546 | if comma: 1547 | self.write(", ") 1548 | else: 1549 | comma = True 1550 | self.traverse(e) 1551 | 1552 | def visit_Subscript(self, node): 1553 | def is_non_empty_tuple(slice_value): 1554 | return ( 1555 | isinstance(slice_value, Tuple) 1556 | and slice_value.elts 1557 | ) 1558 | 1559 | self.set_precedence(_Precedence.ATOM, node.value) 1560 | self.traverse(node.value) 1561 | with self.delimit("[", "]"): 1562 | if is_non_empty_tuple(node.slice): 1563 | # parentheses can be omitted if the tuple isn't empty 1564 | self.items_view(self.traverse, node.slice.elts) 1565 | else: 1566 | self.traverse(node.slice) 1567 | 1568 | def visit_Starred(self, node): 1569 | self.write("*") 1570 | self.set_precedence(_Precedence.EXPR, node.value) 1571 | self.traverse(node.value) 1572 | 1573 | def visit_Ellipsis(self, node): 1574 | self.write("...") 1575 | 1576 | def visit_Slice(self, node): 1577 | if node.lower: 1578 | self.traverse(node.lower) 1579 | self.write(":") 1580 | if node.upper: 1581 | self.traverse(node.upper) 1582 | if node.step: 1583 | self.write(":") 1584 | self.traverse(node.step) 1585 | 1586 | def visit_Match(self, node): 1587 | self.fill("match ") 1588 | self.traverse(node.subject) 1589 | with self.block(): 1590 | for case in node.cases: 1591 | self.traverse(case) 1592 | 1593 | def visit_arg(self, node): 1594 | self.write(node.arg) 1595 | if node.annotation: 1596 | self.write(": ") 1597 | self.traverse(node.annotation) 1598 | 1599 | def visit_arguments(self, node): 1600 | first = True 1601 | # normal arguments 1602 | all_args = node.posonlyargs + node.args 1603 | defaults = [None] * (len(all_args) - len(node.defaults)) + node.defaults 1604 | for index, elements in enumerate(zip(all_args, defaults), 1): 1605 | a, d = elements 1606 | if first: 1607 | first = False 1608 | else: 1609 | self.write(", ") 1610 | 
self.traverse(a) 1611 | if d: 1612 | self.write("=") 1613 | self.traverse(d) 1614 | if index == len(node.posonlyargs): 1615 | self.write(", /") 1616 | 1617 | # varargs, or bare '*' if no varargs but keyword-only arguments present 1618 | if node.vararg or node.kwonlyargs: 1619 | if first: 1620 | first = False 1621 | else: 1622 | self.write(", ") 1623 | self.write("*") 1624 | if node.vararg: 1625 | self.write(node.vararg.arg) 1626 | if node.vararg.annotation: 1627 | self.write(": ") 1628 | self.traverse(node.vararg.annotation) 1629 | 1630 | # keyword-only arguments 1631 | if node.kwonlyargs: 1632 | for a, d in zip(node.kwonlyargs, node.kw_defaults): 1633 | self.write(", ") 1634 | self.traverse(a) 1635 | if d: 1636 | self.write("=") 1637 | self.traverse(d) 1638 | 1639 | # kwargs 1640 | if node.kwarg: 1641 | if first: 1642 | first = False 1643 | else: 1644 | self.write(", ") 1645 | self.write("**" + node.kwarg.arg) 1646 | if node.kwarg.annotation: 1647 | self.write(": ") 1648 | self.traverse(node.kwarg.annotation) 1649 | 1650 | def visit_keyword(self, node): 1651 | if node.arg is None: 1652 | self.write("**") 1653 | else: 1654 | self.write(node.arg) 1655 | self.write("=") 1656 | self.traverse(node.value) 1657 | 1658 | def visit_Lambda(self, node): 1659 | with self.require_parens(_Precedence.TEST, node): 1660 | self.write("lambda") 1661 | with self.buffered() as buffer: 1662 | self.traverse(node.args) 1663 | if buffer: 1664 | self.write(" ", *buffer) 1665 | self.write(": ") 1666 | self.set_precedence(_Precedence.TEST, node.body) 1667 | self.traverse(node.body) 1668 | 1669 | def visit_alias(self, node): 1670 | self.write(node.name) 1671 | if node.asname: 1672 | self.write(" as " + node.asname) 1673 | 1674 | def visit_withitem(self, node): 1675 | self.traverse(node.context_expr) 1676 | if node.optional_vars: 1677 | self.write(" as ") 1678 | self.traverse(node.optional_vars) 1679 | 1680 | def visit_match_case(self, node): 1681 | self.fill("case ") 1682 | self.traverse(node.pattern) 1683 | if node.guard: 1684 | self.write(" if ") 1685 | self.traverse(node.guard) 1686 | with self.block(): 1687 | self.traverse(node.body) 1688 | 1689 | def visit_MatchValue(self, node): 1690 | self.traverse(node.value) 1691 | 1692 | def visit_MatchSingleton(self, node): 1693 | self._write_constant(node.value) 1694 | 1695 | def visit_MatchSequence(self, node): 1696 | with self.delimit("[", "]"): 1697 | self.interleave( 1698 | lambda: self.write(", "), self.traverse, node.patterns 1699 | ) 1700 | 1701 | def visit_MatchStar(self, node): 1702 | name = node.name 1703 | if name is None: 1704 | name = "_" 1705 | self.write(f"*{name}") 1706 | 1707 | def visit_MatchMapping(self, node): 1708 | def write_key_pattern_pair(pair): 1709 | k, p = pair 1710 | self.traverse(k) 1711 | self.write(": ") 1712 | self.traverse(p) 1713 | 1714 | with self.delimit("{", "}"): 1715 | keys = node.keys 1716 | self.interleave( 1717 | lambda: self.write(", "), 1718 | write_key_pattern_pair, 1719 | zip(keys, node.patterns, strict=True), 1720 | ) 1721 | rest = node.rest 1722 | if rest is not None: 1723 | if keys: 1724 | self.write(", ") 1725 | self.write(f"**{rest}") 1726 | 1727 | def visit_MatchClass(self, node): 1728 | self.set_precedence(_Precedence.ATOM, node.cls) 1729 | self.traverse(node.cls) 1730 | with self.delimit("(", ")"): 1731 | patterns = node.patterns 1732 | self.interleave( 1733 | lambda: self.write(", "), self.traverse, patterns 1734 | ) 1735 | attrs = node.kwd_attrs 1736 | if attrs: 1737 | def write_attr_pattern(pair): 1738 | attr, 
pattern = pair 1739 | self.write(f"{attr}=") 1740 | self.traverse(pattern) 1741 | 1742 | if patterns: 1743 | self.write(", ") 1744 | self.interleave( 1745 | lambda: self.write(", "), 1746 | write_attr_pattern, 1747 | zip(attrs, node.kwd_patterns, strict=True), 1748 | ) 1749 | 1750 | def visit_MatchAs(self, node): 1751 | name = node.name 1752 | pattern = node.pattern 1753 | if name is None: 1754 | self.write("_") 1755 | elif pattern is None: 1756 | self.write(node.name) 1757 | else: 1758 | with self.require_parens(_Precedence.TEST, node): 1759 | self.set_precedence(_Precedence.BOR, node.pattern) 1760 | self.traverse(node.pattern) 1761 | self.write(f" as {node.name}") 1762 | 1763 | def visit_MatchOr(self, node): 1764 | with self.require_parens(_Precedence.BOR, node): 1765 | self.set_precedence(_Precedence.BOR.next(), *node.patterns) 1766 | self.interleave(lambda: self.write(" | "), self.traverse, node.patterns) 1767 | 1768 | def unparse(ast_obj): 1769 | unparser = _Unparser() 1770 | return unparser.visit(ast_obj) 1771 | 1772 | 1773 | _deprecated_globals = { 1774 | name: globals().pop(name) 1775 | for name in ('Num', 'Str', 'Bytes', 'NameConstant', 'Ellipsis') 1776 | } 1777 | 1778 | def __getattr__(name): 1779 | if name in _deprecated_globals: 1780 | globals()[name] = value = _deprecated_globals[name] 1781 | import warnings 1782 | warnings._deprecated( 1783 | f"ast.{name}", message=_DEPRECATED_CLASS_MESSAGE, remove=(3, 14) 1784 | ) 1785 | return value 1786 | raise AttributeError(f"module 'ast' has no attribute '{name}'") 1787 | 1788 | 1789 | def main(): 1790 | import argparse 1791 | 1792 | parser = argparse.ArgumentParser(prog='python -m ast') 1793 | parser.add_argument('infile', type=argparse.FileType(mode='rb'), nargs='?', 1794 | default='-', 1795 | help='the file to parse; defaults to stdin') 1796 | parser.add_argument('-m', '--mode', default='exec', 1797 | choices=('exec', 'single', 'eval', 'func_type'), 1798 | help='specify what kind of code must be parsed') 1799 | parser.add_argument('--no-type-comments', default=True, action='store_false', 1800 | help="don't add information about type comments") 1801 | parser.add_argument('-a', '--include-attributes', action='store_true', 1802 | help='include attributes such as line numbers and ' 1803 | 'column offsets') 1804 | parser.add_argument('-i', '--indent', type=int, default=3, 1805 | help='indentation of nodes (number of spaces)') 1806 | args = parser.parse_args() 1807 | 1808 | with args.infile as infile: 1809 | source = infile.read() 1810 | tree = parse(source, args.infile.name, args.mode, type_comments=args.no_type_comments) 1811 | print(dump(tree, include_attributes=args.include_attributes, indent=args.indent)) 1812 | 1813 | if __name__ == '__main__': 1814 | main() --------------------------------------------------------------------------------
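The `literal_eval` helper dumped above accepts only Python literal structures (strings, bytes, numbers, tuples, lists, dicts, sets, booleans, and None) and raises ValueError for anything else. A minimal sketch of that contract, exercised against the real standard-library ast module that this fixture mirrors; the evaluated strings are arbitrary illustrations:

import ast

# Literals, including nested containers and complex-number arithmetic like 3+4j,
# are evaluated safely.
print(ast.literal_eval("{'a': [1, 2.5, 3+4j], 'b': (True, None)}"))

# Names, calls, and other non-literal nodes are rejected.
try:
    ast.literal_eval("__import__('os').system('echo hi')")
except ValueError as exc:
    print(exc)  # malformed node or string on line 1: <ast.Call ...>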
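Likewise, the `dump` helper's indent parameter (Python 3.9+) switches from the default single-line representation to a pretty-printed tree; a small sketch:

import ast

# Pretty-print the AST of a one-line assignment; indent=None would keep it on one line.
print(ast.dump(ast.parse("x = 1"), indent=4))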
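The RewriteName example in the `NodeTransformer` docstring above runs end to end with `parse`, `fix_missing_locations`, and `unparse` (ast.unparse requires Python 3.9+); here `data`, `result`, `x`, and `y` are placeholder names:

import ast

class RewriteName(ast.NodeTransformer):
    def visit_Name(self, node):
        # Replace every name lookup `foo` with the subscript data['foo'],
        # carrying over the original node's source location.
        return ast.copy_location(
            ast.Subscript(
                value=ast.Name(id="data", ctx=ast.Load()),
                slice=ast.Constant(value=node.id),
                ctx=node.ctx,
            ),
            node,
        )

tree = ast.fix_missing_locations(RewriteName().visit(ast.parse("result = x + y")))
print(ast.unparse(tree))  # data['result'] = data['x'] + data['y']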
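Finally, `get_source_segment` (Python 3.8+) recovers the exact source slice behind a node with location information, which is what makes location-aware tooling possible; `compute`, `price`, and `tax_rate` are made-up identifiers:

import ast

source = "total = compute(price, tax_rate)\n"
call = ast.parse(source).body[0].value  # the compute(...) Call node
print(ast.get_source_segment(source, call))  # compute(price, tax_rate)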