├── LICENSE.txt ├── README.md ├── pyproject.toml └── src ├── __init__.py └── llmap ├── __init__.py ├── cache.py ├── client.py ├── deepseek_v3_tokenizer ├── __init__.py ├── tokenizer.json └── tokenizer_config.json ├── exceptions.py ├── llmap.py ├── parse.py ├── prompts.py └── queries ├── c_sharp └── skeleton.scm ├── java └── skeleton.scm └── python └── skeleton.scm /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | 204 | 205 | APACHE CASSANDRA SUBCOMPONENTS: 206 | 207 | Apache Cassandra includes a number of subcomponents with 208 | separate copyright notices and license terms. Your use of the source 209 | code for these subcomponents is subject to the terms and 210 | conditions of the following licenses. 211 | 212 | This product bundles a file (AbstractIterator.java) from Chronicle-Bytes, 213 | copyright higherfrequencytrading.com, which is available under an 214 | Apache License version 2. 
215 | 216 | This product bundles a file (AbstractIterator.java) from Guava, 217 | copyright The Guava Authors, which is available under an 218 | Apache License version 2. 219 | 220 | This product bundles a file (LongTimSort.java) from Android libcore, 221 | copyright The Android Open Source Project, which is available under an 222 | Apache License version 2. 223 | 224 | This product bundles several files (LongTimSort.java) from PATRICIA Trie 225 | copyright Roger Kapsi and Sam Berlin, which is available under an 226 | Apache License version 2. 227 | 228 | This product bundles a file (VIntCoding.java) from Protocol Buffers 229 | copyright Google Inc., which is available under a BSD license. 230 | 231 | This product bundles material adapted from Cassandra, The Definitive Guide. 232 | Published by O'Reilly Media, Inc. Copyright Jeff Carpenter and Eben Hewitt 233 | and used with their permission. 234 | 235 | This product bundles The Project Gutenberg EBook of Adventures of 236 | Huckleberry Finn, Complete by Mark Twain (Samuel Clemens), which is in 237 | the public domain. 238 | 239 | This product bundles code (internalOffer) that is written by Doug Lea and 240 | Martin Buchholz available under a Creative Commons zero license. 241 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Context extraction at scale 2 | 3 | Tools like Aider and Cursor are great at editing code for you once you give them the right context. But 4 | [finding that context automatically is largely an unsolved problem](https://spyced.blogspot.com/2024/12/the-missing-piece-in-ai-coding.html), 5 | especially in large codebases. 6 | 7 | LLMap is a CLI code search tool designed to solve that problem by asking 8 | Gemini Flash (preferred) or DeepSeek V3 to evaluate the relevance of each source file 9 | in your codebase to your problem. 10 | 11 | Until recently, this would be prohibitively expensive and slow. But these models are not only 12 | smart and fast, but also cheap enough to search large codebases exhaustively without worrying about the price. 13 | 14 | LLMap also structures its requests to take advantage of DeepSeek's automatic caching. This means that repeated 15 | searches against the same files will be [faster and less expensive](https://api-docs.deepseek.com/guides/kv_cache). 16 | (It would be possible to support this for Gemini as well, but Gemini's caching is not automatic and costs extra.) 17 | 18 | Finally, LLMap uses a multi-stage analysis to avoid spending more time 19 | than necessary on obviously irrelevant files. LLMap performs 3 stages of analysis: 20 | 1. Coarse analysis using code skeletons [Flash/V3] 21 | 2. Full source analysis of potentially relevant files from (1) [Flash/V3] 22 | 3. Refine the output of (2) to only the most relevant snippets [Pro/R1] 23 | 24 | ## Limitations 25 | 26 | Currently only Java, Python, and C# files are supported by the skeletonization pass. 27 | LLMap will process other source files, but it will perform full source analysis on all of them, 28 | which will be slower. 29 | 30 | [Extending the parsing to other languages](https://github.com/jbellis/llmap/blob/master/src/llmap/parse.py) 31 | is straightforward; contributions are welcome.
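As a rough illustration of what such an extension involves (a hypothetical sketch, not code that exists in this repo): `parse.py` maps file extensions to tree-sitter language names in its `QUERIES` dict, and each language needs a matching `queries/<lang>/skeleton.scm` capture file. Adding Go support might look something like the following, assuming `tree_sitter_languages` ships a `go` grammar:

```python
# Hypothetical sketch of extending parse.py to a new language (not part of the current repo).
# Assumes tree_sitter_languages provides a 'go' grammar and that a Go-specific
# queries/go/skeleton.scm is written with the same capture names the existing queries use
# (@class.declaration, @method.declaration, @field.declaration, ...).
QUERIES = {
    '.java': 'java',
    '.py': 'python',
    '.cs': 'c_sharp',
    '.go': 'go',  # new mapping; the skeleton query would live at queries/go/skeleton.scm
}
```

Skeleton extraction and chunking pick up the new mapping through `parseable_extension` and `get_query`; the capture names in the query file determine what ends up in the skeleton.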
32 | 33 | ## Installation 34 | 35 | ```bash 36 | pip install llmap-ai 37 | ``` 38 | 39 | Get a Gemini API key from [ai.google.dev](https://ai.google.dev/) 40 | or a DeepSeek API key from [platform.deepseek.com](https://platform.deepseek.com) 41 | or an OpenRouter API key from [openrouter.ai](https://openrouter.ai) 42 | 43 | ## Usage 44 | 45 | ```bash 46 | export GEMINI_API_KEY=YYY # or DEEPSEEK_API_KEY if using DeepSeek, OPENROUTER_API_KEY if using Open Router 47 | 48 | 49 | find src/ -name "*.java" | llmap "Where is the database connection configured?" 50 | ``` 51 | 52 | LLM APIs are not super reliable, so LLMap caches LLM responses in `~/.cache/llmap` 53 | so that you don't have to start over from scratch if you get rate limited or run into another hiccup. 54 | (This also means that if you want to check the raw, unrefined output [see below], you won't have to 55 | reprocess the search.) 56 | 57 | ## Output 58 | 59 | LLMap prints the most relevant context found to stdout. You can save this to a file and send it to Aider 60 | or attach it to a conversation with your favorite AI chat tool. 61 | 62 | Errors are logged to stderr. 63 | 64 | ## Didn't find what you were looking for? 65 | 66 | First, try passing `--no-refine`. While the refine step is usually helpful in filtering out the noise 67 | (thus taking up less of your context window), sometimes it's too aggressive. 68 | 69 | You can also try passing `--no-skeletons` in case the model was too conservative in its initial filtering. 70 | 71 | Finally, try rephrasing your question with more clues for the LLM to latch onto. Like any information 72 | retrieval tool, sometimes the way you ask can make a big difference. 73 | - Worse: "How can I add a WITH clause to the CQL SELECT statement?" 74 | - Better: "How can I add a WITH clause to the CQL SELECT statement? It will be used for adding query planning hints like which index to use." 75 | 76 | ## Options 77 | 78 | Command-line parameters: 79 | ``` 80 | --sample SAMPLE Number of random files to sample from the input set 81 | --llm-concurrency LLM_CONCURRENCY 82 | Maximum number of concurrent LLM requests 83 | --no-refine Skip refinement and combination of analyses 84 | --no-skeletons Skip skeleton analysis phase for all files 85 | ``` 86 | 87 | Environment variables: 88 | ``` 89 | LLMAP_CACHE none|read|write|read/write 90 | LLMAP_ANALYZE_MODEL deepseek-chat|deepseek-reasoner 91 | LLMAP_REFINE_MODEL deepseek-chat|deepseek-reasoner 92 | ``` 93 | 94 | Open Router models: 95 | - *Currently, only Gemini and DeepSeek models are accessible through the OpenRouter connection.* 96 | - *To use Gemini through OpenRouter, set LLMAP_ANALYZE_MODEL and LLMAP_REFINE_MODEL each to one of the models below.
Defaults to deepseek.* 97 | 98 | ``` 99 | deepseek/deepseek-chat|deepseek/deepseek-r1 100 | google/gemini-flash-1.5|google/gemini-2.0-flash-001|google/gemini-pro-1.5|google/gemini-2.0-pro-exp-02-05:free 101 | ``` -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "llmap-ai" 7 | version = "1.1.0" 8 | authors = [ 9 | { name = "Jonathan Ellis", email = "jbellis@gmail.com" }, 10 | ] 11 | description = "High performance code search for large codebases" 12 | readme = "README.md" 13 | requires-python = ">=3.10" 14 | classifiers = [ 15 | "Programming Language :: Python :: 3", 16 | "License :: OSI Approved :: Apache Software License", 17 | "Operating System :: OS Independent", 18 | ] 19 | dependencies = [ 20 | "openai>=1.58.1,<2.0.0", 21 | "transformers>=4.47.1", 22 | "dbutils>=3.1.0", 23 | "tree-sitter==0.21.3", 24 | "tree-sitter-languages==1.10.2", 25 | "tree-sitter-c-sharp", 26 | "tqdm", 27 | ] 28 | 29 | [project.scripts] 30 | llmap = "llmap.llmap:main" 31 | 32 | [project.urls] 33 | "Homepage" = "https://github.com/jbellis/llmap" 34 | "Issues" = "https://github.com/jbellis/llmap/issues" 35 | 36 | [tool.setuptools.package-data] 37 | llmap = [ 38 | "queries/*/skeleton.scm", 39 | "deepseek_v3_tokenizer/*.json" 40 | ] 41 | 42 | include = ["llmap*"] 43 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jbellis/llmap/36e9399b9de09ebf6114e25018276d85fa8d301d/src/__init__.py -------------------------------------------------------------------------------- /src/llmap/__init__.py: -------------------------------------------------------------------------------- 1 | from .llmap import main 2 | 3 | __version__ = "0.1" 4 | -------------------------------------------------------------------------------- /src/llmap/cache.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager, closing 2 | import sqlite3 3 | import json 4 | from pathlib import Path 5 | from dbutils.pooled_db import PooledDB 6 | 7 | class Cache: 8 | """ 9 | A cache implementation that uses a pooled SQLite connection. 10 | """ 11 | def __init__(self): 12 | cache_dir = Path.home() / ".cache" / "llmap" 13 | cache_dir.mkdir(parents=True, exist_ok=True) 14 | self.db_path = str(cache_dir / "cache.db") 15 | # Create a pool with a maximum of 10 connections and disable thread check. 16 | self.pool = PooledDB( 17 | sqlite3, 18 | database=self.db_path, 19 | check_same_thread=False, 20 | maxconnections=10, 21 | blocking=True, 22 | ) 23 | self._init_db() 24 | 25 | def _init_db(self): 26 | """ 27 | Initialize the cache database with the required table. 28 | """ 29 | with self.get_conn() as conn: 30 | with closing(conn.cursor()) as cur: 31 | cur.execute("PRAGMA journal_mode=WAL;") 32 | cur.execute("PRAGMA synchronous = NORMAL;") 33 | cur.execute(""" 34 | CREATE TABLE IF NOT EXISTS responses ( 35 | cache_key TEXT PRIMARY KEY, 36 | response TEXT, 37 | timestamp DATETIME DEFAULT CURRENT_TIMESTAMP 38 | ) 39 | """) 40 | conn.commit() 41 | 42 | @contextmanager 43 | def get_conn(self): 44 | """ 45 | Context manager to get a connection from the pool. 
46 | """ 47 | # print("Getting connection for thread " + str(threading.get_ident())) 48 | conn = self.pool.connection() 49 | try: 50 | yield conn 51 | finally: 52 | conn.close() # returns connection to the pool 53 | # print("Closed connection for thread " + str(threading.get_ident())) 54 | 55 | def get(self, cache_key: str) -> dict | None: 56 | """ 57 | Retrieve a cached response by key. 58 | """ 59 | with self.get_conn() as conn: 60 | with closing(conn.cursor()) as cur: 61 | cur.execute( 62 | "SELECT response FROM responses WHERE cache_key = ?", 63 | (cache_key,) 64 | ) 65 | result = cur.fetchone() 66 | if result: 67 | return json.loads(result[0]) 68 | return None 69 | 70 | def set(self, cache_key: str, response: dict): 71 | """ 72 | Cache a response with the given key. 73 | """ 74 | with self.get_conn() as conn: 75 | with closing(conn.cursor()) as cur: 76 | cur.execute( 77 | "INSERT OR REPLACE INTO responses (cache_key, response) VALUES (?, ?)", 78 | (cache_key, json.dumps(response)) 79 | ) 80 | conn.commit() 81 | 82 | def delete(self, cache_key: str): 83 | """ 84 | Remove a cached response by key. 85 | """ 86 | with self.get_conn() as conn: 87 | with closing(conn.cursor()) as cur: 88 | cur.execute("DELETE FROM responses WHERE cache_key = ?", (cache_key,)) 89 | conn.commit() 90 | -------------------------------------------------------------------------------- /src/llmap/client.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import json 3 | import os 4 | import sys 5 | import time 6 | from random import random 7 | from typing import NamedTuple 8 | 9 | import httpx 10 | from openai import OpenAI, BadRequestError, AuthenticationError, PermissionDeniedError, UnprocessableEntityError, \ 11 | RateLimitError, APIError 12 | 13 | from .cache import Cache 14 | from .exceptions import AIRequestException, AITimeoutException 15 | 16 | 17 | class FakeInternalServerError(Exception): 18 | pass 19 | 20 | 21 | class SourceText(NamedTuple): 22 | file_path: str 23 | text: str 24 | 25 | 26 | class CachingClient: 27 | def __init__(self): 28 | # Set up caching based on LLMAP_CACHE env var 29 | cache_mode = os.getenv('LLMAP_CACHE', 'read/write').lower() 30 | if cache_mode not in ['none', 'read', 'write', 'read/write']: 31 | raise ValueError("LLMAP_CACHE must be one of: none, read, write, read/write") 32 | self.cache_mode = cache_mode 33 | self.cache = None if cache_mode == 'none' else Cache() 34 | 35 | # Initialize API configuration 36 | self._setup_api() 37 | 38 | # Progress callback will be set per-phase 39 | self.progress_callback = None 40 | 41 | def _setup_api(self): 42 | openrouter_api_key = os.getenv('OPENROUTER_API_KEY') 43 | deepseek_api_key = os.getenv('DEEPSEEK_API_KEY') 44 | gemini_api_key = os.getenv('GEMINI_API_KEY') 45 | 46 | if not (deepseek_api_key or gemini_api_key or openrouter_api_key): 47 | raise Exception("Either DEEPSEEK_API_KEY, OPENROUTER_API_KEY or GEMINI_API_KEY environment variable must be set") 48 | 49 | if gemini_api_key: 50 | valid_models = {'gemini-1.5-flash', 'gemini-2.0-flash', 'gemini-1.5-pro', 'gemini-2.0-pro-exp-02-05'} 51 | self.api_base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" 52 | self.api_key = gemini_api_key 53 | self.analyze_model = os.getenv('LLMAP_ANALYZE_MODEL', 'gemini-2.0-flash') 54 | self.refine_model = os.getenv('LLMAP_REFINE_MODEL', 'gemini-2.0-pro-exp-02-05') 55 | print("Using Gemini API", file=sys.stderr) 56 | elif deepseek_api_key: 57 | valid_models = {'deepseek-chat', 
'deepseek-reasoner'} 58 | self.api_base_url = "https://api.deepseek.com" 59 | self.api_key = deepseek_api_key 60 | self.analyze_model = os.getenv('LLMAP_ANALYZE_MODEL', 'deepseek-chat') 61 | self.refine_model = os.getenv('LLMAP_REFINE_MODEL', 'deepseek-reasoner') 62 | print("Using DeepSeek API", file=sys.stderr) 63 | else: # Open Router 64 | valid_models = {'deepseek/deepseek-chat', 'deepseek/deepseek-r1', 'google/gemini-flash-1.5', 'google/gemini-2.0-flash-001', 'google/gemini-pro-1.5', 'google/gemini-2.0-pro-exp-02-05:free'} 65 | self.api_base_url = "https://openrouter.ai/api/v1" 66 | self.api_key = openrouter_api_key 67 | self.analyze_model = os.getenv('LLMAP_ANALYZE_MODEL', 'deepseek/deepseek-chat') 68 | self.refine_model = os.getenv('LLMAP_REFINE_MODEL', 'deepseek/deepseek-r1') 69 | print("Using OpenRouter API", file=sys.stderr) 70 | 71 | if self.analyze_model not in valid_models: 72 | raise ValueError(f"LLMAP_ANALYZE_MODEL must be one of: {', '.join(valid_models)}") 73 | if self.refine_model not in valid_models: 74 | raise ValueError(f"LLMAP_REFINE_MODEL must be one of: {', '.join(valid_models)}") 75 | 76 | self.llm_client = OpenAI(api_key=self.api_key, base_url=self.api_base_url) 77 | 78 | def max_tokens(self) -> int: 79 | """Return the maximum tokens allowed for the current API""" 80 | if self.api_base_url == "https://api.deepseek.com": 81 | return 62000 - 8000 # output 8k counts towards 64k limit. Headroom for scaffolding 82 | else: 83 | return 500000 84 | 85 | def ask(self, messages, model, file_path=None): 86 | """Helper method to make requests to the API with error handling, retries and caching""" 87 | # Try to load from cache if reading enabled 88 | cache_key = _make_cache_key(messages, model) 89 | if self.cache and self.cache_mode in ['read', 'read/write']: 90 | cached_data = self.cache.get(cache_key) 91 | if cached_data: 92 | return type('Response', (), { 93 | 'choices': [type('Choice', (), { 94 | 'message': type('Message', (), { 95 | 'content': cached_data['answer'] 96 | }) 97 | })] 98 | }) 99 | 100 | for attempt in range(10): 101 | stream = None 102 | try: 103 | stream = self.llm_client.chat.completions.create( 104 | model=model, 105 | messages=messages, 106 | stream=True, 107 | max_tokens=8000, 108 | ) 109 | 110 | full_content = [] 111 | for chunk in stream: 112 | if chunk.choices[0].delta.content is not None: 113 | delta = chunk.choices[0].delta.content 114 | full_content.append(delta) 115 | 116 | # Update progress based on newlines received 117 | if self.progress_callback: 118 | new_lines = delta.count('\n') 119 | if new_lines > 0: 120 | self.progress_callback(new_lines) 121 | 122 | content = ''.join(full_content) 123 | if not content.strip(): 124 | raise FakeInternalServerError() 125 | 126 | # Save to cache if enabled 127 | if self.cache and self.cache_mode in ['write', 'read/write']: 128 | self.cache.set(cache_key, {'answer': content}) 129 | 130 | # Return mock response object 131 | return type('Response', (), { 132 | 'choices': [type('Choice', (), { 133 | 'message': type('Message', (), { 134 | 'content': content 135 | }) 136 | })] 137 | }) 138 | except (BadRequestError, AuthenticationError, PermissionDeniedError, UnprocessableEntityError) as e: 139 | with open('/tmp/deepseek_error.log', 'a') as f: 140 | print(f"{messages}\n\n->\n{e}", file=f) 141 | raise AIRequestException("Error evaluating source code", file_path, e) 142 | except RateLimitError: 143 | # print("Rate limited, waiting", file=sys.stderr) 144 | time.sleep(5 * random() + 2 ** attempt) 145 | except 
(httpx.RemoteProtocolError, APIError, FakeInternalServerError): 146 | time.sleep(1) 147 | finally: 148 | if stream: 149 | stream.close() 150 | else: 151 | raise AITimeoutException("Repeated timeouts evaluating source code", file_path) 152 | 153 | 154 | def _make_cache_key(messages: list, model: str) -> str: 155 | return hashlib.sha256(json.dumps([messages, model]).encode()).hexdigest() -------------------------------------------------------------------------------- /src/llmap/deepseek_v3_tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | # suppress "None of PyTorch..." warning before importing transformers 3 | os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1' 4 | import transformers 5 | 6 | # Get the directory containing this script 7 | _current_dir = os.path.dirname(os.path.abspath(__file__)) 8 | 9 | tokenizer = transformers.AutoTokenizer.from_pretrained( 10 | _current_dir, trust_remote_code=True 11 | ) 12 | -------------------------------------------------------------------------------- /src/llmap/deepseek_v3_tokenizer/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_bos_token": false, 3 | "add_eos_token": false, 4 | "bos_token": { 5 | "__type": "AddedToken", 6 | "content": "<|begin▁of▁sentence|>", 7 | "lstrip": false, 8 | "normalized": true, 9 | "rstrip": false, 10 | "single_word": false 11 | }, 12 | "clean_up_tokenization_spaces": false, 13 | "eos_token": { 14 | "__type": "AddedToken", 15 | "content": "<|end▁of▁sentence|>", 16 | "lstrip": false, 17 | "normalized": true, 18 | "rstrip": false, 19 | "single_word": false 20 | }, 21 | "legacy": true, 22 | "model_max_length": 65536, 23 | "pad_token": { 24 | "__type": "AddedToken", 25 | "content": "<|end▁of▁sentence|>", 26 | "lstrip": false, 27 | "normalized": true, 28 | "rstrip": false, 29 | "single_word": false 30 | }, 31 | "sp_model_kwargs": {}, 32 | "unk_token": null, 33 | "tokenizer_class": "LlamaTokenizerFast", 34 | "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + 
tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}" 35 | } 36 | -------------------------------------------------------------------------------- /src/llmap/exceptions.py: -------------------------------------------------------------------------------- 1 | class AIException(Exception): 2 | """Base exception class for AI-related errors""" 3 | 4 | def __init__(self, message: str, filename: str = None, original_exception: Exception = None): 5 | """ 6 | Initialize the AIException 7 | 8 | Args: 9 | message: Description of the error 10 | original_exception: The original exception that caused this error, if any 11 | filename: Name of the file being processed when error occurred, if applicable 12 | """ 13 | super().__init__(message) 14 | self.original_exception = original_exception 15 | self.message = message 16 | self.filename = filename 17 | 18 | def __str__(self): 19 | error_str = self.message 20 | if self.filename: 21 | error_str = f"{error_str} [File: {self.filename}]" 22 | if self.original_exception: 23 | error_str = f"{error_str} (Original exception: {str(self.original_exception)})" 24 | return error_str 25 | 26 | class AIRequestException(AIException): 27 | pass 28 | 29 | class AITimeoutException(AIException): 30 | pass -------------------------------------------------------------------------------- /src/llmap/llmap.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | import sys 5 | from collections import defaultdict 6 | from concurrent.futures import ThreadPoolExecutor, as_completed 7 | from typing import Callable, TypeVar 8 | 9 | from tqdm import tqdm 10 | 11 | from .deepseek_v3_tokenizer import tokenizer 12 | from .client import CachingClient, SourceText 13 | from .prompts import multi_skeleton_relevance, full_source_relevance, refine_context 14 | from .exceptions import AIException 15 | from .parse import chunk, parseable_extension, maybe_truncate, extract_skeleton 16 | 17 | T = TypeVar('T') 18 | 19 | # we're using an old tree-sitter API 20 | import warnings 21 | warnings.filterwarnings('ignore', category=FutureWarning, module='tree_sitter') 22 | 23 | def collate(sources: list[SourceText], max_tokens_per_group) -> tuple[list[list[SourceText]], list[SourceText]]: 24 | """ 25 | Group analyses into batches that fit under token limit, and separate out large files. 
26 | 27 | Args: 28 | sources: List of SourceText objects 29 | 30 | Returns: 31 | Tuple of (grouped_analyses, large_files) where: 32 | - grouped_analyses is a list of lists of SourceAnalysis objects, each group under max_tokens 33 | - large_files is a list of SourceAnalysis objects that individually exceed max_tokens 34 | """ 35 | large_files = [] 36 | small_files = [] 37 | 38 | # Separate large and small files 39 | for analysis in sources: 40 | tokens = len(tokenizer.encode(analysis.text)) 41 | if tokens > max_tokens_per_group: 42 | large_files.append(analysis) 43 | else: 44 | small_files.append((analysis, tokens)) 45 | 46 | # Group small files 47 | groups = [] 48 | current_group = [] 49 | current_tokens = 0 50 | 51 | for analysis, tokens in small_files: 52 | if current_tokens + tokens > max_tokens_per_group: 53 | if current_group: # Only append if group has items 54 | groups.append(current_group) 55 | current_group = [analysis] 56 | current_tokens = tokens 57 | else: 58 | current_group.append(analysis) 59 | current_tokens += tokens 60 | 61 | if current_group: # Add final group if it exists 62 | groups.append(current_group) 63 | 64 | return groups, large_files 65 | 66 | def search(question: str, source_files: list[str], llm_concurrency: int = 200, refine: bool = True, analyze_skeletons: bool = True) -> tuple[list[AIException], str]: 67 | """ 68 | Search source files for relevance to a question. 69 | 70 | Args: 71 | question: The question to analyze relevance against 72 | source_files: List of source file paths to analyze 73 | llm_concurrency: Maximum number of concurrent LLM requests 74 | refine: Whether to refine and combine analyses 75 | 76 | Returns: 77 | tuple[list[AIException], str]: A tuple containing: 78 | - List of non-fatal AIException errors encountered during processing 79 | - Formatted string containing the analysis results 80 | """ 81 | # Create AI client and thread pool 82 | client = CachingClient() 83 | 84 | def process_phase( 85 | executor: ThreadPoolExecutor, 86 | items: list[T], 87 | process_fn: Callable[[T], T], 88 | desc: str, 89 | client: CachingClient 90 | ) -> tuple[list[T], list[AIException]]: 91 | """ 92 | Process a batch of items with progress tracking and error handling. 
93 | 94 | Args: 95 | executor: Thread pool executor 96 | items: List of items to process 97 | process_fn: Function to process each item 98 | desc: Description for progress bar 99 | client: AI client for progress tracking 100 | 101 | Returns: 102 | tuple of (results, errors) where: 103 | - results is a list of successfully processed items 104 | - errors is a list of AIException errors encountered 105 | """ 106 | results = [] 107 | errors = [] 108 | tqdm_postfix = {"Rcvd": 0} 109 | futures = [executor.submit(process_fn, item) for item in items] 110 | 111 | with tqdm(total=len(futures), desc=desc, 112 | bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt}{postfix}') as pbar: 113 | def cb(n_lines): 114 | tqdm_postfix['Rcvd'] += n_lines 115 | pbar.set_postfix(tqdm_postfix) 116 | client.progress_callback = cb 117 | 118 | try: 119 | for future in as_completed(futures): 120 | try: 121 | results.append(future.result()) 122 | except AIException as e: 123 | errors.append(e) 124 | pbar.update(1) 125 | except Exception as e: 126 | # Cancel all remaining futures 127 | for future in futures: 128 | future.cancel() 129 | # Re-raise the exception 130 | raise e 131 | 132 | return results, errors 133 | 134 | # Create thread pool and process files 135 | errors = [] 136 | relevant_files = [] 137 | with ThreadPoolExecutor(max_workers=llm_concurrency) as executor: 138 | # Split files by whether we can parse a skeleton (unless disabled) 139 | if not analyze_skeletons: 140 | parseable_files = set() 141 | else: 142 | parseable_files = {f for f in source_files if parseable_extension(f)} 143 | other_files = [f for f in source_files if f not in parseable_files] 144 | 145 | # Phase 1: Generate initial relevance by batching skeletons for parseable files 146 | if parseable_files: 147 | # 1b) Group skeletons using existing collate method 148 | skeletons = [SourceText(fp, extract_skeleton(fp)) for fp in parseable_files] 149 | skeleton_batches, large_skeletons = collate(skeletons, 20000) 150 | # Include large skeletons as single-item batches 151 | # TODO truncate any extremely large skeletons 152 | skeleton_batches.extend([[large_skel] for large_skel in large_skeletons]) 153 | 154 | # 1c) Evaluate each skeleton batch concurrently 155 | def check_skeleton_batch(batch): 156 | response = multi_skeleton_relevance(client, batch, question) 157 | # We parse out lines that match file paths from the LLM's response 158 | return [b.file_path for b in batch if b.file_path in response] 159 | 160 | batch_results, phase1_errors = process_phase( 161 | executor, 162 | skeleton_batches, 163 | check_skeleton_batch, 164 | "Skeleton analysis (batch)", 165 | client 166 | ) 167 | errors.extend(phase1_errors) 168 | 169 | # 1d) Flatten the results to get the final set of relevant files 170 | for relevant_list in batch_results: 171 | relevant_files.extend(relevant_list) 172 | 173 | # Add non-parseable files directly to relevant_files for full source analysis 174 | relevant_files.extend(other_files) 175 | 176 | # Phase 2: extract and analyze source code chunks from relevant files 177 | # First get all chunks 178 | file_chunks, phase2a_errors = process_phase( 179 | executor, 180 | relevant_files, 181 | lambda f: (f, chunk(f, client.max_tokens())), 182 | "Parsing full source", 183 | client 184 | ) 185 | errors.extend(phase2a_errors) 186 | 187 | # Flatten chunks into (file_path, chunk_text) pairs for analysis 188 | chunk_pairs = [] 189 | for file_path, chunks in file_chunks: 190 | if chunks: 191 | for chunk_text in chunks: 192 | 
chunk_pairs.append((file_path, chunk_text)) 193 | 194 | # Analyze all chunks 195 | chunk_analyses, phase2b_errors = process_phase( 196 | executor, 197 | chunk_pairs, 198 | lambda pair: full_source_relevance(client, pair[1], question, pair[0]), 199 | "Analyzing full source", 200 | client 201 | ) 202 | errors.extend(phase2b_errors) 203 | 204 | # Group analyses by file and combine 205 | analyses_by_file = defaultdict(list) 206 | for (file_path, analysis) in chunk_analyses: 207 | analyses_by_file[file_path].append(analysis) 208 | 209 | # sorted so the caching is deterministic 210 | # Combine and truncate if needed 211 | chunk_results = [] 212 | for file_path, analyses in sorted(analyses_by_file.items()): 213 | combined = "\n\n".join(sorted(analyses)) 214 | truncated = maybe_truncate(combined, client.max_tokens(), file_path) 215 | chunk_results.append(SourceText(file_path, truncated)) 216 | 217 | # Collate and process results 218 | groups, large_files = collate(chunk_results, client.max_tokens()) 219 | 220 | # Refine groups in parallel 221 | if refine: 222 | processed_contexts, phase4_errors = process_phase( 223 | executor, 224 | groups, 225 | lambda g: refine_context(client, g, question), 226 | "Refining analysis", 227 | client 228 | ) 229 | errors.extend(phase4_errors) 230 | else: 231 | # If no refinement, just flatten the groups into individual results 232 | processed_contexts = [f'File: {file_path}\n{analysis}\n\n' 233 | for group in groups for file_path, analysis in group] 234 | 235 | # Build output string 236 | output = "" 237 | for context in processed_contexts: 238 | if context: 239 | output += f"{context}\n\n" 240 | for file_path, analysis in large_files: 241 | output += f"{file_path}:\n{analysis}\n\n" 242 | 243 | return errors, output 244 | 245 | 246 | def main(): 247 | parser = argparse.ArgumentParser(description='Analyze source files for relevance to a question') 248 | parser.add_argument('question', help='Question to check relevance against') 249 | parser.add_argument('--sample', type=int, help='Number of random files to sample from the input set') 250 | parser.add_argument('--llm-concurrency', type=int, default=100, help='Maximum number of concurrent LLM requests') 251 | parser.add_argument('--no-refine', action='store_false', dest='refine', help='Skip refinement and combination of analyses') 252 | parser.add_argument('--no-skeletons', action='store_false', dest='analyze_skeletons', help='Skip skeleton analysis phase for all files') 253 | args = parser.parse_args() 254 | 255 | # Read files from stdin 256 | source_files = [] 257 | for line in sys.stdin: 258 | file_path = line.strip() 259 | if not os.path.isfile(file_path): 260 | print(f"Warning: File does not exist: {file_path}", file=sys.stderr) 261 | continue 262 | source_files.append(file_path) 263 | 264 | if not source_files: 265 | print("Error: No valid source files provided", file=sys.stderr) 266 | return 1 267 | 268 | # Sample files if requested 269 | if args.sample and args.sample < len(source_files): 270 | source_files = random.sample(source_files, args.sample) 271 | 272 | errors, result = search(args.question, source_files, args.llm_concurrency, args.refine, args.analyze_skeletons) 273 | if errors: 274 | print("Errors encountered:", file=sys.stderr) 275 | for error in errors: 276 | print(error, file=sys.stderr) 277 | print(file=sys.stderr) 278 | print(result) 279 | 280 | 281 | if __name__ == "__main__": 282 | main() 283 | 284 | 285 | 286 | --------------------------------------------------------------------------------
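For completeness, the `search()` function defined in llmap.py above can also be driven directly from Python rather than through the `llmap` console script. A minimal sketch, assuming the package is installed, one of the API key environment variables is set, and using illustrative file paths:

```python
# Minimal sketch of calling the pipeline programmatically, mirroring main() above.
# Assumes GEMINI_API_KEY, DEEPSEEK_API_KEY, or OPENROUTER_API_KEY is set; paths are hypothetical.
from llmap.llmap import search

files = ["src/llmap/parse.py", "src/llmap/client.py"]
errors, context = search(
    "Where are LLM responses cached?",  # question to evaluate relevance against
    files,                              # source files to analyze
    llm_concurrency=8,                  # keep concurrency modest for a small input set
    refine=True,                        # run the refinement pass (Pro/R1)
    analyze_skeletons=True,             # use the coarse skeleton pass where possible
)
for err in errors:
    print(err)      # non-fatal AIExceptions, the same ones main() reports on stderr
print(context)      # formatted relevant context, ready to paste into an AI chat tool
```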
/src/llmap/parse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | from pathlib import Path 5 | 6 | from tree_sitter_languages import get_language, get_parser 7 | 8 | from .deepseek_v3_tokenizer import tokenizer 9 | 10 | 11 | QUERIES = { 12 | '.java': 'java', 13 | '.py': 'python', 14 | '.cs': 'c_sharp' 15 | } 16 | 17 | def token_count(text: str) -> int: 18 | return len(tokenizer.encode(text)) 19 | 20 | def maybe_truncate(text, max_tokens, source): 21 | """Truncate 'text' to 'max_tokens' tokens if needed and log to stderr.""" 22 | encoded = tokenizer.encode(text) 23 | if len(encoded) <= max_tokens: 24 | return text 25 | print(f"[WARN] {source} exceeds {max_tokens} tokens; truncating.", file=sys.stderr) 26 | return tokenizer.decode(encoded[:max_tokens]) 27 | 28 | def get_query(file_path: str) -> str: 29 | """Load the correct .scm query based on extension.""" 30 | ext = Path(file_path).suffix 31 | if ext not in QUERIES: 32 | raise ValueError(f"Unsupported file extension: {ext}") 33 | query_path = Path(__file__).parent / "queries" / QUERIES[ext] / "skeleton.scm" 34 | return query_path.read_text() 35 | 36 | def parse_code(source_file: str): 37 | """ 38 | Parse 'source_file' with Tree-sitter, run the appropriate query, 39 | and build IR (list of {type, start, end, text, node}). 40 | """ 41 | code_str = Path(source_file).read_text() 42 | code_bytes = code_str.encode("utf8") 43 | 44 | ext = parseable_extension(source_file) 45 | if not ext: 46 | raise ValueError(f"Unsupported filetype in {source_file}") 47 | lang_name = QUERIES[ext] 48 | parser = get_parser(lang_name) 49 | language = get_language(lang_name) 50 | tree = parser.parse(code_bytes) 51 | 52 | captures = language.query(get_query(source_file)).captures(tree.root_node) 53 | ir = [] 54 | for node, capture_name in captures: 55 | # Skip annotation nodes in IR 56 | if capture_name == 'annotation': 57 | continue 58 | snippet = code_bytes[node.start_byte: node.end_byte].decode("utf8") 59 | ir.append({ 60 | 'type': capture_name, 61 | 'start': node.start_byte, 62 | 'end': node.end_byte, 63 | 'text': snippet, 64 | 'node': node, 65 | }) 66 | ir.sort(key=lambda x: x['start']) 67 | return code_str, code_bytes, tree, ir 68 | 69 | def compute_indentation(node, code_bytes): 70 | """Compute leading spaces for 'node' based on the nearest preceding newline.""" 71 | start_byte = node.start_byte 72 | newline_pos = code_bytes.rfind(b'\n', 0, start_byte) 73 | line_start = 0 if newline_pos < 0 else newline_pos + 1 74 | return " " * (start_byte - line_start) 75 | 76 | def leading_whitespace_of_snippet(text): 77 | """Return the leading whitespace of 'text'.""" 78 | idx = 0 79 | while idx < len(text) and text[idx] in (' ', '\t'): 80 | idx += 1 81 | return text[:idx] 82 | 83 | def gather_head(ir, root_node, code_bytes): 84 | """ 85 | Return (head_items, body_items, top_level_class_count). 86 | Head items are top-level class signatures (+ '{') plus any top-level fields. 87 | Body items are everything else. Also track how many top-level classes have bodies. 
88 | """ 89 | head, body = [], [] 90 | top_level_class_count = 0 91 | 92 | for item in ir: 93 | node, snippet = item['node'], item['text'] 94 | # Find the containing top-level class 95 | p, top_level_class = node, None 96 | while p and p != root_node: 97 | if p.type in ('class_declaration','interface_declaration','annotation_declaration','enum_declaration'): 98 | if p.parent == root_node: 99 | top_level_class = p 100 | break 101 | p = p.parent 102 | 103 | if top_level_class and node == top_level_class and item['type'] == 'class.declaration': 104 | body_node = node.child_by_field_name('body') 105 | indent = leading_whitespace_of_snippet(snippet) 106 | if body_node: 107 | top_level_class_count += 1 108 | sig_len = body_node.start_byte - node.start_byte 109 | partial = snippet[:sig_len].rstrip() 110 | head_text = partial + " {" 111 | if not head_text.startswith(indent): 112 | head_text = indent + head_text.lstrip() 113 | head.append({**item, 'text': head_text}) 114 | else: 115 | head.append(item) 116 | elif top_level_class and item['type'] == 'field.declaration': 117 | # Add one level of indentation for fields inside classes 118 | field_text = item['text'] 119 | indent = leading_whitespace_of_snippet(field_text) 120 | field_text = indent + " " + field_text.lstrip() 121 | head.append({**item, 'text': field_text}) 122 | else: 123 | body.append(item) 124 | return head, body, top_level_class_count 125 | 126 | def build_body_blocks(body_ir, code_str, root_node): 127 | """Group IR items so that nested classes remain intact and top-level items are not split.""" 128 | code_bytes = code_str.encode('utf8') 129 | used, blocks = set(), [] 130 | 131 | def fully_in(a, b): 132 | return a.start_byte >= b.start_byte and a.end_byte <= b.end_byte 133 | 134 | for item in body_ir: 135 | if (item['start'], item['end']) in used: 136 | continue 137 | node = item['node'] 138 | if node.type in ('class_declaration', 'interface_declaration','annotation_declaration','enum_declaration'): 139 | snippet = code_bytes[node.start_byte: node.end_byte].decode('utf8') 140 | blocks.append({'start': node.start_byte,'end': node.end_byte,'text': snippet}) 141 | for sub in body_ir: 142 | if fully_in(sub['node'], node): 143 | used.add((sub['start'], sub['end'])) 144 | else: 145 | blocks.append({'start': item['start'],'end': item['end'],'text': item['text']}) 146 | used.add((item['start'], item['end'])) 147 | return sorted(blocks, key=lambda b: b['start']) 148 | 149 | def chunk_from_ir_with_head(ir, root_node, code_str, max_tokens=65536): 150 | """ 151 | Build code chunks under 'max_tokens'. The 'head' is repeated in each chunk 152 | if it fits. Each nested class or method is kept intact. 
153 | """ 154 | code_bytes = code_str.encode("utf8") 155 | head_items, body_items, top_level_count = gather_head(ir, root_node, code_bytes) 156 | head_block = "\n".join(i['text'].rstrip('\r\n') for i in head_items).rstrip() 157 | head_tokens = token_count(head_block) if head_block else 0 158 | head_usable = head_block and (head_tokens <= (max_tokens // 2)) 159 | body_budget = max_tokens - head_tokens if head_usable else max_tokens 160 | 161 | blocks = build_body_blocks(body_items, code_str, root_node) 162 | chunks, current_texts, current_tokens = [], [], 0 163 | 164 | def flush(): 165 | if not current_texts and not head_usable: 166 | return 167 | chunk_body = "\n\n".join(current_texts).rstrip() 168 | if head_usable: 169 | chunk = head_block + ("\n\n" + chunk_body if chunk_body else "") 170 | if top_level_count > 0: 171 | chunk += "\n" + "\n".join("}" for _ in range(top_level_count)) 172 | else: 173 | chunk = chunk_body 174 | chunks.append(chunk) 175 | 176 | for b in blocks: 177 | snippet = b['text'] 178 | tcount = token_count(snippet) 179 | if tcount > body_budget: 180 | snippet = maybe_truncate(snippet, body_budget, "Large IR block") 181 | tcount = token_count(snippet) 182 | if current_tokens + tcount > body_budget: 183 | flush() 184 | current_texts, current_tokens = [snippet], tcount 185 | else: 186 | current_texts.append(snippet) 187 | current_tokens += tcount 188 | if current_texts: 189 | flush() 190 | return chunks 191 | 192 | def extract_skeleton(source_file: str) -> str: 193 | """ 194 | Return a concise structural outline of the code: classes, methods, fields, 195 | with indentation and { ... } placeholders. 196 | """ 197 | code_str, code_bytes, tree, ir = parse_code(source_file) 198 | lines, open_braces = [], [] 199 | 200 | def text_slice(s, e): 201 | return code_bytes[s:e].decode('utf8') 202 | 203 | for item in ir: 204 | ctype, node = item['type'], item['node'] 205 | indent = compute_indentation(node, code_bytes) 206 | if ctype in ('class.declaration','interface.declaration','annotation.declaration','enum.declaration'): 207 | body = node.child_by_field_name('body') 208 | if body: 209 | sig_part = text_slice(node.start_byte, body.start_byte).rstrip() 210 | lines.append(f"{indent}{sig_part} {{") 211 | open_braces.append(indent) 212 | else: 213 | snippet = text_slice(node.start_byte, node.end_byte).rstrip() 214 | lines.append(f"{indent}{snippet}") 215 | elif ctype == 'using.directive': 216 | snippet = text_slice(node.start_byte, node.end_byte).rstrip() 217 | lines.append(f"{indent}{snippet}") 218 | elif ctype == 'method.declaration': 219 | body = node.child_by_field_name('body') 220 | ret_node = node.child_by_field_name('type') 221 | start_pos = ret_node.start_byte if ret_node else node.start_byte 222 | if body: 223 | sig_head = text_slice(start_pos, body.start_byte).rstrip() 224 | lines.append(f"{indent}{sig_head} {{...}}") 225 | else: 226 | snippet = text_slice(start_pos, node.end_byte).rstrip() 227 | lines.append(f"{indent}{snippet}") 228 | elif ctype == 'field.declaration': 229 | snippet = text_slice(node.start_byte, node.end_byte).rstrip() 230 | lines.append(f"{indent}{snippet}") 231 | 232 | while open_braces: 233 | lines.append(f"{open_braces.pop()}}}") 234 | return "\n".join(lines) 235 | 236 | def parseable_extension(source_file: str) -> bool|None: 237 | ext = Path(source_file).suffix.lower() 238 | if ext in QUERIES.keys(): 239 | return ext 240 | return None 241 | 242 | def chunk(source_file: str, max_tokens: int): 243 | """ 244 | Break the file's code into chunks that do not 
exceed 'max_tokens', 245 | preserving the top-level head block and grouping items sensibly. 246 | """ 247 | if not parseable_extension(source_file): 248 | # For unsupported file types, just truncate the whole file 249 | truncated = maybe_truncate(Path(source_file).read_text(), max_tokens, source_file) 250 | return [truncated] 251 | code_str, code_bytes, tree, ir = parse_code(source_file) 252 | return chunk_from_ir_with_head(ir, tree.root_node, code_str, max_tokens) 253 | 254 | if __name__ == '__main__': 255 | if len(sys.argv) < 3: 256 | print("Usage: parse.py [source_file...]") 257 | sys.exit(1) 258 | 259 | cmd = sys.argv[1] 260 | fnames = sys.argv[2:] 261 | 262 | for fname in fnames: 263 | if cmd == 'skeleton': 264 | print(f"\n# {fname}\n") 265 | print("--------------------------------------") 266 | print(extract_skeleton(fname)) 267 | elif cmd == 'chunk': 268 | print(f"\n# {fname}\n") 269 | chs = chunk(fname, 2000) # smaller max for demo 270 | print("Chunks:") 271 | print("--------------------------------------") 272 | for i, ch in enumerate(chs, 1): 273 | print(f"\n--- Chunk {i} (length={token_count(ch)})---") 274 | print(ch + "\n") 275 | elif cmd == 'tokens': 276 | text = Path(fname).read_text() 277 | count = token_count(text) 278 | print(f"{fname} {count}") 279 | else: 280 | print("First argument must be one of: skeleton, chunk, tokens") 281 | sys.exit(1) 282 | -------------------------------------------------------------------------------- /src/llmap/prompts.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | from .client import CachingClient, SourceText 4 | 5 | 6 | def multi_skeleton_relevance(client: CachingClient, skeletons: list[SourceText], question: str) -> str: 7 | """ 8 | Evaluate multiple skeletons for relevance. 9 | Skeletons is a list of SourceText objects containing file paths and skeleton text. 10 | Returns a string containing only the relevant file paths (one per line), 11 | or no paths if none are relevant. 12 | """ 13 | combined = [] 14 | for skeleton in skeletons: 15 | combined.append(f"### FILE: {skeleton.file_path}\n{skeleton.text}\n") 16 | combined_text = "\n\n".join(combined) 17 | 18 | messages = [ 19 | {"role": "system", "content": "You are a helpful assistant designed to analyze and explain source code."}, 20 | {"role": "user", "content": combined_text}, 21 | {"role": "assistant", "content": "Thank you for providing your source code skeletons for analysis."}, 22 | {"role": "user", "content": dedent(f""" 23 | I have given you multiple file skeletons, each labeled with "### FILE: path". 24 | Evaluate each skeleton for relevance to the following question: 25 | ``` 26 | {question} 27 | ``` 28 | 29 | Think about whether the skeleton provides sufficient information to determine relevance: 30 | - If the skeleton clearly indicates irrelevance to the question, eliminate it from consideration. 31 | - If the skeleton clearly shows that the code is relevant to the question, 32 | OR if implementation details are needed to determine relevance, output its FULL path. 33 | List ONLY the file paths that appear relevant to answering the question. 34 | Output one path per line. If a file is not relevant, do not list it at all. 
35 | """)}, 36 | {"role": "assistant", "content": "Understood."}, 37 | ] 38 | response = client.ask(messages, client.analyze_model) 39 | return response.choices[0].message.content 40 | 41 | 42 | def full_source_relevance(client: CachingClient, source: str, question: str, file_path: str = None) -> SourceText: 43 | """ 44 | Check source code for relevance 45 | Args: 46 | source: The source code to analyze 47 | question: The question to check relevance against 48 | file_path: Optional file path for error reporting 49 | Returns SourceAnalysis containing file path and evaluation text 50 | Raises AIException if a recoverable error occurs. 51 | """ 52 | messages = [ 53 | {"role": "system", "content": "You are a helpful assistant designed to analyze and explain source code."}, 54 | {"role": "user", "content": source}, 55 | {"role": "assistant", "content": "Thank you for providing your source code for analysis."}, 56 | {"role": "user", "content": dedent(f""" 57 | Evaluate the above source code for relevance to the following question: 58 | ``` 59 | {question} 60 | ``` 61 | 62 | Give an overall summary, then give the most relevant section(s) of code, if any. 63 | Prefer to give relevant code in units of functions, classes, or methods, rather 64 | than isolated lines. 65 | """)} 66 | ] 67 | 68 | response = client.ask(messages, client.analyze_model, file_path) 69 | return SourceText(file_path, response.choices[0].message.content) 70 | 71 | 72 | def refine_context(client: CachingClient, file_group: list[SourceText], question: str) -> str: 73 | """ 74 | Process groups of file analyses to extract only the relevant context. 75 | 76 | Args: 77 | file_groups: List of lists of (file_path, analysis) tuples 78 | question: The original question being analyzed 79 | 80 | Returns: 81 | List of processed contexts, one per group 82 | """ 83 | combined = "\n\n".join(f"File: {analysis.file_path}\n{analysis.text}" for analysis in file_group) 84 | 85 | messages = [ 86 | {"role": "system", "content": "You are a helpful assistant designed to collate source code."}, 87 | {"role": "user", "content": combined}, 88 | {"role": "assistant", "content": "Thank you for providing your source code fragments."}, 89 | {"role": "user", "content": dedent(f""" 90 | The above text contains analysis of multiple source files related to this question: 91 | ``` 92 | {question} 93 | ``` 94 | 95 | Extract only the most relevant context and code sections that help answer the question. 96 | Remove any irrelevant files completely, but preserve file paths for the relevant code fragments. 97 | Include the relevant code fragments as-is; do not truncate, summarize, or modify them. 98 | 99 | DO NOT include additional commentary or analysis of the provided text. 
100 | """)} 101 | ] 102 | 103 | response = client.ask(messages, client.refine_model) 104 | content1 = response.choices[0].message.content 105 | messages += [ 106 | {"role": "assistant", "content": content1}, 107 | {"role": "user", "content": dedent(f""" 108 | Take one more look and make sure you didn't miss anything important for answering 109 | the question: 110 | ``` 111 | {question} 112 | ``` 113 | """)} 114 | ] 115 | response = client.ask(messages, client.refine_model) 116 | content2 = response.choices[0].message.content 117 | 118 | return content1 + '\n\n' + content2 119 | -------------------------------------------------------------------------------- /src/llmap/queries/c_sharp/skeleton.scm: -------------------------------------------------------------------------------- 1 | ; Using directives 2 | (using_directive) @using.directive 3 | 4 | ; Classes 5 | (class_declaration 6 | name: (identifier) @class.name) @class.declaration 7 | 8 | ; Interfaces 9 | (interface_declaration 10 | name: (identifier) @interface.name) @interface.declaration 11 | 12 | ; Methods 13 | (method_declaration 14 | name: (identifier) @method.name) @method.declaration 15 | 16 | ; Fields 17 | (field_declaration 18 | (variable_declaration 19 | (variable_declarator 20 | name: (identifier) @field.name))) @field.declaration 21 | 22 | ; Properties 23 | (property_declaration 24 | name: (identifier) @property.name) @property.declaration 25 | 26 | ; Constructor 27 | (constructor_declaration) @constructor.declaration 28 | 29 | ; Attributes 30 | (attribute_list) @annotation 31 | -------------------------------------------------------------------------------- /src/llmap/queries/java/skeleton.scm: -------------------------------------------------------------------------------- 1 | ; Annotation declarations 2 | (annotation_type_declaration 3 | body: (_)) @annotation.declaration 4 | 5 | ; Class declarations 6 | (class_declaration 7 | body: (_)) @class.declaration 8 | 9 | ; Interface declarations 10 | (interface_declaration 11 | body: (_)) @interface.declaration 12 | 13 | ; Method declarations 14 | (method_declaration 15 | body: (_)?) @method.declaration 16 | 17 | ; Field declarations 18 | (field_declaration) @field.declaration 19 | 20 | ; Enum declarations 21 | (enum_declaration 22 | body: (enum_body 23 | (enum_constant)*)) @enum.declaration 24 | 25 | ; Annotations to strip 26 | (annotation) @annotation 27 | -------------------------------------------------------------------------------- /src/llmap/queries/python/skeleton.scm: -------------------------------------------------------------------------------- 1 | ; Class declarations 2 | (class_definition 3 | name: (identifier) @name 4 | body: (block)) @class.declaration 5 | 6 | ; Function/method declarations 7 | (function_definition 8 | name: (identifier) @name 9 | body: (block)) @method.declaration 10 | 11 | ; Instance variable declarations 12 | (assignment 13 | left: (attribute 14 | object: (identifier) @obj 15 | (#eq? @obj "self"))) @field.declaration 16 | --------------------------------------------------------------------------------