├── .gitignore ├── LICENSE ├── cmake └── CPM.cmake ├── CMakeLists.txt ├── README.md └── include └── cpp-tree-sitter.h /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Nick Sumner 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /cmake/CPM.cmake: --------------------------------------------------------------------------------
# Bootstrap for CPM.cmake: works out where the pinned CPM release should live
# (a shared source cache when one is configured, otherwise the build tree),
# downloads it when it is missing or empty, and finally includes it.
set(CPM_DOWNLOAD_VERSION 0.38.2)

if(CPM_SOURCE_CACHE)
  set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
elseif(DEFINED ENV{CPM_SOURCE_CACHE})
  set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
else()
  set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
endif()

# Expand relative path. This is important if the provided path contains a tilde (~)
get_filename_component(CPM_DOWNLOAD_LOCATION "${CPM_DOWNLOAD_LOCATION}" ABSOLUTE)

# download_cpm()
#   Fetches the pinned CPM.cmake release into CPM_DOWNLOAD_LOCATION.
#   A failed transfer is reported as a warning; control flow is otherwise the
#   same as without the status check, so the empty-file retry below still
#   applies on the next configure.
function(download_cpm)
  message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}")
  file(DOWNLOAD
    https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
    "${CPM_DOWNLOAD_LOCATION}"
    STATUS download_status
  )
  # STATUS yields "<code>;<message>"; a code of 0 means success.
  list(GET download_status 0 download_code)
  if(NOT download_code EQUAL 0)
    list(GET download_status 1 download_error)
    message(WARNING "Downloading CPM.cmake failed: ${download_error}")
  endif()
endfunction()

if(NOT (EXISTS "${CPM_DOWNLOAD_LOCATION}"))
  download_cpm()
else()
  # resume download if it previously failed
  file(READ "${CPM_DOWNLOAD_LOCATION}" check)
  if("${check}" STREQUAL "")
    download_cpm()
  endif()
  unset(check)
endif()

include("${CPM_DOWNLOAD_LOCATION}")
-------------------------------------------------------------------------------- /CMakeLists.txt: --------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.19)
project(cpp-tree-sitter)

set(PACKAGE_NAME cpp-tree-sitter)
set(PACKAGE_VERSION 0.0.3)
set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}")
set(PACKAGE_BUGREPORT "wsumner@sfu.ca")

# NOTE(review): the text inside the angle brackets of every generator
# expression in this file was lost when the file was extracted. The compiler
# conditions (GNU/Clang/AppleClang, assumed because the flags are GCC-style)
# and the BUILD_INTERFACE / INSTALL_INTERFACE wrappers below are
# reconstructions -- confirm against the repository.
add_compile_options(
  "$<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:-Wall;-Wextra;-Wconversion>"
)

include("${CMAKE_CURRENT_LIST_DIR}/cmake/CPM.cmake")

# We want to automatically download and provide tree-sitter to users of
# the package, so pull it in and retrofit cmake dependencies on top of it.
CPMAddPackage(
  NAME tree-sitter
  GIT_REPOSITORY https://github.com/tree-sitter/tree-sitter.git
  VERSION 0.22.6
  DOWNLOAD_ONLY YES
)

if(tree-sitter_ADDED)
  add_library(tree-sitter)
  target_sources(tree-sitter
    PRIVATE
      "${tree-sitter_SOURCE_DIR}/lib/src/lib.c"
  )
  target_include_directories(tree-sitter
    PRIVATE
      "$<BUILD_INTERFACE:${tree-sitter_SOURCE_DIR}/lib/src>"
    PUBLIC
      "$<INSTALL_INTERFACE:include>"
      "$<BUILD_INTERFACE:${tree-sitter_SOURCE_DIR}/lib/include>"
  )
  # The directory-wide -Wconversion above is switched off again for this
  # downloaded code.
  target_compile_options(tree-sitter
    PRIVATE
      "$<$<C_COMPILER_ID:GNU,Clang,AppleClang>:-Wno-conversion>"
  )
endif()


# add_grammar_from_repo(<NAME> <REPO> <VERSION>)
#
# Downloads a tree-sitter grammar with CPM and exposes it as a library target.
#   NAME    - name of the CPM package and of the library target that is created
#   REPO    - git repository URL of the grammar
#   VERSION - version passed on to CPMAddPackage
#
# The library is built from src/parser.c plus an external scanner when the
# grammar ships one (src/scanner.c or src/scanner.cc). Nothing is created when
# CPM reports that the package was not newly added.
#
# Example:
#   add_grammar_from_repo(tree-sitter-json
#     https://github.com/tree-sitter/tree-sitter-json.git
#     0.19.0
#   )
function(add_grammar_from_repo NAME REPO VERSION)
  CPMAddPackage(
    NAME ${NAME}
    GIT_REPOSITORY ${REPO}
    VERSION ${VERSION}
    DOWNLOAD_ONLY YES
  )

  if("${${NAME}_ADDED}")
    add_library(${NAME})

    set(grammar_src_dir "${${NAME}_SOURCE_DIR}/src")
    set(grammar_sources "${grammar_src_dir}/parser.c")
    # The external scanner is optional and may be written in C or in C++.
    foreach(scanner IN ITEMS scanner.c scanner.cc)
      if(EXISTS "${grammar_src_dir}/${scanner}")
        list(APPEND grammar_sources "${grammar_src_dir}/${scanner}")
      endif()
    endforeach()

    target_sources(${NAME}
      PRIVATE
        ${grammar_sources}
    )
    target_include_directories(${NAME}
      PRIVATE
        # parser.h is stored within the src directory, so we need to include
        # src in the search paths
        "$<BUILD_INTERFACE:${grammar_src_dir}>"
      PUBLIC
        "$<INSTALL_INTERFACE:include>"
    )

    target_link_libraries(${NAME}
      INTERFACE
        tree-sitter
    )
    target_compile_options(${NAME}
      PRIVATE
        "$<$<C_COMPILER_ID:GNU,Clang,AppleClang>:-Wno-unused-but-set-variable>"
    )
  endif()
endfunction()


add_library(cpp-tree-sitter INTERFACE)
target_include_directories(cpp-tree-sitter
  INTERFACE
    "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
    "$<INSTALL_INTERFACE:include>"
)
# cpp-tree-sitter.h uses the C++20 iterator concepts (std::input_iterator,
# std::sentinel_for), so every consumer has to compile as C++20 or newer.
target_compile_features(cpp-tree-sitter
  INTERFACE
    cxx_std_20
)
target_link_libraries(cpp-tree-sitter
  INTERFACE
    tree-sitter
)
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cpp-tree-sitter 2 | 3 | ... is a simple C++ and CMake wrapper around tree-sitter.
This project provides 4 | CMake definitions and a C++ wrapper that help with 5 | * managing tree-sitter and tree-sitter grammars as dependencies 6 | * accessing basic tree-sitter APIs for parse tree inspection 7 | 8 | ## Using in a CMake project 9 | 10 | ... requires the [CPM](https://github.com/cpm-cmake/CPM.cmake) CMake module 11 | for fetching and managing dependencies from GitHub. Adding `cpp-tree-sitter` 12 | as a CPM dependency makes `cpp-tree-sitter` available as a library and 13 | provides a function, `add_grammar_from_repo`, that will download and 14 | make available a standard tree-sitter grammar on GitHub as a library. 15 | 16 | The tree-sitter parser 17 | [example](https://tree-sitter.github.io/tree-sitter/using-parsers#an-example-program) 18 | can be reproduced in a CMake project with CPM by including the following in 19 | `CMakeLists.txt`: 20 | 21 | ```cmake 22 | include(cmake/CPM.cmake) 23 | 24 | # Downloads this wrapper library and tree-sitter. 25 | # Makes them available via the `cpp-tree-sitter` CMake library target. 26 | CPMAddPackage( 27 | NAME cpp-tree-sitter 28 | GIT_REPOSITORY https://github.com/nsumner/cpp-tree-sitter.git 29 | GIT_TAG v0.0.1 30 | ) 31 | 32 | # Downloads a tree-sitter grammar from GitHub and makes it available as a 33 | # CMake library target. 34 | add_grammar_from_repo(tree-sitter-json # Defines the library name for a grammar 35 | https://github.com/tree-sitter/tree-sitter-json.git # Repository URL of a tree-sitter grammar 36 | 0.19.0 # Version tag for the grammar 37 | ) 38 | 39 | # Use the library in a demo program.
40 | add_executable(demo) 41 | target_sources(demo 42 | PRIVATE 43 | demo.cpp 44 | ) 45 | target_link_libraries(demo 46 | tree-sitter-json 47 | cpp-tree-sitter 48 | ) 49 | ``` 50 | 51 | Translating the parsing and tree inspection operations from the example to 52 | use the C++ wrappers then yields a `demo.cpp` like: 53 | 54 | ```cpp 55 | #include <cassert> 56 | #include <cstdio> 57 | #include <memory> 58 | #include <string> 59 | 60 | #include <cpp-tree-sitter.h> 61 | 62 | 63 | extern "C" { 64 | TSLanguage* tree_sitter_json(); 65 | } 66 | 67 | 68 | int main() { 69 | // Create a language and parser. 70 | ts::Language language = tree_sitter_json(); 71 | ts::Parser parser{language}; 72 | 73 | // Parse the provided string into a syntax tree. 74 | std::string sourcecode = "[1, null]"; 75 | ts::Tree tree = parser.parseString(sourcecode); 76 | 77 | // Get the root node of the syntax tree. 78 | ts::Node root = tree.getRootNode(); 79 | 80 | // Get some child nodes. 81 | ts::Node array = root.getNamedChild(0); 82 | ts::Node number = array.getNamedChild(0); 83 | 84 | // Check that the nodes have the expected types. 85 | assert(root.getType() == "document"); 86 | assert(array.getType() == "array"); 87 | assert(number.getType() == "number"); 88 | 89 | // Check that the nodes have the expected child counts. 90 | assert(root.getNumChildren() == 1); 91 | assert(array.getNumChildren() == 5); 92 | assert(array.getNumNamedChildren() == 2); 93 | assert(number.getNumChildren() == 0); 94 | 95 | // Print the syntax tree as an S-expression. 96 | auto treestring = root.getSExpr(); 97 | printf("Syntax tree: %s\n", treestring.get()); 98 | 99 | return 0; 100 | } 101 | ``` 102 | 103 | In particular, some of the underlying APIs now use method calls for 104 | easier discoverability, and resource cleanup is automatic.
105 | -------------------------------------------------------------------------------- /include/cpp-tree-sitter.h: -------------------------------------------------------------------------------- 1 | #ifndef CPP_TREE_SITTER_H 2 | #define CPP_TREE_SITTER_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | // Including the API directly already pollutes the namespace, but the 11 | // functions are prefixed. Anything else that we include should be scoped 12 | // within a namespace. 13 | 14 | namespace ts { 15 | 16 | ///////////////////////////////////////////////////////////////////////////// 17 | // Helper classes. 18 | // These can be ignored while tring to understand the core APIs on demand. 19 | ///////////////////////////////////////////////////////////////////////////// 20 | 21 | 22 | struct FreeHelper{ 23 | template 24 | void 25 | operator()(T* raw_pointer) const { 26 | std::free(raw_pointer); 27 | } 28 | }; 29 | 30 | 31 | // An inclusive range representation 32 | template 33 | struct Extent { 34 | T start; 35 | T end; 36 | }; 37 | 38 | 39 | ///////////////////////////////////////////////////////////////////////////// 40 | // Aliases. 41 | // Create slightly stricter aliases for some of the core tree-sitter types. 42 | ///////////////////////////////////////////////////////////////////////////// 43 | 44 | 45 | // Direct alias of { row: uint32_t; column: uint32_t } 46 | using Point = TSPoint; 47 | 48 | using Symbol = uint16_t; 49 | 50 | using Version = uint32_t; 51 | 52 | using NodeID = uintptr_t; 53 | 54 | 55 | // For types that manage resources, create custom wrappers that ensure 56 | // clean-up. For types that can benefit from additional API discovery, 57 | // wrappers with implicit conversion allow for automated method discovery. 58 | 59 | struct Language { 60 | // NOTE: Allowing implicit conversions from TSLanguage to Language 61 | // improves the ergonomics a bit, as clients will still make use of the 62 | // custom language functions. 
63 | 64 | /* implicit */ Language(TSLanguage const* language) 65 | : impl{language} 66 | { } 67 | 68 | [[nodiscard]] size_t 69 | getNumSymbols() const { 70 | return ts_language_symbol_count(impl); 71 | } 72 | 73 | [[nodiscard]] std::string_view 74 | getSymbolName(Symbol symbol) const { 75 | return ts_language_symbol_name(impl, symbol); 76 | } 77 | 78 | [[nodiscard]] Symbol 79 | getSymbolForName(std::string_view name, bool isNamed) const { 80 | return ts_language_symbol_for_name(impl, 81 | &name.front(), 82 | static_cast(name.size()), 83 | isNamed); 84 | } 85 | 86 | [[nodiscard]] Version 87 | getVersion() const { 88 | return ts_language_version(impl); 89 | } 90 | 91 | TSLanguage const* impl; 92 | }; 93 | 94 | 95 | class Cursor; 96 | 97 | struct Node { 98 | explicit Node(TSNode node) 99 | : impl{node} 100 | { } 101 | 102 | //////////////////////////////////////////////////////////////// 103 | // Flag checks on nodes 104 | //////////////////////////////////////////////////////////////// 105 | [[nodiscard]] bool 106 | isNull() const { 107 | return ts_node_is_null(impl); 108 | } 109 | 110 | [[nodiscard]] bool 111 | isNamed() const { 112 | return ts_node_is_named(impl); 113 | } 114 | 115 | [[nodiscard]] bool 116 | isMissing() const { 117 | return ts_node_is_missing(impl); 118 | } 119 | 120 | [[nodiscard]] bool 121 | isExtra() const { 122 | return ts_node_is_extra(impl); 123 | } 124 | 125 | [[nodiscard]] bool 126 | hasError() const { 127 | return ts_node_has_error(impl); 128 | } 129 | 130 | [[nodiscard]] bool 131 | isError() const { 132 | return ts_node_is_error(impl); 133 | } 134 | 135 | //////////////////////////////////////////////////////////////// 136 | // Navigation 137 | //////////////////////////////////////////////////////////////// 138 | 139 | // Direct parent/sibling/child connections and cursors 140 | 141 | [[nodiscard]] Node 142 | getParent() const { 143 | return Node{ts_node_parent(impl)}; 144 | } 145 | 146 | [[nodiscard]] Node 147 | getPreviousSibling() 
const { 148 | return Node{ts_node_prev_sibling(impl)}; 149 | } 150 | 151 | [[nodiscard]] Node 152 | getNextSibling() const { 153 | return Node{ts_node_next_sibling(impl)}; 154 | } 155 | 156 | [[nodiscard]] uint32_t 157 | getNumChildren() const { 158 | return ts_node_child_count(impl); 159 | } 160 | 161 | [[nodiscard]] Node 162 | getChild(uint32_t position) const { 163 | return Node{ts_node_child(impl, position)}; 164 | } 165 | 166 | // Named children 167 | 168 | [[nodiscard]] uint32_t 169 | getNumNamedChildren() const { 170 | return ts_node_named_child_count(impl); 171 | } 172 | 173 | [[nodiscard]] Node 174 | getNamedChild(uint32_t position) const { 175 | return Node{ts_node_named_child(impl, position)}; 176 | } 177 | 178 | // Named fields 179 | 180 | [[nodiscard]] std::string_view 181 | getFieldNameForChild(uint32_t child_position) const { 182 | return ts_node_field_name_for_child(impl, child_position); 183 | } 184 | 185 | [[nodiscard]] Node 186 | getChildByFieldName(std::string_view name) const { 187 | return Node{ts_node_child_by_field_name(impl, 188 | &name.front(), 189 | static_cast(name.size()))}; 190 | } 191 | 192 | // Definition deferred until after the definition of Cursor. 193 | [[nodiscard]] Cursor 194 | getCursor() const; 195 | 196 | //////////////////////////////////////////////////////////////// 197 | // Node attributes 198 | //////////////////////////////////////////////////////////////// 199 | 200 | // Returns a unique identifier for a node in a parse tree. 201 | // NodeIDs are numeric value types. 202 | [[nodiscard]] NodeID 203 | getID() const { 204 | return reinterpret_cast(impl.id); 205 | } 206 | 207 | // Returns an S-Expression representation of the subtree rooted at this node. 
208 | [[nodiscard]] std::unique_ptr 209 | getSExpr() const { 210 | return std::unique_ptr{ts_node_string(impl)}; 211 | } 212 | 213 | [[nodiscard]] Symbol 214 | getSymbol() const { 215 | return ts_node_symbol(impl); 216 | } 217 | 218 | [[nodiscard]] std::string_view 219 | getType() const { 220 | return ts_node_type(impl); 221 | } 222 | 223 | [[nodiscard]] Language 224 | getLanguage() const { 225 | return ts_node_language(impl); 226 | } 227 | 228 | [[nodiscard]] Extent 229 | getByteRange() const { 230 | return {ts_node_start_byte(impl), ts_node_end_byte(impl)}; 231 | } 232 | 233 | [[nodiscard]] Extent 234 | getPointRange() const { 235 | return {ts_node_start_point(impl), ts_node_end_point(impl)}; 236 | } 237 | 238 | [[nodiscard]] std::string_view 239 | getSourceRange(std::string_view source) const { 240 | Extent extents = this->getByteRange(); 241 | return source.substr(extents.start, extents.end - extents.start); 242 | } 243 | 244 | TSNode impl; 245 | }; 246 | 247 | 248 | class Tree { 249 | public: 250 | Tree(TSTree* tree) 251 | : impl{tree, ts_tree_delete} 252 | { } 253 | 254 | [[nodiscard]] Node 255 | getRootNode() const { 256 | return Node{ts_tree_root_node(impl.get())}; 257 | } 258 | 259 | [[nodiscard]] Language 260 | getLanguage() const { 261 | return Language{ts_tree_language(impl.get())}; 262 | } 263 | 264 | [[nodiscard]] bool 265 | hasError() const { 266 | return getRootNode().hasError(); 267 | } 268 | 269 | private: 270 | std::unique_ptr impl; 271 | }; 272 | 273 | 274 | class Parser { 275 | public: 276 | Parser(Language language) 277 | : impl{ts_parser_new(), ts_parser_delete} { 278 | ts_parser_set_language(impl.get(), language.impl); 279 | } 280 | 281 | [[nodiscard]] Tree 282 | parseString(std::string_view buffer) { 283 | return ts_parser_parse_string( 284 | impl.get(), 285 | nullptr, 286 | &buffer.front(), 287 | static_cast(buffer.size()) 288 | ); 289 | } 290 | 291 | private: 292 | std::unique_ptr impl; 293 | }; 294 | 295 | 296 | class Cursor { 297 | 
public: 298 | Cursor(TSNode node) 299 | : impl{ts_tree_cursor_new(node)} 300 | { } 301 | 302 | Cursor(const TSTreeCursor& cursor) 303 | : impl{ts_tree_cursor_copy(&cursor)} 304 | { } 305 | 306 | // By default avoid copies until the ergonomics are clearer. 307 | Cursor(const Cursor& other) = delete; 308 | Cursor(Cursor&& other) 309 | : impl{} { 310 | std::swap(impl, other.impl); 311 | } 312 | Cursor& operator=(const Cursor& other) = delete; 313 | Cursor& operator=(Cursor&& other) { 314 | std::swap(impl, other.impl); 315 | return *this; 316 | } 317 | 318 | ~Cursor() { 319 | ts_tree_cursor_delete(&impl); 320 | } 321 | 322 | void 323 | reset(Node node) { 324 | ts_tree_cursor_reset(&impl, node.impl); 325 | } 326 | 327 | void 328 | reset(Cursor& cursor) { 329 | ts_tree_cursor_reset_to(&impl, &cursor.impl); 330 | } 331 | 332 | [[nodiscard]] Cursor 333 | copy() const { 334 | return Cursor(impl); 335 | } 336 | 337 | [[nodiscard]] Node 338 | getCurrentNode() const { 339 | return Node{ts_tree_cursor_current_node(&impl)}; 340 | } 341 | 342 | // Navigation 343 | 344 | [[nodiscard]] bool 345 | gotoParent() { 346 | return ts_tree_cursor_goto_parent(&impl); 347 | } 348 | 349 | [[nodiscard]] bool 350 | gotoNextSibling() { 351 | return ts_tree_cursor_goto_next_sibling(&impl); 352 | } 353 | 354 | [[nodiscard]] bool 355 | gotoPreviousSibling() { 356 | return ts_tree_cursor_goto_previous_sibling(&impl); 357 | } 358 | 359 | [[nodiscard]] bool 360 | gotoFirstChild() { 361 | return ts_tree_cursor_goto_first_child(&impl); 362 | } 363 | 364 | [[nodiscard]] bool 365 | gotoLastChild() { 366 | return ts_tree_cursor_goto_last_child(&impl); 367 | } 368 | 369 | [[nodiscard]] size_t 370 | getDepthFromOrigin() const { 371 | return ts_tree_cursor_current_depth(&impl); 372 | } 373 | 374 | private: 375 | TSTreeCursor impl; 376 | }; 377 | 378 | // To avoid cyclic dependencies and ODR violations, we define all methods 379 | // *using* Cursors inline after the definition of Cursor itself. 
380 | [[nodiscard]] Cursor 381 | inline Node::getCursor() const { 382 | return Cursor{impl}; 383 | } 384 | 385 | 386 | //////////////////////////////////////////////////////////////// 387 | // Child node iterators 388 | //////////////////////////////////////////////////////////////// 389 | 390 | // These iterators make it possible to use C++ views on Nodes for 391 | // easy processing. 392 | 393 | class ChildIteratorSentinel { }; 394 | 395 | class ChildIterator { 396 | public: 397 | using value_type = ts::Node; 398 | using difference_type = int; 399 | using iterator_category = std::input_iterator_tag; 400 | 401 | explicit ChildIterator(const ts::Node& node) 402 | : cursor{node.getCursor()}, 403 | atEnd{!cursor.gotoFirstChild()} 404 | { } 405 | 406 | value_type 407 | operator*() const { 408 | return cursor.getCurrentNode(); 409 | } 410 | 411 | ChildIterator& 412 | operator++() { 413 | atEnd = !cursor.gotoNextSibling(); 414 | return *this; 415 | } 416 | 417 | ChildIterator& 418 | operator++(int) { 419 | atEnd = !cursor.gotoNextSibling(); 420 | return *this; 421 | } 422 | 423 | friend bool operator== (const ChildIterator& a, const ChildIteratorSentinel&) { return a.atEnd; } 424 | friend bool operator!= (const ChildIterator& a, const ChildIteratorSentinel& b) { return !(a == b); } 425 | friend bool operator== (const ChildIteratorSentinel& b, const ChildIterator& a) { return a == b; } 426 | friend bool operator!= (const ChildIteratorSentinel& b, const ChildIterator& a) { return a != b; } 427 | 428 | private: 429 | ts::Cursor cursor; 430 | bool atEnd; 431 | }; 432 | 433 | 434 | struct Children { 435 | using iterator = ChildIterator; 436 | using sentinel = ChildIteratorSentinel; 437 | 438 | auto begin() const -> iterator { return ChildIterator{node}; } 439 | auto end() const -> sentinel { return {}; } 440 | const ts::Node& node; 441 | }; 442 | 443 | static_assert(std::input_iterator); 444 | static_assert(std::sentinel_for); 445 | 446 | 447 | } 448 | 449 | #endif 450 | 
--------------------------------------------------------------------------------