├── .bazelrc
├── .bazelversion
├── BUILD.bazel
├── CONTRIBUTING.md
├── LICENSE
├── MODULE.bazel
├── README.md
├── WORKSPACE.bazel
├── docs
└── images
│ ├── vanir_detector_report.png
│ ├── vanir_macro_arch.png
│ └── vanir_micro_arch.png
├── extensions.bzl
├── repositories.bzl
├── requirements.txt
├── requirements_antlr4.txt
├── requirements_antlr4_lock.txt
├── requirements_lock.txt
└── vanir
├── cache
├── BUILD.bazel
└── ecosystem_file_lists.json
├── code_extractors
├── BUILD.bazel
├── code_extractor.py
├── code_extractor_android.py
├── code_extractor_android_test.py
├── code_extractor_base.py
└── code_extractor_test.py
├── detector_common_flags.py
├── detector_common_flags_test.py
├── detector_runner.py
├── detector_runner_test.py
├── file_list_manager.py
├── file_list_manager_test.py
├── hasher.py
├── hasher_test.py
├── integration_tests
├── BUILD.bazel
└── missing_patch_detection_hermetic_test.py
├── language_parsers
├── BUILD.bazel
├── abstract_language_parser.py
├── common.py
├── cpp
│ ├── BUILD.bazel
│ ├── cpp_parser.py
│ ├── cpp_parser_test.py
│ ├── parser_core.cc
│ ├── parser_core.h
│ └── python
│ │ ├── BUILD.bazel
│ │ └── parser_core.cc
├── java
│ ├── BUILD.bazel
│ ├── antlr4.external.bzl
│ ├── java_parser.py
│ ├── java_parser_test.py
│ ├── parser_core.cc
│ ├── parser_core.h
│ └── python
│ │ ├── BUILD.bazel
│ │ └── parser_core.cc
└── language_parsers.py
├── normalizer.py
├── normalizer_test.py
├── osv_client.py
├── overwrite_specs_validity_test.py
├── parser.py
├── parser_test.py
├── refiner.py
├── refiner_test.py
├── reporter.py
├── reporter_test.py
├── scanners
├── BUILD.bazel
├── android_kernel_scanner.py
├── android_kernel_scanner_test.py
├── offline_directory_scanner.py
├── offline_directory_scanner_test.py
├── package_identifier.py
├── package_identifier_test.py
├── package_scanner.py
├── package_scanner_test.py
├── repo_scanner.py
├── repo_scanner_test.py
├── scanner_base.py
├── scanner_base_test.py
├── target_selection_strategy.py
└── target_selection_strategy_test.py
├── sign_generator.py
├── sign_generator_runner.py
├── sign_generator_runner_test.py
├── sign_generator_test.py
├── signature.py
├── signature_test.py
├── testdata
├── BUILD.bazel
├── gitiles
│ ├── b2dc041a4e84986e3a6932b127d3a18ef02b6d0a.patch.base64
│ ├── b2dc041a4e84986e3a6932b127d3a18ef02b6d0a.patchinfo.base64
│ ├── b2dc041a4e84986e3a6932b127d3a18ef02b6d0a_services_core_java_com_android_server_om_OverlayManagerService.java.base64
│ └── c3c9ada6e5d946ce7d224649f1d6528ce80b1a24_services_core_java_com_android_server_om_OverlayManagerService.java.base64
├── test_frameworks_base.tar.gz
├── test_overwrite_specs.json
├── test_patch_file
├── test_patched_file
├── test_signatures.json
├── test_signatures.py
├── test_signatures.zip
├── test_unpatched_file
├── test_unrelated_file
├── test_vulnerabilities_kernel.json
└── test_vulnerabilities_platform.json
├── truncated_path.py
├── truncated_path_test.py
├── vanir_test_base.py
├── version_extractor.py
├── version_extractor_test.py
├── vulnerability.py
├── vulnerability_manager.py
├── vulnerability_manager_test.py
├── vulnerability_overwriter.py
├── vulnerability_overwriter_test.py
└── vulnerability_test.py
/.bazelrc:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | # The following file specifies the Bazel configuration. Currently, Vanir is
8 | # only tested with Bazel >= 6.0.0. For Bazel >= 7.2.0, Vanir runs
9 | # without any changes in this file. For earlier versions, the following
10 | # configuration changes are needed.
11 |
12 | # In order to run on Bazel <= 7.1.0, the following line needs to be
13 | # commented out, as the bzlmod build does not work on Bazel 7.1 and earlier.
14 |
15 | common --enable_workspace=False
16 |
17 | # Additionally, for Bazel 7.1.0 and 7.0.0, the following line needs
18 | # to be uncommented.
19 |
20 | # common --enable_bzlmod=False
21 |
22 | build --cxxopt=-std=c++17 --cxxopt=-fexceptions --cxxopt=-Wno-nonnull --cxxopt=-Wno-sign-compare --cxxopt=-Wno-parentheses --cxxopt=-Wno-deprecated-declarations
23 |
--------------------------------------------------------------------------------
/.bazelversion:
--------------------------------------------------------------------------------
1 | 8.1.1
2 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to contribute
2 |
3 | We'd love to accept your patches and contributions to this project.
4 |
5 | ## Before you begin
6 |
7 | ### Sign our Contributor License Agreement
8 |
9 | Contributions to this project must be accompanied by a
10 | [Contributor License Agreement](https://cla.developers.google.com/about) (CLA).
11 | You (or your employer) retain the copyright to your contribution; this simply
12 | gives us permission to use and redistribute your contributions as part of the
13 | project.
14 |
15 | If you or your current employer have already signed the Google CLA (even if it
16 | was for a different project), you probably don't need to do it again.
17 |
18 | Visit <https://cla.developers.google.com/> to see your current agreements or to
19 | sign a new one.
20 |
21 | ### Review our community guidelines
22 |
23 | This project follows
24 | [Google's Open Source Community Guidelines](https://opensource.google/conduct/).
25 |
26 | ## Contribution process
27 |
28 | ### Code reviews
29 |
30 | All submissions, including submissions by project members, require review. We
31 | use GitHub pull requests for this purpose. Consult
32 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
33 | information on using pull requests.
34 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2023 Google LLC
2 |
3 | Redistribution and use in source and binary forms, with or without
4 | modification, are permitted provided that the following conditions are
5 | met:
6 |
7 | * Redistributions of source code must retain the above copyright
8 | notice, this list of conditions and the following disclaimer.
9 | * Redistributions in binary form must reproduce the above
10 | copyright notice, this list of conditions and the following disclaimer
11 | in the documentation and/or other materials provided with the
12 | distribution.
13 | * Neither the name of Google LLC nor the names of its
14 | contributors may be used to endorse or promote products derived from
15 | this software without specific prior written permission.
16 |
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 |
--------------------------------------------------------------------------------
/MODULE.bazel:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | """Vanir build and test dependencies."""
8 |
9 | module(name = "vanir")
10 |
11 | bazel_dep(name = "rules_python", version = "1.1.0")
12 | bazel_dep(name = "platforms", version = "0.0.11")
13 | bazel_dep(name = "abseil-cpp", version = "20250127.0", repo_name = "com_google_absl")
14 | bazel_dep(name = "pybind11_bazel", version = "2.13.6")
15 | bazel_dep(name = "pybind11_abseil", version = "202402.0")
16 | bazel_dep(name = "rules_proto", version = "7.1.0")
17 | bazel_dep(name = "protobuf", version = "29.3", repo_name = "com_google_protobuf")
18 |
19 | python = use_extension("@rules_python//python/extensions:python.bzl", "python")
20 | python.toolchain(
21 | is_default = True,
22 | python_version = "3.9"
23 | )
24 |
25 | pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip")
26 | pip.parse(
27 | hub_name = "vanir_deps",
28 | python_version = "3.9",
29 | requirements_lock = "//:requirements_lock.txt",
30 | )
31 | use_repo(pip, "vanir_deps")
32 |
33 | pip.parse(
34 | hub_name = "antlr4_deps",
35 | python_version = "3.9",
36 | requirements_lock = "//:requirements_antlr4_lock.txt",
37 | )
38 | use_repo(pip, "antlr4_deps")
39 |
40 | antlr4_grammar_java_lexer_g4_extension = use_extension("//:extensions.bzl", "antlr4_grammar_java_lexer_g4_extension")
41 | use_repo(antlr4_grammar_java_lexer_g4_extension, "antlr4_grammar_java_lexer_g4")
42 |
43 | antlr4_grammar_java_parser_g4_extension = use_extension("//:extensions.bzl", "antlr4_grammar_java_parser_g4_extension")
44 | use_repo(antlr4_grammar_java_parser_g4_extension, "antlr4_grammar_java_parser_g4")
45 |
46 | antlr4_runtimes_extension = use_extension("//:extensions.bzl", "antlr4_runtimes_extension")
47 | use_repo(antlr4_runtimes_extension, "antlr4_runtimes")
48 |
49 | fuzzyc_extension = use_extension("//:extensions.bzl", "fuzzyc_extension")
50 | use_repo(fuzzyc_extension, "fuzzyc")
51 |
52 | com_google_osv_extension = use_extension("//:extensions.bzl", "com_google_osv_extension")
53 | use_repo(com_google_osv_extension, "com_google_osv")
54 |
55 | jsonpath_rw_extension = use_extension("//:extensions.bzl", "jsonpath_rw_extension")
56 | use_repo(jsonpath_rw_extension, "jsonpath-rw-git")
57 |
58 | antlr4_entry_points_extension = use_extension("//:extensions.bzl", "antlr4_entry_points_extension")
59 | use_repo(antlr4_entry_points_extension, "antlr4_entry_points")
60 |
--------------------------------------------------------------------------------
/WORKSPACE.bazel:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | # Bazel workspace file for Vanir.
8 |
9 | workspace(name = "vanir")
10 |
11 | load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive", "http_file")
12 | load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository", "new_git_repository")
13 |
14 | # Install Vanir python dependencies through PIP.
15 | http_archive(
16 | name = "rules_python",
17 | sha256 = "8c15896f6686beb5c631a4459a3aa8392daccaab805ea899c9d14215074b60ef",
18 | strip_prefix = "rules_python-0.17.3",
19 | url = "https://github.com/bazelbuild/rules_python/archive/refs/tags/0.17.3.tar.gz",
20 | )
21 |
22 | load("@rules_python//python:repositories.bzl", "py_repositories", "python_register_toolchains")
23 |
24 | py_repositories()
25 |
26 |
27 | python_register_toolchains(
28 | name = "python3_9",
29 | python_version = "3.9",
30 | )
31 |
32 | load("@rules_python//python:pip.bzl", "pip_parse")
33 | load("@python3_9//:defs.bzl", "interpreter")
34 |
35 | pip_parse(
36 | name = "vanir_deps",
37 | python_interpreter_target = interpreter,
38 | requirements_lock = "//:requirements_lock.txt",
39 | )
40 |
41 | load("@vanir_deps//:requirements.bzl", "install_deps")
42 |
43 | install_deps()
44 |
45 | # Install Fuzzyc and its transitive dependencies.
46 | git_repository(
47 | name = "fuzzyc",
48 | commit = "0f00ba6804c56f5b6d91bc214a91bb05fd17fcda",
49 | remote = "https://third-party-mirror.googlesource.com/fuzzyc",
50 | )
51 |
52 | # Install antlr4 tools
53 | pip_parse(
54 | name = "antlr4_deps",
55 | python_interpreter_target = interpreter,
56 | requirements_lock = "//:requirements_antlr4_lock.txt",
57 | )
58 |
59 | load("@antlr4_deps//:requirements.bzl", install_antlr4_deps = "install_deps")
60 | install_antlr4_deps()
61 |
62 | new_local_repository(
63 | name = "antlr4_entry_points",
64 | path = "vanir/language_parsers/java",
65 | build_file_content = """
66 | alias(
67 | name = "antlr4",
68 | actual = "@antlr4_deps_antlr4_tools//:rules_python_wheel_entry_point_antlr4",
69 | visibility = ["//visibility:public"],
70 | )
71 | """
72 | )
73 |
74 | # Download Antlr4 Java grammar
75 | ANTLR4_JAVA_REV = "c85ec510bd7cfba4649aec1ac2cf66bebd8ce2ed"
76 | http_file(
77 | name = "antlr4_grammar_java_lexer_g4",
78 | url = "https://github.com/antlr/grammars-v4/raw/%s/java/java/JavaLexer.g4" % ANTLR4_JAVA_REV,
79 | downloaded_file_path = "JavaLexer.g4",
80 | sha256 = "9a812eea62aeddc7bd54f8ba9dac4615d0f3f6b98328cf46b4143fdf75ba2c92",
81 | )
82 | http_file(
83 | name = "antlr4_grammar_java_parser_g4",
84 | url = "https://github.com/antlr/grammars-v4/raw/%s/java/java/JavaParser.g4" % ANTLR4_JAVA_REV,
85 | downloaded_file_path = "JavaParser.g4",
86 | sha256 = "0555bd978b2a7e47ec373ee0671cd13f6ba576ca8c26d127fa0b7467dd6df8ce",
87 | )
88 |
89 | # Antlr4 CC Runtime Library.
90 | http_archive(
91 | name = "antlr4_runtimes",
92 | build_file_content = """
93 | package(default_visibility = ["//visibility:public"])
94 | cc_library(
95 | name = "cpp",
96 | srcs = glob(["runtime/Cpp/runtime/src/**/*.cpp"]),
97 | hdrs = glob(["runtime/Cpp/runtime/src/**/*.h"]),
98 | includes = ["runtime/Cpp/runtime/src"],
99 | )
100 | """,
101 | sha256 = "50e87636a61daabd424d884c60f804387430920072f585a9fee2b90e2043fdcc",
102 | strip_prefix = "antlr4-4.11.1",
103 | urls = ["https://github.com/antlr/antlr4/archive/v4.11.1.tar.gz"],
104 | )
105 |
106 | # Google absl
107 | http_archive(
108 | name = "com_google_absl",
109 | sha256 = "3ea49a7d97421b88a8c48a0de16c16048e17725c7ec0f1d3ea2683a2a75adc21",
110 | strip_prefix = "abseil-cpp-20230125.0",
111 | urls = ["https://github.com/abseil/abseil-cpp/archive/refs/tags/20230125.0.tar.gz"],
112 | )
113 |
114 | # GoogleTest
115 | http_archive(
116 | name = "com_google_googletest",
117 | sha256 = "ad7fdba11ea011c1d925b3289cf4af2c66a352e18d4c7264392fead75e919363",
118 | strip_prefix = "googletest-1.13.0",
119 | urls = ["https://github.com/google/googletest/archive/refs/tags/v1.13.0.tar.gz"],
120 | )
121 |
122 | # Pybind11 Bazel extension
123 | git_repository(
124 | name = "pybind11_bazel",
125 | commit = "5f458fa53870223a0de7eeb60480dd278b442698",
126 | remote = "https://github.com/pybind/pybind11_bazel.git",
127 | )
128 |
129 | # Pybind11
130 | new_git_repository(
131 | name = "pybind11",
132 | build_file = "@pybind11_bazel//:pybind11.BUILD",
133 | remote = "https://github.com/pybind/pybind11.git",
134 | tag = "v2.10.4",
135 | )
136 |
137 | load("@pybind11_bazel//:python_configure.bzl", "python_configure")
138 |
139 | python_configure(
140 | name = "local_config_python",
141 | python_interpreter_target = interpreter,
142 | )
143 |
144 | # Pybind11 Abseil. Don't use master - it can be unstable.
145 | git_repository(
146 | name = "pybind11_abseil",
147 | commit = "13d4f99d5309df3d5afa80fe2ae332d7a2a64c6b",
148 | remote = "https://github.com/pybind/pybind11_abseil.git",
149 | )
150 |
151 | http_archive(
152 | name = "com_google_protobuf",
153 | urls = ["https://github.com/google/protobuf/archive/v3.10.0.zip"],
154 | strip_prefix = "protobuf-3.10.0",
155 | )
156 |
157 | # OSV
158 | OSV_REV = "bbb8ab4f0491bf367f8e1406d8ddf9e9dbf5de86"
159 | http_archive(
160 | name = "com_google_osv",
161 | strip_prefix = "osv.dev-%s" % OSV_REV,
162 | build_file_content = """
163 | load("@com_google_protobuf//:protobuf.bzl", "py_proto_library")
164 | load("@rules_proto//proto:defs.bzl", "proto_library")
165 |
166 | package(default_visibility = ["//visibility:public"])
167 |
168 | PROTO_FILES = [
169 | "osv/vulnerability.proto",
170 | ]
171 |
172 | filegroup(
173 | name = "protobuf_files",
174 | srcs = PROTO_FILES,
175 | visibility = ["//visibility:public"],
176 | )
177 |
178 | proto_library(
179 | name = 'vulnerability_proto',
180 | srcs = PROTO_FILES,
181 | deps = [
182 | '@com_google_protobuf//:struct_proto',
183 | '@com_google_protobuf//:timestamp_proto',
184 | ],
185 | visibility = ['//visibility:public'],
186 | )
187 |
188 | py_proto_library(
189 | name = "vulnerability_py_pb2",
190 | visibility = ["//visibility:public"],
191 | srcs = PROTO_FILES,
192 | deps = [
193 | "@com_google_protobuf//:protobuf_python",
194 | ],
195 | )
196 | """,
197 | urls = ["https://github.com/google/osv.dev/archive/%s.tar.gz" % OSV_REV],
198 | )
199 |
200 | load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps")
201 |
202 | protobuf_deps()
203 |
204 |
205 | # jsonpath_rw
206 | git_repository(
207 | name = "jsonpath-rw-git",
208 | build_file_content = """
209 | load("@rules_python//python:defs.bzl", "py_library")
210 | load("@vanir_deps//:requirements.bzl", "requirement")
211 |
212 | py_library(
213 | name = "jsonpath_rw",
214 | visibility = ["//visibility:public"],
215 | srcs = [
216 | "jsonpath_rw/__init__.py",
217 | "jsonpath_rw/jsonpath.py",
218 | "jsonpath_rw/lexer.py",
219 | "jsonpath_rw/parser.py"
220 | ],
221 | srcs_version = "PY3",
222 | deps = [
223 | requirement("six"),
224 | requirement("ply"),
225 | requirement("decorator"),
226 | ],
227 | )
228 | """,
229 | commit = "6f5647bb3ad2395c20f0191fef07a1df51c9fed8",
230 | remote = "https://github.com/kennknowles/python-jsonpath-rw.git",
231 | )
232 |
--------------------------------------------------------------------------------
/docs/images/vanir_detector_report.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/vanir/fe4afbc9215e786b643431694040a076c3af2c64/docs/images/vanir_detector_report.png
--------------------------------------------------------------------------------
/docs/images/vanir_macro_arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/vanir/fe4afbc9215e786b643431694040a076c3af2c64/docs/images/vanir_macro_arch.png
--------------------------------------------------------------------------------
/docs/images/vanir_micro_arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/vanir/fe4afbc9215e786b643431694040a076c3af2c64/docs/images/vanir_micro_arch.png
--------------------------------------------------------------------------------
/extensions.bzl:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | """Extensions for Vanir."""
8 |
9 | load(
10 | "//:repositories.bzl",
11 | "antlr4_entry_points_repo",
12 | "antlr4_grammar_java_lexer_g4_repo",
13 | "antlr4_grammar_java_parser_g4_repo",
14 | "antlr4_runtimes_repo",
15 | "com_google_osv_repo",
16 | "fuzzyc_repo",
17 | "jsonpath_rw_repo",
18 | )
19 |
20 | def _antlr4_grammar_java_lexer_g4_impl(_ctx):
21 | antlr4_grammar_java_lexer_g4_repo()
22 |
23 | antlr4_grammar_java_lexer_g4_extension = module_extension(
24 | implementation = _antlr4_grammar_java_lexer_g4_impl,
25 | )
26 |
27 | def _antlr4_grammar_java_parser_g4_impl(_ctx):
28 | antlr4_grammar_java_parser_g4_repo()
29 |
30 | antlr4_grammar_java_parser_g4_extension = module_extension(
31 | implementation = _antlr4_grammar_java_parser_g4_impl,
32 | )
33 |
34 | def _fuzzyc_impl(_ctx):
35 | fuzzyc_repo()
36 |
37 | fuzzyc_extension = module_extension(implementation = _fuzzyc_impl)
38 |
39 | def _antlr4_runtimes_impl(_ctx):
40 | antlr4_runtimes_repo()
41 |
42 | antlr4_runtimes_extension = module_extension(implementation = _antlr4_runtimes_impl)
43 |
44 | def _com_google_osv_impl(_ctx):
45 | com_google_osv_repo()
46 |
47 | com_google_osv_extension = module_extension(implementation = _com_google_osv_impl)
48 |
49 | def _jsonpath_rw_impl(_ctx):
50 | jsonpath_rw_repo()
51 |
52 | jsonpath_rw_extension = module_extension(implementation = _jsonpath_rw_impl)
53 |
54 | def _antlr4_entry_points_impl(_ctx):
55 | antlr4_entry_points_repo()
56 |
57 | antlr4_entry_points_extension = module_extension(implementation = _antlr4_entry_points_impl)
58 |
--------------------------------------------------------------------------------
/repositories.bzl:
--------------------------------------------------------------------------------
1 | # Copyright 2025 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | """Repositories for Vanir."""
8 |
9 | load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
10 | load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive", "http_file")
11 | load("@bazel_tools//tools/build_defs/repo:local.bzl", "new_local_repository")
12 |
13 | ANTLR4_JAVA_REV = "c85ec510bd7cfba4649aec1ac2cf66bebd8ce2ed"
14 |
15 | def antlr4_grammar_java_parser_g4_repo():
16 | # Download Antlr4 Java grammar - Parser
17 | http_file(
18 | name = "antlr4_grammar_java_parser_g4",
19 | url = "https://github.com/antlr/grammars-v4/raw/%s/java/java/JavaParser.g4" % ANTLR4_JAVA_REV,
20 | downloaded_file_path = "JavaParser.g4",
21 | sha256 = "0555bd978b2a7e47ec373ee0671cd13f6ba576ca8c26d127fa0b7467dd6df8ce",
22 | )
23 |
24 | def antlr4_grammar_java_lexer_g4_repo():
25 | # Download Antlr4 Java grammar - Lexer
26 | http_file(
27 | name = "antlr4_grammar_java_lexer_g4",
28 | url = "https://github.com/antlr/grammars-v4/raw/%s/java/java/JavaLexer.g4" % ANTLR4_JAVA_REV,
29 | downloaded_file_path = "JavaLexer.g4",
30 | sha256 = "9a812eea62aeddc7bd54f8ba9dac4615d0f3f6b98328cf46b4143fdf75ba2c92",
31 | )
32 |
33 | def fuzzyc_repo():
34 | git_repository(
35 | name = "fuzzyc",
36 | commit = "f227d19e433a53e264ec6151c66dd85ec53b4c71",
37 | remote = "https://third-party-mirror.googlesource.com/fuzzyc",
38 | )
39 |
40 | def antlr4_runtimes_repo():
41 | http_archive(
42 | name = "antlr4_runtimes",
43 | build_file_content = """
44 | package(default_visibility = ["//visibility:public"])
45 | cc_library(
46 | name = "cpp",
47 | srcs = glob(["runtime/Cpp/runtime/src/**/*.cpp"]),
48 | hdrs = glob(["runtime/Cpp/runtime/src/**/*.h"]),
49 | includes = ["runtime/Cpp/runtime/src"],
50 | )
51 | """,
52 | sha256 = "50e87636a61daabd424d884c60f804387430920072f585a9fee2b90e2043fdcc",
53 | strip_prefix = "antlr4-4.11.1",
54 | urls = ["https://github.com/antlr/antlr4/archive/v4.11.1.tar.gz"],
55 | )
56 |
57 | def com_google_osv_repo():
58 | # OSV
59 | OSV_REV = "bbb8ab4f0491bf367f8e1406d8ddf9e9dbf5de86"
60 | http_archive(
61 | name = "com_google_osv",
62 | strip_prefix = "osv.dev-%s" % OSV_REV,
63 | build_file_content = """
64 | load("@com_google_protobuf//bazel:py_proto_library.bzl", "py_proto_library")
65 | load("@rules_proto//proto:defs.bzl", "proto_library")
66 |
67 | package(default_visibility = ["//visibility:public"])
68 |
69 | PROTO_FILES = [
70 | "osv/vulnerability.proto",
71 | ]
72 |
73 | filegroup(
74 | name = "protobuf_files",
75 | srcs = PROTO_FILES,
76 | visibility = ["//visibility:public"],
77 | )
78 |
79 | proto_library(
80 | name = 'vulnerability_proto',
81 | srcs = PROTO_FILES,
82 | deps = [
83 | '@com_google_protobuf//:struct_proto',
84 | '@com_google_protobuf//:timestamp_proto',
85 | ],
86 | visibility = ['//visibility:public'],
87 | )
88 |
89 | py_proto_library(
90 | name = "vulnerability_py_pb2",
91 | deps = [
92 | "vulnerability_proto",
93 | ],
94 | )
95 | """,
96 | urls = ["https://github.com/google/osv.dev/archive/%s.tar.gz" % OSV_REV],
97 | )
98 |
99 | def jsonpath_rw_repo():
100 | git_repository(
101 | name = "jsonpath-rw-git",
102 | build_file_content = """
103 | load("@rules_python//python:defs.bzl", "py_library")
104 | load("@vanir_deps//:requirements.bzl", "requirement")
105 |
106 | py_library(
107 | name = "jsonpath_rw",
108 | visibility = ["//visibility:public"],
109 | srcs = [
110 | "jsonpath_rw/__init__.py",
111 | "jsonpath_rw/jsonpath.py",
112 | "jsonpath_rw/lexer.py",
113 | "jsonpath_rw/parser.py"
114 | ],
115 | srcs_version = "PY3",
116 | deps = [
117 | requirement("six"),
118 | requirement("ply"),
119 | requirement("decorator"),
120 | ],
121 | )
122 | """,
123 | commit = "6f5647bb3ad2395c20f0191fef07a1df51c9fed8",
124 | remote = "https://github.com/kennknowles/python-jsonpath-rw.git",
125 | )
126 |
127 | def antlr4_entry_points_repo():
128 | new_local_repository(
129 | name = "antlr4_entry_points",
130 | path = "vanir/language_parsers/java",
131 | build_file_content = """
132 | load("@rules_python//python/entry_points:py_console_script_binary.bzl", "py_console_script_binary")
133 |
134 | py_console_script_binary(
135 | name = "antlr4",
136 | pkg = "@antlr4_deps//antlr4_tools",
137 | script = "antlr4",
138 | visibility = ["//visibility:public"],
139 | )
140 | """,
141 | )
142 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Vanir direct dependencies
2 | requests
3 | absl-py
4 | mmh3
5 | unidiff
6 | jinja2
7 | typing_extensions>=4,<5
8 | python-dateutil
9 |
10 | # jsonpath_rw dependencies
11 | six
12 | ply
13 | decorator
14 |
--------------------------------------------------------------------------------
/requirements_antlr4.txt:
--------------------------------------------------------------------------------
1 | antlr4-tools==0.2
2 |
--------------------------------------------------------------------------------
/requirements_antlr4_lock.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This file is autogenerated by pip-compile with Python 3.10
3 | # by the following command:
4 | #
5 | # pip-compile --output-file=requirements_antlr4_lock.txt requirements_antlr4.txt
6 | #
7 | antlr4-tools==0.2
8 |     # via -r requirements_antlr4.txt
9 | install-jdk==0.3.0
10 | # via antlr4-tools
11 |
--------------------------------------------------------------------------------
/requirements_lock.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This file is autogenerated by pip-compile with Python 3.9
3 | # by the following command:
4 | #
5 | # pip-compile --output-file=requirements_lock.txt requirements.txt
6 | #
7 | absl-py==2.1.0
8 | # via -r requirements.txt
9 | certifi==2024.12.14
10 | # via requests
11 | charset-normalizer==3.4.1
12 | # via requests
13 | decorator==5.1.1
14 | # via -r requirements.txt
15 | idna==3.10
16 | # via requests
17 | jinja2==3.1.5
18 | # via -r requirements.txt
19 | markupsafe==3.0.2
20 | # via jinja2
21 | mmh3==5.0.1
22 | # via -r requirements.txt
23 | ply==3.11
24 | # via -r requirements.txt
25 | python-dateutil==2.9.0.post0
26 | # via -r requirements.txt
27 | requests==2.32.3
28 | # via -r requirements.txt
29 | six==1.17.0
30 | # via
31 | # -r requirements.txt
32 | # python-dateutil
33 | typing-extensions==4.12.2
34 | # via -r requirements.txt
35 | unidiff==0.7.5
36 | # via -r requirements.txt
37 | urllib3==2.3.0
38 | # via requests
39 |
--------------------------------------------------------------------------------
/vanir/cache/BUILD.bazel:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | # Package for Vanir cached data.
8 | package(default_visibility = ["//visibility:public"])
9 |
10 | filegroup(
11 | name = "ecosystem_file_lists",
12 | srcs = [
13 | "ecosystem_file_lists.json",
14 | ],
15 | )
16 |
--------------------------------------------------------------------------------
/vanir/code_extractors/BUILD.bazel:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | # Bazel build rules for Vanir code extractors.
8 | load("@rules_python//python:defs.bzl", "py_library", "py_test")
9 | load("@vanir_deps//:requirements.bzl", "requirement")
10 |
11 | package(default_visibility = [
12 | "//visibility:public",
13 | ])
14 |
15 | py_library(
16 | name = "code_extractor_base",
17 | srcs = ["code_extractor_base.py"],
18 | deps = [
19 | "//:vulnerability",
20 | requirement("requests"),
21 | requirement("unidiff"),
22 | ],
23 | )
24 |
25 | py_library(
26 | name = "code_extractor",
27 | srcs = ["code_extractor.py"],
28 | deps = [
29 | ":code_extractor_android",
30 | ":code_extractor_base",
31 | "//:vulnerability",
32 | requirement("requests"),
33 | ],
34 | )
35 |
36 | py_library(
37 | name = "code_extractor_android",
38 | srcs = ["code_extractor_android.py"],
39 | deps = [
40 | ":code_extractor_base",
41 | "//:vulnerability",
42 | requirement("requests"),
43 | requirement("unidiff"),
44 | ],
45 | )
46 |
47 | py_test(
48 | name = "code_extractor_test",
49 | srcs = ["code_extractor_test.py"],
50 | data = [
51 | "//vanir/testdata:test_patch_set",
52 | ],
53 | deps = [
54 | ":code_extractor",
55 | ":code_extractor_base",
56 | "//:vulnerability",
57 | requirement("absl-py"),
58 | ],
59 | )
60 |
61 | py_test(
62 | name = "code_extractor_android_test",
63 | srcs = ["code_extractor_android_test.py"],
64 | data = [
65 | "//vanir/testdata:test_patch_set",
66 | ],
67 | deps = [
68 | ":code_extractor_android",
69 | ":code_extractor_base",
70 | "//:vulnerability",
71 | requirement("absl-py"),
72 | requirement("requests"),
73 | ],
74 | )
75 |
--------------------------------------------------------------------------------
/vanir/code_extractors/code_extractor.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | """Extracts code snippets and metadata needed for CVE signature generation.
8 |
9 | This module contains utility classes and functions to extract code snippets and
10 | metadata of CVEs such as patch files (i.e., file diff) and unpatched files.
11 | """
12 |
13 | from typing import Collection, Optional, Sequence, Tuple, Type, TypeVar
14 |
15 | import requests
16 | from vanir import vulnerability
17 | # Simply importing the extractors will register them as subclasses of the
18 | # abstract extractor class and therefore available for use.
19 | # pylint: disable=unused-import
20 | from vanir.code_extractors import code_extractor_android
21 | # pylint: enable=unused-import
22 | from vanir.code_extractors import code_extractor_base
23 |
24 | _P = TypeVar('_P', bound=code_extractor_base.AbstractCodeExtractor)
25 |
26 | OSV_ID = 'id'
27 | REF_URL = 'url'
28 | REF_TYPE = 'type'
29 | REF_TYPE_FIX = 'FIX'
30 | VULN_AFFECTED = 'affected'
31 | AFFECTED_PACKAGE = 'package'
32 | PACKAGE_NAME = 'name'
33 | PACKAGE_ECOSYSTEM = 'ecosystem'
34 | AFFECTED_ECOSYSTEM_SPECIFIC = 'ecosystem_specific'
35 |
36 |
37 | class DuplicatedCodeExtractorError(Exception):
38 | pass
39 |
40 |
def _get_extractor_class(ecosystem: str) -> Optional[Type[_P]]:
  """Returns the extractor class for the given ecosystem, or None."""
  # Every extractor registers itself by subclassing AbstractCodeExtractor;
  # collect the ones claiming support for this ecosystem.
  matches = [
      cls
      for cls in code_extractor_base.AbstractCodeExtractor.__subclasses__()
      if cls.is_supported_ecosystem(ecosystem)
  ]
  if len(matches) > 1:
    raise DuplicatedCodeExtractorError(
        'Multiple code extractors supported ecosystem "%s": %s' %
        (ecosystem, matches))
  return matches[0] if matches else None
56 |
57 |
def extract_for_affected_entry(
    affected: vulnerability.AffectedEntry,
    session: Optional[requests.sessions.Session] = None,
) -> Tuple[Sequence[code_extractor_base.Commit],
           Sequence[code_extractor_base.FailedCommitUrl]]:
  """Extracts fix commit data for the given OSV affected entry.

  For each fix commit, the extractor retrieves:
    1. the commit message
    2. the per-file patch (diff)
    3. the unmodified and modified versions of the files changed by the patch

  Args:
    affected: the OSV affected entry to extract fixes for.
    session: requests session to use for retrieving files and patches. If
      None, a new session will be used.

  Returns:
    A tuple where the first item is the list of |Commit| objects pertaining
    to the given |vuln|, and the second item is the list of URLs found but
    failed to be converted to |Commit| objects.

  Raises:
    NotImplementedError: if no registered extractor supports the entry's
      ecosystem.
  """
  extractor_class = _get_extractor_class(affected.ecosystem)
  if extractor_class is None:
    raise NotImplementedError(f'Unsupported ecosystem: {affected.ecosystem}')
  extractor = extractor_class(session)
  return extractor.extract_commits_for_affected_entry(affected)
84 |
85 |
def extract_files_at_tip_of_unaffected_versions(
    ecosystem: str,
    package_name: str,
    affected_versions: Sequence[str],
    files: Collection[str],
    session: Optional[requests.sessions.Session] = None,
) -> Tuple[
    Sequence[code_extractor_base.Commit],
    Sequence[code_extractor_base.FailedCommitUrl],
]:
  """Extracts the given files at the tips of unaffected package versions.

  This function checks the list of given affected versions, determines the
  active branch tips that are not mentioned in that list, and extracts the
  listed files at each of those tips.

  Args:
    ecosystem: the ecosystem of the package.
    package_name: the name of the package.
    affected_versions: the list of affected versions of the package. Tips of
      versions not in this list will be extracted.
    files: the list of files to include.
    session: requests session to use for retrieving files and patches. If
      None, a new session will be used.

  Returns:
    A tuple where the first item is the list of |Commit| objects pertaining
    to the tip of a version not mentioned in |affected_versions|, and the
    second item is the list of tip URLs that failed to be converted to
    |Commit| objects.

  Raises:
    NotImplementedError: if no registered extractor supports |ecosystem|.
  """
  extractor_class = _get_extractor_class(ecosystem)
  if not extractor_class:
    raise NotImplementedError(f'Unsupported ecosystem: {ecosystem}')
  return extractor_class(session).extract_files_at_tip_of_unaffected_versions(
      package_name, affected_versions, files,
  )
122 |
--------------------------------------------------------------------------------
/vanir/code_extractors/code_extractor_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | """Tests for code_extractor."""
8 |
9 | from vanir import vulnerability
10 | from vanir.code_extractors import code_extractor
11 | from vanir.code_extractors import code_extractor_base
12 |
13 | from absl.testing import absltest
14 | from absl.testing import parameterized
15 |
16 | _TEST_COMMIT = 'abcdef0000000000000000000000000000000000'
17 | _TEST_PARENT_COMMIT = 'fedcba1111111111111111111111111111111111'
18 |
19 | _ANDROID_PATCH_URL_BASE = 'https://android.googlesource.com/kernel/common/+/'
20 | _TEST_ANDROID_COMMIT_URL = _ANDROID_PATCH_URL_BASE + _TEST_COMMIT
21 |
22 |
class CodeExtractorTest(parameterized.TestCase):
  """Tests for the ecosystem-dispatching entry points in code_extractor."""

  @absltest.mock.patch.object(
      code_extractor_base, 'Commit', autospec=True, instance=True
  )
  @absltest.mock.patch.object(code_extractor_base, 'AbstractCodeExtractor')
  def test_extract(self, mock_extractor_class, mock_commit):
    # Override __subclasses__ so the mock class is the only "registered"
    # extractor. MagicMock binds assigned magic methods so the lambda is
    # called with the mock itself as |self|.
    mock_extractor_class.__subclasses__ = lambda self: [mock_extractor_class]
    mock_extractor_class.is_supported_ecosystem.side_effect = (
        lambda s: True if s == 'test_ecosystem' else False
    )

    # Calling the mock class always yields the same mock instance, so
    # configuring the return value here also configures the instance created
    # inside extract_for_affected_entry().
    mock_extractor_class(
        None
    ).extract_commits_for_affected_entry.return_value = ([mock_commit], [])
    test_affected = vulnerability.AffectedEntry(
        {'package': {'ecosystem': 'test_ecosystem', 'name': 'pkg'}}
    )
    commits, failures = code_extractor.extract_for_affected_entry(test_affected)
    self.assertEmpty(failures)
    self.assertListEqual(commits, [mock_commit])

  def test_extract_with_no_package(self):
    # An entry without package info is rejected (presumably by AffectedEntry
    # itself — the error is raised before extractor dispatch).
    with self.assertRaisesRegex(ValueError, 'Missing package info.*'):
      code_extractor.extract_for_affected_entry(
          vulnerability.AffectedEntry({})
      )

  @absltest.mock.patch.object(code_extractor_base, 'AbstractCodeExtractor')
  def test_extract_with_no_patch_found(self, mock_extractor_class):
    mock_extractor_class.__subclasses__ = lambda self: [mock_extractor_class]
    mock_extractor_class.is_supported_ecosystem.side_effect = (
        lambda s: True if s == 'test_ecosystem' else False
    )
    # The extractor reports neither commits nor failures; both outputs must
    # come back empty rather than raising.
    mock_extractor_class(
        None
    ).extract_commits_for_affected_entry.return_value = ([], [])
    test_affected = vulnerability.AffectedEntry(
        {'package': {'ecosystem': 'test_ecosystem', 'name': 'pkg'}}
    )
    commits, failures = code_extractor.extract_for_affected_entry(test_affected)
    self.assertEmpty(commits)
    self.assertEmpty(failures)

  def test_extract_with_unsupported_ecosystem(self):
    # No extractor is mocked in, so subclass lookup finds no match for this
    # ecosystem and dispatch must fail.
    test_affected = vulnerability.AffectedEntry(
        {'package': {'ecosystem': 'unknown_ecosystem', 'name': 'pkg'}}
    )
    with self.assertRaises(NotImplementedError):
      _, _ = code_extractor.extract_for_affected_entry(test_affected)

  @absltest.mock.patch.object(
      code_extractor_base, 'Commit', autospec=True, instance=True
  )
  @absltest.mock.patch.object(code_extractor_base, 'AbstractCodeExtractor')
  def test_extract_files_at_tip_of_unaffected_versions(
      self, mock_extractor_class, mock_commit,
  ):
    # Same mock-registration pattern as test_extract above.
    mock_extractor_class.__subclasses__ = lambda self: [mock_extractor_class]
    mock_extractor_class.is_supported_ecosystem.side_effect = (
        lambda s: True if s == 'test_ecosystem' else False
    )
    mock_extractor_class(
        None
    ).extract_files_at_tip_of_unaffected_versions.return_value = (
        [mock_commit], []
    )

    commits, failures = (
        code_extractor.extract_files_at_tip_of_unaffected_versions(
            'test_ecosystem', 'test_package', ['1.0.0'], ['file1'], None,
        )
    )
    self.assertEmpty(failures)
    self.assertListEqual(commits, [mock_commit])

  def test_extract_files_at_tip_of_unaffected_versions_unsupported_ecosystem(
      self
  ):
    # Without the mock extractor patched in, 'test_ecosystem' is unsupported.
    with self.assertRaises(NotImplementedError):
      code_extractor.extract_files_at_tip_of_unaffected_versions(
          'test_ecosystem', 'test_package', ['1.0.0'], ['file1', 'file2'], None,
      )
106 |
107 |
# Run the test suite when this file is executed directly.
if __name__ == '__main__':
  absltest.main()
110 |
--------------------------------------------------------------------------------
/vanir/file_list_manager.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | """Module for managing known files for each ecosystem/package.
8 |
9 | This module manages lists of known files for each ecysostem & package needed
10 | for calculating truncated path level.
11 | """
12 |
13 | import collections
14 | import enum
15 | import json
16 | from typing import Mapping, Sequence
17 |
18 | from vanir import parser
19 |
20 |
21 |
# Git filesystem service constants.
# NOTE(review): these appear unused within this module — possibly left over
# from an internal-only retrieval path; confirm before removing.
_GITFS_TIMEOUT_SEC = 60
_GITFS_ADDR = 'blade:git'

ANDROID_ECOSYSTEM = 'Android'
KERNEL_PACKAGE = ':linux_kernel:'
# Identifier of the Android mainline kernel project (host:repo:ref: format).
_MAINLINE_KERNEL_PROJECT = 'android:kernel/common:refs/heads/android-mainline:'

# (ecosystem, package, project) triples with known reference file lists.
# NOTE(review): not referenced in this module — presumably consumed by the
# cache-regeneration tooling; confirm.
_KNOWN_SOURCES = [(ANDROID_ECOSYSTEM, KERNEL_PACKAGE, _MAINLINE_KERNEL_PROJECT)]

# Path of the bundled file-list cache, relative to the run directory.
ECOSYSTEM_FILE_LISTS_CACHE = (
    'vanir/cache/ecosystem_file_lists.json'
)
34 |
35 |
@enum.unique
class Source(enum.Enum):
  """Supported sources for retrieving reference file lists."""
  CACHE = 'cache'


def get_file_lists(
    source: Source = Source.CACHE,
) -> Mapping[str, Mapping[str, Sequence[str]]]:
  """Returns reference file lists for signature generation.

  Args:
    source: source to retrieve file lists.

  Returns:
    Reference file list map where the first key is ecosystem, the second key is
    package name and the value is list of files.

  Raises:
    ValueError: if |source| is not a known file list source.
  """
  if source == Source.CACHE:
    # Use a context manager so the cache file handle is always closed; the
    # previous implementation opened the file without ever closing it.
    with open(ECOSYSTEM_FILE_LISTS_CACHE, mode='rb') as cache_file:
      return json.load(cache_file)
  raise ValueError('Unknown file list source: %s' % source)
59 |
--------------------------------------------------------------------------------
/vanir/file_list_manager_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | """Test for file list manager module."""
8 |
9 | import json
10 |
11 | from vanir import file_list_manager
12 |
13 | from absl.testing import absltest
14 |
# Fixture data for file-list handling tests.
_TEST_SUPPORTED_FILE_LIST = ['foo.c', 'bar.c']
_TEST_UNSUPPORTED_FILE_LIST = ['unsupported_filetype.asp']
_TEST_FILE_LIST = _TEST_SUPPORTED_FILE_LIST + _TEST_UNSUPPORTED_FILE_LIST
_TEST_SHA = 'abcdef1234567890'
# NOTE(review): only the cache-based tests below are present; this JSON
# fixture (and _TEST_SHA above) appear unused in this file — confirm before
# removing.
_TEST_FILE_LISTS_JSON_STR = json.dumps(
    {'Android': {':linux_kernel:': _TEST_SUPPORTED_FILE_LIST}}
)
22 |
23 |
class FileListManagerTest(absltest.TestCase):
  """Tests for cache-backed reference file list retrieval."""

  def test_get_file_lists_with_cache(self):
    # The bundled cache is expected to carry a very large Android kernel list.
    lists = file_list_manager.get_file_lists(file_list_manager.Source.CACHE)
    kernel_files = lists.get('Android', {}).get(':linux_kernel:')
    self.assertGreater(len(kernel_files), 50000)

  def test_get_file_lists_fail_with_unknown_source(self):
    # Anything that is not a Source enum member must be rejected.
    with self.assertRaises(ValueError):
      file_list_manager.get_file_lists('unknown_source')
36 |
37 |
# Run the test suite when this file is executed directly.
if __name__ == '__main__':
  absltest.main()
40 |
--------------------------------------------------------------------------------
/vanir/hasher.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | """Hasher to generate signature hashes for the given code snippets."""
8 |
9 | import functools
10 | from typing import Mapping, Optional, Sequence, Tuple
11 |
12 | from absl import logging
13 | import mmh3
14 |
# For experimental purpose, the n-gram size can be adjusted, but note that any
# change on the line n-gram size requires regeneration of entire signatures.
# Generally, decreasing n-gram size may end up with increase of findings,
# including both true positives and false positives.
_LINE_SIGNATURE_NGRAM_SIZE = 4

# Murmur3 128-bit x64 hash with a fixed seed and unsigned output, so computed
# signatures are stable across runs and platforms.
_HASH = functools.partial(mmh3.hash128, seed=0, x64arch=True, signed=False)
22 |
23 |
24 | class _LineNgram:
25 | """Class for maintaining an n-gram where the units are code lines."""
26 |
27 | def __init__(self,
28 | normalized_code: Mapping[int, str],
29 | line_numbers: Sequence[int],
30 | is_first: Optional[bool] = False,
31 | is_last: Optional[bool] = False):
32 | """Initializes the line n-gram.
33 |
34 | The first and last ngram of a file must be explicitly marked through
35 | |is_first| and |is_last| in order to cover patch hunks adding lines at the
36 | top or the bottom of the file. Internally, the first ngram will be regarded
37 | as ranging from the line number negative infinity, and the last ngram
38 | ranging to the line number infinity.
39 |
40 | Args:
41 | normalized_code: dictionary of normalized code lines including (but not
42 | limited to) the lines for the n-gram. Each key is a line number, and the
43 | value is normalized line in string.
44 | line_numbers: the list of line numbers comprising the n-gram. Each line
45 | number must be a valid line number existing in |normalized_code|.
46 | is_first: True if the n-gram is the first n-gram of the target file.
47 | is_last: True if the n-gram is the last n-gram of the target file.
48 |
49 | Raises:
50 | ValueError: raises value error if any line number in |line_numbers| is not
51 | a valid line number in |normalized_code|.
52 | """
53 |
54 | self._normalized_code = normalized_code
55 | self._line_numbers = sorted(line_numbers)
56 | self._is_first = is_first
57 | self._is_last = is_last
58 |
59 | def is_overlapping(self, line_range: Tuple[int, int]) -> bool:
60 | """Returns true if the line ngram range overlaps with given |line_range|."""
61 | if self._is_first and self._is_last:
62 | # This n-gram is the first and the last n-gram, covering the entire file.
63 | return True
64 |
65 | range_start, range_end = line_range
66 | if range_start > range_end:
67 | raise ValueError(f'line_range: start ({range_start}) cannot be greater '
68 | f'than end ({range_end})')
69 |
70 | # For given ranges r1 and r2, if r1.start <= r2.end && r1.end >= r2.start,
71 | # r1 and r2 overlaps.
72 | if self._is_first:
73 | return self._line_numbers[-1] >= range_start
74 | if self._is_last:
75 | return self._line_numbers[0] <= range_end
76 | return (self._line_numbers[0] <= range_end and
77 | self._line_numbers[-1] >= range_start)
78 |
79 | def get_ngram_string(self) -> str:
80 | """Returns the actual string of the n-gram."""
81 | try:
82 | return ' '.join([
83 | self._normalized_code[line_number]
84 | for line_number in self._line_numbers
85 | ])
86 | except KeyError as e:
87 | raise KeyError(
88 | f'Invalid line numbers for ngram: {self._line_numbers}. This is a '
89 | 'bug and should never have happened. A _LineNgram object should only '
90 | 'be initialized with line_numbers being a subset of normalized_code.'
91 | ) from e
92 |
93 | def get_line_numbers(self) -> Sequence[int]:
94 | """Returns the line numbers comprising the n-gram."""
95 | return self._line_numbers
96 |
97 |
def hash_function_chunk(normalized_code: str) -> int:
  """Computes the signature hash for a normalized function chunk.

  A function chunk signature is simply the Murmur3 128-bit x64 hash of the
  normalized function code.

  Args:
    normalized_code: a normalized function code in string.

  Returns:
    The 128-bit hash in integer.
  """
  chunk_hash = _HASH(normalized_code)
  return chunk_hash
111 |
112 |
def hash_line_chunk(
    normalized_code: Mapping[int, str],
    affected_line_ranges: Sequence[Tuple[int, int]]
) -> Tuple[Sequence[int], Sequence[int]]:
  """Computes hash for the normalized code of a line chunk.

  A line chunk signature is a set of n-gram line hashes. Each n-gram consists
  of affected lines and their context lines (up to n - 1 lines before and
  after the affected lines). Note that any empty lines / comment lines are not
  regarded as valid lines so the actual context lines can be located further
  than n - 1 lines of an affected line.

  Args:
    normalized_code: a normalized code of a line chunk.
    affected_line_ranges: list of the ranges indicating the lines changed by the
      patch in the chunk's target file. The line numbers are based on the
      unpatched file. Inclusive.

  Returns:
    A tuple of the hash list and used line list. The hash list is a list of
    128-bit line n-gram hashes. The used line list is a list of integer line
    numbers used as elements of the n-grams.
  """
  valid_line_numbers = sorted(normalized_code.keys())
  if not valid_line_numbers:
    # Log at WARNING rather than DEBUG: an empty chunk is unusual, and the
    # unit test asserts this message via assertLogs(level=WARNING), which a
    # debug-level record would never satisfy.
    logging.warning('No valid line found from the normalized code. Returning '
                    'empty lists.')
    return [], []

  if not affected_line_ranges:
    # If no affected line range is specified, regard all lines as affected.
    affected_line_ranges = [
        (valid_line_numbers[0], valid_line_numbers[-1])
    ]

  # Make a list of all valid line ngrams.
  ngrams = []
  if len(valid_line_numbers) < _LINE_SIGNATURE_NGRAM_SIZE:
    # If the number of valid lines in a file is shorter than n-gram size,
    # just use all valid lines.
    ngrams.append(
        _LineNgram(
            normalized_code, valid_line_numbers, is_first=True, is_last=True))
  else:
    # Sliding window of n consecutive valid lines; the first and last windows
    # are flagged so hunks touching the file edges still match.
    ngram_first_line_indices = range(
        len(valid_line_numbers) - _LINE_SIGNATURE_NGRAM_SIZE + 1)
    for line_index in ngram_first_line_indices:
      ngram_line_numbers = valid_line_numbers[
          line_index:_LINE_SIGNATURE_NGRAM_SIZE + line_index]
      is_first = line_index == ngram_first_line_indices[0]
      is_last = line_index == ngram_first_line_indices[-1]
      ngrams.append(
          _LineNgram(normalized_code, ngram_line_numbers, is_first, is_last))

  # For only "valid & affected" ngrams, compute ngram hashes. Each ngram is
  # consumed at most once: iterate over a copy so removal is safe mid-loop.
  line_hashes = []
  used_lines = set()
  for affected_range in affected_line_ranges:
    for ngram in ngrams.copy():
      if ngram.is_overlapping(affected_range):
        ngram_hash = _HASH(ngram.get_ngram_string())
        line_hashes.append(ngram_hash)
        used_lines.update(ngram.get_line_numbers())
        ngrams.remove(ngram)

  return line_hashes, sorted(used_lines)
179 |
--------------------------------------------------------------------------------
/vanir/hasher_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | """Tests for hasher."""
8 |
9 | import functools
10 |
11 | from absl import logging
12 | import mmh3
13 | from vanir import hasher
14 |
15 | from absl.testing import absltest
16 | from absl.testing import parameterized
17 |
18 |
# Normalized line-chunk fixture: keys are line numbers of the original file.
# Gaps in the numbering are lines dropped during normalization (presumably
# blank/comment lines — see hash_line_chunk's docstring).
_TEST_LINE_CHUNK_NORMALIZED_CODE = {
    5: 'void __init testdev_init ( void )',
    6: '{',
    7: 'memset ( cdev , 0 , sizeof * cdev ) ;',
    8: 'init_list_head ( & cdev -> list ) ;',
    9: 'kobject_init ( & cdev -> kobj , & ktype_cdev_default ) ;',
    10: 'cdev -> ops = fops ;',
    11: '}',
    19: 'cdev_map = kobj_map_init ( base_probe , & testdevs_lock ) ;',
    20: '}',
    24: 'export_symbol ( register_testdev_region ) ;',
    34: 'export_symbol ( __register_testdev ) ;',
    35: 'export_symbol ( __unregister_testdev ) ;'
}
33 |
34 |
class HasherTest(parameterized.TestCase):
  """Tests for function-chunk and line-chunk signature hashing."""

  def setUp(self):
    # Mirror of hasher's internal _HASH so tests can compute expected values.
    self._hash = functools.partial(
        mmh3.hash128, seed=0, x64arch=True, signed=False)
    super().setUp()

  def test_function_chunk_hash(self):
    test_normalized_code = (
        'DTYPE FUNCNAME ( const unsigned DTYPE PARAM ) { const DTYPE '
        '* VAR = ( DTYPE * ) globalvar -> data '
        '; FUNCCALL ( PARAM , VAR ) ; 0xe8 ( ) ; return 0 ; }')

    function_hash = hasher.hash_function_chunk(test_normalized_code)

    # A function chunk signature is the hash of the whole normalized string.
    expected_function_hash = self._hash(test_normalized_code)
    self.assertEqual(function_hash, expected_function_hash)

  @parameterized.named_parameters(
      dict(
          testcase_name='with_no_affected_ranges',
          normalized_code=_TEST_LINE_CHUNK_NORMALIZED_CODE,
          affected_ranges=[],
          expected_used_lines=[5, 6, 7, 8, 9, 10, 11, 19, 20, 24, 34, 35]),
      dict(
          testcase_name='with_affected_ranges_in_middle',
          normalized_code=_TEST_LINE_CHUNK_NORMALIZED_CODE,
          affected_ranges=[(9, 10)],
          expected_used_lines=[6, 7, 8, 9, 10, 11, 19, 20]),
      dict(
          testcase_name='with_affected_ranges_at_file_start',
          normalized_code=_TEST_LINE_CHUNK_NORMALIZED_CODE,
          affected_ranges=[(0, 0)],
          expected_used_lines=[5, 6, 7, 8]),
      dict(
          testcase_name='with_affected_ranges_at_file_end',
          normalized_code=_TEST_LINE_CHUNK_NORMALIZED_CODE,
          affected_ranges=[(50, 50)],
          expected_used_lines=[20, 24, 34, 35]),
      dict(
          testcase_name='with_short_normalized_code',
          normalized_code={
              3: '#define AUDIT_NAMES 5',
              4: '#define auditsc_get_stamp ( c , t , s ) 0'
          },
          affected_ranges=[],
          expected_used_lines=[3, 4]))
  def test_line_chunk_hash(self, normalized_code, affected_ranges,
                           expected_used_lines):
    """Tests various successful cases of line chunk hash generation.

    Args:
      normalized_code: the normalized code to test.
      affected_ranges: the affected ranges to test with for the normalized code.
      expected_used_lines: expected lines to be used for signature hash
        generation. This value varies depending on |affected_ranges| but this
        test explicitly requires this arg because we want to test the
        corresponding logic in the main code rather than to run the identical
        logic again in the test.
    """
    # Rebuild the expected sliding 4-line n-grams over the expected used
    # lines; if fewer than 4 lines exist, the whole chunk forms one n-gram.
    expected_hashes = []
    expected_line_number_ngrams = []
    index = 0
    while index + 3 < len(expected_used_lines):
      expected_line_number_ngrams.append(expected_used_lines[index:index + 4])
      index += 1
    if not expected_line_number_ngrams:
      expected_line_number_ngrams.append(normalized_code.keys())

    for line_numbers in expected_line_number_ngrams:
      # N-gram strings are the space-joined normalized lines, in line order.
      ngram = ' '.join(
          [normalized_code[line_number] for line_number in line_numbers])
      expected_hashes.append(self._hash(ngram))

    line_hashes, used_lines = hasher.hash_line_chunk(normalized_code,
                                                     affected_ranges)

    self.assertCountEqual(expected_used_lines, used_lines)
    self.assertEqual(expected_hashes, line_hashes)

  def test_line_chunk_hash_with_empty_normalized_code_is_warned(self):
    # NOTE(review): hasher currently emits this message via logging.debug;
    # assertLogs(level=WARNING) needs a warning-level record — confirm the
    # log levels are consistent between the module and this test.
    test_normalized_code = {}
    with self.assertLogs(level=logging.WARNING) as logs:
      line_hashes, used_lines = hasher.hash_line_chunk(test_normalized_code, [])
    self.assertIn(
        'No valid line found from the normalized code. Returning empty lists.',
        logs.output[0])
    self.assertEmpty(line_hashes)
    self.assertEmpty(used_lines)

  def test_line_ngram_overlap_check_fails_with_reversed_line_range(self):
    # Case unable to be triggered by public class; directly test private class.
    test_ngram_line_numbers = [7, 8, 9, 10]
    test_affected_line_range = (10, 7)
    ngram = hasher._LineNgram(_TEST_LINE_CHUNK_NORMALIZED_CODE,
                              test_ngram_line_numbers)
    expected_error_msg = (
        r'line_range: start \(10\) cannot be greater than end \(7\)')
    with self.assertRaisesRegex(ValueError, expected_error_msg):
      ngram.is_overlapping(test_affected_line_range)
135 |
# Run the test suite when this file is executed directly.
if __name__ == '__main__':
  absltest.main()
138 |
--------------------------------------------------------------------------------
/vanir/integration_tests/BUILD.bazel:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | # Integration tests for Vanir
8 |
load("@rules_python//python:defs.bzl", "py_test")
load("@vanir_deps//:requirements.bzl", "requirement")

package(default_visibility = ["//visibility:public"])

# End-to-end missing-patch detection test. Runs against checked-in fixtures
# (gitiles data, vulnerability entries, and a simplified source tree), which
# is presumably what makes it hermetic — confirm it needs no network access.
py_test(
    name = "missing_patch_detection_hermetic_test",
    srcs = ["missing_patch_detection_hermetic_test.py"],
    data = [
        "//vanir/testdata:test_gitiles_data",
        "//vanir/testdata:test_vulnerabilities",
        "//vanir/testdata:vanir_test_source_simplified",
    ],
    deps = [
        "//:osv_client",
        "//:reporter",
        "//:signature",
        "//:vanir_test_base",
        "//:vulnerability",
        "//:vulnerability_manager",
        "//vanir/scanners:scanner_base",
        "//vanir/scanners:target_selection_strategy",
        "//vanir/testdata:test_signatures",
        requirement("absl-py"),
        requirement("requests"),
    ],
)
36 |
--------------------------------------------------------------------------------
/vanir/language_parsers/BUILD.bazel:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | # Package containing all programming language parsers for Vanir.
8 |
# NOTE(review): "py_test" is loaded but unused in this file — confirm before
# removing it from the load().
load("@rules_python//python:defs.bzl", "py_library", "py_test")

package(default_visibility = ["//visibility:public"])

# Abstract base class that each concrete language parser implements.
py_library(
    name = "abstract_language_parser",
    srcs = [
        "abstract_language_parser.py",
    ],
    deps = [
        ":common",
    ],
)

# Shared parser data types (function/line chunks, parse errors, results).
py_library(
    name = "common",
    srcs = [
        "common.py",
    ],
)

# Aggregator pulling in all concrete language parsers (C/C++ and Java).
py_library(
    name = "language_parsers",
    srcs = ["language_parsers.py"],
    deps = [
        ":abstract_language_parser",
        ":common",
        "//vanir/language_parsers/cpp:cpp_parser",
        "//vanir/language_parsers/java:java_parser",
    ],
)
40 |
--------------------------------------------------------------------------------
/vanir/language_parsers/abstract_language_parser.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | """Abstract Parser base class that all parsers implement.
8 | """
9 |
10 | import abc
11 | from typing import Optional, Sequence, Tuple
12 |
13 | from vanir.language_parsers import common
14 |
15 |
class AbstractLanguageParser(abc.ABC):
  """Abstract language parser interface that all language parsers implement.

  A Parser object parses one file, optionally only on a set of select line
  ranges, and presents methods to extract function and line chunks to Vanir
  signature generation and scanning.

  A Parser supports a set of file extensions, given by each implementation as
  the return value of get_supported_extensions().
  """

  @classmethod
  @abc.abstractmethod
  def get_supported_extensions(cls) -> Sequence[str]:
    """Returns a list of supported file extensions. Should include the dot."""

  def __init__(self, filename: str):
    """Construct the Parser object for given filename.

    This base implementation is a no-op; concrete parsers are expected to
    open and prepare |filename| themselves.

    Args:
      filename: the absolute path to the file to analyze.
    """

  @abc.abstractmethod
  def get_chunks(
      self,
      affected_line_ranges_for_functions: Optional[
          Sequence[Tuple[int, int]]
      ] = None,
  ) -> common.ParseResults:
    """Parse the file and return the line chunk and function chunks.

    Args:
      affected_line_ranges_for_functions: list of line ranges of interest to
        filter function chunks on. A parser should return only functions that
        contains at least one line in this range. If
        affected_line_ranges_for_functions is empty, return all functions.

    Returns:
      A ParseResults object containing all the parsing output.
    """
55 |
--------------------------------------------------------------------------------
/vanir/language_parsers/common.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | """Common data types for Vanir language parser."""
8 |
9 | import dataclasses
10 | from typing import Mapping, Sequence
11 |
12 |
@dataclasses.dataclass(frozen=True)
class FunctionChunkBase:
  """A function extracted by a language parser, together with its metadata.

  Attributes:
    name: the function's name.
    return_types: one entry per return type, each being the sequence of
      tokens spelling that type, e.g. [['struct', 'foo'], ['int']].
    parameters: names of the parameters the function accepts.
    used_data_types: token sequences for every data type the function uses,
      in the same per-type token format as return_types.
    local_variables: names of all local variables.
    called_functions: names of the other functions this function calls.
    tokens: the tokens making up the function body.
  """
  name: str
  return_types: Sequence[Sequence[str]]
  parameters: Sequence[str]
  used_data_types: Sequence[Sequence[str]]
  local_variables: Sequence[str]
  called_functions: Sequence[str]
  tokens: Sequence[str]
35 |
36 |
@dataclasses.dataclass(frozen=True)
class LineChunkBase:
  """Meaningful tokenized lines from a parsed file, keyed by line number."""
  # tokens: maps a line number to the sequence of tokens on that line.
  tokens: Mapping[int, Sequence[str]]
41 |
42 |
@dataclasses.dataclass(frozen=True)
class ParseError:
  """A single error encountered while parsing a file."""
  # line / column: position of the offending token in the parsed file.
  line: int
  column: int
  # bad_token: the token the parser choked on; message: the parser's
  # description of the failure.
  bad_token: str
  message: str
50 |
51 |
@dataclasses.dataclass(frozen=True)
class ParseResults:
  """Aggregated parser output: function chunks, line chunk, and errors."""
  function_chunks: Sequence[FunctionChunkBase]
  line_chunk: LineChunkBase
  parse_errors: Sequence[ParseError]
58 |
--------------------------------------------------------------------------------
/vanir/language_parsers/cpp/BUILD.bazel:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | # Bazel build rules for Vanir parser core.
load("@rules_python//python:defs.bzl", "py_library", "py_test")
load("@vanir_deps//:requirements.bzl", "requirement")

package(default_visibility = [
    "//visibility:public",
])

# Native FuzzyC/Antlr4-based C/C++ parser core.
cc_library(
    name = "parser_core",
    srcs = ["parser_core.cc"],
    hdrs = ["parser_core.h"],
    copts = [
        # fexception is needed only for catching Antlr4 exceptions.
        "-fexceptions",
    ],
    features = ["-use_header_modules"],
    deps = [
        "@com_google_absl//absl/status",
        "@com_google_absl//absl/status:statusor",
        "@com_google_absl//absl/strings",
        "@fuzzyc//:fuzzyc_cc_function",
        "@fuzzyc//:fuzzyc_cc_module",
    ],
)

# Python-facing C/C++ parser; depends on the pybind11 extension module and
# the pybind11_abseil status shim at runtime (via data).
py_library(
    name = "cpp_parser",
    srcs = ["cpp_parser.py"],
    data = [
        "//vanir/language_parsers/cpp/python:parser_core.so",
        "@pybind11_abseil//pybind11_abseil:status.so",
    ],
    deps = [
        "//vanir/language_parsers:abstract_language_parser",
        "//vanir/language_parsers:common",
        requirement("absl-py"),
    ],
)

py_test(
    name = "cpp_parser_test",
    size = "small",
    srcs = ["cpp_parser_test.py"],
    data = ["@pybind11_abseil//pybind11_abseil:status.so"],
    deps = [
        ":cpp_parser",
        requirement("absl-py"),
    ],
)
57 |
--------------------------------------------------------------------------------
/vanir/language_parsers/cpp/cpp_parser.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | """Vanir C/C++ parser.
8 |
9 | This module interfaces with the native Antlr FuzzyC parser.
10 | """
11 | import os
12 | import tempfile
13 | from typing import Iterable, Optional, Sequence, Tuple
14 |
15 | from absl import logging
16 | from vanir.language_parsers import abstract_language_parser
17 | from vanir.language_parsers import common
18 | from vanir.language_parsers.cpp.python import parser_core
19 |
20 | from pybind11_abseil import status
21 |
_ANTLR4_DECODE_ERROR = 'UTF-8 string contains an illegal byte sequence'
# NOTE(review): the misspellings below ('ALTNERNATIVE', 'altneratives',
# 'deocde') are kept verbatim: unit tests patch this name and assert on these
# exact message substrings.
_ALTNERNATIVE_ENCODINGS = ['LATIN-1']


class CppParser(abstract_language_parser.AbstractLanguageParser):
  """Vanir C/C++ parser.

  This class implements the AbstractLanguageParser base class.
  """

  def __init__(self, filename: str):
    """Initializes the native parser core for the given file.

    If the file is not valid UTF-8, it is transcoded to a temporary UTF-8
    copy (trying the known alternative encodings) and parsing is retried on
    the copy. The temporary copy is removed in __del__.

    Args:
      filename: path of the C/C++ source file to parse.

    Raises:
      status.StatusNotOk: on parser-core failures other than the Antlr4
        UTF-8 decoding error.
      ValueError: if no known encoding can decode the file.
    """
    try:
      self.parser_core = parser_core.ParserCore(filename)
      self.parser_core.init()
    except status.StatusNotOk as e:
      if (
          e.code == status.StatusCode.INVALID_ARGUMENT.value
          and e.message == _ANTLR4_DECODE_ERROR
      ):
        # If encoding problem, try again after converting to UTF-8.
        # Fix: pass |filename| for the '%s' placeholder; the original call
        # omitted the argument, so the placeholder was never interpolated.
        logging.info(
            '%s is not encoded in UTF-8. Trying altneratives.', filename
        )
        self._temp_filename = self._convert_to_utf8(filename)
        self.parser_core = parser_core.ParserCore(self._temp_filename)
        self.parser_core.init()
      else:
        raise e

  def __del__(self):
    # Clean up the temporary UTF-8 copy, if __init__ created one.
    if getattr(self, '_temp_filename', None):
      os.unlink(self._temp_filename)

  @classmethod
  def get_supported_extensions(cls) -> Iterable[str]:
    """Returns the file extensions handled by this parser."""
    return ['.c', '.h', '.cc', '.hh', '.cpp', '.hpp', '.cxx', '.hxx']

  @classmethod
  def _convert_to_utf8(cls, filename) -> str:
    """Creates a new file with UTF-8 encoding and returns the file name."""
    for encoding in _ALTNERNATIVE_ENCODINGS:
      try:
        with open(filename, encoding=encoding, mode='r') as file:
          # delete=False: the caller owns the file; __del__ unlinks it.
          new_file = tempfile.NamedTemporaryFile(
              encoding='UTF-8', mode='w', delete=False
          )
          new_file.write(file.read())
          new_file.close()
          return new_file.name
      except ValueError:  # Try other encodings on decoding failure
        continue
    raise ValueError(
        'Failed to deocde %s. Tried encodings: UTF-8, %s'
        % (filename, ', '.join(_ALTNERNATIVE_ENCODINGS))
    )

  def _to_standard_function_chunk_base(
      self, chunk: parser_core.FunctionChunkRaw
  ) -> common.FunctionChunkBase:
    """Converts a raw Pybind function chunk into the common dataclass form."""
    return common.FunctionChunkBase(
        chunk.name,
        # The raw chunk carries a single return type; the common form takes a
        # sequence of return types.
        [chunk.return_type],
        chunk.parameters,
        chunk.used_data_types,
        chunk.local_variables,
        chunk.called_functions,
        chunk.tokens,
    )

  def get_chunks(
      self,
      affected_line_ranges_for_functions: Optional[
          Sequence[Tuple[int, int]]
      ] = None,
  ) -> common.ParseResults:
    """Parses the file and returns function chunks, the line chunk and errors.

    Args:
      affected_line_ranges_for_functions: (start, end) line ranges used to
        filter function chunks; None or empty selects all functions. The line
        chunk always covers the whole file regardless of this filter.

    Returns:
      A common.ParseResults with the extracted chunks and any parse errors.
    """
    if affected_line_ranges_for_functions is None:
      affected_line_ranges_for_functions = []
    function_chunks = [
        self._to_standard_function_chunk_base(function_chunk_raw)
        for function_chunk_raw in self.parser_core.get_function_chunks(
            affected_line_ranges_for_functions)
    ]
    line_chunk = common.LineChunkBase(self.parser_core.get_line_chunk().tokens)
    errors = [
        common.ParseError(e.line, e.column, e.bad_token, e.message)
        for e in self.parser_core.get_parse_errors()
    ]
    return common.ParseResults(function_chunks, line_chunk, errors)
108 |
--------------------------------------------------------------------------------
/vanir/language_parsers/cpp/cpp_parser_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | """Tests for parser_core Pybind wrapped by cpp_parser."""
8 |
9 | from unittest import mock
10 |
11 | from absl import logging
12 | from vanir.language_parsers.cpp import cpp_parser
13 |
14 | from absl.testing import absltest
15 | from pybind11_abseil import status
16 |
17 |
class ParserCoreTest(absltest.TestCase):
  """Tests for the Pybind-wrapped C/C++ parser core via CppParser."""

  def setUp(self):
    super().setUp()
    # Fixture C code: a global, a function spanning string lines 3-9 (line 1
    # is the leading newline), a bare declaration, and an empty definition.
    # Line 7's `0xe8()` is deliberately broken code.
    self.testcode = """
/* This code is for testing Vanir Parser. */ int test_globalvar = 10;
int test_func1(const unsigned int64 test_arg) {
const struct teststruct *testvar = (struct teststruct *)globalvar->data;
test_func2(test_arg, testvar); // some comment.
/* additional comment line. */
0xe8(); // broken code -- won't be counted as func call.
return 0;
}
void test_func_decl(int myarg);
void test_func_def(int myarg) {}
"""
    testfile = self.create_tempfile('testfile.c', content=self.testcode)
    self.test_filename = testfile.full_path
    # Expected line-number -> tokens mapping. Comment-only or blank lines
    # (1 and 6) contribute no tokens and are absent from the mapping.
    self.expected_tokens = {
        2: ['int', 'test_globalvar', '=', '10', ';'],
        3: [
            'int', 'test_func1', '(', 'const', 'unsigned', 'int64', 'test_arg',
            ')', '{'
        ],
        4: [
            'const', 'struct', 'teststruct', '*', 'testvar', '=', '(', 'struct',
            'teststruct', '*', ')', 'globalvar', '->', 'data', ';'
        ],
        5: ['test_func2', '(', 'test_arg', ',', 'testvar', ')', ';'],
        7: ['0xe8', '(', ')', ';'],
        8: ['return', '0', ';'],
        9: ['}'],
        10: ['void', 'test_func_decl', '(', 'int', 'myarg', ')', ';'],
        11: ['void', 'test_func_def', '(', 'int', 'myarg', ')', '{', '}']
    }

  def test_cpp_parser_with_line_limits(self):
    """Range (5, 7) should select only test_func1, which spans lines 3-9."""
    parser = cpp_parser.CppParser(self.test_filename)

    results = parser.get_chunks([(5, 7)])
    self.assertEmpty(results.parse_errors)
    self.assertLen(results.function_chunks, 1)
    self.assertEqual(results.function_chunks[0].name, 'test_func1')
    self.assertEqual(results.function_chunks[0].return_types, [['int']])
    self.assertEqual(results.function_chunks[0].parameters, ['test_arg'])
    self.assertEqual(
        results.function_chunks[0].used_data_types,
        [['const', 'unsigned', 'int64'], ['const', 'struct', 'teststruct'],
         ['struct', 'teststruct']])
    self.assertEqual(results.function_chunks[0].local_variables, ['testvar'])
    # 0xe8() is broken code, so only test_func2 counts as a call.
    self.assertEqual(
        results.function_chunks[0].called_functions, ['test_func2'])

    # The line chunk covers the whole file regardless of the line limits.
    self.assertEqual(results.line_chunk.tokens, self.expected_tokens)

  def test_cpp_parser_without_line_limits(self):
    """No range filter: both definitions returned; declarations are not."""
    parser = cpp_parser.CppParser(self.test_filename)

    results = parser.get_chunks()
    self.assertEmpty(results.parse_errors)
    self.assertLen(results.function_chunks, 2)
    self.assertEqual(results.function_chunks[0].name, 'test_func1')
    self.assertEqual(results.function_chunks[0].parameters, ['test_arg'])
    self.assertEqual(
        results.function_chunks[0].used_data_types,
        [['const', 'unsigned', 'int64'], ['const', 'struct', 'teststruct'],
         ['struct', 'teststruct']])
    self.assertEqual(results.function_chunks[0].local_variables, ['testvar'])
    self.assertEqual(
        results.function_chunks[0].called_functions, ['test_func2'])
    self.assertEqual(results.function_chunks[1].name, 'test_func_def')
    self.assertEqual(results.function_chunks[1].return_types, [['void']])
    self.assertEqual(results.function_chunks[1].parameters, ['myarg'])

    self.assertEqual(results.line_chunk.tokens, self.expected_tokens)

  def test_cpp_parser_with_nonexistent_file_failure(self):
    """The native core surfaces file-open failures as StatusNotOk."""
    filename = 'NonExistingFile.c'
    with self.assertRaisesRegex(status.StatusNotOk, 'Failed to open file:.*'):
      _ = cpp_parser.CppParser(filename)

  def test_cpp_parser_with_non_utf8_file(self):
    """A Latin-1 byte makes the file invalid UTF-8; fallback must succeed."""
    latin1_str = ' // \xE0'
    testfile = self.create_tempfile(
        'testfile_latein1.c',
        content=self.testcode + latin1_str,
        encoding='LATIN-1',
    )
    with self.assertLogs(level=logging.INFO) as logs:
      parser = cpp_parser.CppParser(testfile.full_path)
    results = parser.get_chunks([(5, 7)])
    self.assertEmpty(results.parse_errors)
    self.assertLen(results.function_chunks, 1)
    # Spelling matches cpp_parser's literal message, typo included.
    self.assertIn(
        'is not encoded in UTF-8. Trying altneratives.', logs.output[0]
    )

  def test_cpp_parser_with_known_encoding_file(self):
    """With no alternative encodings left, decoding must fail loudly."""
    latin1_str = ' // \xE0'
    testfile = self.create_tempfile(
        'testfile_latein1.c',
        content=self.testcode + latin1_str,
        encoding='LATIN-1',
    )
    # Delete latin-1 from the alternative encoding.
    with mock.patch.object(cpp_parser, '_ALTNERNATIVE_ENCODINGS', []):
      with self.assertRaisesRegex(ValueError, 'Failed to deocde'):
        cpp_parser.CppParser(testfile.full_path)


if __name__ == '__main__':
  absltest.main()
130 |
--------------------------------------------------------------------------------
/vanir/language_parsers/cpp/python/BUILD.bazel:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | # Build rule for binding C++ parser core to python using Pybind.
8 |
9 | load("@pybind11_bazel//:build_defs.bzl", "pybind_extension")
10 |
package(default_visibility = [
    "//visibility:public",
])

# Pybind11 extension exposing the native C/C++ parser core to Python as
# vanir.language_parsers.cpp.python.parser_core (built as parser_core.so).
pybind_extension(
    name = "parser_core",
    srcs = ["parser_core.cc"],
    deps = [
        "//vanir/language_parsers/cpp:parser_core",
        "@pybind11_abseil//pybind11_abseil:absl_casters",
        "@pybind11_abseil//pybind11_abseil:status_casters",
    ],
)
24 |
--------------------------------------------------------------------------------
/vanir/language_parsers/cpp/python/parser_core.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Google LLC
2 | //
3 | // Use of this source code is governed by a BSD-style
4 | // license that can be found in the LICENSE file or at
5 | // https://developers.google.com/open-source/licenses/bsd
6 |
7 | #include "vanir/language_parsers/cpp/parser_core.h"
8 |
9 | #include "pybind11/pybind11.h"
10 | #include "pybind11_abseil/absl_casters.h"
11 | #include "pybind11_abseil/status_casters.h"
12 |
13 | namespace vanir {
14 | namespace cpp_parser {
15 | namespace {
16 |
17 | PYBIND11_MODULE(parser_core, m) {
18 | pybind11::google::ImportStatusModule();
19 | pybind11::class_(m, "FunctionChunkRaw")
20 | .def_readwrite("name", &FunctionChunk::name_)
21 | .def_readwrite("return_type", &FunctionChunk::return_type_)
22 | .def_readwrite("parameters", &FunctionChunk::parameters_)
23 | // Since individual data type element can be used after freeing the owner
24 | // chunk in Python, access to used_data_types_ transfers the ownership of
25 | // individual data type element to Python.
26 | .def_property_readonly("used_data_types",
27 | [](const FunctionChunk& func_chunk) {
28 | auto used_data_types = pybind11::list();
29 | for (auto& data_type : func_chunk.used_data_types_) {
30 | used_data_types.append(pybind11::cast(
31 | *data_type, pybind11::return_value_policy::take_ownership));
32 | }
33 | return used_data_types;
34 | })
35 | .def_readwrite("local_variables", &FunctionChunk::local_variables_)
36 | .def_readwrite("called_functions", &FunctionChunk::called_functions_)
37 | .def_readwrite("tokens", &FunctionChunk::tokens_);
38 |
39 | pybind11::class_(m, "LineChunkRaw")
40 | .def_readwrite("tokens", &LineChunk::tokens_);
41 |
42 | pybind11::class_(m, "ParseErrorRaw")
43 | .def_readonly("line", &ParseError::line)
44 | .def_readonly("column", &ParseError::column)
45 | .def_readonly("bad_token", &ParseError::bad_token)
46 | .def_readonly("message", &ParseError::message);
47 |
48 | // GetFunctionChunks and GetLineChunk transfer the ownership.
49 | pybind11::class_(m, "ParserCore")
50 | .def(pybind11::init())
51 | .def("init", &ParserCore::Init)
52 | .def("get_function_chunks", &ParserCore::GetFunctionChunks,
53 | pybind11::arg("affected_line_ranges"))
54 | .def("get_line_chunk", &ParserCore::GetLineChunk)
55 | .def("get_parse_errors", &ParserCore::GetParseErrors);
56 | }
57 |
58 | } // namespace
59 | } // namespace cpp_parser
60 | } // namespace vanir
61 |
--------------------------------------------------------------------------------
/vanir/language_parsers/java/BUILD.bazel:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | load("@rules_python//python:defs.bzl", "py_library", "py_test")
8 | load("@vanir_deps//:requirements.bzl", "requirement")
9 | load(":antlr4.external.bzl", "antlr4_cc_gen")
10 |
package(default_visibility = [
    "//visibility:public",
])

# Antlr4 tool version used to generate the Java lexer/parser C++ sources.
ANTLR4_VER = "4.11.1"

# C++ lexer generated from the upstream Java lexer grammar (no listener).
antlr4_cc_gen(
    name = "java_cc_lexer",
    srcs = ["@antlr4_grammar_java_lexer_g4//file"],
    antlr4_ver = ANTLR4_VER,
    cc_files_prefix = "JavaLexer",
    cc_namespace = "java_cc_lexer",
    listener = False,
)

# C++ parser with tree-walk listener classes; the parser grammar also needs
# the lexer grammar as an input.
antlr4_cc_gen(
    name = "java_cc_parser",
    srcs = [
        "@antlr4_grammar_java_lexer_g4//file",
        "@antlr4_grammar_java_parser_g4//file",
    ],
    antlr4_ver = ANTLR4_VER,
    cc_files_prefix = "JavaParser",
    cc_namespace = "java_cc_parser",
    listener = True,
)

# Native Java parser core wrapping the generated Antlr4 lexer/parser.
cc_library(
    name = "parser_core",
    srcs = ["parser_core.cc"],
    hdrs = ["parser_core.h"],
    deps = [
        ":java_cc_lexer",
        ":java_cc_parser",
        "@com_google_absl//absl/log:check",
        "@com_google_absl//absl/status:statusor",
    ],
)

# Python-facing Java parser; loads the Pybind extension at runtime.
py_library(
    name = "java_parser",
    srcs = ["java_parser.py"],
    data = [
        "//vanir/language_parsers/java/python:parser_core.so",
    ],
    deps = [
        "//vanir/language_parsers:abstract_language_parser",
        "//vanir/language_parsers:common",
    ],
)

py_test(
    name = "java_parser_test",
    size = "small",
    srcs = ["java_parser_test.py"],
    # status.so is needed at runtime for pybind11_abseil status exceptions.
    data = ["@pybind11_abseil//pybind11_abseil:status.so"],
    deps = [
        ":java_parser",
        requirement("absl-py"),
    ],
)
72 |
--------------------------------------------------------------------------------
/vanir/language_parsers/java/antlr4.external.bzl:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | """Starlark macro to wrap Antlr4 code and library generation from grammar files."""
8 |
9 | load("@antlr4_deps//:requirements.bzl", "requirement")
10 |
def antlr4_cc_gen(name, srcs, cc_namespace, cc_files_prefix, antlr4_ver, listener):
    """Generates the C++ source corresponding to an Antlr4 lexer definition.

    Args:
      name: name of the parser/lexer library target
      srcs: grammar files
      cc_namespace: C++ namespace to put the parser/lexer under
      cc_files_prefix: prefix for all generated C++ files
      antlr4_ver: specify antlr4 tools version
      listener: whether to generate antlr4 listener classes
    """

    # Antlr4 always emits <prefix>.h/.cpp; listener mode adds the listener
    # base/interface classes as separate sources.
    out_src_files = [
        "%s.h" % cc_files_prefix,
        "%s.cpp" % cc_files_prefix,
    ]
    if listener:
        out_src_files += [
            "%sBaseListener.h" % cc_files_prefix,
            "%sBaseListener.cpp" % cc_files_prefix,
            "%sListener.h" % cc_files_prefix,
            "%sListener.cpp" % cc_files_prefix,
        ]
    extra_args = "-listener" if listener else "-no-listener"

    # Shell command run by the genrule. `$$` escapes `$` for genrule Make
    # variable expansion, so $$VANIR_ANTLR_TMPDIR is a shell variable. Antlr4
    # writes into a temp dir (-o, with -Xexact-output-dir to keep the layout
    # flat), then the expected outputs are copied into the genrule output
    # directory ($(@D)) and the temp dir is removed.
    cmd = (
        "VANIR_ANTLR_TMPDIR=$$(mktemp -d);" +
        "$(locations @antlr4_entry_points//:antlr4) " +
        "-v " + antlr4_ver + " " +
        "$(SRCS) " +
        "-no-visitor " +
        "-Dlanguage=Cpp " +
        "-package " + cc_namespace + " " +
        "-o $$VANIR_ANTLR_TMPDIR " +
        "-Xexact-output-dir " +
        extra_args + ";" +
        "cp " + " ".join([("$$VANIR_ANTLR_TMPDIR/" + f) for f in out_src_files]) + " $(@D);" +
        "rm -r $$VANIR_ANTLR_TMPDIR"
    )

    native.genrule(
        name = name + "_src",
        srcs = srcs,
        outs = out_src_files,
        cmd = cmd,
        # NOTE(review): runs outside the sandbox -- presumably because the
        # antlr4 wrapper fetches/caches the tool jar; confirm before removing.
        local = True,
        tools = [
            requirement("antlr4-tools"),
            "@antlr4_entry_points//:antlr4",
        ],
    )

    # Compile the generated sources against the Antlr4 C++ runtime.
    native.cc_library(
        name = name,
        srcs = [(":" + f) for f in out_src_files if f.endswith(".cpp")],
        hdrs = [(":" + f) for f in out_src_files if f.endswith(".h")],
        deps = [
            ":{target}_src".format(target = name),
            "@antlr4_runtimes//:cpp",
        ],
        linkstatic = 1,
    )
71 |
--------------------------------------------------------------------------------
/vanir/language_parsers/java/java_parser.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | #
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file or at
5 | # https://developers.google.com/open-source/licenses/bsd
6 |
7 | """Vanir Java parser.
8 |
9 | This module implements an AbstractLanguageParser that handles all .java files.
10 | """
11 |
12 | from typing import Iterable, Optional, Sequence, Tuple
13 |
14 | from vanir.language_parsers import abstract_language_parser
15 | from vanir.language_parsers import common
16 | from vanir.language_parsers.java.python import parser_core
17 |
18 |
class JavaParser(abstract_language_parser.AbstractLanguageParser):
  """Vanir Java parser.

  This class implements the AbstractLanguageParser base class.
  """

  def __init__(self, filename: str):
    # The native Pybind-wrapped parser core does the actual parsing.
    self.parser = parser_core.ParserCore(filename)

  @classmethod
  def get_supported_extensions(cls) -> Iterable[str]:
    """Returns the file extensions handled by this parser."""
    return ['.java']

  def get_chunks(
      self,
      affected_line_ranges_for_functions: Optional[
          Sequence[Tuple[int, int]]
      ] = None,
  ) -> common.ParseResults:
    """Parses the file and converts raw results into the common dataclasses.

    Args:
      affected_line_ranges_for_functions: (start, end) line ranges used to
        filter function chunks; None or empty selects all functions.

    Returns:
      A common.ParseResults with function chunks, the line chunk and errors.
    """
    line_ranges = affected_line_ranges_for_functions or []
    raw_chunks, raw_line_chunk, raw_errors = self.parser.parse(line_ranges)

    function_chunks = [
        common.FunctionChunkBase(
            name=raw.name,
            # The raw chunk carries a single return type; wrap it in a list.
            return_types=[raw.return_type],
            parameters=raw.parameters,
            used_data_types=raw.used_data_types,
            local_variables=raw.local_variables,
            called_functions=raw.called_functions,
            tokens=raw.tokens,
        )
        for raw in raw_chunks
    ]
    errors = [
        common.ParseError(err.line, err.column, err.bad_token, err.message)
        for err in raw_errors
    ]
    return common.ParseResults(
        function_chunks, common.LineChunkBase(raw_line_chunk.tokens_), errors)
66 |
--------------------------------------------------------------------------------
/vanir/language_parsers/java/parser_core.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2023 Google LLC
3 | *
4 | * Use of this source code is governed by a BSD-style
5 | * license that can be found in the LICENSE file or at
6 | * https://developers.google.com/open-source/licenses/bsd
7 | */
8 |
9 | #ifndef VANIR_LANGUAGE_PARSERS_JAVA_PARSER_CORE_H_
10 | #define VANIR_LANGUAGE_PARSERS_JAVA_PARSER_CORE_H_
11 |
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
16 |
17 | #include "absl/status/statusor.h"
18 | #include "vanir/language_parsers/java/JavaParser.h"
19 | #include "vanir/language_parsers/java/JavaParserBaseListener.h"
20 |
21 | namespace vanir {
22 | namespace java_parser {
23 |
24 | using ::java_cc_parser::JavaParser;
25 | using ::java_cc_parser::JavaParserBaseListener;
26 |
27 | // Container for a function and its metadata extracted by the parser
// Container for a function and its metadata extracted by the parser.
//
// NOTE(review): the extracted copy of this header had lost every
// angle-bracketed template argument (e.g. `std::vector return_type_;`).
// The element types below were reconstructed from the Python layer, which
// treats each field as a sequence of token strings, and from the pybind
// bindings, which iterate used_data_types_ as owned pointers whose pointees'
// ownership is handed to Python. Confirm against the original header.
class FunctionChunk {
 public:
  explicit FunctionChunk()
      : line_start_(0), line_stop_(0), start_token_idx_(0), stop_token_idx_(0)
  {}

  std::string name_;
  std::vector<std::string> return_type_;
  std::vector<std::string> parameters_;
  // Each entry is one data type spelled as a token sequence, heap-owned so
  // ownership can be transferred to Python independently of this chunk.
  std::vector<std::unique_ptr<std::vector<std::string>>> used_data_types_;
  std::vector<std::string> local_variables_;
  std::vector<std::string> called_functions_;
  std::vector<std::string> tokens_;
  // Source line span and token-stream index span of the function body.
  size_t line_start_, line_stop_;
  size_t start_token_idx_, stop_token_idx_;

 private:
  // Noncopyable/nonmovable: chunks are handled through pointers.
  FunctionChunk(const FunctionChunk &) = delete;
  FunctionChunk(FunctionChunk &&) = delete;
  FunctionChunk &operator=(const FunctionChunk &) = delete;
  FunctionChunk &operator=(FunctionChunk &&) = delete;
};
50 |
51 | // LineChunk is a wrapper class for a map from line numbers to all tokens in
52 | // that line. This is needed instead of a simple type alias because pybind's
53 | // automatic conversion of wrappers (e.g. unique_ptr) only supports custom
54 | // types, and not e.g. unordered_map.
// LineChunk is a wrapper class for a map from line numbers to all tokens in
// that line. This is needed instead of a simple type alias because pybind's
// automatic conversion of wrappers (e.g. unique_ptr) only supports custom
// types, and not e.g. unordered_map.
//
// NOTE(review): template arguments reconstructed (the extracted copy had
// stripped them); the Python layer exposes tokens as
// Mapping[int, Sequence[str]]. Confirm key type against the original header.
class LineChunk {
 public:
  explicit LineChunk() {}
  // Line number -> tokens appearing on that line.
  std::unordered_map<size_t, std::vector<std::string>> tokens_;

 private:
  // Noncopyable/nonmovable: chunks are handled through pointers.
  LineChunk(const LineChunk &) = delete;
  LineChunk(LineChunk &&) = delete;
  LineChunk &operator=(const LineChunk &) = delete;
  LineChunk &operator=(LineChunk &&) = delete;
};
66 |
67 | // Container for any error encountered during parsing
// Container for any error encountered during parsing.
struct ParseError {
  // Position where the error was reported. NOTE(review): numbering base is
  // determined by Antlr4's error listener (typically 1-based line, 0-based
  // column) -- confirm in parser_core.cc.
  size_t line, column;
  std::string bad_token;  // Text of the offending token.
  std::string message;    // Human-readable diagnostic from the parser.
};
73 |
74 | // Antlr4 parser tree walking listener.
75 | class FileListener : public JavaParserBaseListener {
76 | public:
77 | explicit FileListener(
78 | antlr4::BufferedTokenStream &tokens,
79 | std::vector> function_line_ranges = {})
80 | : token_stream_(tokens), function_line_ranges_(function_line_ranges) {}
81 |
82 | void enterMethodDeclaration(JavaParser::MethodDeclarationContext*) override;
83 | void exitMethodDeclaration(JavaParser::MethodDeclarationContext*) override;
84 | void enterConstructorDeclaration(
85 | JavaParser::ConstructorDeclarationContext*) override;
86 | void exitConstructorDeclaration(
87 | JavaParser::ConstructorDeclarationContext*) override;
88 |
89 | void enterTypeType(JavaParser::TypeTypeContext*) override;
90 |
91 | void enterLocalVariableDeclaration(
92 | JavaParser::LocalVariableDeclarationContext*) override;
93 |
94 | void enterMethodCall(JavaParser::MethodCallContext*) override;
95 | void enterCreator(JavaParser::CreatorContext*) override;
96 |
97 | std::vector> GetFunctionChunks();
98 |
99 | private:
100 | antlr4::BufferedTokenStream &token_stream_;
101 |
102 | // last_type_token_stop_idx_ holds the last token in a typeType, so that we
103 | // can ignore all other nested typeType, e.g. `ArrayList