├── CMakeLists.txt
├── LICENSE
├── README.md
├── docker
    ├── .env
    ├── docker-compose.development.yml
    ├── docker-compose.yml
    ├── dockerfile
    └── ros_entrypoint.sh
├── include
    ├── sgm_gpu
    │   ├── configuration.h
    │   ├── cost_aggregation.h
    │   ├── costs.h
    │   ├── hamming_cost.h
    │   ├── left_right_consistency.h
    │   ├── median_filter.h
    │   ├── sgm_gpu.h
    │   └── util.h
    └── sgm_gpu_node.h
├── launch
    └── test.launch
├── package.xml
├── src
    ├── costs.cu
    ├── hamming_cost.cu
    ├── left_right_consistency.cu
    ├── median_filter.cu
    ├── sgm_gpu.cu
    ├── sgm_gpu_node.cpp
    └── sgm_gpu_node_main.cpp
└── test_input.bag


/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # CMake>=3.8 supports CUDA C++ as intrinsically supported language
 2 | cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
 3 | project(sgm_gpu CUDA CXX)
 4 | 
 5 | find_package(catkin REQUIRED COMPONENTS
 6 |   cv_bridge
 7 |   image_geometry
 8 |   image_transport
 9 |   message_filters
10 |   roscpp
11 |   sensor_msgs
12 |   stereo_msgs
13 | )
14 | find_package(OpenCV REQUIRED)
15 | 
16 | # nvcc flags. CUDA is enabled as a first-class language by project() above, so
17 | # nvcc takes its flags from CMAKE_CUDA_FLAGS; CUDA_NVCC_FLAGS is only read by
18 | # the legacy FindCUDA macros (cuda_add_library etc.) and has no effect here.
19 | set(SGM_GPU_CUDA_ARCHS "30;35;50;52;61" CACHE STRING
20 |   "CUDA compute capabilities to build for, ascending (empty: nvcc default)")
21 | if(NOT CMAKE_BUILD_TYPE)  # a build type brings its own optimisation flags
22 |   string(APPEND CMAKE_CUDA_FLAGS " -O3")
23 | endif()
24 | set(sgm_gpu_last_arch "")
25 | foreach(arch IN LISTS SGM_GPU_CUDA_ARCHS)
26 |   # CUDA 11 removed sm_30/sm_32 and CUDA 12 removed sm_35/sm_37: skip them
27 |   if((arch LESS 35 AND NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.0) OR
28 |      (arch LESS 50 AND NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.0))
29 |     continue()
30 |   endif()
31 |   string(APPEND CMAKE_CUDA_FLAGS " -gencode=arch=compute_${arch},code=sm_${arch}")
32 |   set(sgm_gpu_last_arch "${arch}")
33 | endforeach()
34 | if(NOT "${sgm_gpu_last_arch}" STREQUAL "")
35 |   # Also embed PTX for the last kept arch so newer GPUs can JIT the kernels
36 |   string(APPEND CMAKE_CUDA_FLAGS
37 |     " -gencode=arch=compute_${sgm_gpu_last_arch},code=compute_${sgm_gpu_last_arch}")
38 | endif()
39 | 
40 | catkin_package(
41 |   CATKIN_DEPENDS cv_bridge image_geometry roscpp sensor_msgs stereo_msgs
42 |   DEPENDS OpenCV
43 |   INCLUDE_DIRS include
44 |   LIBRARIES ${PROJECT_NAME}
45 | )
46 | 
47 | include_directories(
48 |   include
49 |   ${catkin_INCLUDE_DIRS}
50 |   ${OpenCV_INCLUDE_DIRS}  # explicit: do not rely on cv_bridge re-exporting it
51 | )
52 | 
53 | # Build lib${PROJECT_NAME}
54 | add_library(${PROJECT_NAME}
55 |   src/sgm_gpu.cu
56 |   src/costs.cu
57 |   src/hamming_cost.cu
58 |   src/left_right_consistency.cu
59 |   src/median_filter.cu
60 | )
61 | add_dependencies(${PROJECT_NAME} ${catkin_EXPORTED_TARGETS})
62 | target_link_libraries(${PROJECT_NAME} PUBLIC
63 |   ${catkin_LIBRARIES}
64 |   ${OpenCV_LIBS}
65 | )
66 | 
67 | # Build ${PROJECT_NAME}_node
68 | add_executable(${PROJECT_NAME}_node
69 |   src/${PROJECT_NAME}_node.cpp
70 |   src/${PROJECT_NAME}_node_main.cpp
71 | )
72 | add_dependencies(${PROJECT_NAME}_node ${PROJECT_NAME} ${catkin_EXPORTED_TARGETS})
73 | target_link_libraries(${PROJECT_NAME}_node PRIVATE
74 |   ${PROJECT_NAME}
75 |   ${catkin_LIBRARIES}
76 | )
77 | 
78 | # Install lib${PROJECT_NAME}
79 | install(TARGETS ${PROJECT_NAME}
80 |   ARCHIVE DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION}
81 |   LIBRARY DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION}
82 |   RUNTIME DESTINATION ${CATKIN_GLOBAL_BIN_DESTINATION}
83 | )
84 | install(DIRECTORY include/${PROJECT_NAME}/
85 |   DESTINATION ${CATKIN_PACKAGE_INCLUDE_DESTINATION}
86 | )
87 | 
88 | # Install ${PROJECT_NAME}_node
89 | install(TARGETS ${PROJECT_NAME}_node
90 |   RUNTIME DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
91 | )
92 | 
93 | # Install launch, bag and document
94 | install(FILES test_input.bag README.md
95 |   DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}
96 | )
97 | install(DIRECTORY launch/
98 |   DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}
99 | )


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 3, 29 June 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 |                             Preamble
  9 | 
 10 |   The GNU General Public License is a free, copyleft license for
 11 | software and other kinds of works.
 12 | 
 13 |   The licenses for most software and other practical works are designed
 14 | to take away your freedom to share and change the works.  By contrast,
 15 | the GNU General Public License is intended to guarantee your freedom to
 16 | share and change all versions of a program--to make sure it remains free
 17 | software for all its users.  We, the Free Software Foundation, use the
 18 | GNU General Public License for most of our software; it applies also to
 19 | any other work released this way by its authors.  You can apply it to
 20 | your programs, too.
 21 | 
 22 |   When we speak of free software, we are referring to freedom, not
 23 | price.  Our General Public Licenses are designed to make sure that you
 24 | have the freedom to distribute copies of free software (and charge for
 25 | them if you wish), that you receive source code or can get it if you
 26 | want it, that you can change the software or use pieces of it in new
 27 | free programs, and that you know you can do these things.
 28 | 
 29 |   To protect your rights, we need to prevent others from denying you
 30 | these rights or asking you to surrender the rights.  Therefore, you have
 31 | certain responsibilities if you distribute copies of the software, or if
 32 | you modify it: responsibilities to respect the freedom of others.
 33 | 
 34 |   For example, if you distribute copies of such a program, whether
 35 | gratis or for a fee, you must pass on to the recipients the same
 36 | freedoms that you received.  You must make sure that they, too, receive
 37 | or can get the source code.  And you must show them these terms so they
 38 | know their rights.
 39 | 
 40 |   Developers that use the GNU GPL protect your rights with two steps:
 41 | (1) assert copyright on the software, and (2) offer you this License
 42 | giving you legal permission to copy, distribute and/or modify it.
 43 | 
 44 |   For the developers' and authors' protection, the GPL clearly explains
 45 | that there is no warranty for this free software.  For both users' and
 46 | authors' sake, the GPL requires that modified versions be marked as
 47 | changed, so that their problems will not be attributed erroneously to
 48 | authors of previous versions.
 49 | 
 50 |   Some devices are designed to deny users access to install or run
 51 | modified versions of the software inside them, although the manufacturer
 52 | can do so.  This is fundamentally incompatible with the aim of
 53 | protecting users' freedom to change the software.  The systematic
 54 | pattern of such abuse occurs in the area of products for individuals to
 55 | use, which is precisely where it is most unacceptable.  Therefore, we
 56 | have designed this version of the GPL to prohibit the practice for those
 57 | products.  If such problems arise substantially in other domains, we
 58 | stand ready to extend this provision to those domains in future versions
 59 | of the GPL, as needed to protect the freedom of users.
 60 | 
 61 |   Finally, every program is threatened constantly by software patents.
 62 | States should not allow patents to restrict development and use of
 63 | software on general-purpose computers, but in those that do, we wish to
 64 | avoid the special danger that patents applied to a free program could
 65 | make it effectively proprietary.  To prevent this, the GPL assures that
 66 | patents cannot be used to render the program non-free.
 67 | 
 68 |   The precise terms and conditions for copying, distribution and
 69 | modification follow.
 70 | 
 71 |                        TERMS AND CONDITIONS
 72 | 
 73 |   0. Definitions.
 74 | 
 75 |   "This License" refers to version 3 of the GNU General Public License.
 76 | 
 77 |   "Copyright" also means copyright-like laws that apply to other kinds of
 78 | works, such as semiconductor masks.
 79 | 
 80 |   "The Program" refers to any copyrightable work licensed under this
 81 | License.  Each licensee is addressed as "you".  "Licensees" and
 82 | "recipients" may be individuals or organizations.
 83 | 
 84 |   To "modify" a work means to copy from or adapt all or part of the work
 85 | in a fashion requiring copyright permission, other than the making of an
 86 | exact copy.  The resulting work is called a "modified version" of the
 87 | earlier work or a work "based on" the earlier work.
 88 | 
 89 |   A "covered work" means either the unmodified Program or a work based
 90 | on the Program.
 91 | 
 92 |   To "propagate" a work means to do anything with it that, without
 93 | permission, would make you directly or secondarily liable for
 94 | infringement under applicable copyright law, except executing it on a
 95 | computer or modifying a private copy.  Propagation includes copying,
 96 | distribution (with or without modification), making available to the
 97 | public, and in some countries other activities as well.
 98 | 
 99 |   To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies.  Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 | 
103 |   An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License.  If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 | 
112 |   1. Source Code.
113 | 
114 |   The "source code" for a work means the preferred form of the work
115 | for making modifications to it.  "Object code" means any non-source
116 | form of a work.
117 | 
118 |   A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 | 
123 |   The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form.  A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 | 
134 |   The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities.  However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work.  For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 | 
147 |   The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 | 
151 |   The Corresponding Source for a work in source code form is that
152 | same work.
153 | 
154 |   2. Basic Permissions.
155 | 
156 |   All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met.  This License explicitly affirms your unlimited
159 | permission to run the unmodified Program.  The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work.  This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 | 
164 |   You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force.  You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright.  Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 | 
175 |   Conveying under any other circumstances is permitted solely under
176 | the conditions stated below.  Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 | 
179 |   3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 | 
181 |   No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 | 
187 |   When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 | 
195 |   4. Conveying Verbatim Copies.
196 | 
197 |   You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 | 
205 |   You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 | 
208 |   5. Conveying Modified Source Versions.
209 | 
210 |   You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 | 
214 |     a) The work must carry prominent notices stating that you modified
215 |     it, and giving a relevant date.
216 | 
217 |     b) The work must carry prominent notices stating that it is
218 |     released under this License and any conditions added under section
219 |     7.  This requirement modifies the requirement in section 4 to
220 |     "keep intact all notices".
221 | 
222 |     c) You must license the entire work, as a whole, under this
223 |     License to anyone who comes into possession of a copy.  This
224 |     License will therefore apply, along with any applicable section 7
225 |     additional terms, to the whole of the work, and all its parts,
226 |     regardless of how they are packaged.  This License gives no
227 |     permission to license the work in any other way, but it does not
228 |     invalidate such permission if you have separately received it.
229 | 
230 |     d) If the work has interactive user interfaces, each must display
231 |     Appropriate Legal Notices; however, if the Program has interactive
232 |     interfaces that do not display Appropriate Legal Notices, your
233 |     work need not make them do so.
234 | 
235 |   A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit.  Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 | 
245 |   6. Conveying Non-Source Forms.
246 | 
247 |   You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 | 
252 |     a) Convey the object code in, or embodied in, a physical product
253 |     (including a physical distribution medium), accompanied by the
254 |     Corresponding Source fixed on a durable physical medium
255 |     customarily used for software interchange.
256 | 
257 |     b) Convey the object code in, or embodied in, a physical product
258 |     (including a physical distribution medium), accompanied by a
259 |     written offer, valid for at least three years and valid for as
260 |     long as you offer spare parts or customer support for that product
261 |     model, to give anyone who possesses the object code either (1) a
262 |     copy of the Corresponding Source for all the software in the
263 |     product that is covered by this License, on a durable physical
264 |     medium customarily used for software interchange, for a price no
265 |     more than your reasonable cost of physically performing this
266 |     conveying of source, or (2) access to copy the
267 |     Corresponding Source from a network server at no charge.
268 | 
269 |     c) Convey individual copies of the object code with a copy of the
270 |     written offer to provide the Corresponding Source.  This
271 |     alternative is allowed only occasionally and noncommercially, and
272 |     only if you received the object code with such an offer, in accord
273 |     with subsection 6b.
274 | 
275 |     d) Convey the object code by offering access from a designated
276 |     place (gratis or for a charge), and offer equivalent access to the
277 |     Corresponding Source in the same way through the same place at no
278 |     further charge.  You need not require recipients to copy the
279 |     Corresponding Source along with the object code.  If the place to
280 |     copy the object code is a network server, the Corresponding Source
281 |     may be on a different server (operated by you or a third party)
282 |     that supports equivalent copying facilities, provided you maintain
283 |     clear directions next to the object code saying where to find the
284 |     Corresponding Source.  Regardless of what server hosts the
285 |     Corresponding Source, you remain obligated to ensure that it is
286 |     available for as long as needed to satisfy these requirements.
287 | 
288 |     e) Convey the object code using peer-to-peer transmission, provided
289 |     you inform other peers where the object code and Corresponding
290 |     Source of the work are being offered to the general public at no
291 |     charge under subsection 6d.
292 | 
293 |   A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 | 
297 |   A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling.  In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage.  For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product.  A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 | 
310 |   "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source.  The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 | 
318 |   If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information.  But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 | 
329 |   The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed.  Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 | 
337 |   Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 | 
343 |   7. Additional Terms.
344 | 
345 |   "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law.  If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 | 
354 |   When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it.  (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.)  You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 | 
361 |   Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 | 
365 |     a) Disclaiming warranty or limiting liability differently from the
366 |     terms of sections 15 and 16 of this License; or
367 | 
368 |     b) Requiring preservation of specified reasonable legal notices or
369 |     author attributions in that material or in the Appropriate Legal
370 |     Notices displayed by works containing it; or
371 | 
372 |     c) Prohibiting misrepresentation of the origin of that material, or
373 |     requiring that modified versions of such material be marked in
374 |     reasonable ways as different from the original version; or
375 | 
376 |     d) Limiting the use for publicity purposes of names of licensors or
377 |     authors of the material; or
378 | 
379 |     e) Declining to grant rights under trademark law for use of some
380 |     trade names, trademarks, or service marks; or
381 | 
382 |     f) Requiring indemnification of licensors and authors of that
383 |     material by anyone who conveys the material (or modified versions of
384 |     it) with contractual assumptions of liability to the recipient, for
385 |     any liability that these contractual assumptions directly impose on
386 |     those licensors and authors.
387 | 
388 |   All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10.  If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term.  If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 | 
398 |   If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 | 
403 |   Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 | 
407 |   8. Termination.
408 | 
409 |   You may not propagate or modify a covered work except as expressly
410 | provided under this License.  Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 | 
415 |   However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 | 
422 |   Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 | 
429 |   Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License.  If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 | 
435 |   9. Acceptance Not Required for Having Copies.
436 | 
437 |   You are not required to accept this License in order to receive or
438 | run a copy of the Program.  Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance.  However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work.  These actions infringe copyright if you do
443 | not accept this License.  Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 | 
446 |   10. Automatic Licensing of Downstream Recipients.
447 | 
448 |   Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License.  You are not responsible
451 | for enforcing compliance by third parties with this License.
452 | 
453 |   An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations.  If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 | 
463 |   You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License.  For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 | 
471 |   11. Patents.
472 | 
473 |   A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based.  The
475 | work thus licensed is called the contributor's "contributor version".
476 | 
477 |   A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version.  For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 | 
487 |   Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 | 
492 |   In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement).  To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 | 
499 |   If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients.  "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 | 
513 |   If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 | 
521 |   A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License.  You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 | 
536 |   Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 | 
540 |   12. No Surrender of Others' Freedom.
541 | 
542 |   If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License.  If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all.  For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 | 
552 |   13. Use with the GNU Affero General Public License.
553 | 
554 |   Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work.  The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 | 
563 |   14. Revised Versions of this License.
564 | 
565 |   The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time.  Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 | 
570 |   Each version is given a distinguishing version number.  If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation.  If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 | 
579 |   If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 | 
584 |   Later license versions may give you additional or different
585 | permissions.  However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 | 
589 |   15. Disclaimer of Warranty.
590 | 
591 |   THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 | 
600 |   16. Limitation of Liability.
601 | 
602 |   IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 | 
612 |   17. Interpretation of Sections 15 and 16.
613 | 
614 |   If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 | 
621 |                      END OF TERMS AND CONDITIONS
622 | 
623 |             How to Apply These Terms to Your New Programs
624 | 
625 |   If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 | 
629 |   To do so, attach the following notices to the program.  It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 | 
634 |     <one line to give the program's name and a brief idea of what it does.>
635 |     Copyright (C) <year>  <name of author>
636 | 
637 |     This program is free software: you can redistribute it and/or modify
638 |     it under the terms of the GNU General Public License as published by
639 |     the Free Software Foundation, either version 3 of the License, or
640 |     (at your option) any later version.
641 | 
642 |     This program is distributed in the hope that it will be useful,
643 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
644 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
645 |     GNU General Public License for more details.
646 | 
647 |     You should have received a copy of the GNU General Public License
648 |     along with this program.  If not, see <https://www.gnu.org/licenses/>.
649 | 
650 | Also add information on how to contact you by electronic and paper mail.
651 | 
652 |   If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 |     <program>  Copyright (C) <year>  <name of author>
656 |     This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 |     This is free software, and you are welcome to redistribute it
658 |     under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License.  Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 | 
664 |   You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 | 
669 |   The GNU General Public License does not permit incorporating your program
670 | into proprietary programs.  If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library.  If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License.  But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # A ROS package of Semi-Global Matching on the GPU
 2 | 
 3 | `sgm_gpu` is a ROS package which provides a node based on [Semi-Global Matching on the GPU by D. Hernandez-Juarez](https://github.com/dhernandez0/sgm).
 4 | 
 5 | ## Prerequisite
 6 | 
 7 | ### Without Docker
 8 | 
 9 | - [ROS Melodic Morenia](http://wiki.ros.org/melodic)
10 | - [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit)
11 | 
12 | ### With Docker
13 | 
14 | - Docker
15 | - Docker Compose
16 | - NVIDIA Container Toolkit
17 | 
18 | ## Run
19 | 
20 | ```
21 | $ cd <YourCatkinWorkspace>/src
22 | $ git clone https://github.com/ActiveIntelligentSystemsLab/sgm_gpu_ros.git
23 | $ cd ..
24 | $ catkin_make
25 | $ roslaunch sgm_gpu test.launch
26 | ```
27 | 
28 | ## Run with Docker
29 | 
30 | ```
31 | $ git clone https://github.com/ActiveIntelligentSystemsLab/sgm_gpu_ros.git
32 | $ cd sgm_gpu_ros/docker
33 | $ xhost +local:root
34 | $ sudo docker compose up
35 | ```
36 | 
37 | ## sgm_gpu_node
38 | 
39 | A node that computes a disparity image from a pair of rectified stereo image topics.
40 | 
41 | ### Subscribed topics
42 | 
43 | - `left_image` ([sensor_msgs/Image](http://docs.ros.org/api/sensor_msgs/html/msg/Image.html))
44 |   
45 |   Rectified image topic from left camera.
46 |   Should be remapped.
47 | 
48 | - `right_image` ([sensor_msgs/Image](http://docs.ros.org/api/sensor_msgs/html/msg/Image.html))
49 | 
50 |   Rectified image topic from right camera. Should be remapped.
51 | 
52 | - `<base topic of left_image>/camera_info` ([sensor_msgs/CameraInfo](http://docs.ros.org/api/sensor_msgs/html/msg/CameraInfo.html))
53 | 
54 |   Subscribed automatically based on topic of left_image.
55 | 
56 | - `<base topic of right_image>/camera_info` ([sensor_msgs/CameraInfo](http://docs.ros.org/api/sensor_msgs/html/msg/CameraInfo.html))
57 | 
58 |   Subscribed automatically based on topic of right_image.
59 | 
60 | ### Published topic
61 | 
62 | - `~disparity` ([stereo_msgs/DisparityImage](http://docs.ros.org/api/stereo_msgs/html/msg/DisparityImage.html))
63 | 
64 |   Disparity image computed by SGM
65 | 
66 | ### Parameters
67 | 
68 | - `~libsgm/p1` (int)
69 | 
70 |   Parameter used in SGM algorithm.
71 |   See [SGM on GPU paper](https://www.sciencedirect.com/science/article/pii/S1877050916306561) and [SGM paper](https://ieeexplore.ieee.org/document/4359315) .
72 | 
73 |   Default value is `6` from [SGM on GPU](https://github.com/dhernandez0/sgm) .
74 | 
75 | - `~libsgm/p2` (int) 
76 | 
77 |   Parameter used in SGM algorithm.
78 |   See [SGM on GPU paper](https://www.sciencedirect.com/science/article/pii/S1877050916306561) and [SGM paper](https://ieeexplore.ieee.org/document/4359315) .
79 | 
80 |   Default value is `96` from [SGM on GPU](https://github.com/dhernandez0/sgm) .
81 | 
82 | - `~libsgm/check_consistency` (bool)
83 | 
84 |   Check left-right consistency if true.
85 | 
86 |   Default value is `true`.
87 | 
88 | - `~image_transport` (string)
89 | 
90 |   See [image_transport](http://wiki.ros.org/image_transport)
91 | 
92 | ### Limitations
93 | 
94 | - Disparity range is `[0, 127]`
95 | 
96 | 


--------------------------------------------------------------------------------
/docker/.env:
--------------------------------------------------------------------------------
1 | COMPOSE_PROJECT_NAME=sgm_gpu
2 | #COMPOSE_FILE=docker-compose.yml:docker-compose.development.yml
3 | 
4 | 


--------------------------------------------------------------------------------
/docker/docker-compose.development.yml:
--------------------------------------------------------------------------------
1 | # Override file for development; combined with docker-compose.yml (see the commented COMPOSE_FILE line in .env).
2 | # Just run ROS master and access interactive shell by docker-compose exec
3 | version: "2.3"
4 | services: 
5 |   master:
6 |     command: roscore # overrides the roslaunch command set for this service in docker-compose.yml
7 | 
8 | 


--------------------------------------------------------------------------------
/docker/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: "2.3"
 2 | services: 
 3 |   master: # single service: builds the image from this directory and runs the test launch file
 4 |     build: .
 5 |     container_name: ${COMPOSE_PROJECT_NAME}_master # COMPOSE_PROJECT_NAME is defined in docker/.env
 6 |     command: roslaunch sgm_gpu test.launch
 7 |     runtime: nvidia # NVIDIA container runtime (the README lists NVIDIA Container Toolkit as a prerequisite)
 8 |     environment: 
 9 |       - DISPLAY # value is taken from the environment docker compose is started in
10 |       - QT_X11_NO_MITSHM=1
11 |     volumes: 
12 |       - /tmp/.X11-unix:/tmp/.X11-unix:rw # host X11 socket directory, mounted read-write
13 |       - sgm_gpu_root:/root # named volume (declared below) mounted at /root
14 |       
15 | volumes:
16 |   sgm_gpu_root:
17 | 
18 | 


--------------------------------------------------------------------------------
/docker/dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvidia/cuda:11.4.3-cudnn8-devel-ubuntu18.04
 2 | 
 3 | # Setup timezone: write the zone files before tzdata is installed (-f so an existing /etc/localtime does not abort the build)
 4 | RUN echo 'Etc/UTC' > /etc/timezone && \
 5 |     ln -sf /usr/share/zoneinfo/Etc/UTC /etc/localtime && \
 6 |     apt-get update && \
 7 |     apt-get install -y tzdata
 8 | 
 9 | # Install ROS (refresh the package index in the same layer; git is installed explicitly because the build step below runs git clone)
10 | RUN apt-get update && apt-get install -y dirmngr gnupg2 curl git
11 | RUN sh -c 'echo "deb http://packages.ros.org/ros/ubuntu bionic main" > /etc/apt/sources.list.d/ros-latest.list'
12 | RUN curl -s https://raw.githubusercontent.com/ros/rosdistro/master/ros.asc | apt-key add -
13 | RUN apt-get update && \
14 |     apt-get install --no-install-recommends -y \
15 |       python-rosdep \
16 |       python-rosinstall \
17 |       python-vcstools \
18 |       ros-melodic-perception
19 | RUN rosdep init
20 | RUN rosdep update
21 | 
22 | # Install tools for development
23 | RUN apt-get update && \
24 |     apt-get install -y vim byobu
25 | 
26 | # Make catkin workspace
27 | RUN mkdir -p /root/catkin_ws/src
28 | WORKDIR /root/catkin_ws
29 | # Build sgm_gpu from a fresh clone of the upstream repository
30 | RUN git clone https://github.com/ActiveIntelligentSystemsLab/sgm_gpu_ros.git src/sgm_gpu_ros
31 | RUN bash -c "source /opt/ros/melodic/setup.bash && \
32 |              catkin_make -DCMAKE_BUILD_TYPE=Release"
33 | 
34 | # Load ROS environment at docker exec bash
35 | RUN echo "source /opt/ros/melodic/setup.bash" >> /root/.bashrc
36 | RUN echo "source /root/catkin_ws/devel/setup.bash" >> /root/.bashrc
37 | 
38 | # Set entrypoint to load catkin_ws at docker run
39 | COPY ./ros_entrypoint.sh /
40 | RUN chmod +x /ros_entrypoint.sh
41 | ENTRYPOINT ["/ros_entrypoint.sh"]
42 | CMD [ "bash" ]
43 | 
44 | 


--------------------------------------------------------------------------------
/docker/ros_entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -o errexit
3 | 
4 | # Load the ROS distribution and the catkin workspace overlay, then replace this shell with the requested command
5 | . "/opt/ros/melodic/setup.bash"
6 | . "/root/catkin_ws/devel/setup.bash"
7 | exec "$@"
8 | 


--------------------------------------------------------------------------------
/include/sgm_gpu/configuration.h:
--------------------------------------------------------------------------------
 1 | /***********************************************************************
 2 |   Copyright (C) 2020 Hironori Fujimoto
 3 | 
 4 |   This program is free software: you can redistribute it and/or modify
 5 |   it under the terms of the GNU General Public License as published by
 6 |   the Free Software Foundation, either version 3 of the License, or
 7 |   (at your option) any later version.
 8 |  
 9 |   This program is distributed in the hope that it will be useful,
10 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |   GNU General Public License for more details.
13 |   You should have received a copy of the GNU General Public License
14 |   along with this program.  If not, see <http://www.gnu.org/licenses/>.
15 | ***********************************************************************/
16 | 
17 | #ifndef SGM_GPU__CONFIGURATION_H_
18 | #define SGM_GPU__CONFIGURATION_H_
19 | 
20 | #include <stdint.h>
21 | 
22 | #define MAX_DISPARITY 128  // number of disparity levels; the README states the range as [0, 127]
23 | #define CENSUS_WIDTH  9    // window width, used below to derive LEFT
24 | #define CENSUS_HEIGHT 7    // window height, used below to derive TOP
25 | 
26 | #define TOP  ((CENSUS_HEIGHT-1)/2)  // half window height; parenthesized so that e.g. x/TOP expands correctly
27 | #define LEFT ((CENSUS_WIDTH-1)/2)   // half window width; parenthesized for the same reason
28 | 
29 | namespace sgm_gpu
30 | {
31 | 
32 | typedef uint32_t cost_t;
33 | 
34 | }
35 | 
36 | #define COSTAGG_BLOCKSIZE       GPU_THREADS_PER_BLOCK
37 | #define COSTAGG_BLOCKSIZE_HORIZ GPU_THREADS_PER_BLOCK
38 | 
39 | #endif // SGM_GPU__CONFIGURATION_H_
40 | 


--------------------------------------------------------------------------------
/include/sgm_gpu/cost_aggregation.h:
--------------------------------------------------------------------------------
  1 | /***********************************************************************
  2 |   Copyright (C) 2019 Hironori Fujimoto
  3 | 
  4 |   This program is free software: you can redistribute it and/or modify
  5 |   it under the terms of the GNU General Public License as published by
  6 |   the Free Software Foundation, either version 3 of the License, or
  7 |   (at your option) any later version.
  8 | 
  9 |   This program is distributed in the hope that it will be useful,
 10 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |   GNU General Public License for more details.
 13 |   You should have received a copy of the GNU General Public License
 14 |   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 15 | ***********************************************************************/
 16 | 
 17 | #ifndef COST_AGGREGATION_H_
 18 | #define COST_AGGREGATION_H_
 19 | 
 20 | #define ITER_COPY			0 // iter_type: the fetched/recomputed costs are stored straight into old_values
 21 | #define ITER_NORMAL			1 // iter_type: costs are combined with the previous step's values (P1/P2 minimum update)
 22 | 
 23 | #define MIN_COMPUTE			0 // min_type: refresh min_cost / min_cost_p2 at the end of the iteration
 24 | #define MIN_NOCOMPUTE		1 // min_type: leave min_cost / min_cost_p2 untouched
 25 | 
 26 | #define DIR_UPDOWN			0 // dir_type values; in the recompute path only LEFTRIGHT and RIGHTLEFT get
 27 | #define DIR_DOWNUP			1 // dedicated register handling of the right-image values, every other
 28 | #define DIR_LEFTRIGHT		2 // direction stages them through a __shared__ buffer
 29 | #define DIR_RIGHTLEFT		3
 30 | 
 31 | namespace sgm_gpu
 32 | {
 33 | 
 34 | template<int add_col, bool recompute, bool join_dispcomputation>
 35 | __device__ __forceinline__ void CostAggregationGenericIndexesIncrement(int *index, int *index_im, int *col, const int add_index, const int add_imindex) {
 36 |   // The cost-volume offset is stepped for every template variant.
 37 |   *index = *index + add_index;
 38 |   // The image offset is only stepped when costs are recomputed or disparity computation is joined in.
 39 |   const bool steps_image_offset = recompute || join_dispcomputation;
 40 |   if(steps_image_offset) *index_im = *index_im + add_imindex;
 41 |   // The column counter is only stepped on the recompute path (recompute implies the condition above).
 42 |   if(recompute) *col = *col + add_col;
 43 | }
 44 | 
 45 | template<int add_index, bool recompute, bool join_dispcomputation>
 46 | __device__ __forceinline__ void CostAggregationDiagonalGenericIndexesIncrement(int *index, int *index_im, int *col, const int cols, const int initial_row, const int i, const int dis) {
 47 |   // Step the column by add_index and wrap around when it leaves [0, cols).
 48 |   int wrapped_col = *col + add_index;
 49 |   if(add_index > 0 && wrapped_col >= cols) wrapped_col = 0;
 50 |   else if(wrapped_col < 0) wrapped_col = cols-1;
 51 |   *col = wrapped_col;
 52 |   // Row used for addressing: the distance between initial_row and step i.
 53 |   const int row = abs(initial_row-i);
 54 |   *index = row*cols*MAX_DISPARITY+wrapped_col*MAX_DISPARITY+dis;
 55 |   // The image offset is only needed when costs are recomputed or disparity computation is joined in.
 56 |   if(recompute || join_dispcomputation) *index_im = row*cols+wrapped_col;
 57 | }
 58 | 
 59 | template<class T, int iter_type, int min_type, int dir_type, bool first_iteration, bool recompute, bool join_dispcomputation> // One aggregation step for the four packed cost values handled by the calling lane; the template flags select the variant at compile time.
 60 | __device__ __forceinline__ void CostAggregationGenericIteration(int index, int index_im, int col, uint32_t *old_values, int *old_value1, int *old_value2, int *old_value3, int *old_value4, uint32_t *min_cost, uint32_t *min_cost_p2, uint8_t* d_cost, uint8_t *d_L, uint16_t *d_s, const int p1_vector, const int p2_vector, const T *_d_transform0, const T *_d_transform1, const int lane, const int MAX_PAD, const int dis, T *rp0, T *rp1, T *rp2, T *rp3, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
 61 |   const T __restrict__ *d_transform0 = _d_transform0; // read below at index_im as left_pixel
 62 |   const T __restrict__ *d_transform1 = _d_transform1; // source of the rp0..rp3 values compared against left_pixel
 63 |   uint32_t costs, next_dis, prev_dis; // written and consumed on the ITER_NORMAL path only
 64 | 
 65 |   if(iter_type == ITER_NORMAL) { // build the two neighbour vectors from the previous step's values
 66 |     // First shuffle: take old_value4 from another lane via shfl_up_32 (helper defined elsewhere; presumably the lane below)
 67 |     int prev_dis1 = shfl_up_32(*old_value4, 1);
 68 |     if(lane == 0) {
 69 |       prev_dis1 = MAX_PAD; // lane 0 gets the padding value instead (UCHAR_MAX-P1 in CostAggregationGeneric)
 70 |     }
 71 | 
 72 |     // Second shuffle: take old_value1 from another lane via shfl_down_32 (presumably the lane above)
 73 |     int next_dis4 = shfl_down_32(*old_value1, 1);
 74 |     if(lane == 31) {
 75 |       next_dis4 = MAX_PAD; // lane 31 gets the padding value instead
 76 |     }
 77 | 
 78 |     // Shift + rotate
 79 |     //next_dis = __funnelshift_r(next_dis4, *old_values, 8);
 80 |     next_dis = __byte_perm(*old_values, next_dis4, 0x4321); // bytes 1..3 of old_values followed by the low byte of next_dis4
 81 |     prev_dis = __byte_perm(*old_values, prev_dis1, 0x2104); // low byte of prev_dis1 followed by bytes 0..2 of old_values
 82 | 
 83 |     next_dis = next_dis + p1_vector; // plain 32-bit add of the packed P1 penalty
 84 |     prev_dis = prev_dis + p1_vector;
 85 |   }
 86 |   if(recompute) { // derive the four costs from the two transforms instead of loading them from d_cost
 87 |     const int dif = col - dis;
 88 |     if(dir_type == DIR_LEFTRIGHT) {
 89 |       if(lane == 0) {
 90 |         // lane = 0 is dis = 0, no need to subtract dis
 91 |         *rp0 = d_transform1[index_im];
 92 |       }
 93 |     } else if(dir_type == DIR_RIGHTLEFT) {
 94 |       // First iteration, load D pixels
 95 |       if(first_iteration) {
 96 |         const uint4 right = reinterpret_cast<const uint4*>(&d_transform1[index_im-dis-3])[0]; // one uint4 load starting at index_im-dis-3
 97 |         *rp3 = right.x;
 98 |         *rp2 = right.y;
 99 |         *rp1 = right.z;
100 |         *rp0 = right.w;
101 |       } else if(lane == 31 && dif >= 3) { // later iterations: only lane 31 loads a new value, and only when col-dis >= 3
102 |         *rp3 = d_transform1[index_im-dis-3];
103 |       }
104 |     } else {
105 |   /*
106 |       __shared__ T right_p[MAX_DISPARITY+32];
107 |       const int warp_id = threadIdx.x / WARP_SIZE;
108 |       if(warp_id < 5) {
109 |         const int block_imindex = index_im - warp_id + 32;
110 |         const int rp_index = warp_id*WARP_SIZE+lane;
111 |         const int col_cpy = col-warp_id+32;
112 |         right_p[rp_index] = ((col_cpy-(159-rp_index)) >= 0) ? ld_gbl_cs(&d_transform1[block_imindex-(159-rp_index)]) : 0;
113 |       }*/
114 | 
115 |       __shared__ T right_p[128+32]; // per-block staging buffer for d_transform1 values
116 |       const int warp_id = threadIdx.x / WARP_SIZE;
117 |       const int block_imindex = index_im - warp_id + 2;
118 |       const int rp_index = warp_id*WARP_SIZE+lane;
119 |       const int col_cpy = col-warp_id+2;
120 |       right_p[rp_index] = ((col_cpy-(129-rp_index)) >= 0) ? d_transform1[block_imindex-(129-rp_index)] : 0; // 0 is stored when the source column would be negative
121 |       right_p[rp_index+64] = ((col_cpy-(129-rp_index-64)) >= 0) ? d_transform1[block_imindex-(129-rp_index-64)] : 0;
122 |       //right_p[rp_index+128] = ld_gbl_cs(&d_transform1[block_imindex-(129-rp_index-128)]);
123 |       if(warp_id == 0) {
124 |         right_p[128+lane] = ld_gbl_cs(&d_transform1[block_imindex-(129-lane)]); // warp 0 fills the last 32 entries
125 |       }
126 |       __syncthreads(); // all writes to right_p must finish before any thread reads it
127 | 
128 |       const int px = MAX_DISPARITY+warp_id-dis-1;
129 |       *rp0 = right_p[px];
130 |       *rp1 = right_p[px-1];
131 |       *rp2 = right_p[px-2];
132 |       *rp3 = right_p[px-3];
133 |     }
134 |     const T left_pixel = d_transform0[index_im];
135 |     *old_value1 = popcount(left_pixel ^ *rp0); // popcount is defined elsewhere; presumably the bit count, i.e. a Hamming distance
136 |     *old_value2 = popcount(left_pixel ^ *rp1);
137 |     *old_value3 = popcount(left_pixel ^ *rp2);
138 |     *old_value4 = popcount(left_pixel ^ *rp3);
139 |     if(iter_type == ITER_COPY) {
140 |       *old_values = uchars_to_uint32(*old_value1, *old_value2, *old_value3, *old_value4);
141 |     } else {
142 |       costs = uchars_to_uint32(*old_value1, *old_value2, *old_value3, *old_value4);
143 |     }
144 |     // Prepare for next iteration
145 |     if(dir_type == DIR_LEFTRIGHT) {
146 |       *rp3 = shfl_up_32(*rp3, 1);
147 |     } else if(dir_type == DIR_RIGHTLEFT) {
148 |       *rp0 = shfl_down_32(*rp0, 1);
149 |     }
150 |   } else { // no recompute: read the four cost bytes at d_cost[index] as one 32-bit word
151 |     if(iter_type == ITER_COPY) {
152 |       *old_values = ld_gbl_ca(reinterpret_cast<const uint32_t*>(&d_cost[index]));
153 |     } else {
154 |       costs = ld_gbl_ca(reinterpret_cast<const uint32_t*>(&d_cost[index]));
155 |     }
156 |   }
157 | 
158 |   if(iter_type == ITER_NORMAL) { // per-byte minimum over: previous value, both neighbours + P1, previous minimum + P2
159 |     const uint32_t min1 = __vminu4(*old_values, prev_dis);
160 |     const uint32_t min2 = __vminu4(next_dis, *min_cost_p2);
161 |     const uint32_t min_prev = __vminu4(min1, min2);
162 |     *old_values = costs + (min_prev - *min_cost); // new value = cost + chosen minimum - previous minimum
163 |   }
164 |   if(iter_type == ITER_NORMAL || !recompute) { // refresh the unpacked copies whenever old_values was not built from them above
165 |     uint32_to_uchars(*old_values, old_value1, old_value2, old_value3, old_value4);
166 |   }
167 | 
168 |   if(join_dispcomputation) { // add the seven stored paths d_L0..d_L6 to this one and pick the best disparity
169 |     const uint32_t L0_costs = *((uint32_t*) (d_L0+index));
170 |     const uint32_t L1_costs = *((uint32_t*) (d_L1+index));
171 |     const uint32_t L2_costs = *((uint32_t*) (d_L2+index));
172 |     const uint32_t L3_costs = *((uint32_t*) (d_L3+index));
173 |     const uint32_t L4_costs = *((uint32_t*) (d_L4+index));
174 |     const uint32_t L5_costs = *((uint32_t*) (d_L5+index));
175 |     const uint32_t L6_costs = *((uint32_t*) (d_L6+index));
176 | 
177 |     int l0_x, l0_y, l0_z, l0_w;
178 |     int l1_x, l1_y, l1_z, l1_w;
179 |     int l2_x, l2_y, l2_z, l2_w;
180 |     int l3_x, l3_y, l3_z, l3_w;
181 |     int l4_x, l4_y, l4_z, l4_w;
182 |     int l5_x, l5_y, l5_z, l5_w;
183 |     int l6_x, l6_y, l6_z, l6_w;
184 | 
185 |     uint32_to_uchars(L0_costs, &l0_x, &l0_y, &l0_z, &l0_w);
186 |     uint32_to_uchars(L1_costs, &l1_x, &l1_y, &l1_z, &l1_w);
187 |     uint32_to_uchars(L2_costs, &l2_x, &l2_y, &l2_z, &l2_w);
188 |     uint32_to_uchars(L3_costs, &l3_x, &l3_y, &l3_z, &l3_w);
189 |     uint32_to_uchars(L4_costs, &l4_x, &l4_y, &l4_z, &l4_w);
190 |     uint32_to_uchars(L5_costs, &l5_x, &l5_y, &l5_z, &l5_w);
191 |     uint32_to_uchars(L6_costs, &l6_x, &l6_y, &l6_z, &l6_w);
192 | 
193 |     const uint16_t val1 = l0_x + l1_x + l2_x + l3_x + l4_x + l5_x + l6_x + *old_value1; // summed cost at dis
194 |     const uint16_t val2 = l0_y + l1_y + l2_y + l3_y + l4_y + l5_y + l6_y + *old_value2; // summed cost at dis+1
195 |     const uint16_t val3 = l0_z + l1_z + l2_z + l3_z + l4_z + l5_z + l6_z + *old_value3; // summed cost at dis+2
196 |     const uint16_t val4 = l0_w + l1_w + l2_w + l3_w + l4_w + l5_w + l6_w + *old_value4; // summed cost at dis+3
197 | 
198 |     int min_idx1 = dis;
199 |     uint16_t min1 = val1;
200 |     if(val1 > val2) { // strict comparison: a tie keeps the lower disparity
201 |       min1 = val2;
202 |       min_idx1 = dis+1;
203 |     }
204 | 
205 |     int min_idx2 = dis+2;
206 |     uint16_t min2 = val3;
207 |     if(val3 > val4) {
208 |       min2 = val4;
209 |       min_idx2 = dis+3;
210 |     }
211 | 
212 |     uint16_t minval = min1;
213 |     int min_idx = min_idx1;
214 |     if(min1 > min2) {
215 |       minval = min2;
216 |       min_idx = min_idx2;
217 |     }
218 | 
219 |     const int min_warpindex = warpReduceMinIndex(minval, min_idx); // helper defined elsewhere; presumably the index of the warp-wide minimum
220 |     if(lane == 0) {
221 |       d_disparity[index_im] = min_warpindex; // one write per pixel, done by lane 0
222 |     }
223 |     
224 |     // Save smoothed cost to obtain right disparity
225 |     d_s[index] = val1;
226 |     d_s[index+1] = val2;
227 |     d_s[index+2] = val3;
228 |     d_s[index+3] = val4;
229 |   } else { // otherwise write this path's four packed values to d_L[index]
230 |     st_gbl_cs(reinterpret_cast<uint32_t*>(&d_L[index]), *old_values);
231 |   }
232 |   if(min_type == MIN_COMPUTE) { // refresh the minimum used by the next iteration
233 |     int min_cost_scalar = min(min(*old_value1, *old_value2), min(*old_value3, *old_value4));
234 |     *min_cost = uchar_to_uint32(warpReduceMin(min_cost_scalar)); // warpReduceMin / uchar_to_uint32 are defined elsewhere; presumably warp-wide min replicated into each byte
235 |     *min_cost_p2 = *min_cost + p2_vector;
236 |   }
237 | }
238 | 
239 | template<class T, int add_col, int dir_type, bool recompute, bool join_dispcomputation>
240 | __device__ __forceinline__ void CostAggregationGeneric(uint8_t* d_cost, uint8_t *d_L, uint16_t *d_s, const int P1, const int P2, const int initial_row, const int initial_col, const int max_iter, const int cols, int add_index, const T *_d_transform0, const T *_d_transform1, const int add_imindex, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
241 |   const int lane = threadIdx.x % WARP_SIZE;
242 |   const int dis = 4*lane;
243 |   int index = initial_row*cols*MAX_DISPARITY+initial_col*MAX_DISPARITY+dis;
244 |   int col, index_im;
245 |   if(recompute || join_dispcomputation) {
246 |     if(recompute) {
247 |       col = initial_col;
248 |     }
249 |     index_im = initial_row*cols+initial_col;
250 |   }
251 | 
252 |   const int MAX_PAD = UCHAR_MAX-P1;
253 |   const uint32_t p1_vector = uchars_to_uint32(P1, P1, P1, P1);
254 |   const uint32_t p2_vector = uchars_to_uint32(P2, P2, P2, P2);
255 |   int old_value1;
256 |   int old_value2;
257 |   int old_value3;
258 |   int old_value4;
259 |   uint32_t min_cost, min_cost_p2, old_values;
260 |   T rp0, rp1, rp2, rp3;
261 | 
262 |   if(recompute) {
263 |     if(dir_type == DIR_LEFTRIGHT) {
264 |       CostAggregationGenericIteration<T, ITER_COPY, MIN_COMPUTE, dir_type, true, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
265 |       CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
266 |       CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp3, &rp0, &rp1, &rp2, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
267 |       CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
268 |       CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp2, &rp3, &rp0, &rp1, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
269 |       CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
270 |       CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp1, &rp2, &rp3, &rp0, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
271 |       for(int i = 4; i < max_iter-3; i+=4) {
272 |         CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
273 |         CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
274 |         CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
275 |         CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp3, &rp0, &rp1, &rp2, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
276 |         CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
277 |         CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp2, &rp3, &rp0, &rp1, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
278 |         CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
279 |         CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp1, &rp2, &rp3, &rp0, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
280 |       }
281 |       CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
282 |       CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
283 |       CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
284 |       CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp3, &rp0, &rp1, &rp2, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
285 |       CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
286 |       CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp2, &rp3, &rp0, &rp1, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
287 |       CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
288 |       CostAggregationGenericIteration<T, ITER_NORMAL, MIN_NOCOMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp1, &rp2, &rp3, &rp0, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
289 |     } else if(dir_type == DIR_RIGHTLEFT) {
290 |       CostAggregationGenericIteration<T, ITER_COPY, MIN_COMPUTE, dir_type, true, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
291 |       CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
292 |       CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp1, &rp2, &rp3, &rp0, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
293 |       CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
294 |       CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp2, &rp3, &rp0, &rp1, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
295 |       CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
296 |       CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp3, &rp0, &rp1, &rp2, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
297 |       for(int i = 4; i < max_iter-3; i+=4) {
298 |         CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
299 |         CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
300 |         CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
301 |         CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp1, &rp2, &rp3, &rp0, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
302 |         CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
303 |         CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp2, &rp3, &rp0, &rp1, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
304 |         CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
305 |         CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp3, &rp0, &rp1, &rp2, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
306 |       }
307 |       CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
308 |       CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
309 |       CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
310 |       CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp1, &rp2, &rp3, &rp0, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
311 |       CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
312 |       CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp2, &rp3, &rp0, &rp1, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
313 |       CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
314 |       CostAggregationGenericIteration<T, ITER_NORMAL, MIN_NOCOMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp3, &rp0, &rp1, &rp2, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
315 |     } else {
316 |       CostAggregationGenericIteration<T, ITER_COPY, MIN_COMPUTE, dir_type, true, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
317 |       for(int i = 1; i < max_iter; i++) {
318 |         CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
319 |         CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
320 |       }
321 |       CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
322 |       CostAggregationGenericIteration<T, ITER_NORMAL, MIN_NOCOMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
323 |     }
324 |   } else {
325 |     CostAggregationGenericIteration<T, ITER_COPY, MIN_COMPUTE, dir_type, true, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
326 | 
327 |     for(int i = 1; i < max_iter; i++) {
328 |       CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
329 |       CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
330 |     }
331 |     CostAggregationGenericIndexesIncrement<add_col, recompute, join_dispcomputation>(&index, &index_im, &col, add_index, add_imindex);
332 |     CostAggregationGenericIteration<T, ITER_NORMAL, MIN_NOCOMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
333 |   }
334 | }
335 | // Walks one diagonal aggregation path. Each lane (threadIdx.x % WARP_SIZE) works from disparity offset 4*lane. The path takes max_iter+1 steps: a first ITER_COPY step at (initial_row, initial_col), max_iter-1 loop steps, and one final MIN_NOCOMPUTE step. index/index_im/col are advanced by CostAggregationDiagonalGenericIndexesIncrement; a step uses ITER_COPY instead of ITER_NORMAL whenever col equals col_copycost. col_nomin is accepted but never read in this body.
336 | template<int add_index, class T, int dir_type, bool recompute, bool join_dispcomputation>
337 | __device__ __forceinline__ void CostAggregationDiagonalGeneric(uint8_t* d_cost, uint8_t *d_L, uint16_t *d_s, const int P1, const int P2, const int initial_row, const int initial_col, const int max_iter, const int col_nomin, const int col_copycost, const int cols, const T *_d_transform0, const T *_d_transform1, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
338 |   const int lane = threadIdx.x % WARP_SIZE;
339 |   const int dis = 4*lane;  // first disparity offset handled by this lane
340 |   int col = initial_col;
341 |   int index = initial_row*cols*MAX_DISPARITY+initial_col*MAX_DISPARITY+dis;  // flat offset built from (row, col, disparity)
342 |   int index_im;  // only assigned when recompute or join_dispcomputation is set
343 |   if(recompute || join_dispcomputation) {
344 |     index_im = initial_row*cols+col;
345 |   }
346 |   const int MAX_PAD = UCHAR_MAX-P1;
347 |   const uint32_t p1_vector = uchars_to_uint32(P1, P1, P1, P1);
348 |   const uint32_t p2_vector = uchars_to_uint32(P2, P2, P2, P2);
349 |   int old_value1;
350 |   int old_value2;
351 |   int old_value3;
352 |   int old_value4;
353 |   uint32_t min_cost, min_cost_p2, old_values;
354 |   T rp0, rp1, rp2, rp3;
355 |   // First step of the path: the only call that passes true as the fifth template argument.
356 |   CostAggregationGenericIteration<T, ITER_COPY, MIN_COMPUTE, dir_type, true, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
357 |   for(int i = 1; i < max_iter; i++) {
358 |     CostAggregationDiagonalGenericIndexesIncrement<add_index, recompute, join_dispcomputation>(&index, &index_im, &col, cols, initial_row, i, dis);
359 |     if(col == col_copycost) {
360 |       CostAggregationGenericIteration<T, ITER_COPY, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
361 |     } else {
362 |       CostAggregationGenericIteration<T, ITER_NORMAL, MIN_COMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
363 |     }
364 |   }
365 |   // Final step: max_iter goes in the slot the loop uses for i, i.e. (cols, initial_row, <step>, dis), and MIN_NOCOMPUTE replaces MIN_COMPUTE.
366 |   CostAggregationDiagonalGenericIndexesIncrement<add_index, recompute, join_dispcomputation>(&index, &index_im, &col, cols, initial_row, max_iter, dis);
367 |   if(col == col_copycost) {
368 |     CostAggregationGenericIteration<T, ITER_COPY, MIN_NOCOMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
369 |   } else {
370 |     CostAggregationGenericIteration<T, ITER_NORMAL, MIN_NOCOMPUTE, dir_type, false, recompute, join_dispcomputation>(index, index_im, col, &old_values, &old_value1, &old_value2, &old_value3, &old_value4, &min_cost, &min_cost_p2, d_cost, d_L, d_s, p1_vector, p2_vector, _d_transform0, _d_transform1, lane, MAX_PAD, dis, &rp0, &rp1, &rp2, &rp3, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
371 |   }
372 | }
373 | // Kernel for the diagonal path that starts on the last row (rows-1) and steps the column by -1. Every WARP_SIZE consecutive threads share one starting column; group 0 takes column cols-1, group 1 takes cols-2, and so on.
374 | template<class T>
375 | // All work is delegated to CostAggregationDiagonalGeneric with recompute and join_dispcomputation both false.
376 | __global__ void CostAggregationKernelDiagonalDownUpRightLeft(uint8_t* d_cost, uint8_t *d_L, uint16_t *d_s, const int P1, const int P2, const int rows, const int cols, const T *d_transform0, const T *d_transform1, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
377 |   const int initial_col = cols - (blockIdx.x*(blockDim.x/WARP_SIZE) + (threadIdx.x / WARP_SIZE)) - 1;
378 |   if(initial_col >= 0) {  // columns are counted down from cols-1, so surplus thread groups fall below 0, never above cols-1
379 |     const int initial_row = rows-1;
380 |     const int add_index = -1;
381 |     const int col_nomin = 0;
382 |     const int col_copycost = cols-1;
383 |     const int max_iter = rows-1;
384 |     const bool recompute = false;
385 |     const bool join_dispcomputation = false;
386 |     // add_index, recompute and join_dispcomputation are compile-time template arguments; the rest are forwarded at run time.
387 |     CostAggregationDiagonalGeneric<add_index, T, DIR_DOWNUP, recompute, join_dispcomputation>(d_cost, d_L, d_s, P1, P2, initial_row, initial_col, max_iter, col_nomin, col_copycost, cols, d_transform0, d_transform1, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
388 |   }
389 | }
390 | // Kernel for the diagonal path that starts on the last row (rows-1) and steps the column by +1. Every WARP_SIZE consecutive threads share one starting column, counted down from cols-1.
391 | template<class T>
392 | __global__ void CostAggregationKernelDiagonalDownUpLeftRight(uint8_t* d_cost, uint8_t *d_L, uint16_t *d_s, const int P1, const int P2, const int rows, const int cols, const T *d_transform0, const T *d_transform1, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
393 |   const unsigned int groups_per_block = blockDim.x/WARP_SIZE;
394 |   const unsigned int group_id = blockIdx.x*groups_per_block + (threadIdx.x / WARP_SIZE);
395 |   const int initial_col = cols - group_id - 1;
396 |   if(initial_col < 0) {
397 |     return;  // surplus thread group: no column left for it
398 |   }
399 |   const int start_row = rows-1;
400 |   const int steps = rows-1;
401 |   const int nomin_col = cols-1;   // forwarded as col_nomin
402 |   const int copycost_col = 0;     // forwarded as col_copycost
403 |   // Template arguments: add_index = 1, dir_type = DIR_DOWNUP, recompute = false, join_dispcomputation = false.
404 |   CostAggregationDiagonalGeneric<1, T, DIR_DOWNUP, false, false>(d_cost, d_L, d_s, P1, P2, start_row, initial_col, steps, nomin_col, copycost_col, cols, d_transform0, d_transform1, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
405 | }
406 | // Kernel for the diagonal path that starts on row 0 and steps the column by -1. Every WARP_SIZE consecutive threads share one starting column, counted up from 0.
407 | template<class T>
408 | // Unlike the other kernels in this header, this one instantiates the walker with join_dispcomputation = true.
409 | __global__ void CostAggregationKernelDiagonalUpDownRightLeft(uint8_t* d_cost, uint8_t *d_L, uint16_t *d_s, const int P1, const int P2, const int rows, const int cols, const T *d_transform0, const T *d_transform1, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
410 |   const unsigned int groups_per_block = blockDim.x/WARP_SIZE;
411 |   const int initial_col = blockIdx.x*groups_per_block + (threadIdx.x / WARP_SIZE);
412 |   if(initial_col >= cols) {
413 |     return;  // surplus thread group: past the last column
414 |   }
415 |   const int start_row = 0;
416 |   const int steps = rows-1;
417 |   const int nomin_col = 0;            // forwarded as col_nomin
418 |   const int copycost_col = cols-1;    // forwarded as col_copycost
419 |   const bool with_disparity = true;   // forwarded as join_dispcomputation
420 |   // Template arguments: add_index = -1, dir_type = DIR_UPDOWN, recompute = false, join_dispcomputation = with_disparity.
421 |   CostAggregationDiagonalGeneric<-1, T, DIR_UPDOWN, false, with_disparity>(d_cost, d_L, d_s, P1, P2, start_row, initial_col, steps, nomin_col, copycost_col, cols, d_transform0, d_transform1, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
422 | }
423 | // Kernel for the diagonal path that starts on row 0 and steps the column by +1. Every WARP_SIZE consecutive threads share one starting column, counted up from 0.
424 | template<class T>
425 | // All work is delegated to CostAggregationDiagonalGeneric with recompute and join_dispcomputation both false.
426 | __global__ void CostAggregationKernelDiagonalUpDownLeftRight(uint8_t* d_cost, uint8_t *d_L, uint16_t *d_s, const int P1, const int P2, const int rows, const int cols, const T *d_transform0, const T *d_transform1, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
427 |   const unsigned int groups_per_block = blockDim.x/WARP_SIZE;
428 |   const int initial_col = blockIdx.x*groups_per_block + (threadIdx.x / WARP_SIZE);
429 |   if(initial_col >= cols) {
430 |     return;  // surplus thread group: past the last column
431 |   }
432 |   const int start_row = 0;
433 |   const int steps = rows-1;
434 |   const int nomin_col = cols-1;   // forwarded as col_nomin
435 |   const int copycost_col = 0;     // forwarded as col_copycost
436 |   const int column_step = 1;      // forwarded as the add_index template argument
437 |   // Remaining template arguments: dir_type = DIR_UPDOWN, recompute = false, join_dispcomputation = false.
438 |   CostAggregationDiagonalGeneric<column_step, T, DIR_UPDOWN, false, false>(d_cost, d_L, d_s, P1, P2, start_row, initial_col, steps, nomin_col, copycost_col, cols, d_transform0, d_transform1, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
439 | }
440 | // Kernel for the horizontal path that starts at column 0 and moves towards column cols-1. Every WARP_SIZE consecutive threads share one image row.
441 | template<class T>
442 | // This is one of the two kernels in this header that instantiate CostAggregationGeneric with recompute = true.
443 | __global__ void CostAggregationKernelLeftToRight(uint8_t* d_cost, uint8_t *d_L, uint16_t *d_s, const int P1, const int P2, const int rows, const int cols, const T *d_transform0, const T *d_transform1, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
444 |   const unsigned int groups_per_block = blockDim.x/WARP_SIZE;
445 |   const int initial_row = blockIdx.x*groups_per_block + (threadIdx.x / WARP_SIZE);
446 |   if(initial_row >= rows) {
447 |     return;  // surplus thread group: past the last row
448 |   }
449 |   const int first_col = 0;
450 |   const int steps = cols-1;
451 |   const int cost_stride = MAX_DISPARITY;  // forwarded as add_index: one column forward in the cost volume
452 |   const int image_stride = 1;             // forwarded as add_imindex: one pixel forward in the image
453 |   const int column_step = 1;              // forwarded as the add_col template argument
454 |   // Remaining template arguments: dir_type = DIR_LEFTRIGHT, recompute = true, join_dispcomputation = false.
455 |   CostAggregationGeneric<T, column_step, DIR_LEFTRIGHT, true, false>(d_cost, d_L, d_s, P1, P2, initial_row, first_col, steps, cols, cost_stride, d_transform0, d_transform1, image_stride, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
456 | }
457 | // Kernel for the horizontal path that starts at column cols-1 and moves towards column 0. Every WARP_SIZE consecutive threads share one image row.
458 | template<class T>
459 | // This is one of the two kernels in this header that instantiate CostAggregationGeneric with recompute = true.
460 | __global__ void CostAggregationKernelRightToLeft(uint8_t* d_cost, uint8_t *d_L, uint16_t *d_s, const int P1, const int P2, const int rows, const int cols, const T *d_transform0, const T *d_transform1, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
461 |   const unsigned int groups_per_block = blockDim.x/WARP_SIZE;
462 |   const int initial_row = blockIdx.x*groups_per_block + (threadIdx.x / WARP_SIZE);
463 |   if(initial_row >= rows) {
464 |     return;  // surplus thread group: past the last row
465 |   }
466 |   const int first_col = cols-1;
467 |   const int steps = cols-1;
468 |   const int cost_stride = -MAX_DISPARITY;  // forwarded as add_index: one column backward in the cost volume
469 |   const int image_stride = -1;             // forwarded as add_imindex: one pixel backward in the image
470 |   const int column_step = -1;              // forwarded as the add_col template argument
471 |   // Remaining template arguments: dir_type = DIR_RIGHTLEFT, recompute = true, join_dispcomputation = false.
472 |   CostAggregationGeneric<T, column_step, DIR_RIGHTLEFT, true, false>(d_cost, d_L, d_s, P1, P2, initial_row, first_col, steps, cols, cost_stride, d_transform0, d_transform1, image_stride, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
473 | }
474 | // Kernel for the vertical path that starts on the last row (rows-1) and moves towards row 0. Every WARP_SIZE consecutive threads share one image column.
475 | template<class T>
476 | __global__ void CostAggregationKernelDownToUp(uint8_t* d_cost, uint8_t *d_L, uint16_t *d_s, const int P1, const int P2, const int rows, const int cols, const T *d_transform0, const T *d_transform1, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
477 |   const unsigned int groups_per_block = blockDim.x/WARP_SIZE;
478 |   const int initial_col = blockIdx.x*groups_per_block + (threadIdx.x / WARP_SIZE);
479 |   if(initial_col >= cols) {
480 |     return;  // surplus thread group: past the last column
481 |   }
482 |   const int start_row = rows-1;
483 |   const int steps = rows-1;
484 |   const int cost_stride = -cols*MAX_DISPARITY;  // forwarded as add_index: one row up in the cost volume
485 |   const int image_stride = -cols;               // forwarded as add_imindex: one row up in the image
486 |   const int column_step = 0;                    // forwarded as the add_col template argument
487 |   // Remaining template arguments: dir_type = DIR_DOWNUP, recompute = false, join_dispcomputation = false.
488 |   CostAggregationGeneric<T, column_step, DIR_DOWNUP, false, false>(d_cost, d_L, d_s, P1, P2, start_row, initial_col, steps, cols, cost_stride, d_transform0, d_transform1, image_stride, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
489 | }
490 | // Kernel for the vertical path that starts on row 0 and moves towards row rows-1. Every WARP_SIZE consecutive threads share one image column.
491 | template<class T>
492 | // Disabled launch-bounds hint, kept for reference: __launch_bounds__(64, 16)
493 | __global__ void CostAggregationKernelUpToDown(uint8_t* d_cost, uint8_t *d_L, uint16_t* d_s, const int P1, const int P2, const int rows, const int cols, const T *d_transform0, const T *d_transform1, uint8_t* __restrict__ d_disparity, const uint8_t* d_L0, const uint8_t* d_L1, const uint8_t* d_L2, const uint8_t* d_L3, const uint8_t* d_L4, const uint8_t* d_L5, const uint8_t* d_L6) {
494 |   const unsigned int groups_per_block = blockDim.x/WARP_SIZE;
495 |   const int initial_col = blockIdx.x*groups_per_block + (threadIdx.x / WARP_SIZE);
496 |   if(initial_col >= cols) {
497 |     return;  // surplus thread group: past the last column
498 |   }
499 |   const int start_row = 0;
500 |   const int steps = rows-1;
501 |   const int cost_stride = cols*MAX_DISPARITY;  // forwarded as add_index: one row down in the cost volume
502 |   const int image_stride = cols;               // forwarded as add_imindex: one row down in the image
503 |   const int column_step = 0;                   // forwarded as the add_col template argument
504 |   // Remaining template arguments: dir_type = DIR_UPDOWN, recompute = false, join_dispcomputation = false.
505 |   CostAggregationGeneric<T, column_step, DIR_UPDOWN, false, false>(d_cost, d_L, d_s, P1, P2, start_row, initial_col, steps, cols, cost_stride, d_transform0, d_transform1, image_stride, d_disparity, d_L0, d_L1, d_L2, d_L3, d_L4, d_L5, d_L6);
506 | }
507 | 
508 | } // namespace sgm_gpu
509 | 
510 | #endif /* COST_AGGREGATION_H_ */
511 | 


--------------------------------------------------------------------------------
/include/sgm_gpu/costs.h:
--------------------------------------------------------------------------------
 1 | /***********************************************************************
 2 |   Copyright (C) 2020 Hironori Fujimoto
 3 | 
 4 |   This program is free software: you can redistribute it and/or modify
 5 |   it under the terms of the GNU General Public License as published by
 6 |   the Free Software Foundation, either version 3 of the License, or
 7 |   (at your option) any later version.
 8 |  
 9 |   This program is distributed in the hope that it will be useful,
10 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |   GNU General Public License for more details.
13 |   You should have received a copy of the GNU General Public License
14 |   along with this program.  If not, see <http://www.gnu.org/licenses/>.
15 | ***********************************************************************/
16 | #ifndef SGM_GPU__COSTS_H_
17 | #define SGM_GPU__COSTS_H_
18 | 
19 | #include <stdint.h>
20 | #include "sgm_gpu/configuration.h"
21 | 
22 | namespace sgm_gpu
23 | {
24 | // Kernel declaration only; the body is not in this header (NOTE(review): presumably src/costs.cu - confirm). im and im2 are read through pointers-to-const; transform and transform2 are non-const cost_t buffers; rows and cols are unsigned 32-bit image dimensions. NOTE(review): the name suggests a center-symmetric census transform of both images - confirm against the definition.
25 | __global__ void CenterSymmetricCensusKernelSM2(const uint8_t *im, const uint8_t *im2, cost_t *transform, cost_t *transform2, const uint32_t rows, const uint32_t cols);
26 | 
27 | } // namespace sgm_gpu
28 | 
29 | #endif // SGM_GPU__COSTS_H_
30 | 
31 | 


--------------------------------------------------------------------------------
/include/sgm_gpu/hamming_cost.h:
--------------------------------------------------------------------------------
 1 | /***********************************************************************
 2 |   Copyright (C) 2020 Hironori Fujimoto
 3 | 
 4 |   This program is free software: you can redistribute it and/or modify
 5 |   it under the terms of the GNU General Public License as published by
 6 |   the Free Software Foundation, either version 3 of the License, or
 7 |   (at your option) any later version.
 8 | 
 9 |   This program is distributed in the hope that it will be useful,
10 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |   GNU General Public License for more details.
13 |   You should have received a copy of the GNU General Public License
14 |   along with this program.  If not, see <http://www.gnu.org/licenses/>.
15 | ***********************************************************************/
16 | 
17 | #ifndef SGM_GPU__HAMMING_COST_H_
18 | #define SGM_GPU__HAMMING_COST_H_
19 | 
20 | #include "sgm_gpu/configuration.h"
21 | #include "sgm_gpu/util.h"
22 | #include <stdint.h>
23 | 
24 | namespace sgm_gpu
25 | {
26 | // Kernel declaration only; the body is not in this header (NOTE(review): presumably src/hamming_cost.cu - confirm). d_transform0 and d_transform1 are read through pointers-to-const; d_cost is a non-const byte buffer; rows and cols are signed ints here. NOTE(review): the name suggests d_cost receives Hamming distances between the two transforms - confirm against the definition.
27 | __global__ void
28 | HammingDistanceCostKernel(const cost_t *d_transform0, const cost_t *d_transform1, uint8_t *d_cost, const int rows, const int cols );
29 | 
30 | } // namespace sgm_gpu
31 | 
32 | #endif // SGM_GPU__HAMMING_COST_H_ 
33 | 
34 | 


--------------------------------------------------------------------------------
/include/sgm_gpu/left_right_consistency.h:
--------------------------------------------------------------------------------
 1 | /***********************************************************************
 2 |   Copyright (C) 2020 Hironori Fujimoto
 3 | 
 4 |   This program is free software: you can redistribute it and/or modify
 5 |   it under the terms of the GNU General Public License as published by
 6 |   the Free Software Foundation, either version 3 of the License, or
 7 |   (at your option) any later version.
 8 |  
 9 |   This program is distributed in the hope that it will be useful,
10 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |   GNU General Public License for more details.
13 |   You should have received a copy of the GNU General Public License
14 |   along with this program.  If not, see <http://www.gnu.org/licenses/>.
15 | ***********************************************************************/
16 | 
17 | #ifndef SGM_GPU__LEFT_RIGHT_CONSISTENCY_H_
18 | #define SGM_GPU__LEFT_RIGHT_CONSISTENCY_H_
19 | 
20 | #include <stdint.h>
21 | 
22 | namespace sgm_gpu
23 | {
24 | 
/**
 * @brief Kernel declaration; the definition lives in src/left_right_consistency.cu.
 *
 * @param right_disparity Device output buffer (uint8_t per element).
 * @param smoothed_cost   Device input buffer of uint16_t costs.
 * @param rows            Image height in pixels.
 * @param cols            Image width in pixels.
 *
 * NOTE(review): presumably selects, for every pixel of the right image, the
 * disparity with the lowest aggregated cost - confirm against the definition.
 */
__global__ void ChooseRightDisparity(uint8_t *right_disparity, const uint16_t *smoothed_cost, const uint32_t rows, const uint32_t cols);
/**
 * @brief Kernel declaration; the definition lives in src/left_right_consistency.cu.
 *
 * @param disparity       Device buffer of left disparities; non-const, so it may be modified in place.
 * @param disparity_right Device buffer of right disparities (read-only).
 * @param rows            Image height in pixels.
 * @param cols            Image width in pixels.
 *
 * NOTE(review): presumably invalidates left disparities that disagree with
 * the right disparity map - confirm against the definition.
 */
__global__ void LeftRightConsistencyCheck(uint8_t *disparity, const uint8_t *disparity_right, const uint32_t rows, const uint32_t cols);
27 | 
28 | }
29 | 
30 | #endif // SGM_GPU__LEFT_RIGHT_CONSISTENCY_H_
31 | 
32 | 


--------------------------------------------------------------------------------
/include/sgm_gpu/median_filter.h:
--------------------------------------------------------------------------------
 1 | /***********************************************************************
 2 |   Copyright (C) 2020 Hironori Fujimoto
 3 | 
 4 |   This program is free software: you can redistribute it and/or modify
 5 |   it under the terms of the GNU General Public License as published by
 6 |   the Free Software Foundation, either version 3 of the License, or
 7 |   (at your option) any later version.
 8 | 
 9 |   This program is distributed in the hope that it will be useful,
10 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |   GNU General Public License for more details.
13 |   You should have received a copy of the GNU General Public License
14 |   along with this program.  If not, see <http://www.gnu.org/licenses/>.
15 | ***********************************************************************/
16 | 
17 | #ifndef SGM_GPU__MEDIAN_FILTER_H_
18 | #define SGM_GPU__MEDIAN_FILTER_H_
19 | 
20 | #include <stdint.h>
21 | 
22 | namespace sgm_gpu
23 | {
24 | 
/**
 * @brief Kernel declaration for a 3x3 median filter on an 8-bit image.
 *
 * @param d_input Device input image (read-only, must not alias d_out because of __restrict__).
 * @param d_out   Device output image.
 * @param rows    Image height in pixels.
 * @param cols    Image width in pixels.
 *
 * NOTE(review): presumably a thin wrapper around the MedianFilter device
 * template declared in this header - the definition is in src/median_filter.cu.
 */
__global__ void MedianFilter3x3(const uint8_t* __restrict__ d_input, uint8_t* __restrict__ d_out, const uint32_t rows, const uint32_t cols);
26 | 
/**
 * @brief Per-thread n x n median filter over a row-major image.
 *
 * Each thread handles the pixel whose linear index is
 * blockIdx.x * blockDim.x + threadIdx.x. Pixels whose full n x n
 * neighbourhood lies inside the image receive the median of that
 * neighbourhood; every other in-range pixel is copied through unchanged.
 * Threads whose index falls outside the image write nothing.
 *
 * @tparam n      Window edge length (the window radius is n / 2).
 * @tparam T      Pixel type; must support operator<.
 * @param d_input Device input image, rows * cols elements.
 * @param d_out   Device output image, rows * cols elements.
 * @param rows    Image height in pixels.
 * @param cols    Image width in pixels.
 */
template<int n, typename T>
__inline__ __device__ void MedianFilter(const T* __restrict__ d_input, T* __restrict__ d_out, const uint32_t rows, const uint32_t cols) {
  const uint32_t idx = blockIdx.x*blockDim.x+threadIdx.x;
  const uint32_t row = idx / cols;
  const uint32_t col = idx % cols;
  const uint32_t half = n/2;
  const uint32_t window_size = n*n;
  T window[n*n];

  // The upper bounds are written as `x + half < limit` rather than
  // `x < limit - half`: the latter underflows in unsigned arithmetic when the
  // image is smaller than the window radius, which would let the test pass and
  // read outside the image.
  const bool has_full_window =
    row >= half && col >= half && row + half < rows && col + half < cols;

  if(has_full_window) {
    // Gather the n x n neighbourhood centred on (row, col).
    for(uint32_t i = 0; i < n; i++) {
      for(uint32_t j = 0; j < n; j++) {
        window[i*n+j] = d_input[(row-half+i)*cols+col-half+j];
      }
    }

    // Partial selection sort: only the first window_size/2 + 1 positions need
    // to be ordered for the median slot to hold its final value.
    for(uint32_t i = 0; i < (window_size/2)+1; i++) {
      uint32_t min_idx = i;
      for(uint32_t j = i+1; j < window_size; j++) {
        if(window[j] < window[min_idx]) {
          min_idx = j;
        }
      }
      const T tmp = window[i];
      window[i] = window[min_idx];
      window[min_idx] = tmp;
    }
    d_out[idx] = window[window_size/2];
  } else if(row < rows && col < cols) {
    // Border pixel: no full window available, pass the input through.
    d_out[idx] = d_input[idx];
  }
}
58 | 
59 | } // namespace sgm_gpu
60 | 
61 | #endif // SGM_GPU__MEDIAN_FILTER_H_
62 | 
63 | 


--------------------------------------------------------------------------------
/include/sgm_gpu/sgm_gpu.h:
--------------------------------------------------------------------------------
  1 | /***********************************************************************
  2 |   Copyright (C) 2020 Hironori Fujimoto
  3 | 
  4 |   This program is free software: you can redistribute it and/or modify
  5 |   it under the terms of the GNU General Public License as published by
  6 |   the Free Software Foundation, either version 3 of the License, or
  7 |   (at your option) any later version.
  8 |  
  9 |   This program is distributed in the hope that it will be useful,
 10 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |   GNU General Public License for more details.
 13 |   You should have received a copy of the GNU General Public License
 14 |   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 15 | ***********************************************************************/
 16 | #ifndef SGM_GPU__SGM_GPU_H_
 17 | #define SGM_GPU__SGM_GPU_H_
 18 | 
 19 | #include "sgm_gpu/configuration.h"
 20 | 
 21 | #include <ros/ros.h>
 22 | #include <sensor_msgs/CameraInfo.h>
 23 | #include <sensor_msgs/Image.h>
 24 | #include <stereo_msgs/DisparityImage.h>
 25 | 
 26 | #include <opencv2/opencv.hpp>
 27 | 
 28 | namespace sgm_gpu
 29 | {
 30 | 
/**
 * @brief Stereo disparity estimator that keeps its working buffers in raw
 *        device pointers and exposes a single computeDisparity() entry point.
 *
 * NOTE(review): the class holds raw pointers and declares a destructor but no
 * copy constructor/assignment; if the destructor releases these buffers,
 * copying an instance would be unsafe - confirm against the implementation.
 */
class SgmGpu
{
private:
  // Node handle used to read this object's ROS parameters.
  std::shared_ptr<ros::NodeHandle> private_node_handle_;
  /**
   * @brief Parameter used in SGM algorithm
   *
   * See SGM paper.
   */
  uint8_t p1_, p2_;
  /**
   * @brief Enable/disable left-right consistency check
   */
  bool check_consistency_;

  // Memory for disparity computation
  // d_: for device
  // Input images (presumably left = 0, right = 1 - TODO confirm).
  uint8_t *d_im0_;
  uint8_t *d_im1_;
  // Per-image transform buffers, 32 bits per element.
  uint32_t *d_transform0_;
  uint32_t *d_transform1_;
  // Matching cost buffer.
  uint8_t *d_cost_;
  // Disparity maps and their filtered counterparts.
  uint8_t *d_disparity_;
  uint8_t *d_disparity_filtered_uchar_;
  uint8_t *d_disparity_right_;
  uint8_t *d_disparity_right_filtered_uchar_;
  // Eight per-path buffers (presumably one per SGM aggregation direction - TODO confirm).
  uint8_t *d_L0_;
  uint8_t *d_L1_;
  uint8_t *d_L2_;
  uint8_t *d_L3_;
  uint8_t *d_L4_;
  uint8_t *d_L5_;
  uint8_t *d_L6_;
  uint8_t *d_L7_;
  // 16-bit buffer (presumably the summed path costs - TODO confirm).
  uint16_t *d_s_;

  // True while the device buffers above are allocated.
  bool memory_allocated_;

  // Image size the buffers were allocated for.
  uint32_t cols_, rows_;

  void allocateMemory(uint32_t cols, uint32_t rows);
  void freeMemory();

  /**
   * @brief Resize both images so that width and height are divisible by 4,
   *        which the CUDA code requires
   */
  void resizeToDivisibleBy4(cv::Mat& left_image, cv::Mat& right_image);

  /**
   * @brief Fill a DisparityImage message from an 8-bit disparity map and the
   *        two camera infos
   */
  void convertToMsg(
    const cv::Mat_<unsigned char>& disparity, 
    const sensor_msgs::CameraInfo& left_camera_info,
    const sensor_msgs::CameraInfo& right_camera_info,
    stereo_msgs::DisparityImage& disparity_msg
  );

public:
  /**
   * @brief Constructor which uses the namespace <parent>/libsgm_gpu for ROS params
   */
  SgmGpu(const ros::NodeHandle& parent_node_handle);
  ~SgmGpu();

  /**
   * @brief Compute a disparity message from a stereo image pair
   *
   * @param left_image        Left input image
   * @param right_image       Right input image
   * @param left_camera_info  Camera info of the left camera
   * @param right_camera_info Camera info of the right camera
   * @param disparity_msg     Output disparity message
   * @return bool status (presumably true on success - TODO confirm)
   */
  bool computeDisparity(
    const sensor_msgs::Image& left_image, 
    const sensor_msgs::Image& right_image,
    const sensor_msgs::CameraInfo& left_camera_info,
    const sensor_msgs::CameraInfo& right_camera_info,
    stereo_msgs::DisparityImage& disparity_msg
  );
};
101 | 
102 | } // namespace sgm_gpu
103 | 
104 | #endif // SGM_GPU__SGM_GPU_H_
105 | 
106 | 


--------------------------------------------------------------------------------
/include/sgm_gpu/util.h:
--------------------------------------------------------------------------------
  1 | /***********************************************************************
  2 |   Copyright (C) 2020 Hironori Fujimoto
  3 | 
  4 |   This program is free software: you can redistribute it and/or modify
  5 |   it under the terms of the GNU General Public License as published by
  6 |   the Free Software Foundation, either version 3 of the License, or
  7 |   (at your option) any later version.
  8 | 
  9 |   This program is distributed in the hope that it will be useful,
 10 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |   GNU General Public License for more details.
 13 |   You should have received a copy of the GNU General Public License
 14 |   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 15 | ***********************************************************************/
 16 | 
 17 | #ifndef SGM_GPU__UTIL_H_
 18 | #define SGM_GPU__UTIL_H_
 19 | 
 20 | #include <iostream>
 21 | #include <dirent.h>
 22 | #include <stdio.h>
 23 | 
// Compile-time switch for the legacy code path. When true, LDG becomes a
// plain dereference and the warp shuffles below are emulated through shared
// memory instead of using the __shfl_*_sync intrinsics.
#define FERMI false

// Candidate block sizes for the two code paths.
#define GPU_THREADS_PER_BLOCK_FERMI 256
#define GPU_THREADS_PER_BLOCK_MAXWELL 64

/* Defines related to GPU Architecture */
// GPU_THREADS_PER_BLOCK resolves to the block size matching the FERMI switch.
#if FERMI
  #define GPU_THREADS_PER_BLOCK   GPU_THREADS_PER_BLOCK_FERMI
#else
  #define GPU_THREADS_PER_BLOCK   GPU_THREADS_PER_BLOCK_MAXWELL
#endif

// Number of threads per warp assumed by the shuffle and reduction helpers.
#define WARP_SIZE		32
 37 | 
 38 | namespace sgm_gpu
 39 | {
 40 | 
 41 | static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
 42 | #define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)
 43 | 
 44 | /**
 45 | * Check the return value of the CUDA runtime API call and exit
 46 | * the application if the call has failed.
 47 | */
 48 | static void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err) {
 49 |   if (err == cudaSuccess)
 50 |     return;
 51 |   std::cerr << statement<<" returned " << cudaGetErrorString(err) << "("<<err<< ") at "<<file<<":"<<line << std::endl;
 52 |   exit (1);
 53 | }
 54 | 
 55 | /*************************************
 56 | GPU Side defines (ASM instructions)
 57 | **************************************/
 58 | 
// c = a + b (32-bit unsigned); the carry-out is left in the condition-code
// register for a following addc instruction
#define UADD__CARRY_OUT(c, a, b) \
  asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b))

// c = a + b + carry-in; the new carry-out is again left in the
// condition-code register
#define UADD__IN_CARRY_OUT(c, a, b) \
  asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b))

// c = a + b + carry-in; does not produce a carry-out
#define UADD__IN_CARRY(c, a, b) \
  asm volatile("addc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b))

// packing: from the two 32-bit halves v.x, v.y (e.g. a uint2) to the 64-bit scalar s
#define V2S_B64(v,s) \
  asm("mov.b64 %0, {%1,%2};" : "=l"(s) : "r"(v.x), "r"(v.y))

// unpacking: from the 64-bit scalar s to the two 32-bit halves v.x, v.y
#define S2V_B64(s,v) \
  asm("mov.b64 {%0,%1}, %2;" : "=r"(v.x), "=r"(v.y) : "l"(s))
 78 | 
 79 | 
 80 | /*************************************
 81 | DEVICE side basic block primitives
 82 | **************************************/
 83 | 
// LDG(ptr): load the value ptr points to. The FERMI path is a plain
// dereference; otherwise the __ldg() intrinsic is used.
#if FERMI
  #define LDG(ptr)  (* ptr)
#else
  #define LDG(ptr)  __ldg(ptr)
#endif
 89 | 
#if FERMI
// Scratch buffer for the emulated shuffle: one int slot per thread of the block.
__shared__ int interBuff[GPU_THREADS_PER_BLOCK];
/**
 * Shared-memory emulation of a warp shuffle (only compiled when FERMI is set).
 *
 * Each thread stores scalarValue in its own lane slot of its warp's section
 * of interBuff, then returns the slot of lane (source_lane % WARP_SIZE) of
 * the same warp. The slots are accessed through a volatile pointer.
 * NOTE(review): there is no explicit barrier between the store and the load,
 * so this appears to rely on the threads of a warp executing in lockstep.
 */
__inline__ __device__ int __emulated_shfl(const int scalarValue, const uint32_t source_lane) {
  const int warpIdx = threadIdx.x / WARP_SIZE;
  const int laneIdx = threadIdx.x % WARP_SIZE;
  volatile int *interShuffle = interBuff + (warpIdx * WARP_SIZE);
  interShuffle[laneIdx] = scalarValue;
  return(interShuffle[source_lane % WARP_SIZE]);
}
#endif
100 | 
/**
 * Return the scalarValue held by lane `lane` of the calling thread's warp.
 * Uses __shfl_sync with a full-warp mask (0xffffffff), or the shared-memory
 * emulation when FERMI is set.
 */
__inline__ __device__ int shfl_32(int scalarValue, const int lane) {
  #if FERMI
    return __emulated_shfl(scalarValue, (uint32_t)lane);
  #else
    return __shfl_sync(0xffffffff, scalarValue, lane);
  #endif
}
108 | 
/**
 * Fetch scalarValue from the lane n positions below the caller within its
 * warp. Uses __shfl_up_sync with a full-warp mask, or the emulated shuffle
 * on the computed source lane when FERMI is set.
 */
__inline__ __device__ int shfl_up_32(int scalarValue, const int n) {
#if FERMI
  // Source lane = own lane minus the requested distance.
  const int source_lane = static_cast<int>(threadIdx.x % WARP_SIZE) - n;
  return shfl_32(scalarValue, source_lane);
#else
  return __shfl_up_sync(0xffffffff, scalarValue, n);
#endif
}
118 | 
/**
 * Fetch scalarValue from the lane n positions above the caller within its
 * warp. Uses __shfl_down_sync with a full-warp mask, or the emulated shuffle
 * on the computed source lane when FERMI is set.
 */
__inline__ __device__ int shfl_down_32(int scalarValue, const int n) {
#if FERMI
  // Source lane = own lane plus the requested distance.
  const int source_lane = static_cast<int>(threadIdx.x % WARP_SIZE) + n;
  return shfl_32(scalarValue, source_lane);
#else
  return __shfl_down_sync(0xffffffff, scalarValue, n);
#endif
}
128 | 
/**
 * Exchange scalarValue with the lane whose index is the caller's lane XOR n.
 * Uses __shfl_xor_sync with a full-warp mask, or the emulated shuffle on the
 * computed partner lane when FERMI is set.
 */
__inline__ __device__ int shfl_xor_32(int scalarValue, const int n) {
#if FERMI
  // Partner lane = own lane with the bits of n flipped.
  const int partner_lane = static_cast<int>(threadIdx.x % WARP_SIZE) ^ n;
  return shfl_32(scalarValue, partner_lane);
#else
  return __shfl_xor_sync(0xffffffff, scalarValue, n);
#endif
}
138 | 
/**
 * Load one 32-bit word from global memory with the PTX `.ca` cache operator.
 *
 * @param addr Global-memory address to read.
 * @return The loaded word.
 */
__device__ __forceinline__ uint32_t ld_gbl_ca(const __restrict__ uint32_t *addr) {
  uint32_t return_value;
  asm("ld.global.ca.u32 %0, [%1];" : "=r"(return_value) : "l"(addr));
  return return_value;
}
144 | 
/**
 * Load one 32-bit word from global memory with the PTX `.cs` cache operator.
 *
 * @param addr Global-memory address to read.
 * @return The loaded word.
 */
__device__ __forceinline__ uint32_t ld_gbl_cs(const __restrict__ uint32_t *addr) {
  uint32_t return_value;
  asm("ld.global.cs.u32 %0, [%1];" : "=r"(return_value) : "l"(addr));
  return return_value;
}
150 | 
/**
 * Store one 32-bit word to global memory with the PTX `.wt` cache operator.
 *
 * @param addr  Global-memory address to write (const-qualified in the
 *              signature for historical reasons; the pointee IS written).
 * @param value Word to store.
 */
__device__ __forceinline__ void st_gbl_wt(const __restrict__ uint32_t *addr, const uint32_t value) {
  // The statement has no output operands and writes memory only through the
  // address operand, so it must be `volatile` (otherwise the compiler may
  // drop or move it) and must declare a "memory" clobber (so surrounding
  // loads and stores are not reordered across it).
  asm volatile("st.global.wt.u32 [%0], %1;" :: "l"(addr), "r"(value) : "memory");
}
154 | 
/**
 * Store one 32-bit word to global memory with the PTX `.cs` cache operator.
 *
 * @param addr  Global-memory address to write (const-qualified in the
 *              signature for historical reasons; the pointee IS written).
 * @param value Word to store.
 */
__device__ __forceinline__ void st_gbl_cs(const __restrict__ uint32_t *addr, const uint32_t value) {
  // The statement has no output operands and writes memory only through the
  // address operand, so it must be `volatile` (otherwise the compiler may
  // drop or move it) and must declare a "memory" clobber (so surrounding
  // loads and stores are not reordered across it).
  asm volatile("st.global.cs.u32 [%0], %1;" :: "l"(addr), "r"(value) : "memory");
}
158 | 
/**
 * Read the PTX special register %smid.
 *
 * @return The value of %smid for the calling thread.
 */
__device__ __forceinline__ uint32_t gpu_get_sm_idx(){
  uint32_t smid;
  asm volatile("mov.u32 %0, %%smid;" : "=r"(smid));
  return(smid);
}
164 | 
/**
 * Split a 32-bit word into its four bytes, each returned zero-extended in an int.
 *
 * Every __byte_perm call places one byte of s in the low byte of the result
 * and fills the remaining bytes from the zero operand; the plain shift/mask
 * equivalent is given next to each call.
 *
 * @param s  Packed word.
 * @param u1 Receives byte 0 (least significant) of s.
 * @param u2 Receives byte 1 of s.
 * @param u3 Receives byte 2 of s.
 * @param u4 Receives byte 3 (most significant) of s.
 */
__device__ __forceinline__ void uint32_to_uchars(const uint32_t s, int *u1, int *u2, int *u3, int *u4) {
  // equivalent to: *u1 = s & 0xff;
  *u1 = __byte_perm(s, 0, 0x4440);
  // equivalent to: *u2 = (s>>8) & 0xff;
  *u2 = __byte_perm(s, 0, 0x4441);
  // equivalent to: *u3 = (s>>16) & 0xff;
  *u3 = __byte_perm(s, 0, 0x4442);
  // equivalent to: *u4 = s>>24;
  *u4 = __byte_perm(s, 0, 0x4443);
}
175 | 
/**
 * Pack four byte values into one 32-bit word, u1 in the least significant
 * byte and u4 in the most significant one.
 *
 * The low half is built with a plain OR/shift; the high half (u3, u4) is
 * placed with a single __byte_perm.
 * NOTE(review): the result matches u1 | (u2<<8) | (u3<<16) | (u4<<24) only
 * when every argument is in 0..255 - confirm callers guarantee that.
 */
__device__ __forceinline__ uint32_t uchars_to_uint32(int u1, int u2, int u3, int u4) {
  // Earlier formulations, kept for reference:
  //return u1 | (u2<<8) | (u3<<16) | (u4<<24);
  //return __byte_perm(u1, u2, 0x7740) + __byte_perm(u3, u4, 0x4077);
  return u1 | (u2<<8) | __byte_perm(u3, u4, 0x4077);
}
181 | 
/**
 * Broadcast the least significant byte of u1 into all four bytes of a 32-bit
 * word (selector 0x0 picks byte 0 of the first operand for every position).
 */
__device__ __forceinline__ uint32_t uchar_to_uint32(int u1) {
  return __byte_perm(u1, u1, 0x0);
}
185 | 
/**
 * Build a per-byte mask from the 32-bit difference a - b.
 *
 * The difference is computed as one full-width subtraction, then prmt with
 * selector 0xba98 expands the most significant bit of each byte of the
 * difference into that whole byte.
 * NOTE(review): because the subtraction is not done per byte, borrows
 * propagate between byte lanes; the exact comparison this implements (and
 * its polarity) should be verified against the intended "a >= b" semantics.
 */
__device__ __forceinline__ unsigned int vcmpgeu4(unsigned int a, unsigned int b) {
    unsigned int r, c;
    c = a-b;
    asm ("prmt.b32 %0,%1,0,0xba98;" : "=r"(r) : "r"(c));// build mask from msbs
    return r;           // per-byte mask derived from the msbs of (a - b)
}
192 | 
/**
 * Per-byte selection between a and b driven by the mask vcmpgeu4(b, a).
 *
 * For every byte lane the result takes the byte of a where the mask is set
 * and the byte of b where it is clear.
 * NOTE(review): the name suggests a per-byte unsigned minimum, but the mask
 * comes from the sign bits of a full-width (b - a); verify that the selected
 * operand is really the smaller one for the value ranges used by callers.
 */
__device__ __forceinline__ unsigned int vminu4(unsigned int a, unsigned int b) {
    unsigned int r, s;
    s = vcmpgeu4 (b, a);// per-byte mask built from the msbs of (b - a)
    r = a & s;          // keep bytes of a where the mask is set
    s = b & ~s;         // keep bytes of b where the mask is clear
    r = r | s;          // combine byte selections
    return r;
}
201 | 
/**
 * Debug helper: print the four bytes of s, least significant first, prefixed
 * by the label str.
 */
__device__ __forceinline__ void print_uchars(const char* str, const uint32_t s) {
  int bytes[4];
  uint32_to_uchars(s, &bytes[0], &bytes[1], &bytes[2], &bytes[3]);
  printf("%s: %d %d %d %d\n", str, bytes[0], bytes[1], bytes[2], bytes[3]);
}
207 | 
/**
 * Count the set bits of n.
 *
 * Uses the 32-bit __popc when CSCT or CSCT_RECOMPUTE evaluates to true at
 * preprocessing time, and the 64-bit __popcll otherwise (including when
 * neither macro is defined).
 * NOTE(review): this header does not define those macros itself; they
 * presumably come from sgm_gpu/configuration.h, which must be included first.
 */
template<class T>
__device__ __forceinline__ int popcount(T n) {
#if CSCT or CSCT_RECOMPUTE
  return __popc(n);
#else
  return __popcll(n);
#endif
}
216 | 
/**
 * Find the smallest of four (value, index) candidates.
 *
 * Candidates are compared pairwise (1 vs 2, 3 vs 4) and then the two pair
 * winners are compared; on ties the earlier candidate wins.
 *
 * @param min_idx Receives the index (dis, dis2, dis3 or dis4) of the winner.
 * @return The smallest value.
 */
__inline__ __device__ uint8_t minu8_index4(int *min_idx, const uint8_t val1, const int dis, const uint8_t val2, const int dis2, const uint8_t val3, const int dis3, const uint8_t val4, const int dis4) {
  // Winner of the first pair.
  const bool second_wins = val1 > val2;
  const uint8_t front_min = second_wins ? val2 : val1;
  const int front_idx = second_wins ? dis2 : dis;

  // Winner of the second pair.
  const bool fourth_wins = val3 > val4;
  const uint8_t back_min = fourth_wins ? val4 : val3;
  const int back_idx = fourth_wins ? dis4 : dis3;

  // Final round between the two pair winners.
  if(front_min > back_min) {
    *min_idx = back_idx;
    return back_min;
  }
  *min_idx = front_idx;
  return front_min;
}
240 | 
/**
 * Find the smallest of eight (value, index) candidates by reducing each half
 * with minu8_index4 and comparing the two results; on ties the first half wins.
 *
 * @param min_idx Receives the index associated with the winning value.
 * @return The smallest value.
 */
__inline__ __device__ uint8_t minu8_index8(int *min_idx, const uint8_t val1, const int dis, const uint8_t val2, const int dis2, const uint8_t val3, const int dis3, const uint8_t val4, const int dis4, const uint8_t val5, const int dis5, const uint8_t val6, const int dis6, const uint8_t val7, const int dis7, const uint8_t val8, const int dis8) {
  int lower_idx;
  int upper_idx;
  const uint8_t lower_min = minu8_index4(&lower_idx, val1, dis, val2, dis2, val3, dis3, val4, dis4);
  const uint8_t upper_min = minu8_index4(&upper_idx, val5, dis5, val6, dis6, val7, dis7, val8, dis8);

  const bool upper_wins = lower_min > upper_min;
  *min_idx = upper_wins ? upper_idx : lower_idx;
  return upper_wins ? upper_min : lower_min;
}
256 | 
/**
 * Butterfly reduction over the warp that tracks both the minimum value and
 * the index attached to it.
 *
 * @param val In/out: the caller's value on entry, the reduced minimum on return.
 * @param idx Index attached to the caller's value.
 * @return Index attached to the reduced minimum.
 */
__inline__ __device__ int warpReduceMinIndex2(int *val, int idx) {
  for(int stride = 1; stride < WARP_SIZE; stride <<= 1) {
    // Exchange (value, index) with the partner lane at this stride.
    const int partner_val = shfl_xor_32(*val, stride);
    const int partner_idx = shfl_xor_32(idx, stride);
    if(partner_val < *val) {
      *val = partner_val;
      idx = partner_idx;
    }
  }
  return idx;
}
268 | 
/**
 * Butterfly reduction over the warp returning only the index attached to the
 * minimum value (the reduced value itself is discarded).
 *
 * @param val The caller's value.
 * @param idx Index attached to the caller's value.
 * @return Index attached to the reduced minimum.
 */
__inline__ __device__ int warpReduceMinIndex(int val, int idx) {
  for(int stride = 1; stride < WARP_SIZE; stride <<= 1) {
    // Exchange (value, index) with the partner lane at this stride.
    const int partner_val = shfl_xor_32(val, stride);
    const int partner_idx = shfl_xor_32(idx, stride);
    if(partner_val < val) {
      val = partner_val;
      idx = partner_idx;
    }
  }
  return idx;
}
280 | 
/**
 * Butterfly min-reduction over the warp: XOR-shuffles at strides
 * 1, 2, 4, 8 and 16 (WARP_SIZE is 32) combine the values with min().
 *
 * @param val The caller's value.
 * @return The reduced minimum.
 */
__inline__ __device__ int warpReduceMin(int val) {
  #pragma unroll
  for(int stride = 1; stride < WARP_SIZE; stride <<= 1) {
    val = min(val, shfl_xor_32(val, stride));
  }
  return val;
}
289 | 
/**
 * Min-reduction across the thread block in two stages: each warp reduces its
 * own values, lane 0 of every warp publishes the partial result to shared
 * memory, and after a block barrier the first warp reduces the partials.
 * The value returned by threads outside the first warp is not the block
 * minimum.
 *
 * @param val The caller's value.
 * @return For threads of warp 0, the reduced minimum.
 */
__inline__ __device__ int blockReduceMin(int val) {
  static __shared__ int warp_minima[WARP_SIZE]; // one partial minimum per warp
  const int wid = threadIdx.x / WARP_SIZE;
  const int lane = threadIdx.x % WARP_SIZE;

  // Stage 1: reduce inside each warp and publish the warp's result.
  val = warpReduceMin(val);
  if (lane == 0) {
    warp_minima[wid] = val;
  }

  __syncthreads(); // all partials must be visible before stage 2

  // Only slots belonging to warps that exist hold data; everything else is
  // neutral for min().
  const bool slot_is_valid = threadIdx.x < blockDim.x / warpSize;
  if (slot_is_valid) {
    val = warp_minima[lane];
  } else {
    val = INT_MAX;
  }

  // Stage 2: the first warp folds the partials together.
  if (wid == 0) {
    val = warpReduceMin(val);
  }

  return val;
}
308 | 
/**
 * Block-wide arg-min in two stages: every warp reduces its (value, index)
 * pairs, lane 0 of each warp publishes the partial pair to shared memory,
 * and after a block barrier the first warp reduces the partial pairs.
 * The index returned by threads outside the first warp is not the block result.
 *
 * @param val The caller's value.
 * @param idx Index attached to the caller's value.
 * @return For threads of warp 0, the index attached to the reduced minimum.
 */
__inline__ __device__ int blockReduceMinIndex(int val, int idx) {
  static __shared__ int warp_min_val[WARP_SIZE]; // one partial minimum per warp
  static __shared__ int warp_min_idx[WARP_SIZE]; // index attached to each partial
  const int wid = threadIdx.x / WARP_SIZE;
  const int lane = threadIdx.x % WARP_SIZE;

  // Stage 1: reduce inside each warp and publish the warp's pair.
  idx = warpReduceMinIndex2(&val, idx);
  if (lane == 0) {
    warp_min_val[wid] = val;
    warp_min_idx[wid] = idx;
  }

  __syncthreads(); // all partials must be visible before stage 2

  // Only slots belonging to warps that exist hold data; the rest are filled
  // with a neutral pair.
  if (threadIdx.x < blockDim.x / WARP_SIZE) {
    val = warp_min_val[lane];
    idx = warp_min_idx[lane];
  } else {
    val = INT_MAX;
    idx = INT_MAX;
  }

  // Stage 2: the first warp folds the partial pairs together.
  if (wid == 0) {
    idx = warpReduceMinIndex2(&val, idx);
  }

  return idx;
}
334 | 
335 | 
/**
 * Block-wide logical OR in two stages: each warp evaluates __any_sync on its
 * threads' conditions, lane 0 of every warp publishes the warp result to
 * shared memory, and after a block barrier the first warp combines the
 * published flags. The value returned by threads outside the first warp is
 * not the block-wide result.
 *
 * @param local_condition The caller's predicate.
 * @return For threads of warp 0, whether any published warp flag was set.
 */
__inline__ __device__ bool blockAny(bool local_condition) {
  __shared__ bool warp_flags[WARP_SIZE]; // one flag per warp
  const int wid = threadIdx.x / WARP_SIZE;
  const int lane = threadIdx.x % WARP_SIZE;

  // Stage 1: vote inside each warp and publish the warp's flag.
  local_condition = __any_sync(0xffffffff, local_condition);
  if (lane == 0) {
    warp_flags[wid] = local_condition;
  }

  __syncthreads(); // all flags must be visible before stage 2

  // Only slots belonging to warps that exist hold data; the rest count as false.
  if (threadIdx.x < blockDim.x / WARP_SIZE) {
    local_condition = warp_flags[lane];
  } else {
    local_condition = false;
  }

  // Stage 2: the first warp votes over the published flags.
  if (wid == 0) {
    local_condition = __any_sync(0xffffffff, local_condition);
  }

  return local_condition;
}
358 | 
359 | } //namespace sgm_gpu
360 | 
361 | #endif // SGM_GPU__UTIL_H_
362 | 
363 | 


--------------------------------------------------------------------------------
/include/sgm_gpu_node.h:
--------------------------------------------------------------------------------
 1 | /***********************************************************************
 2 |   Copyright (C) 2020 Hironori Fujimoto
 3 | 
 4 |   This program is free software: you can redistribute it and/or modify
 5 |   it under the terms of the GNU General Public License as published by
 6 |   the Free Software Foundation, either version 3 of the License, or
 7 |   (at your option) any later version.
 8 |  
 9 |   This program is distributed in the hope that it will be useful,
10 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |   GNU General Public License for more details.
13 |   You should have received a copy of the GNU General Public License
14 |   along with this program.  If not, see <http://www.gnu.org/licenses/>.
15 | ***********************************************************************/
16 | #ifndef SGM_GPU__SGM_GPU_NODE_H_
17 | #define SGM_GPU__SGM_GPU_NODE_H_
18 | 
19 | #include "sgm_gpu/sgm_gpu.h"
20 | 
21 | #include <image_transport/camera_common.h>
22 | #include <image_transport/image_transport.h>
23 | #include <image_transport/subscriber_filter.h>
24 | #include <message_filters/subscriber.h>
25 | #include <message_filters/time_synchronizer.h>
26 | 
27 | namespace sgm_gpu
28 | {
29 | 
/**
 * @brief ROS node wrapper around SgmGpu: holds the subscribers for a stereo
 *        image pair plus camera infos, a synchronizer for the four topics
 *        and a publisher for the disparity output.
 */
class SgmGpuNode
{
private:
  // Node handles (presumably public and private "~" namespaces - TODO confirm).
  std::shared_ptr<ros::NodeHandle> node_handle_;
  std::shared_ptr<ros::NodeHandle> private_node_handle_;

  std::shared_ptr<image_transport::ImageTransport> image_transport_;

  // Disparity estimator invoked from the stereo callback.
  std::shared_ptr<SgmGpu> sgm_;

  // Inputs: left/right images and their camera infos.
  image_transport::SubscriberFilter left_image_sub_;
  image_transport::SubscriberFilter right_image_sub_;
  message_filters::Subscriber<sensor_msgs::CameraInfo> left_info_sub_;
  message_filters::Subscriber<sensor_msgs::CameraInfo> right_info_sub_;

  // Synchronizer over the four input streams (two images, two camera infos).
  using StereoSynchronizer = message_filters::TimeSynchronizer
  <
    sensor_msgs::Image, sensor_msgs::Image, 
    sensor_msgs::CameraInfo, sensor_msgs::CameraInfo
  >;
  std::shared_ptr<StereoSynchronizer> stereo_synchronizer_;

  // Output: disparity messages.
  ros::Publisher disparity_pub_;
  
  /**
   * @brief Callback receiving one synchronized set of left/right images and
   *        camera infos
   */
  void stereoCallback(
    const sensor_msgs::ImageConstPtr &left_image_msg, 
    const sensor_msgs::ImageConstPtr &right_image_msg, 
    const sensor_msgs::CameraInfoConstPtr &left_info_msg, 
    const sensor_msgs::CameraInfoConstPtr &right_info_msg
  );

public:
  SgmGpuNode();
};
64 | 
65 | } // namespace sgm_gpu
66 | 
67 | #endif
68 | 
69 | 


--------------------------------------------------------------------------------
/launch/test.launch:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | 
<launch>
  <!-- Replay the bundled test bag in an endless loop as the stereo input. -->
  <node name="bag_player" pkg="rosbag" type="play" args="--loop $(find sgm_gpu)/test_input.bag"/>

  <!-- Disparity node: subscribes to the compressed left/right image topics from the bag. -->
  <node name="sgm_gpu_node" pkg="sgm_gpu" type="sgm_gpu_node">
    <param name="image_transport" value="compressed"/>
    <remap from="left_image" to="stereo_robot/mobile_base/camera/left/image_raw"/>
    <remap from="right_image" to="stereo_robot/mobile_base/camera/right/image_raw"/>
  </node>

  <!-- Viewer for the left input image (also via compressed transport). -->
  <node name="input_viewer" pkg="image_view" type="image_view">
    <param name="image_transport" value="compressed"/>
    <remap from="image" to="stereo_robot/mobile_base/camera/left/image_raw"/>
  </node>
  <!-- Viewer for the disparity output of sgm_gpu_node. -->
  <node name="disparity_viewer" pkg="image_view" type="disparity_view">
    <remap from="image" to="sgm_gpu_node/disparity"/>
  </node>
</launch>
20 | 


--------------------------------------------------------------------------------
/package.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | <?xml-model href="http://download.ros.org/schema/package_format2.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?> 
 3 | 
<package format="2">
  <name>sgm_gpu</name>
  <version>0.0.0</version>
  <description>A ROS package of Semi-Global Matching on the GPU</description>
  <maintainer email="fujimoto@aisl.cs.tut.ac.jp">Hironori Fujimoto</maintainer>
  <license>GPL3</license>
	<url type="website">https://github.com/ActiveIntelligentSystemsLab/sgm_gpu_ros</url>

  <buildtool_depend>catkin</buildtool_depend>
  <!-- catkin packages; the same components are requested by find_package(catkin ...) in CMakeLists.txt -->
  <depend>cv_bridge</depend>
  <depend>image_geometry</depend>
  <depend>image_transport</depend>
  <depend>message_filters</depend>
  <depend>roscpp</depend>
  <depend>sensor_msgs</depend>
  <depend>stereo_msgs</depend>
  <!-- rosdep keys -->
  <depend>libopencv-dev</depend>
  <!-- CUDA: one key is needed at execution time, the other at build time -->
  <exec_depend>nvidia-cuda</exec_depend>
  <build_depend>nvidia-cuda-dev</build_depend>
</package>
25 | 
26 | 


--------------------------------------------------------------------------------
/src/costs.cu:
--------------------------------------------------------------------------------
  1 | /***********************************************************************
  2 |   Copyright (C) 2019 Hironori Fujimoto
  3 | 
  4 |   This program is free software: you can redistribute it and/or modify
  5 |   it under the terms of the GNU General Public License as published by
  6 |   the Free Software Foundation, either version 3 of the License, or
  7 |   (at your option) any later version.
  8 | 
  9 |   This program is distributed in the hope that it will be useful,
 10 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |   GNU General Public License for more details.
 13 |   You should have received a copy of the GNU General Public License
 14 |   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 15 | ***********************************************************************/
 16 | 
 17 | #include "sgm_gpu/costs.h"
 18 | #include <stdio.h>
 19 | 
 20 | namespace sgm_gpu
 21 | {
 22 | 
/**
 * Compute a center-symmetric census descriptor for every pixel of two images
 * in one pass.
 *
 * Each block first stages a tile of both images in shared memory, with a halo
 * of TOP rows and LEFT columns on every side, then every thread builds the
 * descriptor of its own pixel from the tile. Bit (k * CENSUS_WIDTH + m) of a
 * descriptor is set when the window pixel at (k, m) is >= the pixel mirrored
 * through the window centre, at (2*TOP - k, 2*LEFT - m).
 *
 * @param im         First input image, row-major, rows * cols bytes.
 * @param im2        Second input image, same layout.
 * @param transform  Output descriptors for im, indexed row * cols + col.
 * @param transform2 Output descriptors for im2, same layout.
 * @param rows       Image height in pixels.
 * @param cols       Image width in pixels.
 *
 * NOTE(review): the tile size is hard-wired to 32 + halo in each dimension,
 * so this presumably requires a 32x32 thread block - confirm against the
 * launch site.
 */
__global__ void 
__launch_bounds__(1024, 2)
CenterSymmetricCensusKernelSM2(const uint8_t *im, const uint8_t *im2, cost_t *transform, cost_t *transform2, const uint32_t rows, const uint32_t cols) {
  // Image coordinates of the pixel this thread is responsible for.
  const int idx = blockIdx.x*blockDim.x+threadIdx.x;
  const int idy = blockIdx.y*blockDim.y+threadIdx.y;

  const int win_cols = (32+LEFT*2); // 32+4*2 = 40
  const int win_rows = (32+TOP*2); // 32+3*2 = 38

  // Shared-memory tiles (block area plus halo) for the two images.
  __shared__ uint8_t window[win_cols*win_rows];
  __shared__ uint8_t window2[win_cols*win_rows];

  // First load: thread number `id` within the block fills tile element `id`.
  const int id = threadIdx.y*blockDim.x+threadIdx.x;
  const int sm_row = id / win_cols;
  const int sm_col = id % win_cols;

  // Image position of that tile element; the tile origin is shifted by the halo.
  const int im_row = blockIdx.y*blockDim.y+sm_row-TOP;
  const int im_col = blockIdx.x*blockDim.x+sm_col-LEFT;
  // Elements that fall outside the image are stored as 0.
  const bool boundaries = (im_row >= 0 && im_col >= 0 && im_row < rows && im_col < cols);
  window[sm_row*win_cols+sm_col] = boundaries ? im[im_row*cols+im_col] : 0;
  window2[sm_row*win_cols+sm_col] = boundaries ? im2[im_row*cols+im_col] : 0;

  // Not enough threads to fill window and window2
  // Second load: the first (tile size - block size) threads fill the
  // remaining tile elements, offset by block_size.
  const int block_size = blockDim.x*blockDim.y;
  if(id < (win_cols*win_rows-block_size)) {
    const int id = threadIdx.y*blockDim.x+threadIdx.x+block_size;
    const int sm_row = id / win_cols;
    const int sm_col = id % win_cols;

    const int im_row = blockIdx.y*blockDim.y+sm_row-TOP;
    const int im_col = blockIdx.x*blockDim.x+sm_col-LEFT;
    const bool boundaries = (im_row >= 0 && im_col >= 0 && im_row < rows && im_col < cols);
    window[sm_row*win_cols+sm_col] = boundaries ? im[im_row*cols+im_col] : 0;
    window2[sm_row*win_cols+sm_col] = boundaries ? im2[im_row*cols+im_col] : 0;
  }

  // The whole tile must be loaded before any thread reads it.
  __syncthreads();
  cost_t census = 0;
  cost_t census2 = 0;
  if(idy < rows && idx < cols) {
      // Rows above the middle row: every pixel is paired with its mirror
      // image through the window centre.
      for(int k = 0; k < CENSUS_HEIGHT/2; k++) {
        for(int m = 0; m < CENSUS_WIDTH; m++) {
          const uint8_t e1 = window[(threadIdx.y+k)*win_cols+threadIdx.x+m];
          const uint8_t e2 = window[(threadIdx.y+2*TOP-k)*win_cols+threadIdx.x+2*LEFT-m];
          const uint8_t i1 = window2[(threadIdx.y+k)*win_cols+threadIdx.x+m];
          const uint8_t i2 = window2[(threadIdx.y+2*TOP-k)*win_cols+threadIdx.x+2*LEFT-m];

          const int shft = k*CENSUS_WIDTH+m;
          // Compare the pixel with its mirrored counterpart (first image)
          cost_t tmp = (e1 >= e2);
          // Shift to the desired position
          tmp <<= shft;
          // Add it to its place
          census |= tmp;
          // Compare the pixel with its mirrored counterpart (second image)
          cost_t tmp2 = (i1 >= i2);
          // Shift to the desired position
          tmp2 <<= shft;
          // Add it to its place
          census2 |= tmp2;
        }
      }
      // Odd window height: the middle row is mirrored onto itself, so only
      // its left half needs to be compared.
      if(CENSUS_HEIGHT % 2 != 0) {
        const int k = CENSUS_HEIGHT/2;
        for(int m = 0; m < CENSUS_WIDTH/2; m++) {
          const uint8_t e1 = window[(threadIdx.y+k)*win_cols+threadIdx.x+m];
          const uint8_t e2 = window[(threadIdx.y+2*TOP-k)*win_cols+threadIdx.x+2*LEFT-m];
          const uint8_t i1 = window2[(threadIdx.y+k)*win_cols+threadIdx.x+m];
          const uint8_t i2 = window2[(threadIdx.y+2*TOP-k)*win_cols+threadIdx.x+2*LEFT-m];

          const int shft = k*CENSUS_WIDTH+m;
          // Compare the pixel with its mirrored counterpart (first image)
          cost_t tmp = (e1 >= e2);
          // Shift to the desired position
          tmp <<= shft;
          // Add it to its place
          census |= tmp;
          // Compare the pixel with its mirrored counterpart (second image)
          cost_t tmp2 = (i1 >= i2);
          // Shift to the desired position
          tmp2 <<= shft;
          // Add it to its place
          census2 |= tmp2;
        }
      }

    // Store the finished descriptors for this thread's pixel.
    transform[idy*cols+idx] = census;
    transform2[idy*cols+idx] = census2;
  }
}
113 | 
114 | } // namespace sgm_gpu
115 | 
116 | 


--------------------------------------------------------------------------------
/src/hamming_cost.cu:
--------------------------------------------------------------------------------
 1 | /***********************************************************************
 2 |   Copyright (C) 2019 Hironori Fujimoto
 3 | 
 4 |   This program is free software: you can redistribute it and/or modify
 5 |   it under the terms of the GNU General Public License as published by
 6 |   the Free Software Foundation, either version 3 of the License, or
 7 |   (at your option) any later version.
 8 | 
 9 |   This program is distributed in the hope that it will be useful,
10 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |   GNU General Public License for more details.
13 |   You should have received a copy of the GNU General Public License
14 |   along with this program.  If not, see <http://www.gnu.org/licenses/>.
15 | ***********************************************************************/
16 | 
17 | #include "sgm_gpu/hamming_cost.h"
18 | 
19 | namespace sgm_gpu
20 | {
21 | 
/**
 * Fills the matching cost volume with Hamming distances between the two
 * transformed images.
 *
 * Indexing used by this kernel: blockIdx.x selects the image row and
 * threadIdx.x selects the disparity, so the shared buffers below are sized
 * for MAX_DISPARITY threads per block.
 *
 * @param d_transform0 Transformed base image, one cost_t per pixel
 * @param d_transform1 Transformed match image, one cost_t per pixel
 * @param d_cost       Output, indexed as (row*cols + col)*MAX_DISPARITY + disparity
 * @param rows         Image height (not read inside the kernel)
 * @param cols         Image width; does not need to be a multiple of MAX_DISPARITY
 */
__global__ void
HammingDistanceCostKernel(const cost_t *d_transform0, const cost_t *d_transform1,
    uint8_t *d_cost, const int rows, const int cols) {
  const int row = blockIdx.x;
  const int tid = threadIdx.x;

  // Lower half: previous tile of the match row, upper half: current tile
  __shared__ cost_t match_window[2*MAX_DISPARITY];
  __shared__ cost_t base_tile[MAX_DISPARITY];

  // Seed the "previous tile" with the first pixel of the row so that columns
  // left of the image border are compared against a defined value
  match_window[MAX_DISPARITY+tid] = d_transform1[row*cols+0];

  // Walk over the row tile by tile; the last tile may be narrower than
  // MAX_DISPARITY when cols is not a multiple of it
  for (int tile_x = 0; tile_x < cols; tile_x += MAX_DISPARITY) {
    const int remaining = cols - tile_x;
    const int tile_cols = remaining < MAX_DISPARITY ? remaining : MAX_DISPARITY;

    // Slide the window: each thread moves the element it loaded last time
    match_window[tid] = match_window[tid + MAX_DISPARITY];
    if (tid < tile_cols) {
      match_window[tid+MAX_DISPARITY] = d_transform1[row*cols+tile_x+tid];
      base_tile[tid] = d_transform0[row*cols+tile_x+tid];
    }

    __syncthreads();
    for (int col = 0; col < tile_cols; col++) {
      const cost_t base_value = base_tile[col];
      // Pixel of the match row that lies `tid` columns to the left of the base pixel
      const cost_t match_value = match_window[(MAX_DISPARITY-1-tid)+1+col];
      d_cost[(row*cols+tile_x+col)*MAX_DISPARITY+tid] = popcount( base_value ^ match_value );
    }
    // Shared buffers are overwritten at the top of the next iteration
    __syncthreads();
  }
}
69 | 
70 | } // namespace sgm_gpu
71 | 
72 | 


--------------------------------------------------------------------------------
/src/left_right_consistency.cu:
--------------------------------------------------------------------------------
 1 | /***********************************************************************
 2 |   Copyright (C) 2019 Hironori Fujimoto
 3 | 
 4 |   This program is free software: you can redistribute it and/or modify
 5 |   it under the terms of the GNU General Public License as published by
 6 |   the Free Software Foundation, either version 3 of the License, or
 7 |   (at your option) any later version.
 8 |  
 9 |   This program is distributed in the hope that it will be useful,
10 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |   GNU General Public License for more details.
13 |   You should have received a copy of the GNU General Public License
14 |   along with this program.  If not, see <http://www.gnu.org/licenses/>.
15 | ***********************************************************************/
16 | 
17 | #include "sgm_gpu/left_right_consistency.h"
18 | #include "sgm_gpu/configuration.h"
19 | 
20 | namespace sgm_gpu
21 | {
22 | 
/**
 * Picks, for every pixel of the right image, the disparity with the lowest
 * aggregated cost.
 *
 * The cost volume is indexed from the left image, so the candidate for right
 * pixel x and disparity d is read at left pixel x+d. Candidates that would
 * fall outside the image are not considered. Ties keep the smaller disparity.
 *
 * @param right_disparity Output, one uint8_t per pixel
 * @param smoothed_cost   Aggregated costs, indexed as (y*cols + x)*MAX_DISPARITY + d
 * @param rows            Image height
 * @param cols            Image width
 */
__global__ void ChooseRightDisparity(uint8_t *right_disparity, const uint16_t *smoothed_cost, const uint32_t rows, const uint32_t cols) {
  const int x = blockIdx.x*blockDim.x+threadIdx.x;
  const int y = blockIdx.y*blockDim.y+threadIdx.y;

  // Threads of partially filled blocks have nothing to do
  if (x >= cols || y >= rows)
    return;

  int best_disparity = 0;
  uint16_t best_cost = smoothed_cost[(y*cols + x)*MAX_DISPARITY + best_disparity];

  int candidate = 1;
  while (candidate < MAX_DISPARITY && x + candidate < cols) {
    const uint16_t candidate_cost = smoothed_cost[(y*cols + (x+candidate))*MAX_DISPARITY + candidate];
    if (candidate_cost < best_cost) {
      best_cost = candidate_cost;
      best_disparity = candidate;
    }
    candidate++;
  }

  right_disparity[y*cols+x] = best_disparity;
}
45 | 
/**
 * Invalidates left disparities that disagree with the right disparity map.
 *
 * A pixel is overwritten with 255 when its match would lie left of the image
 * border, or when left and right disparity differ by more than one.
 *
 * @param disparity       Left disparity map, modified in place
 * @param disparity_right Right disparity map, read only
 * @param rows            Image height
 * @param cols            Image width
 */
__global__ void LeftRightConsistencyCheck(uint8_t* disparity, const uint8_t* disparity_right, uint32_t rows, uint32_t cols)
{
  const int x = blockIdx.x*blockDim.x+threadIdx.x;
  const int y = blockIdx.y*blockDim.y+threadIdx.y;

  if (x >= cols || y >= rows)
    return;

  const int left_value = disparity[y*cols + x];

  // Column of the corresponding pixel in the right image
  const int x_right = x - left_value;

  bool consistent = (x_right >= 0);
  if (consistent) {
    const int right_value = disparity_right[y*cols + x_right];
    const int difference = left_value - right_value;
    consistent = (difference >= -1 && difference <= 1);
  }

  if (!consistent) {
    disparity[y*cols + x] = 255;
  }
}
67 | 
68 | } // namespace sgm_gpu
69 | 
70 | 


--------------------------------------------------------------------------------
/src/median_filter.cu:
--------------------------------------------------------------------------------
 1 | /***********************************************************************
 2 |   Copyright (C) 2019 Hironori Fujimoto
 3 | 
 4 |   This program is free software: you can redistribute it and/or modify
 5 |   it under the terms of the GNU General Public License as published by
 6 |   the Free Software Foundation, either version 3 of the License, or
 7 |   (at your option) any later version.
 8 | 
 9 |   This program is distributed in the hope that it will be useful,
10 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |   GNU General Public License for more details.
13 |   You should have received a copy of the GNU General Public License
14 |   along with this program.  If not, see <http://www.gnu.org/licenses/>.
15 | ***********************************************************************/
16 | 
17 | #include "sgm_gpu/median_filter.h"
18 | 
19 | namespace sgm_gpu
20 | {
21 | 
/**
 * Kernel entry point that forwards to the MedianFilter template with a
 * window size of 3.
 *
 * @param d_input Source image, one uint8_t per pixel
 * @param d_out   Destination image, one uint8_t per pixel
 * @param rows    Image height
 * @param cols    Image width
 */
__global__ void MedianFilter3x3(const uint8_t* __restrict__ d_input, uint8_t* __restrict__ d_out, const uint32_t rows, const uint32_t cols) {
  const int WINDOW_SIZE = 3;
  MedianFilter<WINDOW_SIZE>(d_input, d_out, rows, cols);
}
25 | 
26 | }
27 | 


--------------------------------------------------------------------------------
/src/sgm_gpu.cu:
--------------------------------------------------------------------------------
  1 | /***********************************************************************
  2 |   Copyright (C) 2020 Hironori Fujimoto
  3 | 
  4 |   This program is free software: you can redistribute it and/or modify
  5 |   it under the terms of the GNU General Public License as published by
  6 |   the Free Software Foundation, either version 3 of the License, or
  7 |   (at your option) any later version.
  8 |  
  9 |   This program is distributed in the hope that it will be useful,
 10 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 |   GNU General Public License for more details.
 13 |   You should have received a copy of the GNU General Public License
 14 |   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 15 | ***********************************************************************/
 16 | #include "sgm_gpu/sgm_gpu.h"
 17 | 
 18 | #include "sgm_gpu/costs.h"
 19 | #include "sgm_gpu/hamming_cost.h"
 20 | #include "sgm_gpu/median_filter.h"
 21 | #include "sgm_gpu/cost_aggregation.h"
 22 | #include "sgm_gpu/left_right_consistency.h"
 23 | 
 24 | #include <image_geometry/stereo_camera_model.h>
 25 | #include <cv_bridge/cv_bridge.h>
 26 | 
 27 | namespace sgm_gpu
 28 | {
// Variables with CUDA-specific types are kept at file scope instead of being
//   class members, so that sgm_gpu.h can be included from packages which are
//   not compiled with CUDA.
// NOTE(review): being namespace-scope variables, they are shared by every
//   SgmGpu object in the process; each constructor call overwrites the streams.

// Streams used to launch the kernels of computeDisparity()
cudaStream_t stream1_;
cudaStream_t stream2_;
cudaStream_t stream3_;

// Launch configuration of the per-pixel kernels, set in computeDisparity()
dim3 BLOCK_SIZE_;
dim3 grid_size_;
 37 | 
 38 | SgmGpu::SgmGpu(const ros::NodeHandle& parent_node_handle) 
 39 |   : memory_allocated_(false), cols_(0), rows_(0)
 40 | {
 41 |   private_node_handle_.reset(
 42 |     new ros::NodeHandle(parent_node_handle, "libsgm_gpu")
 43 |   );
 44 |   // Get parameters used in SGM algorithm
 45 |   p1_ = static_cast<uint8_t>(private_node_handle_->param("p1", 6));
 46 |   p2_ = static_cast<uint8_t>(private_node_handle_->param("p2", 96));
 47 |   check_consistency_ = private_node_handle_->param("check_consistency", true);
 48 | 
 49 |   // Create streams
 50 |   cudaStreamCreate(&stream1_);
 51 |   cudaStreamCreate(&stream2_);
 52 |   cudaStreamCreate(&stream3_);
 53 | }
 54 | 
 55 | SgmGpu::~SgmGpu()
 56 | {
 57 |   freeMemory();
 58 | 
 59 |   cudaStreamDestroy(stream1_);
 60 |   cudaStreamDestroy(stream2_);
 61 |   cudaStreamDestroy(stream3_);
 62 | }
 63 | 
 64 | void SgmGpu::allocateMemory(uint32_t cols, uint32_t rows)
 65 | {
 66 |   freeMemory();
 67 | 
 68 |   cols_ = cols;
 69 |   rows_ = rows;
 70 | 
 71 |   int total_pixel = cols_ * rows_;
 72 |   cudaMalloc((void **)&d_im0_, sizeof(uint8_t) * total_pixel);
 73 |   cudaMalloc((void **)&d_im1_, sizeof(uint8_t) * total_pixel);
 74 | 
 75 |   cudaMalloc((void **)&d_transform0_, sizeof(cost_t) * total_pixel);
 76 |   cudaMalloc((void **)&d_transform1_, sizeof(cost_t) * total_pixel);
 77 | 
 78 |   int cost_volume_size = total_pixel * MAX_DISPARITY;
 79 |   cudaMalloc((void **)&d_cost_, sizeof(uint8_t) * cost_volume_size);
 80 | 
 81 |   cudaMalloc((void **)&d_L0_, sizeof(uint8_t) * cost_volume_size);
 82 |   cudaMalloc((void **)&d_L1_, sizeof(uint8_t) * cost_volume_size);
 83 |   cudaMalloc((void **)&d_L2_, sizeof(uint8_t) * cost_volume_size);
 84 |   cudaMalloc((void **)&d_L3_, sizeof(uint8_t) * cost_volume_size);
 85 |   cudaMalloc((void **)&d_L4_, sizeof(uint8_t) * cost_volume_size);
 86 |   cudaMalloc((void **)&d_L5_, sizeof(uint8_t) * cost_volume_size);
 87 |   cudaMalloc((void **)&d_L6_, sizeof(uint8_t) * cost_volume_size);
 88 |   cudaMalloc((void **)&d_L7_, sizeof(uint8_t) * cost_volume_size);
 89 | 
 90 |   cudaMalloc((void **)&d_s_, sizeof(uint16_t) * cost_volume_size);
 91 | 
 92 |   cudaMalloc((void **)&d_disparity_, sizeof(uint8_t) * total_pixel);
 93 |   cudaMalloc((void **)&d_disparity_filtered_uchar_, sizeof(uint8_t) * total_pixel);
 94 |   cudaMalloc((void **)&d_disparity_right_, sizeof(uint8_t) * total_pixel);
 95 |   cudaMalloc((void **)&d_disparity_right_filtered_uchar_, sizeof(uint8_t) * total_pixel);
 96 | 
 97 |   memory_allocated_ = true;
 98 | }
 99 | 
/**
 * Releases every device buffer obtained by allocateMemory().
 *
 * Does nothing unless memory_allocated_ is set; clears the flag afterwards.
 */
void SgmGpu::freeMemory() {
  if (memory_allocated_) {
    // Same release order as before: images, transforms, path costs,
    // disparity maps, matching costs, aggregated costs
    void* const buffers[] = {
      d_im0_,
      d_im1_,
      d_transform0_,
      d_transform1_,
      d_L0_,
      d_L1_,
      d_L2_,
      d_L3_,
      d_L4_,
      d_L5_,
      d_L6_,
      d_L7_,
      d_disparity_,
      d_disparity_filtered_uchar_,
      d_disparity_right_,
      d_disparity_right_filtered_uchar_,
      d_cost_,
      d_s_
    };

    const size_t buffer_count = sizeof(buffers) / sizeof(buffers[0]);
    for (size_t i = 0; i < buffer_count; ++i) {
      cudaFree(buffers[i]);
    }

    memory_allocated_ = false;
  }
}
125 | 
126 | 
/**
 * Computes a disparity image from a pair of stereo images on the GPU.
 *
 * Steps: validate the inputs, convert both images to 8 bit grayscale, pad
 * them to a size divisible by 4, upload, run transform / matching cost /
 * cost aggregation / median filter (and the optional left-right consistency
 * check), download the result, restore the input size and fill the message.
 *
 * @param left_image        Left input image
 * @param right_image       Right input image; must match the left image in
 *                          size and encoding
 * @param left_camera_info  Camera info of the left camera
 * @param right_camera_info Camera info of the right camera
 * @param disparity_msg     Output; only written when true is returned
 * @return true on success. false (after logging an error) when the inputs
 *         are inconsistent, empty or not convertible to mono8, or when a
 *         CUDA call fails.
 */
bool SgmGpu::computeDisparity(
  const sensor_msgs::Image& left_image, 
  const sensor_msgs::Image& right_image,
  const sensor_msgs::CameraInfo& left_camera_info,
  const sensor_msgs::CameraInfo& right_camera_info,
  stereo_msgs::DisparityImage& disparity_msg
)
{
  if (left_image.width != right_image.width || left_image.height != right_image.height)
  {
    ROS_ERROR_STREAM_NAMED("libsgm_gpu",
      "Image dimension of left and right are not same: \n" << 
      "Left: " << left_image.width << "x" << left_image.height << "\n" <<
      "Right: " << right_image.width << "x" << right_image.height
    );
    return false;
  }
  
  if (left_image.encoding != right_image.encoding)
  {
    ROS_ERROR_STREAM_NAMED("libsgm_gpu",
      "Image encoding of left and right are not same: \n" << 
      "Left: " << left_image.encoding << "\n" <<
      "Right: " << right_image.encoding
    );
    return false;
  }

  // An empty image would lead to a resize of an empty matrix and to kernel
  // launches with a grid size of zero
  if (left_image.width == 0 || left_image.height == 0)
  {
    ROS_ERROR_STREAM_NAMED("libsgm_gpu",
      "Input images are empty: " << left_image.width << "x" << left_image.height
    );
    return false;
  }

  // Convert to 8 bit grayscale image.
  // cv_bridge throws on encodings it cannot convert; report that through the
  // return value like every other input error.
  cv_bridge::CvImagePtr left_mono8;
  cv_bridge::CvImagePtr right_mono8;
  try
  {
    left_mono8 = cv_bridge::toCvCopy(
      left_image, 
      sensor_msgs::image_encodings::MONO8
    );
    right_mono8 = cv_bridge::toCvCopy(
      right_image, 
      sensor_msgs::image_encodings::MONO8
    );
  }
  catch (const cv_bridge::Exception& e)
  {
    ROS_ERROR_STREAM_NAMED("libsgm_gpu",
      "Failed to convert input images to mono8: " << e.what()
    );
    return false;
  }
  
  // Resize images to their width and height divisible by 4 for limit of CUDA code
  resizeToDivisibleBy4(left_mono8->image, right_mono8->image);

  // Reallocate memory if needed
  bool size_changed = (
    cols_ != left_mono8->image.cols || 
    rows_ != left_mono8->image.rows
  );
  if (!memory_allocated_ || size_changed)
    allocateMemory(left_mono8->image.cols, left_mono8->image.rows);

  // Never launch kernels on buffers which do not exist
  if (!memory_allocated_)
  {
    ROS_ERROR_NAMED("libsgm_gpu", "GPU memory is not allocated");
    return false;
  }
  
  // Copy image to GPU device
  size_t mono8_image_size = left_mono8->image.total() * sizeof(uint8_t);
  cudaMemcpyAsync(d_im0_, left_mono8->image.ptr<uint8_t>(), 
    mono8_image_size, cudaMemcpyHostToDevice, stream1_);
  cudaMemcpyAsync(d_im1_, right_mono8->image.ptr<uint8_t>(), 
    mono8_image_size, cudaMemcpyHostToDevice, stream1_);

  BLOCK_SIZE_.x = 32;
  BLOCK_SIZE_.y = 32;

  grid_size_.x = (cols_ + BLOCK_SIZE_.x-1) / BLOCK_SIZE_.x;
  grid_size_.y = (rows_ + BLOCK_SIZE_.y-1) / BLOCK_SIZE_.y;

  // Transform and matching cost are both issued on stream1_, which keeps
  // them ordered after the uploads and after each other
  CenterSymmetricCensusKernelSM2<<<grid_size_, BLOCK_SIZE_, 0, stream1_>>>(d_im0_, d_im1_, d_transform0_, d_transform1_, rows_, cols_);
  HammingDistanceCostKernel<<<rows_, MAX_DISPARITY, 0, stream1_>>>(d_transform0_, d_transform1_, d_cost_, rows_, cols_);

  // The horizontal aggregation passes below run on stream2_ and stream3_,
  // which are not ordered against stream1_. Wait until d_cost_ is complete
  // before they are allowed to read it.
  cudaStreamSynchronize(stream1_);

  const int PIXELS_PER_BLOCK = COSTAGG_BLOCKSIZE/WARP_SIZE;
  const int PIXELS_PER_BLOCK_HORIZ = COSTAGG_BLOCKSIZE_HORIZ/WARP_SIZE;

  // Cost Aggregation
  CostAggregationKernelLeftToRight<<<(rows_+PIXELS_PER_BLOCK_HORIZ-1)/PIXELS_PER_BLOCK_HORIZ, COSTAGG_BLOCKSIZE_HORIZ, 0, stream2_>>>(d_cost_, d_L0_, d_s_, p1_, p2_, rows_, cols_, d_transform0_, d_transform1_, d_disparity_, d_L0_, d_L1_, d_L2_, d_L3_, d_L4_, d_L5_, d_L6_);
  CostAggregationKernelRightToLeft<<<(rows_+PIXELS_PER_BLOCK_HORIZ-1)/PIXELS_PER_BLOCK_HORIZ, COSTAGG_BLOCKSIZE_HORIZ, 0, stream3_>>>(d_cost_, d_L1_, d_s_, p1_, p2_, rows_, cols_, d_transform0_, d_transform1_, d_disparity_, d_L0_, d_L1_, d_L2_, d_L3_, d_L4_, d_L5_, d_L6_);
  CostAggregationKernelUpToDown<<<(cols_+PIXELS_PER_BLOCK-1)/PIXELS_PER_BLOCK, COSTAGG_BLOCKSIZE, 0, stream1_>>>(d_cost_, d_L2_, d_s_, p1_, p2_, rows_, cols_, d_transform0_, d_transform1_, d_disparity_, d_L0_, d_L1_, d_L2_, d_L3_, d_L4_, d_L5_, d_L6_);
  CostAggregationKernelDownToUp<<<(cols_+PIXELS_PER_BLOCK-1)/PIXELS_PER_BLOCK, COSTAGG_BLOCKSIZE, 0, stream1_>>>(d_cost_, d_L3_, d_s_, p1_, p2_, rows_, cols_, d_transform0_, d_transform1_, d_disparity_, d_L0_, d_L1_, d_L2_, d_L3_, d_L4_, d_L5_, d_L6_);
  CostAggregationKernelDiagonalDownUpLeftRight<<<(cols_+PIXELS_PER_BLOCK-1)/PIXELS_PER_BLOCK, COSTAGG_BLOCKSIZE, 0, stream1_>>>(d_cost_, d_L4_, d_s_, p1_, p2_, rows_, cols_, d_transform0_, d_transform1_, d_disparity_, d_L0_, d_L1_, d_L2_, d_L3_, d_L4_, d_L5_, d_L6_);
  CostAggregationKernelDiagonalUpDownLeftRight<<<(cols_+PIXELS_PER_BLOCK-1)/PIXELS_PER_BLOCK, COSTAGG_BLOCKSIZE, 0, stream1_>>>(d_cost_, d_L5_, d_s_, p1_, p2_, rows_, cols_, d_transform0_, d_transform1_, d_disparity_, d_L0_, d_L1_, d_L2_, d_L3_, d_L4_, d_L5_, d_L6_);
  CostAggregationKernelDiagonalDownUpRightLeft<<<(cols_+PIXELS_PER_BLOCK-1)/PIXELS_PER_BLOCK, COSTAGG_BLOCKSIZE, 0, stream1_>>>(d_cost_, d_L6_, d_s_, p1_, p2_, rows_, cols_, d_transform0_, d_transform1_, d_disparity_, d_L0_, d_L1_, d_L2_, d_L3_, d_L4_, d_L5_, d_L6_);

  // d_L0_ and d_L1_ are produced on stream2_ and stream3_; make sure they
  // are finished before a kernel on stream1_ combines the paths.
  // NOTE(review): this assumes that only the last aggregation pass reads the
  // results of the other paths -- confirm against cost_aggregation.h and
  // move these two calls further up if an earlier pass reads them as well.
  cudaStreamSynchronize(stream2_);
  cudaStreamSynchronize(stream3_);

  CostAggregationKernelDiagonalUpDownRightLeft<<<(cols_+PIXELS_PER_BLOCK-1)/PIXELS_PER_BLOCK, COSTAGG_BLOCKSIZE, 0, stream1_>>>(d_cost_, d_L7_, d_s_, p1_, p2_, rows_, cols_, d_transform0_, d_transform1_, d_disparity_, d_L0_, d_L1_, d_L2_, d_L3_, d_L4_, d_L5_, d_L6_);

  int total_pixel = rows_ * cols_;
  MedianFilter3x3<<<(total_pixel+MAX_DISPARITY-1)/MAX_DISPARITY, MAX_DISPARITY, 0, stream1_>>>(d_disparity_, d_disparity_filtered_uchar_, rows_, cols_);
  
  if (check_consistency_) {
    ChooseRightDisparity<<<grid_size_, BLOCK_SIZE_, 0, stream1_>>>(d_disparity_right_, d_s_, rows_, cols_);
    MedianFilter3x3<<<(total_pixel+MAX_DISPARITY-1)/MAX_DISPARITY, MAX_DISPARITY, 0, stream1_>>>(d_disparity_right_, d_disparity_right_filtered_uchar_, rows_, cols_);
    
    LeftRightConsistencyCheck<<<grid_size_, BLOCK_SIZE_, 0, stream1_>>>(d_disparity_filtered_uchar_, d_disparity_right_filtered_uchar_, rows_, cols_);
  }

  // cudaGetLastError() reports launch failures, cudaDeviceSynchronize()
  // reports failures which happened while the kernels were running. The
  // device is synchronized in both cases so that no work is left pending
  // when this function returns.
  cudaError_t err = cudaGetLastError();
  const cudaError_t sync_err = cudaDeviceSynchronize();
  if (err == cudaSuccess)
    err = sync_err;
  if (err != cudaSuccess) {
    ROS_ERROR_NAMED("libsgm_gpu", "%s %d\n", cudaGetErrorString(err), err);
    return false;
  }

  cv::Mat disparity(rows_, cols_, CV_8UC1);
  err = cudaMemcpy(disparity.data, d_disparity_filtered_uchar_, sizeof(uint8_t)*total_pixel, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {
    ROS_ERROR_NAMED("libsgm_gpu", "%s %d\n", cudaGetErrorString(err), err);
    return false;
  }

  // Restore image size if resized to be divisible by 4
  if (cols_ != left_image.width || rows_ != left_image.height)
  {
    cv::Size input_size(left_image.width, left_image.height);
    cv::resize(disparity, disparity, input_size, 0, 0, cv::INTER_AREA);
  }

  convertToMsg(disparity, left_camera_info, right_camera_info, disparity_msg);

  return true;
}
237 | 
/**
 * Enlarges both images so that width and height are multiples of 4.
 *
 * The target size is derived from the left image only and applied to both
 * images. Images whose size is already divisible by 4 are left untouched.
 *
 * @param left_image  Left image, resized in place if necessary
 * @param right_image Right image, resized in place if necessary
 */
void SgmGpu::resizeToDivisibleBy4(cv::Mat& left_image, cv::Mat& right_image)
{
  const cv::Size current_size(left_image.cols, left_image.rows);

  // Round each dimension up to the next multiple of 4
  const cv::Size padded_size(
    (current_size.width + 3) / 4 * 4,
    (current_size.height + 3) / 4 * 4
  );

  const bool already_divisible = (
    padded_size.width == current_size.width &&
    padded_size.height == current_size.height
  );
  if (already_divisible)
    return;

  cv::resize(left_image, left_image, padded_size, 0, 0, cv::INTER_LINEAR);
  cv::resize(right_image, right_image, padded_size, 0, 0, cv::INTER_LINEAR);
}
262 | 
/**
 * Fills a stereo_msgs::DisparityImage from an 8 bit disparity map.
 *
 * The disparity values are converted to 32 bit float, the header is taken
 * from the left camera info, and focal length and baseline are taken from
 * the stereo camera model built from both camera infos.
 *
 * @param disparity         Disparity map with one unsigned char per pixel
 * @param left_camera_info  Camera info of the left camera
 * @param right_camera_info Camera info of the right camera
 * @param disparity_msg     Message to fill
 */
void SgmGpu::convertToMsg(
  const cv::Mat_<unsigned char>& disparity, 
  const sensor_msgs::CameraInfo& left_camera_info,
  const sensor_msgs::CameraInfo& right_camera_info,
  stereo_msgs::DisparityImage& disparity_msg
)
{
  // Message header and disparity range
  disparity_msg.header = left_camera_info.header;
  disparity_msg.min_disparity = 0.0;
  disparity_msg.max_disparity = MAX_DISPARITY;
  disparity_msg.delta_d = 1.0;

  // Camera geometry
  image_geometry::StereoCameraModel camera_model;
  camera_model.fromCameraInfo(left_camera_info, right_camera_info);
  disparity_msg.f = camera_model.left().fx();
  disparity_msg.T = camera_model.baseline();

  // Image payload as 32 bit float
  cv::Mat float_disparity;
  disparity.convertTo(float_disparity, CV_32F);
  const cv_bridge::CvImage image_bridge(
    left_camera_info.header, 
    sensor_msgs::image_encodings::TYPE_32FC1, 
    float_disparity
  );
  image_bridge.toImageMsg(disparity_msg.image);
}
290 | 
291 | } // namespace sgm_gpu
292 | 


--------------------------------------------------------------------------------
/src/sgm_gpu_node.cpp:
--------------------------------------------------------------------------------
 1 | /***********************************************************************
 2 |   Copyright (C) 2020 Hironori Fujimoto
 3 | 
 4 |   This program is free software: you can redistribute it and/or modify
 5 |   it under the terms of the GNU General Public License as published by
 6 |   the Free Software Foundation, either version 3 of the License, or
 7 |   (at your option) any later version.
 8 |  
 9 |   This program is distributed in the hope that it will be useful,
10 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |   GNU General Public License for more details.
13 |   You should have received a copy of the GNU General Public License
14 |   along with this program.  If not, see <http://www.gnu.org/licenses/>.
15 | ***********************************************************************/
16 | 
17 | #include "sgm_gpu_node.h"
18 | 
19 | namespace sgm_gpu
20 | {
21 | 
22 | SgmGpuNode::SgmGpuNode()
23 | {
24 |   node_handle_.reset(new ros::NodeHandle());
25 |   private_node_handle_.reset(new ros::NodeHandle("~"));
26 | 
27 |   image_transport_.reset(new image_transport::ImageTransport(*node_handle_));
28 | 
29 |   sgm_.reset(new SgmGpu(*private_node_handle_));
30 | 
31 |   disparity_pub_ = private_node_handle_->advertise<stereo_msgs::DisparityImage>("disparity", 1);
32 | 
33 |   // Subscribe left and right Image topic
34 |   std::string left_base_topic = node_handle_->resolveName("left_image");
35 |   std::string right_base_topic = node_handle_->resolveName("right_image");
36 |   left_image_sub_.subscribe(*image_transport_, left_base_topic, 10);
37 |   right_image_sub_.subscribe(*image_transport_, right_base_topic, 10);
38 | 
39 |   // Find CameraInfo topic from corresponded Image topic and subscribe it
40 |   std::string left_info_topic = image_transport::getCameraInfoTopic(left_base_topic);
41 |   std::string right_info_topic = image_transport::getCameraInfoTopic(right_base_topic);
42 |   left_info_sub_.subscribe(*node_handle_, left_info_topic, 10);
43 |   right_info_sub_.subscribe(*node_handle_, right_info_topic, 10);
44 | 
45 |   stereo_synchronizer_.reset(
46 |     new StereoSynchronizer(left_image_sub_, right_image_sub_, left_info_sub_, right_info_sub_, 10)
47 |   );
48 |   stereo_synchronizer_->registerCallback(&SgmGpuNode::stereoCallback, this);
49 | }
50 | 
51 | void SgmGpuNode::stereoCallback(
52 |   const sensor_msgs::ImageConstPtr &left_image,
53 |   const sensor_msgs::ImageConstPtr &right_image,
54 |   const sensor_msgs::CameraInfoConstPtr &left_info,
55 |   const sensor_msgs::CameraInfoConstPtr &right_info
56 | )
57 | {
58 |   if (disparity_pub_.getNumSubscribers() == 0)
59 |     return;
60 | 
61 |   stereo_msgs::DisparityImage disparity;
62 |   sgm_->computeDisparity(*left_image, *right_image, *left_info, *right_info, disparity);
63 | 
64 |   disparity_pub_.publish(disparity);
65 | }
66 | 
67 | } // namespace sgm_gpu
68 | 


--------------------------------------------------------------------------------
/src/sgm_gpu_node_main.cpp:
--------------------------------------------------------------------------------
 1 | /***********************************************************************
 2 |   Copyright (C) 2020 Hironori Fujimoto
 3 | 
 4 |   This program is free software: you can redistribute it and/or modify
 5 |   it under the terms of the GNU General Public License as published by
 6 |   the Free Software Foundation, either version 3 of the License, or
 7 |   (at your option) any later version.
 8 |  
 9 |   This program is distributed in the hope that it will be useful,
10 |   but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 |   GNU General Public License for more details.
13 |   You should have received a copy of the GNU General Public License
14 |   along with this program.  If not, see <http://www.gnu.org/licenses/>.
15 | ***********************************************************************/
16 | 
17 | #include "sgm_gpu_node.h"
18 | #include <ros/ros.h>
19 | 
20 | int main(int argc, char** argv)
21 | {
22 |   ros::init(argc, argv, "sgm_gpu_node");
23 |   sgm_gpu::SgmGpuNode sgm_gpu;
24 |   ros::spin();
25 | 
26 |   return 0;
27 | }
28 | 
29 | 


--------------------------------------------------------------------------------
/test_input.bag:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ActiveIntelligentSystemsLab/sgm_gpu_ros/72920928827efa7b0aec9c77b6bfe749a667caf7/test_input.bag


--------------------------------------------------------------------------------