├── .gitignore
├── APACHE_LICENSE-2_0
├── AUTHORS
├── CMakeLists.txt
├── LICENSE
├── NOTICE
├── Papers.bib
├── README
├── README.md
├── ReleaseAuthorization.pdf
├── gpu.c
├── gpu.h
├── kdtree
    ├── Bounds1d.c
    ├── Bounds1d.h
    ├── Bounds2d.c
    ├── Bounds2d.h
    ├── CMakeLists.txt
    ├── Globals1d.h
    ├── Globals2d.h
    ├── KDTree1d.c
    ├── KDTree1d.h
    ├── KDTree2d.c
    └── KDTree2d.h
├── neigh.c
├── neigh2d.c
├── neigh2d_kern.cl
├── neigh_kern.cl
├── remap.c
├── remap2d.c
├── remap2d_kern.cl
├── remap_kern.cl
├── sort.c
├── sort2d.c
├── sort2d_kern.cl
├── sort_kern.cl
├── table.c
├── table.data
├── table_kern.cl
├── tablelarge.c
├── tablelarge.data
├── tablelarge_kern.cl
├── timer.c
└── timer.h


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Object files
 2 | *.o
 3 | *.ko
 4 | 
 5 | # Libraries
 6 | *.lib
 7 | *.a
 8 | 
 9 | # Shared objects (inc. Windows DLLs)
10 | *.dll
11 | *.so
12 | *.so.*
13 | *.dylib
14 | 
15 | # Executables
16 | *.exe
17 | *.out
18 | *.app
19 | 


--------------------------------------------------------------------------------
/APACHE_LICENSE-2_0:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 
204 | 


--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
 1 | Authors:
 2 | 
 3 |  Bob Robey XCP-2 (brobey@lanl.gov)
 4 | 
 5 |  David Nicholaeff (dnic@lanl.gov, mtrxKnight@aol.com)
 6 | 
 7 |  Rachel Robey (rnrobey@gmail.com)
 8 | 
 9 |  Marcus Daniels (mdaniels@lanl.gov)
10 | 
11 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required (VERSION 3.1)
 2 | project (PerfectHash)
 3 | # Optional: configure with -DDEVICE_DETECT_DEBUG=ON to compile in the OpenCL platform/device listing printed by gpu.c.
 4 | if (DEVICE_DETECT_DEBUG)
 5 |    add_definitions(-DDEVICE_DETECT_DEBUG=1)
 6 | endif (DEVICE_DETECT_DEBUG)
 7 | # Fall back to RelWithDebInfo when the user did not choose a build type.
 8 | if (NOT CMAKE_BUILD_TYPE)
 9 |    set(CMAKE_BUILD_TYPE RelWithDebInfo)
10 | endif() 
11 | # OpenCL is looked up but not REQUIRED; HAVE_OPENCL and the include path are only set when it is found.
12 | find_package(OpenCL)
13 | if (OpenCL_FOUND)
14 |    add_definitions(-DHAVE_OPENCL)
15 |    set(HAVE_CL_DOUBLE ON CACHE BOOL "Have OpenCL Double")  # CMake cache variable only. NOTE(review): no -DHAVE_CL_DOUBLE is added in this file; confirm where the HAVE_CL_DOUBLE tested in gpu.c is defined.
16 |    set(NO_CL_DOUBLE OFF)  # not referenced anywhere else in this file
17 |    include_directories(${OpenCL_INCLUDE_DIRS})
18 |    #message("OpenCL_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS}")
19 |    #message("OpenCL_LIBRARIES ${OpenCL_LIBRARIES}")
20 | endif (OpenCL_FOUND)
21 | # Processes kdtree/CMakeLists.txt; presumably it defines the kdtree target linked by remap*, neigh* below (not visible here).
22 | add_subdirectory(kdtree)
23 | # Every executable below is one driver source plus the shared gpu.c/timer.c helpers, linked against OpenCL (if found) and libm.
24 | # Adds build target of sort with source code files
25 | add_executable(sort sort.c gpu.c timer.c gpu.h timer.h)
26 | target_link_libraries(sort ${OpenCL_LIBRARIES} m)
27 | 
28 | # Adds build target of sort2d with source code files
29 | add_executable(sort2d sort2d.c gpu.c timer.c gpu.h timer.h)
30 | target_link_libraries(sort2d ${OpenCL_LIBRARIES} m)
31 | 
32 | # Adds build target of remap with source code files
33 | add_executable(remap remap.c gpu.c timer.c gpu.h timer.h)
34 | target_link_libraries(remap ${OpenCL_LIBRARIES} kdtree m)
35 | 
36 | # Adds build target of remap2d with source code files
37 | add_executable(remap2d remap2d.c gpu.c timer.c gpu.h timer.h)
38 | target_link_libraries(remap2d ${OpenCL_LIBRARIES} kdtree m)
39 | 
40 | # Adds build target of neigh with source code files
41 | add_executable(neigh neigh.c gpu.c timer.c gpu.h timer.h)
42 | target_link_libraries(neigh ${OpenCL_LIBRARIES} kdtree m)
43 | 
44 | # Adds build target of neigh2d with source code files
45 | add_executable(neigh2d neigh2d.c gpu.c timer.c gpu.h timer.h)
46 | target_link_libraries(neigh2d ${OpenCL_LIBRARIES} kdtree m)
47 | 
48 | # Adds build target of table with source code files
49 | add_executable(table table.c gpu.c timer.c gpu.h timer.h)
50 | target_link_libraries(table ${OpenCL_LIBRARIES} m)
51 | 
52 | # Adds build target of tablelarge with source code files
53 | add_executable(tablelarge tablelarge.c gpu.c timer.c gpu.h timer.h)
54 | target_link_libraries(tablelarge ${OpenCL_LIBRARIES} m)
55 | 
56 | # Cleanup
57 | SET_DIRECTORY_PROPERTIES(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES
58 |        	"CMakeCache.txt;Makefile;cmake_install.cmake;ipo_out.optrpt")
59 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
 3 |  *  All rights Reserved.
 4 |  *
 5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
 6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
 7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
 8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
 9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
12 |  * to produce derivative works, such modified software should be clearly marked,
13 |  * so as not to confuse it with the version available from LANL.   
14 |  *
15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
16 |  * use this file except in compliance with the License. You may obtain a copy
17 |  * of the License at 
18 |  *
19 |  * http://www.apache.org/licenses/LICENSE-2.0
20 |  *
21 |  * Unless required by applicable law or agreed to in writing, software distributed
22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
24 |  * specific language governing permissions and limitations under the License.
25 |  *
26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
27 |  *
28 |  *  Authors: Bob Robey       XCP-2   brobey@lanl.gov
29 |  *           David Nicholaeff        dnic@lanl.gov, mtrxknight@aol.com
30 |  *           Rachel Robey            rnrobey@gmail.com
31 |  * 
32 |  */
33 | 


--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | This is the code released under LANL Copyright Disclosure C13002/LA-CC-12-022
2 | Copyright 2012-2018.  Triad National Security, LLC. This material was produced
3 | under U.S. Government contract 89233218CNA000001 for Los Alamos National
4 | Laboratory (LANL), which is operated by Triad National Security, LLC
5 | for the U.S. Department of Energy. See LICENSE file for details.
6 | 


--------------------------------------------------------------------------------
/Papers.bib:
--------------------------------------------------------------------------------
 1 | %% This BibTeX bibliography file was created using BibDesk.
 2 | %% http://bibdesk.sourceforge.net/
 3 | 
 4 | 
 5 | %% Created for lbrobey at 2014-02-16 09:42:24 -0700 
 6 | 
 7 | 
 8 | %% Saved with string encoding Unicode (UTF-8) 
 9 | 
10 | 
11 | 
12 | @article{Robey_RN_2013,
13 | 	Author = {Robey, R.N. and Nicholaeff, D. and Robey, R.W.},
14 | 	Date-Added = {2012-04-29 14:16:38 -0600},
15 | 	Date-Modified = {2014-02-16 16:41:30 +0000},
16 | 	Journal = {SIAM Journal of Scientific Computing},
17 | 	Month = {July},
18 | 	Number = {4},
19 | 	Pages = {C346--C368},
20 | 	Title = {Hash-Based Algorithms for Discretized Data},
21 | 	Volume = {35},
22 | 	Year = {2013}}
23 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | This code is a set of hash functions to support the paper "Hash-Based Algorithms
 2 | for Discretized Data", published in the SIAM Journal of Scientific
 3 | Computing. The publication details are below and in the Papers.bib file in BibTeX
 4 | format.
 5 | 
 6 |    Robey,R.N., Nicholaeff,D., and Robey,R.W. "Hash-Based Algorithms for Discretized Data", 
 7 |    SIAM Journal of Scientific Computing, July 2013, Volume 35, Number 4, C346--C368
 8 | 
 9 |    The pre-publication version has the LANL report number LA-UR-12-01566.
10 | 
11 | This code has been released under an open-source Apache 2 license to
12 | encourage further development of hashing methods. See the LICENSE file for more
13 | information about the license and the use of this code.
14 | 
15 | Through a web search we hope to gather statistics on the use of the method
16 | and its improvements and help to encourage more open technology transfer by LANL
17 | and other government research organizations. This can be thought of as analogous
18 | to journal article citations, but within software products. If code is not
19 | distributed with a software product, a reference should be provided in a text
20 | file so that attribution can be determined.
21 | 
22 | Under this license, it is required to include a reference to this work. We
23 | request that each derivative work contain a reference to LANL Copyright 
24 | Disclosure C13002/LA-CC-12-022 so that this work’s impact can be roughly
25 | measured. In addition, it is requested that a modifier is included as in
26 | the following example:
27 | 
28 | //<Uses | improves on | modified from> LANL Copyright Disclosure C13002/LA-CC-12-022
29 | 
30 | This is LANL Copyright Disclosure C13002/LA-CC-12-022
31 | 
32 | Authors: Bob Robey       XCP-2   brobey@lanl.gov
33 |          David Nicholaeff        dnic@lanl.gov, mtrxknight@aol.com
34 |          Rachel Robey            rnrobey@gmail.com
35 | 
36 | This code uses cmake for builds. To build the code:
37 | 
38 | cmake .
39 | make
40 | 
41 | There will be several executables built -- sort, sort2d, neigh, neigh2d, remap, remap2d, table, tablelarge
42 | 
43 | Each executable runs all of the methods for the mesh operation. There is a controlling
44 | loop at the top of the main routine that users may want to modify for the algorithms
45 | to be run. Also, there is a random number seed for varying the problem setups that is
46 | normally off, but users may want to turn on for some investigations. Results do vary for
47 | different problems, but the pattern does not change significantly. Also, the OpenCL library
48 | looks for a GPU to use for the OpenCL code. This may need to be modified for your particular
49 | hardware.
50 | 
51 | Output should look something like the following:
52 | 
53 |     Sorting Performance Results
54 | 
55 | Size,   	Qsort,    	Heapsort, 	Mergesort, 	Hash CPU, 	Hash GPU
56 | 
57 | Max diff is 1 times min_diff
58 | 1024,     	0.000105,	0.000168,	0.000123,	0.000010,	0.000294,
59 | 2048,     	0.000223,	0.000353,	0.000262,	0.000020,	0.000330,
60 | 4096,     	0.000493,	0.000781,	0.000573,	0.000040,	0.000400,
61 | 8192,     	0.001036,	0.001683,	0.001135,	0.000079,	0.000610,
62 | 16384,     	0.002242,	0.003470,	0.002613,	0.000161,	0.000934,
63 | 32768,     	0.005360,	0.007348,	0.005469,	0.000415,	0.001535,
64 | 65536,     	0.011800,	0.015796,	0.012560,	0.000800,	0.002629,
65 | 131072,     	0.020707,	0.036958,	0.024513,	0.001561,	0.005294,
66 | 262144,     	0.042710,	0.075466,	0.052864,	0.003751,	0.011301,
67 | 524288,     	0.089662,	0.167398,	0.113505,	0.011855,	0.024291,
68 | 1048576,     	0.185135,	0.407333,	0.249813,	0.026447,	0.018890,
69 | 2097152,     	0.385789,	1.018970,	0.498638,	0.063970,	0.114473,
70 | 4194304,     	0.818016,	2.475966,	1.052133,	0.141997,	0.246392,
71 | 8388608,     	1.696586,	5.843146,	2.184568,	0.307373,	0.503904,
72 | 
73 | Max diff is 2 times min_diff
74 | 1024,     	0.000102,	0.000168,	0.000121,	0.000015,	0.000303,
75 | 2048,     	0.000224,	0.000332,	0.000240,	0.000026,	0.000339,
76 |    ...
77 |    ...
78 |    ...
79 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | PerfectHash
 2 | ===========
 3 | 
 4 | A perfect hash code for CPUs and GPUs using OpenCL
 5 | 
 6 | This code is a set of hash functions to support the paper "Hash-Based Algorithms
 7 | for Discretized Data", published in the SIAM Journal of Scientific
 8 | Computing. The publication details are below and in the Papers.bib file in BibTeX
 9 | format.
10 | 
11 |     Robey,R.N., Nicholaeff,D., and Robey,R.W. "Hash-Based Algorithms for Discretized Data", 
12 |     SIAM Journal of Scientific Computing, July 2013, Volume 35, Number 4, C346--C368
13 | 


--------------------------------------------------------------------------------
/ReleaseAuthorization.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lanl/PerfectHash/be8c6e1b5afad67dd4c656cd689d441d0e95a433/ReleaseAuthorization.pdf


--------------------------------------------------------------------------------
/gpu.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
  3 |  *  All rights Reserved.
  4 |  *
  5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
  6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
  7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
  8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
  9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
 10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
 11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
 12 |  * to produce derivative works, such modified software should be clearly marked,
 13 |  * so as not to confuse it with the version available from LANL.   
 14 |  *
 15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 16 |  * use this file except in compliance with the License. You may obtain a copy
 17 |  * of the License at 
 18 |  *
 19 |  * http://www.apache.org/licenses/LICENSE-2.0
 20 |  *
 21 |  * Unless required by applicable law or agreed to in writing, software distributed
 22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 24 |  * specific language governing permissions and limitations under the License.
 25 |  *
 26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
 27 |  *
 28 |  */
 29 | 
 30 | /*
 31 |  *  Authors: Bob Robey       XCP-2   brobey@lanl.gov
 32 |  *           David Nicholaeff        dnic@lanl.gov, mtrxknight@aol.com
 33 |  *           Rachel Robey            rnrobey@gmail.com
 34 |  */
 35 | 
 36 | #include <stdlib.h>
 37 | #include <stdio.h>
 38 | #include <string.h>
 39 | #include <sys/stat.h>
 40 | #include "gpu.h"
 41 | 
 42 | #ifdef HAVE_CL_DOUBLE   /* double-precision build: real is double */
 43 | typedef double real;
 44 | #ifdef HAVE_OPENCL      /* OpenCL aliases are declared only when HAVE_OPENCL is defined */
 45 | typedef cl_double cl_real;
 46 | typedef cl_double4 cl_real4;
 47 | #endif                  /* HAVE_OPENCL */
 48 | #else                   /* HAVE_CL_DOUBLE not defined: single-precision fallback */
 49 | typedef float real;
 50 | #ifdef HAVE_OPENCL      /* float counterparts of cl_real / cl_real4 */
 51 | typedef cl_float cl_real;
 52 | typedef cl_float4 cl_real4;
 53 | #endif                  /* HAVE_OPENCL */
 54 | #endif                  /* HAVE_CL_DOUBLE */
 55 | /* Device-detection diagnostics default to off; the build can override by defining DEVICE_DETECT_DEBUG (CMakeLists.txt passes -DDEVICE_DETECT_DEBUG=1 when that option is set). */
 56 | #ifndef DEVICE_DETECT_DEBUG
 57 | #define DEVICE_DETECT_DEBUG 0
 58 | #endif
 59 | 
 60 | #ifdef HAVE_OPENCL
 61 | void GPUInit(cl_context *context, cl_command_queue *queue, int *is_nvidia, cl_program *program, char *filename) {
 62 |     
 63 |    cl_platform_id* platforms;
 64 |    cl_platform_id platform = NULL;
 65 |    cl_uint num_platforms;
 66 |    cl_uint num_devices;
 67 |    cl_device_id* devices;
 68 |    cl_uint nDevices_selected=0;
 69 |    int *device_appropriate;
 70 |    int device_selected = -99;
 71 |    cl_int platform_selected = -1;
 72 |    //cl_program program;
 73 |    cl_int ierr = 0;
 74 |   
 75 |    // Get the number of platforms first, then allocate and get the platform
 76 |    ierr = clGetPlatformIDs(0, NULL, &num_platforms);
 77 |    if (ierr != CL_SUCCESS){
 78 |       printf("GPU_INIT: Error with clGetPlatformIDs call in file %s at line %d\n", __FILE__, __LINE__);
 79 |       if (ierr == CL_INVALID_VALUE){
 80 |          printf("GPU_INIT: Invalid value in clGetPlatformID call\n");
 81 |       }
 82 |       exit(ierr);
 83 |    }
 84 |    if (num_platforms == 0) {
 85 |       printf("GPU_INIT: Error -- No opencl platforms detected in file %s at line %d\n", __FILE__, __LINE__);
 86 |       exit(-1);
 87 |    }
 88 |    if (DEVICE_DETECT_DEBUG){
 89 |       printf("\n\nGPU_INIT: %d opencl platform(s) detected\n",num_platforms);
 90 |    }
 91 | 
 92 |    platforms = (cl_platform_id *)malloc(num_platforms*sizeof(cl_platform_id));
 93 | 
 94 |    ierr = clGetPlatformIDs(num_platforms, platforms, NULL);
 95 |    if (ierr != CL_SUCCESS){
 96 |       printf("GPU_INIT: Error with clGetPlatformIDs call in file %s at line %d\n", __FILE__, __LINE__);
 97 |       if (ierr == CL_INVALID_VALUE){
 98 |          printf("Invalid value in clGetPlatformID call\n");
 99 |       }
100 |    }
101 | 
102 |    if (DEVICE_DETECT_DEBUG){
103 |       char info[1024];
104 |       for (uint iplatform=0; iplatform<num_platforms; iplatform++){
105 |          printf("  Platform %d:\n",iplatform+1);
106 | 
107 |          //clGetPlatformInfo(platforms[iplatform],CL_PLATFORM_PROFILE,   1024L,info,0);
108 |          //printf("    CL_PLATFORM_PROFILE    : %s\n",info);
109 | 
110 |          clGetPlatformInfo(platforms[iplatform],CL_PLATFORM_VERSION,   1024L,info,0);
111 |          printf("    CL_PLATFORM_VERSION    : %s\n",info);
112 | 
113 |          clGetPlatformInfo(platforms[iplatform],CL_PLATFORM_NAME,      1024L,info,0);
114 |          printf("    CL_PLATFORM_NAME       : %s\n",info);
115 | 
116 |          clGetPlatformInfo(platforms[iplatform],CL_PLATFORM_VENDOR,    1024L,info,0);
117 |          printf("    CL_PLATFORM_VENDOR     : %s\n",info);
118 | 
119 |          //clGetPlatformInfo(platforms[iplatform],CL_PLATFORM_EXTENSIONS,1024L,info,0);
120 |          //printf("    CL_PLATFORM_EXTENSIONS : %s\n",info);
121 |       }
122 |       printf("\n");
123 |    }
124 | 
125 |    char info[1024];
126 |    clGetPlatformInfo(platforms[0],CL_PLATFORM_VENDOR, 1024, info, 0);
127 | 
128 |    // Get the number of devices, allocate, and get the devices
129 |    for (uint iplatform=0; iplatform<num_platforms; iplatform++){
130 |       ierr = clGetDeviceIDs(platforms[iplatform],CL_DEVICE_TYPE_GPU,0,NULL,&num_devices);
131 |       if (ierr == CL_DEVICE_NOT_FOUND) {
132 |          if (DEVICE_DETECT_DEBUG) {
133 |            printf("Warning: Device of requested type not found for platform %d in clGetDeviceID call\n",iplatform);
134 |          }
135 |          continue;
136 |       }
137 |       if (ierr != CL_SUCCESS) {
138 |         /* Possible Errors
139 |          *  CL_INVALID_PLATFORM:
140 |          *  CL_INVALID_DEVICE_TYPE:
141 |          *  CL_INVALID_VALUE:
142 |          *  CL_DEVICE_NOT_FOUND:
143 |          */
144 |         printf("GPU_INIT clGetDeviceIDs ierr %d file %s line %d\n", ierr, __FILE__, __LINE__);
145 |       }
146 |       if (DEVICE_DETECT_DEBUG){
147 |          printf("GPU_INIT: %d opencl devices(s) detected\n",num_devices);
148 |       }
149 |       platform_selected = iplatform;
150 |       platform = platforms[iplatform];
151 |       nDevices_selected = num_devices;
152 |    }
153 | 
154 |    if (platform_selected == -1){
155 |       printf("Warning: Device of requested type not found in clGetDeviceID call\n");
156 |       exit(-1);
157 |    }
158 | 
159 |    num_devices = nDevices_selected;
160 | 
161 |    devices = (cl_device_id *)malloc(num_devices*sizeof(cl_device_id));
162 |    device_appropriate = malloc(num_devices*sizeof(int));
163 |   
164 |    ierr = clGetDeviceIDs(platforms[platform_selected], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL);
165 |    if(ierr != CL_SUCCESS) {
166 |      printf("Error getting device ids\n");
167 |      exit(ierr);
168 |    }
169 |  
170 |   int idevice_appropriate = 0;
171 |   for (uint idevice=0; idevice<num_devices; idevice++){
172 |      device_appropriate[idevice] = device_double_support(devices[idevice]);;
173 |      if (device_appropriate[idevice] == 1){
174 |         if (device_selected == -99) device_selected = idevice;
175 |         devices[idevice_appropriate] = devices[idevice];
176 |         idevice_appropriate++;
177 |      }
178 |      if (DEVICE_DETECT_DEBUG){
179 |         printf(  "  Device %d:\n", idevice+1);
180 |         device_info(devices[idevice]);
181 |      }
182 |   }
183 |   num_devices = idevice_appropriate;
184 | 
185 |   if (DEVICE_DETECT_DEBUG) {
186 |      printf("Device selected is %d number of appropriate devices %d\n",device_selected, num_devices);
187 |   }
188 | 
189 |   cl_context_properties context_properties[3]=
190 |   {
191 |     CL_CONTEXT_PLATFORM,
192 |     (cl_context_properties)platform,
193 |     0 // 0 terminates list
194 |   };   
195 | 
196 |   *context = clCreateContext(context_properties, num_devices, devices, NULL, NULL, &ierr);
197 |   if(ierr != CL_SUCCESS) {
198 |     printf("Error creating context\n");
199 |     exit(ierr);
200 |   }
201 |   *queue = clCreateCommandQueue(*context, devices[0], CL_QUEUE_PROFILING_ENABLE, &ierr);
202 |   if(ierr != CL_SUCCESS) {
203 |     printf("Error creating command queue\n");
204 |     exit(ierr);
205 |   }
206 |   
207 |   // Load the kernel source code into the array source
208 |   struct stat statbuf;
209 |   FILE *fh;
210 |   char *source;
211 |   
212 |   fh = fopen(filename, "r");
213 |   if (!fh) {
214 |       fprintf(stderr, "Failed to load kernel.\n");
215 |       exit(-1);
216 |   }
217 |   stat(filename, &statbuf);
218 |   source = (char*)malloc(statbuf.st_size + 1);
219 |   if( fread(source, statbuf.st_size, 1, fh) != 1) {
220 |       printf("Problem reading program source file\n");
221 |   }
222 |   source[statbuf.st_size] = '\0';
223 |   fclose( fh );
224 |   
225 |   *program = clCreateProgramWithSource(*context, 1, (const char**) &source, NULL, &ierr);
226 |   if (ierr != CL_SUCCESS){
227 |       printf("clCreateProgramWithSource returned an ierr %d at line %d in file %s\n", ierr,__LINE__,__FILE__);
228 |   }
229 |   //printf("%d %s\n", (int)statbuf.st_size, source);
230 |   
231 |   size_t nReportSize;
232 |   char* BuildReport;
233 |   
234 | #ifdef HAVE_CL_DOUBLE
235 |   if (*is_nvidia) {
236 |      ierr = clBuildProgram(*program, 0, NULL, "-DHAVE_CL_DOUBLE -DIS_NVIDIA", NULL, NULL);
237 |   } else {
238 |      ierr = clBuildProgram(*program, 0, NULL, "-DHAVE_CL_DOUBLE", NULL, NULL);
239 |   }
240 | #else
241 |   if (*is_nvidia) {
242 |      ierr = clBuildProgram(*program, 0, NULL, "-DNO_CL_DOUBLE -DIS_NVIDIA -cl-single-precision-constant", NULL, NULL);
243 |   } else {
244 |      ierr = clBuildProgram(*program, 0, NULL, "-DNO_CL_DOUBLE -cl-single-precision-constant", NULL, NULL);
245 |   }
246 | #endif
247 |   if (ierr != CL_SUCCESS){
248 |       printf("clBuildProgram returned an ierr %d at line %d in file %s\n", ierr,__LINE__,__FILE__);
249 |       ierr = clGetProgramBuildInfo(*program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &nReportSize);
250 |       if (ierr != CL_SUCCESS) {
251 |           switch (ierr){
252 |               case CL_INVALID_DEVICE:
253 |                   printf("Invalid device in clProgramBuildInfo\n");
254 |                   break;
255 |               case CL_INVALID_VALUE:
256 |                   printf("Invalid value in clProgramBuildInfo\n");
257 |                   break;
258 |               case CL_INVALID_PROGRAM:
259 |                   printf("Invalid program in clProgramBuildInfo\n");
260 |                   break;
261 |           }
262 |       }
263 |       
264 |       BuildReport = (char *)malloc(nReportSize);
265 |       
266 |       ierr = clGetProgramBuildInfo(*program, devices[0], CL_PROGRAM_BUILD_LOG, nReportSize, BuildReport, NULL);
267 |       if (ierr != CL_SUCCESS) {
268 |           switch (ierr){
269 |               case CL_INVALID_DEVICE:
270 |                   printf("Invalid device in clProgramBuildInfo\n");
271 |                   break;
272 |               case CL_INVALID_VALUE:
273 |                   printf("Invalid value in clProgramBuildInfo\n");
274 |                   break;
275 |               case CL_INVALID_PROGRAM:
276 |                   printf("Invalid program in clProgramBuildInfo\n");
277 |                   break;
278 |           }
279 |       }
280 |       printf("%s\n", BuildReport);
281 |   }
282 |   
283 | }
284 | 
285 | int device_double_support(cl_device_id device){
286 |    int have_double = 0;
287 |    char info[1024];
288 | 
289 |    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(info), &info, NULL);
290 | 
291 |    if (!(strstr(info,"cl_khr_fp64") == NULL)){
292 |      if (DEVICE_DETECT_DEBUG){
293 |         printf(  "    Device has double : %s\n\n", strstr(info,"cl_khr_fp64"));
294 |      }
295 |      have_double = 1;
296 |    }
297 | 
298 |    return(have_double);
299 | }
300 | 
301 | void device_info(cl_device_id device){
302 |    if (device == NULL) {
303 |       printf(" Error with device in device_info\n");
304 |    }
305 |    char info[1024];
306 |    cl_bool iflag;
307 |    cl_uint inum;
308 |    size_t isize;
309 |    cl_ulong ilong;
310 |    cl_device_type device_type;
311 |    cl_command_queue_properties iprop;
312 | 
313 |    clGetDeviceInfo(device,CL_DEVICE_TYPE,sizeof(device_type),&device_type,0);
314 |    if( device_type & CL_DEVICE_TYPE_CPU )
315 |       printf("    CL_DEVICE_TYPE                       : %s\n", "CL_DEVICE_TYPE_CPU");
316 |    if( device_type & CL_DEVICE_TYPE_GPU )
317 |       printf("    CL_DEVICE_TYPE                       : %s\n", "CL_DEVICE_TYPE_GPU");
318 |    if( device_type & CL_DEVICE_TYPE_ACCELERATOR )
319 |       printf("    CL_DEVICE_TYPE                       : %s\n", "CL_DEVICE_TYPE_ACCELERATOR");
320 |    if( device_type & CL_DEVICE_TYPE_DEFAULT )
321 |       printf("    CL_DEVICE_TYPE                       : %s\n", "CL_DEVICE_TYPE_DEFAULT");
322 | 
323 |    clGetDeviceInfo(device,CL_DEVICE_AVAILABLE,sizeof(iflag),&iflag,0);
324 |    if (iflag == CL_TRUE) {
325 |       printf(  "    CL_DEVICE_AVAILABLE                  : TRUE\n");
326 |    } else {
327 |       printf(  "    CL_DEVICE_AVAILABLE                  : FALSE\n");
328 |    }
329 | 
330 |    clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(info), &info, NULL);
331 |    printf(  "    CL_DEVICE_VENDOR                     : %s\n", info);
332 | 
333 |    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(info), &info, NULL);
334 |    printf(  "    CL_DEVICE_NAME                       : %s\n", info);
335 | 
336 |    clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(info), &info, NULL);
337 |    printf(  "    CL_DRIVER_VERSION                    : %s\n", info);
338 | 
339 |    clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(info), &info, NULL);
340 |    printf(  "    CL_DEVICE_VERSION                    : %s\n", info);
341 | 
342 |    clGetDeviceInfo(device,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(inum),&inum,0);
343 |    printf(  "    CL_DEVICE_MAX_COMPUTE_UNITS          : %d\n", inum);
344 | 
345 |    clGetDeviceInfo(device,CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,sizeof(inum),&inum,0);
346 |    printf(  "    CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS   : %d\n", inum);
347 | 
348 |    size_t *item_sizes = (size_t *)malloc(inum*sizeof(size_t));
349 |    clGetDeviceInfo(device,CL_DEVICE_MAX_WORK_ITEM_SIZES,sizeof(item_sizes),item_sizes,0);
350 |    printf(  "    CL_DEVICE_MAX_WORK_ITEM_SIZES        : %ld %ld %ld\n",
351 |          item_sizes[0], item_sizes[1], item_sizes[2]);
352 |    free(item_sizes);
353 | 
354 |    clGetDeviceInfo(device,CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(isize),&isize,0);
355 |    printf(  "    CL_DEVICE_MAX_WORK_GROUP_SIZE        : %ld\n", isize);
356 | 
357 |    clGetDeviceInfo(device,CL_DEVICE_MAX_CLOCK_FREQUENCY,sizeof(inum),&inum,0);
358 |    printf(  "    CL_DEVICE_MAX_CLOCK_FREQUENCY        : %d\n", inum);
359 | 
360 |    clGetDeviceInfo(device,CL_DEVICE_MAX_MEM_ALLOC_SIZE,sizeof(inum),&inum,0);
361 |    printf(  "    CL_DEVICE_MAX_MEM_ALLOC_SIZE         : %d\n", inum);
362 | 
363 | #ifdef __APPLE_CC__
364 |    clGetDeviceInfo(device,CL_DEVICE_GLOBAL_MEM_SIZE,sizeof(ilong),&ilong,0);
365 |    printf(  "    CL_DEVICE_GLOBAL_MEM_SIZE            : %llu\n", ilong);
366 | 
367 |    clGetDeviceInfo(device,CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,sizeof(ilong),&ilong,0);
368 |    printf(  "    CL_DEVICE_GLOBAL_MEM_CACHE_SIZE      : %llu\n", ilong);
369 | #else
370 |    clGetDeviceInfo(device,CL_DEVICE_GLOBAL_MEM_SIZE,sizeof(ilong),&ilong,0);
371 |    printf(  "    CL_DEVICE_GLOBAL_MEM_SIZE            : %lu\n", ilong);
372 | 
373 |    clGetDeviceInfo(device,CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,sizeof(ilong),&ilong,0);
374 |    printf(  "    CL_DEVICE_GLOBAL_MEM_CACHE_SIZE      : %lu\n", ilong);
375 | #endif
376 |    clGetDeviceInfo(device,CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE,sizeof(inum),&inum,0);
377 |    printf(  "    CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE  : %d\n", inum);
378 | 
379 |    clGetDeviceInfo(device,CL_DEVICE_MAX_CONSTANT_ARGS,sizeof(inum),&inum,0);
380 |    printf(  "    CL_DEVICE_GLOBAL_MAX_CONSTANT_ARGS   : %d\n", inum);
381 | 
382 |    clGetDeviceInfo(device,CL_DEVICE_ERROR_CORRECTION_SUPPORT,sizeof(iflag),&iflag,0);
383 |    if (iflag == CL_TRUE) {
384 |       printf(  "    CL_DEVICE_ERROR_CORRECTION_SUPPORT   : TRUE\n");
385 |    } else {
386 |       printf(  "    CL_DEVICE_ERROR_CORRECTION_SUPPORT   : FALSE\n");
387 |    }
388 | 
389 |    clGetDeviceInfo(device,CL_DEVICE_PROFILING_TIMER_RESOLUTION,sizeof(isize),&isize,0);
390 |    printf(  "    CL_DEVICE_PROFILING_TIMER_RESOLUTION : %ld nanosecs\n", isize);
391 | 
392 |    clGetDeviceInfo(device,CL_DEVICE_QUEUE_PROPERTIES,sizeof(iprop),&iprop,0);
393 |    if (iprop & CL_QUEUE_PROFILING_ENABLE) {
394 |       printf(  "    CL_DEVICE_QUEUE PROFILING            : AVAILABLE\n");
395 |    } else {
396 |       printf(  "    CL_DEVICE_QUEUE PROFILING            : NOT AVAILABLE\n");
397 |    }
398 | 
399 |    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(info), &info, NULL);
400 |    printf(  "    CL_DEVICE_EXTENSIONS                 : %s\n\n", info);
401 | 
402 | }
403 | #endif
404 | 
405 | 


--------------------------------------------------------------------------------
/gpu.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
 3 |  *  All rights Reserved.
 4 |  *
 5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
 6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
 7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
 8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
 9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
12 |  * to produce derivative works, such modified software should be clearly marked,
13 |  * so as not to confuse it with the version available from LANL.   
14 |  *
15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
16 |  * use this file except in compliance with the License. You may obtain a copy
17 |  * of the License at 
18 |  *
19 |  * http://www.apache.org/licenses/LICENSE-2.0
20 |  *
21 |  * Unless required by applicable law or agreed to in writing, software distributed
22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
24 |  * specific language governing permissions and limitations under the License.”
25 |  *
26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
27 |  *
28 |  */
29 | 
30 | /*
31 |  *  Authors: Bob Robey       XCP-2   brobey@lanl.gov
32 |  *           David Nicholaeff        dnic@lanl.gov, mtrxknight@aol.com
33 |  *           Rachel Robey            rnrobey@gmail.com
34 |  */
35 | 
36 | #ifdef HAVE_CONFIG_H
37 | #include "config.h"
38 | #endif
39 | 
40 | #ifdef __APPLE_CC__
41 | #include <OpenCL/OpenCL.h>
42 | #else
43 | #include <CL/cl.h>
44 | #endif
45 | 
46 | extern cl_kernel interpolate_kernel;
47 | 
48 | void GPUInit(cl_context *context, cl_command_queue *queue, int *is_nvidia, cl_program *program, char *filename);
49 | int device_double_support(cl_device_id device);
50 | void device_info(cl_device_id device);
51 | 


--------------------------------------------------------------------------------
/kdtree/Bounds1d.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
 3 |  *  All rights Reserved.
 4 |  *
 5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
 6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
 7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
 8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
 9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
12 |  * to produce derivative works, such modified software should be clearly marked,
13 |  * so as not to confuse it with the version available from LANL.   
14 |  *
15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
16 |  * use this file except in compliance with the License. You may obtain a copy
17 |  * of the License at 
18 |  *
19 |  * http://www.apache.org/licenses/LICENSE-2.0
20 |  *
21 |  * Unless required by applicable law or agreed to in writing, software distributed
22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
24 |  * specific language governing permissions and limitations under the License.”
25 |  *
26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
27 |  *
28 |  */
29 | 
30 | #include "Bounds1d.h"
31 | 
32 | void Bounds_Copy1d(TBounds1d* src, TBounds1d* dest) {
33 |    assert(src && dest);
34 |    MEMCPY(src, dest, 1, TBounds1d);
35 | }
36 | 
37 | void Bounds_Infinite1d(TBounds1d* b){
38 |    assert(b);
39 |    b->min.x = POSITIVE_INFINITY;
40 |    b->max.x = NEGATIVE_INFINITY;
41 | }
42 | 
43 | void Bounds_AddBounds1d(TBounds1d* b, TBounds1d* add) {
44 |    assert(b && add);
45 |    b->min.x = MIN(b->min.x, add->min.x);
46 |    b->max.x = MAX(b->max.x, add->max.x);
47 | }
48 | 
49 | void Bounds_AddEpsilon1d(TBounds1d* b, double add) {
50 |    assert(b);
51 |    b->min.x = b->min.x - add;
52 |    b->max.x = b->max.x + add;
53 | }
54 | 
55 | boolean Bounds_IsOverlappingBounds1d(TBounds1d* b, TBounds1d* tst) {
56 |    assert(b && tst);
57 |    if((tst->max.x < b->min.x) || (tst->min.x > b->max.x))
58 |       return(false);
59 |    return(true);
60 | }
61 | 
62 | double Bounds_WidthAxis1d(TBounds1d* b, unsigned long axis)
63 | {
64 |    double width;
65 |    
66 |    assert(b);
67 |    if(axis == XAXIS)
68 |       width = b->max.x - b->min.x;
69 |    else
70 |       assert(NULL);
71 |    return(width);
72 | }
73 | 
74 | double Bounds_CenterAxis1d(TBounds1d* b, unsigned long axis)
75 | {
76 |    double center;
77 |    
78 |    assert(b);
79 |    if(axis == XAXIS)
80 |       center = (b->min.x + b->max.x) * 0.5;
81 |    else
82 |       assert(NULL);
83 |    return(center);
84 | }
85 | 


--------------------------------------------------------------------------------
/kdtree/Bounds1d.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
 3 |  *  All rights Reserved.
 4 |  *
 5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
 6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
 7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
 8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
 9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
12 |  * to produce derivative works, such modified software should be clearly marked,
13 |  * so as not to confuse it with the version available from LANL.   
14 |  *
15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
16 |  * use this file except in compliance with the License. You may obtain a copy
17 |  * of the License at 
18 |  *
19 |  * http://www.apache.org/licenses/LICENSE-2.0
20 |  *
21 |  * Unless required by applicable law or agreed to in writing, software distributed
22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
24 |  * specific language governing permissions and limitations under the License.”
25 |  *
26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
27 |  *
28 |  */
29 | 
30 | #ifndef _Bounds1d_
31 | #define _Bounds1d_
32 | 
33 | #ifdef __cplusplus
34 | extern "C"
35 | {
36 | #endif
37 |    
38 | #include "Globals1d.h"
39 | 
40 | typedef struct {
41 |    TVector1d min, max;
42 | } TBounds1d;
43 | 
44 | extern void Bounds_Copy1d(TBounds1d* src, TBounds1d* dest);
45 | extern void Bounds_Infinite1d(TBounds1d* b);
46 | extern void Bounds_AddBounds1d(TBounds1d* b, TBounds1d* add);
47 | extern void Bounds_AddEpsilon1d(TBounds1d* b, double add);
48 | extern boolean Bounds_IsOverlappingBounds1d(TBounds1d* b, TBounds1d* tst);
49 | extern double Bounds_WidthAxis1d(TBounds1d* b, unsigned long axis);
50 | extern double Bounds_CenterAxis1d(TBounds1d* b, unsigned long axis);
51 | 
52 | #ifdef __cplusplus
53 | }
54 | #endif
55 | 
56 | #endif
57 | 


--------------------------------------------------------------------------------
/kdtree/Bounds2d.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
 3 |  *  All rights Reserved.
 4 |  *
 5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
 6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
 7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
 8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
 9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
12 |  * to produce derivative works, such modified software should be clearly marked,
13 |  * so as not to confuse it with the version available from LANL.   
14 |  *
15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
16 |  * use this file except in compliance with the License. You may obtain a copy
17 |  * of the License at 
18 |  *
19 |  * http://www.apache.org/licenses/LICENSE-2.0
20 |  *
21 |  * Unless required by applicable law or agreed to in writing, software distributed
22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
24 |  * specific language governing permissions and limitations under the License.”
25 |  *
26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
27 |  *
28 |  */
29 | 
30 | #include "Bounds2d.h"
31 | 
32 | void Bounds_Copy2d(TBounds2d* src, TBounds2d* dest) {
33 |    assert(src && dest);
34 |    MEMCPY(src, dest, 1, TBounds2d);
35 | }
36 | 
37 | void Bounds_Infinite2d(TBounds2d* b){
38 |    assert(b);
39 |    b->min.x = POSITIVE_INFINITY;
40 |    b->min.y = POSITIVE_INFINITY;
41 |    b->max.x = NEGATIVE_INFINITY;
42 |    b->max.y = NEGATIVE_INFINITY;
43 | }
44 | 
45 | void Bounds_AddBounds2d(TBounds2d* b, TBounds2d* add) {
46 |    assert(b && add);
47 |    b->min.x = MIN(b->min.x, add->min.x);
48 |    b->min.y = MIN(b->min.y, add->min.y);
49 |    b->max.x = MAX(b->max.x, add->max.x);
50 |    b->max.y = MAX(b->max.y, add->max.y);
51 | }
52 | 
53 | void Bounds_AddEpsilon2d(TBounds2d* b, double add) {
54 |    assert(b);
55 |    b->min.x = b->min.x - add;
56 |    b->min.y = b->min.y - add;
57 |    b->max.x = b->max.x + add;
58 |    b->max.y = b->max.y + add;
59 | }
60 | 
61 | boolean Bounds_IsOverlappingBounds2d(TBounds2d* b, TBounds2d* tst) {
62 |    assert(b && tst);
63 |    if((tst->max.x < b->min.x) || (tst->min.x > b->max.x))
64 |       return(false);
65 |    if((tst->max.y < b->min.y) || (tst->min.y > b->max.y))
66 |       return(false);
67 |    return(true);
68 | }
69 | 
70 | double Bounds_WidthAxis2d(TBounds2d* b, unsigned long axis)
71 | {
72 |    double width;
73 |    
74 |    assert(b);
75 |    if(axis == XAXIS)
76 |       width = b->max.x - b->min.x;
77 |    else if(axis == YAXIS)
78 |       width = b->max.y - b->min.y;
79 |    else
80 |       assert(NULL);
81 |    return(width);
82 | }
83 | 
84 | double Bounds_CenterAxis2d(TBounds2d* b, unsigned long axis)
85 | {
86 |    double center;
87 |    
88 |    assert(b);
89 |    if(axis == XAXIS)
90 |       center = (b->min.x + b->max.x) * 0.5;
91 |    else if(axis == YAXIS)
92 |       center = (b->min.y + b->max.y) * 0.5;
93 |    else
94 |       assert(NULL);
95 |    return(center);
96 | }
97 | 


--------------------------------------------------------------------------------
/kdtree/Bounds2d.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
 3 |  *  All rights Reserved.
 4 |  *
 5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
 6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
 7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
 8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
 9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
12 |  * to produce derivative works, such modified software should be clearly marked,
13 |  * so as not to confuse it with the version available from LANL.   
14 |  *
15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
16 |  * use this file except in compliance with the License. You may obtain a copy
17 |  * of the License at 
18 |  *
19 |  * http://www.apache.org/licenses/LICENSE-2.0
20 |  *
21 |  * Unless required by applicable law or agreed to in writing, software distributed
22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
24 |  * specific language governing permissions and limitations under the License.”
25 |  *
26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
27 |  *
28 |  */
29 | 
30 | #ifndef _Bounds2d_
31 | #define _Bounds2d_
32 | 
33 | #ifdef __cplusplus
34 | extern "C"
35 | {
36 | #endif
37 |    
38 | #include "Globals2d.h"
39 | 
40 | typedef struct {
41 |    TVector2d min, max;
42 | } TBounds2d;
43 | 
44 | extern void Bounds_Copy2d(TBounds2d* src, TBounds2d* dest);
45 | extern void Bounds_Infinite2d(TBounds2d* b);
46 | extern void Bounds_AddBounds2d(TBounds2d* b, TBounds2d* add);
47 | extern void Bounds_AddEpsilon2d(TBounds2d* b, double add);
48 | extern boolean Bounds_IsOverlappingBounds2d(TBounds2d* b, TBounds2d* tst);
49 | extern double Bounds_WidthAxis2d(TBounds2d* b, unsigned long axis);
50 | extern double Bounds_CenterAxis2d(TBounds2d* b, unsigned long axis);
51 | 
52 | #ifdef __cplusplus
53 | }
54 | #endif
55 | 
56 | #endif
57 | 


--------------------------------------------------------------------------------
/kdtree/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # enable @rpath in the install name for any shared library being built
 2 | set(CMAKE_MACOSX_RPATH 1)
 3 | 
 4 | ########### global settings ###############
 5 | set(H_SRCS Bounds1d.h Bounds2d.h Globals1d.h Globals2d.h KDTree1d.h KDTree2d.h)
 6 | 
 7 | set(C_SRCS Bounds1d.c Bounds2d.c KDTree1d.c KDTree2d.c)
 8 | 
 9 | set(kdtree_LIB_SRCS ${C_SRCS} ${H_SRCS})
10 | 
11 | ########### kdtree target ###############
12 | 
13 | add_library(kdtree SHARED ${kdtree_LIB_SRCS})
14 | 
15 | set_target_properties(kdtree PROPERTIES VERSION 1.0.0 SOVERSION 2)
16 | install(TARGETS kdtree DESTINATION lib)
17 | 
18 | # Cleanup
19 | SET_DIRECTORY_PROPERTIES(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES
20 |        	"CMakeCache.txt;CMakeFiles;Makefile;cmake_install.cmake;ipo_out.optrpt")
21 | 
22 | ########### install files ###############
23 | 
24 | #install(FILES  KDTree.h DESTINATION include)
25 | 
26 | #========== original Makefile.am contents follow ===========
27 | 
28 | #original Makefile.am contents follow:
29 | 
30 | #default: libkdtree.la
31 | #all: libkdtree.la
32 | #
33 | #AM_MAKEFLAGS = -j 4
34 | #
35 | #DEFAULT_INCLUDES=-I. -I..
36 | #
37 | #lib_LTLIBRARIES = libkdtree.la
38 | #include_HEADERS = KDTree.h
39 | #
40 | #libkdtree_la_SOURCES = ${C_SRCS} ${H_SRCS}
41 | 


--------------------------------------------------------------------------------
/kdtree/Globals1d.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
 3 |  *  All rights Reserved.
 4 |  *
 5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
 6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
 7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
 8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
 9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
12 |  * to produce derivative works, such modified software should be clearly marked,
13 |  * so as not to confuse it with the version available from LANL.   
14 |  *
15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
16 |  * use this file except in compliance with the License. You may obtain a copy
17 |  * of the License at 
18 |  *
19 |  * http://www.apache.org/licenses/LICENSE-2.0
20 |  *
21 |  * Unless required by applicable law or agreed to in writing, software distributed
22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
24 |  * specific language governing permissions and limitations under the License.”
25 |  *
26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
27 |  *
28 |  */
29 | 
#ifndef _Globals1d_
#define _Globals1d_

#ifdef __cplusplus
extern "C"
{
#endif

/* Uncomment to compile the assert() checks out. */
//#define NDEBUG 1
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Each group below is skipped when its first macro is already defined, so this
 * header can follow another one that provides the same names.  The macro bodies
 * are kept token-for-token identical to the copies in Globals2d.h so that
 * including both headers never yields a conflicting redefinition.
 */
#ifndef ENTITY_COINCIDENCE_TOLERANCE
#define ENTITY_COINCIDENCE_TOLERANCE      ((double)1.0E-5)

#define KDTREE_ELEMENT_BLOCKING_SIZE      ((long)1024)
#endif

/* Large finite sentinels (not IEEE infinities). */
#ifndef POSITIVE_INFINITY
#define POSITIVE_INFINITY (+1.0E+64)
#define NEGATIVE_INFINITY (-1.0E+64)
#endif

/* Axis selector for the single coordinate. */
#define XAXIS ((unsigned long)0)

/* A point on the x axis. */
typedef struct {
   double x;
} TVector1d;

#ifndef _BOOL
#define _BOOL
typedef unsigned char boolean;
#define true  ((boolean)1)
#define false ((boolean)0)
#endif

/* Both arguments may be evaluated twice: avoid side effects in them. */
#ifndef MIN
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif

/* Exchanges a and b through a temporary of type t. */
#ifndef SWAP
#define SWAP(a,b,t) {t h; h = a; a = b; b = h; }
#endif

/*
 * Typed allocation helpers: n is an element count and t an element type.
 * The arguments are substituted unparenthesised, so pass simple expressions
 * (a variable or a literal) rather than sums or pointer offsets.
 * MEMCPY takes the source first and the destination second.
 */
#ifndef MALLOC
#define MALLOC(n,t) ((t*)(malloc(n * sizeof(t))))
#define REALLOC(p,n,t) ((t*)(realloc((void*)p, n * sizeof(t))))
#define FREE(p) { if (p) free(p); }
#define MEMCPY(s,d,n,t) {memcpy((void*)d, (void*)s, n * sizeof(t)); }
#endif

#ifdef __cplusplus
}
#endif

#endif
89 | 


--------------------------------------------------------------------------------
/kdtree/Globals2d.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
 3 |  *  All rights Reserved.
 4 |  *
 5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
 6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
 7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
 8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
 9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
12 |  * to produce derivative works, such modified software should be clearly marked,
13 |  * so as not to confuse it with the version available from LANL.   
14 |  *
15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
16 |  * use this file except in compliance with the License. You may obtain a copy
17 |  * of the License at 
18 |  *
19 |  * http://www.apache.org/licenses/LICENSE-2.0
20 |  *
21 |  * Unless required by applicable law or agreed to in writing, software distributed
22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
24 |  * specific language governing permissions and limitations under the License.”
25 |  *
26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
27 |  *
28 |  */
29 | 
#ifndef _Globals2d_
#define _Globals2d_


#ifdef __cplusplus
extern "C"
{
#endif

/* Uncomment to compile the assert() checks out. */
//#define NDEBUG 1
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Every macro below is defined only when it is not already defined, matching
 * the guarded definitions in Globals1d.h.  This lets the two headers be
 * included in either order without a conflicting redefinition.
 */
#ifndef ENTITY_COINCIDENCE_TOLERANCE
#define ENTITY_COINCIDENCE_TOLERANCE      ((double)1.0E-5)
#endif

#ifndef KDTREE_ELEMENT_BLOCKING_SIZE
#define KDTREE_ELEMENT_BLOCKING_SIZE      ((long)1024)
#endif

/* Large finite sentinels (not IEEE infinities). */
#ifndef POSITIVE_INFINITY
#define POSITIVE_INFINITY (+1.0E+64)
#endif
#ifndef NEGATIVE_INFINITY
#define NEGATIVE_INFINITY (-1.0E+64)
#endif

/* Axis selectors.  XAXIS is kept token-identical to the copy in Globals1d.h. */
#ifndef XAXIS
#define XAXIS ((unsigned long)0)
#endif
#ifndef YAXIS
#define YAXIS ((unsigned long)1)
#endif

/* A point in the plane. */
typedef struct {
   double x, y;
} TVector2d;

#ifndef _BOOL
#define _BOOL
typedef unsigned char boolean;
#define true  ((boolean)1)
#define false ((boolean)0)
#endif

/* Both arguments may be evaluated twice: avoid side effects in them. */
#ifndef MIN
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif
#ifndef MAX
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif

/* Exchanges a and b through a temporary of type t. */
#ifndef SWAP
#define SWAP(a,b,t) {t h; h = (a); (a) = (b); (b) = h; }
#endif

/*
 * Typed allocation helpers: n is an element count and t an element type.
 * Every value argument is parenthesised so that expressions such as
 * `count + 1` or `ptr + offset` are scaled and cast as a whole.
 * MEMCPY takes the source first and the destination second.
 * The brace-block form is kept (rather than do/while) so existing call sites
 * written with or without a trailing semicolon keep compiling.
 */
#ifndef MALLOC
#define MALLOC(n,t) ((t*)(malloc((n) * sizeof(t))))
#endif
#ifndef REALLOC
#define REALLOC(p,n,t) ((t*)(realloc((void*)(p), (n) * sizeof(t))))
#endif
#ifndef FREE
#define FREE(p) { if (p) free(p); }
#endif
#ifndef MEMCPY
#define MEMCPY(s,d,n,t) {memcpy((void*)(d), (void*)(s), (n) * sizeof(t)); }
#endif

#ifdef __cplusplus
}
#endif

#endif
83 | 


--------------------------------------------------------------------------------
/kdtree/KDTree1d.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
  3 |  *  All rights Reserved.
  4 |  *
  5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
  6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
  7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
  8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
  9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
 10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
 11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
 12 |  * to produce derivative works, such modified software should be clearly marked,
 13 |  * so as not to confuse it with the version available from LANL.   
 14 |  *
 15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 16 |  * use this file except in compliance with the License. You may obtain a copy
 17 |  * of the License at 
 18 |  *
 19 |  * http://www.apache.org/licenses/LICENSE-2.0
 20 |  *
 21 |  * Unless required by applicable law or agreed to in writing, software distributed
 22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 24 |  * specific language governing permissions and limitations under the License.”
 25 |  *
 26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
 27 |  *
 28 |  */
 29 | 
 30 | #include <math.h>
 31 | #include "KDTree1d.h"
 32 | /* Selection by partitioning (quickselect style): reorders idx[0..num-1] so that position k holds the entry with the k-th smallest element center along cut_direction, with no larger key before it and no smaller key after it. k counts from 0 within idx. */
 33 | static void median_sort1d(TKDTree1d* t,
 34 |                         int cut_direction, int k, int num, int* idx)
 35 | {
 36 |    int left, mid, right, a, i, j;
 37 |    /* Each pass partitions idx[left..right] around a pivot and keeps only the side that contains position k */
 38 |    for (left = 0, right = num - 1; (right - left) > 1;) {
 39 |       mid = (left + right) / 2;
 40 |       SWAP(idx[mid], idx[left + 1], int);   /* bring the middle entry next to left */
 41 |       if(Bounds_CenterAxis1d(&(t->elements[idx[left + 1]]), cut_direction) >
 42 |          Bounds_CenterAxis1d(&(t->elements[idx[right]]), cut_direction))
 43 |          SWAP(idx[left + 1], idx[right], int);
 44 |       if(Bounds_CenterAxis1d(&(t->elements[idx[left]]), cut_direction) >
 45 |          Bounds_CenterAxis1d(&(t->elements[idx[right]]), cut_direction))
 46 |          SWAP(idx[left], idx[right], int);
 47 |       if(Bounds_CenterAxis1d(&(t->elements[idx[left + 1]]), cut_direction) >
 48 |          Bounds_CenterAxis1d(&(t->elements[idx[left]]), cut_direction))
 49 |          SWAP(idx[left + 1], idx[left], int);   /* now key[left + 1] <= key[left] <= key[right] */
 50 |       a = idx[left];   /* pivot: the median of the three entries ordered above */
 51 |       i = left + 1;
 52 |       j = right;
 53 |       while (1) {   /* neither scan has a bounds check: the entries at left + 1 and right act as sentinels */
 54 |          for (i++;
 55 |               Bounds_CenterAxis1d(&(t->elements[idx[i]]), cut_direction) <
 56 |                 Bounds_CenterAxis1d(&(t->elements[a]), cut_direction);
 57 |               i++);   /* advance i to the next key >= pivot */
 58 |          for (j--;
 59 |               Bounds_CenterAxis1d(&(t->elements[idx[j]]), cut_direction) >
 60 |               Bounds_CenterAxis1d(&(t->elements[a]), cut_direction);
 61 |               j--);   /* retreat j to the next key <= pivot */
 62 |          if(j < i)
 63 |             break;   /* scans crossed: the partition is complete */
 64 |          SWAP(idx[i], idx[j], int);
 65 |       }
 66 |       idx[left] = idx[j];   /* move the pivot into slot j ... */
 67 |       idx[j] = a;           /* ... which is its final sorted position */
 68 |       if(j >= k)
 69 |          right = j - 1;   /* position k is not to the right of the pivot */
 70 |       if(j <= k)
 71 |          left = i;        /* position k is not to the left of the pivot */
 72 |    }
 73 |    if(((right - left) ==1) &&   /* exactly two entries remain: order them directly */
 74 |       (Bounds_CenterAxis1d(&(t->elements[idx[right]]), cut_direction) <
 75 |        Bounds_CenterAxis1d(&(t->elements[idx[left]]), cut_direction)))
 76 |       SWAP(idx[right], idx[left], int);
 77 | }
 78 | /* Puts *t into its starting state: one block of unused element slots and no built tree. */
 79 | void KDTree_Initialize1d(TKDTree1d* t)
 80 | {
 81 |    assert(t);
 82 |    /* No tree exists yet, so there are no node arrays */
 83 |    t->tree_link = NULL;
 84 |    t->tree_safety_boxes = NULL;
 85 |    t->tree_size = 0;
 86 |    t->tree_built = false;
 87 |    /* Reset the overall extent before any element is added */
 88 |    Bounds_Infinite1d(&(t->extent));
 89 |    /* Reserve the first block of element slots; none of them is in use */
 90 |    t->elements_allocated = KDTREE_ELEMENT_BLOCKING_SIZE;
 91 |    t->elements = MALLOC(t->elements_allocated, TBounds1d);
 92 |    assert(t->elements);
 93 |    t->elements_num = 0;
 94 | }
 95 | /* Frees the element list and the node arrays and leaves *t empty; t itself is not freed. */
 96 | void KDTree_Destroy1d(TKDTree1d* t)
 97 | {
 98 |    assert(t);
 99 |    /* Tear down the built tree, if there is one */
100 |    FREE(t->tree_link);
101 |    t->tree_link = NULL;
102 |    FREE(t->tree_safety_boxes);
103 |    t->tree_safety_boxes = NULL;
104 |    t->tree_size = 0;
105 |    t->tree_built = false;
106 |    /* Give back the element storage */
107 |    FREE(t->elements);
108 |    t->elements = NULL;
109 |    t->elements_allocated = 0;
110 |    t->elements_num = 0;
111 |    /* Reset the overall tree extent */
112 |    Bounds_Infinite1d(&(t->extent));
113 | }
114 | /* Appends a copy of *add to the element list and folds it into the overall extent. */
115 | /* A tree built earlier is discarded; the next query rebuilds it. */
116 | /* The new element's index is the number of elements added before it. */
117 | void KDTree_AddElement1d(TKDTree1d* t, TBounds1d* add)
118 | {
119 |    assert(t && add);
120 |    /* An existing tree no longer matches the element list, so drop it */
121 |    if(t->tree_built) {
122 |       FREE(t->tree_link);
123 |       t->tree_link = NULL;
124 |       FREE(t->tree_safety_boxes);
125 |       t->tree_safety_boxes = NULL;
126 |       t->tree_size = 0;
127 |       t->tree_built = false;
128 |    }
129 |    /* Every slot is taken: grow the element array by one more block */
130 |    if(t->elements_allocated == t->elements_num) {
131 |       t->elements_allocated += KDTREE_ELEMENT_BLOCKING_SIZE;
132 |       t->elements = REALLOC(t->elements, t->elements_allocated, TBounds1d);
133 |       assert(t->elements);
134 |    }
135 |    /* Widen the overall extent, then store the copy in the next free slot */
136 |    Bounds_AddBounds1d(&(t->extent), add);
137 |    Bounds_Copy1d(add, &(t->elements[t->elements_num]));
138 |    t->elements_num++;
139 | }
140 | /* Builds the node arrays over all elements added so far; does nothing if the tree is already built. */
141 | void KDTree_CreateTree1d(TKDTree1d* t)
142 | {
143 |    int i, next_node, stack_ptr, min, mid, max, parent, cut_direction;
144 |    double width, max_width;
145 |    int* stack;
146 |    int* idx;
147 |    
148 |    assert(t);
149 |    /* If the tree is already built, we don't have to do anything */
150 |    if(t->tree_built)
151 |       return;
152 |    /* With no elements there is nothing to allocate; the tree stays empty */
153 |    if(t->elements_num > 0) {
154 |       /* Allocate the k-D tree memory: 2n slots hold the n leaves and n - 1 interior nodes */
155 |       t->tree_size = 2 * t->elements_num;
156 |       t->tree_safety_boxes = MALLOC(t->tree_size, TBounds1d);
157 |       t->tree_link = MALLOC(t->tree_size, int);
158 |       assert(t->tree_safety_boxes && t->tree_link);
159 |       /* Create the work stack of (node, span min, span max) triples and the element ordering */
160 |       stack_ptr = 0;
161 |       stack = MALLOC(3 * t->tree_size, int);
162 |       idx = MALLOC(t->elements_num, int);
163 |       assert(stack && idx);
164 |       for (i = 0; i <  t->elements_num; i++) {
165 |          idx[i] = i;
166 |       }
167 |       /* Setup the root node of the tree and put it on the stack */
168 |       stack[stack_ptr++] = 0;                   /* Node Number in the Tree */
169 |       stack[stack_ptr++] = 0;                   /* Element Span Minimum */
170 |       stack[stack_ptr++] = t->elements_num - 1; /* Element Span Maximum */
171 |       Bounds_Copy1d(&(t->extent), &(t->tree_safety_boxes[0]));
172 |       next_node = 1;                            /* Slot 0 is the root */
173 |       /* Construct k-D tree by setting up each pair of child nodes */
174 |       while (stack_ptr) {
175 |          /* Pop the top entry off the stack */
176 |          max = stack[--stack_ptr];
177 |          min = stack[--stack_ptr];
178 |          parent = stack[--stack_ptr];
179 |          /* A one-element span is a leaf: its link is the negated element index (<= 0) */
180 |          if ((max - min) == 0) {
181 |             Bounds_Copy1d(&(t->elements[idx[min]]), &(t->tree_safety_boxes[parent]));
182 |             t->tree_link[parent] = - idx[min];
183 |             continue;
184 |          }
185 |          /* Select optimum cutting direction for the parent node's safety box */
186 |          cut_direction = -1;
187 |          max_width = NEGATIVE_INFINITY;
188 |          for (i = 0; i < 1; i++) {
189 |             width = Bounds_WidthAxis1d(&(t->tree_safety_boxes[parent]), i);
190 |             if(width > max_width) {
191 |                max_width = width;
192 |                cut_direction = i;
193 |             }
194 |          }
195 |          assert(cut_direction >= 0);
196 |          /* Median sort of the elements under the parent node, keyed by the center
197 |             of each element's bounding box along the selected cutting direction. */
198 |          mid = (min + max) /2;
199 |          median_sort1d(t, cut_direction, mid - min, max - min + 1, &(idx[min]));
200 |          /* Link the parent to its left child; the right child takes the next slot */
201 |          t->tree_link[parent] = next_node;
202 |          /* Add the "left" child to the tree and the stack */
203 |          stack[stack_ptr++] = next_node;  /* Node Number in the Tree */
204 |          stack[stack_ptr++] = min;        /* Element Span Minimum */
205 |          stack[stack_ptr++] = mid;        /* Element Span Maximum */
206 |          Bounds_Infinite1d(&(t->tree_safety_boxes[next_node]));
207 |          for (i = min; i <= mid; i++) {
208 |             Bounds_AddBounds1d(&(t->tree_safety_boxes[next_node]),
209 |                              &(t->elements[idx[i]]));
210 |          }
211 |          next_node++;
212 |          /* Add the "right" child to the tree and the stack */
213 |          stack[stack_ptr++] = next_node;  /* Node Number in the Tree */
214 |          stack[stack_ptr++] = mid + 1;    /* Element Span Minimum */
215 |          stack[stack_ptr++] = max;        /* Element Span Maximum */
216 |          Bounds_Infinite1d(&(t->tree_safety_boxes[next_node]));
217 |          for (i = mid + 1; i <= max; i++) {   /* only the right span [mid + 1, max] belongs in this box */
218 |             Bounds_AddBounds1d(&(t->tree_safety_boxes[next_node]),
219 |                              &(t->elements[idx[i]]));
220 |          }
221 |          next_node++;
222 |       }
223 |       /* Destroy the temporary arrays */
224 |       FREE(stack);
225 |       FREE(idx);
226 |    }
227 |    /* Mark the tree "built" */
228 |    t->tree_built = true;
229 | }
230 | /* Reports every element whose bounds overlap box: indices go to result_indicies, their count to *result_num. */
231 | void KDTree_QueryBoxIntersect1d(TKDTree1d* t,
232 |                               int* result_num, int* result_indicies,
233 |                               TBounds1d* box)
234 | {
235 |    int stack_ptr, node;
236 |    TBounds1d sb;
237 |    int* stack;
238 |    
239 |    assert(t && result_num && result_indicies && box);
240 |    /* Build the k-D tree if necessary */
241 |    if(!t->tree_built){
242 |       KDTree_CreateTree1d(t);
243 |    }
244 |    /* Start with an empty result list. The caller provides result_indicies;
245 |       up to elements_num entries can be written to it. */
246 |    *result_num = 0;
247 |    /* A tree without elements has no node arrays and no root to visit */
248 |    if(t->tree_size == 0)
249 |       return;
250 |    /* Create the temporary stack array */
251 |    stack_ptr = 0;
252 |    stack = MALLOC(t->tree_size, int);
253 |    assert(stack);
254 |    /* Put the root node of the tree onto the stack */
255 |    stack[stack_ptr++] = 0;
256 |    /* Search the k-D tree until the stack is empty */
257 |    while (stack_ptr) {
258 |       /* Pop the top entry off the stack */
259 |       node = stack[--stack_ptr];
260 |       /* Test the query box against a copy of this node's safety box. The
261 |          epsilon expansion of the copy is currently disabled. */
262 |       Bounds_Copy1d(&(t->tree_safety_boxes[node]), &sb);
263 |       /* If the query box doesn't intersect this node's safety box, we are done
264 |          visiting the node and should continue with the next node */
265 |       if(!Bounds_IsOverlappingBounds1d(&sb, box))
266 |          continue;
267 |       /* A link <= 0 marks a leaf holding a negated element index: report it.
268 |          Otherwise the link is the left child's slot and the right child follows. */
269 |       if(t->tree_link[node] <= 0) {
270 |          result_indicies[*result_num] = - t->tree_link[node];
271 |          (*result_num)++;
272 |       }
273 |       else {
274 |          stack[stack_ptr++] = t->tree_link[node];
275 |          stack[stack_ptr++] = t->tree_link[node] + 1;
276 |       }
277 |    }
278 |    /* Destroy the temporary stack array */
279 |    FREE(stack);
280 | }
281 | 
282 | 


--------------------------------------------------------------------------------
/kdtree/KDTree1d.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
 3 |  *  All rights Reserved.
 4 |  *
 5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
 6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
 7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
 8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
 9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
12 |  * to produce derivative works, such modified software should be clearly marked,
13 |  * so as not to confuse it with the version available from LANL.   
14 |  *
15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
16 |  * use this file except in compliance with the License. You may obtain a copy
17 |  * of the License at 
18 |  *
19 |  * http://www.apache.org/licenses/LICENSE-2.0
20 |  *
21 |  * Unless required by applicable law or agreed to in writing, software distributed
22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
24 |  * specific language governing permissions and limitations under the License.”
25 |  *
26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
27 |  *
28 |  */
29 | 
30 | /* 
31 |  * 
32 |  *  Implements a 1-dimensional k-D tree. One begins to use the k-D tree by
33 |  *  adding the bounding box of geometric "elements" to the tree structure
34 |  *  through a call to "KDTree_AddElement1d". Every element should be of the same
35 |  *  type, but could be a single point, a line segment, triangles, etc. Once
36 |  *  all the element bounding boxes have been added, the user of the structure
37 |  *  may make queries against the tree. The actual tree is constructed lazily
38 |  *  when an actual query occurs on the structure.
39 |  *
40 |  *  This version only has one query -- intersection of a box with the elements
41 |  *  and a set of "candidate" elements are returned. The candidates are identified
42 |  *  by an index number (0, ...) signifying the order in which the element was
43 |  *  added to the tree. It is up to the calling code to do additional processing
44 |  *  based on the type of element being used to determine "real" intersections.
45 |  *
46 |  *  The process of actually building the tree takes "n log n" time. Queries 
47 |  *  take "log n" time.
48 |  *
49 |  */
50 | 
51 | #ifndef _KDTree1d_
52 | #define _KDTree1d_
53 | 
54 | #ifdef __cplusplus
55 | extern "C"
56 | {
57 | #endif
58 |   
59 | #include "Globals1d.h"
60 | #include "Bounds1d.h"
61 | /* Half selectors. NOTE(review): not referenced in KDTree1d.c - confirm they are still used elsewhere. */
62 | #define LEFT_HALF   0
63 | #define RIGHT_HALF  1
64 | #define BOTTOM_HALF 0
65 | #define TOP_HALF    1   
66 | /* One k-D tree: the element list plus the node arrays built from it. */
67 | typedef struct {
68 |    TBounds1d extent;                     /* accumulated from every added element via Bounds_AddBounds1d */
69 |    int elements_num, elements_allocated; /* slots in use / slots allocated in elements */
70 |    TBounds1d* elements;                  /* element bounds, in order of addition */
71 |    boolean tree_built;                   /* set by KDTree_CreateTree1d, cleared when an element is added */
72 |    int tree_size;                        /* node slots allocated (2 * elements_num); 0 while no tree exists */
73 |    TBounds1d* tree_safety_boxes;         /* one bounding box per node */
74 |    int * tree_link;                      /* > 0: slot of the left child (right child is next); <= 0: negated element index of a leaf */
75 | } TKDTree1d;
76 | /* Entry points implemented in KDTree1d.c. */
77 | extern void KDTree_Initialize1d(TKDTree1d *t);   /* empty tree with one block of element slots */
78 | extern void KDTree_Destroy1d(TKDTree1d* t);      /* frees the element and node arrays; t itself is not freed */
79 | extern void KDTree_AddElement1d(TKDTree1d* t, TBounds1d* add);   /* appends a copy of *add; discards any built tree */
80 | extern void KDTree_CreateTree1d(TKDTree1d* t);   /* builds the node arrays; returns at once if already built */
81 | extern void KDTree_QueryBoxIntersect1d(TKDTree1d* t,
82 |                                      int* result_num, int* result_indicies,
83 |                                      TBounds1d* box);   /* writes overlapping element indices and their count */
84 |    
85 | #ifdef __cplusplus
86 | }
87 | #endif
88 | 
89 | #endif
90 | 


--------------------------------------------------------------------------------
/kdtree/KDTree2d.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
  3 |  *  All rights Reserved.
  4 |  *
  5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
  6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
  7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
  8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
  9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
 10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
 11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
 12 |  * to produce derivative works, such modified software should be clearly marked,
 13 |  * so as not to confuse it with the version available from LANL.   
 14 |  *
 15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 16 |  * use this file except in compliance with the License. You may obtain a copy
 17 |  * of the License at 
 18 |  *
 19 |  * http://www.apache.org/licenses/LICENSE-2.0
 20 |  *
 21 |  * Unless required by applicable law or agreed to in writing, software distributed
 22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 24 |  * specific language governing permissions and limitations under the License.”
 25 |  *
 26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
 27 |  *
 28 |  */
 29 | 
 30 | #include <math.h>
 31 | #include "KDTree2d.h"
 32 | /* Selection by partitioning (quickselect style): reorders idx[0..num-1] so that position k holds the entry with the k-th smallest element center along cut_direction, with no larger key before it and no smaller key after it. k counts from 0 within idx. */
 33 | static void median_sort2d(TKDTree2d* t,
 34 |                         int cut_direction, int k, int num, int* idx)
 35 | {
 36 |    int left, mid, right, a, i, j;
 37 |    /* Each pass partitions idx[left..right] around a pivot and keeps only the side that contains position k */
 38 |    for (left = 0, right = num - 1; (right - left) > 1;) {
 39 |       mid = (left + right) / 2;
 40 |       SWAP(idx[mid], idx[left + 1], int);   /* bring the middle entry next to left */
 41 |       if(Bounds_CenterAxis2d(&(t->elements[idx[left + 1]]), cut_direction) >
 42 |          Bounds_CenterAxis2d(&(t->elements[idx[right]]), cut_direction))
 43 |          SWAP(idx[left + 1], idx[right], int);
 44 |       if(Bounds_CenterAxis2d(&(t->elements[idx[left]]), cut_direction) >
 45 |          Bounds_CenterAxis2d(&(t->elements[idx[right]]), cut_direction))
 46 |          SWAP(idx[left], idx[right], int);
 47 |       if(Bounds_CenterAxis2d(&(t->elements[idx[left + 1]]), cut_direction) >
 48 |          Bounds_CenterAxis2d(&(t->elements[idx[left]]), cut_direction))
 49 |          SWAP(idx[left + 1], idx[left], int);   /* now key[left + 1] <= key[left] <= key[right] */
 50 |       a = idx[left];   /* pivot: the median of the three entries ordered above */
 51 |       i = left + 1;
 52 |       j = right;
 53 |       while (1) {   /* neither scan has a bounds check: the entries at left + 1 and right act as sentinels */
 54 |          for (i++;
 55 |               Bounds_CenterAxis2d(&(t->elements[idx[i]]), cut_direction) <
 56 |                 Bounds_CenterAxis2d(&(t->elements[a]), cut_direction);
 57 |               i++);   /* advance i to the next key >= pivot */
 58 |          for (j--;
 59 |               Bounds_CenterAxis2d(&(t->elements[idx[j]]), cut_direction) >
 60 |               Bounds_CenterAxis2d(&(t->elements[a]), cut_direction);
 61 |               j--);   /* retreat j to the next key <= pivot */
 62 |          if(j < i)
 63 |             break;   /* scans crossed: the partition is complete */
 64 |          SWAP(idx[i], idx[j], int);
 65 |       }
 66 |       idx[left] = idx[j];   /* move the pivot into slot j ... */
 67 |       idx[j] = a;           /* ... which is its final sorted position */
 68 |       if(j >= k)
 69 |          right = j - 1;   /* position k is not to the right of the pivot */
 70 |       if(j <= k)
 71 |          left = i;        /* position k is not to the left of the pivot */
 72 |    }
 73 |    if(((right - left) ==1) &&   /* exactly two entries remain: order them directly */
 74 |       (Bounds_CenterAxis2d(&(t->elements[idx[right]]), cut_direction) <
 75 |        Bounds_CenterAxis2d(&(t->elements[idx[left]]), cut_direction)))
 76 |       SWAP(idx[right], idx[left], int);
 77 | }
 78 | /* Puts *t into its starting state: one block of unused element slots and no built tree. */
 79 | void KDTree_Initialize2d(TKDTree2d* t)
 80 | {
 81 |    assert(t);
 82 |    /* No tree exists yet, so there are no node arrays */
 83 |    t->tree_link = NULL;
 84 |    t->tree_safety_boxes = NULL;
 85 |    t->tree_size = 0;
 86 |    t->tree_built = false;
 87 |    /* Reset the overall extent before any element is added */
 88 |    Bounds_Infinite2d(&(t->extent));
 89 |    /* Reserve the first block of element slots; none of them is in use */
 90 |    t->elements_allocated = KDTREE_ELEMENT_BLOCKING_SIZE;
 91 |    t->elements = MALLOC(t->elements_allocated, TBounds2d);
 92 |    assert(t->elements);
 93 |    t->elements_num = 0;
 94 | }
 95 | /* Frees the element list and the node arrays and leaves *t empty; t itself is not freed. */
 96 | void KDTree_Destroy2d(TKDTree2d* t)
 97 | {
 98 |    assert(t);
 99 |    /* Tear down the built tree, if there is one */
100 |    FREE(t->tree_link);
101 |    t->tree_link = NULL;
102 |    FREE(t->tree_safety_boxes);
103 |    t->tree_safety_boxes = NULL;
104 |    t->tree_size = 0;
105 |    t->tree_built = false;
106 |    /* Give back the element storage */
107 |    FREE(t->elements);
108 |    t->elements = NULL;
109 |    t->elements_allocated = 0;
110 |    t->elements_num = 0;
111 |    /* Reset the overall tree extent */
112 |    Bounds_Infinite2d(&(t->extent));
113 | }
114 | /* Appends a copy of *add to the element list and folds it into the overall extent. */
115 | /* A tree built earlier is discarded; the next query rebuilds it. */
116 | /* The new element's index is the number of elements added before it. */
117 | void KDTree_AddElement2d(TKDTree2d* t, TBounds2d* add)
118 | {
119 |    assert(t && add);
120 |    /* An existing tree no longer matches the element list, so drop it */
121 |    if(t->tree_built) {
122 |       FREE(t->tree_link);
123 |       t->tree_link = NULL;
124 |       FREE(t->tree_safety_boxes);
125 |       t->tree_safety_boxes = NULL;
126 |       t->tree_size = 0;
127 |       t->tree_built = false;
128 |    }
129 |    /* Every slot is taken: grow the element array by one more block */
130 |    if(t->elements_allocated == t->elements_num) {
131 |       t->elements_allocated += KDTREE_ELEMENT_BLOCKING_SIZE;
132 |       t->elements = REALLOC(t->elements, t->elements_allocated, TBounds2d);
133 |       assert(t->elements);
134 |    }
135 |    /* Widen the overall extent, then store the copy in the next free slot */
136 |    Bounds_AddBounds2d(&(t->extent), add);
137 |    Bounds_Copy2d(add, &(t->elements[t->elements_num]));
138 |    t->elements_num++;
139 | }
140 | /* Builds the node arrays over all elements added so far; does nothing if the tree is already built. */
141 | void KDTree_CreateTree2d(TKDTree2d* t)
142 | {
143 |    int i, next_node, stack_ptr, min, mid, max, parent, cut_direction;
144 |    double width, max_width;
145 |    int* stack;
146 |    int* idx;
147 |    
148 |    assert(t);
149 |    /* If the tree is already built, we don't have to do anything */
150 |    if(t->tree_built)
151 |       return;
152 |    /* With no elements there is nothing to allocate; the tree stays empty */
153 |    if(t->elements_num > 0) {
154 |       /* Allocate the k-D tree memory: 2n slots hold the n leaves and n - 1 interior nodes */
155 |       t->tree_size = 2 * t->elements_num;
156 |       t->tree_safety_boxes = MALLOC(t->tree_size, TBounds2d);
157 |       t->tree_link = MALLOC(t->tree_size, int);
158 |       assert(t->tree_safety_boxes && t->tree_link);
159 |       /* Create the work stack of (node, span min, span max) triples and the element ordering */
160 |       stack_ptr = 0;
161 |       stack = MALLOC(3 * t->tree_size, int);
162 |       idx = MALLOC(t->elements_num, int);
163 |       assert(stack && idx);
164 |       for (i = 0; i <  t->elements_num; i++) {
165 |          idx[i] = i;
166 |       }
167 |       /* Setup the root node of the tree and put it on the stack */
168 |       stack[stack_ptr++] = 0;                   /* Node Number in the Tree */
169 |       stack[stack_ptr++] = 0;                   /* Element Span Minimum */
170 |       stack[stack_ptr++] = t->elements_num - 1; /* Element Span Maximum */
171 |       Bounds_Copy2d(&(t->extent), &(t->tree_safety_boxes[0]));
172 |       next_node = 1;                            /* Slot 0 is the root */
173 |       /* Construct k-D tree by setting up each pair of child nodes */
174 |       while (stack_ptr) {
175 |          /* Pop the top entry off the stack */
176 |          max = stack[--stack_ptr];
177 |          min = stack[--stack_ptr];
178 |          parent = stack[--stack_ptr];
179 |          /* A one-element span is a leaf: its link is the negated element index (<= 0) */
180 |          if ((max - min) == 0) {
181 |             Bounds_Copy2d(&(t->elements[idx[min]]), &(t->tree_safety_boxes[parent]));
182 |             t->tree_link[parent] = - idx[min];
183 |             continue;
184 |          }
185 |          /* Select optimum cutting direction for the parent node's safety box */
186 |          cut_direction = -1;
187 |          max_width = NEGATIVE_INFINITY;
188 |          for (i = 0; i < 2; i++) {
189 |             width = Bounds_WidthAxis2d(&(t->tree_safety_boxes[parent]), i);
190 |             if(width > max_width) {
191 |                max_width = width;
192 |                cut_direction = i;
193 |             }
194 |          }
195 |          assert(cut_direction >= 0);
196 |          /* Median sort of the elements under the parent node, keyed by the center
197 |             of each element's bounding box along the selected cutting direction. */
198 |          mid = (min + max) /2;
199 |          median_sort2d(t, cut_direction, mid - min, max - min + 1, &(idx[min]));
200 |          /* Link the parent to its left child; the right child takes the next slot */
201 |          t->tree_link[parent] = next_node;
202 |          /* Add the "left" child to the tree and the stack */
203 |          stack[stack_ptr++] = next_node;  /* Node Number in the Tree */
204 |          stack[stack_ptr++] = min;        /* Element Span Minimum */
205 |          stack[stack_ptr++] = mid;        /* Element Span Maximum */
206 |          Bounds_Infinite2d(&(t->tree_safety_boxes[next_node]));
207 |          for (i = min; i <= mid; i++) {
208 |             Bounds_AddBounds2d(&(t->tree_safety_boxes[next_node]),
209 |                              &(t->elements[idx[i]]));
210 |          }
211 |          next_node++;
212 |          /* Add the "right" child to the tree and the stack */
213 |          stack[stack_ptr++] = next_node;  /* Node Number in the Tree */
214 |          stack[stack_ptr++] = mid + 1;    /* Element Span Minimum */
215 |          stack[stack_ptr++] = max;        /* Element Span Maximum */
216 |          Bounds_Infinite2d(&(t->tree_safety_boxes[next_node]));
217 |          for (i = mid + 1; i <= max; i++) {   /* only the right span [mid + 1, max] belongs in this box */
218 |             Bounds_AddBounds2d(&(t->tree_safety_boxes[next_node]),
219 |                              &(t->elements[idx[i]]));
220 |          }
221 |          next_node++;
222 |       }
223 |       /* Destroy the temporary arrays */
224 |       FREE(stack);
225 |       FREE(idx);
226 |    }
227 |    /* Mark the tree "built" */
228 |    t->tree_built = true;
229 | }
230 | /* Reports every element whose bounds overlap box: indices go to result_indicies, their count to *result_num. */
231 | void KDTree_QueryBoxIntersect2d(TKDTree2d* t,
232 |                               int* result_num, int* result_indicies,
233 |                               TBounds2d* box)
234 | {
235 |    int stack_ptr, node;
236 |    TBounds2d sb;
237 |    int* stack;
238 |    
239 |    assert(t && result_num && result_indicies && box);
240 |    /* Build the k-D tree if necessary */
241 |    if(!t->tree_built){
242 |       KDTree_CreateTree2d(t);
243 |    }
244 |    /* Start with an empty result list. The caller provides result_indicies;
245 |       up to elements_num entries can be written to it. */
246 |    *result_num = 0;
247 |    /* A tree without elements has no node arrays and no root to visit */
248 |    if(t->tree_size == 0)
249 |       return;
250 |    /* Create the temporary stack array */
251 |    stack_ptr = 0;
252 |    stack = MALLOC(t->tree_size, int);
253 |    assert(stack);
254 |    /* Put the root node of the tree onto the stack */
255 |    stack[stack_ptr++] = 0;
256 |    /* Search the k-D tree until the stack is empty */
257 |    while (stack_ptr) {
258 |       /* Pop the top entry off the stack */
259 |       node = stack[--stack_ptr];
260 |       /* Test the query box against a copy of this node's safety box. The
261 |          epsilon expansion of the copy is currently disabled. */
262 |       Bounds_Copy2d(&(t->tree_safety_boxes[node]), &sb);
263 |       /* If the query box doesn't intersect this node's safety box, we are done
264 |          visiting the node and should continue with the next node */
265 |       if(!Bounds_IsOverlappingBounds2d(&sb, box))
266 |          continue;
267 |       /* A link <= 0 marks a leaf holding a negated element index: report it.
268 |          Otherwise the link is the left child's slot and the right child follows. */
269 |       if(t->tree_link[node] <= 0) {
270 |          result_indicies[*result_num] = - t->tree_link[node];
271 |          (*result_num)++;
272 |       }
273 |       else {
274 |          stack[stack_ptr++] = t->tree_link[node];
275 |          stack[stack_ptr++] = t->tree_link[node] + 1;
276 |       }
277 |    }
278 |    /* Destroy the temporary stack array */
279 |    FREE(stack);
280 | }
281 | 
282 | 


--------------------------------------------------------------------------------
/kdtree/KDTree2d.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
 3 |  *  All rights Reserved.
 4 |  *
 5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
 6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
 7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
 8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
 9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
12 |  * to produce derivative works, such modified software should be clearly marked,
13 |  * so as not to confuse it with the version available from LANL.   
14 |  *
15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
16 |  * use this file except in compliance with the License. You may obtain a copy
17 |  * of the License at 
18 |  *
19 |  * http://www.apache.org/licenses/LICENSE-2.0
20 |  *
21 |  * Unless required by applicable law or agreed to in writing, software distributed
22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
24 |  * specific language governing permissions and limitations under the License.”
25 |  *
26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
27 |  *
28 |  */
29 | 
30 | /* 
31 |  *  Implements a 2-dimensional k-D tree. One begins to use the k-D tree by
32 |  *  adding the bounding box of geometric "elements" to the tree structure
33 |  *  through a call to "KDTree_AddElement2d". Every element should be of the same
34 |  *  type, but could be a single point, a line segment, triangles, etc. Once
35 |  *  all the element bounding boxes have been added, the user of the structure
36 |  *  may make queries against the tree. The actual tree is constructed lazily
37 |  *  when an actual query occurs on the structure.
38 |  *
39 |  *  This version only has one query -- intersection of a box with the elements
40 |  *  and a set of "candidate" elements are returned. The candidates are identified
41 |  *  by an index number (0, ...) signifying the order in which the element was
42 |  *  added to the tree. It is up to the calling code to do additional processing
43 |  *  based on the type of element being used to determine "real" intersections.
44 |  *
45 |  *  The process of actually building the tree takes "n log n" time. Queries 
46 |  *  take "log n" time.
47 |  *
48 |  */
49 | 
50 | #ifndef _KDTree2d_
51 | #define _KDTree2d_
52 | 
53 | #ifdef __cplusplus
54 | extern "C"
55 | {
56 | #endif
57 |   
58 | 
59 | #ifdef HAVE_CONFIG_H
60 | #include "config.h"
61 | #endif
62 | 
63 | #include "Globals2d.h"
64 | #include "Bounds2d.h"
65 |    
66 |    
67 | #define LEFT_HALF   0
68 | #define RIGHT_HALF  1
69 | #define BOTTOM_HALF 0
70 | #define TOP_HALF    1   
71 | 
72 | typedef struct {
73 |    TBounds2d extent;
74 |    int elements_num, elements_allocated;
75 |    TBounds2d* elements;
76 |    boolean tree_built;
77 |    int tree_size;
78 |    TBounds2d* tree_safety_boxes;
79 |    int * tree_link;
80 | } TKDTree2d;
81 | 
82 | extern void KDTree_Initialize2d(TKDTree2d *t);
83 | extern void KDTree_Destroy2d(TKDTree2d* t);
84 | extern void KDTree_AddElement2d(TKDTree2d* t, TBounds2d* add);
85 | extern void KDTree_CreateTree2d(TKDTree2d* t);
86 | extern void KDTree_QueryBoxIntersect2d(TKDTree2d* t,
87 |                                      int* result_num, int* result_indicies,
88 |                                      TBounds2d* box);
89 | #ifdef __cplusplus
90 | }
91 | #endif
92 | 
93 | #endif
94 | 


--------------------------------------------------------------------------------
/neigh.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
  3 |  *  All rights Reserved.
  4 |  *
  5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
  6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
  7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
  8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
  9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
 10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
 11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
 12 |  * to produce derivative works, such modified software should be clearly marked,
 13 |  * so as not to confuse it with the version available from LANL.   
 14 |  *
 15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 16 |  * use this file except in compliance with the License. You may obtain a copy
 17 |  * of the License at 
 18 |  *
 19 |  * http://www.apache.org/licenses/LICENSE-2.0
 20 |  *
 21 |  * Unless required by applicable law or agreed to in writing, software distributed
 22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 24 |  * specific language governing permissions and limitations under the License.”
 25 |  *
 26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
 27 |  *
 28 |  */
 29 | 
 30 | /*
 31 |  *  Authors: Bob Robey       XCP-2   brobey@lanl.gov
 32 |  *           David Nicholaeff        dnic@lanl.gov, mtrxknight@aol.com
 33 |  *           Rachel Robey            rnrobey@gmail.com
 34 |  */
 35 | 
 36 | #include <stdio.h>
 37 | #include <stdlib.h>
 38 | #include <time.h>
 39 | #include <sys/time.h>
 40 | #include <string.h>
 41 | #include <math.h>
 42 | #include <sys/stat.h>
 43 | #include "kdtree/KDTree1d.h"
 44 | #include "gpu.h"
 45 | #include "timer.h"
 46 | 
 47 | #ifdef HAVE_CONFIG_H
 48 | #include "config.h"
 49 | #endif
 50 | 
 51 | #ifdef __APPLE_CC__
 52 | #include <OpenCL/OpenCL.h>
 53 | #else
 54 | #include <CL/cl.h>
 55 | #endif
 56 | 
 57 | #ifdef HAVE_CL_DOUBLE
 58 | typedef double real;
 59 | typedef cl_double cl_real;
 60 | #define ONE 1.0
 61 | #define TWO 2.0
 62 | #else
 63 | typedef float real;
 64 | typedef cl_float cl_real;
 65 | #define ONE 1.0f
 66 | #define TWO 2.0f
 67 | #endif
 68 | 
 69 | #define SQR(x) (( (x)*(x) ))
 70 | 
 71 | typedef unsigned int uint;
 72 | 
 73 | #define CHECK 1
 74 | #define TILE_SIZE 256
 75 | #define DETAILED_TIMING 0
 76 | #define LONG_RUNS 1
 77 | 
 78 | #ifndef MIN
 79 | #define MIN(a,b) ((a)>(b)?(b):(a))
 80 | #define MAX(a,b) ((a)<(b)?(b):(a))
 81 | #endif
 82 | 
 83 | struct neighbor {
 84 |     uint left;
 85 |     uint right;
 86 | };
 87 | 
 88 | struct timespec tstart;
 89 | double time_sum;
 90 | 
 91 | int is_nvidia = 0;
 92 | #define BRUTE_FORCE_SIZE_LIMIT 500000
 93 | 
 94 | cl_context context;
 95 | cl_command_queue queue;
 96 | cl_program program;
 97 | cl_kernel init_kernel, hash_kernel, get_neighbor_kernel;
 98 | 
 99 | void neighbors( uint length, double min_diff, double max_diff, double min_val );
100 | struct neighbor *neighbors_bruteforce( uint length, double *xcoor, double min_val, double max_val);
101 | struct neighbor *neighbors_kdtree( uint length, double *xcoor, double *xmin, double *xmax,
102 |    double min_diff, double max_val, double min_val );
103 | struct neighbor *neighbors_hashcpu( uint length, double *xcoor, double min_diff, double max_val, double min_val );
104 | cl_mem neighbors_hashgpu( uint length, cl_mem data_buffer, double min_diff, double max_val, double min_val, double *time );
105 | double generate_array_wminmax( uint size, double *ptr, double *xmin, double *xmax,
106 |     double mindx, double maxdx, double min, double *max );
107 | 
108 | int main (int argc, const char * argv[]) {
109 | 
110 |     cl_int error;
111 | 
112 | #ifdef HAVE_OPENCL
113 |     GPUInit(&context, &queue, &is_nvidia, &program, "neigh_kern.cl");
114 | 
115 |     init_kernel = clCreateKernel(program, "init_kern", &error);
116 |     hash_kernel = clCreateKernel(program, "hash_kern", &error);
117 |     get_neighbor_kernel = clCreateKernel(program, "get_neighbor_kern", &error);
118 | #endif
119 | 
120 |     printf("\n    Neighbors Performance Results\n\n");
121 |     if (LONG_RUNS == 1)
122 |        printf("Size,   \tBrute,    \tkDtree   \tHash CPU, \tHash GPU\n");
123 |     else
124 |        printf("Size,   \tkDtree   \tHash CPU, \tHash GPU\n");
125 | 
126 |     for (uint max_mult = 1; max_mult <= 32; max_mult *= 2){
127 |        printf("\nMax diff is %d times min_diff\n",max_mult);
128 |        for( uint i = 64; i <= 5000000; i*=2 ) {
129 |           printf("%d,     ", i);
130 |           neighbors(i, 2.0, (double)max_mult*2.0, 0.0);
131 |           printf("\n");
132 |        }
133 |     }
134 | }
135 | 
136 | /* find right and left neighbors of element at index index in array of size length */
137 | void neighbors( uint length, double min_diff, double max_diff, double min_val ) 
138 | {
139 |    double *xcoor, *xmin, *xmax;
140 |    double max_val = min_val; //reset in generate array call
141 |    struct neighbor *neigh_gold, *neigh_test;
142 | 
143 |    xcoor = (double*)malloc(length*sizeof(double));
144 |    xmin  = (double*)malloc(length*sizeof(double));
145 |    xmax  = (double*)malloc(length*sizeof(double));
146 | 
147 |    generate_array_wminmax(length, xcoor, xmin, xmax, min_diff, max_diff, min_val, &max_val);
148 |    //for (uint i=0; i<length; i++) {printf("i %d xcoor %lf\n",i,xcoor[i]);}
149 | 
150 |    if (length < BRUTE_FORCE_SIZE_LIMIT) {
151 |       cpu_timer_start(&tstart);
152 |       neigh_gold = neighbors_bruteforce(length, xcoor, min_val, max_val);
153 |       time_sum += cpu_timer_stop(tstart);
154 |       printf("\t%.6lf,", time_sum);
155 | 
156 | #ifdef XXX
157 |       printf("\n");
158 |       for (uint index=0; index<length; index++){
159 |          int left  = neigh_gold[index].left;
160 |          int right = neigh_gold[index].right;
161 |          printf("%2d: Element %.2lf  \tRight neighbor: index %2d val %.2lf   \tLeft neighbor index %2d val %.2lf\n",
162 |             index, xcoor[index], right, xcoor[right], left, xcoor[left]);
163 |       }
164 | #endif
165 | 
166 |    } else {
167 |       printf("\tnot_run,  ");
168 |    }
169 | 
170 |    cpu_timer_start(&tstart);
171 |    if (length < BRUTE_FORCE_SIZE_LIMIT)
172 |       neigh_test = neighbors_kdtree(length, xcoor, xmin, xmax, min_diff, max_val, min_val);
173 |    else
174 |       neigh_gold = neighbors_kdtree(length, xcoor, xmin, xmax, min_diff, max_val, min_val);
175 | 
176 |    time_sum += cpu_timer_stop(tstart);
177 |    printf("\t%.6lf,", time_sum);
178 | 
179 | #ifdef XXX
180 |    for (uint index=0; index<length; index++){
181 |       if (neigh_test[index].left != neigh_gold[index].left || neigh_test[index].right != neigh_gold[index].right){
182 |          printf("%2d: neigh_test Element %.2lf  \tRight neighbor: index %2d val %.2lf   \tLeft neighbor index %2d val %.2lf\n",
183 |             index, xcoor[index], neigh_test[index].right, xcoor[neigh_test[index].right], neigh_test[index].left, xcoor[neigh_test[index].left]);
184 |          printf("%2d: neigh_gold Element %.2lf  \tRight neighbor: index %2d val %.2lf   \tLeft neighbor index %2d val %.2lf\n",
185 |             index, xcoor[index], neigh_gold[index].right, xcoor[neigh_gold[index].right], neigh_gold[index].left, xcoor[neigh_gold[index].left]);
186 |          printf("\n");
187 |       }
188 |    }
189 |    if (length < 200000) free(neigh_test);
190 | #endif
191 | 
192 |    cpu_timer_start(&tstart);
193 |    neigh_test = neighbors_hashcpu(length, xcoor, min_diff, max_val, min_val);
194 |    time_sum += cpu_timer_stop(tstart);
195 |    printf("\t%.6lf,", time_sum);
196 | 
197 |    for (uint index=0; index<length; index++){
198 |       if (neigh_test[index].left != neigh_gold[index].left || neigh_test[index].right != neigh_gold[index].right){
199 |          printf("%2d: neigh_test Element %.2lf  \tRight neighbor: index %2d val %.2lf   \tLeft neighbor index %2d val %.2lf\n",
200 |             index, xcoor[index], neigh_test[index].right, xcoor[neigh_test[index].right], neigh_test[index].left, xcoor[neigh_test[index].left]);
201 |          printf("%2d: neigh_gold Element %.2lf  \tRight neighbor: index %2d val %.2lf   \tLeft neighbor index %2d val %.2lf\n",
202 |             index, xcoor[index], neigh_gold[index].right, xcoor[neigh_gold[index].right], neigh_gold[index].left, xcoor[neigh_gold[index].left]);
203 |          printf("\n");
204 |       }
205 |    }
206 |    free(neigh_test);
207 | 
208 | 
209 | #ifdef HAVE_OPENCL
210 |    cl_int error = 0;
211 |    cl_mem data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, length*sizeof(real), NULL, &error);
212 |    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
213 |    error = clEnqueueWriteBuffer(queue, data_buffer, CL_TRUE, 0, length*sizeof(real), xcoor, 0, NULL, NULL);
214 |    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
215 | 
216 |    cl_mem neigh_buffer = neighbors_hashgpu(length, data_buffer, min_diff, max_val, min_val, &time_sum);
217 |    clReleaseMemObject(data_buffer);
218 | 
219 |    if (neigh_buffer != NULL) {
220 |       printf("\t%.6lf,", time_sum);
221 | 
222 |       neigh_test = (struct neighbor *)malloc(length*sizeof(struct neighbor));
223 |       error = clEnqueueReadBuffer(queue, neigh_buffer, CL_TRUE, 0, length*sizeof(cl_uint2), neigh_test, 0, NULL, NULL);
224 |       if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
225 |       clReleaseMemObject(neigh_buffer);
226 | 
227 |       for (uint index=0; index<length; index++){
228 |          if (neigh_test[index].left != neigh_gold[index].left || neigh_test[index].right != neigh_gold[index].right){
229 |             printf("%2d: neigh_test Element %.2lf  \tRight neighbor: index %2d val %.2lf   \tLeft neighbor index %2d val %.2lf\n",
230 |                index, xcoor[index], neigh_test[index].right, xcoor[neigh_test[index].right], neigh_test[index].left, xcoor[neigh_test[index].left]);
231 |             printf("%2d: neigh_gold Element %.2lf  \tRight neighbor: index %2d val %.2lf   \tLeft neighbor index %2d val %.2lf\n",
232 |                index, xcoor[index], neigh_gold[index].right, xcoor[neigh_gold[index].right], neigh_gold[index].left, xcoor[neigh_gold[index].left]);
233 |             printf("\n");
234 |          }
235 |       }
236 |       free(neigh_test);
237 |    } else {
238 |       printf("\tnot_run,  ");
239 |    }
240 | #endif
241 | 
242 |    free(xcoor);
243 |    free(xmin);
244 |    free(xmax);
245 |    free(neigh_gold);
246 | }
247 | 
248 | struct neighbor *neighbors_bruteforce( uint length, double *xcoor, double min_val, double max_val ) 
249 | {
250 |    double xleft, xright;
251 |    int left=0, right=0;
252 | 
253 |    struct neighbor *neigh = (struct neighbor *)malloc(length*sizeof(struct neighbor));
254 | 
255 |    for (uint index1 = 0; index1 < length; index1++) {
256 |       left  = index1;
257 |       right = index1;
258 |       xleft = min_val;
259 |       xright = max_val;
260 |       for (uint index2 = 0; index2 < length; index2++) {
261 |          if (index2 == index1) continue;
262 |          if (xcoor[index2] < xcoor[index1] && xcoor[index2] >= xleft  ) {xleft  = xcoor[index2]; left = index2; }
263 | 
264 |          if (xcoor[index2] > xcoor[index1] && xcoor[index2] <= xright ) {xright = xcoor[index2]; right = index2;}
265 |       }
266 |       neigh[index1].left = left;
267 |       neigh[index1].right = right;
268 |    }
269 | 
270 |    return(neigh);
271 | }
272 | 
273 | struct neighbor *neighbors_kdtree( uint length, double *xcoor, double *xmin, double *xmax,
274 |    double min_diff, double max_val, double min_val ) 
275 | {
276 |    TKDTree1d tree;
277 | 
278 |    KDTree_Initialize1d(&tree);
279 | 
280 |    TBounds1d box;
281 |    for(uint i = 0; i < length; i++) {
282 |      box.min.x = xmin[i];
283 |      box.max.x = xmax[i];
284 |      KDTree_AddElement1d(&tree, &box);
285 |    }
286 | 
287 |    struct neighbor *neigh = (struct neighbor *)malloc(length*sizeof(struct neighbor));
288 | 
289 |    int index_list[10];
290 |    int num;
291 |    for (uint index = 0; index < length; index++) {
292 |       neigh[index].left = index;
293 |       neigh[index].right = index;
294 |       box.min.x = xmin[index]-min_diff*0.25;
295 |       box.max.x = xmin[index]-min_diff*0.20;
296 |       KDTree_QueryBoxIntersect1d(&tree, &num, &(index_list[0]), &box);
297 |       if (num == 1) neigh[index].left = index_list[0];
298 | 
299 |       box.min.x = xmax[index]+min_diff*0.20;
300 |       box.max.x = xmax[index]+min_diff*0.25;
301 |       KDTree_QueryBoxIntersect1d(&tree, &num, &(index_list[0]), &box);
302 |       if (num == 1) neigh[index].right = index_list[0];
303 |    }
304 | 
305 |    KDTree_Destroy1d(&tree);
306 | 
307 |    return(neigh);
308 | }
309 | 
310 | /* find right and left neighbors of element at index index in array of size length */
311 | struct neighbor *neighbors_hashcpu( uint length, double *xcoor, double min_diff, double max_val, double min_val ) 
312 | {
313 |    uint hash_size = (uint)((max_val - min_val)/min_diff + 2.5);	//create hash table with buckets of size min_diff -- +2.5 rounds up and adds one space to either side
314 |    int *hash = (int*)malloc(hash_size*sizeof(int));
315 | 	
316 |    /* Sort elements into hash array hash */
317 |    memset(hash, -1, hash_size*sizeof(int));			//set all elements of hash array to -1
318 | 	
319 |    for(uint i = 0; i < length; i++) { hash[(int)((xcoor[i]+min_val)/min_diff)] = i; }
320 |    //place index of current xcoor element into hash according to where the xcoor value
321 | 
322 |    struct neighbor *neigh = (struct neighbor *)malloc(length*sizeof(struct neighbor));
323 | 
324 |    for (uint index = 0; index < length; index++) {
325 |       /* move left and right through hash array from desired element to find its neighbors */
326 |       int idx_new = (int)((xcoor[index]-min_val)/min_diff);	//where the index element is in the hash array
327 |       int left = index, right = index;
328 | 
329 |       for(int i = idx_new+1; i < hash_size; i++) {	//store index of neigbor in original unsorted array, if greatest/least, than left as -1
330 |          if(hash[i] != -1) {
331 |             right = hash[i];
332 |             break;
333 |          }
334 |       }
335 |       for(int i = idx_new-1; i >= 0; i--) {
336 |          if(hash[i]  != -1) {
337 |             left = hash[i];
338 |             break;
339 |          }
340 |       }
341 |       neigh[index].left  = left;
342 |       neigh[index].right = right;
343 |    }
344 | 
345 |    free(hash);
346 | 
347 |    return(neigh);
348 | }
349 | 
#ifdef HAVE_OPENCL
/*
 * Device time spent in one profiled command, in nanoseconds. The event is
 * released. Returns 0 when the event is NULL or its profiling data cannot be
 * read.
 */
static cl_ulong event_elapsed_ns( cl_event event )
{
   /* the profiling API writes cl_ulong values, which `long` does not match everywhere */
   cl_ulong t_start = 0, t_end = 0;
   cl_ulong elapsed = 0;

   if (event == NULL) return 0;

   if (clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(t_start), &t_start, NULL) == CL_SUCCESS &&
       clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(t_end), &t_end, NULL) == CL_SUCCESS) {
      elapsed = t_end - t_start;
   }
   clReleaseEvent(event);

   return elapsed;
}

/*
 * Neighbor search with a perfect hash on the GPU.
 *
 * Runs three kernels in sequence: init_kern fills the hash with -1, hash_kern
 * stores each element's index in its bucket, and get_neighbor_kern scans the
 * hash for each element's left and right neighbor.
 *
 *   data_buffer  device buffer holding `length` coordinates of type `real`
 *   time         receives the summed device time of the three kernels, in
 *                seconds; left untouched when NULL is returned
 *
 * Returns a device buffer of `length` neighbor pairs that the caller must
 * release, or NULL when a device buffer could not be allocated.
 */
cl_mem neighbors_hashgpu( uint length, cl_mem data_buffer, double min_diff, double max_val, double min_val, double *time ) 
{
   cl_int error = 0;

   //create hash table with buckets of size min_diff -- +2.5 rounds up and adds one space to either side
   uint hash_size = (uint)((max_val - min_val)/min_diff + 2.5);

   real min_val_real = (real)min_val;
   real min_diff_real = (real)min_diff;

   /* Allocate both device buffers up front so a failure leaves nothing queued. */
   cl_mem hash_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, hash_size*sizeof(int), NULL, &error);
   if (error != CL_SUCCESS) {
      return(NULL);
   }

   cl_mem neighbor_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, length*sizeof(cl_uint2), NULL, &error);
   if (error != CL_SUCCESS) {
      clReleaseMemObject(hash_buffer);
      return(NULL);
   }

   size_t global_work_size[1];
   size_t local_work_size[1];

   /* events stay NULL when an enqueue fails, so they are never read uninitialised */
   cl_event hash_init_event = NULL;
   cl_event hash_kernel_event = NULL;
   cl_event get_neighbor_event = NULL;

   /******************
    * Init to -1
    *******************/

   error = clSetKernelArg(init_kernel, 0, sizeof(cl_uint), &hash_size);
   if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
   error = clSetKernelArg(init_kernel, 1, sizeof(cl_mem), (void*)&hash_buffer);
   if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);

   /* round the global size up to a whole number of work groups */
   local_work_size[0] = TILE_SIZE;
   global_work_size[0] = ((hash_size+local_work_size[0]-1)/local_work_size[0])*local_work_size[0];

   error = clEnqueueNDRangeKernel(queue, init_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &hash_init_event);
   if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);

   /******************
    * Hash Kernel
    ******************/

   error = clSetKernelArg(hash_kernel, 0, sizeof(real), &min_val_real);
   if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
   error = clSetKernelArg(hash_kernel, 1, sizeof(real), &min_diff_real);
   if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
   error = clSetKernelArg(hash_kernel, 2, sizeof(cl_uint), &length);
   if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
   error = clSetKernelArg(hash_kernel, 3, sizeof(cl_mem), (void*)&data_buffer);
   if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
   error = clSetKernelArg(hash_kernel, 4, sizeof(cl_mem), (void*)&hash_buffer);
   if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);

   global_work_size[0] = ((length+local_work_size[0]-1)/local_work_size[0])*local_work_size[0];

   error = clEnqueueNDRangeKernel(queue, hash_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &hash_kernel_event);
   if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);

   /******************
    * Get Neighbor Kernel
    ******************/

   error = clSetKernelArg(get_neighbor_kernel, 0, sizeof(real), &min_val_real);
   if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
   error = clSetKernelArg(get_neighbor_kernel, 1, sizeof(real), &min_diff_real);
   if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
   error = clSetKernelArg(get_neighbor_kernel, 2, sizeof(cl_uint), &length);
   if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
   error = clSetKernelArg(get_neighbor_kernel, 3, sizeof(cl_mem), (void*)&data_buffer);
   if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
   error = clSetKernelArg(get_neighbor_kernel, 4, sizeof(cl_mem), (void*)&hash_buffer);
   if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
   error = clSetKernelArg(get_neighbor_kernel, 5, sizeof(cl_uint), &hash_size);
   if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
   error = clSetKernelArg(get_neighbor_kernel, 6, sizeof(cl_mem), &neighbor_buffer);
   if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);

   /* same global size as the hash kernel: one work item per element */
   error = clEnqueueNDRangeKernel(queue, get_neighbor_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &get_neighbor_event);
   if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);

   /******************
    * Timing
    ******************/

   if (get_neighbor_event != NULL) clWaitForEvents(1,&get_neighbor_event);

   cl_ulong init_ns  = event_elapsed_ns(hash_init_event);
   cl_ulong hash_ns  = event_elapsed_ns(hash_kernel_event);
   cl_ulong neigh_ns = event_elapsed_ns(get_neighbor_event);

   if (DETAILED_TIMING) printf("\tinit %.6lf,", (double)init_ns*1.0e-9);
   if (DETAILED_TIMING) printf("hash %.6lf,", (double)hash_ns*1.0e-9);
   if (DETAILED_TIMING) printf("neigh %.6lf,", (double)neigh_ns*1.0e-9);

   *time = (double)(init_ns + hash_ns + neigh_ns)*1.0e-9;

   clReleaseMemObject(hash_buffer);

   return(neighbor_buffer);

}
#endif
472 | 
473 | double generate_array_wminmax( uint size, double *ptr, double *xmin, double *xmax,
474 |      double mindx, double maxdx, double min, double *max ) {
475 | 	
476 |      double swap;
477 | 	int index, front = 0;
478 |     double running_min = maxdx;
479 | 		
480 | 	struct timespec tim;				//random seeding
481 | 	cpu_timer_start(&tim);
482 | 	//srand(tim.tv_sec*tim.tv_nsec);
483 | 	
484 | 	srand(0);
485 | 	
486 | 	ptr[0] = min;		//start the array using the minimum value
487 | 	
488 | 	/* for each element, add a random value between mindx and maxdx to the previous element's value */
489 | 	for(int i = 1; i < size; i++) {
490 | 		ptr[i] = ptr[i-1] + mindx + ((double)rand() * (maxdx - mindx) / (double)RAND_MAX);
491 |         if(ptr[i]-ptr[i-1] < running_min) running_min = ptr[i]-ptr[i-1];
492 | 	}
493 | 
494 | 
495 | 	*max = ptr[size-1];					//set the max value to the last element's value
496 | 	//*max = min + (size-1) * maxdx;	//force the range for timings isolating a different variable
497 | 	
498 |         xmin[0] = min;
499 |         for (int i=1; i<size; i++){
500 |            xmin[i] = (ptr[i] + ptr[i-1]) * 0.5;
501 |            xmax[i-1] = xmin[i];
502 |         }
503 |         xmax[size-1]=*max;
504 | 
505 | 	/* Mix up the array by selecting elements from shrinking front portion of array and placing them on back end of array */
506 | 	for(int i = 0; (i < size) && (size - i != 0) ; i++) {
507 | 		index = rand() % (size - i - front) + front;
508 | 		swap = ptr[size-i-1];
509 | 		ptr[size-i-1] = ptr[index];
510 | 		ptr[index] = swap;
511 |                 swap = xmin[size-i-1];
512 |                 xmin[size-i-1] = xmin[index];
513 |                 xmin[index] = swap;
514 |                 swap = xmax[size-i-1];
515 |                 xmax[size-i-1] = xmax[index];
516 |                 xmax[index] = swap;
517 | 	}
518 |     return running_min;
519 | }
520 | 


--------------------------------------------------------------------------------
/neigh2d_kern.cl:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
  3 |  *  All rights Reserved.
  4 |  *
  5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
  6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
  7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
  8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
  9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
 10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
 11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
 12 |  * to produce derivative works, such modified software should be clearly marked,
 13 |  * so as not to confuse it with the version available from LANL.   
 14 |  *
 15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 16 |  * use this file except in compliance with the License. You may obtain a copy
 17 |  * of the License at 
 18 |  *
 19 |  * http://www.apache.org/licenses/LICENSE-2.0
 20 |  *
 21 |  * Unless required by applicable law or agreed to in writing, software distributed
 22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 24 |  * specific language governing permissions and limitations under the License.”
 25 |  *
 26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
 27 |  *
 28 |  */
 29 | 
 30 | /*
 31 |  *  Authors: Bob Robey       XCP-2   brobey@lanl.gov
 32 |  *           David Nicholaeff        dnic@lanl.gov, mtrxknight@aol.com
 33 |  *           Rachel Robey            rnrobey@gmail.com
 34 |  */
 35 | 
 36 | /* neigh2d_kern.cl */
 37 | 
 38 | #ifdef HAVE_CL_DOUBLE
 39 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 40 | typedef double  real;
 41 | #else
 42 | typedef float   real;
 43 | #endif
 44 | 
 45 | struct neighbor2d {
 46 |    uint left;
 47 |    uint right;
 48 |    uint bottom;
 49 |    uint top;
 50 | };
 51 | 
 52 | __kernel void init_kern(
 53 |         const uint size,
 54 | 	__global int *temp) {
 55 | 
 56 | 	const uint idx = get_global_id(0);
 57 | 
 58 |         if (idx >= size) return;
 59 | 
 60 | 	temp[idx] = -1;
 61 | }
 62 | 
 63 | __kernel void hash_kern(
 64 | 	const real min_val,
 65 | 	const real min_diff,
 66 |         const uint length,
 67 | 	__global const real *arr,
 68 | 	__global int *temp) {
 69 | 	
 70 | 	const uint idx = get_global_id(0);
 71 | 	
 72 |         if(idx >= length) return;
 73 | 
 74 |         temp[(uint)((arr[idx]-min_val)/min_diff)] = idx;
 75 | }
 76 | 
 77 | #define hashval(j,i) hash[(j)*imaxsize+(i)]
 78 | 
 79 | __kernel void hash_setup_kern(
 80 |       const uint isize,
 81 |       const uint mesh_size,
 82 |       const uint levmx,
 83 |       __global const int  *levtable,
 84 |       __global const int  *i,
 85 |       __global const int  *j,
 86 |       __global const int  *level,
 87 |       __global int  *hash
 88 |       ) {
 89 | 
 90 |    const uint giX = get_global_id(0);
 91 | 
 92 |    if (giX >= isize) return;
 93 | 
 94 |    int imaxsize = mesh_size*levtable[levmx];
 95 | 
 96 |    int lev = level[giX];
 97 |    int ii = i[giX];
 98 |    int jj = j[giX];
 99 | 
100 |    int levdiff = levmx - lev;
101 | 
102 |    int iimin =  ii   *levtable[levdiff];
103 |    int iimax = (ii+1)*levtable[levdiff];
104 |    int jjmin =  jj   *levtable[levdiff];
105 |    int jjmax = (jj+1)*levtable[levdiff];
106 | 
107 |    for (   int jjj = jjmin; jjj < jjmax; jjj++) {
108 |       for (int iii = iimin; iii < iimax; iii++) {
109 |          hashval(jjj, iii) = giX;
110 |       }
111 |    }
112 | 
113 | }
114 | 
/*
 * Look up the four neighbors of each cell in the hash.
 *
 * The cell's lower-left hash entry is (jj*levmult, ii*levmult) with
 * levmult = levtable[levmx - level]. The neighbors are read one entry to the
 * left and below that corner, and at the first entry past the cell to the
 * right and above it, with every lookup clamped to the hash bounds.
 * Work items with a global id of `isize` or more do nothing.
 */
__kernel void calc_neighbor2d_kern(
      const int isize,
      const uint mesh_size,
      const int levmx,
      __global const int *levtable,
      __global const int *i,
      __global const int *j,
      __global const int *level,
      __global const int *hash,
      __global struct neighbor2d *neigh2d
      ) {

   const uint giX  = get_global_id(0);

   if (giX >= isize) return;

   int imaxsize = mesh_size*levtable[levmx];   // row length, used by hashval()
   int jmaxsize = mesh_size*levtable[levmx];

   const int ii = i[giX];
   const int jj = j[giX];
   const int levmult = levtable[levmx-level[giX]];

   /* the cell's own corner in hash coordinates */
   const int col = ii*levmult;
   const int row = jj*levmult;

   /* clamped positions of the entries just outside the cell */
   const int left_col   = max(col-1, 0);
   const int right_col  = min((ii+1)*levmult, imaxsize-1);
   const int bottom_row = max(row-1, 0);
   const int top_row    = min((jj+1)*levmult, jmaxsize-1);

   neigh2d[giX].left   = hashval(row,        left_col );
   neigh2d[giX].right  = hashval(row,        right_col);
   neigh2d[giX].bottom = hashval(bottom_row, col      );
   neigh2d[giX].top    = hashval(top_row,    col      );
}
149 | 


--------------------------------------------------------------------------------
/neigh_kern.cl:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
  3 |  *  All rights Reserved.
  4 |  *
  5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
  6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
  7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
  8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
  9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
 10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
 11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
 12 |  * to produce derivative works, such modified software should be clearly marked,
 13 |  * so as not to confuse it with the version available from LANL.   
 14 |  *
 15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 16 |  * use this file except in compliance with the License. You may obtain a copy
 17 |  * of the License at 
 18 |  *
 19 |  * http://www.apache.org/licenses/LICENSE-2.0
 20 |  *
 21 |  * Unless required by applicable law or agreed to in writing, software distributed
 22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 24 |  * specific language governing permissions and limitations under the License.”
 25 |  *
 26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
 27 |  *
 28 |  */
 29 | 
 30 | /*
 31 |  *  Authors: Bob Robey       XCP-2   brobey@lanl.gov
 32 |  *           David Nicholaeff        dnic@lanl.gov, mtrxknight@aol.com
 33 |  *           Rachel Robey            rnrobey@gmail.com
 34 |  */
 35 | 
 36 | /* neigh_kern.cl */
 37 | 
 38 | #ifdef HAVE_CL_DOUBLE
 39 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 40 | typedef double  real;
 41 | #else
 42 | typedef float   real;
 43 | #endif
 44 | 
 45 | struct neighbor {
 46 |    uint left;
 47 |    uint right;
 48 | };
 49 | 
 50 | __kernel void init_kern(
 51 |         const uint size,
 52 | 	__global int *temp) {
 53 | 
 54 | 	const uint idx = get_global_id(0);
 55 | 
 56 |         if (idx >= size) return;
 57 | 
 58 | 	temp[idx] = -1;
 59 | }
 60 | 
 61 | __kernel void hash_kern(
 62 | 	const real min_val,
 63 | 	const real min_diff,
 64 |         const uint length,
 65 | 	__global const real *arr,
 66 | 	__global int *temp) {
 67 | 	
 68 | 	const uint idx = get_global_id(0);
 69 | 	
 70 |         if(idx >= length) return;
 71 | 
 72 |         temp[(uint)((arr[idx]-min_val)/min_diff)] = idx;
 73 | }
 74 | 
 75 | __kernel void get_neighbor_kern(
 76 | 	const real min_val,
 77 | 	const real min_diff,
 78 |         const uint length,
 79 | 	__global const real *arr,
 80 | 	__global const int *temp,
 81 |         const uint temp_size,
 82 |         __global struct neighbor *neighbor_buffer) {
 83 | 	
 84 | 	const uint idx = get_global_id(0);
 85 | 	
 86 |         if(idx >= length) return;
 87 | 
 88 |         int idx_new = (int)((arr[idx]-min_val)/min_diff);
 89 | 
 90 |         int left = idx;
 91 |         int right = idx;
 92 | 
 93 |         for (int i = idx_new+1; i < temp_size; i++) {
 94 |            if (temp[i] != -1) {
 95 |               right = temp[i];
 96 |               break;
 97 |            }
 98 |         }
 99 | 
100 |         for (int i = idx_new-1; i >= 0; i--) {
101 |            if (temp[i] != -1) {
102 |               left = temp[i];
103 |               break;
104 |            }
105 |         }
106 | 
107 |         neighbor_buffer[idx].left  = left;
108 |         neighbor_buffer[idx].right = right;
109 | }
110 | 


--------------------------------------------------------------------------------
/remap2d_kern.cl:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
  3 |  *  All rights Reserved.
  4 |  *
  5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
  6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
  7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
  8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
  9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
 10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
 11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
 12 |  * to produce derivative works, such modified software should be clearly marked,
 13 |  * so as not to confuse it with the version available from LANL.   
 14 |  *
 15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 16 |  * use this file except in compliance with the License. You may obtain a copy
 17 |  * of the License at 
 18 |  *
 19 |  * http://www.apache.org/licenses/LICENSE-2.0
 20 |  *
 21 |  * Unless required by applicable law or agreed to in writing, software distributed
 22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 25 |  *
 26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
 27 |  *
 28 |  */
 29 | 
 30 | /*
 31 |  *  Authors: Bob Robey       XCP-2   brobey@lanl.gov
 32 |  *           David Nicholaeff        dnic@lanl.gov, mtrxknight@aol.com
 33 |  *           Rachel Robey            rnrobey@gmail.com
 34 |  */
 35 | 
/* remap2d_kern.cl */
 37 | 
 38 | #ifdef HAVE_CL_DOUBLE
 39 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 40 | typedef double  real;
 41 | #else
 42 | typedef float   real;
 43 | #endif
 44 | 
 45 | // Cartesian Coordinate Indexing
 46 | #define two_to_the(ishift)       (1u <<(ishift) )
 47 | #define four_to_the(ishift)      (1u << ( (ishift)*2 ) )
 48 | 
 49 | /* Remap Kernels */
 50 | __kernel void remap_hash_creation_kern(
 51 |    __global int* hash_table,
 52 |    __global const int* i,
 53 |    __global const int* j,
 54 |    __global const int* level,
 55 |    const int ncells_a,
 56 |    const int mesh_size,
 57 |    const int levmx) {
 58 | 
 59 |    const int ic = get_global_id(0);
 60 | 
 61 |    uint i_max = mesh_size*two_to_the(levmx);
 62 | 
 63 |    if(ic < ncells_a) {
 64 |        int ii = i[ic];
 65 |        int jj = j[ic];
 66 |        int lev = level[ic];
 67 |        // If at the maximum level just set the one cell
 68 |        if (lev == levmx) {
 69 |            hash_table[(jj*i_max)+ii] = ic;
 70 |        } else {
 71 |            // Set the square block of cells at the finest level
 72 |            // to the index number
 73 |            int lev_mod = two_to_the(levmx - lev);
 74 |            for (int jjj = jj*lev_mod; jjj < (jj+1)*lev_mod; jjj++) {
 75 |               for (int iii = ii*lev_mod; iii < (ii+1)*lev_mod; iii++) {
 76 |                   hash_table[(jjj*i_max)+iii] = ic;
 77 |               }
 78 |            }
 79 |        }
 80 |    }
 81 | 
 82 | }
 83 | 
 84 | 
 85 | __kernel void remap_hash_retrieval_kern(
 86 |    __global real* V_remap,
 87 |    __global const real* V_a,
 88 |    __global const int* hash_table,
 89 |    __global const int* mesh_a_i,
 90 |    __global const int* mesh_a_j,
 91 |    __global const int* mesh_a_level,
 92 |    __global const int* mesh_b_i,
 93 |    __global const int* mesh_b_j,
 94 |    __global const int* mesh_b_level,
 95 |    const int ncells_b,
 96 |    const int mesh_size,
 97 |    const int levmx) {
 98 | 
 99 |    const int jc = get_global_id(0);
100 | 
101 |    uint i_max = mesh_size*two_to_the(levmx);
102 | 
103 |    if(jc < ncells_b) {
104 |       int ii = mesh_b_i[jc];
105 |       int jj = mesh_b_j[jc];
106 |       int lev = mesh_b_level[jc];
107 |       int lev_mod = two_to_the(levmx - lev);
108 |       real val_sum = 0.0;
109 |       for(int jjj = jj*lev_mod; jjj < (jj+1)*lev_mod; jjj++) {
110 |          for(int iii = ii*lev_mod; iii < (ii+1)*lev_mod; iii++) {
111 |             int ic = hash_table[jjj*i_max+iii];
112 |             val_sum += V_a[ic] / (real)four_to_the(levmx-mesh_a_level[ic]);
113 |          }
114 |       }
115 |       V_remap[jc] += val_sum;
116 |    }
117 | 
118 | }
119 | 
120 | 


--------------------------------------------------------------------------------
/remap_kern.cl:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
  3 |  *  All rights Reserved.
  4 |  *
  5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
  6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
  7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
  8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
  9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
 10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
 11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
 12 |  * to produce derivative works, such modified software should be clearly marked,
 13 |  * so as not to confuse it with the version available from LANL.   
 14 |  *
 15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 16 |  * use this file except in compliance with the License. You may obtain a copy
 17 |  * of the License at 
 18 |  *
 19 |  * http://www.apache.org/licenses/LICENSE-2.0
 20 |  *
 21 |  * Unless required by applicable law or agreed to in writing, software distributed
 22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 25 |  *
 26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
 27 |  *
 28 |  */
 29 | 
 30 | /*
 31 |  *  Authors: Bob Robey       XCP-2   brobey@lanl.gov
 32 |  *           David Nicholaeff        dnic@lanl.gov, mtrxknight@aol.com
 33 |  *           Rachel Robey            rnrobey@gmail.com
 34 |  */
 35 | 
 36 | /* remap_kern.cl */
 37 | 
 38 | #ifdef HAVE_CL_DOUBLE
 39 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 40 | typedef double  real;
 41 | #else
 42 | typedef float   real;
 43 | #endif
 44 | 
 45 | #ifndef MIN
 46 | #define MIN(a,b) ((a)>(b)?(b):(a))
 47 | #endif
 48 | 
 49 | struct rcell {
 50 |     real low;
 51 |     real high;
 52 | };
 53 | 
 54 | __kernel void hash_kern(
 55 | 	const real min_val,
 56 | 	const real min_diff,
 57 |     const uint length,
 58 | 	__global const real *arr,
 59 | 	__global int *temp) {
 60 | 	
 61 | 	const uint idx = get_global_id(0);
 62 | 	
 63 |     if(idx >= length) return;
 64 | 
 65 |     temp[(uint)((arr[idx]-min_val)/min_diff)] = idx;
 66 | }
 67 | 
 68 | 
 69 | /* Remap Kernels */
 70 | 
 71 | __kernel void cellHash_kern(
 72 |     const real min_val,
 73 |     const real min_diff,
 74 |     const uint length,
 75 |     __global const struct rcell *arr,
 76 |     __global int *temp) {
 77 |     
 78 |     const uint idx = get_global_id(0);
 79 |     
 80 |     if( idx < length ) {
 81 |     
 82 |         uint start = (int)((arr[idx].low+min_val)/min_diff);
 83 |         uint end = (int)((arr[idx].high+min_val)/min_diff);
 84 |     
 85 |         while( start < end ) {
 86 |             temp[start] = idx;
 87 |             start++;
 88 |         }
 89 |     }
 90 | 
 91 | }
 92 | 
 93 | __kernel void remap1_kern(
 94 |     const real min_val,
 95 |     const real mindx,
 96 |     const uint hash_size,
 97 |     const uint bsize,
 98 |     __global struct rcell *arr_a,
 99 |     __global real *arr_v,
100 |     __global struct rcell *arr_b,
101 |     __global int *hash,
102 |     __global real *remap) {
103 |     
104 |     const uint idx = get_global_id(0);
105 |     if( idx < bsize ) {
106 |     
107 |         uint start = (arr_b[idx].low - min_val)/mindx;
108 |         uint end = (arr_b[idx].high - min_val)/mindx;
109 |     
110 |         if(start > hash_size - 1) { remap[idx] = 0.0; return; }
111 |         if(end > hash_size) end = hash_size;
112 |     
113 |         remap[idx] = 0.;
114 |         for( uint i = start; i < end; i++ ) {
115 |             if(hash[i] >= 0) {
116 |                 remap[idx] += arr_v[hash[i]] * 1./(arr_a[hash[i]].high - arr_a[hash[i]].low);   //assume state variable value of 1 in each original cell
117 |             }
118 |         }
119 |     }
120 | }
121 | 
122 | 


--------------------------------------------------------------------------------
/sort.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
  3 |  *  All rights Reserved.
  4 |  *
  5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
  6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
  7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
  8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
  9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
 10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
 11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
 12 |  * to produce derivative works, such modified software should be clearly marked,
 13 |  * so as not to confuse it with the version available from LANL.   
 14 |  *
 15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 16 |  * use this file except in compliance with the License. You may obtain a copy
 17 |  * of the License at 
 18 |  *
 19 |  * http://www.apache.org/licenses/LICENSE-2.0
 20 |  *
 21 |  * Unless required by applicable law or agreed to in writing, software distributed
 22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 25 |  *
 26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
 27 |  *
 28 |  */
 29 | 
 30 | /*
 31 |  *  Authors: Bob Robey       XCP-2   brobey@lanl.gov
 32 |  *           David Nicholaeff        dnic@lanl.gov, mtrxknight@aol.com
 33 |  *           Rachel Robey            rnrobey@gmail.com
 34 |  */
 35 | 
 36 | #include <stdio.h>
 37 | #include <stdlib.h>
 38 | #include <time.h>
 39 | #include <sys/time.h>
 40 | #include <string.h>
 41 | #include <math.h>
 42 | #include <sys/stat.h>
 43 | #include "gpu.h"
 44 | #include "timer.h"
 45 | 
 46 | #ifdef HAVE_CONFIG_H
 47 | #include "config.h"
 48 | #endif
 49 | 
 50 | #ifdef __APPLE_CC__
 51 | #include <OpenCL/OpenCL.h>
 52 | #else
 53 | #include <CL/cl.h>
 54 | #endif
 55 | 
 56 | #ifdef HAVE_CL_DOUBLE
 57 | typedef double real;
 58 | typedef cl_double cl_real;
 59 | #define EPS 1.0e-12
 60 | #else
 61 | typedef float real;
 62 | typedef cl_float cl_real;
 63 | #define EPS 1.0e-7
 64 | #endif
 65 | 
 66 | #define SQR(x) (( (x)*(x) ))
 67 | 
 68 | typedef unsigned int uint;
 69 | 
 70 | #define CHECK 1
 71 | #define TILE_SIZE 256
 72 | #define DETAILED_TIMING 0
 73 | 
 74 | struct timespec tstart;
 75 | double time_sum;
 76 | 
 77 | int is_nvidia = 0;
 78 | 
 79 | cl_context context;
 80 | cl_command_queue queue;
 81 | cl_program program;
 82 | cl_kernel init_kernel, hash_kernel, scan1_kernel, scan2_kernel, scan3_kernel;
 83 | 
 84 | void sorts( uint length, double min_diff, double max_diff, double min_val );
 85 | cl_mem parallelHash( uint length, cl_mem arr, double min_diff, double max_diff, double min_val, double max_val, double *time );
 86 | double* hashsort( uint length, double *arr, double min_diff, double min_val, double max_val );
 87 | double generate_array( uint size, double *ptr, double mindx, double maxdx, double min, double *max );
 88 | 
 89 | //int compare (const void * a, const void * b) { return ( *(double*)a - *(double*)b ); }
 90 | 
 91 | int compare (const void *a, const void *b)
 92 | {
 93 |   const double *da = (const double *) a;
 94 |   const double *db = (const double *) b;
 95 | 
 96 |   return (*da > *db) - (*da < *db);
 97 | }
 98 | 
 99 | int main (int argc, const char * argv[]) 
100 | {
101 |     cl_int error;
102 | 
103 | #ifdef HAVE_OPENCL
104 |     GPUInit(&context, &queue, &is_nvidia, &program, "sort_kern.cl");
105 | #endif
106 | 
107 |     struct timespec tim;                //random seeding
108 |     clock_gettime(CLOCK_MONOTONIC, &tim);
109 |     //srand(tim.tv_sec*tim.tv_nsec);
110 | 
111 |     srand(0);
112 | 
113 | #ifdef HAVE_OPENCL
114 |     init_kernel = clCreateKernel(program, "init_kern", &error);
115 |     hash_kernel = clCreateKernel(program, "hash_kern", &error);
116 |     scan1_kernel = clCreateKernel(program, "scan1", &error);
117 |     scan2_kernel = clCreateKernel(program, "scan2", &error);
118 |     scan3_kernel = clCreateKernel(program, "scan3", &error);
119 | #endif
120 | 
121 |     printf("\n    Sorting Performance Results\n\n");
122 | #ifdef __APPLE_CC__
123 |     printf("Size,   \tQsort,    \tHeapsort, \tMergesort, \tHash CPU, \tHash GPU\n");
124 | #else
125 |     printf("Size,   \tQsort,    \tHash CPU, \tHash GPU\n");
126 | #endif
127 | 
128 |     uint max_size = 0;
129 | #ifdef HAVE_CL_DOUBLE
130 |     max_size = 100000000;
131 | #else
132 |     max_size = 10000000;
133 | #endif
134 |     //else max_size = 131071;
135 | 
136 |     for (uint max_mult = 2; max_mult <= 8; max_mult *= 2){
137 |        printf("\nMax diff is %d times min_diff\n",max_mult);
138 |        for( uint i = 1024; i <= max_size; i*=2 ) {
139 | #ifndef HAVE_CL_DOUBLE
140 |           if (max_mult > 2  && i > 5000000) continue;
141 |           if (max_mult > 4  && i > 4000000) continue;
142 |           if (max_mult > 8  && i > 2000000) continue;
143 |           if (max_mult > 16  && i > 1000000) continue;
144 | #endif
145 |           if (max_mult > 10 && i > 50000000) continue;
146 |           if (max_mult > 30 && i > 20000000) continue;
147 |           printf("%d,     ", i);
148 |           sorts(i, 2.0, (double)max_mult*2.0, 0.0);
149 |           printf("\n");
150 |        }
151 |     }
152 | 
153 | }
154 | 
155 | void sorts( uint length, double min_diff, double max_diff, double min_val ) {
156 |     int icount;
157 |     cl_int error = 0;
158 |     double max_val = min_val; //reset in generate_array call
159 |     double *sorted=NULL, *sort_test=NULL, *arr=NULL;
160 |     
161 |     arr = (double*)malloc(length*sizeof(double));
162 |     
163 |     //generate randomly shuffled array with given conditions to be sorted
164 |     generate_array(length, arr, min_diff, max_diff, min_val, &max_val);
165 |     
166 |     /* Qsort */
167 |     sorted = (double*)malloc(length*sizeof(double));
168 |     for(uint i = 0; i < length; i++) { sorted[i] = arr[i]; }
169 |     cpu_timer_start(&tstart);
170 |     qsort(sorted, length, sizeof(double), compare);
171 |     time_sum += cpu_timer_stop(tstart);
172 |     printf("\t%.6lf,", time_sum);
173 | 
174 | 
175 | #ifdef __APPLE_CC__
176 |     /* Heapsort */
177 |     sort_test = (double*)malloc(length*sizeof(double));
178 |     for(uint i = 0; i < length; i++) { sort_test[i] = arr[i]; }
179 |     cpu_timer_start(&tstart);
180 |     heapsort(sort_test, length, sizeof(double), compare);
181 |     time_sum += cpu_timer_stop(tstart);
182 |     printf("\t%.6lf,", time_sum);
183 | #ifdef CHECK
184 |     for(uint i = 0; i < length; i++) { if (sort_test[i] != sorted[i]) printf("Check failed for heapsort index %d heapsort value %lf gold standard %lf\n",i,sort_test[i],sorted[i]); }
185 | #endif
186 |     free(sort_test);
187 |     sort_test = NULL;
188 | 
189 |     /* Mergesort */
190 |     sort_test = (double*)malloc(length*sizeof(double));
191 |     for(uint i = 0; i < length; i++) { sort_test[i] = arr[i]; }
192 |     cpu_timer_start(&tstart);
193 |     mergesort(sort_test, length, sizeof(double), compare);
194 |     time_sum += cpu_timer_stop(tstart);
195 |     printf("\t%.6lf,", time_sum);
196 | #ifdef CHECK
197 |     for(uint i = 0; i < length; i++) { if (sort_test[i] != sorted[i]) printf("Check failed for mergesort index %d mergesort value %lf gold standard %lf\n",i,sort_test[i],sorted[i]); }
198 | #endif
199 |     free(sort_test);
200 |     sort_test = NULL;
201 | #endif
202 | 
203 | 
204 |     /* Hashsort CPU */
205 |     cpu_timer_start(&tstart);
206 |     sort_test = hashsort(length, arr, min_diff, min_val, max_val);
207 |     time_sum += cpu_timer_stop(tstart);
208 |     printf("\t%.6lf,", time_sum);
209 | #ifdef CHECK
210 |     icount=0;
211 |     for(uint i = 0; i < length; i++) {
212 |        if (sort_test[i] != sorted[i]) {
213 |           printf("Check failed for hashsort CPU index %d hashsort value %lf gold standard %lf\n",i,sort_test[i],sorted[i]);
214 |           icount++;
215 |        }
216 |     }
217 | #endif
218 |     free(sort_test);
219 |     sort_test = NULL;
220 | 
221 | 
222 | #ifdef HAVE_OPENCL
223 |     uint hash_size = (uint)((max_val - min_val)/min_diff + 2.5);
224 |     uint alloc_size = 2*length*sizeof(real)+hash_size*sizeof(int)+(hash_size+hash_size-1)/TILE_SIZE*sizeof(int);
225 |     //printf("\tSize is %lu\t", alloc_size);
226 |     if (is_nvidia || alloc_size < 850000000) {
227 |        /* Hashsort GPU */
228 |        real *arr_real = (real*)malloc(length*sizeof(real));
229 |        for(uint i = 0; i < length; i++) { arr_real[i] = (real)arr[i]; }
230 |        cl_mem xcoor_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, length*sizeof(real), NULL, &error);
231 |        cl_mem sorted_buffer = NULL;
232 |        if (xcoor_buffer != NULL) {
233 |           if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
234 |           error = clEnqueueWriteBuffer(queue, xcoor_buffer, CL_TRUE, 0, length*sizeof(real), arr_real, 0, NULL, NULL);
235 |           if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
236 | 
237 |           sorted_buffer = parallelHash(length, xcoor_buffer,  min_diff, max_diff, min_val, max_val, &time_sum);
238 |           clReleaseMemObject(xcoor_buffer);
239 |        }
240 |        free(arr_real);
241 |        if (sorted_buffer != NULL) {
242 | 
243 |           real *sort_real = (real*)malloc(length*sizeof(real));
244 |           error = clEnqueueReadBuffer(queue, sorted_buffer, CL_TRUE, 0, length*sizeof(real), sort_real, 0, NULL, NULL);
245 |           if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
246 |           clReleaseMemObject(sorted_buffer);
247 | 
248 |           printf("\t%.6lf,", time_sum);
249 |           sort_test = (double*)malloc(length*sizeof(double));
250 |           for(uint i = 0; i < length; i++) { sort_test[i] = (double)sort_real[i]; }
251 |           free(sort_real);
252 | #ifdef CHECK
253 | 
254 |           icount=0;
255 |           for(uint i = 0; i < length; i++) {
256 |              if (fabs(sort_test[i] - sorted[i])/sorted[i] > EPS) {
257 |                 printf("Check failed for hashsort GPU index %d hashsort value %lf gold standard %lf\n",i,sort_test[i],sorted[i]);
258 |                 icount++;
259 |              }
260 |              if (icount > 20) exit(0);
261 |           }
262 | #endif
263 |           free(sort_test);
264 |           sort_test = NULL;
265 |        } else {
266 |           printf("\tnot_run,  ");
267 |        } 
268 |     } else {
269 |        printf("\tnot_run,  ");
270 |     }
271 | #endif
272 | 
273 | 
274 |     free(sorted);
275 |     sorted = NULL;
276 |     free(arr);
277 |     arr=NULL;
278 | }
279 | 
/*
 * Sort an array with a perfect hash: every value is dropped into the bucket
 * (value - min_val) / min_diff and the buckets are then read out in order.
 *
 *   length    number of elements in arr
 *   arr       values to sort; not modified
 *   min_diff  bucket width; must be > 0 and no larger than the smallest gap
 *             between two distinct values, otherwise values share a bucket
 *             and only the last one written survives
 *   min_val   smallest value in arr
 *   max_val   largest value in arr
 *
 * Returns a newly allocated array of `length` doubles that the caller must
 * free, or NULL when the arguments are invalid (NULL arr, min_diff <= 0,
 * max_val < min_val, range too large for the table) or memory runs out.
 *
 * Values that fall outside the table are skipped instead of being written
 * out of bounds. If elements were skipped or collided, the unused tail of
 * the result is filled with 0.0 so the caller never reads uninitialized
 * memory.
 */
double* hashsort( uint length, double *arr, double min_diff, double min_val, double max_val ) {
    uint hash_size;
    int *hash=NULL;
    double *sorted=NULL;

    /* Written as negated comparisons so that NaN arguments are rejected too */
    if (arr == NULL || !(min_diff > 0.0) || !(max_val >= min_val)) return NULL;

    //create hash table with buckets of size min_diff 
    //   -- +2.5 rounds up and adds one space to either side
    double nbuckets = (max_val - min_val)/min_diff + 2.5;
    if (!(nbuckets < (double)(uint)-1)) return NULL;   // would not fit in a uint
    hash_size = (uint)nbuckets;

    /* Ask for at least one element so that length == 0 still gets a pointer */
    sorted = malloc((length > 0 ? length : 1)*sizeof(double));
    hash = malloc(hash_size*sizeof(int));
    if (sorted == NULL || hash == NULL) {
        free(hash);
        free(sorted);
        return NULL;
    }

    //set all elements of hash array to -1
    memset(hash, -1, hash_size*sizeof(int));

    for(uint i = 0; i < length; i++) {
        //place index of current arr element into the bucket its value maps to
        double pos = (arr[i]-min_val)/min_diff;

        /* Truncation maps (-1, hash_size) onto valid buckets; anything else
           (including NaN) has no bucket and is left out */
        if (!(pos > -1.0 && pos < (double)hash_size)) continue;

        hash[(uint)pos] = (int)i;
    }

    //sweep through hash and put set values in a sorted array
    uint count=0;
    for(uint bucket = 0; bucket < hash_size; bucket++) {
        if(hash[bucket] >= 0) {
            sorted[count] = arr[hash[bucket]];
            count++;
        }
    }

    /* Define the tail when fewer than length values made it into the table */
    while (count < length) {
        sorted[count] = 0.0;
        count++;
    }

    free(hash);
    return sorted;
}
312 | 
/*
 * Fill ptr with `size` values in random order.
 *
 * In sorted order the values start at `min`, and each one exceeds the
 * previous by a random amount between mindx and maxdx. The largest value is
 * stored in *max. The random numbers come from rand(), so the result is
 * reproducible for a given srand() seed.
 *
 *   size   number of elements to generate
 *   ptr    output array with room for `size` doubles
 *   mindx  smallest step between consecutive sorted values
 *   maxdx  largest step between consecutive sorted values
 *   min    smallest value
 *   max    receives the largest value
 *
 * Returns the smallest step that was actually generated, or maxdx when no
 * step was generated (size <= 1). With size == 0 nothing is written to ptr
 * and *max is set to min.
 */
double generate_array( uint size, double *ptr, double mindx, double maxdx, double min, double *max ) {

    double running_min = maxdx;

    /* Nothing to generate; without this guard ptr[0] and ptr[size-1] would
       be accessed outside the array */
    if (size == 0) {
        *max = min;
        return running_min;
    }

    ptr[0] = min;        //start the array using the minimum value

    /* for each element, add a random value between mindx and maxdx to the previous element's value */
    for(uint i = 1; i < size; i++) {
        ptr[i] = ptr[i-1] + mindx + ((double)rand() * (maxdx - mindx) / (double)RAND_MAX);
        double step = ptr[i]-ptr[i-1];
        if(step < running_min) running_min = step;
    }

    *max = ptr[size-1];                    //set the max value to the last element's value

    /* Mix up the array by selecting elements from shrinking front portion of array and placing them on back end of array */
    for(uint i = 0; i < size; i++) {
        uint back = size - i - 1;
        uint pick = (uint)rand() % (size - i);
        double swap = ptr[back];
        ptr[back] = ptr[pick];
        ptr[pick] = swap;
    }
    return running_min;
}
341 | 
342 | #ifdef HAVE_OPENCL
343 | cl_mem parallelHash( uint length, cl_mem xcoor_buffer, double min_diff, double max_diff, double min_val, double max_val, double *time ) {
344 | 
345 |     cl_mem sorted_buffer, hash_buffer, ioffset_buffer;
346 |  
347 |     cl_int error = 0;
348 |     long gpu_time = 0;
349 |  
350 |     uint hash_size = (uint)((max_val - min_val)/min_diff + 2.5);
351 |  
352 |     real min_val_real = (real)min_val;
353 |     real min_diff_real = (real)min_diff;
354 |     
355 |     hash_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, hash_size*sizeof(int), NULL, &error);
356 |     if (error != CL_SUCCESS) {
357 |        //printf("Error is %d at line %d\n",error,__LINE__);
358 |        return(NULL);
359 |     }
360 | 
361 | /******************
362 |  * Init to -1
363 |  ******************/
364 |   
365 |     error = clSetKernelArg(init_kernel, 0, sizeof(cl_uint), &hash_size);
366 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
367 |     error = clSetKernelArg(init_kernel, 1, sizeof(cl_mem), (void*)&hash_buffer);
368 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
369 |  
370 |     size_t global_work_size[1];
371 |     size_t local_work_size[1];
372 |     
373 |     local_work_size[0] = TILE_SIZE;
374 |     global_work_size[0] = ((hash_size+local_work_size[0]-1)/local_work_size[0])*local_work_size[0];
375 |     
376 |     cl_event hash_init_event;
377 |  
378 |     error = clEnqueueNDRangeKernel(queue, init_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &hash_init_event);
379 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
380 | 
381 | /******************
382 |  * Hash Kernel
383 |  ******************/
384 |      
385 |     error = clSetKernelArg(hash_kernel, 0, sizeof(real), &min_val_real);
386 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
387 |     error = clSetKernelArg(hash_kernel, 1, sizeof(real), &min_diff_real);
388 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
389 |     error = clSetKernelArg(hash_kernel, 2, sizeof(cl_uint), &length);
390 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
391 |     error = clSetKernelArg(hash_kernel, 3, sizeof(cl_mem), (void*)&xcoor_buffer);
392 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
393 |     error = clSetKernelArg(hash_kernel, 4, sizeof(cl_mem), (void*)&hash_buffer);
394 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
395 |  
396 |     global_work_size[0] = ((length+local_work_size[0]-1)/local_work_size[0])*local_work_size[0];
397 |  
398 |     cl_event hash_kernel_event;
399 |     
400 |     error = clEnqueueNDRangeKernel(queue, hash_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &hash_kernel_event);
401 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
402 | 
403 | /***********************
404 |  * Prefix Scan Kernels
405 |  ***********************/
406 | 
407 |     /* scan 1 */
408 |     global_work_size[0] = ((hash_size+local_work_size[0]-1)/local_work_size[0])*local_work_size[0];
409 |  
410 |     int group_size = (int)(global_work_size[0]/local_work_size[0]);
411 |     
412 |     ioffset_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, group_size*sizeof(uint), NULL, &error);
413 |     if (error != CL_SUCCESS) {
414 |        //printf("Error is %d at line %d\n",error,__LINE__);
415 |        clReleaseMemObject(hash_buffer);
416 |        return(NULL);
417 |     }
418 |   
419 |     error = clSetKernelArg(scan1_kernel, 0, sizeof(cl_uint), &hash_size);
420 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
421 |     error = clSetKernelArg(scan1_kernel, 1, sizeof(cl_mem), (void*)&ioffset_buffer);
422 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
423 |     error = clSetKernelArg(scan1_kernel, 2, local_work_size[0]*sizeof(uint), NULL);
424 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
425 |     error = clSetKernelArg(scan1_kernel, 3, sizeof(cl_mem), (void*)&hash_buffer);
426 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
427 |   
428 |     cl_event scan1_event;
429 |     
430 |     error = clEnqueueNDRangeKernel(queue, scan1_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &scan1_event);
431 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
432 | 
433 |     //clWaitForEvents(1, &scan1_event);
434 |     //exit(0);
435 | 
436 |     /* scan 2 */
437 |     //global_work_size[0] = ((group_size+local_work_size[0]-1)/local_work_size[0])*local_work_size[0];
438 |     global_work_size[0] = local_work_size[0];
439 | 
440 |     cl_event scan2_event;
441 |     
442 |     //printf("\n local: %d global: %d\n", local_work_size[0], global_work_size[0]);
443 | 
444 |         
445 |     int elements_per_thread = (group_size+local_work_size[0]-1)/local_work_size[0];
446 |     //printf("\ngroup_size %d EPT %d\n",group_size,elements_per_thread );
447 |                 
448 |     error = clSetKernelArg(scan2_kernel, 0, local_work_size[0]*sizeof(uint), NULL);
449 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
450 |     error = clSetKernelArg(scan2_kernel, 1, sizeof(cl_mem), (void*)&ioffset_buffer);
451 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
452 |     error = clSetKernelArg(scan2_kernel, 2, sizeof(uint), &group_size);
453 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
454 |     error = clEnqueueNDRangeKernel(queue, scan2_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &scan2_event);
455 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
456 | 
457 | #ifdef XXX
458 |     uint *ioffset = (uint *)malloc(group_size*sizeof(uint));
459 |     error = clEnqueueReadBuffer(queue, ioffset_buffer, CL_TRUE, 0, group_size*sizeof(uint), ioffset, 0, NULL, NULL);
460 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
461 | 
462 |     printf("\n");
463 |     for (uint i=0; i<group_size; i++){
464 |        printf("%d ioffset %u\n",i,ioffset[i]);
465 |     }
466 |  
467 |     uint *mailbox = (uint *)malloc(local_work_size[0]*sizeof(uint));
468 |     error = clEnqueueReadBuffer(queue, mailbox_buffer, CL_TRUE, 0, local_work_size[0]*sizeof(int), mailbox, 0, NULL, NULL);
469 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
470 | 
471 |     //printf("\n");
472 |     //for (int i=0; i<local_work_size[0]; i++){
473 |     //   printf("%d mailbox %d\n",i,mailbox[i]);
474 |     //}
475 | 
476 |     //int *hash = (int *)malloc(hash_size*sizeof(int));
477 |     //error = clEnqueueReadBuffer(queue, hash_buffer, CL_TRUE, 0, hash_size*sizeof(int), hash, 0, NULL, NULL);
478 |     //if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
479 | 
480 |     //printf("\n");
481 |     //for (int i=0; i<hash_size; i++){
482 |     //   printf("%d hash %d\n",i,hash[i]);
483 |     //}
484 | #endif
485 |  
486 |     /* scan 3 */
487 |     sorted_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, length*sizeof(real), NULL, &error);
488 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
489 |     
490 |     global_work_size[0] = ((hash_size+local_work_size[0]-1)/local_work_size[0])*local_work_size[0];
491 |         
492 |     error = clSetKernelArg(scan3_kernel, 0, sizeof(cl_uint), &hash_size);
493 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
494 |     error = clSetKernelArg(scan3_kernel, 1, sizeof(cl_mem), (void*)&ioffset_buffer);
495 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
496 |     error = clSetKernelArg(scan3_kernel, 2, local_work_size[0]*sizeof(uint), NULL);
497 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
498 |     error = clSetKernelArg(scan3_kernel, 3, sizeof(cl_mem), (void*)&hash_buffer) ;
499 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
500 |     error = clSetKernelArg(scan3_kernel, 4, sizeof(cl_mem), (void *)&xcoor_buffer);
501 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
502 |     error = clSetKernelArg(scan3_kernel, 5, sizeof(cl_mem), (void *)&sorted_buffer);
503 |     if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
504 |     
505 |     cl_event scan3_event;
506 |     
507 |     if (clEnqueueNDRangeKernel(queue, scan3_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &scan3_event) != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
508 |     
509 |     long gpu_time_start, gpu_time_end;
510 |     
511 |     clWaitForEvents(1, &scan3_event);
512 |     clGetEventProfilingInfo(hash_init_event, CL_PROFILING_COMMAND_START, sizeof(gpu_time_start), &gpu_time_start, NULL);
513 |     clGetEventProfilingInfo(hash_init_event, CL_PROFILING_COMMAND_END, sizeof(gpu_time_end), &gpu_time_end, NULL);
514 |     gpu_time += gpu_time_end - gpu_time_start;
515 |     clReleaseEvent(hash_init_event);
516 |     
517 |     if (DETAILED_TIMING) printf("\tinit %.6lf,", (double)(gpu_time_end - gpu_time_start)*1.0e-9);
518 | 
519 |     clGetEventProfilingInfo(hash_kernel_event, CL_PROFILING_COMMAND_START, sizeof(gpu_time_start), &gpu_time_start, NULL);
520 |     clGetEventProfilingInfo(hash_kernel_event, CL_PROFILING_COMMAND_END, sizeof(gpu_time_end), &gpu_time_end, NULL);
521 |     gpu_time += gpu_time_end - gpu_time_start;
522 |     clReleaseEvent(hash_kernel_event);
523 | 
524 |     if (DETAILED_TIMING) printf("hash %.6lf,", (double)(gpu_time_end - gpu_time_start)*1.0e-9);
525 | 
526 |     clGetEventProfilingInfo(scan1_event, CL_PROFILING_COMMAND_START, sizeof(gpu_time_start), &gpu_time_start, NULL);
527 |     clGetEventProfilingInfo(scan1_event, CL_PROFILING_COMMAND_END, sizeof(gpu_time_end), &gpu_time_end, NULL);
528 |     gpu_time += gpu_time_end - gpu_time_start;
529 |     clReleaseEvent(scan1_event);
530 |     
531 |     if (DETAILED_TIMING) printf("scan 1 %.6lf,", (double)(gpu_time_end - gpu_time_start)*1.0e-9);
532 | 
533 |     clGetEventProfilingInfo(scan2_event, CL_PROFILING_COMMAND_START, sizeof(gpu_time_start), &gpu_time_start, NULL);
534 |     clGetEventProfilingInfo(scan2_event, CL_PROFILING_COMMAND_END, sizeof(gpu_time_end), &gpu_time_end, NULL);
535 |     gpu_time += gpu_time_end - gpu_time_start;
536 |     clReleaseEvent(scan2_event);
537 |     
538 |     if (DETAILED_TIMING) printf("scan 2 %.6lf,", (double)(gpu_time_end - gpu_time_start)*1.0e-9);
539 | 
540 |     clGetEventProfilingInfo(scan3_event, CL_PROFILING_COMMAND_START, sizeof(gpu_time_start), &gpu_time_start, NULL);
541 |     clGetEventProfilingInfo(scan3_event, CL_PROFILING_COMMAND_END, sizeof(gpu_time_end), &gpu_time_end, NULL);
542 |     gpu_time += gpu_time_end - gpu_time_start;
543 |     clReleaseEvent(scan3_event);
544 | 
545 |     if (DETAILED_TIMING) printf("scan 3 %.6lf,", (double)(gpu_time_end - gpu_time_start)*1.0e-9);
546 | 
547 |     *time = (double)gpu_time*1.0e-9;
548 | 
549 |     /* cleanup */
550 |     clReleaseMemObject(hash_buffer);
551 |     clReleaseMemObject(ioffset_buffer);
552 |     
553 |     return(sorted_buffer);
554 | }
555 | #endif
556 | 
557 | 


--------------------------------------------------------------------------------
/sort2d_kern.cl:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
  3 |  *  All rights Reserved.
  4 |  *
  5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
  6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
  7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
  8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
  9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
 10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
 11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
 12 |  * to produce derivative works, such modified software should be clearly marked,
 13 |  * so as not to confuse it with the version available from LANL.   
 14 |  *
 15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 16 |  * use this file except in compliance with the License. You may obtain a copy
 17 |  * of the License at 
 18 |  *
 19 |  * http://www.apache.org/licenses/LICENSE-2.0
 20 |  *
 21 |  * Unless required by applicable law or agreed to in writing, software distributed
 22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 24 |  * specific language governing permissions and limitations under the License.
 25 |  *
 26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
 27 |  *
 28 |  */
 29 | 
 30 | /*
 31 |  *  Authors: Bob Robey       XCP-2   brobey@lanl.gov
 32 |  *           David Nicholaeff        dnic@lanl.gov, mtrxknight@aol.com
 33 |  *           Rachel Robey            rnrobey@gmail.com
 34 |  */
 35 | 
 36 | /* sort2d_kern.cl */
 37 | /* Precision selection: with HAVE_CL_DOUBLE, real and the cell coordinates are double (cl_khr_fp64 enabled); otherwise they are float. */
 38 | #ifdef HAVE_CL_DOUBLE
 39 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 40 | typedef double  real;
 41 | #define ONE 1.0
 42 | #define TWO 2.0
 43 | typedef struct {
 44 |    long level;   // passed to powerOfTwo() as the exponent inside XY_TO_IJ_CL
 45 |    double xval;  // hashed through XY_TO_IJ_CL in HASH_KEY_CL
 46 |    double yval;  // hashed through XY_TO_IJ_CL and scaled by HASHY_CL in HASH_KEY_CL
 47 | } cell;
 48 | #else
 49 | typedef float   real;
 50 | #define ONE 1.0f
 51 | #define TWO 2.0f
 52 | typedef struct {
 53 |    long level;   // passed to powerOfTwo() as the exponent inside XY_TO_IJ_CL
 54 |    float xval;   // hashed through XY_TO_IJ_CL in HASH_KEY_CL
 55 |    float yval;   // hashed through XY_TO_IJ_CL and scaled by HASHY_CL in HASH_KEY_CL
 56 | } cell;
 57 | #endif
 58 | /* category is a second name for cell; nothing else in this file refers to it. NOTE(review): cell presumably has to match the host-side struct layout - confirm. */
 59 | #define category cell
 60 | /* Forward declarations of the scan helpers; their definitions follow scan1 later in this file. */
 61 | inline uint scan_warp_exclusive(__local volatile uint *input, const uint idx, const uint lane);
 62 | inline uint scan_warp_inclusive(__local volatile uint *input, const uint idx, const uint lane);
 63 | inline uint scan_workgroup_exclusive(
 64 |                                      __local uint* tile,
 65 |                                      const uint tiX,
 66 |                                      const uint lane,
 67 |                                      const uint warpID);
 68 | /* Sets hash_table[ic] to -1 for every global id ic below hsize. */
 69 | __kernel void hash_init_cl(
 70 |         const int hsize,
 71 | 	__global int *hash_table) {
 72 |    // Work-items at or past hsize have nothing to write.
 73 |    const int slot = get_global_id(0);
 74 |    if (slot >= hsize) return;
 75 |    hash_table[slot] = -1;
 76 | }
 77 | 
 78 | /* MESH_SIZE expands to the identifier mesh_size, so code using it needs a mesh_size variable in scope (hash_build_cl takes one as a parameter). */
 79 | #define MESH_SIZE mesh_size
 80 | //#define MESH_SIZE 256
 81 | 
 82 | int powerOfTwo(int n) {
 83 |    // Doubles 1 a total of n times, giving 2^n; the result is 1 when n <= 0.
 84 |    int result = 1;
 85 |    for (int left = n; left > 0; left--) { result += result; }
 86 |    return result;
 87 | }
 88 | /* Hash-key macros. They expand to the identifiers levmx, mesh, ic and (through MESH_SIZE) mesh_size, which must exist in the scope where the macros are used. */
 89 | // XXX Make sure MESH_SIZE GETS SET XXX
 90 | // Cartesian Coordinate Indexing: HASHY_CL = 2^levmx * MESH_SIZE; XY_TO_IJ_CL(x) = (x - 1/(2*MESH_SIZE*2^level)) * HASHY_CL; HASH_KEY_CL = i + j*HASHY_CL. HASH_MAX_CL uses SQR, which this file does not define, and is not used in this file.
 91 | #define HASHY_CL (( powerOfTwo(levmx)*MESH_SIZE ))
 92 | #define XY_TO_IJ_CL(x) (( (x-(ONE/(TWO*(real)MESH_SIZE*(real)powerOfTwo(mesh[ic].level))))*(real)HASHY_CL ))
 93 | #define HASH_MAX_CL (( SQR(HASHY_CL) ))
 94 | #define HASH_KEY_CL (( XY_TO_IJ_CL(mesh[ic].xval) + XY_TO_IJ_CL(mesh[ic].yval)*(real)HASHY_CL ))
 95 | /* For each cell index ic below size, stores ic into hash_table at the slot HASH_KEY_CL computes from that cell's xval, yval and level. */
 96 | __kernel void hash_build_cl(
 97 |         __global const cell* mesh,
 98 |         const int levmx,
 99 |         const int size,
100 | 	__global int *hash_table,
101 |         const int mesh_size) {
102 |         // mesh, levmx, mesh_size and ic are referenced by name inside the HASH_KEY_CL expansion, so they must keep these names. The store is a plain write with no collision handling.
103 | 	const int ic = get_global_id(0);
104 |         if(ic < size) hash_table[(int)HASH_KEY_CL] = ic;
105 | }
106 | /* Per-work-group count: stores in ioffset[group_id] how many of this group's temp[] entries (global ids below isize) are >= 0, using itile as local scratch. */
107 | __kernel void scan1(
108 | 	const uint isize,
109 | 	__global uint *ioffset,
110 | 	__local volatile uint *itile,
111 | 	__global const int *temp) {
112 | 		
113 | 	const uint giX = get_global_id(0);
114 | 	const uint tiX = get_local_id(0);
115 | 	const uint ntX = get_local_size(0);
116 | 	const uint group_id = get_group_id(0);
117 |         // Work-items past the end of temp keep -1 and therefore contribute 0 to the count.
118 |         int temp_val = -1;
119 |         if (giX < isize) temp_val = temp[giX];
120 |         // itile is __local memory, so the barrier needs the local fence to publish these flags to the other work-items before the reduction reads them.
121 |         itile[tiX] = temp_val >= 0 ? 1 : 0;
122 | 	barrier(CLK_LOCAL_MEM_FENCE);
123 | 	// Tree reduction in local memory; it stops once offset reaches 32 and leaves the rest to the unrolled tail below.
124 | 	for(uint offset = ntX >> 1; offset > 32; offset >>= 1) {
125 | 		if(tiX < offset) {
126 | 			itile[tiX] += itile[tiX+offset];
127 | 		}
128 | 		barrier(CLK_LOCAL_MEM_FENCE);
129 | 	}
130 |         // Safe to leave here: no barrier follows this point.
131 |         if(giX >= isize) return;
132 |     //  The tail reads itile[tiX+32], so it needs at least 64 entries in itile; it has no barriers between steps.
133 |     //  Unroll the remainder of the loop as 32 threads must proceed in lockstep.
134 |     if (tiX < 32)
135 |     {  itile[tiX] += itile[tiX+32];
136 |        itile[tiX] += itile[tiX+16];
137 |        itile[tiX] += itile[tiX+8];
138 |        itile[tiX] += itile[tiX+4];
139 |        itile[tiX] += itile[tiX+2];
140 |        itile[tiX] += itile[tiX+1]; }
141 |     // Work-item 0 publishes the group's total.
142 |     if(tiX == 0) {
143 |         ioffset[group_id] = itile[0];
144 |     }
145 | }
146 | /* In-place prefix sum over the 32-entry segment of input that idx belongs to (lane = position within the segment); returns the sum of the entries before idx, 0 for lane 0. */
147 | inline uint scan_warp_exclusive(__local volatile uint *input, const uint idx, const uint lane) {
148 |     if (lane > 0 ) input[idx] += input[idx - 1];
149 |     if (lane > 1 ) input[idx] += input[idx - 2];
150 |     if (lane > 3 ) input[idx] += input[idx - 4];
151 |     if (lane > 7 ) input[idx] += input[idx - 8];
152 |     if (lane > 15) input[idx] += input[idx - 16];
153 |     // input[idx] now holds the inclusive sum. No barrier separates the steps above; NOTE(review): this looks like it depends on the 32 lanes running in lockstep - confirm for the target device.
154 |     return (lane > 0) ? input[idx-1] : 0;
155 | }
156 | /* In-place prefix sum over the 32-entry segment of input that idx belongs to; returns the inclusive sum left in input[idx]. */
157 | inline uint scan_warp_inclusive(__local volatile uint *input, const uint idx, const uint lane) {
158 |     if (1) {   // always taken; the function has no return statement outside this block
159 |        if (lane > 0 ) input[idx] += input[idx - 1];
160 |        if (lane > 1 ) input[idx] += input[idx - 2];
161 |        if (lane > 3 ) input[idx] += input[idx - 4];
162 |        if (lane > 7 ) input[idx] += input[idx - 8];
163 |        if (lane > 15) input[idx] += input[idx - 16];
164 |        // No barrier separates the steps above; NOTE(review): this looks like it depends on the 32 lanes running in lockstep - confirm for the target device.
165 |        return input[idx];
166 |     }
167 | }
168 | /* Work-group exclusive prefix sum over itile: returns the sum of the entries before tiX and overwrites itile[tiX] with that same value. */
169 | inline uint scan_workgroup_exclusive(
170 |     __local uint* itile,
171 |     const uint tiX,
172 |     const uint lane,
173 |     const uint warpID) {
174 |     // lane and warpID come from the caller (the kernels in this file pass tiX & 31 and tiX >> 5).
175 |     // Step 1: scan each warp (val = exclusive prefix within the warp; itile[tiX] is left holding the inclusive prefix)
176 |     uint val = scan_warp_exclusive(itile, tiX, lane);
177 |     barrier(CLK_LOCAL_MEM_FENCE);
178 |     
179 |     // Step 2: Collect per-warp sums (lane 31 copies its warp's total to itile[warpID])
180 |     if (lane == 31) itile[warpID] = itile[tiX];
181 |     barrier(CLK_LOCAL_MEM_FENCE);
182 |     // NOTE(review): with 32 warps, warp 31 would write itile[31] in Step 2 while warp 0's lane 31 reads it - confirm the local size stays below 1024.
183 |     // Step 3: Use 1st warp to scan per-warp sums
184 |     if (warpID == 0) scan_warp_inclusive(itile, tiX, lane);
185 |     barrier(CLK_LOCAL_MEM_FENCE);
186 |     
187 |     // Step 4: Accumulate results from Steps 1 and 3
188 |     if (warpID > 0) val += itile[warpID-1];
189 |     barrier(CLK_LOCAL_MEM_FENCE);
190 |     
191 |     // Step 5: Write and return the final result
192 |     itile[tiX] = val;
193 |     barrier(CLK_LOCAL_MEM_FENCE);
194 |     
195 |     return val;
196 | }
197 | /* In-place exclusive prefix sum of ioffset[0..size): walks the array in chunks of ntX elements and carries the running total from one chunk to the next. */
198 | __kernel void scan2(
199 |     __local uint* itile,
200 |     __global uint* ioffset,
201 |     const uint size) {
202 |     // gID is read but never used: indexing below depends only on the local id, so every work-group would walk the same elements.
203 |     size_t tiX = get_local_id(0);
204 |     const uint gID = get_group_id(0);
205 |     const uint ntX = get_local_size(0);
206 |     
207 |     const uint lane = tiX & 31;
208 |     const uint warpID = tiX >> 5;
209 |     const uint EPT = (size+ntX-1)/ntX; //elements_per_thread;
210 |     // EPT = ceil(size / ntX), the number of chunks each work-item visits.
211 |     uint reduceValue = 0;
212 |     // reduceValue carries the total of all chunks processed so far.
213 | //  #pragma unroll 4
214 |     for(uint i = 0; i < EPT; ++i)
215 |     {
216 |        uint offsetIdx = i * ntX + tiX;
217 | 
218 | #ifdef IS_NVIDIA
219 | //     if (offsetIdx >= size) return;
220 | #endif
221 |         
222 |        // Step 1: Read ntX elements from global (off-chip) memory to local memory (on-chip)
223 |        uint input = 0;
224 |        if (offsetIdx < size) input = ioffset[offsetIdx];           
225 |        itile[tiX] = input;           
226 |        barrier(CLK_LOCAL_MEM_FENCE);
227 |         
228 |        // Step 2: Perform scan on ntX elements
229 |        uint val = scan_workgroup_exclusive(itile, tiX, lane, warpID);
230 |         
231 |        // Step 3: Propagate reduced result from previous block of ntX elements
232 |        val += reduceValue;
233 |         
234 |        // Step 4: Write out data to global memory
235 |        if (offsetIdx < size) ioffset[offsetIdx] = val;
236 |      
237 |        // Step 5: Choose reduced value for next iteration (last work-item stores the running total including this chunk)
238 |        if (tiX == (ntX-1)) itile[tiX] = input + val;
239 |        barrier(CLK_LOCAL_MEM_FENCE);
240 |        // Every work-item reads the running total; the barrier after the read runs before the next iteration overwrites itile.
241 |        reduceValue = itile[ntX-1];
242 |        barrier(CLK_LOCAL_MEM_FENCE);
243 |     }
244 | }
245 | /* Compaction pass: for each giX < isize with temp[giX] >= 0, copies arr[temp[giX]] to sorted[ioffset[group_id] + k], where k counts the non-negative temp entries before giX in the same work-group. */
246 | __kernel void scan3 (
247 |     const int isize,
248 |     __global const uint *ioffset,
249 |     __local uint *itile,
250 |     __global const int *temp,
251 |     __global const cell *arr,
252 |     __global cell *sorted) {
253 |     
254 |     const uint giX = get_global_id(0);
255 |     const uint tiX = get_local_id(0);
256 |     const uint group_id = get_group_id(0);
257 | 
258 |     const uint lane   = tiX & 31;
259 |     const uint warpid = tiX >> 5;
260 |     // NOTE(review): temp_val starts at 0, so work-items with giX >= isize set their flag to 1 (scan1 uses -1 for them); they return before writing - confirm this is intended.
261 |     // Step 1: load global data into tile
262 |     int temp_val = 0;
263 |     if (giX < isize) temp_val = temp[giX];
264 |     itile[tiX] = 0;
265 |     if (temp_val >= 0) itile[tiX] = 1;
266 |     barrier(CLK_LOCAL_MEM_FENCE);
267 | 
268 |     // Step 2: scan each warp
269 |     uint val = scan_warp_exclusive(itile, tiX, lane);
270 |     barrier(CLK_LOCAL_MEM_FENCE);
271 | 
272 |     // Step 3: Collect per-warp sums
273 |     if (lane == 31) itile[warpid] = itile[tiX];
274 |     barrier(CLK_LOCAL_MEM_FENCE);
275 | 
276 |     // Step 4: Use 1st warp to scan per-warp sums
277 |     if (warpid == 0) scan_warp_inclusive(itile, tiX, lane);
278 |     barrier(CLK_LOCAL_MEM_FENCE);
279 | 
280 |     // Step 5: Accumulate results from Steps 2 and 4
281 |     if (warpid > 0) val += itile[warpid-1];
282 |     barrier(CLK_LOCAL_MEM_FENCE);
283 |     // Early exit comes after the last barrier, so every work-item reaches all barriers above.
284 |     if (giX >= isize || temp_val < 0) return;
285 | 
286 |     // Step 6: Write and return the final result
287 |     //itile[tiX] = val;
288 |     //barrier(CLK_LOCAL_MEM_FENCE);
289 | 
290 |     val += ioffset[group_id];   //index to write to for each thread
291 | 
292 |     sorted[val]     = arr[temp_val];
293 | }
294 | 
295 | 


--------------------------------------------------------------------------------
/sort_kern.cl:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
  3 |  *  All rights Reserved.
  4 |  *
  5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
  6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
  7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
  8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
  9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
 10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
 11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
 12 |  * to produce derivative works, such modified software should be clearly marked,
 13 |  * so as not to confuse it with the version available from LANL.   
 14 |  *
 15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 16 |  * use this file except in compliance with the License. You may obtain a copy
 17 |  * of the License at 
 18 |  *
 19 |  * http://www.apache.org/licenses/LICENSE-2.0
 20 |  *
 21 |  * Unless required by applicable law or agreed to in writing, software distributed
 22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 24 |  * specific language governing permissions and limitations under the License.
 25 |  *
 26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
 27 |  *
 28 |  */
 29 | 
 30 | /*
 31 |  *  Authors: Bob Robey       XCP-2   brobey@lanl.gov
 32 |  *           David Nicholaeff        dnic@lanl.gov, mtrxknight@aol.com
 33 |  *           Rachel Robey            rnrobey@gmail.com
 34 |  */
 35 | 
 36 | /* sort_kern.cl */
 37 | /* Precision selection: real is double when HAVE_CL_DOUBLE is defined (cl_khr_fp64 enabled), otherwise float. */
 38 | #ifdef HAVE_CL_DOUBLE
 39 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 40 | typedef double  real;
 41 | #else
 42 | typedef float   real;
 43 | #endif
 44 | /* Sets temp[idx] to -1 for every global id idx below size. */
 45 | __kernel void init_kern(
 46 |         const uint size,
 47 | 	__global int *temp) {
 48 |         // Only work-items that fall inside the table write anything.
 49 |         const uint slot = get_global_id(0);
 50 | 
 51 |         if (slot < size) {
 52 |            temp[slot] = -1;
 53 |         }
 54 | }
 55 | /* Stores each element's index in temp at position (arr[idx]-min_val)/min_diff, truncated to uint. */
 56 | __kernel void hash_kern(
 57 | 	const real min_val,
 58 | 	const real min_diff,
 59 |         const uint length,
 60 | 	__global const real *arr,
 61 | 	__global int *temp) {
 62 |         // One work-item per input element; work-items at or past length exit immediately.
 63 |         const uint elem = get_global_id(0);
 64 |         if (elem >= length) return;
 65 | 
 66 |         const uint bucket = (uint)((arr[elem]-min_val)/min_diff);
 67 |         temp[bucket] = elem;
 68 | }
 69 | /* Per-work-group count: stores in ioffset[group_id] how many of this group's temp[] entries (global ids below isize) are >= 0, using itile as local scratch. */
 70 | __kernel void scan1(
 71 | 	const uint isize,
 72 | 	__global uint *ioffset,
 73 | 	__local volatile uint *itile,
 74 | 	__global const int *temp) {
 75 | 		
 76 | 	const uint giX = get_global_id(0);
 77 | 	const uint tiX = get_local_id(0);
 78 | 	const uint ntX = get_local_size(0);
 79 | 	const uint group_id = get_group_id(0);
 80 |         // Work-items past the end of temp keep -1 and therefore contribute 0 to the count.
 81 |         int temp_val = -1;
 82 |         if (giX < isize) temp_val = temp[giX];
 83 |         // itile is __local memory, so the barrier needs the local fence to publish these flags to the other work-items before the reduction reads them.
 84 |         itile[tiX] = temp_val >= 0 ? 1 : 0;
 85 | 	barrier(CLK_LOCAL_MEM_FENCE);
 86 | 	// Tree reduction in local memory; it stops once offset reaches 32 and leaves the rest to the unrolled tail below.
 87 | 	for(uint offset = ntX >> 1; offset > 32; offset >>= 1) {
 88 | 		if(tiX < offset) {
 89 | 			itile[tiX] += itile[tiX+offset];
 90 | 		}
 91 | 		barrier(CLK_LOCAL_MEM_FENCE);
 92 | 	}
 93 |         // Safe to leave here: no barrier follows this point.
 94 |         if(giX >= isize) return;
 95 |     //  The tail reads itile[tiX+32], so it needs at least 64 entries in itile; it has no barriers between steps.
 96 |     //  Unroll the remainder of the loop as 32 threads must proceed in lockstep.
 97 |     if (tiX < 32)
 98 |     {  itile[tiX] += itile[tiX+32];
 99 |        itile[tiX] += itile[tiX+16];
100 |        itile[tiX] += itile[tiX+8];
101 |        itile[tiX] += itile[tiX+4];
102 |        itile[tiX] += itile[tiX+2];
103 |        itile[tiX] += itile[tiX+1]; }
104 |     // Work-item 0 publishes the group's total.
105 |     if(tiX == 0) {
106 |         ioffset[group_id] = itile[0];
107 |     }
108 | }
109 | /* In-place prefix sum over the 32-entry segment of input that idx belongs to (lane = position within the segment); returns the sum of the entries before idx, 0 for lane 0. */
110 | inline uint scan_warp_exclusive(__local volatile uint *input, const uint idx, const uint lane) {
111 |     if (lane > 0 ) input[idx] += input[idx - 1];
112 |     if (lane > 1 ) input[idx] += input[idx - 2];
113 |     if (lane > 3 ) input[idx] += input[idx - 4];
114 |     if (lane > 7 ) input[idx] += input[idx - 8];
115 |     if (lane > 15) input[idx] += input[idx - 16];
116 |     // input[idx] now holds the inclusive sum. No barrier separates the steps above; NOTE(review): this looks like it depends on the 32 lanes running in lockstep - confirm for the target device.
117 |     return (lane > 0) ? input[idx-1] : 0;
118 | }
119 | /* In-place prefix sum over the 32-entry segment of input that idx belongs to; returns the inclusive sum left in input[idx]. */
120 | inline uint scan_warp_inclusive(__local volatile uint *input, const uint idx, const uint lane) {
121 |     if (1) {   // always taken; the function has no return statement outside this block
122 |        if (lane > 0 ) input[idx] += input[idx - 1];
123 |        if (lane > 1 ) input[idx] += input[idx - 2];
124 |        if (lane > 3 ) input[idx] += input[idx - 4];
125 |        if (lane > 7 ) input[idx] += input[idx - 8];
126 |        if (lane > 15) input[idx] += input[idx - 16];
127 |        return input[idx];
128 |     }
129 | }
130 | /* Work-group exclusive prefix sum over itile: returns the sum of the entries before tiX and overwrites itile[tiX] with that same value. */
131 | inline uint scan_workgroup_exclusive(
132 |     __local uint* itile,
133 |     const uint tiX,
134 |     const uint lane,
135 |     const uint warpID) {
136 |     // lane and warpID come from the caller (the kernels in this file pass tiX & 31 and tiX >> 5).
137 |     // Step 1: scan each warp (val = exclusive prefix within the warp; itile[tiX] is left holding the inclusive prefix)
138 |     uint val = scan_warp_exclusive(itile, tiX, lane);
139 |     barrier(CLK_LOCAL_MEM_FENCE);
140 |     
141 |     // Step 2: Collect per-warp sums (lane 31 copies its warp's total to itile[warpID])
142 |     if (lane == 31) itile[warpID] = itile[tiX];
143 |     barrier(CLK_LOCAL_MEM_FENCE);
144 |     // NOTE(review): with 32 warps, warp 31 would write itile[31] in Step 2 while warp 0's lane 31 reads it - confirm the local size stays below 1024.
145 |     // Step 3: Use 1st warp to scan per-warp sums
146 |     if (warpID == 0) scan_warp_inclusive(itile, tiX, lane);
147 |     barrier(CLK_LOCAL_MEM_FENCE);
148 |     
149 |     // Step 4: Accumulate results from Steps 1 and 3
150 |     if (warpID > 0) val += itile[warpID-1];
151 |     barrier(CLK_LOCAL_MEM_FENCE);
152 |     
153 |     // Step 5: Write and return the final result
154 |     itile[tiX] = val;
155 |     barrier(CLK_LOCAL_MEM_FENCE);
156 |     
157 |     return val;
158 | }
159 | /* In-place exclusive prefix sum of ioffset[0..size): walks the array in chunks of ntX elements and carries the running total from one chunk to the next. */
160 | __kernel void scan2(
161 |     __local uint* itile,
162 |     __global uint* ioffset,
163 |     const uint size) {
164 |     // gID is read but never used: indexing below depends only on the local id, so every work-group would walk the same elements.
165 |     size_t tiX = get_local_id(0);
166 |     const uint gID = get_group_id(0);
167 |     const uint ntX = get_local_size(0);
168 |     
169 |     const uint lane = tiX & 31;
170 |     const uint warpID = tiX >> 5;
171 |     const uint EPT = (size+ntX-1)/ntX; //elements_per_thread;
172 |     // EPT = ceil(size / ntX), the number of chunks each work-item visits.
173 |     uint reduceValue = 0;
174 |     // reduceValue carries the total of all chunks processed so far.
175 | //  #pragma unroll 4
176 |     for(uint i = 0; i < EPT; ++i)
177 |     {
178 |        uint offsetIdx = i * ntX + tiX;
179 | 
180 | #ifdef IS_NVIDIA
181 | //     if (offsetIdx >= size) return;
182 | #endif
183 |         
184 |        // Step 1: Read ntX elements from global (off-chip) memory to local memory (on-chip)
185 |        uint input = 0;
186 |        if (offsetIdx < size) input = ioffset[offsetIdx];           
187 |        itile[tiX] = input;           
188 |        barrier(CLK_LOCAL_MEM_FENCE);
189 |         
190 |        // Step 2: Perform scan on ntX elements
191 |        uint val = scan_workgroup_exclusive(itile, tiX, lane, warpID);
192 |         
193 |        // Step 3: Propagate reduced result from previous block of ntX elements
194 |        val += reduceValue;
195 |         
196 |        // Step 4: Write out data to global memory
197 |        if (offsetIdx < size) ioffset[offsetIdx] = val;
198 |      
199 |        // Step 5: Choose reduced value for next iteration (last work-item stores the running total including this chunk)
200 |        if (tiX == (ntX-1)) itile[tiX] = input + val;
201 |        barrier(CLK_LOCAL_MEM_FENCE);
202 |        // Every work-item reads the running total; the barrier after the read runs before the next iteration overwrites itile.
203 |        reduceValue = itile[ntX-1];
204 |        barrier(CLK_LOCAL_MEM_FENCE);
205 |     }
206 | }
207 | /* Processes one ntX-wide chunk of ioffset: exclusive-scans it in place on top of reduceValue and returns the running total including this chunk. No kernel in this file calls it. */
208 | inline uint do_element_pass(uint offsetIdx, uint ntX, uint tiX, uint lane, uint warpID,
209 |       uint reduceValue, uint size, __global uint *ioffset, __local uint *itile) {
210 |     barrier(CLK_LOCAL_MEM_FENCE);
211 |     // The barrier above runs before itile is overwritten below.
212 |     // Step 1: Read ntX elements from global (off-chip) memory to local memory (on-chip)
213 |     uint input = 0;
214 |     if (offsetIdx < size) input = ioffset[offsetIdx];           
215 |     itile[tiX] = input;           
216 |     barrier(CLK_LOCAL_MEM_FENCE);
217 |     
218 |     // Step 2: Perform scan on ntX elements
219 |     uint val = scan_workgroup_exclusive(itile, tiX, lane, warpID);
220 |    
221 |     // Step 3: Propagate reduced result from previous block of ntX elements
222 |     val += reduceValue;
223 |   
224 |     // Step 4: Write out data to global memory
225 |     if (offsetIdx < size) ioffset[offsetIdx] = val;
226 |   
227 |     // Step 5: Choose reduced value for next iteration (last work-item stores the running total including this chunk)
228 |     if (tiX == (ntX-1)) itile[tiX] = input + val;
229 |     barrier(CLK_LOCAL_MEM_FENCE);
230 |     // Every work-item reads the total the last work-item just stored.
231 |     reduceValue = itile[ntX-1];
232 | 
233 |     return(reduceValue);
234 | }
235 | /* Loads one ntX-wide chunk of ioffset per work-group, exclusive-scans it in local memory and stores a value into workgroup_results[gID]; ioffset itself is not modified. */
236 | __kernel void scan_lev(
237 |     __local uint* itile,
238 |     __global uint* ioffset,
239 |     __global uint* workgroup_results,
240 |     const uint size) {
241 | 
242 |     uint tiX = get_local_id(0);
243 |     uint giX = get_global_id(0);
244 |     const uint gID = get_group_id(0);
245 |     const uint ntX = get_local_size(0);
246 |     
247 |     const uint lane = tiX & 31;
248 |     const uint warpID = tiX >> 5;
249 |     // Every work-item of the group stores 0 to the same slot here.
250 |     workgroup_results[gID] = 0;
251 | 
252 |     // Step 1: Read ntX elements from global (off-chip) memory to local memory (on-chip)
253 |     uint input = 0;
254 |     if (giX < size) input = ioffset[giX];
255 |     itile[tiX] = input;
256 |     barrier(CLK_LOCAL_MEM_FENCE);
257 | 
258 |     // Step 2: Perform scan on ntX elements
259 |     uint val = scan_workgroup_exclusive(itile, tiX, lane, warpID);
260 |     // NOTE(review): every work-item stores its own exclusive-scan value to the same workgroup_results[gID], so which one remains is not fixed by this code; val and ntX are unused - confirm the intended result.
261 |     // Step 3: Collect per-workgroup partial results
262 |     workgroup_results[gID] = itile[tiX];
263 | }
264 | /* Placeholder kernel: it derives lane and warpID from the local id, but its only operation is commented out, so it reads and writes no memory. */
265 | __kernel void scan_workgroup_results(
266 |     __global uint* workgroup_results)
267 | {
268 |     uint tiX = get_local_id(0);
269 |     
270 |     const uint lane = tiX & 31;
271 |     const uint warpID = tiX >> 5;
272 | 
273 |     // Step 4: Use 1st warp to scan workgroup_results
274 |     //if (warpID == 0) scan_warp_inclusive(workgroup_results, tiX, lane);
275 | }
276 | /* Placeholder kernel: it takes no arguments and its only operation is commented out, so it has no effect on memory. */
277 | __kernel void accumulate_workgroup_results()
278 | {
279 |     uint tiX = get_local_id(0);
280 |     
281 |     const uint warpID = tiX >> 5;
282 | 
283 |     // Step 5: Accumulate results from steps 2 and 4
284 |     //uint val += itile[warpID-1];
285 | }
286 | /* Compaction pass: for each giX < isize with temp[giX] >= 0, copies arr[temp[giX]] to sorted[ioffset[group_id] + k], where k counts the non-negative temp entries before giX in the same work-group. */
287 | __kernel void scan3 (
288 |     const int isize,
289 |     __global const uint *ioffset,
290 |     __local uint *itile,
291 |     __global const int *temp,
292 |     __global const real *arr,
293 |     __global real *sorted) {
294 |     
295 |     const uint giX = get_global_id(0);
296 |     const uint tiX = get_local_id(0);
297 |     const uint group_id = get_group_id(0);
298 | 
299 |     const uint lane   = tiX & 31;
300 |     const uint warpid = tiX >> 5;
301 |     // NOTE(review): temp_val starts at 0, so work-items with giX >= isize set their flag to 1 (scan1 uses -1 for them); they return before writing - confirm this is intended.
302 |     // Step 1: load global data into tile
303 |     int temp_val = 0;
304 |     if (giX < isize) temp_val = temp[giX];
305 |     itile[tiX] = 0;
306 |     if (temp_val >= 0) itile[tiX] = 1;
307 |     barrier(CLK_LOCAL_MEM_FENCE);
308 | 
309 |     // Step 2: scan each warp
310 |     uint val = scan_warp_exclusive(itile, tiX, lane);
311 |     barrier(CLK_LOCAL_MEM_FENCE);
312 | 
313 |     // Step 3: Collect per-warp sums
314 |     if (lane == 31) itile[warpid] = itile[tiX];
315 |     barrier(CLK_LOCAL_MEM_FENCE);
316 | 
317 |     // Step 4: Use 1st warp to scan per-warp sums
318 |     if (warpid == 0) scan_warp_inclusive(itile, tiX, lane);
319 |     barrier(CLK_LOCAL_MEM_FENCE);
320 | 
321 |     // Step 5: Accumulate results from Steps 2 and 4
322 |     if (warpid > 0) val += itile[warpid-1];
323 |     barrier(CLK_LOCAL_MEM_FENCE);
324 |     // Early exit comes after the last barrier, so every work-item reaches all barriers above.
325 |     if (giX >= isize || temp_val < 0) return;
326 | 
327 |     // Step 6: Write and return the final result
328 |     //itile[tiX] = val;
329 |     //barrier(CLK_LOCAL_MEM_FENCE);
330 | 
331 |     val += ioffset[group_id];   //index to write to for each thread
332 | 
333 |     sorted[val] = arr[temp_val];
334 | }
335 | 
336 | 


--------------------------------------------------------------------------------
/table.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
  3 |  *  All rights Reserved.
  4 |  *
  5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
  6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
  7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
  8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
  9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
 10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
 11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
 12 |  * to produce derivative works, such modified software should be clearly marked,
 13 |  * so as not to confuse it with the version available from LANL.   
 14 |  *
 15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 16 |  * use this file except in compliance with the License. You may obtain a copy
 17 |  * of the License at 
 18 |  *
 19 |  * http://www.apache.org/licenses/LICENSE-2.0
 20 |  *
 21 |  * Unless required by applicable law or agreed to in writing, software distributed
 22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 24 |  * specific language governing permissions and limitations under the License.
 25 |  *
 26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
 27 |  *
 28 |  */
 29 | 
 30 | /*
 31 |  *  Authors: Bob Robey       XCP-2   brobey@lanl.gov
 32 |  *           David Nicholaeff        dnic@lanl.gov, mtrxknight@aol.com
 33 |  *           Rachel Robey            rnrobey@gmail.com
 34 |  */
 35 | 
 36 | #include <stdlib.h>
 37 | #include <stdio.h>
 38 | #include <math.h>
 39 | #include <time.h>
 40 | #include <sys/time.h>
 41 | #include "gpu.h"
 42 | #include "timer.h"
 43 | 
 44 | cl_kernel interpolate_kernel;
 45 | 
 46 | #ifdef HAVE_CONFIG_H
 47 | #include "config.h"
 48 | #endif
 49 | 
 50 | #ifdef __APPLE_CC__
 51 | #include <OpenCL/OpenCL.h>
 52 | #else
 53 | #include <CL/cl.h>
 54 | #endif
 55 | 
 56 | #ifdef HAVE_CL_DOUBLE
 57 | typedef double real;
 58 | typedef cl_double cl_real;
 59 | typedef cl_double4 cl_real4;
 60 | #define EPS 1.0e-8
 61 | #else
 62 | typedef float real;
 63 | typedef cl_float cl_real;
 64 | typedef cl_float4 cl_real4;
 65 | #define EPS 1.0e-5
 66 | #endif
 67 | /* dataval(x,y) expands to the identifiers data and xstride from the calling scope and indexes data with x varying fastest and a stride of xstride per step in y. */
 68 | #define TILE_SIZE 256
 69 | #define dataval(x,y) data[(x)+((y)*xstride)]
 70 | /* File-scope OpenCL state; main passes context, queue, is_nvidia and program by address to GPUInit(). */
 71 | cl_context context;
 72 | cl_command_queue queue;
 73 | cl_program program;
 74 | int is_nvidia=0;
 75 | 
 76 | double random_normal_dist(void);
 77 | double *interpolate_bruteforce(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis,
 78 |       double *density_array, double *temp_array, double *data);
 79 | double *interpolate_bisection(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis,
 80 |       double *density_array, double *temp_array, double *data);
 81 | int bisection(double *axis, int axis_size, double value);
 82 | double *interpolate_hashcpu(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis,
 83 |       double *density_array, double *temp_array, double *data);
 84 | cl_mem interpolate_hashgpu(int isize, int xstride, int density_axis_size, int temp_axis_size, cl_mem density_axis_buffer, cl_mem temp_axis_buffer,
 85 |       cl_mem density_array_buffer, cl_mem temp_array_buffer, cl_mem data_buffer, double *time);
 86 | 
 87 | #include "table.data"
 88 | 
 89 | int main(int argc, char *argv[])
 90 | {
 91 |    cl_int error;
 92 | 
 93 | #ifdef HAVE_OPENCL
 94 |    GPUInit(&context, &queue, &is_nvidia, &program, "table_kern.cl");
 95 | 
 96 |    interpolate_kernel = clCreateKernel(program, "interpolate_kernel", &error);
 97 |    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
 98 | #endif
 99 | 
100 |    int i;
101 | 
102 |    double temp, density;
103 | 
104 |    cl_int ierror;
105 |    int data_size = sizeof(data)/sizeof(data[0]);
106 |    int density_axis_size = sizeof(density_axis)/sizeof(density_axis[0]);
107 |    int temp_axis_size = sizeof(temp_axis)/sizeof(temp_axis[0]);
108 | 
109 |    double density_increment = (density_axis[density_axis_size-1]-density_axis[0])/(double)(density_axis_size-1);
110 |    double temp_increment = (temp_axis[temp_axis_size-1]-temp_axis[0])/(double)(temp_axis_size-1);
111 | 
112 |    double density_avg = (density_axis[density_axis_size-1]+density_axis[0])/2.0;
113 |    double temp_avg = (temp_axis[temp_axis_size-1]+temp_axis[0])/2.0;
114 | 
115 |    double density_stddev = (density_axis[density_axis_size-1]-density_axis[0])/6.0;
116 |    double temp_stddev = (temp_axis[temp_axis_size-1]-temp_axis[0])/6.0;
117 | 
118 |    for (i=1; i<density_axis_size; i++){
119 |       density_axis[i] = density_axis[0]+(double)i*density_increment;
120 |    }
121 | 
122 |    for (i=1; i<temp_axis_size; i++){
123 |       temp_axis[i] = temp_axis[0]+(double)i*temp_increment;
124 |    }
125 | 
126 | #ifdef HAVE_OPENCL
127 |    real *data_real = (real *)malloc(data_size*sizeof(real));
128 |    for (i=0; i<data_size; i++) { data_real[i] = (real)data[i]; }
129 |    cl_mem data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, data_size*sizeof(real), NULL, &ierror);
130 |    if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
131 |    ierror = clEnqueueWriteBuffer(queue, data_buffer, CL_TRUE, 0, data_size*sizeof(real), data_real, 0, NULL, NULL);
132 |    if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
133 |    free(data_real);
134 | 
135 |    real *density_axis_real = (real *)malloc(density_axis_size*sizeof(real));
136 |    for (i=0; i<density_axis_size; i++) { density_axis_real[i] = (real)density_axis[i]; }
137 |    cl_mem density_axis_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, density_axis_size*sizeof(real), NULL, &ierror);
138 |    if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
139 |    ierror = clEnqueueWriteBuffer(queue, density_axis_buffer, CL_TRUE, 0, density_axis_size*sizeof(real), density_axis_real, 0, NULL, NULL);
140 |    if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
141 |    free(density_axis_real);
142 | 
143 |    real *temp_axis_real = (real *)malloc(temp_axis_size*sizeof(real));
144 |    for (i=0; i<temp_axis_size; i++) { temp_axis_real[i] = (real)temp_axis[i]; }
145 |    cl_mem temp_axis_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, temp_axis_size*sizeof(real), NULL, &ierror);
146 |    if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
147 |    ierror = clEnqueueWriteBuffer(queue, temp_axis_buffer, CL_TRUE, 0, temp_axis_size*sizeof(real), temp_axis_real, 0, NULL, NULL);
148 |    if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
149 |    free(temp_axis_real);
150 | #endif
151 | 
152 |    printf("\n    Table Interpolate Performance Results\n\n");
153 | 
154 |    printf("Size,   \tBrute,    \tBisection \tHash CPU, \tHash GPU\n");
155 | 
156 |    double *value_gold, *value_test;
157 | 
158 |    int isize;
159 |    for( isize = 64; isize <= 50000000; isize*=2 ) {
160 |       printf("%d\t",isize);
161 | 
162 |       // Initialize look-up data
163 |       double *temp_array=(double *)malloc(isize*sizeof(double));
164 |       double *density_array=(double *)malloc(isize*sizeof(double));
165 | 
166 |       for (i = 0; i<isize; i++){
167 |          temp_array[i]    = random_normal_dist()*temp_stddev    + temp_avg;
168 |          density_array[i] = random_normal_dist()*density_stddev + density_avg;
169 |       }
170 | 
171 |       int xstride = density_axis_size;
172 |       struct timespec tstart;
173 |       double time_sum;
174 | 
175 |       // call data table interpolation routine
176 |       cpu_timer_start(&tstart);
177 |       value_gold = interpolate_bruteforce(isize, xstride, density_axis_size, temp_axis_size, density_axis, temp_axis,
178 |          density_array, temp_array, data);
179 |       time_sum += cpu_timer_stop(tstart);
180 |       printf("\t%.6lf,", time_sum);
181 | 
182 |       cpu_timer_start(&tstart);
183 |       value_test = interpolate_bisection(isize, xstride, density_axis_size, temp_axis_size, density_axis, temp_axis,
184 |          density_array, temp_array, data);
185 |       time_sum += cpu_timer_stop(tstart);
186 |       printf("\t%.6lf,", time_sum);
187 | 
188 |       for (i= 0; i<isize; i++){
189 |          if (value_test[i] != value_gold[i]){
190 |             printf("Warning %d does not match -- test %lf gold %lf\n",i,value_test[i],value_gold[i]);
191 |          }
192 |       }
193 | 
194 |       free(value_test);
195 | 
196 |       cpu_timer_start(&tstart);
197 |       value_test = interpolate_hashcpu(isize, xstride, density_axis_size, temp_axis_size, density_axis, temp_axis,
198 |          density_array, temp_array, data);
199 |       time_sum += cpu_timer_stop(tstart);
200 |       printf("\t%.6lf,", time_sum);
201 | 
202 |       for (i= 0; i<isize; i++){
203 |          if (value_test[i] != value_gold[i]){
204 |             printf("Warning %d does not match -- test %lf gold %lf\n",i,value_test[i],value_gold[i]);
205 |          }
206 |       }
207 | 
208 |       free(value_test);
209 | 
210 | #ifdef HAVE_OPENCL
211 |       real *density_array_real = (real *)malloc(isize*sizeof(real));
212 |       for (i=0; i<isize; i++) { density_array_real[i] = (real)density_array[i]; }
213 |       cl_mem density_array_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, isize*sizeof(real), NULL, &ierror);
214 |       if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
215 |       ierror = clEnqueueWriteBuffer(queue, density_array_buffer, CL_TRUE, 0, isize*sizeof(real), density_array_real, 0, NULL, NULL);
216 |       if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
217 |       free(density_array_real);
218 | 
219 |       real *temp_array_real = (real *)malloc(isize*sizeof(real));
220 |       for (i=0; i<isize; i++) { temp_array_real[i] = (real)temp_array[i]; }
221 |       cl_mem temp_array_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, isize*sizeof(real), NULL, &ierror);
222 |       if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
223 |       ierror = clEnqueueWriteBuffer(queue, temp_array_buffer, CL_TRUE, 0, isize*sizeof(real), temp_array_real, 0, NULL, NULL);
224 |       if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
225 |       free(temp_array_real);
226 | 
227 |       cl_mem value_buffer = interpolate_hashgpu(isize, xstride, density_axis_size, temp_axis_size, density_axis_buffer, temp_axis_buffer,
228 |          density_array_buffer, temp_array_buffer, data_buffer, &time_sum);
229 |       printf("\t%.6lf,", time_sum);
230 | 
231 |       clReleaseMemObject(density_array_buffer);
232 |       clReleaseMemObject(temp_array_buffer);
233 | 
234 |       real *value_array_real = (real *)malloc(isize*sizeof(real));
235 |       
236 |       ierror = clEnqueueReadBuffer(queue, value_buffer, CL_TRUE, 0, isize*sizeof(real), value_array_real, 0, NULL, NULL);
237 |       if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
238 | 
239 |       clReleaseMemObject(value_buffer);
240 |     
241 |       value_test = (double *)malloc(isize*sizeof(double));
242 |       for (i=0; i<isize; i++) { value_test[i] = (double)value_array_real[i]; }
243 |     
244 |       for (i= 0; i<isize; i++){
245 |          if (fabs(value_test[i] - value_gold[i]) > EPS ){
246 |             printf("Warning %d does not match -- test %lf gold %lf\n",i,value_test[i],value_gold[i]);
247 |          }
248 |       }
249 | 
250 |       free(value_test);
251 | #endif
252 | 
253 |       printf("\n");
254 | 
255 |       free(value_gold);
256 |    }
257 | 
258 | #ifdef HAVE_OPENCL
259 |    clReleaseMemObject(data_buffer);
260 |    clReleaseMemObject(density_axis_buffer);
261 |    clReleaseMemObject(temp_axis_buffer);
262 | #endif
263 | }
264 | 
265 | 
/*
 * Returns a pseudo-random value built as the sum of three uniform draws,
 * each mapped from drand48()'s [0,1) onto [-1,1). The result therefore
 * lies within [-3,3].
 */
double random_normal_dist(void)
{
    double draw[3];

    // Draw in a fixed order so the drand48() sequence is consumed
    // exactly as before.
    for (int k = 0; k < 3; k++) {
        draw[k] = 2.0*drand48() - 1.0;
    }

    return(draw[0] + draw[1] + draw[2]);
}
277 | 
/*
 * Bilinear table interpolation that finds each interval by a linear scan
 * of the axes.
 *
 *   isize              number of query points
 *   xstride            row length of data (entries per temperature row)
 *   density_axis_size  number of entries in density_axis (needs >= 2)
 *   temp_axis_size     number of entries in temp_axis (needs >= 2)
 *   density_axis       density coordinates of the table columns
 *   temp_axis          temperature coordinates of the table rows
 *   density_array      density of each query point
 *   temp_array         temperature of each query point
 *   data               table values, indexed as data[x + y*xstride]
 *
 * Returns a malloc'ed array of isize interpolated values that the caller
 * must free, or NULL when the allocation fails. Queries outside the table
 * use the first or last interval, i.e. they are linearly extrapolated.
 */
double *interpolate_bruteforce(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis,
      double *density_array, double *temp_array, double *data)
{
   double *value_array=(double *)malloc(isize*sizeof(double));
   if (value_array == NULL) return(NULL);

   for (int i = 0; i<isize; i++){
      // Walk up each axis until the next node is no longer below the
      // query; the slot is capped at the last interval (size-2).
      int temp_slot = 0;
      while (temp_slot<temp_axis_size-2 && temp_array[i] > temp_axis[temp_slot+1]) temp_slot++;

      int density_slot = 0;
      while (density_slot<density_axis_size-2 && density_array[i] > density_axis[density_slot+1]) density_slot++;

      double xfrac = (density_array[i]-density_axis[density_slot])/(density_axis[density_slot+1]-density_axis[density_slot]);
      double yfrac = (temp_array[i]-temp_axis[temp_slot])/(temp_axis[temp_slot+1]-temp_axis[temp_slot]);

      // Corner values of the enclosing cell: row_lo/row_hi are the lower
      // and upper temperature rows, [0]/[1] the lower and upper density.
      const double *row_lo = data + density_slot + temp_slot*xstride;
      const double *row_hi = row_lo + xstride;

      value_array[i] =      xfrac *     yfrac *row_hi[1]
                     + (1.0-xfrac)*     yfrac *row_hi[0]
                     +      xfrac *(1.0-yfrac)*row_lo[1]
                     + (1.0-xfrac)*(1.0-yfrac)*row_lo[0];
   }

   return(value_array);
}
302 | 
/*
 * Bilinear table interpolation that finds each interval with bisection().
 *
 * Parameters match interpolate_bruteforce: isize query points given by
 * density_array/temp_array are looked up in data (indexed as
 * data[x + y*xstride]) against density_axis and temp_axis.
 *
 * Returns a malloc'ed array of isize interpolated values that the caller
 * must free, or NULL when the allocation fails.
 */
double *interpolate_bisection(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis,
      double *density_array, double *temp_array, double *data)
{
   double *value_array=(double *)malloc(isize*sizeof(double));
   if (value_array == NULL) return(NULL);

   for (int i = 0; i<isize; i++){
      // bisection() takes the highest slot it may return, which is the
      // last interval of the axis: size-2.
      int temp_slot = bisection(temp_axis, temp_axis_size-2, temp_array[i]);
      int density_slot = bisection(density_axis, density_axis_size-2, density_array[i]);

      double xfrac = (density_array[i]-density_axis[density_slot])/(density_axis[density_slot+1]-density_axis[density_slot]);
      double yfrac = (temp_array[i]-temp_axis[temp_slot])/(temp_axis[temp_slot+1]-temp_axis[temp_slot]);

      // Corner values of the enclosing cell: row_lo/row_hi are the lower
      // and upper temperature rows, [0]/[1] the lower and upper density.
      const double *row_lo = data + density_slot + temp_slot*xstride;
      const double *row_hi = row_lo + xstride;

      value_array[i] =      xfrac *     yfrac *row_hi[1]
                     + (1.0-xfrac)*     yfrac *row_hi[0]
                     +      xfrac *(1.0-yfrac)*row_lo[1]
                     + (1.0-xfrac)*(1.0-yfrac)*row_lo[0];
   }

   return(value_array);
}
324 | 
/*
 * Binary search for the slot whose node is the last one not above value.
 *
 *   axis       ascending axis values
 *   axis_size  highest slot index that may be returned (callers in this
 *              file pass the number of axis entries minus two)
 *   value      coordinate to locate
 *
 * Returns a slot in [0, axis_size]; values below axis[1] give 0 and values
 * at or above axis[axis_size] give axis_size.
 */
int bisection(double *axis, int axis_size, double value)
{
   int lower = 0;
   int upper = axis_size+1;

   // Halve the bracket [lower, upper) until it spans a single slot
   for (int gap = upper - lower; gap > 1; gap = upper - lower){
      const int middle = (upper + lower) /2;
      const int at_or_above = (value >= axis[middle]);

      lower = at_or_above ? middle : lower;
      upper = at_or_above ? upper  : middle;
   }

   return(lower);
}
339 | 
/*
 * Maps a coordinate to its interval on a uniformly spaced axis.
 * The result is confined to [0, axis_size-2] so that slot+1 is always a
 * valid axis entry; the comparison is done before the conversion to int so
 * out-of-range (or NaN) input never reaches the cast.
 */
static int hash_slot(double value, const double *axis, int axis_size, double increment)
{
   double position = (value-axis[0])/increment;

   if (!(position > 0.0)) return(0);
   if (position >= (double)(axis_size-1)) return(axis_size-2);
   return((int)position);
}

/*
 * Bilinear table interpolation that finds each interval directly by
 * dividing the offset from the axis origin by the (constant) axis spacing.
 * Both axes must therefore be uniformly spaced and hold at least two entries.
 *
 * Parameters match interpolate_bruteforce: isize query points given by
 * density_array/temp_array are looked up in data (indexed as
 * data[x + y*xstride]) against density_axis and temp_axis.
 *
 * Returns a malloc'ed array of isize interpolated values that the caller
 * must free, or NULL when the allocation fails. Queries on or beyond the
 * table edge use the first or last interval (linear extrapolation), which
 * matches the brute-force and bisection variants instead of indexing past
 * the end of the axes and table.
 */
double *interpolate_hashcpu(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis,
      double *density_array, double *temp_array, double *data)
{
   // Computes a constant increment for each axis data look-up
   double density_increment = (density_axis[density_axis_size-1]-density_axis[0])/(double)(density_axis_size-1);
   double temp_increment = (temp_axis[temp_axis_size-1]-temp_axis[0])/(double)(temp_axis_size-1);

   double *value_array=(double *)malloc(isize*sizeof(double));
   if (value_array == NULL) return(NULL);

   for (int i = 0; i<isize; i++){
      // Determine the interval for interpolation and the fraction in the interval
      int temp_slot = hash_slot(temp_array[i], temp_axis, temp_axis_size, temp_increment);
      int density_slot = hash_slot(density_array[i], density_axis, density_axis_size, density_increment);

      double xfrac = (density_array[i]-density_axis[density_slot])/(density_axis[density_slot+1]-density_axis[density_slot]);
      double yfrac = (temp_array[i]-temp_axis[temp_slot])/(temp_axis[temp_slot+1]-temp_axis[temp_slot]);

      // Corner values of the enclosing cell: row_lo/row_hi are the lower
      // and upper temperature rows, [0]/[1] the lower and upper density.
      const double *row_lo = data + density_slot + temp_slot*xstride;
      const double *row_hi = row_lo + xstride;

      // Bi-linear interpolation
      value_array[i] =      xfrac *     yfrac *row_hi[1]
                     + (1.0-xfrac)*     yfrac *row_hi[0]
                     +      xfrac *(1.0-yfrac)*row_lo[1]
                     + (1.0-xfrac)*(1.0-yfrac)*row_lo[0];
   }

   return(value_array);
}
366 | 
#ifdef HAVE_OPENCL
/*
 * Runs the hash-based bilinear interpolation kernel on the device.
 *
 *   isize                 number of query points
 *   xstride               not used here; the kernel derives the row length
 *                         from density_axis_size
 *   density_axis_size     number of density axis entries
 *   temp_axis_size        number of temperature axis entries
 *   density_axis_buffer   device copy of the density axis
 *   temp_axis_buffer      device copy of the temperature axis
 *   density_array_buffer  device copy of the query densities
 *   temp_array_buffer     device copy of the query temperatures
 *   data_buffer           device copy of the table
 *   time                  receives the kernel execution time in seconds
 *                         (0.0 when the launch or the profiling query fails)
 *
 * Returns a newly created device buffer holding isize results; the caller
 * releases it with clReleaseMemObject.
 *
 * NOTE(review): the timing relies on event profiling, so the queue set up by
 * GPUInit presumably has CL_QUEUE_PROFILING_ENABLE -- confirm in gpu.c.
 */
cl_mem interpolate_hashgpu(int isize, int xstride, int density_axis_size, int temp_axis_size, cl_mem density_axis_buffer, cl_mem temp_axis_buffer,
      cl_mem density_array_buffer, cl_mem temp_array_buffer, cl_mem data_buffer, double *time)
{
   cl_int ierror;

   *time = 0.0;

   // Scalar kernel arguments are declared uint in the kernel; pass values
   // of exactly that type.
   cl_uint isize_arg = (cl_uint)isize;
   cl_uint density_axis_size_arg = (cl_uint)density_axis_size;
   cl_uint temp_axis_size_arg = (cl_uint)temp_axis_size;
   cl_uint data_size_arg = (cl_uint)(sizeof(data)/sizeof(data[0]));

   cl_mem value_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, isize*sizeof(real), NULL, &ierror);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);

   ierror = clSetKernelArg(interpolate_kernel, 0, sizeof(cl_uint), &isize_arg);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clSetKernelArg(interpolate_kernel, 1, sizeof(cl_uint), &density_axis_size_arg);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clSetKernelArg(interpolate_kernel, 2, sizeof(cl_uint), &temp_axis_size_arg);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clSetKernelArg(interpolate_kernel, 3, sizeof(cl_uint), &data_size_arg);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clSetKernelArg(interpolate_kernel, 4, sizeof(cl_mem), (void*)&density_axis_buffer);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clSetKernelArg(interpolate_kernel, 5, sizeof(cl_mem), (void*)&temp_axis_buffer);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clSetKernelArg(interpolate_kernel, 6, sizeof(cl_mem), (void*)&data_buffer);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   // Arguments 7-9 only reserve local memory for the kernel's copies of
   // the axes and the table (size given, NULL pointer).
   ierror = clSetKernelArg(interpolate_kernel, 7, density_axis_size_arg*sizeof(cl_real), NULL);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clSetKernelArg(interpolate_kernel, 8, temp_axis_size_arg*sizeof(cl_real), NULL);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clSetKernelArg(interpolate_kernel, 9, data_size_arg*sizeof(cl_real), NULL);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clSetKernelArg(interpolate_kernel, 10, sizeof(cl_mem), (void*)&density_array_buffer);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clSetKernelArg(interpolate_kernel, 11, sizeof(cl_mem), (void*)&temp_array_buffer);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clSetKernelArg(interpolate_kernel, 12, sizeof(cl_mem), (void*)&value_buffer);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);

   size_t local_work_size[1];
   size_t global_work_size[1];

   // Round the global size up to a whole number of work-groups; the kernel
   // ignores the surplus work-items.
   local_work_size[0] = TILE_SIZE;
   global_work_size[0] = ((isize+local_work_size[0]-1)/local_work_size[0])*local_work_size[0];

   cl_event interpolate_event = NULL;

   ierror = clEnqueueNDRangeKernel(queue, interpolate_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &interpolate_event);
   if (ierror != CL_SUCCESS) {
      // No event was produced, so there is nothing to wait on or to time
      printf("Error is %d at line %d\n",ierror,__LINE__);
      return(value_buffer);
   }

   // Profiling counters are cl_ulong nanosecond values
   cl_ulong gpu_time_start = 0, gpu_time_end = 0;

   ierror = clWaitForEvents(1,&interpolate_event);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);

   cl_int start_status = clGetEventProfilingInfo(interpolate_event, CL_PROFILING_COMMAND_START, sizeof(gpu_time_start), &gpu_time_start, NULL);
   if (start_status != CL_SUCCESS) printf("Error is %d at line %d\n",start_status,__LINE__);
   cl_int end_status = clGetEventProfilingInfo(interpolate_event, CL_PROFILING_COMMAND_END, sizeof(gpu_time_end), &gpu_time_end, NULL);
   if (end_status != CL_SUCCESS) printf("Error is %d at line %d\n",end_status,__LINE__);
   clReleaseEvent(interpolate_event);

   if (start_status == CL_SUCCESS && end_status == CL_SUCCESS) {
      *time = (double)(gpu_time_end - gpu_time_start)*1.0e-9;
   }

   return(value_buffer);
}
#endif
432 | 
433 | 


--------------------------------------------------------------------------------
/table_kern.cl:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
  3 |  *  All rights Reserved.
  4 |  *
  5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
  6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
  7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
  8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
  9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
 10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
 11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
 12 |  * to produce derivative works, such modified software should be clearly marked,
 13 |  * so as not to confuse it with the version available from LANL.   
 14 |  *
 15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 16 |  * use this file except in compliance with the License. You may obtain a copy
 17 |  * of the License at 
 18 |  *
 19 |  * http://www.apache.org/licenses/LICENSE-2.0
 20 |  *
 21 |  * Unless required by applicable law or agreed to in writing, software distributed
 22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 24 |  * specific language governing permissions and limitations under the License.
 25 |  *
 26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
 27 |  *
 28 |  */
 29 | 
 30 | /*
 31 |  *  Authors: Bob Robey       XCP-2   brobey@lanl.gov
 32 |  *           David Nicholaeff        dnic@lanl.gov, mtrxknight@aol.com
 33 |  *           Rachel Robey            rnrobey@gmail.com
 34 |  */
 35 | 
 36 | /* table_kern.cl */
 37 | 
 38 | #ifdef HAVE_CL_DOUBLE
 39 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 40 | typedef double  real;
 41 | #else
 42 | typedef float   real;
 43 | #endif
 44 | 
 45 | #define dataval(x,y) data[(x)+((y)*xstride)]
 46 | 
 47 | __kernel void interpolate_kernel(
 48 |    const uint isize,
 49 |    const uint xaxis_size,
 50 |    const uint yaxis_size,
 51 |    const uint data_size,
 52 |    __global const real *xaxis_buffer,
 53 |    __global const real *yaxis_buffer,
 54 |    __global const real *data_buffer,
 55 |    __local        real *xaxis,
 56 |    __local        real *yaxis,
 57 |    __local        real *data,
 58 |    __global const real *x_array,
 59 |    __global const real *y_array,
 60 |    __global       real *value
 61 |    )
 62 | {
 63 |    const uint tid = get_local_id(0);
 64 |    const uint wgs = get_local_size(0);
 65 |    const uint gid = get_global_id(0);
 66 | 
 67 |    // Loads the axis data values
 68 |    if (tid < xaxis_size) xaxis[tid]=xaxis_buffer[tid];
 69 |    if (tid < yaxis_size) yaxis[tid]=yaxis_buffer[tid];
 70 | 
 71 |    // Loads the data table
 72 |    for (uint wid = tid; wid<data_size; wid+=wgs){
 73 |       data[wid] = data_buffer[wid];
 74 |    }
 75 |    // Need to synchronize before table queries
 76 |    barrier(CLK_LOCAL_MEM_FENCE);
 77 | 
 78 |    // Computes a constant increment for each axis data look-up
 79 |    real x_increment = (xaxis[xaxis_size-1]-xaxis[0])/(double)(xaxis_size-1);
 80 |    real y_increment = (yaxis[yaxis_size-1]-yaxis[0])/(double)(yaxis_size-1);
 81 | 
 82 |    int xstride = xaxis_size;
 83 | 
 84 |    if (gid < isize) {
 85 |       // Loads the next data value
 86 |       real xdata = x_array[gid];
 87 |       real ydata = y_array[gid];
 88 | 
 89 |       // Determine the interval for interpolation and the fraction in the interval
 90 |       int islot = (int)((xdata-xaxis[0])/x_increment);
 91 |       int jslot = (int)((ydata-yaxis[0])/y_increment);
 92 |       real xfrac = (xdata-xaxis[islot])/(xaxis[islot+1]-xaxis[islot]);
 93 |       real yfrac = (ydata-yaxis[jslot])/(yaxis[jslot+1]-yaxis[jslot]);
 94 | 
 95 |       // Bi-linear interpolation
 96 |       value[gid] =      xfrac *     yfrac *dataval(islot+1,jslot+1)
 97 |                  + (1.0-xfrac)*     yfrac *dataval(islot,  jslot+1)
 98 |                  +      xfrac *(1.0-yfrac)*dataval(islot+1,jslot)
 99 |                  + (1.0-xfrac)*(1.0-yfrac)*dataval(islot,  jslot);
100 |    }
101 | }
102 | 
103 | 


--------------------------------------------------------------------------------
/tablelarge.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
  3 |  *  All rights Reserved.
  4 |  *
  5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
  6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
  7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
  8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
  9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
 10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
 11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
 12 |  * to produce derivative works, such modified software should be clearly marked,
 13 |  * so as not to confuse it with the version available from LANL.   
 14 |  *
 15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 16 |  * use this file except in compliance with the License. You may obtain a copy
 17 |  * of the License at 
 18 |  *
 19 |  * http://www.apache.org/licenses/LICENSE-2.0
 20 |  *
 21 |  * Unless required by applicable law or agreed to in writing, software distributed
 22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 24 |  * specific language governing permissions and limitations under the License.
 25 |  *
 26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
 27 |  *
 28 |  */
 29 | 
 30 | /*
 31 |  *  Authors: Bob Robey       XCP-2   brobey@lanl.gov
 32 |  *           David Nicholaeff        dnic@lanl.gov, mtrxknight@aol.com
 33 |  *           Rachel Robey            rnrobey@gmail.com
 34 |  */
 35 | 
 36 | #include <stdlib.h>
 37 | #include <stdio.h>
 38 | #include <math.h>
 39 | #include <time.h>
 40 | #include <sys/time.h>
 41 | #include "gpu.h"
 42 | #include "timer.h"
 43 | 
 44 | cl_kernel interpolate_kernel;
 45 | 
 46 | #ifdef HAVE_CONFIG_H
 47 | #include "config.h"
 48 | #endif
 49 | 
 50 | #ifdef __APPLE_CC__
 51 | #include <OpenCL/OpenCL.h>
 52 | #else
 53 | #include <CL/cl.h>
 54 | #endif
 55 | 
 56 | #ifdef HAVE_CL_DOUBLE
 57 | typedef double real;
 58 | typedef cl_double cl_real;
 59 | typedef cl_double4 cl_real4;
 60 | #define EPS 1.0e-8
 61 | #else
 62 | typedef float real;
 63 | typedef cl_float cl_real;
 64 | typedef cl_float4 cl_real4;
 65 | #define EPS 1.0e-5
 66 | #endif
 67 | 
 68 | #define TILE_SIZE 256
 69 | #define dataval(x,y) data[(x)+((y)*xstride)]
 70 | 
 71 | cl_context context;
 72 | cl_command_queue queue;
 73 | cl_program program;
 74 | int is_nvidia=0;
 75 | 
 76 | double random_normal_dist(void);
 77 | double *interpolate_bruteforce(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis,
 78 |       double *density_array, double *temp_array, double *data);
 79 | double *interpolate_bisection(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis,
 80 |       double *density_array, double *temp_array, double *data);
 81 | int bisection(double *axis, int axis_size, double value);
 82 | double *interpolate_hashcpu(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis,
 83 |       double *density_array, double *temp_array, double *data);
 84 | #ifdef HAVE_OPENCL
 85 | cl_mem interpolate_hashgpu(int isize, int xstride, int density_axis_size, int temp_axis_size, cl_mem density_axis_buffer, cl_mem temp_axis_buffer,
 86 |       cl_mem density_array_buffer, cl_mem temp_array_buffer, cl_mem data_buffer, double *time);
 87 | #endif
 88 | 
 89 | #include "tablelarge.data"
 90 | 
 91 | int main(int argc, char *argv[])
 92 | {
 93 |    cl_int error;
 94 | 
 95 | #ifdef HAVE_OPENCL
 96 |    GPUInit(&context, &queue, &is_nvidia, &program, "table_kern.cl");
 97 | 
 98 |    interpolate_kernel = clCreateKernel(program, "interpolate_kernel", &error);
 99 |    if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__);
100 | #endif
101 | 
102 |    int i;
103 | 
104 |    double temp, density;
105 | 
106 |    cl_int ierror;
107 |    int data_size = sizeof(data)/sizeof(data[0]);
108 |    int density_axis_size = sizeof(density_axis)/sizeof(density_axis[0]);
109 |    int temp_axis_size = sizeof(temp_axis)/sizeof(temp_axis[0]);
110 | 
111 |    double density_increment = (density_axis[density_axis_size-1]-density_axis[0])/(double)(density_axis_size-1);
112 |    double temp_increment = (temp_axis[temp_axis_size-1]-temp_axis[0])/(double)(temp_axis_size-1);
113 | 
114 |    double density_avg = (density_axis[density_axis_size-1]+density_axis[0])/2.0;
115 |    double temp_avg = (temp_axis[temp_axis_size-1]+temp_axis[0])/2.0;
116 | 
117 |    double density_stddev = (density_axis[density_axis_size-1]-density_axis[0])/6.0;
118 |    double temp_stddev = (temp_axis[temp_axis_size-1]-temp_axis[0])/6.0;
119 | 
120 |    for (i=1; i<density_axis_size; i++){
121 |       density_axis[i] = density_axis[0]+(double)i*density_increment;
122 |    }
123 | 
124 |    for (i=1; i<temp_axis_size; i++){
125 |       temp_axis[i] = temp_axis[0]+(double)i*temp_increment;
126 |    }
127 | 
128 | #ifdef HAVE_OPENCL
129 |    real *data_real = (real *)malloc(data_size*sizeof(real));
130 |    for (i=0; i<data_size; i++) { data_real[i] = (real)data[i]; }
131 |    cl_mem data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, data_size*sizeof(real), NULL, &ierror);
132 |    if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
133 |    ierror = clEnqueueWriteBuffer(queue, data_buffer, CL_TRUE, 0, data_size*sizeof(real), data_real, 0, NULL, NULL);
134 |    if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
135 |    free(data_real);
136 | 
137 |    real *density_axis_real = (real *)malloc(density_axis_size*sizeof(real));
138 |    for (i=0; i<density_axis_size; i++) { density_axis_real[i] = (real)density_axis[i]; }
139 |    cl_mem density_axis_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, density_axis_size*sizeof(real), NULL, &ierror);
140 |    if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
141 |    ierror = clEnqueueWriteBuffer(queue, density_axis_buffer, CL_TRUE, 0, density_axis_size*sizeof(real), density_axis_real, 0, NULL, NULL);
142 |    if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
143 |    free(density_axis_real);
144 | 
145 |    real *temp_axis_real = (real *)malloc(temp_axis_size*sizeof(real));
146 |    for (i=0; i<temp_axis_size; i++) { temp_axis_real[i] = (real)temp_axis[i]; }
147 |    cl_mem temp_axis_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, temp_axis_size*sizeof(real), NULL, &ierror);
148 |    if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
149 |    ierror = clEnqueueWriteBuffer(queue, temp_axis_buffer, CL_TRUE, 0, temp_axis_size*sizeof(real), temp_axis_real, 0, NULL, NULL);
150 |    if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
151 |    free(temp_axis_real);
152 | #endif
153 | 
154 |    printf("\n    Table Interpolate Performance Results\n\n");
155 | 
156 |    printf("Size,   \tBrute,    \tBisection \tHash CPU, \tHash GPU\n");
157 | 
158 |    double *value_gold, *value_test;
159 | 
160 |    int isize;
161 |    for( isize = 64; isize <= 50000000; isize*=2 ) {
162 |       printf("%d\t",isize);
163 | 
164 |       // Initialize look-up data
165 |       double *temp_array=(double *)malloc(isize*sizeof(double));
166 |       double *density_array=(double *)malloc(isize*sizeof(double));
167 | 
168 |       for (i = 0; i<isize; i++){
169 |          temp_array[i]    = random_normal_dist()*temp_stddev    + temp_avg;
170 |          density_array[i] = random_normal_dist()*density_stddev + density_avg;
171 |       }
172 | 
173 |       int xstride = density_axis_size;
174 |       struct timespec tstart;
175 |       double time_sum;
176 | 
177 |       // call data table interpolation routine
178 |       cpu_timer_start(&tstart);
179 |       value_gold = interpolate_bruteforce(isize, xstride, density_axis_size, temp_axis_size, density_axis, temp_axis,
180 |          density_array, temp_array, data);
181 |       time_sum += cpu_timer_stop(tstart);
182 |       printf("\t%.6lf,", time_sum);
183 | 
184 |       cpu_timer_start(&tstart);
185 |       value_test = interpolate_bisection(isize, xstride, density_axis_size, temp_axis_size, density_axis, temp_axis,
186 |          density_array, temp_array, data);
187 |       time_sum += cpu_timer_stop(tstart);
188 |       printf("\t%.6lf,", time_sum);
189 | 
190 |       for (i= 0; i<isize; i++){
191 |          if (value_test[i] != value_gold[i]){
192 |             printf("Warning %d does not match -- test %lf gold %lf\n",i,value_test[i],value_gold[i]);
193 |          }
194 |       }
195 | 
196 |       free(value_test);
197 | 
198 |       cpu_timer_start(&tstart);
199 |       value_test = interpolate_hashcpu(isize, xstride, density_axis_size, temp_axis_size, density_axis, temp_axis,
200 |          density_array, temp_array, data);
201 |       time_sum += cpu_timer_stop(tstart);
202 |       printf("\t%.6lf,", time_sum);
203 | 
204 |       for (i= 0; i<isize; i++){
205 |          if (value_test[i] != value_gold[i]){
206 |             printf("Warning %d does not match -- test %lf gold %lf\n",i,value_test[i],value_gold[i]);
207 |          }
208 |       }
209 | 
210 |       free(value_test);
211 | 
212 | #ifdef XXX
213 |       real *density_array_real = (real *)malloc(isize*sizeof(real));
214 |       for (i=0; i<isize; i++) { density_array_real[i] = (real)density_array[i]; }
215 |       cl_mem density_array_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, isize*sizeof(real), NULL, &ierror);
216 |       if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
217 |       ierror = clEnqueueWriteBuffer(queue, density_array_buffer, CL_TRUE, 0, isize*sizeof(real), density_array_real, 0, NULL, NULL);
218 |       if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
219 |       free(density_array_real);
220 | 
221 |       real *temp_array_real = (real *)malloc(isize*sizeof(real));
222 |       for (i=0; i<isize; i++) { temp_array_real[i] = (real)temp_array[i]; }
223 |       cl_mem temp_array_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, isize*sizeof(real), NULL, &ierror);
224 |       if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
225 |       ierror = clEnqueueWriteBuffer(queue, temp_array_buffer, CL_TRUE, 0, isize*sizeof(real), temp_array_real, 0, NULL, NULL);
226 |       if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
227 |       free(temp_array_real);
228 | 
229 |       cl_mem value_buffer = interpolate_hashgpu(isize, xstride, density_axis_size, temp_axis_size, density_axis_buffer, temp_axis_buffer,
230 |          density_array_buffer, temp_array_buffer, data_buffer, &time_sum);
231 |       printf("\t%.6lf,", time_sum);
232 | 
233 |       clReleaseMemObject(density_array_buffer);
234 |       clReleaseMemObject(temp_array_buffer);
235 | 
236 |       real *value_array_real = (real *)malloc(isize*sizeof(real));
237 |       
238 |       ierror = clEnqueueReadBuffer(queue, value_buffer, CL_TRUE, 0, isize*sizeof(real), value_array_real, 0, NULL, NULL);
239 |       if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
240 | 
241 |       clReleaseMemObject(value_buffer);
242 |     
243 |       value_test = (double *)malloc(isize*sizeof(double));
244 |       for (i=0; i<isize; i++) { value_test[i] = (double)value_array_real[i]; }
245 |     
246 |       for (i= 0; i<isize; i++){
247 |          if (fabs(value_test[i] - value_gold[i]) > EPS ){
248 |             printf("Warning %d does not match -- test %lf gold %lf\n",i,value_test[i],value_gold[i]);
249 |          }
250 |       }
251 | 
252 |       free(value_test);
253 | #endif
254 | 
255 |       printf("\n");
256 | 
257 |       free(value_gold);
258 |    }
259 | 
260 | #ifdef HAVE_OPENCL
261 |    clReleaseMemObject(data_buffer);
262 |    clReleaseMemObject(density_axis_buffer);
263 |    clReleaseMemObject(temp_axis_buffer);
264 | #endif
265 | }
266 | 
267 | 
/*
 * Returns the sum of three pseudo-random draws, each obtained by rescaling
 * a drand48() sample with 2.0*u - 1.0. Summing a few uniform samples gives
 * a bell-shaped spread centred on zero.
 *
 * Every call consumes three values from the drand48() generator.
 */
double random_normal_dist(void)
{
    double sum = 0.0;

    for (int draw = 0; draw < 3; draw++) {
        sum += 2.0*drand48() - 1.0;
    }

    return(sum);
}
279 | 
/*
 * Bi-linear table interpolation that locates each query's cell by walking
 * the axes from the start.
 *
 * For each of the isize (density_array[i], temp_array[i]) pairs the axes are
 * scanned until the interval containing the value is found. The slot never
 * exceeds axis_size-2, so values past either end of an axis are extrapolated
 * from the first or last interval.
 *
 * The table is read through the dataval() macro, which is defined elsewhere
 * in this file (presumably indexing `data` with `xstride` -- confirm there).
 *
 * Returns a malloc'ed array of isize results; the caller frees it.
 */
double *interpolate_bruteforce(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis,
      double *density_array, double *temp_array, double *data)
{
   double *value_array = (double *)malloc(isize*sizeof(double));

   for (int i = 0; i < isize; i++) {
      const double density = density_array[i];
      const double temp = temp_array[i];

      // Walk each axis until the next grid point is no longer below the query
      int temp_slot = 0;
      while (temp_slot < temp_axis_size-2 && temp > temp_axis[temp_slot+1]) {
         temp_slot++;
      }

      int density_slot = 0;
      while (density_slot < density_axis_size-2 && density > density_axis[density_slot+1]) {
         density_slot++;
      }

      // Fractional position of the query inside its cell
      const double xfrac = (density-density_axis[density_slot])/(density_axis[density_slot+1]-density_axis[density_slot]);
      const double yfrac = (temp-temp_axis[temp_slot])/(temp_axis[temp_slot+1]-temp_axis[temp_slot]);

      // Corner weights, summed in the same order as the four table corners below
      const double w_hi_hi =      xfrac *     yfrac;
      const double w_lo_hi = (1.0-xfrac)*     yfrac;
      const double w_hi_lo =      xfrac *(1.0-yfrac);
      const double w_lo_lo = (1.0-xfrac)*(1.0-yfrac);

      value_array[i] = w_hi_hi*dataval(density_slot+1,temp_slot+1)
                     + w_lo_hi*dataval(density_slot,  temp_slot+1)
                     + w_hi_lo*dataval(density_slot+1,temp_slot)
                     + w_lo_lo*dataval(density_slot,  temp_slot);
   }

   return(value_array);
}
304 | 
/*
 * Bi-linear table interpolation that locates each query's cell with a
 * bisection search on each axis.
 *
 * bisection() is handed axis_size-2 as the highest slot it may return, so
 * slot+1 is always a valid axis index.
 *
 * The table is read through the dataval() macro, which is defined elsewhere
 * in this file (presumably indexing `data` with `xstride` -- confirm there).
 *
 * Returns a malloc'ed array of isize results; the caller frees it.
 */
double *interpolate_bisection(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis,
      double *density_array, double *temp_array, double *data)
{
   double *value_array = (double *)malloc(isize*sizeof(double));

   const int last_temp_slot = temp_axis_size-2;
   const int last_density_slot = density_axis_size-2;

   for (int i = 0; i < isize; i++) {
      const double density = density_array[i];
      const double temp = temp_array[i];

      const int temp_slot = bisection(temp_axis, last_temp_slot, temp);
      const int density_slot = bisection(density_axis, last_density_slot, density);

      // Fractional position of the query inside its cell
      const double xfrac = (density-density_axis[density_slot])/(density_axis[density_slot+1]-density_axis[density_slot]);
      const double yfrac = (temp-temp_axis[temp_slot])/(temp_axis[temp_slot+1]-temp_axis[temp_slot]);

      // Accumulate the four weighted corners one at a time, upper row first
      double value =      xfrac *     yfrac *dataval(density_slot+1,temp_slot+1);
      value       += (1.0-xfrac)*     yfrac *dataval(density_slot,  temp_slot+1);
      value       +=      xfrac *(1.0-yfrac)*dataval(density_slot+1,temp_slot);
      value       += (1.0-xfrac)*(1.0-yfrac)*dataval(density_slot,  temp_slot);

      value_array[i] = value;
   }

   return(value_array);
}
326 | 
/*
 * Binary search for the interval of an ascending axis that contains value.
 *
 * axis      - axis values; entries 1..axis_size may be read
 * axis_size - highest slot that may be returned (the callers in this file
 *             pass the axis length minus two)
 * value     - query value
 *
 * Returns the largest slot s in [0, axis_size] with axis[s] <= value, or 0
 * when value lies below axis[1]. axis[0] itself is never compared.
 */
int bisection(double *axis, int axis_size, double value)
{
   int lower = 0;

   // Invariant: the answer lies in [lower, upper); stop once one slot is left
   for (int upper = axis_size+1; upper - lower > 1; ) {
      const int middle = (upper + lower)/2;

      if (value >= axis[middle]) {
         lower = middle;
      } else {
         upper = middle;
      }
   }

   return(lower);
}
341 | 
/*
 * Converts a query value into an interval index on a uniformly spaced axis.
 *
 * The raw index (value-axis[0])/increment is limited to [0, axis_size-2]
 * before it is converted to int, so that slot+1 is always a valid axis index.
 * The comparison is written so that a NaN position also maps to slot 0
 * instead of reaching the float-to-int conversion.
 */
static int hashcpu_axis_slot(const double *axis, int axis_size, double value, double increment)
{
   const int last_slot = axis_size-2;
   const double position = (value-axis[0])/increment;

   if (!(position > 0.0)) return(0);
   if (position >= (double)last_slot) return(last_slot);

   return((int)position);
}

/*
 * Bi-linear table interpolation that finds each query's cell arithmetically
 * (a perfect hash) instead of searching the axes.
 *
 * The slot is computed from a single average increment per axis, so the
 * axes are assumed to be uniformly spaced and to hold at least two points.
 * Queries at or beyond either end of an axis use the first or last interval,
 * matching interpolate_bruteforce() and interpolate_bisection(); previously
 * such queries indexed past the axis and table.
 *
 * The table is read through the dataval() macro, which is defined elsewhere
 * in this file (presumably indexing `data` with `xstride` -- confirm there).
 *
 * Returns a malloc'ed array of isize results; the caller frees it.
 */
double *interpolate_hashcpu(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis,
      double *density_array, double *temp_array, double *data)
{
   int i;
   // Computes a constant increment for each axis data look-up
   double density_increment = (density_axis[density_axis_size-1]-density_axis[0])/(double)(density_axis_size-1);
   double temp_increment = (temp_axis[temp_axis_size-1]-temp_axis[0])/(double)(temp_axis_size-1);

   double *value_array=(double *)malloc(isize*sizeof(double));

   for (i = 0; i<isize; i++){
      // Determine the interval for interpolation (clamped to the table) and the fraction in the interval
      int temp_slot = hashcpu_axis_slot(temp_axis, temp_axis_size, temp_array[i], temp_increment);
      int density_slot = hashcpu_axis_slot(density_axis, density_axis_size, density_array[i], density_increment);

      double xfrac = (density_array[i]-density_axis[density_slot])/(density_axis[density_slot+1]-density_axis[density_slot]);
      double yfrac = (temp_array[i]-temp_axis[temp_slot])/(temp_axis[temp_slot+1]-temp_axis[temp_slot]);
      // Bi-linear interpolation
      value_array[i] =      xfrac *     yfrac *dataval(density_slot+1,temp_slot+1) 
                     + (1.0-xfrac)*     yfrac *dataval(density_slot,  temp_slot+1)
                     +      xfrac *(1.0-yfrac)*dataval(density_slot+1,temp_slot)
                     + (1.0-xfrac)*(1.0-yfrac)*dataval(density_slot,  temp_slot);
   }

   return(value_array);
}
368 | 
#ifdef HAVE_OPENCL
/*
 * Runs the hash-based table interpolation on the OpenCL device.
 *
 * Creates a device buffer for isize `real` results, binds the thirteen
 * arguments of interpolate_kernel, launches it over isize work-items rounded
 * up to a whole number of TILE_SIZE work-groups, and waits for completion.
 *
 * isize                 - number of query points
 * xstride               - not used by this function
 * density_axis_size,
 * temp_axis_size        - number of points on each axis
 * *_axis_buffer,
 * *_array_buffer,
 * data_buffer           - device buffers holding axes, queries and the table
 * time                  - receives the kernel execution time in seconds, taken
 *                         from the launch event's profiling timestamps
 *
 * Returns the device buffer holding the results; the caller releases it with
 * clReleaseMemObject().
 *
 * Uses the file-scope `context`, `queue` and `interpolate_kernel`.
 * NOTE(review): the profiling queries presumably require `queue` to have been
 * created with profiling enabled -- confirm where the queue is created.
 */
cl_mem interpolate_hashgpu(int isize, int xstride, int density_axis_size, int temp_axis_size, cl_mem density_axis_buffer, cl_mem temp_axis_buffer,
      cl_mem density_array_buffer, cl_mem temp_array_buffer, cl_mem data_buffer, double *time)
{
   cl_int ierror;

   *time = 0.0;

   // NOTE(review): `data` is not a parameter here, so this refers to a file-scope
   // object. The element count is only correct if that object is an array rather
   // than a pointer -- confirm against its declaration.
   int data_size = sizeof(data)/sizeof(data[0]);

   cl_mem value_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, isize*sizeof(real), NULL, &ierror);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);

   // Arguments 0-3: problem sizes
   ierror = clSetKernelArg(interpolate_kernel, 0, sizeof(cl_uint), &isize);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clSetKernelArg(interpolate_kernel, 1, sizeof(cl_uint), &density_axis_size);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clSetKernelArg(interpolate_kernel, 2, sizeof(cl_uint), &temp_axis_size);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clSetKernelArg(interpolate_kernel, 3, sizeof(cl_uint), &data_size);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   // Arguments 4-6: device copies of the axes and the table
   ierror = clSetKernelArg(interpolate_kernel, 4, sizeof(cl_mem), (void*)&density_axis_buffer);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clSetKernelArg(interpolate_kernel, 5, sizeof(cl_mem), (void*)&temp_axis_buffer);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clSetKernelArg(interpolate_kernel, 6, sizeof(cl_mem), (void*)&data_buffer);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   // Arguments 7-9: a size with a NULL value reserves per-work-group scratch space
   ierror = clSetKernelArg(interpolate_kernel, 7, density_axis_size*sizeof(cl_real), NULL);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clSetKernelArg(interpolate_kernel, 8, temp_axis_size*sizeof(cl_real), NULL);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clSetKernelArg(interpolate_kernel, 9, data_size*sizeof(cl_real), NULL);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   // Arguments 10-12: query points in, interpolated values out
   ierror = clSetKernelArg(interpolate_kernel, 10, sizeof(cl_mem), (void*)&density_array_buffer);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clSetKernelArg(interpolate_kernel, 11, sizeof(cl_mem), (void*)&temp_array_buffer);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clSetKernelArg(interpolate_kernel, 12, sizeof(cl_mem), (void*)&value_buffer);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);

   size_t local_work_size[1];
   size_t global_work_size[1];

   // Round the global size up to a whole number of work-groups
   local_work_size[0] = TILE_SIZE;
   global_work_size[0] = ((isize+local_work_size[0]-1)/local_work_size[0])*local_work_size[0];

   cl_event interpolate_event;

   ierror = clEnqueueNDRangeKernel(queue, interpolate_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &interpolate_event);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);

   // Profiling timestamps are cl_ulong nanosecond counters; a plain `long` has the
   // wrong size on platforms where long is 32 bits. Zero-initialised so a failed
   // query reports an elapsed time of zero.
   cl_ulong gpu_time_start = 0;
   cl_ulong gpu_time_end = 0;

   ierror = clWaitForEvents(1,&interpolate_event);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clGetEventProfilingInfo(interpolate_event, CL_PROFILING_COMMAND_START, sizeof(gpu_time_start), &gpu_time_start, NULL);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   ierror = clGetEventProfilingInfo(interpolate_event, CL_PROFILING_COMMAND_END, sizeof(gpu_time_end), &gpu_time_end, NULL);
   if (ierror != CL_SUCCESS) printf("Error is %d at line %d\n",ierror,__LINE__);
   cl_ulong gpu_time = gpu_time_end - gpu_time_start;
   clReleaseEvent(interpolate_event);

   // Nanoseconds to seconds
   *time = (double)gpu_time*1.0e-9;

   return(value_buffer);
}
#endif
434 | 
435 | 


--------------------------------------------------------------------------------
/tablelarge_kern.cl:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
  3 |  *  All rights Reserved.
  4 |  *
  5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
  6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
  7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
  8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
  9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
 10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
 11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
 12 |  * to produce derivative works, such modified software should be clearly marked,
 13 |  * so as not to confuse it with the version available from LANL.   
 14 |  *
 15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 16 |  * use this file except in compliance with the License. You may obtain a copy
 17 |  * of the License at 
 18 |  *
 19 |  * http://www.apache.org/licenses/LICENSE-2.0
 20 |  *
 21 |  * Unless required by applicable law or agreed to in writing, software distributed
 22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 24 |  * specific language governing permissions and limitations under the License.”
 25 |  *
 26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
 27 |  *
 28 |  */
 29 | 
 30 | /*
 31 |  *  Authors: Bob Robey       XCP-2   brobey@lanl.gov
 32 |  *           David Nicholaeff        dnic@lanl.gov, mtrxknight@aol.com
 33 |  *           Rachel Robey            rnrobey@gmail.com
 34 |  */
 35 | 
 36 | /* table_kern.cl */
 37 | 
 38 | #ifdef HAVE_CL_DOUBLE
 39 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 40 | typedef double  real;
 41 | #else
 42 | typedef float   real;
 43 | #endif
 44 | 
 45 | #define dataval(x,y) data[(x)+((y)*xstride)]
 46 | 
 47 | __kernel void interpolate_kernel(
 48 |    const uint isize,
 49 |    const uint xaxis_size,
 50 |    const uint yaxis_size,
 51 |    const uint data_size,
 52 |    __global const real *xaxis_buffer,
 53 |    __global const real *yaxis_buffer,
 54 |    __global const real *data_buffer,
 55 |    __local        real *xaxis,
 56 |    __local        real *yaxis,
 57 |    __local        real *data,
 58 |    __global const real *x_array,
 59 |    __global const real *y_array,
 60 |    __global       real *value
 61 |    )
 62 | {
 63 |    const uint tid = get_local_id(0);
 64 |    const uint wgs = get_local_size(0);
 65 |    const uint gid = get_global_id(0);
 66 | 
 67 |    // Loads the axis data values
 68 |    if (tid < xaxis_size) xaxis[tid]=xaxis_buffer[tid];
 69 |    if (tid < yaxis_size) yaxis[tid]=yaxis_buffer[tid];
 70 | 
 71 |    // Loads the data table
 72 |    for (uint wid = tid; wid<data_size; wid+=wgs){
 73 |       data[wid] = data_buffer[wid];
 74 |    }
 75 |    // Need to synchronize before table queries
 76 |    barrier(CLK_LOCAL_MEM_FENCE);
 77 | 
 78 |    // Computes a constant increment for each axis data look-up
 79 |    real x_increment = (xaxis[110]-xaxis[0])/110.0;
 80 |    real y_increment = (yaxis[77]-yaxis[0])/77.0;
 81 | 
 82 |    int xstride = 111;
 83 | 
 84 |    if (gid < isize) {
 85 |       // Loads the next data value
 86 |       real xdata = x_array[gid];
 87 |       real ydata = y_array[gid];
 88 | 
 89 |       // Determine the interval for interpolation and the fraction in the interval
 90 |       int islot = (int)((xdata-xaxis[0])/x_increment);
 91 |       int jslot = (int)((ydata-yaxis[0])/y_increment);
 92 |       real xfrac = (xdata-xaxis[islot])/(xaxis[islot+1]-xaxis[islot]);
 93 |       real yfrac = (ydata-yaxis[jslot])/(yaxis[jslot+1]-yaxis[jslot]);
 94 | 
 95 |       // Bi-linear interpolation
 96 |       value[gid] =      xfrac *     yfrac *dataval(islot+1,jslot+1)
 97 |                  + (1.0-xfrac)*     yfrac *dataval(islot,  jslot+1)
 98 |                  +      xfrac *(1.0-yfrac)*dataval(islot+1,jslot)
 99 |                  + (1.0-xfrac)*(1.0-yfrac)*dataval(islot,  jslot);
100 |    }
101 | }
102 | 
103 | 


--------------------------------------------------------------------------------
/timer.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Copyright (c) 2012-2019, Triad National Security, LLC.
 3 |  *  All rights Reserved.
 4 |  *
 5 |  * Copyright 2012-2019.  Triad National Security, LLC. This material was produced
 6 |  * under U.S. Government contract 89233218CNA000001 for Los Alamos National 
 7 |  * Laboratory (LANL), which is operated by Triad National Security, LLC
 8 |  * for the U.S. Department of Energy. The U.S. Government has rights to use,
 9 |  * reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
10 |  * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
11 |  * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
12 |  * to produce derivative works, such modified software should be clearly marked,
13 |  * so as not to confuse it with the version available from LANL.   
14 |  *
15 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
16 |  * use this file except in compliance with the License. You may obtain a copy
17 |  * of the License at 
18 |  *
19 |  * http://www.apache.org/licenses/LICENSE-2.0
20 |  *
21 |  * Unless required by applicable law or agreed to in writing, software distributed
22 |  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
23 |  * CONDITIONS OF ANY KIND, either express or implied. See the License for the
24 |  * specific language governing permissions and limitations under the License.”
25 |  *
26 |  * This is LANL Copyright Disclosure C13002/LA-CC-12-022
27 |  *
28 |  *  Authors: Bob Robey       XCP-2   brobey@lanl.gov
29 |  *           David Nicholaeff        dnic@lanl.gov, mtrxknight@aol.com
30 |  *           Rachel Robey            rnrobey@gmail.com
31 |  * 
32 |  */
33 | #include "timer.h"
34 | 
/*
 * Stores the current CLOCK_MONOTONIC reading in *tstart_cpu.
 * Pass the stored value to cpu_timer_stop() to obtain the elapsed time.
 */
void cpu_timer_start(struct timespec *tstart_cpu)
{
   struct timespec *const start_slot = tstart_cpu;

   clock_gettime(CLOCK_MONOTONIC, start_slot);
}
/*
 * Returns the seconds elapsed on CLOCK_MONOTONIC since tstart_cpu was
 * captured.
 */
double cpu_timer_stop(struct timespec tstart_cpu)
{
   struct timespec now;
   clock_gettime(CLOCK_MONOTONIC, &now);

   // The nanosecond difference may be negative; adding it to the whole-second
   // difference in floating point gives the right total without a borrow step.
   const double whole_seconds = (double)(now.tv_sec - tstart_cpu.tv_sec);
   const double nanoseconds = (double)(now.tv_nsec - tstart_cpu.tv_nsec);

   return(whole_seconds + nanoseconds*1.0e-9);
}
49 | 


--------------------------------------------------------------------------------
/timer.h:
--------------------------------------------------------------------------------
/* Wall-clock interval timer built on clock_gettime(CLOCK_MONOTONIC). */
#ifndef TIMER_H
#define TIMER_H
#include <time.h>

/* Stores the current monotonic clock reading in *tstart_cpu. */
void cpu_timer_start(struct timespec *tstart_cpu);

/* Returns the seconds elapsed since tstart_cpu was filled in by cpu_timer_start(). */
double cpu_timer_stop(struct timespec tstart_cpu);
#endif
8 | 


--------------------------------------------------------------------------------