├── .gitignore ├── APACHE_LICENSE-2_0 ├── AUTHORS ├── CMakeLists.txt ├── LICENSE ├── NOTICE ├── Papers.bib ├── README ├── README.md ├── ReleaseAuthorization.pdf ├── gpu.c ├── gpu.h ├── kdtree ├── Bounds1d.c ├── Bounds1d.h ├── Bounds2d.c ├── Bounds2d.h ├── CMakeLists.txt ├── Globals1d.h ├── Globals2d.h ├── KDTree1d.c ├── KDTree1d.h ├── KDTree2d.c └── KDTree2d.h ├── neigh.c ├── neigh2d.c ├── neigh2d_kern.cl ├── neigh_kern.cl ├── remap.c ├── remap2d.c ├── remap2d_kern.cl ├── remap_kern.cl ├── sort.c ├── sort2d.c ├── sort2d_kern.cl ├── sort_kern.cl ├── table.c ├── table.data ├── table_kern.cl ├── tablelarge.c ├── tablelarge.data ├── tablelarge_kern.cl ├── timer.c └── timer.h /.gitignore: -------------------------------------------------------------------------------- 1 | # Object files 2 | *.o 3 | *.ko 4 | 5 | # Libraries 6 | *.lib 7 | *.a 8 | 9 | # Shared objects (inc. Windows DLLs) 10 | *.dll 11 | *.so 12 | *.so.* 13 | *.dylib 14 | 15 | # Executables 16 | *.exe 17 | *.out 18 | *.app 19 | -------------------------------------------------------------------------------- /APACHE_LICENSE-2_0: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | 204 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Authors: 2 | 3 | Bob Robey XCP-2 (brobey@lanl.gov) 4 | 5 | David Nicholaeff (dnic@lanl.gov, mtrxKnight@aol.com) 6 | 7 | Rachel Robey (rnrobey@gmail.com) 8 | 9 | Marcus Daniels (mdaniels@lanl.gov) 10 | 11 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required (VERSION 3.1) 2 | project (PerfectHash) 3 | 4 | if (DEVICE_DETECT_DEBUG) 5 | add_definitions(-DDEVICE_DETECT_DEBUG=1) 6 | endif (DEVICE_DETECT_DEBUG) 7 | 8 | if (NOT CMAKE_BUILD_TYPE) 9 | set(CMAKE_BUILD_TYPE RelWithDebInfo) 10 | endif() 11 | 12 | find_package(OpenCL) 13 | if (OpenCL_FOUND) 14 | add_definitions(-DHAVE_OPENCL) 15 | set(HAVE_CL_DOUBLE ON CACHE BOOL "Have OpenCL Double") 16 | set(NO_CL_DOUBLE OFF) 17 | include_directories(${OpenCL_INCLUDE_DIRS}) 18 | #message("OpenCL_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS}") 19 | 
#message("OpenCL_LIBRARIES ${OpenCL_LIBRARIES}") 20 | endif (OpenCL_FOUND) 21 | 22 | add_subdirectory(kdtree) 23 | 24 | # Adds build target of sort with source code files 25 | add_executable(sort sort.c gpu.c timer.c gpu.h timer.h) 26 | target_link_libraries(sort ${OpenCL_LIBRARIES} m) 27 | 28 | # Adds build target of sort2d with source code files 29 | add_executable(sort2d sort2d.c gpu.c timer.c gpu.h timer.h) 30 | target_link_libraries(sort2d ${OpenCL_LIBRARIES} m) 31 | 32 | # Adds build target of remap with source code files 33 | add_executable(remap remap.c gpu.c timer.c gpu.h timer.h) 34 | target_link_libraries(remap ${OpenCL_LIBRARIES} kdtree m) 35 | 36 | # Adds build target of remap2d with source code files 37 | add_executable(remap2d remap2d.c gpu.c timer.c gpu.h timer.h) 38 | target_link_libraries(remap2d ${OpenCL_LIBRARIES} kdtree m) 39 | 40 | # Adds build target of neigh with source code files 41 | add_executable(neigh neigh.c gpu.c timer.c gpu.h timer.h) 42 | target_link_libraries(neigh ${OpenCL_LIBRARIES} kdtree m) 43 | 44 | # Adds build target of neigh2d with source code files 45 | add_executable(neigh2d neigh2d.c gpu.c timer.c gpu.h timer.h) 46 | target_link_libraries(neigh2d ${OpenCL_LIBRARIES} kdtree m) 47 | 48 | # Adds build target of table with source code files 49 | add_executable(table table.c gpu.c timer.c gpu.h timer.h) 50 | target_link_libraries(table ${OpenCL_LIBRARIES} m) 51 | 52 | # Adds build target of tablelarge with source code files 53 | add_executable(tablelarge tablelarge.c gpu.c timer.c gpu.h timer.h) 54 | target_link_libraries(tablelarge ${OpenCL_LIBRARIES} m) 55 | 56 | # Cleanup 57 | SET_DIRECTORY_PROPERTIES(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES 58 | "CMakeCache.txt;Makefile;cmake_install.cmake;ipo_out.optrpt") 59 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012-2019, Triad 
National Security, LLC. 3 | * All rights Reserved. 4 | * 5 | * Copyright 2012-2019. Triad National Security, LLC. This material was produced 6 | * under U.S. Government contract 89233218CNA000001 for Los Alamos National 7 | * Laboratory (LANL), which is operated by Triad National Security, LLC 8 | * for the U.S. Department of Energy. The U.S. Government has rights to use, 9 | * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR 10 | * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 11 | * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified 12 | * to produce derivative works, such modified software should be clearly marked, 13 | * so as not to confuse it with the version available from LANL. 14 | * 15 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 16 | * use this file except in compliance with the License. You may obtain a copy 17 | * of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software distributed 22 | * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 23 | * CONDITIONS OF ANY KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations under the License.” 25 | * 26 | * This is LANL Copyright Disclosure C13002/LA-CC-12-022 27 | * 28 | * Authors: Bob Robey XCP-2 brobey@lanl.gov 29 | * David Nicholaeff dnic@lanl.gov, mtrxknight@aol.com 30 | * Rachel Robey rnrobey@gmail.com 31 | * 32 | */ 33 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | This is the code released under LANL Copyright Disclosure C13002/LA-CC-12-022 2 | Copyright 2012-2018. Triad National Security, LLC. This material was produced 3 | under U.S. 
Government contract 89233218CNA000001 for Los Alamos National 4 | Laboratory (LANL), which is operated by Triad National Security, LLC 5 | for the U.S. Department of Energy. See LICENSE file for details. 6 | -------------------------------------------------------------------------------- /Papers.bib: -------------------------------------------------------------------------------- 1 | %% This BibTeX bibliography file was created using BibDesk. 2 | %% http://bibdesk.sourceforge.net/ 3 | 4 | 5 | %% Created for lbrobey at 2014-02-16 09:42:24 -0700 6 | 7 | 8 | %% Saved with string encoding Unicode (UTF-8) 9 | 10 | 11 | 12 | @article{Robey_RN_2013, 13 | Author = {Robey, R.N. and Nicholaeff, D. and Robey, R.W.}, 14 | Date-Added = {2012-04-29 14:16:38 -0600}, 15 | Date-Modified = {2014-02-16 16:41:30 +0000}, 16 | Journal = {SIAM Journal of Scientific Computing}, 17 | Month = {July}, 18 | Number = {4}, 19 | Pages = {C346--C368}, 20 | Title = {Hash-Based Algorithms for Discretized Data}, 21 | Volume = {35}, 22 | Year = {2013}} 23 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | This code is a set of hash functions to support the paper "Hash-based Algorithms 2 | for Discretized Data", published in the SIAM Journal of Scientific 3 | Computing. The publication details are below and in the Papers.bib file in bibtex 4 | format. 5 | 6 | Robey,R.N., Nicholaeff,D., and Robey,R.W. "Hash-Based Algorithms for Discretized Data", 7 | SIAM Journal of Scientific Computing, July 2013, Volume 35, Number 4, C346--C368 8 | 9 | The pre-publication version has the LANL report number LA-UR-12-01566. 10 | 11 | This code has been released under an open-source Apache 2 license to 12 | encourage further development of hashing methods. See the LICENSE file for more 13 | information about the license and the use of this code. 
14 | 15 | Through a web search we hope to gather statistics on the use of the method 16 | and its improvements and help to encourage more open technology transfer by LANL 17 | and other government research organizations. This can be thought of as analogous 18 | to journal article citations, but within software products. If code is not 19 | distributed with a software product, a reference should be provided in a text 20 | file so that attribution can be determined. 21 | 22 | Under this license, it is required to include a reference to this work. We 23 | request that each derivative work contain a reference to LANL Copyright 24 | Disclosure C13002/LA-CC-12-022 so that this work’s impact can be roughly 25 | measured. In addition, it is requested that a modifier is included as in 26 | the following example: 27 | 28 | // LANL Copyright Disclosure C13002/LA-CC-12-022 29 | 30 | This is LANL Copyright Disclosure C13002/LA-CC-12-022 31 | 32 | Authors: Bob Robey XCP-2 brobey@lanl.gov 33 | David Nicholaeff dnic@lanl.gov, mtrxknight@aol.com 34 | Rachel Robey rnrobey@gmail.com 35 | 36 | This code uses cmake for builds. To build the code: 37 | 38 | cmake . 39 | make 40 | 41 | There will be several executables built -- sort, sort2d, neigh, neigh2d, remap, remap2d, table, tablelarge 42 | 43 | Each executable runs all of the methods for the mesh operation. There is a controlling 44 | loop at the top of the main routine that users may want to modify for the algorithms 45 | to be run. Also, there is a random number seed for varying the problem setups that is 46 | normally off, but users may want to turn on for some investigations. Results do vary for 47 | different problems, but the pattern does not change significantly. Also, the OpenCL library 48 | looks for a GPU to use for the OpenCL code. This may need to be modified for your particular 49 | hardware. 
50 | 51 | Output should look something like the following: 52 | 53 | Sorting Performance Results 54 | 55 | Size, Qsort, Heapsort, Mergesort, Hash CPU, Hash GPU 56 | 57 | Max diff is 1 times min_diff 58 | 1024, 0.000105, 0.000168, 0.000123, 0.000010, 0.000294, 59 | 2048, 0.000223, 0.000353, 0.000262, 0.000020, 0.000330, 60 | 4096, 0.000493, 0.000781, 0.000573, 0.000040, 0.000400, 61 | 8192, 0.001036, 0.001683, 0.001135, 0.000079, 0.000610, 62 | 16384, 0.002242, 0.003470, 0.002613, 0.000161, 0.000934, 63 | 32768, 0.005360, 0.007348, 0.005469, 0.000415, 0.001535, 64 | 65536, 0.011800, 0.015796, 0.012560, 0.000800, 0.002629, 65 | 131072, 0.020707, 0.036958, 0.024513, 0.001561, 0.005294, 66 | 262144, 0.042710, 0.075466, 0.052864, 0.003751, 0.011301, 67 | 524288, 0.089662, 0.167398, 0.113505, 0.011855, 0.024291, 68 | 1048576, 0.185135, 0.407333, 0.249813, 0.026447, 0.018890, 69 | 2097152, 0.385789, 1.018970, 0.498638, 0.063970, 0.114473, 70 | 4194304, 0.818016, 2.475966, 1.052133, 0.141997, 0.246392, 71 | 8388608, 1.696586, 5.843146, 2.184568, 0.307373, 0.503904, 72 | 73 | Max diff is 2 times min_diff 74 | 1024, 0.000102, 0.000168, 0.000121, 0.000015, 0.000303, 75 | 2048, 0.000224, 0.000332, 0.000240, 0.000026, 0.000339, 76 | ... 77 | ... 78 | ... 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PerfectHash 2 | =========== 3 | 4 | A perfect hash code for CPUs and GPUs using OpenCL 5 | 6 | This code is a set of hash functions to support the paper "Hash-based Algorithms 7 | for Discretized Data", published in the SIAM Journal of Scientific 8 | Computing. The publication details are below and in the Papers.bib file in bibtex 9 | format. 10 | 11 | Robey,R.N., Nicholaeff,D., and Robey,R.W. 
"Hash-Based Algorithms for Discretized Data", 12 | SIAM Journal of Scientific Computing, July 2013, Volume 35, Number 4, C346--C368 13 | -------------------------------------------------------------------------------- /ReleaseAuthorization.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lanl/PerfectHash/be8c6e1b5afad67dd4c656cd689d441d0e95a433/ReleaseAuthorization.pdf -------------------------------------------------------------------------------- /gpu.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012-2019, Triad National Security, LLC. 3 | * All rights Reserved. 4 | * 5 | * Copyright 2012-2019. Triad National Security, LLC. This material was produced 6 | * under U.S. Government contract 89233218CNA000001 for Los Alamos National 7 | * Laboratory (LANL), which is operated by Triad National Security, LLC 8 | * for the U.S. Department of Energy. The U.S. Government has rights to use, 9 | * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR 10 | * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 11 | * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified 12 | * to produce derivative works, such modified software should be clearly marked, 13 | * so as not to confuse it with the version available from LANL. 14 | * 15 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 16 | * use this file except in compliance with the License. You may obtain a copy 17 | * of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software distributed 22 | * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 23 | * CONDITIONS OF ANY KIND, either express or implied. 
See the License for the 24 | * specific language governing permissions and limitations under the License.” 25 | * 26 | * This is LANL Copyright Disclosure C13002/LA-CC-12-022 27 | * 28 | */ 29 | 30 | /* 31 | * Authors: Bob Robey XCP-2 brobey@lanl.gov 32 | * David Nicholaeff dnic@lanl.gov, mtrxknight@aol.com 33 | * Rachel Robey rnrobey@gmail.com 34 | */ 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include "gpu.h" 41 | 42 | #ifdef HAVE_CL_DOUBLE 43 | typedef double real; 44 | #ifdef HAVE_OPENCL 45 | typedef cl_double cl_real; 46 | typedef cl_double4 cl_real4; 47 | #endif 48 | #else 49 | typedef float real; 50 | #ifdef HAVE_OPENCL 51 | typedef cl_float cl_real; 52 | typedef cl_float4 cl_real4; 53 | #endif 54 | #endif 55 | 56 | #ifndef DEVICE_DETECT_DEBUG 57 | #define DEVICE_DETECT_DEBUG 0 58 | #endif 59 | 60 | #ifdef HAVE_OPENCL 61 | void GPUInit(cl_context *context, cl_command_queue *queue, int *is_nvidia, cl_program *program, char *filename) { 62 | 63 | cl_platform_id* platforms; 64 | cl_platform_id platform = NULL; 65 | cl_uint num_platforms; 66 | cl_uint num_devices; 67 | cl_device_id* devices; 68 | cl_uint nDevices_selected=0; 69 | int *device_appropriate; 70 | int device_selected = -99; 71 | cl_int platform_selected = -1; 72 | //cl_program program; 73 | cl_int ierr = 0; 74 | 75 | // Get the number of platforms first, then allocate and get the platform 76 | ierr = clGetPlatformIDs(0, NULL, &num_platforms); 77 | if (ierr != CL_SUCCESS){ 78 | printf("GPU_INIT: Error with clGetPlatformIDs call in file %s at line %d\n", __FILE__, __LINE__); 79 | if (ierr == CL_INVALID_VALUE){ 80 | printf("GPU_INIT: Invalid value in clGetPlatformID call\n"); 81 | } 82 | exit(ierr); 83 | } 84 | if (num_platforms == 0) { 85 | printf("GPU_INIT: Error -- No opencl platforms detected in file %s at line %d\n", __FILE__, __LINE__); 86 | exit(-1); 87 | } 88 | if (DEVICE_DETECT_DEBUG){ 89 | printf("\n\nGPU_INIT: %d opencl platform(s) detected\n",num_platforms); 90 | } 91 
| 92 | platforms = (cl_platform_id *)malloc(num_platforms*sizeof(cl_platform_id)); 93 | 94 | ierr = clGetPlatformIDs(num_platforms, platforms, NULL); 95 | if (ierr != CL_SUCCESS){ 96 | printf("GPU_INIT: Error with clGetPlatformIDs call in file %s at line %d\n", __FILE__, __LINE__); 97 | if (ierr == CL_INVALID_VALUE){ 98 | printf("Invalid value in clGetPlatformID call\n"); 99 | } 100 | } 101 | 102 | if (DEVICE_DETECT_DEBUG){ 103 | char info[1024]; 104 | for (uint iplatform=0; iplatform 42 | #else 43 | #include 44 | #endif 45 | 46 | extern cl_kernel interpolate_kernel; 47 | 48 | void GPUInit(cl_context *context, cl_command_queue *queue, int *is_nvidia, cl_program *program, char *filename); 49 | int device_double_support(cl_device_id device); 50 | void device_info(cl_device_id device); 51 | -------------------------------------------------------------------------------- /kdtree/Bounds1d.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012-2019, Triad National Security, LLC. 3 | * All rights Reserved. 4 | * 5 | * Copyright 2012-2019. Triad National Security, LLC. This material was produced 6 | * under U.S. Government contract 89233218CNA000001 for Los Alamos National 7 | * Laboratory (LANL), which is operated by Triad National Security, LLC 8 | * for the U.S. Department of Energy. The U.S. Government has rights to use, 9 | * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR 10 | * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 11 | * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified 12 | * to produce derivative works, such modified software should be clearly marked, 13 | * so as not to confuse it with the version available from LANL. 14 | * 15 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 16 | * use this file except in compliance with the License. 
You may obtain a copy 17 | * of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software distributed 22 | * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 23 | * CONDITIONS OF ANY KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations under the License.” 25 | * 26 | * This is LANL Copyright Disclosure C13002/LA-CC-12-022 27 | * 28 | */ 29 | 30 | #include "Bounds1d.h" 31 | 32 | void Bounds_Copy1d(TBounds1d* src, TBounds1d* dest) { 33 | assert(src && dest); 34 | MEMCPY(src, dest, 1, TBounds1d); 35 | } 36 | 37 | void Bounds_Infinite1d(TBounds1d* b){ 38 | assert(b); 39 | b->min.x = POSITIVE_INFINITY; 40 | b->max.x = NEGATIVE_INFINITY; 41 | } 42 | 43 | void Bounds_AddBounds1d(TBounds1d* b, TBounds1d* add) { 44 | assert(b && add); 45 | b->min.x = MIN(b->min.x, add->min.x); 46 | b->max.x = MAX(b->max.x, add->max.x); 47 | } 48 | 49 | void Bounds_AddEpsilon1d(TBounds1d* b, double add) { 50 | assert(b); 51 | b->min.x = b->min.x - add; 52 | b->max.x = b->max.x + add; 53 | } 54 | 55 | boolean Bounds_IsOverlappingBounds1d(TBounds1d* b, TBounds1d* tst) { 56 | assert(b && tst); 57 | if((tst->max.x < b->min.x) || (tst->min.x > b->max.x)) 58 | return(false); 59 | return(true); 60 | } 61 | 62 | double Bounds_WidthAxis1d(TBounds1d* b, unsigned long axis) 63 | { 64 | double width; 65 | 66 | assert(b); 67 | if(axis == XAXIS) 68 | width = b->max.x - b->min.x; 69 | else 70 | assert(NULL); 71 | return(width); 72 | } 73 | 74 | double Bounds_CenterAxis1d(TBounds1d* b, unsigned long axis) 75 | { 76 | double center; 77 | 78 | assert(b); 79 | if(axis == XAXIS) 80 | center = (b->min.x + b->max.x) * 0.5; 81 | else 82 | assert(NULL); 83 | return(center); 84 | } 85 | -------------------------------------------------------------------------------- /kdtree/Bounds1d.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012-2019, Triad National Security, LLC. 3 | * All rights Reserved. 4 | * 5 | * Copyright 2012-2019. Triad National Security, LLC. This material was produced 6 | * under U.S. Government contract 89233218CNA000001 for Los Alamos National 7 | * Laboratory (LANL), which is operated by Triad National Security, LLC 8 | * for the U.S. Department of Energy. The U.S. Government has rights to use, 9 | * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR 10 | * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 11 | * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified 12 | * to produce derivative works, such modified software should be clearly marked, 13 | * so as not to confuse it with the version available from LANL. 14 | * 15 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 16 | * use this file except in compliance with the License. You may obtain a copy 17 | * of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software distributed 22 | * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 23 | * CONDITIONS OF ANY KIND, either express or implied. 
See the License for the 24 | * specific language governing permissions and limitations under the License.” 25 | * 26 | * This is LANL Copyright Disclosure C13002/LA-CC-12-022 27 | * 28 | */ 29 | 30 | #ifndef _Bounds1d_ 31 | #define _Bounds1d_ 32 | 33 | #ifdef __cplusplus 34 | extern "C" 35 | { 36 | #endif 37 | 38 | #include "Globals1d.h" 39 | 40 | typedef struct { 41 | TVector1d min, max; 42 | } TBounds1d; 43 | 44 | extern void Bounds_Copy1d(TBounds1d* src, TBounds1d* dest); 45 | extern void Bounds_Infinite1d(TBounds1d* b); 46 | extern void Bounds_AddBounds1d(TBounds1d* b, TBounds1d* add); 47 | extern void Bounds_AddEpsilon1d(TBounds1d* b, double add); 48 | extern boolean Bounds_IsOverlappingBounds1d(TBounds1d* b, TBounds1d* tst); 49 | extern double Bounds_WidthAxis1d(TBounds1d* b, unsigned long axis); 50 | extern double Bounds_CenterAxis1d(TBounds1d* b, unsigned long axis); 51 | 52 | #ifdef __cplusplus 53 | } 54 | #endif 55 | 56 | #endif 57 | -------------------------------------------------------------------------------- /kdtree/Bounds2d.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012-2019, Triad National Security, LLC. 3 | * All rights Reserved. 4 | * 5 | * Copyright 2012-2019. Triad National Security, LLC. This material was produced 6 | * under U.S. Government contract 89233218CNA000001 for Los Alamos National 7 | * Laboratory (LANL), which is operated by Triad National Security, LLC 8 | * for the U.S. Department of Energy. The U.S. Government has rights to use, 9 | * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR 10 | * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 11 | * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified 12 | * to produce derivative works, such modified software should be clearly marked, 13 | * so as not to confuse it with the version available from LANL. 
14 | * 15 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 16 | * use this file except in compliance with the License. You may obtain a copy 17 | * of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software distributed 22 | * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 23 | * CONDITIONS OF ANY KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations under the License.” 25 | * 26 | * This is LANL Copyright Disclosure C13002/LA-CC-12-022 27 | * 28 | */ 29 | 30 | #include "Bounds2d.h" 31 | 32 | void Bounds_Copy2d(TBounds2d* src, TBounds2d* dest) { 33 | assert(src && dest); 34 | MEMCPY(src, dest, 1, TBounds2d); 35 | } 36 | 37 | void Bounds_Infinite2d(TBounds2d* b){ 38 | assert(b); 39 | b->min.x = POSITIVE_INFINITY; 40 | b->min.y = POSITIVE_INFINITY; 41 | b->max.x = NEGATIVE_INFINITY; 42 | b->max.y = NEGATIVE_INFINITY; 43 | } 44 | 45 | void Bounds_AddBounds2d(TBounds2d* b, TBounds2d* add) { 46 | assert(b && add); 47 | b->min.x = MIN(b->min.x, add->min.x); 48 | b->min.y = MIN(b->min.y, add->min.y); 49 | b->max.x = MAX(b->max.x, add->max.x); 50 | b->max.y = MAX(b->max.y, add->max.y); 51 | } 52 | 53 | void Bounds_AddEpsilon2d(TBounds2d* b, double add) { 54 | assert(b); 55 | b->min.x = b->min.x - add; 56 | b->min.y = b->min.y - add; 57 | b->max.x = b->max.x + add; 58 | b->max.y = b->max.y + add; 59 | } 60 | 61 | boolean Bounds_IsOverlappingBounds2d(TBounds2d* b, TBounds2d* tst) { 62 | assert(b && tst); 63 | if((tst->max.x < b->min.x) || (tst->min.x > b->max.x)) 64 | return(false); 65 | if((tst->max.y < b->min.y) || (tst->min.y > b->max.y)) 66 | return(false); 67 | return(true); 68 | } 69 | 70 | double Bounds_WidthAxis2d(TBounds2d* b, unsigned long axis) 71 | { 72 | double width; 73 | 74 | assert(b); 75 | if(axis == XAXIS) 76 | width = b->max.x - 
b->min.x; 77 | else if(axis == YAXIS) 78 | width = b->max.y - b->min.y; 79 | else 80 | assert(NULL); 81 | return(width); 82 | } 83 | 84 | double Bounds_CenterAxis2d(TBounds2d* b, unsigned long axis) 85 | { 86 | double center; 87 | 88 | assert(b); 89 | if(axis == XAXIS) 90 | center = (b->min.x + b->max.x) * 0.5; 91 | else if(axis == YAXIS) 92 | center = (b->min.y + b->max.y) * 0.5; 93 | else 94 | assert(NULL); 95 | return(center); 96 | } 97 | -------------------------------------------------------------------------------- /kdtree/Bounds2d.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012-2019, Triad National Security, LLC. 3 | * All rights Reserved. 4 | * 5 | * Copyright 2012-2019. Triad National Security, LLC. This material was produced 6 | * under U.S. Government contract 89233218CNA000001 for Los Alamos National 7 | * Laboratory (LANL), which is operated by Triad National Security, LLC 8 | * for the U.S. Department of Energy. The U.S. Government has rights to use, 9 | * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR 10 | * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 11 | * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified 12 | * to produce derivative works, such modified software should be clearly marked, 13 | * so as not to confuse it with the version available from LANL. 14 | * 15 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 16 | * use this file except in compliance with the License. You may obtain a copy 17 | * of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software distributed 22 | * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 23 | * CONDITIONS OF ANY KIND, either express or implied. 
See the License for the 24 | * specific language governing permissions and limitations under the License.” 25 | * 26 | * This is LANL Copyright Disclosure C13002/LA-CC-12-022 27 | * 28 | */ 29 | 30 | #ifndef _Bounds2d_ 31 | #define _Bounds2d_ 32 | 33 | #ifdef __cplusplus 34 | extern "C" 35 | { 36 | #endif 37 | 38 | #include "Globals2d.h" 39 | 40 | typedef struct { 41 | TVector2d min, max; 42 | } TBounds2d; 43 | 44 | extern void Bounds_Copy2d(TBounds2d* src, TBounds2d* dest); 45 | extern void Bounds_Infinite2d(TBounds2d* b); 46 | extern void Bounds_AddBounds2d(TBounds2d* b, TBounds2d* add); 47 | extern void Bounds_AddEpsilon2d(TBounds2d* b, double add); 48 | extern boolean Bounds_IsOverlappingBounds2d(TBounds2d* b, TBounds2d* tst); 49 | extern double Bounds_WidthAxis2d(TBounds2d* b, unsigned long axis); 50 | extern double Bounds_CenterAxis2d(TBounds2d* b, unsigned long axis); 51 | 52 | #ifdef __cplusplus 53 | } 54 | #endif 55 | 56 | #endif 57 | -------------------------------------------------------------------------------- /kdtree/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # enable @rpath in the install name for any shared library being built 2 | set(CMAKE_MACOSX_RPATH 1) 3 | 4 | ########### global settings ############### 5 | set(H_SRCS Bounds1d.h Bounds2d.h Globals1d.h Globals2d.h KDTree1d.h KDTree2d.h) 6 | 7 | set(C_SRCS Bounds1d.c Bounds2d.c KDTree1d.c KDTree2d.c) 8 | 9 | set(kdtree_LIB_SRCS ${C_SRCS} ${H_SRCS}) 10 | 11 | ########### kdtree target ############### 12 | 13 | add_library(kdtree SHARED ${kdtree_LIB_SRCS}) 14 | 15 | set_target_properties(kdtree PROPERTIES VERSION 1.0.0 SOVERSION 2) 16 | install(TARGETS kdtree DESTINATION lib) 17 | 18 | # Cleanup 19 | SET_DIRECTORY_PROPERTIES(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES 20 | "CMakeCache.txt;CMakeFiles;Makefile;cmake_install.cmake;ipo_out.optrpt") 21 | 22 | ########### install files ############### 23 | 24 | #install(FILES KDTree.h DESTINATION 
include) 25 | 26 | #========== original Makefile.am contents follow =========== 27 | 28 | #original Makefile.am contents follow: 29 | 30 | #default: libkdtree.la 31 | #all: libkdtree.la 32 | # 33 | #AM_MAKEFLAGS = -j 4 34 | # 35 | #DEFAULT_INCLUDES=-I. -I.. 36 | # 37 | #lib_LTLIBRARIES = libkdtree.la 38 | #include_HEADERS = KDTree.h 39 | # 40 | #libkdtree_la_SOURCES = ${C_SRCS} ${H_SRCS} 41 | -------------------------------------------------------------------------------- /kdtree/Globals1d.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012-2019, Triad National Security, LLC. 3 | * All rights Reserved. 4 | * 5 | * Copyright 2012-2019. Triad National Security, LLC. This material was produced 6 | * under U.S. Government contract 89233218CNA000001 for Los Alamos National 7 | * Laboratory (LANL), which is operated by Triad National Security, LLC 8 | * for the U.S. Department of Energy. The U.S. Government has rights to use, 9 | * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR 10 | * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 11 | * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified 12 | * to produce derivative works, such modified software should be clearly marked, 13 | * so as not to confuse it with the version available from LANL. 14 | * 15 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 16 | * use this file except in compliance with the License. You may obtain a copy 17 | * of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software distributed 22 | * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 23 | * CONDITIONS OF ANY KIND, either express or implied. 
See the License for the 24 | * specific language governing permissions and limitations under the License.” 25 | * 26 | * This is LANL Copyright Disclosure C13002/LA-CC-12-022 27 | * 28 | */ 29 | 30 | #ifndef _Globals1d_ 31 | #define _Globals1d_ 32 | 33 | #ifdef __cplusplus 34 | extern "C" 35 | { 36 | #endif 37 | 38 | //#define NDEBUG 1 39 | #include 40 | #include 41 | #include 42 | #include 43 | 44 | #ifndef ENTITY_COINCIDENCE_TOLERANCE 45 | #define ENTITY_COINCIDENCE_TOLERANCE ((double)1.0E-5) 46 | 47 | #define KDTREE_ELEMENT_BLOCKING_SIZE ((long)1024) 48 | #endif 49 | 50 | #ifndef POSITIVE_INFINITY 51 | #define POSITIVE_INFINITY (+1.0E+64) 52 | #define NEGATIVE_INFINITY (-1.0E+64) 53 | #endif 54 | 55 | #define XAXIS ((unsigned long)0) 56 | 57 | typedef struct { 58 | double x; 59 | } TVector1d; 60 | 61 | #ifndef _BOOL 62 | #define _BOOL 63 | typedef unsigned char boolean; 64 | #define true ((boolean)1) 65 | #define false ((boolean)0) 66 | #endif 67 | 68 | #ifndef MIN 69 | #define MIN(a,b) ((a) < (b) ? (a) : (b)) 70 | #define MAX(a,b) ((a) > (b) ? (a) : (b)) 71 | #endif 72 | 73 | #ifndef SWAP 74 | #define SWAP(a,b,t) {t h; h = a; a = b; b = h; } 75 | #endif 76 | 77 | #ifndef MALLOC 78 | #define MALLOC(n,t) ((t*)(malloc(n * sizeof(t)))) 79 | #define REALLOC(p,n,t) ((t*)(realloc((void*)p, n * sizeof(t)))) 80 | #define FREE(p) { if (p) free(p); } 81 | #define MEMCPY(s,d,n,t) {memcpy((void*)d, (void*)s, n * sizeof(t)); } 82 | #endif 83 | 84 | #ifdef __cplusplus 85 | } 86 | #endif 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /kdtree/Globals2d.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012-2019, Triad National Security, LLC. 3 | * All rights Reserved. 4 | * 5 | * Copyright 2012-2019. Triad National Security, LLC. This material was produced 6 | * under U.S. 
Government contract 89233218CNA000001 for Los Alamos National 7 | * Laboratory (LANL), which is operated by Triad National Security, LLC 8 | * for the U.S. Department of Energy. The U.S. Government has rights to use, 9 | * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR 10 | * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 11 | * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified 12 | * to produce derivative works, such modified software should be clearly marked, 13 | * so as not to confuse it with the version available from LANL. 14 | * 15 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 16 | * use this file except in compliance with the License. You may obtain a copy 17 | * of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software distributed 22 | * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 23 | * CONDITIONS OF ANY KIND, either express or implied. 
See the License for the 24 | * specific language governing permissions and limitations under the License.” 25 | * 26 | * This is LANL Copyright Disclosure C13002/LA-CC-12-022 27 | * 28 | */ 29 | 30 | #ifndef _Globals2d_ 31 | #define _Globals2d_ 32 | 33 | 34 | #ifdef __cplusplus 35 | extern "C" 36 | { 37 | #endif 38 | 39 | //#define NDEBUG 1 40 | #include 41 | #include 42 | #include 43 | #include 44 | 45 | #define ENTITY_COINCIDENCE_TOLERANCE ((double)1.0E-5) 46 | 47 | #define KDTREE_ELEMENT_BLOCKING_SIZE ((long)1024) 48 | 49 | #define POSITIVE_INFINITY (+1.0E+64) 50 | #define NEGATIVE_INFINITY (-1.0E+64) 51 | 52 | #define XAXIS ((unsigned long)0) 53 | #define YAXIS ((unsigned long)1) 54 | 55 | typedef struct { 56 | double x, y; 57 | } TVector2d; 58 | 59 | #ifndef _BOOL 60 | #define _BOOL 61 | typedef unsigned char boolean; 62 | #define true ((boolean)1) 63 | #define false ((boolean)0) 64 | #endif 65 | 66 | #define MIN(a,b) ((a) < (b) ? (a) : (b)) 67 | #define MAX(a,b) ((a) > (b) ? (a) : (b)) 68 | 69 | #ifndef SWAP 70 | #define SWAP(a,b,t) {t h; h = a; a = b; b = h; } 71 | #endif 72 | 73 | #define MALLOC(n,t) ((t*)(malloc(n * sizeof(t)))) 74 | #define REALLOC(p,n,t) ((t*)(realloc((void*)p, n * sizeof(t)))) 75 | #define FREE(p) { if (p) free(p); } 76 | #define MEMCPY(s,d,n,t) {memcpy((void*)d, (void*)s, n * sizeof(t)); } 77 | 78 | #ifdef __cplusplus 79 | } 80 | #endif 81 | 82 | #endif 83 | -------------------------------------------------------------------------------- /kdtree/KDTree1d.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012-2019, Triad National Security, LLC. 3 | * All rights Reserved. 4 | * 5 | * Copyright 2012-2019. Triad National Security, LLC. This material was produced 6 | * under U.S. Government contract 89233218CNA000001 for Los Alamos National 7 | * Laboratory (LANL), which is operated by Triad National Security, LLC 8 | * for the U.S. Department of Energy. The U.S. 
Government has rights to use, 9 | * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR 10 | * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 11 | * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified 12 | * to produce derivative works, such modified software should be clearly marked, 13 | * so as not to confuse it with the version available from LANL. 14 | * 15 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 16 | * use this file except in compliance with the License. You may obtain a copy 17 | * of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software distributed 22 | * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 23 | * CONDITIONS OF ANY KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations under the License.” 25 | * 26 | * This is LANL Copyright Disclosure C13002/LA-CC-12-022 27 | * 28 | */ 29 | 30 | #include 31 | #include "KDTree1d.h" 32 | 33 | static void median_sort1d(TKDTree1d* t, 34 | int cut_direction, int k, int num, int* idx) 35 | { 36 | int left, mid, right, a, i, j; 37 | 38 | for (left = 0, right = num - 1; (right - left) > 1;) { 39 | mid = (left + right) / 2; 40 | SWAP(idx[mid], idx[left + 1], int); 41 | if(Bounds_CenterAxis1d(&(t->elements[idx[left + 1]]), cut_direction) > 42 | Bounds_CenterAxis1d(&(t->elements[idx[right]]), cut_direction)) 43 | SWAP(idx[left + 1], idx[right], int); 44 | if(Bounds_CenterAxis1d(&(t->elements[idx[left]]), cut_direction) > 45 | Bounds_CenterAxis1d(&(t->elements[idx[right]]), cut_direction)) 46 | SWAP(idx[left], idx[right], int); 47 | if(Bounds_CenterAxis1d(&(t->elements[idx[left + 1]]), cut_direction) > 48 | Bounds_CenterAxis1d(&(t->elements[idx[left]]), cut_direction)) 49 | SWAP(idx[left + 1], idx[left], int); 50 | a = idx[left]; 
51 | i = left + 1; 52 | j = right; 53 | while (1) { 54 | for (i++; 55 | Bounds_CenterAxis1d(&(t->elements[idx[i]]), cut_direction) < 56 | Bounds_CenterAxis1d(&(t->elements[a]), cut_direction); 57 | i++); 58 | for (j--; 59 | Bounds_CenterAxis1d(&(t->elements[idx[j]]), cut_direction) > 60 | Bounds_CenterAxis1d(&(t->elements[a]), cut_direction); 61 | j--); 62 | if(j < i) 63 | break; 64 | SWAP(idx[i], idx[j], int); 65 | } 66 | idx[left] = idx[j]; 67 | idx[j] = a; 68 | if(j >= k) 69 | right = j - 1; 70 | if(j <= k) 71 | left = i; 72 | } 73 | if(((right - left) ==1) && 74 | (Bounds_CenterAxis1d(&(t->elements[idx[right]]), cut_direction) < 75 | Bounds_CenterAxis1d(&(t->elements[idx[left]]), cut_direction))) 76 | SWAP(idx[right], idx[left], int); 77 | } 78 | 79 | void KDTree_Initialize1d(TKDTree1d* t) 80 | { 81 | assert(t); 82 | /* Flush the overall tree extent */ 83 | Bounds_Infinite1d(&(t->extent)); 84 | /* Allocate the initial memory for tree elements */ 85 | t->elements_num = 0; 86 | t->elements_allocated = KDTREE_ELEMENT_BLOCKING_SIZE; 87 | t->elements = MALLOC(t->elements_allocated, TBounds1d); 88 | assert(t->elements); 89 | /* Start without a built tree */ 90 | t->tree_built = false; 91 | t->tree_size = 0; 92 | t->tree_safety_boxes = NULL; 93 | t->tree_link = NULL; 94 | } 95 | 96 | void KDTree_Destroy1d(TKDTree1d* t) 97 | { 98 | assert(t); 99 | /* Flush the overall tree extent */ 100 | Bounds_Infinite1d(&(t->extent)); 101 | /* Destroy the element list */ 102 | t->elements_num = 0; 103 | t->elements_allocated = 0; 104 | FREE(t->elements); 105 | t->elements = NULL; 106 | /* Destroy the actual tree */ 107 | t->tree_built = false; 108 | t->tree_size = 0; 109 | FREE(t->tree_safety_boxes); 110 | t->tree_safety_boxes = NULL; 111 | FREE(t->tree_link); 112 | t->tree_link = NULL; 113 | } 114 | 115 | 116 | 117 | void KDTree_AddElement1d(TKDTree1d* t, TBounds1d* add) 118 | { 119 | assert(t && add); 120 | /* Destroy the current tree if it is built */ 121 | if(t->tree_built) { 
122 | t->tree_built = false; 123 | t->tree_size = 0; 124 | FREE(t->tree_safety_boxes); 125 | t->tree_safety_boxes = NULL; 126 | FREE(t->tree_link); 127 | t->tree_link = NULL; 128 | } 129 | /* Expand the element array if necessary */ 130 | if(t->elements_num == t->elements_allocated) { 131 | t->elements_allocated += KDTREE_ELEMENT_BLOCKING_SIZE; 132 | t->elements = REALLOC(t->elements, t->elements_allocated, TBounds1d); 133 | assert(t->elements); 134 | } 135 | /* Add the new element to the overall extent and the element list */ 136 | Bounds_AddBounds1d(&(t->extent), add); 137 | Bounds_Copy1d(add, &(t->elements[t->elements_num])); 138 | t->elements_num++; 139 | } 140 | 141 | void KDTree_CreateTree1d(TKDTree1d* t) 142 | { 143 | int i, next_node, stack_ptr, min, mid, max, parent, cut_direction; 144 | double width, max_width; 145 | int* stack; 146 | int* idx; 147 | 148 | assert(t); 149 | /* If the tree is already built, we don't have to do anything */ 150 | if(t->tree_built) 151 | return; 152 | /* If there are no elements in the tree, we don't have to do anything */ 153 | if(t->elements_num > 0) { 154 | /* Allocate the k-D tree memory */ 155 | t->tree_size = 2 * t->elements_num; 156 | t->tree_safety_boxes = MALLOC(t->tree_size, TBounds1d); 157 | t->tree_link = MALLOC(t->tree_size, int); 158 | /* Create and initialize temporary arrays */ 159 | next_node = 0; 160 | stack_ptr = 0; 161 | stack = MALLOC(3 * t->tree_size, int); 162 | idx = MALLOC(t->elements_num, int); 163 | for (i = 0; i < t->elements_num; i++) { 164 | idx[i] = i; 165 | } 166 | /* Setup the root node of the tree and put it on the stack */ 167 | stack[stack_ptr++] = 0; /* Node Number in the Tree */ 168 | stack[stack_ptr++] = 0; /* Element Span Minumum */ 169 | stack[stack_ptr++] = t->elements_num - 1; /* Element Span Maximum */ 170 | Bounds_Copy1d(&(t->extent), &(t->tree_safety_boxes[0])); 171 | next_node++; 172 | /* Construct k-D tree by setting up each pair of child nodes */ 173 | while (stack_ptr) { 174 | 
/* Pop the top entry off the stack */ 175 | max = stack[--stack_ptr]; 176 | min = stack[--stack_ptr]; 177 | parent = stack[--stack_ptr]; 178 | /* If the current node should be a leaf node, make it one */ 179 | if ((max - min) == 0) { 180 | Bounds_Copy1d(&(t->elements[idx[min]]), &(t->tree_safety_boxes[parent])); 181 | t->tree_link[parent] = - idx[min]; 182 | continue; 183 | } 184 | /* Select optimum cutting direction for the parent node's safety box */ 185 | cut_direction = -1; 186 | max_width = NEGATIVE_INFINITY; 187 | for (i = 0; i < 1; i++) { 188 | width = Bounds_WidthAxis1d(&(t->tree_safety_boxes[parent]), i); 189 | if(width > max_width) { 190 | max_width = width; 191 | cut_direction = i; 192 | } 193 | } 194 | assert(cut_direction >= 0); 195 | /* Do a median sort of the elements under the parent node. The sort key 196 | is the center point of the element bounding boxes along the selected 197 | cutting direction. */ 198 | mid = (min + max) /2; 199 | median_sort1d(t, cut_direction, mid - min, max - min + 1, &(idx[min])); 200 | /* Give the parent a reference to its two children */ 201 | t->tree_link[parent] = next_node; 202 | /* Add the "left" child to the tree and the stack */ 203 | stack[stack_ptr++] = next_node; /* Node Number in the Tree */ 204 | stack[stack_ptr++] = min; /* Element Span Minimum */ 205 | stack[stack_ptr++] = mid; /* Element Span Maximum */ 206 | Bounds_Infinite1d(&(t->tree_safety_boxes[next_node])); 207 | for (i = min; i <= mid; i++) { 208 | Bounds_AddBounds1d(&(t->tree_safety_boxes[next_node]), 209 | &(t->elements[idx[i]])); 210 | } 211 | next_node++; 212 | /* Add the "right" child to the tree and the stack */ 213 | stack[stack_ptr++] = next_node; /* Node Number in the Tree */ 214 | stack[stack_ptr++] = mid + 1; /* Element Span Minimum */ 215 | stack[stack_ptr++] = max; /* Element Span Maximum */ 216 | Bounds_Infinite1d(&(t->tree_safety_boxes[next_node])); 217 | for (i = min + 1; i <= max; i++) { 218 | 
Bounds_AddBounds1d(&(t->tree_safety_boxes[next_node]), 219 | &(t->elements[idx[i]])); 220 | } 221 | next_node++; 222 | } 223 | /* Destroy the temporary arrays */ 224 | FREE(stack); 225 | FREE(idx); 226 | } 227 | /* Mark the tree "built" */ 228 | t->tree_built = true; 229 | } 230 | 231 | void KDTree_QueryBoxIntersect1d(TKDTree1d* t, 232 | int* result_num, int* result_indicies, 233 | TBounds1d* box) 234 | { 235 | int stack_ptr, node; 236 | TBounds1d sb; 237 | int* stack; 238 | 239 | assert(t && result_num && result_indicies && box); 240 | /* Build the k-D tree if necessary */ 241 | if(!t->tree_built){ 242 | //printf("BUILDING TREE... \n"); 243 | //fflush(stdout); 244 | KDTree_CreateTree1d(t); 245 | } 246 | /* Allocate the results array */ 247 | *result_num = 0; 248 | /* Create the temporary stack array */ 249 | stack_ptr = 0; 250 | stack = MALLOC(t->tree_size, int); 251 | 252 | /* Put the root node of the tree onto the stack */ 253 | stack[stack_ptr++] = 0; 254 | /* Search the k-D tree until the stack is empty */ 255 | 256 | while (stack_ptr) { 257 | /* Pop the top entry off the stack */ 258 | node = stack[--stack_ptr]; 259 | /* Check if the query box intersects an epsilon-expanded safety box for 260 | the current node. */ 261 | Bounds_Copy1d(&(t->tree_safety_boxes[node]), &sb); 262 | //Bounds_AddEpsilon1d(&sb, ENTITY_COINCIDENCE_TOLERANCE); 263 | /* If the query box doesn't intersect this node's safety box, we are done 264 | visiting the node and should continue with the next node */ 265 | if(!Bounds_IsOverlappingBounds1d(&sb, box)) 266 | continue; 267 | /* If the current node is a leaf node, add it to the collision list. If 268 | the current node is an interior node, add its children to the stack. 
*/ 269 | if(t->tree_link[node] <= 0) { 270 | result_indicies[*result_num] = - t->tree_link[node]; 271 | (*result_num)++; 272 | } 273 | else { 274 | stack[stack_ptr++] = t->tree_link[node]; 275 | stack[stack_ptr++] = t->tree_link[node] + 1; 276 | } 277 | } 278 | /* Destroy the temporary stack array */ 279 | FREE(stack); 280 | } 281 | 282 | -------------------------------------------------------------------------------- /kdtree/KDTree1d.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012-2019, Triad National Security, LLC. 3 | * All rights Reserved. 4 | * 5 | * Copyright 2012-2019. Triad National Security, LLC. This material was produced 6 | * under U.S. Government contract 89233218CNA000001 for Los Alamos National 7 | * Laboratory (LANL), which is operated by Triad National Security, LLC 8 | * for the U.S. Department of Energy. The U.S. Government has rights to use, 9 | * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR 10 | * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 11 | * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified 12 | * to produce derivative works, such modified software should be clearly marked, 13 | * so as not to confuse it with the version available from LANL. 14 | * 15 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 16 | * use this file except in compliance with the License. You may obtain a copy 17 | * of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software distributed 22 | * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 23 | * CONDITIONS OF ANY KIND, either express or implied. 
See the License for the 24 | * specific language governing permissions and limitations under the License.” 25 | * 26 | * This is LANL Copyright Disclosure C13002/LA-CC-12-022 27 | * 28 | */ 29 | 30 | /* 31 | * 32 | * Implements a 1-dimensional k-D tree. One begins to use the k-D tree by 33 | * adding the bounding box of geometric "elements" to the tree structure 34 | * through a call to "KDTree_AddElement1d". Every element should be of the same 35 | * type, but could be a single point, a line segment, triangles, etc. Once 36 | * all the element bounding boxes have been added, the user of the structure 37 | * may make queries against the tree. The actual tree is constructed lazily 38 | * when an actual query occurs on the structure. 39 | * 40 | * This version only has one query -- intersection of a box with the elements 41 | * and a set of "candidate" elements are returned. The candidates are identified 42 | * by an index number (0, ...) signifying the order in which the element was 43 | * added to the tree. It is up to the calling code to do additional processing 44 | * based on the type of element being used to determine "real" intersections. 45 | * 46 | * The process of actually building the tree takes "n log n" time. Queries 47 | * take "log n" time. 
48 | * 49 | */ 50 | 51 | #ifndef _KDTree1d_ 52 | #define _KDTree1d_ 53 | 54 | #ifdef __cplusplus 55 | extern "C" 56 | { 57 | #endif 58 | 59 | #include "Globals1d.h" 60 | #include "Bounds1d.h" 61 | 62 | #define LEFT_HALF 0 63 | #define RIGHT_HALF 1 64 | #define BOTTOM_HALF 0 65 | #define TOP_HALF 1 66 | 67 | typedef struct { 68 | TBounds1d extent; 69 | int elements_num, elements_allocated; 70 | TBounds1d* elements; 71 | boolean tree_built; 72 | int tree_size; 73 | TBounds1d* tree_safety_boxes; 74 | int * tree_link; 75 | } TKDTree1d; 76 | 77 | extern void KDTree_Initialize1d(TKDTree1d *t); 78 | extern void KDTree_Destroy1d(TKDTree1d* t); 79 | extern void KDTree_AddElement1d(TKDTree1d* t, TBounds1d* add); 80 | extern void KDTree_CreateTree1d(TKDTree1d* t); 81 | extern void KDTree_QueryBoxIntersect1d(TKDTree1d* t, 82 | int* result_num, int* result_indicies, 83 | TBounds1d* box); 84 | 85 | #ifdef __cplusplus 86 | } 87 | #endif 88 | 89 | #endif 90 | -------------------------------------------------------------------------------- /kdtree/KDTree2d.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012-2019, Triad National Security, LLC. 3 | * All rights Reserved. 4 | * 5 | * Copyright 2012-2019. Triad National Security, LLC. This material was produced 6 | * under U.S. Government contract 89233218CNA000001 for Los Alamos National 7 | * Laboratory (LANL), which is operated by Triad National Security, LLC 8 | * for the U.S. Department of Energy. The U.S. Government has rights to use, 9 | * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR 10 | * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 11 | * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified 12 | * to produce derivative works, such modified software should be clearly marked, 13 | * so as not to confuse it with the version available from LANL. 
14 | * 15 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 16 | * use this file except in compliance with the License. You may obtain a copy 17 | * of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software distributed 22 | * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 23 | * CONDITIONS OF ANY KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations under the License.” 25 | * 26 | * This is LANL Copyright Disclosure C13002/LA-CC-12-022 27 | * 28 | */ 29 | 30 | #include 31 | #include "KDTree2d.h" 32 | 33 | static void median_sort2d(TKDTree2d* t, 34 | int cut_direction, int k, int num, int* idx) 35 | { 36 | int left, mid, right, a, i, j; 37 | 38 | for (left = 0, right = num - 1; (right - left) > 1;) { 39 | mid = (left + right) / 2; 40 | SWAP(idx[mid], idx[left + 1], int); 41 | if(Bounds_CenterAxis2d(&(t->elements[idx[left + 1]]), cut_direction) > 42 | Bounds_CenterAxis2d(&(t->elements[idx[right]]), cut_direction)) 43 | SWAP(idx[left + 1], idx[right], int); 44 | if(Bounds_CenterAxis2d(&(t->elements[idx[left]]), cut_direction) > 45 | Bounds_CenterAxis2d(&(t->elements[idx[right]]), cut_direction)) 46 | SWAP(idx[left], idx[right], int); 47 | if(Bounds_CenterAxis2d(&(t->elements[idx[left + 1]]), cut_direction) > 48 | Bounds_CenterAxis2d(&(t->elements[idx[left]]), cut_direction)) 49 | SWAP(idx[left + 1], idx[left], int); 50 | a = idx[left]; 51 | i = left + 1; 52 | j = right; 53 | while (1) { 54 | for (i++; 55 | Bounds_CenterAxis2d(&(t->elements[idx[i]]), cut_direction) < 56 | Bounds_CenterAxis2d(&(t->elements[a]), cut_direction); 57 | i++); 58 | for (j--; 59 | Bounds_CenterAxis2d(&(t->elements[idx[j]]), cut_direction) > 60 | Bounds_CenterAxis2d(&(t->elements[a]), cut_direction); 61 | j--); 62 | if(j < i) 63 | break; 64 | SWAP(idx[i], idx[j], int); 65 | 
} 66 | idx[left] = idx[j]; 67 | idx[j] = a; 68 | if(j >= k) 69 | right = j - 1; 70 | if(j <= k) 71 | left = i; 72 | } 73 | if(((right - left) ==1) && 74 | (Bounds_CenterAxis2d(&(t->elements[idx[right]]), cut_direction) < 75 | Bounds_CenterAxis2d(&(t->elements[idx[left]]), cut_direction))) 76 | SWAP(idx[right], idx[left], int); 77 | } 78 | 79 | void KDTree_Initialize2d(TKDTree2d* t) 80 | { 81 | assert(t); 82 | /* Flush the overall tree extent */ 83 | Bounds_Infinite2d(&(t->extent)); 84 | /* Allocate the initial memory for tree elements */ 85 | t->elements_num = 0; 86 | t->elements_allocated = KDTREE_ELEMENT_BLOCKING_SIZE; 87 | t->elements = MALLOC(t->elements_allocated, TBounds2d); 88 | assert(t->elements); 89 | /* Start without a built tree */ 90 | t->tree_built = false; 91 | t->tree_size = 0; 92 | t->tree_safety_boxes = NULL; 93 | t->tree_link = NULL; 94 | } 95 | 96 | void KDTree_Destroy2d(TKDTree2d* t) 97 | { 98 | assert(t); 99 | /* Flush the overall tree extent */ 100 | Bounds_Infinite2d(&(t->extent)); 101 | /* Destroy the element list */ 102 | t->elements_num = 0; 103 | t->elements_allocated = 0; 104 | FREE(t->elements); 105 | t->elements = NULL; 106 | /* Destroy the actual tree */ 107 | t->tree_built = false; 108 | t->tree_size = 0; 109 | FREE(t->tree_safety_boxes); 110 | t->tree_safety_boxes = NULL; 111 | FREE(t->tree_link); 112 | t->tree_link = NULL; 113 | } 114 | 115 | 116 | 117 | void KDTree_AddElement2d(TKDTree2d* t, TBounds2d* add) 118 | { 119 | assert(t && add); 120 | /* Destroy the current tree if it is built */ 121 | if(t->tree_built) { 122 | t->tree_built = false; 123 | t->tree_size = 0; 124 | FREE(t->tree_safety_boxes); 125 | t->tree_safety_boxes = NULL; 126 | FREE(t->tree_link); 127 | t->tree_link = NULL; 128 | } 129 | /* Expand the element array if necessary */ 130 | if(t->elements_num == t->elements_allocated) { 131 | t->elements_allocated += KDTREE_ELEMENT_BLOCKING_SIZE; 132 | t->elements = REALLOC(t->elements, t->elements_allocated, TBounds2d); 
133 | assert(t->elements); 134 | } 135 | /* Add the new element to the overall extent and the element list */ 136 | Bounds_AddBounds2d(&(t->extent), add); 137 | Bounds_Copy2d(add, &(t->elements[t->elements_num])); 138 | t->elements_num++; 139 | } 140 | 141 | void KDTree_CreateTree2d(TKDTree2d* t) 142 | { 143 | int i, next_node, stack_ptr, min, mid, max, parent, cut_direction; 144 | double width, max_width; 145 | int* stack; 146 | int* idx; 147 | 148 | assert(t); 149 | /* If the tree is already built, we don't have to do anything */ 150 | if(t->tree_built) 151 | return; 152 | /* If there are no elements in the tree, we don't have to do anything */ 153 | if(t->elements_num > 0) { 154 | /* Allocate the k-D tree memory */ 155 | t->tree_size = 2 * t->elements_num; 156 | t->tree_safety_boxes = MALLOC(t->tree_size, TBounds2d); 157 | t->tree_link = MALLOC(t->tree_size, int); 158 | /* Create and initialize temporary arrays */ 159 | next_node = 0; 160 | stack_ptr = 0; 161 | stack = MALLOC(3 * t->tree_size, int); 162 | idx = MALLOC(t->elements_num, int); 163 | for (i = 0; i < t->elements_num; i++) { 164 | idx[i] = i; 165 | } 166 | /* Setup the root node of the tree and put it on the stack */ 167 | stack[stack_ptr++] = 0; /* Node Number in the Tree */ 168 | stack[stack_ptr++] = 0; /* Element Span Minumum */ 169 | stack[stack_ptr++] = t->elements_num - 1; /* Element Span Maximum */ 170 | Bounds_Copy2d(&(t->extent), &(t->tree_safety_boxes[0])); 171 | next_node++; 172 | /* Construct k-D tree by setting up each pair of child nodes */ 173 | while (stack_ptr) { 174 | /* Pop the top entry off the stack */ 175 | max = stack[--stack_ptr]; 176 | min = stack[--stack_ptr]; 177 | parent = stack[--stack_ptr]; 178 | /* If the current node should be a leaf node, make it one */ 179 | if ((max - min) == 0) { 180 | Bounds_Copy2d(&(t->elements[idx[min]]), &(t->tree_safety_boxes[parent])); 181 | t->tree_link[parent] = - idx[min]; 182 | continue; 183 | } 184 | /* Select optimum cutting direction 
for the parent node's safety box */ 185 | cut_direction = -1; 186 | max_width = NEGATIVE_INFINITY; 187 | for (i = 0; i < 2; i++) { 188 | width = Bounds_WidthAxis2d(&(t->tree_safety_boxes[parent]), i); 189 | if(width > max_width) { 190 | max_width = width; 191 | cut_direction = i; 192 | } 193 | } 194 | assert(cut_direction >= 0); 195 | /* Do a median sort of the elements under the parent node. The sort key 196 | is the center point of the element bounding boxes along the selected 197 | cutting direction. */ 198 | mid = (min + max) /2; 199 | median_sort2d(t, cut_direction, mid - min, max - min + 1, &(idx[min])); 200 | /* Give the parent a reference to its two children */ 201 | t->tree_link[parent] = next_node; 202 | /* Add the "left" child to the tree and the stack */ 203 | stack[stack_ptr++] = next_node; /* Node Number in the Tree */ 204 | stack[stack_ptr++] = min; /* Element Span Minimum */ 205 | stack[stack_ptr++] = mid; /* Element Span Maximum */ 206 | Bounds_Infinite2d(&(t->tree_safety_boxes[next_node])); 207 | for (i = min; i <= mid; i++) { 208 | Bounds_AddBounds2d(&(t->tree_safety_boxes[next_node]), 209 | &(t->elements[idx[i]])); 210 | } 211 | next_node++; 212 | /* Add the "right" child to the tree and the stack */ 213 | stack[stack_ptr++] = next_node; /* Node Number in the Tree */ 214 | stack[stack_ptr++] = mid + 1; /* Element Span Minimum */ 215 | stack[stack_ptr++] = max; /* Element Span Maximum */ 216 | Bounds_Infinite2d(&(t->tree_safety_boxes[next_node])); 217 | for (i = min + 1; i <= max; i++) { 218 | Bounds_AddBounds2d(&(t->tree_safety_boxes[next_node]), 219 | &(t->elements[idx[i]])); 220 | } 221 | next_node++; 222 | } 223 | /* Destroy the temporary arrays */ 224 | FREE(stack); 225 | FREE(idx); 226 | } 227 | /* Mark the tree "built" */ 228 | t->tree_built = true; 229 | } 230 | 231 | void KDTree_QueryBoxIntersect2d(TKDTree2d* t, 232 | int* result_num, int* result_indicies, 233 | TBounds2d* box) 234 | { 235 | int stack_ptr, node; 236 | TBounds2d sb; 237 | 
int* stack; 238 | 239 | assert(t && result_num && result_indicies && box); 240 | /* Build the k-D tree if necessary */ 241 | if(!t->tree_built){ 242 | //printf("BUILDING TREE... \n"); 243 | //fflush(stdout); 244 | KDTree_CreateTree2d(t); 245 | } 246 | /* Allocate the results array */ 247 | *result_num = 0; 248 | /* Create the temporary stack array */ 249 | stack_ptr = 0; 250 | stack = MALLOC(t->tree_size, int); 251 | 252 | /* Put the root node of the tree onto the stack */ 253 | stack[stack_ptr++] = 0; 254 | /* Search the k-D tree until the stack is empty */ 255 | 256 | while (stack_ptr) { 257 | /* Pop the top entry off the stack */ 258 | node = stack[--stack_ptr]; 259 | /* Check if the query box intersects an epsilon-expanded safety box for 260 | the current node. */ 261 | Bounds_Copy2d(&(t->tree_safety_boxes[node]), &sb); 262 | //Bounds_AddEpsilon(&sb, ENTITY_COINCIDENCE_TOLERANCE); 263 | /* If the query box doesn't intersect this node's safety box, we are done 264 | visiting the node and should continue with the next node */ 265 | if(!Bounds_IsOverlappingBounds2d(&sb, box)) 266 | continue; 267 | /* If the current node is a leaf node, add it to the collision list. If 268 | the current node is an interior node, add its children to the stack. */ 269 | if(t->tree_link[node] <= 0) { 270 | result_indicies[*result_num] = - t->tree_link[node]; 271 | (*result_num)++; 272 | } 273 | else { 274 | stack[stack_ptr++] = t->tree_link[node]; 275 | stack[stack_ptr++] = t->tree_link[node] + 1; 276 | } 277 | } 278 | /* Destroy the temporary stack array */ 279 | FREE(stack); 280 | } 281 | 282 | -------------------------------------------------------------------------------- /kdtree/KDTree2d.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012-2019, Triad National Security, LLC. 3 | * All rights Reserved. 4 | * 5 | * Copyright 2012-2019. Triad National Security, LLC. This material was produced 6 | * under U.S. 
Government contract 89233218CNA000001 for Los Alamos National 7 | * Laboratory (LANL), which is operated by Triad National Security, LLC 8 | * for the U.S. Department of Energy. The U.S. Government has rights to use, 9 | * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR 10 | * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 11 | * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified 12 | * to produce derivative works, such modified software should be clearly marked, 13 | * so as not to confuse it with the version available from LANL. 14 | * 15 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 16 | * use this file except in compliance with the License. You may obtain a copy 17 | * of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software distributed 22 | * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 23 | * CONDITIONS OF ANY KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations under the License.” 25 | * 26 | * This is LANL Copyright Disclosure C13002/LA-CC-12-022 27 | * 28 | */ 29 | 30 | /* 31 | * Implements a 2-dimensional k-D tree. One begins to use the k-D tree by 32 | * adding the bounding box of geometric "elements" to the tree structure 33 | * through a call to "KDTreeAddElement". Every element should be of the same 34 | * type, but could be a single point, a line segment, triangles, etc. Once 35 | * all the element bounding boxes have been added, the user of the structure 36 | * may make queries against the tree. The actual tree is constructed lazily 37 | * when an actual query occurs on the structure. 38 | * 39 | * This version only has one query -- intersection of a box with the elements 40 | * and a set of "candidate" elements are returned. 
The candidates are identified 41 | * by an index number (0, ...) signifying the order in which the element was 42 | * added to the tree. It is up to the calling code to do additional processing 43 | * based on the type of element being used to determine "real" intersections. 44 | * 45 | * The process of actually building the tree takes "n log n" time. Queries 46 | * take "log n" time. 47 | * 48 | */ 49 | 50 | #ifndef _KDTree2d_ 51 | #define _KDTree2d_ 52 | 53 | #ifdef __cplusplus 54 | extern "C" 55 | { 56 | #endif 57 | 58 | 59 | #ifdef HAVE_CONFIG_H 60 | #include "config.h" 61 | #endif 62 | 63 | #include "Globals2d.h" 64 | #include "Bounds2d.h" 65 | 66 | 67 | #define LEFT_HALF 0 68 | #define RIGHT_HALF 1 69 | #define BOTTOM_HALF 0 70 | #define TOP_HALF 1 71 | 72 | typedef struct { 73 | TBounds2d extent; 74 | int elements_num, elements_allocated; 75 | TBounds2d* elements; 76 | boolean tree_built; 77 | int tree_size; 78 | TBounds2d* tree_safety_boxes; 79 | int * tree_link; 80 | } TKDTree2d; 81 | 82 | extern void KDTree_Initialize2d(TKDTree2d *t); 83 | extern void KDTree_Destroy2d(TKDTree2d* t); 84 | extern void KDTree_AddElement2d(TKDTree2d* t, TBounds2d* add); 85 | extern void KDTree_CreateTree2d(TKDTree2d* t); 86 | extern void KDTree_QueryBoxIntersect2d(TKDTree2d* t, 87 | int* result_num, int* result_indicies, 88 | TBounds2d* box); 89 | #ifdef __cplusplus 90 | } 91 | #endif 92 | 93 | #endif 94 | -------------------------------------------------------------------------------- /neigh.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012-2019, Triad National Security, LLC. 3 | * All rights Reserved. 4 | * 5 | * Copyright 2012-2019. Triad National Security, LLC. This material was produced 6 | * under U.S. Government contract 89233218CNA000001 for Los Alamos National 7 | * Laboratory (LANL), which is operated by Triad National Security, LLC 8 | * for the U.S. Department of Energy. The U.S. 
Government has rights to use, 9 | * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR 10 | * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 11 | * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified 12 | * to produce derivative works, such modified software should be clearly marked, 13 | * so as not to confuse it with the version available from LANL. 14 | * 15 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 16 | * use this file except in compliance with the License. You may obtain a copy 17 | * of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software distributed 22 | * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 23 | * CONDITIONS OF ANY KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations under the License.” 25 | * 26 | * This is LANL Copyright Disclosure C13002/LA-CC-12-022 27 | * 28 | */ 29 | 30 | /* 31 | * Authors: Bob Robey XCP-2 brobey@lanl.gov 32 | * David Nicholaeff dnic@lanl.gov, mtrxknight@aol.com 33 | * Rachel Robey rnrobey@gmail.com 34 | */ 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include "kdtree/KDTree1d.h" 44 | #include "gpu.h" 45 | #include "timer.h" 46 | 47 | #ifdef HAVE_CONFIG_H 48 | #include "config.h" 49 | #endif 50 | 51 | #ifdef __APPLE_CC__ 52 | #include 53 | #else 54 | #include 55 | #endif 56 | 57 | #ifdef HAVE_CL_DOUBLE 58 | typedef double real; 59 | typedef cl_double cl_real; 60 | #define ONE 1.0 61 | #define TWO 2.0 62 | #else 63 | typedef float real; 64 | typedef cl_float cl_real; 65 | #define ONE 1.0f 66 | #define TWO 2.0f 67 | #endif 68 | 69 | #define SQR(x) (( (x)*(x) )) 70 | 71 | typedef unsigned int uint; 72 | 73 | #define CHECK 1 74 | #define TILE_SIZE 256 75 | #define 
DETAILED_TIMING 0 76 | #define LONG_RUNS 1 77 | 78 | #ifndef MIN 79 | #define MIN(a,b) ((a)>(b)?(b):(a)) 80 | #define MAX(a,b) ((a)<(b)?(b):(a)) 81 | #endif 82 | 83 | struct neighbor { 84 | uint left; 85 | uint right; 86 | }; 87 | 88 | struct timespec tstart; 89 | double time_sum; 90 | 91 | int is_nvidia = 0; 92 | #define BRUTE_FORCE_SIZE_LIMIT 500000 93 | 94 | cl_context context; 95 | cl_command_queue queue; 96 | cl_program program; 97 | cl_kernel init_kernel, hash_kernel, get_neighbor_kernel; 98 | 99 | void neighbors( uint length, double min_diff, double max_diff, double min_val ); 100 | struct neighbor *neighbors_bruteforce( uint length, double *xcoor, double min_val, double max_val); 101 | struct neighbor *neighbors_kdtree( uint length, double *xcoor, double *xmin, double *xmax, 102 | double min_diff, double max_val, double min_val ); 103 | struct neighbor *neighbors_hashcpu( uint length, double *xcoor, double min_diff, double max_val, double min_val ); 104 | cl_mem neighbors_hashgpu( uint length, cl_mem data_buffer, double min_diff, double max_val, double min_val, double *time ); 105 | double generate_array_wminmax( uint size, double *ptr, double *xmin, double *xmax, 106 | double mindx, double maxdx, double min, double *max ); 107 | 108 | int main (int argc, const char * argv[]) { 109 | 110 | cl_int error; 111 | 112 | #ifdef HAVE_OPENCL 113 | GPUInit(&context, &queue, &is_nvidia, &program, "neigh_kern.cl"); 114 | 115 | init_kernel = clCreateKernel(program, "init_kern", &error); 116 | hash_kernel = clCreateKernel(program, "hash_kern", &error); 117 | get_neighbor_kernel = clCreateKernel(program, "get_neighbor_kern", &error); 118 | #endif 119 | 120 | printf("\n Neighbors Performance Results\n\n"); 121 | if (LONG_RUNS == 1) 122 | printf("Size, \tBrute, \tkDtree \tHash CPU, \tHash GPU\n"); 123 | else 124 | printf("Size, \tkDtree \tHash CPU, \tHash GPU\n"); 125 | 126 | for (uint max_mult = 1; max_mult <= 32; max_mult *= 2){ 127 | printf("\nMax diff is %d times 
min_diff\n",max_mult); 128 | for( uint i = 64; i <= 5000000; i*=2 ) { 129 | printf("%d, ", i); 130 | neighbors(i, 2.0, (double)max_mult*2.0, 0.0); 131 | printf("\n"); 132 | } 133 | } 134 | } 135 | 136 | /* find right and left neighbors of element at index index in array of size length */ 137 | void neighbors( uint length, double min_diff, double max_diff, double min_val ) 138 | { 139 | double *xcoor, *xmin, *xmax; 140 | double max_val = min_val; //reset in generate array call 141 | struct neighbor *neigh_gold, *neigh_test; 142 | 143 | xcoor = (double*)malloc(length*sizeof(double)); 144 | xmin = (double*)malloc(length*sizeof(double)); 145 | xmax = (double*)malloc(length*sizeof(double)); 146 | 147 | generate_array_wminmax(length, xcoor, xmin, xmax, min_diff, max_diff, min_val, &max_val); 148 | //for (uint i=0; i= xleft ) {xleft = xcoor[index2]; left = index2; } 263 | 264 | if (xcoor[index2] > xcoor[index1] && xcoor[index2] <= xright ) {xright = xcoor[index2]; right = index2;} 265 | } 266 | neigh[index1].left = left; 267 | neigh[index1].right = right; 268 | } 269 | 270 | return(neigh); 271 | } 272 | 273 | struct neighbor *neighbors_kdtree( uint length, double *xcoor, double *xmin, double *xmax, 274 | double min_diff, double max_val, double min_val ) 275 | { 276 | TKDTree1d tree; 277 | 278 | KDTree_Initialize1d(&tree); 279 | 280 | TBounds1d box; 281 | for(uint i = 0; i < length; i++) { 282 | box.min.x = xmin[i]; 283 | box.max.x = xmax[i]; 284 | KDTree_AddElement1d(&tree, &box); 285 | } 286 | 287 | struct neighbor *neigh = (struct neighbor *)malloc(length*sizeof(struct neighbor)); 288 | 289 | int index_list[10]; 290 | int num; 291 | for (uint index = 0; index < length; index++) { 292 | neigh[index].left = index; 293 | neigh[index].right = index; 294 | box.min.x = xmin[index]-min_diff*0.25; 295 | box.max.x = xmin[index]-min_diff*0.20; 296 | KDTree_QueryBoxIntersect1d(&tree, &num, &(index_list[0]), &box); 297 | if (num == 1) neigh[index].left = index_list[0]; 298 | 299 | 
box.min.x = xmax[index]+min_diff*0.20; 300 | box.max.x = xmax[index]+min_diff*0.25; 301 | KDTree_QueryBoxIntersect1d(&tree, &num, &(index_list[0]), &box); 302 | if (num == 1) neigh[index].right = index_list[0]; 303 | } 304 | 305 | KDTree_Destroy1d(&tree); 306 | 307 | return(neigh); 308 | } 309 | 310 | /* find right and left neighbors of element at index index in array of size length */ 311 | struct neighbor *neighbors_hashcpu( uint length, double *xcoor, double min_diff, double max_val, double min_val ) 312 | { 313 | uint hash_size = (uint)((max_val - min_val)/min_diff + 2.5); //create hash table with buckets of size min_diff -- +2.5 rounds up and adds one space to either side 314 | int *hash = (int*)malloc(hash_size*sizeof(int)); 315 | 316 | /* Sort elements into hash array hash */ 317 | memset(hash, -1, hash_size*sizeof(int)); //set all elements of hash array to -1 318 | 319 | for(uint i = 0; i < length; i++) { hash[(int)((xcoor[i]+min_val)/min_diff)] = i; } 320 | //place index of current xcoor element into hash according to where the xcoor value 321 | 322 | struct neighbor *neigh = (struct neighbor *)malloc(length*sizeof(struct neighbor)); 323 | 324 | for (uint index = 0; index < length; index++) { 325 | /* move left and right through hash array from desired element to find its neighbors */ 326 | int idx_new = (int)((xcoor[index]-min_val)/min_diff); //where the index element is in the hash array 327 | int left = index, right = index; 328 | 329 | for(int i = idx_new+1; i < hash_size; i++) { //store index of neigbor in original unsorted array, if greatest/least, than left as -1 330 | if(hash[i] != -1) { 331 | right = hash[i]; 332 | break; 333 | } 334 | } 335 | for(int i = idx_new-1; i >= 0; i--) { 336 | if(hash[i] != -1) { 337 | left = hash[i]; 338 | break; 339 | } 340 | } 341 | neigh[index].left = left; 342 | neigh[index].right = right; 343 | } 344 | 345 | free(hash); 346 | 347 | return(neigh); 348 | } 349 | 350 | #ifdef HAVE_OPENCL 351 | /* find right and left 
neighbors of element at index index in array of size length */ 352 | cl_mem neighbors_hashgpu( uint length, cl_mem data_buffer, double min_diff, double max_val, double min_val, double *time ) 353 | { 354 | cl_mem hash_buffer, neighbor_buffer; 355 | 356 | cl_int error = 0; 357 | long gpu_time = 0; 358 | 359 | uint hash_size = (uint)((max_val - min_val)/min_diff + 2.5); //create hash table with buckets of size min_diff -- +2.5 rounds up and adds one space to either side 360 | 361 | real min_val_real = (real)min_val; 362 | real min_diff_real = (real)min_diff; 363 | 364 | hash_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, hash_size*sizeof(int), NULL, &error); 365 | if (error != CL_SUCCESS) { 366 | //printf("Error is %d at line %d\n",error,__LINE__); 367 | return(NULL); 368 | } 369 | 370 | /****************** 371 | * Init to -1 372 | *******************/ 373 | 374 | error = clSetKernelArg(init_kernel, 0, sizeof(cl_uint), &hash_size); 375 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 376 | error = clSetKernelArg(init_kernel, 1, sizeof(cl_mem), (void*)&hash_buffer); 377 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 378 | 379 | size_t global_work_size[1]; 380 | size_t local_work_size[1]; 381 | 382 | local_work_size[0] = TILE_SIZE; 383 | global_work_size[0] = ((hash_size+local_work_size[0]-1)/local_work_size[0])*local_work_size[0]; 384 | 385 | cl_event hash_init_event; 386 | 387 | error = clEnqueueNDRangeKernel(queue, init_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &hash_init_event); 388 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 389 | 390 | /****************** 391 | * Hash Kernel 392 | ******************/ 393 | 394 | error = clSetKernelArg(hash_kernel, 0, sizeof(real), &min_val_real); 395 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 396 | error = clSetKernelArg(hash_kernel, 1, sizeof(real), &min_diff_real); 397 | if (error 
!= CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 398 | error = clSetKernelArg(hash_kernel, 2, sizeof(cl_uint), &length); 399 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 400 | error = clSetKernelArg(hash_kernel, 3, sizeof(cl_mem), (void*)&data_buffer); 401 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 402 | error = clSetKernelArg(hash_kernel, 4, sizeof(cl_mem), (void*)&hash_buffer); 403 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 404 | 405 | global_work_size[0] = ((length+local_work_size[0]-1)/local_work_size[0])*local_work_size[0]; 406 | 407 | cl_event hash_kernel_event; 408 | 409 | error = clEnqueueNDRangeKernel(queue, hash_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &hash_kernel_event); 410 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 411 | 412 | /****************** 413 | * Get Neighbor Kernel 414 | ******************/ 415 | 416 | neighbor_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, length*sizeof(cl_uint2), NULL, &error); 417 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 418 | 419 | error = clSetKernelArg(get_neighbor_kernel, 0, sizeof(real), &min_val_real); 420 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 421 | error = clSetKernelArg(get_neighbor_kernel, 1, sizeof(real), &min_diff_real); 422 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 423 | error = clSetKernelArg(get_neighbor_kernel, 2, sizeof(cl_uint), &length); 424 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 425 | error = clSetKernelArg(get_neighbor_kernel, 3, sizeof(cl_mem), (void*)&data_buffer); 426 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 427 | error = clSetKernelArg(get_neighbor_kernel, 4, sizeof(cl_mem), (void*)&hash_buffer); 428 | if (error != CL_SUCCESS) printf("Error 
is %d at line %d\n",error,__LINE__); 429 | error = clSetKernelArg(get_neighbor_kernel, 5, sizeof(cl_uint), &hash_size); 430 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 431 | error = clSetKernelArg(get_neighbor_kernel, 6, sizeof(cl_mem), &neighbor_buffer); 432 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 433 | 434 | cl_event get_neighbor_event; 435 | 436 | error = clEnqueueNDRangeKernel(queue, get_neighbor_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &get_neighbor_event); 437 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 438 | 439 | long gpu_time_start, gpu_time_end; 440 | 441 | clWaitForEvents(1,&get_neighbor_event); 442 | 443 | clGetEventProfilingInfo(hash_init_event, CL_PROFILING_COMMAND_START, sizeof(gpu_time_start), &gpu_time_start, NULL); 444 | clGetEventProfilingInfo(hash_init_event, CL_PROFILING_COMMAND_END, sizeof(gpu_time_end), &gpu_time_end, NULL); 445 | gpu_time += gpu_time_end - gpu_time_start; 446 | clReleaseEvent(hash_init_event); 447 | 448 | if (DETAILED_TIMING) printf("\tinit %.6lf,", (double)(gpu_time_end - gpu_time_start)*1.0e-9); 449 | 450 | clGetEventProfilingInfo(hash_kernel_event, CL_PROFILING_COMMAND_START, sizeof(gpu_time_start), &gpu_time_start, NULL); 451 | clGetEventProfilingInfo(hash_kernel_event, CL_PROFILING_COMMAND_END, sizeof(gpu_time_end), &gpu_time_end, NULL); 452 | gpu_time += gpu_time_end - gpu_time_start; 453 | clReleaseEvent(hash_kernel_event); 454 | 455 | if (DETAILED_TIMING) printf("hash %.6lf,", (double)(gpu_time_end - gpu_time_start)*1.0e-9); 456 | 457 | clGetEventProfilingInfo(get_neighbor_event, CL_PROFILING_COMMAND_START, sizeof(gpu_time_start), &gpu_time_start, NULL); 458 | clGetEventProfilingInfo(get_neighbor_event, CL_PROFILING_COMMAND_END, sizeof(gpu_time_end), &gpu_time_end, NULL); 459 | gpu_time += gpu_time_end - gpu_time_start; 460 | clReleaseEvent(get_neighbor_event); 461 | 462 | if (DETAILED_TIMING) 
printf("hash %.6lf,", (double)(gpu_time_end - gpu_time_start)*1.0e-9); 463 | 464 | *time = (double)gpu_time*1.0e-9; 465 | 466 | clReleaseMemObject(hash_buffer); 467 | 468 | return(neighbor_buffer); 469 | 470 | } 471 | #endif 472 | 473 | double generate_array_wminmax( uint size, double *ptr, double *xmin, double *xmax, 474 | double mindx, double maxdx, double min, double *max ) { 475 | 476 | double swap; 477 | int index, front = 0; 478 | double running_min = maxdx; 479 | 480 | struct timespec tim; //random seeding 481 | cpu_timer_start(&tim); 482 | //srand(tim.tv_sec*tim.tv_nsec); 483 | 484 | srand(0); 485 | 486 | ptr[0] = min; //start the array using the minimum value 487 | 488 | /* for each element, add a random value between mindx and maxdx to the previous element's value */ 489 | for(int i = 1; i < size; i++) { 490 | ptr[i] = ptr[i-1] + mindx + ((double)rand() * (maxdx - mindx) / (double)RAND_MAX); 491 | if(ptr[i]-ptr[i-1] < running_min) running_min = ptr[i]-ptr[i-1]; 492 | } 493 | 494 | 495 | *max = ptr[size-1]; //set the max value to the last element's value 496 | //*max = min + (size-1) * maxdx; //force the range for timings isolating a different variable 497 | 498 | xmin[0] = min; 499 | for (int i=1; i= size) return; 59 | 60 | temp[idx] = -1; 61 | } 62 | 63 | __kernel void hash_kern( 64 | const real min_val, 65 | const real min_diff, 66 | const uint length, 67 | __global const real *arr, 68 | __global int *temp) { 69 | 70 | const uint idx = get_global_id(0); 71 | 72 | if(idx >= length) return; 73 | 74 | temp[(uint)((arr[idx]-min_val)/min_diff)] = idx; 75 | } 76 | 77 | #define hashval(j,i) hash[(j)*imaxsize+(i)] 78 | 79 | __kernel void hash_setup_kern( 80 | const uint isize, 81 | const uint mesh_size, 82 | const uint levmx, 83 | __global const int *levtable, 84 | __global const int *i, 85 | __global const int *j, 86 | __global const int *level, 87 | __global int *hash 88 | ) { 89 | 90 | const uint giX = get_global_id(0); 91 | 92 | if (giX >= isize) return; 93 
| 94 | int imaxsize = mesh_size*levtable[levmx]; 95 | 96 | int lev = level[giX]; 97 | int ii = i[giX]; 98 | int jj = j[giX]; 99 | 100 | int levdiff = levmx - lev; 101 | 102 | int iimin = ii *levtable[levdiff]; 103 | int iimax = (ii+1)*levtable[levdiff]; 104 | int jjmin = jj *levtable[levdiff]; 105 | int jjmax = (jj+1)*levtable[levdiff]; 106 | 107 | for ( int jjj = jjmin; jjj < jjmax; jjj++) { 108 | for (int iii = iimin; iii < iimax; iii++) { 109 | hashval(jjj, iii) = giX; 110 | } 111 | } 112 | 113 | } 114 | 115 | __kernel void calc_neighbor2d_kern( 116 | const int isize, 117 | const uint mesh_size, 118 | const int levmx, 119 | __global const int *levtable, 120 | __global const int *i, 121 | __global const int *j, 122 | __global const int *level, 123 | __global const int *hash, 124 | __global struct neighbor2d *neigh2d 125 | ) { 126 | 127 | const uint giX = get_global_id(0); 128 | 129 | if (giX >= isize) return; 130 | 131 | int imaxsize = mesh_size*levtable[levmx]; 132 | int jmaxsize = mesh_size*levtable[levmx]; 133 | 134 | int ii = i[giX]; 135 | int jj = j[giX]; 136 | int lev = level[giX]; 137 | int levmult = levtable[levmx-lev]; 138 | 139 | int nlftval = hashval( jj *levmult , max( ii *levmult-1, 0 )); 140 | int nrhtval = hashval( jj *levmult , min( (ii+1)*levmult, imaxsize-1)); 141 | int nbotval = hashval(max( jj *levmult-1, 0) , ii *levmult ); 142 | int ntopval = hashval(min( (jj+1)*levmult, jmaxsize-1), ii *levmult ); 143 | 144 | neigh2d[giX].left = nlftval; 145 | neigh2d[giX].right = nrhtval; 146 | neigh2d[giX].bottom = nbotval; 147 | neigh2d[giX].top = ntopval; 148 | } 149 | -------------------------------------------------------------------------------- /neigh_kern.cl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012-2019, Triad National Security, LLC. 3 | * All rights Reserved. 4 | * 5 | * Copyright 2012-2019. Triad National Security, LLC. This material was produced 6 | * under U.S. 
Government contract 89233218CNA000001 for Los Alamos National 7 | * Laboratory (LANL), which is operated by Triad National Security, LLC 8 | * for the U.S. Department of Energy. The U.S. Government has rights to use, 9 | * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR 10 | * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 11 | * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified 12 | * to produce derivative works, such modified software should be clearly marked, 13 | * so as not to confuse it with the version available from LANL. 14 | * 15 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 16 | * use this file except in compliance with the License. You may obtain a copy 17 | * of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software distributed 22 | * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 23 | * CONDITIONS OF ANY KIND, either express or implied. 
See the License for the 24 | * specific language governing permissions and limitations under the License.” 25 | * 26 | * This is LANL Copyright Disclosure C13002/LA-CC-12-022 27 | * 28 | */ 29 | 30 | /* 31 | * Authors: Bob Robey XCP-2 brobey@lanl.gov 32 | * David Nicholaeff dnic@lanl.gov, mtrxknight@aol.com 33 | * Rachel Robey rnrobey@gmail.com 34 | */ 35 | 36 | /* neigh_kern.cl */ 37 |

/* "real" is double when HAVE_CL_DOUBLE is defined, float otherwise */
#ifdef HAVE_CL_DOUBLE
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
typedef double real;
#else
typedef float real;
#endif

/* Left and right neighbor index of one element */
struct neighbor {
   uint left;
   uint right;
};

/*
 * Marks the hash table empty: every work-item with an id below "size"
 * writes -1 into its own slot of temp.
 */
__kernel void init_kern(
      const uint size,
      __global int *temp) {

   const uint gid = get_global_id(0);

   if (gid < size) {
      temp[gid] = -1;
   }
}

/*
 * Stores each element index in the hash table.  The work-item with id "gid"
 * (below "length") writes gid into bucket (uint)((arr[gid]-min_val)/min_diff).
 */
__kernel void hash_kern(
      const real min_val,
      const real min_diff,
      const uint length,
      __global const real *arr,
      __global int *temp) {

   const uint gid = get_global_id(0);

   if (gid < length) {
      const uint bucket = (uint)((arr[gid]-min_val)/min_diff);
      temp[bucket] = gid;
   }
}

/*
 * Looks up the neighbors of each element in the hash table "temp" of
 * "temp_size" buckets.  Starting next to the element's own bucket, the table
 * is walked upward for the right neighbor and downward for the left one; the
 * first bucket that is not -1 supplies the index.  If no such bucket exists
 * on a side, the element's own index is stored for that side.
 */
__kernel void get_neighbor_kern(
      const real min_val,
      const real min_diff,
      const uint length,
      __global const real *arr,
      __global const int *temp,
      const uint temp_size,
      __global struct neighbor *neighbor_buffer) {

   const uint gid = get_global_id(0);

   if (gid < length) {
      /* Bucket that holds this element */
      const int home = (int)((arr[gid]-min_val)/min_diff);

      int lft = gid;
      int rgt = gid;
      int probe;

      /* Skip empty buckets above the home bucket */
      probe = home + 1;
      while (probe < temp_size && temp[probe] == -1) {
         probe++;
      }
      if (probe < temp_size) {
         rgt = temp[probe];
      }

      /* Skip empty buckets below the home bucket */
      probe = home - 1;
      while (probe >= 0 && temp[probe] == -1) {
         probe--;
      }
      if (probe >= 0) {
         lft = temp[probe];
      }

      neighbor_buffer[gid].left = lft;
      neighbor_buffer[gid].right = rgt;
   }
}
-------------------------------------------------------------------------------- /remap2d_kern.cl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012-2019, Triad National Security, LLC. 3 | * All rights Reserved. 4 | * 5 | * Copyright 2012-2019. Triad National Security, LLC. This material was produced 6 | * under U.S. Government contract 89233218CNA000001 for Los Alamos National 7 | * Laboratory (LANL), which is operated by Triad National Security, LLC 8 | * for the U.S. Department of Energy. The U.S. Government has rights to use, 9 | * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR 10 | * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 11 | * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified 12 | * to produce derivative works, such modified software should be clearly marked, 13 | * so as not to confuse it with the version available from LANL. 14 | * 15 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 16 | * use this file except in compliance with the License. You may obtain a copy 17 | * of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software distributed 22 | * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 23 | * CONDITIONS OF ANY KIND, either express or implied. 
See the License for the
 * specific language governing permissions and limitations under the License.”
 *
 * This is LANL Copyright Disclosure C13002/LA-CC-12-022
 *
 */

/*
 * Authors: Bob Robey XCP-2 brobey@lanl.gov
 * David Nicholaeff dnic@lanl.gov, mtrxknight@aol.com
 * Rachel Robey rnrobey@gmail.com
 */

/* remap_kern2d.cl */

#ifdef HAVE_CL_DOUBLE
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
typedef double real;
#else
typedef float real;
#endif

// Cartesian Coordinate Indexing
#define two_to_the(ishift) (1u <<(ishift) )
#define four_to_the(ishift) (1u << ( (ishift)*2 ) )

/* Remap Kernels */

/*
 * remap_hash_creation_kern
 *
 * One work-item per cell of the input mesh.  Work-item ic writes its own
 * index into every entry of hash_table that the cell covers on the finest
 * level grid; the grid is stored row by row with a row width of
 * mesh_size * 2^levmx entries.
 *
 *   hash_table  output table, indexed as [row * row_width + column]
 *   i, j, level per-cell column, row and refinement level
 *   ncells_a    number of cells; work-items at or beyond it do nothing
 *   mesh_size   multiplied by 2^levmx to obtain the row width
 *   levmx       finest refinement level
 */
__kernel void remap_hash_creation_kern(
        __global int* hash_table,
        __global const int* i,
        __global const int* j,
        __global const int* level,
        const int ncells_a,
        const int mesh_size,
        const int levmx) {

    const int ic = get_global_id(0);

    uint i_max = mesh_size*two_to_the(levmx);

    if (ic >= ncells_a) return;

    const int col = i[ic];
    const int row = j[ic];

    // Edge length of the cell measured in finest-level cells.  A cell that
    // is already at levmx has a span of one, so the loops below then touch
    // exactly the single entry (row, col).
    const int span = two_to_the(levmx - level[ic]);

    const int row_end = (row+1)*span;
    const int col_end = (col+1)*span;

    // Stamp the square block of finest-level entries with this cell's index
    for (int fine_row = row*span; fine_row < row_end; fine_row++) {
        for (int fine_col = col*span; fine_col < col_end; fine_col++) {
            hash_table[(fine_row*i_max)+fine_col] = ic;
        }
    }
}


/*
 * remap_hash_retrieval_kern
 *
 * One work-item per cell of mesh b.  Work-item jc walks the finest-level
 * entries covered by its cell, looks up the mesh-a cell index stored in each
 * entry, and sums V_a of that cell divided by 4^(levmx - level of that cell).
 * The sum is added to V_remap[jc] (added, not assigned).
 *
 *   V_remap       per-cell output for mesh b; accumulated into
 *   V_a           per-cell values of mesh a
 *   hash_table    table of mesh-a cell indices, same layout as in
 *                 remap_hash_creation_kern
 *   mesh_a_level  refinement level of each mesh-a cell
 *   mesh_b_i/j/level  column, row and level of each mesh-b cell
 *   ncells_b      number of mesh-b cells; extra work-items do nothing
 *   mesh_a_i, mesh_a_j are not read by this kernel.
 */
__kernel void remap_hash_retrieval_kern(
        __global real* V_remap,
        __global const real* V_a,
        __global const int* hash_table,
        __global const int* mesh_a_i,
        __global const int* mesh_a_j,
        __global const int* mesh_a_level,
        __global const int* mesh_b_i,
        __global const int* mesh_b_j,
        __global const int* mesh_b_level,
        const int ncells_b,
        const int mesh_size,
        const int levmx) {

    const int jc = get_global_id(0);

    uint i_max = mesh_size*two_to_the(levmx);

    if (jc >= ncells_b) return;

    const int col = mesh_b_i[jc];
    const int row = mesh_b_j[jc];
    const int span = two_to_the(levmx - mesh_b_level[jc]);

    const int row_end = (row+1)*span;
    const int col_end = (col+1)*span;

    // Rows outer, columns inner: the summation order is kept fixed so the
    // floating-point result does not change.
    real val_sum = 0.0;
    for (int fine_row = row*span; fine_row < row_end; fine_row++) {
        for (int fine_col = col*span; fine_col < col_end; fine_col++) {
            const int ic = hash_table[fine_row*i_max+fine_col];
            val_sum += V_a[ic] / (real)four_to_the(levmx-mesh_a_level[ic]);
        }
    }

    V_remap[jc] += val_sum;
}

-------------------------------------------------------------------------------- /remap_kern.cl: --------------------------------------------------------------------------------
/*
 * Copyright (c) 2012-2019, Triad National Security, LLC.
 * All rights Reserved.
 *
 * Copyright 2012-2019. Triad National Security, LLC. This material was produced
 * under U.S. Government contract 89233218CNA000001 for Los Alamos National
 * Laboratory (LANL), which is operated by Triad National Security, LLC
 * for the U.S. Department of Energy. The U.S. Government has rights to use,
 * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR
 * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
 * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
 * to produce derivative works, such modified software should be clearly marked,
 * so as not to confuse it with the version available from LANL.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License.
You may obtain a copy
 * of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.”
 *
 * This is LANL Copyright Disclosure C13002/LA-CC-12-022
 *
 */

/*
 * Authors: Bob Robey XCP-2 brobey@lanl.gov
 * David Nicholaeff dnic@lanl.gov, mtrxknight@aol.com
 * Rachel Robey rnrobey@gmail.com
 */

/* remap_kern.cl */

#ifdef HAVE_CL_DOUBLE
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
typedef double real;
#else
typedef float real;
#endif

#ifndef MIN
#define MIN(a,b) ((a)>(b)?(b):(a))
#endif

/* A 1D cell described by its lower and upper bound. */
struct rcell {
    real low;
    real high;
};

/*
 * hash_kern
 *
 * One work-item per element of arr.  Element idx is hashed to bucket
 * (arr[idx] - min_val) / min_diff (truncated) and its index is stored there.
 * Work-items with idx >= length do nothing.
 */
__kernel void hash_kern(
        const real min_val,
        const real min_diff,
        const uint length,
        __global const real *arr,
        __global int *temp) {

    const uint idx = get_global_id(0);

    if(idx >= length) return;

    temp[(uint)((arr[idx]-min_val)/min_diff)] = idx;
}


/* Remap Kernels */

/*
 * cellHash_kern
 *
 * One work-item per cell of arr.  Cell idx writes its index into every
 * bucket in [start, end), where the bucket of a coordinate x is
 * (x - min_val) / min_diff (truncated).
 *
 * The bucket formula subtracts min_val so that it matches hash_kern above
 * and the lookup in remap1_kern below; this kernel previously added
 * min_val, which only agrees with the lookup when min_val is zero.
 *
 *   min_val   coordinate mapped to bucket 0
 *   min_diff  bucket width
 *   length    number of cells; extra work-items do nothing
 *   arr       cells to insert
 *   temp      hash table receiving cell indices
 */
__kernel void cellHash_kern(
        const real min_val,
        const real min_diff,
        const uint length,
        __global const struct rcell *arr,
        __global int *temp) {

    const uint idx = get_global_id(0);

    if (idx >= length) return;

    uint start = (int)((arr[idx].low  - min_val)/min_diff);
    uint end   = (int)((arr[idx].high - min_val)/min_diff);

    for (uint bucket = start; bucket < end; bucket++) {
        temp[bucket] = idx;
    }
}

/*
 * remap1_kern
 *
 * One work-item per cell of arr_b.  Cell idx covers the buckets
 * [start, end) computed with the same (x - min_val) / mindx formula; for
 * every covered bucket that holds a non-negative index into arr_a, the
 * value arr_v[index] divided by the width of that arr_a cell is added up
 * and the total is stored in remap[idx].
 *
 *   min_val    coordinate mapped to bucket 0
 *   mindx      bucket width
 *   hash_size  number of buckets in hash; the range is clipped to it
 *   bsize      number of cells in arr_b; extra work-items do nothing
 *   arr_a      cells the hash entries refer to
 *   arr_v      per-cell values for arr_a
 *   arr_b      cells to remap onto
 *   hash       bucket -> arr_a index, negative for an empty bucket
 *   remap      per-cell result for arr_b
 */
__kernel void remap1_kern(
        const real min_val,
        const real mindx,
        const uint hash_size,
        const uint bsize,
        __global struct rcell *arr_a,
        __global real *arr_v,
        __global struct rcell *arr_b,
        __global int *hash,
        __global real *remap) {

    const uint idx = get_global_id(0);

    if (idx >= bsize) return;

    uint start = (arr_b[idx].low - min_val)/mindx;
    uint end = (arr_b[idx].high - min_val)/mindx;

    // Cell lies entirely beyond the table: nothing to collect
    if (start >= hash_size) { remap[idx] = 0.0; return; }
    if (end > hash_size) end = hash_size;

    // Accumulate privately and write the result once instead of doing a
    // global read-modify-write per bucket; the summation order is unchanged.
    real total = 0.;
    for (uint i = start; i < end; i++) {
        const int src = hash[i];
        if (src >= 0) {
            total += arr_v[src] * 1./(arr_a[src].high - arr_a[src].low); //assume state variable value of 1 in each original cell
        }
    }
    remap[idx] = total;
}

-------------------------------------------------------------------------------- /sort.c: --------------------------------------------------------------------------------
/*
 * Copyright (c) 2012-2019, Triad National Security, LLC.
 * All rights Reserved.
 *
 * Copyright 2012-2019. Triad National Security, LLC. This material was produced
 * under U.S. Government contract 89233218CNA000001 for Los Alamos National
 * Laboratory (LANL), which is operated by Triad National Security, LLC
 * for the U.S. Department of Energy. The U.S. Government has rights to use,
 * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR
 * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
 * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
 * to produce derivative works, such modified software should be clearly marked,
 * so as not to confuse it with the version available from LANL.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License.
You may obtain a copy 17 | * of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software distributed 22 | * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 23 | * CONDITIONS OF ANY KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations under the License.” 25 | * 26 | * This is LANL Copyright Disclosure C13002/LA-CC-12-022 27 | * 28 | */ 29 | 30 | /* 31 | * Authors: Bob Robey XCP-2 brobey@lanl.gov 32 | * David Nicholaeff dnic@lanl.gov, mtrxknight@aol.com 33 | * Rachel Robey rnrobey@gmail.com 34 | */ 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include "gpu.h" 44 | #include "timer.h" 45 | 46 | #ifdef HAVE_CONFIG_H 47 | #include "config.h" 48 | #endif 49 | 50 | #ifdef __APPLE_CC__ 51 | #include 52 | #else 53 | #include 54 | #endif 55 | 56 | #ifdef HAVE_CL_DOUBLE 57 | typedef double real; 58 | typedef cl_double cl_real; 59 | #define EPS 1.0e-12 60 | #else 61 | typedef float real; 62 | typedef cl_float cl_real; 63 | #define EPS 1.0e-7 64 | #endif 65 | 66 | #define SQR(x) (( (x)*(x) )) 67 | 68 | typedef unsigned int uint; 69 | 70 | #define CHECK 1 71 | #define TILE_SIZE 256 72 | #define DETAILED_TIMING 0 73 | 74 | struct timespec tstart; 75 | double time_sum; 76 | 77 | int is_nvidia = 0; 78 | 79 | cl_context context; 80 | cl_command_queue queue; 81 | cl_program program; 82 | cl_kernel init_kernel, hash_kernel, scan1_kernel, scan2_kernel, scan3_kernel; 83 | 84 | void sorts( uint length, double min_diff, double max_diff, double min_val ); 85 | cl_mem parallelHash( uint length, cl_mem arr, double min_diff, double max_diff, double min_val, double max_val, double *time ); 86 | double* hashsort( uint length, double *arr, double min_diff, double min_val, double max_val ); 87 | double generate_array( uint size, double *ptr, 
double mindx, double maxdx, double min, double *max ); 88 | 89 | //int compare (const void * a, const void * b) { return ( *(double*)a - *(double*)b ); } 90 | 91 | int compare (const void *a, const void *b) 92 | { 93 | const double *da = (const double *) a; 94 | const double *db = (const double *) b; 95 | 96 | return (*da > *db) - (*da < *db); 97 | } 98 | 99 | int main (int argc, const char * argv[]) 100 | { 101 | cl_int error; 102 | 103 | #ifdef HAVE_OPENCL 104 | GPUInit(&context, &queue, &is_nvidia, &program, "sort_kern.cl"); 105 | #endif 106 | 107 | struct timespec tim; //random seeding 108 | clock_gettime(CLOCK_MONOTONIC, &tim); 109 | //srand(tim.tv_sec*tim.tv_nsec); 110 | 111 | srand(0); 112 | 113 | #ifdef HAVE_OPENCL 114 | init_kernel = clCreateKernel(program, "init_kern", &error); 115 | hash_kernel = clCreateKernel(program, "hash_kern", &error); 116 | scan1_kernel = clCreateKernel(program, "scan1", &error); 117 | scan2_kernel = clCreateKernel(program, "scan2", &error); 118 | scan3_kernel = clCreateKernel(program, "scan3", &error); 119 | #endif 120 | 121 | printf("\n Sorting Performance Results\n\n"); 122 | #ifdef __APPLE_CC__ 123 | printf("Size, \tQsort, \tHeapsort, \tMergesort, \tHash CPU, \tHash GPU\n"); 124 | #else 125 | printf("Size, \tQsort, \tHash CPU, \tHash GPU\n"); 126 | #endif 127 | 128 | uint max_size = 0; 129 | #ifdef HAVE_CL_DOUBLE 130 | max_size = 100000000; 131 | #else 132 | max_size = 10000000; 133 | #endif 134 | //else max_size = 131071; 135 | 136 | for (uint max_mult = 2; max_mult <= 8; max_mult *= 2){ 137 | printf("\nMax diff is %d times min_diff\n",max_mult); 138 | for( uint i = 1024; i <= max_size; i*=2 ) { 139 | #ifndef HAVE_CL_DOUBLE 140 | if (max_mult > 2 && i > 5000000) continue; 141 | if (max_mult > 4 && i > 4000000) continue; 142 | if (max_mult > 8 && i > 2000000) continue; 143 | if (max_mult > 16 && i > 1000000) continue; 144 | #endif 145 | if (max_mult > 10 && i > 50000000) continue; 146 | if (max_mult > 30 && i > 20000000) 
continue; 147 | printf("%d, ", i); 148 | sorts(i, 2.0, (double)max_mult*2.0, 0.0); 149 | printf("\n"); 150 | } 151 | } 152 | 153 | } 154 | 155 | void sorts( uint length, double min_diff, double max_diff, double min_val ) { 156 | int icount; 157 | cl_int error = 0; 158 | double max_val = min_val; //reset in generate_array call 159 | double *sorted=NULL, *sort_test=NULL, *arr=NULL; 160 | 161 | arr = (double*)malloc(length*sizeof(double)); 162 | 163 | //generate randomly shuffled array with given conditions to be sorted 164 | generate_array(length, arr, min_diff, max_diff, min_val, &max_val); 165 | 166 | /* Qsort */ 167 | sorted = (double*)malloc(length*sizeof(double)); 168 | for(uint i = 0; i < length; i++) { sorted[i] = arr[i]; } 169 | cpu_timer_start(&tstart); 170 | qsort(sorted, length, sizeof(double), compare); 171 | time_sum += cpu_timer_stop(tstart); 172 | printf("\t%.6lf,", time_sum); 173 | 174 | 175 | #ifdef __APPLE_CC__ 176 | /* Heapsort */ 177 | sort_test = (double*)malloc(length*sizeof(double)); 178 | for(uint i = 0; i < length; i++) { sort_test[i] = arr[i]; } 179 | cpu_timer_start(&tstart); 180 | heapsort(sort_test, length, sizeof(double), compare); 181 | time_sum += cpu_timer_stop(tstart); 182 | printf("\t%.6lf,", time_sum); 183 | #ifdef CHECK 184 | for(uint i = 0; i < length; i++) { if (sort_test[i] != sorted[i]) printf("Check failed for heapsort index %d heapsort value %lf gold standard %lf\n",i,sort_test[i],sorted[i]); } 185 | #endif 186 | free(sort_test); 187 | sort_test = NULL; 188 | 189 | /* Mergesort */ 190 | sort_test = (double*)malloc(length*sizeof(double)); 191 | for(uint i = 0; i < length; i++) { sort_test[i] = arr[i]; } 192 | cpu_timer_start(&tstart); 193 | mergesort(sort_test, length, sizeof(double), compare); 194 | time_sum += cpu_timer_stop(tstart); 195 | printf("\t%.6lf,", time_sum); 196 | #ifdef CHECK 197 | for(uint i = 0; i < length; i++) { if (sort_test[i] != sorted[i]) printf("Check failed for mergesort index %d mergesort value %lf 
gold standard %lf\n",i,sort_test[i],sorted[i]); } 198 | #endif 199 | free(sort_test); 200 | sort_test = NULL; 201 | #endif 202 | 203 | 204 | /* Hashsort CPU */ 205 | cpu_timer_start(&tstart); 206 | sort_test = hashsort(length, arr, min_diff, min_val, max_val); 207 | time_sum += cpu_timer_stop(tstart); 208 | printf("\t%.6lf,", time_sum); 209 | #ifdef CHECK 210 | icount=0; 211 | for(uint i = 0; i < length; i++) { 212 | if (sort_test[i] != sorted[i]) { 213 | printf("Check failed for hashsort CPU index %d hashsort value %lf gold standard %lf\n",i,sort_test[i],sorted[i]); 214 | icount++; 215 | } 216 | } 217 | #endif 218 | free(sort_test); 219 | sort_test = NULL; 220 | 221 | 222 | #ifdef HAVE_OPENCL 223 | uint hash_size = (uint)((max_val - min_val)/min_diff + 2.5); 224 | uint alloc_size = 2*length*sizeof(real)+hash_size*sizeof(int)+(hash_size+hash_size-1)/TILE_SIZE*sizeof(int); 225 | //printf("\tSize is %lu\t", alloc_size); 226 | if (is_nvidia || alloc_size < 850000000) { 227 | /* Hashsort GPU */ 228 | real *arr_real = (real*)malloc(length*sizeof(real)); 229 | for(uint i = 0; i < length; i++) { arr_real[i] = (real)arr[i]; } 230 | cl_mem xcoor_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, length*sizeof(real), NULL, &error); 231 | cl_mem sorted_buffer = NULL; 232 | if (xcoor_buffer != NULL) { 233 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 234 | error = clEnqueueWriteBuffer(queue, xcoor_buffer, CL_TRUE, 0, length*sizeof(real), arr_real, 0, NULL, NULL); 235 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 236 | 237 | sorted_buffer = parallelHash(length, xcoor_buffer, min_diff, max_diff, min_val, max_val, &time_sum); 238 | clReleaseMemObject(xcoor_buffer); 239 | } 240 | free(arr_real); 241 | if (sorted_buffer != NULL) { 242 | 243 | real *sort_real = (real*)malloc(length*sizeof(real)); 244 | error = clEnqueueReadBuffer(queue, sorted_buffer, CL_TRUE, 0, length*sizeof(real), sort_real, 0, NULL, NULL); 245 | if 
(error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 246 | clReleaseMemObject(sorted_buffer); 247 | 248 | printf("\t%.6lf,", time_sum); 249 | sort_test = (double*)malloc(length*sizeof(double)); 250 | for(uint i = 0; i < length; i++) { sort_test[i] = (double)sort_real[i]; } 251 | free(sort_real); 252 | #ifdef CHECK 253 | 254 | icount=0; 255 | for(uint i = 0; i < length; i++) { 256 | if (fabs(sort_test[i] - sorted[i])/sorted[i] > EPS) { 257 | printf("Check failed for hashsort GPU index %d hashsort value %lf gold standard %lf\n",i,sort_test[i],sorted[i]); 258 | icount++; 259 | } 260 | if (icount > 20) exit(0); 261 | } 262 | #endif 263 | free(sort_test); 264 | sort_test = NULL; 265 | } else { 266 | printf("\tnot_run, "); 267 | } 268 | } else { 269 | printf("\tnot_run, "); 270 | } 271 | #endif 272 | 273 | 274 | free(sorted); 275 | sorted = NULL; 276 | free(arr); 277 | arr=NULL; 278 | } 279 | 280 | double* hashsort( uint length, double *arr, double min_diff, double min_val, double max_val ) { 281 | uint hash_size; 282 | int *hash=NULL; 283 | double *sorted=NULL; 284 | 285 | sorted = (double*)malloc(length*sizeof(double)); 286 | 287 | //create hash table with buckets of size min_diff 288 | // -- +2.5 rounds up and adds one space to either side 289 | hash_size = (uint)((max_val - min_val)/min_diff + 2.5); 290 | hash = (int*)malloc(hash_size*sizeof(int)); 291 | 292 | //set all elements of hash array to -1 293 | memset(hash, -1, hash_size*sizeof(int)); 294 | 295 | for(uint i = 0; i < length; i++) { 296 | //place index of current arr element into hash according to where the arr value 297 | hash[(int)((arr[i]-min_val)/min_diff)] = i; 298 | } 299 | 300 | int count=0; 301 | for(uint i = 0; i < hash_size; i++) { 302 | if(hash[i] >= 0) { 303 | //sweep through hash and put set values in a sorted array 304 | sorted[count] = arr[hash[i]]; 305 | count++; 306 | } 307 | } 308 | 309 | free(hash); 310 | return sorted; 311 | } 312 | 313 | /* generate a randomly mixed up 
array with size size to be stored in pointer. the elements will have a minimum value min, and 314 | the difference between elements when sorted will be between mindx and maxdx. the maximum value is recorded in max. */ 315 | double generate_array( uint size, double *ptr, double mindx, double maxdx, double min, double *max ) { 316 | 317 | double swap; 318 | int index, front = 0; 319 | double running_min = maxdx; 320 | 321 | ptr[0] = min; //start the array using the minimum value 322 | 323 | /* for each element, add a random value between mindx and maxdx to the previous element's value */ 324 | for(int i = 1; i < size; i++) { 325 | ptr[i] = ptr[i-1] + mindx + ((double)rand() * (maxdx - mindx) / (double)RAND_MAX); 326 | if(ptr[i]-ptr[i-1] < running_min) running_min = ptr[i]-ptr[i-1]; 327 | } 328 | 329 | *max = ptr[size-1]; //set the max value to the last element's value 330 | //*max = min + (size-1) * maxdx; //force the range for timings isolating a different variable 331 | 332 | /* Mix up the array by selecting elements from shrinking front portion of array and placing them on back end of array */ 333 | for(int i = 0; (i < size) && (size - i != 0) ; i++) { 334 | index = rand() % (size - i - front) + front; 335 | swap = ptr[size-i-1]; 336 | ptr[size-i-1] = ptr[index]; 337 | ptr[index] = swap; 338 | } 339 | return running_min; 340 | } 341 | 342 | #ifdef HAVE_OPENCL 343 | cl_mem parallelHash( uint length, cl_mem xcoor_buffer, double min_diff, double max_diff, double min_val, double max_val, double *time ) { 344 | 345 | cl_mem sorted_buffer, hash_buffer, ioffset_buffer; 346 | 347 | cl_int error = 0; 348 | long gpu_time = 0; 349 | 350 | uint hash_size = (uint)((max_val - min_val)/min_diff + 2.5); 351 | 352 | real min_val_real = (real)min_val; 353 | real min_diff_real = (real)min_diff; 354 | 355 | hash_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, hash_size*sizeof(int), NULL, &error); 356 | if (error != CL_SUCCESS) { 357 | //printf("Error is %d at line 
%d\n",error,__LINE__); 358 | return(NULL); 359 | } 360 | 361 | /****************** 362 | * Init to -1 363 | ******************/ 364 | 365 | error = clSetKernelArg(init_kernel, 0, sizeof(cl_uint), &hash_size); 366 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 367 | error = clSetKernelArg(init_kernel, 1, sizeof(cl_mem), (void*)&hash_buffer); 368 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 369 | 370 | size_t global_work_size[1]; 371 | size_t local_work_size[1]; 372 | 373 | local_work_size[0] = TILE_SIZE; 374 | global_work_size[0] = ((hash_size+local_work_size[0]-1)/local_work_size[0])*local_work_size[0]; 375 | 376 | cl_event hash_init_event; 377 | 378 | error = clEnqueueNDRangeKernel(queue, init_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &hash_init_event); 379 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 380 | 381 | /****************** 382 | * Hash Kernel 383 | ******************/ 384 | 385 | error = clSetKernelArg(hash_kernel, 0, sizeof(real), &min_val_real); 386 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 387 | error = clSetKernelArg(hash_kernel, 1, sizeof(real), &min_diff_real); 388 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 389 | error = clSetKernelArg(hash_kernel, 2, sizeof(cl_uint), &length); 390 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 391 | error = clSetKernelArg(hash_kernel, 3, sizeof(cl_mem), (void*)&xcoor_buffer); 392 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 393 | error = clSetKernelArg(hash_kernel, 4, sizeof(cl_mem), (void*)&hash_buffer); 394 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 395 | 396 | global_work_size[0] = ((length+local_work_size[0]-1)/local_work_size[0])*local_work_size[0]; 397 | 398 | cl_event hash_kernel_event; 399 | 400 | error = 
clEnqueueNDRangeKernel(queue, hash_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &hash_kernel_event); 401 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 402 | 403 | /*********************** 404 | * Prefix Scan Kernels 405 | ***********************/ 406 | 407 | /* scan 1 */ 408 | global_work_size[0] = ((hash_size+local_work_size[0]-1)/local_work_size[0])*local_work_size[0]; 409 | 410 | int group_size = (int)(global_work_size[0]/local_work_size[0]); 411 | 412 | ioffset_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, group_size*sizeof(uint), NULL, &error); 413 | if (error != CL_SUCCESS) { 414 | //printf("Error is %d at line %d\n",error,__LINE__); 415 | clReleaseMemObject(hash_buffer); 416 | return(NULL); 417 | } 418 | 419 | error = clSetKernelArg(scan1_kernel, 0, sizeof(cl_uint), &hash_size); 420 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 421 | error = clSetKernelArg(scan1_kernel, 1, sizeof(cl_mem), (void*)&ioffset_buffer); 422 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 423 | error = clSetKernelArg(scan1_kernel, 2, local_work_size[0]*sizeof(uint), NULL); 424 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 425 | error = clSetKernelArg(scan1_kernel, 3, sizeof(cl_mem), (void*)&hash_buffer); 426 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 427 | 428 | cl_event scan1_event; 429 | 430 | error = clEnqueueNDRangeKernel(queue, scan1_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &scan1_event); 431 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 432 | 433 | //clWaitForEvents(1, &scan1_event); 434 | //exit(0); 435 | 436 | /* scan 2 */ 437 | //global_work_size[0] = ((group_size+local_work_size[0]-1)/local_work_size[0])*local_work_size[0]; 438 | global_work_size[0] = local_work_size[0]; 439 | 440 | cl_event scan2_event; 441 | 442 | //printf("\n local: %d global: 
%d\n", local_work_size[0], global_work_size[0]); 443 | 444 | 445 | int elements_per_thread = (group_size+local_work_size[0]-1)/local_work_size[0]; 446 | //printf("\ngroup_size %d EPT %d\n",group_size,elements_per_thread ); 447 | 448 | error = clSetKernelArg(scan2_kernel, 0, local_work_size[0]*sizeof(uint), NULL); 449 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 450 | error = clSetKernelArg(scan2_kernel, 1, sizeof(cl_mem), (void*)&ioffset_buffer); 451 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 452 | error = clSetKernelArg(scan2_kernel, 2, sizeof(uint), &group_size); 453 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 454 | error = clEnqueueNDRangeKernel(queue, scan2_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &scan2_event); 455 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 456 | 457 | #ifdef XXX 458 | uint *ioffset = (uint *)malloc(group_size*sizeof(uint)); 459 | error = clEnqueueReadBuffer(queue, ioffset_buffer, CL_TRUE, 0, group_size*sizeof(uint), ioffset, 0, NULL, NULL); 460 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 461 | 462 | printf("\n"); 463 | for (uint i=0; i= 0 ? 1 : 0; 122 | barrier(CLK_GLOBAL_MEM_FENCE); 123 | 124 | for(uint offset = ntX >> 1; offset > 32; offset >>= 1) { 125 | if(tiX < offset) { 126 | itile[tiX] += itile[tiX+offset]; 127 | } 128 | barrier(CLK_LOCAL_MEM_FENCE); 129 | } 130 | 131 | if(giX >= isize) return; 132 | 133 | // Unroll the remainder of the loop as 32 threads must proceed in lockstep. 
134 | if (tiX < 32) 135 | { itile[tiX] += itile[tiX+32]; 136 | itile[tiX] += itile[tiX+16]; 137 | itile[tiX] += itile[tiX+8]; 138 | itile[tiX] += itile[tiX+4]; 139 | itile[tiX] += itile[tiX+2]; 140 | itile[tiX] += itile[tiX+1]; } 141 | 142 | if(tiX == 0) { 143 | ioffset[group_id] = itile[0]; 144 | } 145 | } 146 | 147 | inline uint scan_warp_exclusive(__local volatile uint *input, const uint idx, const uint lane) { 148 | if (lane > 0 ) input[idx] += input[idx - 1]; 149 | if (lane > 1 ) input[idx] += input[idx - 2]; 150 | if (lane > 3 ) input[idx] += input[idx - 4]; 151 | if (lane > 7 ) input[idx] += input[idx - 8]; 152 | if (lane > 15) input[idx] += input[idx - 16]; 153 | 154 | return (lane > 0) ? input[idx-1] : 0; 155 | } 156 | 157 | inline uint scan_warp_inclusive(__local volatile uint *input, const uint idx, const uint lane) { 158 | if (1) { 159 | if (lane > 0 ) input[idx] += input[idx - 1]; 160 | if (lane > 1 ) input[idx] += input[idx - 2]; 161 | if (lane > 3 ) input[idx] += input[idx - 4]; 162 | if (lane > 7 ) input[idx] += input[idx - 8]; 163 | if (lane > 15) input[idx] += input[idx - 16]; 164 | 165 | return input[idx]; 166 | } 167 | } 168 | 169 | inline uint scan_workgroup_exclusive( 170 | __local uint* itile, 171 | const uint tiX, 172 | const uint lane, 173 | const uint warpID) { 174 | 175 | // Step 1: scan each warp 176 | uint val = scan_warp_exclusive(itile, tiX, lane); 177 | barrier(CLK_LOCAL_MEM_FENCE); 178 | 179 | // Step 2: Collect per-warp sums 180 | if (lane == 31) itile[warpID] = itile[tiX]; 181 | barrier(CLK_LOCAL_MEM_FENCE); 182 | 183 | // Step 3: Use 1st warp to scan per-warp sums 184 | if (warpID == 0) scan_warp_inclusive(itile, tiX, lane); 185 | barrier(CLK_LOCAL_MEM_FENCE); 186 | 187 | // Step 4: Accumulate results from Steps 1 and 3 188 | if (warpID > 0) val += itile[warpID-1]; 189 | barrier(CLK_LOCAL_MEM_FENCE); 190 | 191 | // Step 6: Write and return the final result 192 | itile[tiX] = val; 193 | barrier(CLK_LOCAL_MEM_FENCE); 194 | 195 | 
return val; 196 | } 197 | 198 | __kernel void scan2( 199 | __local uint* itile, 200 | __global uint* ioffset, 201 | const uint size) { 202 | 203 | size_t tiX = get_local_id(0); 204 | const uint gID = get_group_id(0); 205 | const uint ntX = get_local_size(0); 206 | 207 | const uint lane = tiX & 31; 208 | const uint warpID = tiX >> 5; 209 | const uint EPT = (size+ntX-1)/ntX; //elements_per_thread; 210 | 211 | uint reduceValue = 0; 212 | 213 | // #pragma unroll 4 214 | for(uint i = 0; i < EPT; ++i) 215 | { 216 | uint offsetIdx = i * ntX + tiX; 217 | 218 | #ifdef IS_NVIDIA 219 | // if (offsetIdx >= size) return; 220 | #endif 221 | 222 | // Step 1: Read ntX elements from global (off-chip) memory to local memory (on-chip) 223 | uint input = 0; 224 | if (offsetIdx < size) input = ioffset[offsetIdx]; 225 | itile[tiX] = input; 226 | barrier(CLK_LOCAL_MEM_FENCE); 227 | 228 | // Step 2: Perform scan on ntX elements 229 | uint val = scan_workgroup_exclusive(itile, tiX, lane, warpID); 230 | 231 | // Step 3: Propagate reduced result from previous block of ntX elements 232 | val += reduceValue; 233 | 234 | // Step 4: Write out data to global memory 235 | if (offsetIdx < size) ioffset[offsetIdx] = val; 236 | 237 | // Step 5: Choose reduced value for next iteration 238 | if (tiX == (ntX-1)) itile[tiX] = input + val; 239 | barrier(CLK_LOCAL_MEM_FENCE); 240 | 241 | reduceValue = itile[ntX-1]; 242 | barrier(CLK_LOCAL_MEM_FENCE); 243 | } 244 | } 245 | 246 | __kernel void scan3 ( 247 | const int isize, 248 | __global const uint *ioffset, 249 | __local uint *itile, 250 | __global const int *temp, 251 | __global const cell *arr, 252 | __global cell *sorted) { 253 | 254 | const uint giX = get_global_id(0); 255 | const uint tiX = get_local_id(0); 256 | const uint group_id = get_group_id(0); 257 | 258 | const uint lane = tiX & 31; 259 | const uint warpid = tiX >> 5; 260 | 261 | // Step 1: load global data into tile 262 | int temp_val = 0; 263 | if (giX < isize) temp_val = temp[giX]; 264 | 
itile[tiX] = 0; 265 | if (temp_val >= 0) itile[tiX] = 1; 266 | barrier(CLK_LOCAL_MEM_FENCE); 267 | 268 | // Step 2: scan each warp 269 | uint val = scan_warp_exclusive(itile, tiX, lane); 270 | barrier(CLK_LOCAL_MEM_FENCE); 271 | 272 | // Step 3: Collect per-warp sums 273 | if (lane == 31) itile[warpid] = itile[tiX]; 274 | barrier(CLK_LOCAL_MEM_FENCE); 275 | 276 | // Step 4: Use 1st warp to scan per-warp sums 277 | if (warpid == 0) scan_warp_inclusive(itile, tiX, lane); 278 | barrier(CLK_LOCAL_MEM_FENCE); 279 | 280 | // Step 5: Accumulate results from Steps 2 and 4 281 | if (warpid > 0) val += itile[warpid-1]; 282 | barrier(CLK_LOCAL_MEM_FENCE); 283 | 284 | if (giX >= isize || temp_val < 0) return; 285 | 286 | // Step 6: Write and return the final result 287 | //itile[tiX] = val; 288 | //barrier(CLK_LOCAL_MEM_FENCE); 289 | 290 | val += ioffset[group_id]; //index to write to for each thread 291 | 292 | sorted[val] = arr[temp_val]; 293 | } 294 | 295 | -------------------------------------------------------------------------------- /sort_kern.cl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012-2019, Triad National Security, LLC. 3 | * All rights Reserved. 4 | * 5 | * Copyright 2012-2019. Triad National Security, LLC. This material was produced 6 | * under U.S. Government contract 89233218CNA000001 for Los Alamos National 7 | * Laboratory (LANL), which is operated by Triad National Security, LLC 8 | * for the U.S. Department of Energy. The U.S. Government has rights to use, 9 | * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR 10 | * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 11 | * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified 12 | * to produce derivative works, such modified software should be clearly marked, 13 | * so as not to confuse it with the version available from LANL. 
*
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy
 * of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.”
 *
 * This is LANL Copyright Disclosure C13002/LA-CC-12-022
 *
 */

/*
 * Authors: Bob Robey XCP-2 brobey@lanl.gov
 * David Nicholaeff dnic@lanl.gov, mtrxknight@aol.com
 * Rachel Robey rnrobey@gmail.com
 */

/* sort_kern.cl */

#ifdef HAVE_CL_DOUBLE
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
typedef double real;
#else
typedef float real;
#endif

/* Set the first `size` entries of temp to -1 (one work-item per entry). */
__kernel void init_kern(
        const uint size,
        __global int *temp) {

    const uint idx = get_global_id(0);

    if (idx >= size) return;

    temp[idx] = -1;
}

/* Store the index of each element of arr in bucket
 * (arr[idx]-min_val)/min_diff (truncated) of temp.
 * Work-items with idx >= length do nothing. */
__kernel void hash_kern(
        const real min_val,
        const real min_diff,
        const uint length,
        __global const real *arr,
        __global int *temp) {

    const uint idx = get_global_id(0);

    if(idx >= length) return;

    temp[(uint)((arr[idx]-min_val)/min_diff)] = idx;
}

/*
 * scan1: per-work-group count of occupied hash entries.
 *
 * Each work-item contributes 1 if its entry of temp is >= 0 and 0 otherwise
 * (entries at or beyond isize count as 0).  The work-group reduces these
 * flags in local memory and work-item 0 writes the total to
 * ioffset[group_id].
 *
 *   isize    number of valid entries in temp
 *   ioffset  one output count per work-group
 *   itile    local scratch, one uint per work-item
 *   temp     hash table; negative means empty
 *
 * The unrolled tail reads itile[tiX+32], so the work-group must hold at
 * least 64 work-items.
 */
__kernel void scan1(
        const uint isize,
        __global uint *ioffset,
        __local volatile uint *itile,
        __global const int *temp) {

    const uint giX = get_global_id(0);
    const uint tiX = get_local_id(0);
    const uint ntX = get_local_size(0);
    const uint group_id = get_group_id(0);

    int temp_val = -1;
    if (giX < isize) temp_val = temp[giX];

    itile[tiX] = temp_val >= 0 ? 1 : 0;
    // itile is __local memory, so the barrier must order local accesses; a
    // global-only fence does not make these writes visible to the group.
    barrier(CLK_LOCAL_MEM_FENCE);

    for(uint offset = ntX >> 1; offset > 32; offset >>= 1) {
        if(tiX < offset) {
            itile[tiX] += itile[tiX+offset];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // No barrier follows, so out-of-range work-items may leave here.  Their
    // flags are zero and they never precede an in-range work-item, so the
    // sum accumulated by work-item 0 is unaffected.
    if(giX >= isize) return;

    // Unroll the remainder of the loop as 32 threads must proceed in lockstep.
    if (tiX < 32) {
        itile[tiX] += itile[tiX+32];
        itile[tiX] += itile[tiX+16];
        itile[tiX] += itile[tiX+8];
        itile[tiX] += itile[tiX+4];
        itile[tiX] += itile[tiX+2];
        itile[tiX] += itile[tiX+1];
    }

    if(tiX == 0) {
        ioffset[group_id] = itile[0];
    }
}

/* In-place prefix sum over one group of 32 consecutive entries of input,
 * written without barriers (the 32 work-items are expected to run in
 * lockstep).  idx is this work-item's entry, lane its position 0..31 in the
 * group.  Returns the exclusive sum: the total of the lanes before this one. */
inline uint scan_warp_exclusive(__local volatile uint *input, const uint idx, const uint lane) {
    if (lane > 0 ) input[idx] += input[idx - 1];
    if (lane > 1 ) input[idx] += input[idx - 2];
    if (lane > 3 ) input[idx] += input[idx - 4];
    if (lane > 7 ) input[idx] += input[idx - 8];
    if (lane > 15) input[idx] += input[idx - 16];

    return (lane > 0) ? input[idx-1] : 0;
}

/* Same in-place scan as scan_warp_exclusive, but returns the inclusive sum:
 * the total of this lane and all lanes before it. */
inline uint scan_warp_inclusive(__local volatile uint *input, const uint idx, const uint lane) {
    if (lane > 0 ) input[idx] += input[idx - 1];
    if (lane > 1 ) input[idx] += input[idx - 2];
    if (lane > 3 ) input[idx] += input[idx - 4];
    if (lane > 7 ) input[idx] += input[idx - 8];
    if (lane > 15) input[idx] += input[idx - 16];

    return input[idx];
}

/* Exclusive prefix sum across the whole work-group.
 * On entry itile[tiX] holds this work-item's value; on return itile[tiX]
 * and the return value both hold the sum of the values of all work-items
 * before tiX.  Must be called by every work-item of the group because it
 * contains barriers. */
inline uint scan_workgroup_exclusive(
        __local uint* itile,
        const uint tiX,
        const uint lane,
        const uint warpID) {

    // Step 1: scan each warp
    uint val = scan_warp_exclusive(itile, tiX, lane);
    barrier(CLK_LOCAL_MEM_FENCE);

    // Step 2: Collect per-warp sums
    if (lane == 31) itile[warpID] = itile[tiX];
    barrier(CLK_LOCAL_MEM_FENCE);

    // Step 3: Use 1st warp to scan per-warp sums
    if (warpID == 0) scan_warp_inclusive(itile, tiX, lane);
    barrier(CLK_LOCAL_MEM_FENCE);

    // Step 4: Accumulate results from Steps 1 and 3
    if (warpID > 0) val += itile[warpID-1];
    barrier(CLK_LOCAL_MEM_FENCE);

    // Step 5: Write and return the final result
    itile[tiX] = val;
    barrier(CLK_LOCAL_MEM_FENCE);

    return val;
}

/*
 * scan2: in-place exclusive prefix sum of ioffset[0..size).
 *
 * The array is processed in chunks of one work-group width; the running
 * total of the chunks already processed is carried in reduceValue.  The
 * indexing uses only the local id, so the kernel is written for a single
 * work-group.
 *
 *   itile    local scratch, one uint per work-item
 *   ioffset  values to scan, overwritten with the result
 *   size     number of entries in ioffset
 */
__kernel void scan2(
        __local uint* itile,
        __global uint* ioffset,
        const uint size) {

    const uint tiX = get_local_id(0);
    const uint ntX = get_local_size(0);

    const uint lane = tiX & 31;
    const uint warpID = tiX >> 5;
    const uint EPT = (size+ntX-1)/ntX; //elements_per_thread;

    uint reduceValue = 0;

    // #pragma unroll 4
    for(uint i = 0; i < EPT; ++i)
    {
        uint offsetIdx = i * ntX + tiX;

#ifdef IS_NVIDIA
        // if (offsetIdx >= size) return;
#endif

        // Step 1: Read ntX elements from global (off-chip) memory to local memory (on-chip)
        uint input = 0;
        if (offsetIdx < size) input = ioffset[offsetIdx];
        itile[tiX] = input;
        barrier(CLK_LOCAL_MEM_FENCE);

        // Step 2: Perform scan on ntX elements
        uint val = scan_workgroup_exclusive(itile, tiX, lane, warpID);

        // Step 3: Propagate reduced result from previous block of ntX elements
        val += reduceValue;

        // Step 4: Write out data to global memory
        if (offsetIdx < size) ioffset[offsetIdx] = val;

        // Step 5: Choose reduced value for next iteration
        if (tiX == (ntX-1)) itile[tiX] = input + val;
        barrier(CLK_LOCAL_MEM_FENCE);

        reduceValue = itile[ntX-1];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}

/* One chunk of the scan performed by scan2, as a helper: scans the ntX
 * entries of ioffset starting at this work-item's offsetIdx, adds the
 * running total reduceValue, writes the result back, and returns the new
 * running total (last input plus last result of the chunk).  Must be called
 * by every work-item of the group because it contains barriers. */
inline uint do_element_pass(uint offsetIdx, uint ntX, uint tiX, uint lane, uint warpID,
        uint reduceValue, uint size, __global uint *ioffset, __local uint *itile) {
    barrier(CLK_LOCAL_MEM_FENCE);

    // Step 1: Read ntX elements from global (off-chip) memory to local memory (on-chip)
    uint input = 0;
    if (offsetIdx < size) input = ioffset[offsetIdx];
    itile[tiX] = input;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Step 2: Perform scan on ntX elements
    uint val = scan_workgroup_exclusive(itile, tiX, lane, warpID);

    // Step 3: Propagate reduced result from previous block of ntX elements
    val += reduceValue;

    // Step 4: Write out data to global memory
    if (offsetIdx < size) ioffset[offsetIdx] = val;

    // Step 5: Choose reduced value for next iteration
    if (tiX == (ntX-1)) itile[tiX] = input + val;
    barrier(CLK_LOCAL_MEM_FENCE);

    reduceValue = itile[ntX-1];

    return(reduceValue);
}

__kernel void scan_lev(
        __local uint* itile,
        __global uint* ioffset,
        __global uint* workgroup_results,
        const uint size) {

    uint tiX = get_local_id(0);
    uint giX = get_global_id(0);
244 | const uint gID = get_group_id(0); 245 | const uint ntX = get_local_size(0); 246 | 247 | const uint lane = tiX & 31; 248 | const uint warpID = tiX >> 5; 249 | 250 | workgroup_results[gID] = 0; 251 | 252 | // Step 1: Read ntX elements from global (off-chip) memory to local memory (on-chip) 253 | uint input = 0; 254 | if (giX < size) input = ioffset[giX]; 255 | itile[tiX] = input; 256 | barrier(CLK_LOCAL_MEM_FENCE); 257 | 258 | // Step 2: Perform scan on ntX elements 259 | uint val = scan_workgroup_exclusive(itile, tiX, lane, warpID); 260 | 261 | // Step 3: Collect per-workgroup partial results 262 | workgroup_results[gID] = itile[tiX]; 263 | } 264 | 265 | __kernel void scan_workgroup_results( 266 | __global uint* workgroup_results) 267 | { 268 | uint tiX = get_local_id(0); 269 | 270 | const uint lane = tiX & 31; 271 | const uint warpID = tiX >> 5; 272 | 273 | // Step 4: Use 1st warp to scan workgroup_results 274 | //if (warpID == 0) scan_warp_inclusive(workgroup_results, tiX, lane); 275 | } 276 | 277 | __kernel void accumulate_workgroup_results() 278 | { 279 | uint tiX = get_local_id(0); 280 | 281 | const uint warpID = tiX >> 5; 282 | 283 | // Step 5: Accumulate results from steps 2 and 4 284 | //uint val += itile[warpID-1]; 285 | } 286 | 287 | __kernel void scan3 ( 288 | const int isize, 289 | __global const uint *ioffset, 290 | __local uint *itile, 291 | __global const int *temp, 292 | __global const real *arr, 293 | __global real *sorted) { 294 | 295 | const uint giX = get_global_id(0); 296 | const uint tiX = get_local_id(0); 297 | const uint group_id = get_group_id(0); 298 | 299 | const uint lane = tiX & 31; 300 | const uint warpid = tiX >> 5; 301 | 302 | // Step 1: load global data into tile 303 | int temp_val = 0; 304 | if (giX < isize) temp_val = temp[giX]; 305 | itile[tiX] = 0; 306 | if (temp_val >= 0) itile[tiX] = 1; 307 | barrier(CLK_LOCAL_MEM_FENCE); 308 | 309 | // Step 2: scan each warp 310 | uint val = scan_warp_exclusive(itile, tiX, lane); 311 | 
barrier(CLK_LOCAL_MEM_FENCE); 312 | 313 | // Step 3: Collect per-warp sums 314 | if (lane == 31) itile[warpid] = itile[tiX]; 315 | barrier(CLK_LOCAL_MEM_FENCE); 316 | 317 | // Step 4: Use 1st warp to scan per-warp sums 318 | if (warpid == 0) scan_warp_inclusive(itile, tiX, lane); 319 | barrier(CLK_LOCAL_MEM_FENCE); 320 | 321 | // Step 5: Accumulate results from Steps 2 and 4 322 | if (warpid > 0) val += itile[warpid-1]; 323 | barrier(CLK_LOCAL_MEM_FENCE); 324 | 325 | if (giX >= isize || temp_val < 0) return; 326 | 327 | // Step 6: Write and return the final result 328 | //itile[tiX] = val; 329 | //barrier(CLK_LOCAL_MEM_FENCE); 330 | 331 | val += ioffset[group_id]; //index to write to for each thread 332 | 333 | sorted[val] = arr[temp_val]; 334 | } 335 | 336 | -------------------------------------------------------------------------------- /table.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012-2019, Triad National Security, LLC. 3 | * All rights Reserved. 4 | * 5 | * Copyright 2012-2019. Triad National Security, LLC. This material was produced 6 | * under U.S. Government contract 89233218CNA000001 for Los Alamos National 7 | * Laboratory (LANL), which is operated by Triad National Security, LLC 8 | * for the U.S. Department of Energy. The U.S. Government has rights to use, 9 | * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR 10 | * TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 11 | * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified 12 | * to produce derivative works, such modified software should be clearly marked, 13 | * so as not to confuse it with the version available from LANL. 14 | * 15 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 16 | * use this file except in compliance with the License. 
You may obtain a copy 17 | * of the License at 18 | * 19 | * http://www.apache.org/licenses/LICENSE-2.0 20 | * 21 | * Unless required by applicable law or agreed to in writing, software distributed 22 | * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 23 | * CONDITIONS OF ANY KIND, either express or implied. See the License for the 24 | * specific language governing permissions and limitations under the License.” 25 | * 26 | * This is LANL Copyright Disclosure C13002/LA-CC-12-022 27 | * 28 | */ 29 | 30 | /* 31 | * Authors: Bob Robey XCP-2 brobey@lanl.gov 32 | * David Nicholaeff dnic@lanl.gov, mtrxknight@aol.com 33 | * Rachel Robey rnrobey@gmail.com 34 | */ 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include "gpu.h" 42 | #include "timer.h" 43 | 44 | cl_kernel interpolate_kernel; 45 | 46 | #ifdef HAVE_CONFIG_H 47 | #include "config.h" 48 | #endif 49 | 50 | #ifdef __APPLE_CC__ 51 | #include 52 | #else 53 | #include 54 | #endif 55 | 56 | #ifdef HAVE_CL_DOUBLE 57 | typedef double real; 58 | typedef cl_double cl_real; 59 | typedef cl_double4 cl_real4; 60 | #define EPS 1.0e-8 61 | #else 62 | typedef float real; 63 | typedef cl_float cl_real; 64 | typedef cl_float4 cl_real4; 65 | #define EPS 1.0e-5 66 | #endif 67 | 68 | #define TILE_SIZE 256 69 | #define dataval(x,y) data[(x)+((y)*xstride)] 70 | 71 | cl_context context; 72 | cl_command_queue queue; 73 | cl_program program; 74 | int is_nvidia=0; 75 | 76 | double random_normal_dist(void); 77 | double *interpolate_bruteforce(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis, 78 | double *density_array, double *temp_array, double *data); 79 | double *interpolate_bisection(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis, 80 | double *density_array, double *temp_array, double *data); 81 | int bisection(double *axis, int axis_size, double value); 82 | 
double *interpolate_hashcpu(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis, 83 | double *density_array, double *temp_array, double *data); 84 | cl_mem interpolate_hashgpu(int isize, int xstride, int density_axis_size, int temp_axis_size, cl_mem density_axis_buffer, cl_mem temp_axis_buffer, 85 | cl_mem density_array_buffer, cl_mem temp_array_buffer, cl_mem data_buffer, double *time); 86 | 87 | #include "table.data" 88 | 89 | int main(int argc, char *argv[]) 90 | { 91 | cl_int error; 92 | 93 | #ifdef HAVE_OPENCL 94 | GPUInit(&context, &queue, &is_nvidia, &program, "table_kern.cl"); 95 | 96 | interpolate_kernel = clCreateKernel(program, "interpolate_kernel", &error); 97 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 98 | #endif 99 | 100 | int i; 101 | 102 | double temp, density; 103 | 104 | cl_int ierror; 105 | int data_size = sizeof(data)/sizeof(data[0]); 106 | int density_axis_size = sizeof(density_axis)/sizeof(density_axis[0]); 107 | int temp_axis_size = sizeof(temp_axis)/sizeof(temp_axis[0]); 108 | 109 | double density_increment = (density_axis[density_axis_size-1]-density_axis[0])/(double)(density_axis_size-1); 110 | double temp_increment = (temp_axis[temp_axis_size-1]-temp_axis[0])/(double)(temp_axis_size-1); 111 | 112 | double density_avg = (density_axis[density_axis_size-1]+density_axis[0])/2.0; 113 | double temp_avg = (temp_axis[temp_axis_size-1]+temp_axis[0])/2.0; 114 | 115 | double density_stddev = (density_axis[density_axis_size-1]-density_axis[0])/6.0; 116 | double temp_stddev = (temp_axis[temp_axis_size-1]-temp_axis[0])/6.0; 117 | 118 | for (i=1; i EPS ){ 246 | printf("Warning %d does not match -- test %lf gold %lf\n",i,value_test[i],value_gold[i]); 247 | } 248 | } 249 | 250 | free(value_test); 251 | #endif 252 | 253 | printf("\n"); 254 | 255 | free(value_gold); 256 | } 257 | 258 | #ifdef HAVE_OPENCL 259 | clReleaseMemObject(data_buffer); 260 | 
clReleaseMemObject(density_axis_buffer); 261 | clReleaseMemObject(temp_axis_buffer); 262 | #endif 263 | } 264 | 265 | 266 | double random_normal_dist(void) 267 | { 268 | double x1, x2, x3, result; 269 | 270 | x1 = 2.0*drand48() - 1.0; 271 | x2 = 2.0*drand48() - 1.0; 272 | x3 = 2.0*drand48() - 1.0; 273 | result = x1 + x2 + x3; 274 | 275 | return(result); 276 | } 277 | 278 | double *interpolate_bruteforce(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis, 279 | double *density_array, double *temp_array, double *data) 280 | { 281 | int i; 282 | 283 | double *value_array=(double *)malloc(isize*sizeof(double)); 284 | 285 | for (i = 0; i temp_axis[temp_slot+1]; temp_slot++); 289 | for (density_slot=0; density_slot density_axis[density_slot+1]; density_slot++); 290 | 291 | double xfrac = (density_array[i]-density_axis[density_slot])/(density_axis[density_slot+1]-density_axis[density_slot]); 292 | double yfrac = (temp_array[i]-temp_axis[temp_slot])/(temp_axis[temp_slot+1]-temp_axis[temp_slot]); 293 | value_array[i] = xfrac * yfrac *dataval(density_slot+1,temp_slot+1) 294 | + (1.0-xfrac)* yfrac *dataval(density_slot, temp_slot+1) 295 | + xfrac *(1.0-yfrac)*dataval(density_slot+1,temp_slot) 296 | + (1.0-xfrac)*(1.0-yfrac)*dataval(density_slot, temp_slot); 297 | 298 | } 299 | 300 | return(value_array); 301 | } 302 | 303 | double *interpolate_bisection(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis, 304 | double *density_array, double *temp_array, double *data) 305 | { 306 | int i; 307 | 308 | double *value_array=(double *)malloc(isize*sizeof(double)); 309 | 310 | for (i = 0; i 1){ 331 | int imid = (itop + ibot) /2; 332 | if ( value >= axis[imid] ) 333 | ibot = imid; 334 | else 335 | itop = imid; 336 | } 337 | return(ibot); 338 | } 339 | 340 | double *interpolate_hashcpu(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, 
double *temp_axis, 341 | double *density_array, double *temp_array, double *data) 342 | { 343 | int i; 344 | // Computes a constant increment for each axis data look-up 345 | double density_increment = (density_axis[density_axis_size-1]-density_axis[0])/(double)(density_axis_size-1); 346 | double temp_increment = (temp_axis[temp_axis_size-1]-temp_axis[0])/(double)(temp_axis_size-1); 347 | 348 | double *value_array=(double *)malloc(isize*sizeof(double)); 349 | 350 | for (i = 0; i 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include "gpu.h" 42 | #include "timer.h" 43 | 44 | cl_kernel interpolate_kernel; 45 | 46 | #ifdef HAVE_CONFIG_H 47 | #include "config.h" 48 | #endif 49 | 50 | #ifdef __APPLE_CC__ 51 | #include 52 | #else 53 | #include 54 | #endif 55 | 56 | #ifdef HAVE_CL_DOUBLE 57 | typedef double real; 58 | typedef cl_double cl_real; 59 | typedef cl_double4 cl_real4; 60 | #define EPS 1.0e-8 61 | #else 62 | typedef float real; 63 | typedef cl_float cl_real; 64 | typedef cl_float4 cl_real4; 65 | #define EPS 1.0e-5 66 | #endif 67 | 68 | #define TILE_SIZE 256 69 | #define dataval(x,y) data[(x)+((y)*xstride)] 70 | 71 | cl_context context; 72 | cl_command_queue queue; 73 | cl_program program; 74 | int is_nvidia=0; 75 | 76 | double random_normal_dist(void); 77 | double *interpolate_bruteforce(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis, 78 | double *density_array, double *temp_array, double *data); 79 | double *interpolate_bisection(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis, 80 | double *density_array, double *temp_array, double *data); 81 | int bisection(double *axis, int axis_size, double value); 82 | double *interpolate_hashcpu(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis, 83 | double *density_array, double *temp_array, double *data); 84 | #ifdef HAVE_OPENCL 85 | 
cl_mem interpolate_hashgpu(int isize, int xstride, int density_axis_size, int temp_axis_size, cl_mem density_axis_buffer, cl_mem temp_axis_buffer, 86 | cl_mem density_array_buffer, cl_mem temp_array_buffer, cl_mem data_buffer, double *time); 87 | #endif 88 | 89 | #include "tablelarge.data" 90 | 91 | int main(int argc, char *argv[]) 92 | { 93 | cl_int error; 94 | 95 | #ifdef HAVE_OPENCL 96 | GPUInit(&context, &queue, &is_nvidia, &program, "table_kern.cl"); 97 | 98 | interpolate_kernel = clCreateKernel(program, "interpolate_kernel", &error); 99 | if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); 100 | #endif 101 | 102 | int i; 103 | 104 | double temp, density; 105 | 106 | cl_int ierror; 107 | int data_size = sizeof(data)/sizeof(data[0]); 108 | int density_axis_size = sizeof(density_axis)/sizeof(density_axis[0]); 109 | int temp_axis_size = sizeof(temp_axis)/sizeof(temp_axis[0]); 110 | 111 | double density_increment = (density_axis[density_axis_size-1]-density_axis[0])/(double)(density_axis_size-1); 112 | double temp_increment = (temp_axis[temp_axis_size-1]-temp_axis[0])/(double)(temp_axis_size-1); 113 | 114 | double density_avg = (density_axis[density_axis_size-1]+density_axis[0])/2.0; 115 | double temp_avg = (temp_axis[temp_axis_size-1]+temp_axis[0])/2.0; 116 | 117 | double density_stddev = (density_axis[density_axis_size-1]-density_axis[0])/6.0; 118 | double temp_stddev = (temp_axis[temp_axis_size-1]-temp_axis[0])/6.0; 119 | 120 | for (i=1; i EPS ){ 248 | printf("Warning %d does not match -- test %lf gold %lf\n",i,value_test[i],value_gold[i]); 249 | } 250 | } 251 | 252 | free(value_test); 253 | #endif 254 | 255 | printf("\n"); 256 | 257 | free(value_gold); 258 | } 259 | 260 | #ifdef HAVE_OPENCL 261 | clReleaseMemObject(data_buffer); 262 | clReleaseMemObject(density_axis_buffer); 263 | clReleaseMemObject(temp_axis_buffer); 264 | #endif 265 | } 266 | 267 | 268 | double random_normal_dist(void) 269 | { 270 | double x1, x2, x3, result; 271 | 
272 | x1 = 2.0*drand48() - 1.0; 273 | x2 = 2.0*drand48() - 1.0; 274 | x3 = 2.0*drand48() - 1.0; 275 | result = x1 + x2 + x3; 276 | 277 | return(result); 278 | } 279 | 280 | double *interpolate_bruteforce(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis, 281 | double *density_array, double *temp_array, double *data) 282 | { 283 | int i; 284 | 285 | double *value_array=(double *)malloc(isize*sizeof(double)); 286 | 287 | for (i = 0; i temp_axis[temp_slot+1]; temp_slot++); 291 | for (density_slot=0; density_slot density_axis[density_slot+1]; density_slot++); 292 | 293 | double xfrac = (density_array[i]-density_axis[density_slot])/(density_axis[density_slot+1]-density_axis[density_slot]); 294 | double yfrac = (temp_array[i]-temp_axis[temp_slot])/(temp_axis[temp_slot+1]-temp_axis[temp_slot]); 295 | value_array[i] = xfrac * yfrac *dataval(density_slot+1,temp_slot+1) 296 | + (1.0-xfrac)* yfrac *dataval(density_slot, temp_slot+1) 297 | + xfrac *(1.0-yfrac)*dataval(density_slot+1,temp_slot) 298 | + (1.0-xfrac)*(1.0-yfrac)*dataval(density_slot, temp_slot); 299 | 300 | } 301 | 302 | return(value_array); 303 | } 304 | 305 | double *interpolate_bisection(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis, 306 | double *density_array, double *temp_array, double *data) 307 | { 308 | int i; 309 | 310 | double *value_array=(double *)malloc(isize*sizeof(double)); 311 | 312 | for (i = 0; i 1){ 333 | int imid = (itop + ibot) /2; 334 | if ( value >= axis[imid] ) 335 | ibot = imid; 336 | else 337 | itop = imid; 338 | } 339 | return(ibot); 340 | } 341 | 342 | double *interpolate_hashcpu(int isize, int xstride, int density_axis_size, int temp_axis_size, double *density_axis, double *temp_axis, 343 | double *density_array, double *temp_array, double *data) 344 | { 345 | int i; 346 | // Computes a constant increment for each axis data look-up 347 | double density_increment = 
(density_axis[density_axis_size-1]-density_axis[0])/(double)(density_axis_size-1); 348 | double temp_increment = (temp_axis[temp_axis_size-1]-temp_axis[0])/(double)(temp_axis_size-1); 349 | 350 | double *value_array=(double *)malloc(isize*sizeof(double)); 351 | 352 | for (i = 0; i 4 | 5 | void cpu_timer_start(struct timespec *tstart_cpu); 6 | double cpu_timer_stop(struct timespec tstart_cpu); 7 | #endif 8 | --------------------------------------------------------------------------------