├── .gitignore ├── 3rd_party_dgemm_kernels └── amd_dgemm_2015_08_05 │ └── amddgemm.so ├── 3rd_party_template ├── config.mak ├── kernel.cl ├── makefile ├── makefiles └── template.cpp ├── AUTHORS ├── COPYING ├── COPYING.LESSER ├── README ├── ati_patch ├── 10.10 │ ├── install_org.sh │ ├── install_patched.sh │ └── libaticaldd.so.xdelta └── 10.9 │ ├── fix.cpp │ ├── foo │ ├── foo.s │ ├── install_org.sh │ ├── install_patched.sh │ └── libaticaldd.so.xdelta ├── benchmark.cpp ├── cal_fake.h ├── cal_private_ext.h ├── caldgemm.cl ├── caldgemm.cpp ├── caldgemm.h ├── caldgemm.il ├── caldgemm_adl.cpp ├── caldgemm_cal.cpp ├── caldgemm_cal.h ├── caldgemm_cblas_wrapper.h ├── caldgemm_common.h ├── caldgemm_config.sample ├── caldgemm_config_load.h ├── caldgemm_cpu.cpp ├── caldgemm_cpu.h ├── caldgemm_cuda.cu ├── caldgemm_cuda.h ├── caldgemm_opencl.cpp ├── caldgemm_opencl.h ├── caldgemm_parse_parameters.h ├── cmodules ├── affinity.cpp ├── affinity.h ├── get_private_profile.h ├── linux_helpers.h ├── os_low_level_helper.h ├── pthread_mutex_win32_wrapper.h ├── qmalloc.cpp ├── qmalloc.h ├── qmath.h ├── qmultialloc.cpp ├── qmultialloc.h ├── qsem.cpp ├── qsem.h ├── sched_affinity_win32_wrapper.h ├── switchtemplate.h ├── threadserver.cpp ├── threadserver.h ├── timer.cpp ├── timer.h ├── util_adl.cpp └── util_adl.h ├── config.mak ├── config_options.sample ├── config_options_load.mak ├── cudakernel.cu ├── environment └── caldgemm_setenv.sh.sample ├── gcc_patch └── libgomp.patch ├── gotoblas_patch └── gotoblas.patch ├── makefile ├── makefiles ├── as ├── callvc.bat ├── config.mak.sample ├── i686-pc-cygwin.mak ├── i686-pc-linux-gnu.mak ├── include.S ├── makefile ├── makefile_opencl_compiler.cpp ├── opencl_compiler_structs.h ├── opencl_obtain_program.h ├── x86_64-pc-linux-gnu.mak └── x86_64-unknown-cygwin.mak └── memtest ├── .gitignore ├── build.sh ├── cmd ├── info.sh ├── mem.cpp ├── timer.cpp └── timer.h /.gitignore: -------------------------------------------------------------------------------- 1 
| release 2 | dgemm_bench.exe 3 | dgemm_bench 4 | .svn 5 | vcproject 6 | *.o 7 | caldgemm_config.h 8 | config_options.mak 9 | 3rd_party_template/dgemm_template.so 10 | amd_dgemm* 11 | -------------------------------------------------------------------------------- /3rd_party_dgemm_kernels/amd_dgemm_2015_08_05/amddgemm.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidrohr/caldgemm/bc875b373f78b47c8e58a10353cce7ef751210a1/3rd_party_dgemm_kernels/amd_dgemm_2015_08_05/amddgemm.so -------------------------------------------------------------------------------- /3rd_party_template/config.mak: -------------------------------------------------------------------------------- 1 | INTELARCH = SSE4.2 2 | CUDAVERSION = 20 3 | CUDAREGS = 64 4 | ARCHBITS = 64 5 | 6 | HIDEECHO = @ 7 | CC_x86_64-pc-linux-gnu = GCC 8 | CC_i686-pc-cygwin = ICC 9 | 10 | TARGET = dgemm_template 11 | 12 | INTELFLAGSUSE = $(INTELFLAGSOPT) 13 | VSNETFLAGSUSE = $(VSNETFLAGSOPT) 14 | GCCFLAGSUSE = $(GCCFLAGSOPT) 15 | NVCCFLAGSUSE = $(NVCCFLAGSOPT) 16 | 17 | TARGETTYPE = LIB 18 | 19 | CPPFILES = template.cpp 20 | 21 | CONFIG_OPENCL = 1 -------------------------------------------------------------------------------- /3rd_party_template/kernel.cl: -------------------------------------------------------------------------------- 1 | "__kernel void oclkernel(__global double* C, __global const double* __restrict const A, __global const double* __restrict const B, int height1, int height2, int width, double alpha, double beta, int pitch, ulong offset)\n" 2 | "//input parameters are standard DGEMM parameters (however in modified order, width = k, height1 = m, height2 = n, pitch = LDC)\n" 3 | "//matrices area assumed in row-major (you can get col-major by swapping A and B (and m,n)\n" 4 | "//there is no transposition parameters, the kernel can assume the best settings for optimal performance, the library must export the required options, 
caldgemm will tread the rest\n" 5 | "//LDA and LDB parameters are not present, they are as small as possible and hence equal m, n, k\n" 6 | "{\n" 7 | " int i, j, k;\n" 8 | " for (i = get_global_id(1);i < height2;i += get_global_size(1))\n" 9 | " {\n" 10 | " for (j = get_global_id(0);j < height1;j += get_global_size(0))\n" 11 | " {\n" 12 | " double addval = 0.;\n" 13 | " for (k = 0;k < width;k++)\n" 14 | " {\n" 15 | " addval += A[k * height2 + i] * B[k * height1 + j];\n" 16 | " }\n" 17 | " C[offset + i * pitch + j] = beta * C[offset + i * pitch + j] + alpha * addval;\n" 18 | " }\n" 19 | " }\n" 20 | "}\n" 21 | ; 22 | -------------------------------------------------------------------------------- /3rd_party_template/makefile: -------------------------------------------------------------------------------- 1 | include makefiles/makefile -------------------------------------------------------------------------------- /3rd_party_template/makefiles: -------------------------------------------------------------------------------- 1 | ../makefiles -------------------------------------------------------------------------------- /3rd_party_template/template.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define STD_OUT stdout 5 | //Test _WIN32: it is predefined by MSVC, ICC and GCC for Windows targets, whereas __WIN32 is a GCC-only spelling and would silently drop the exports 6 | #ifdef _WIN32 7 | #define DLL_EXPORT __declspec(dllexport) 8 | #else 9 | #define DLL_EXPORT 10 | #endif 11 | 12 | #define ERRRET(...) {fprintf(STD_OUT, __VA_ARGS__);fprintf(STD_OUT, "\n");return(1);} 13 | #define CHKRET(result, ...) 
\ 14 | if (result != CL_SUCCESS) \ 15 | { \ 16 | fprintf(STD_OUT, __VA_ARGS__); \ 17 | fprintf(STD_OUT, ":\n"); \ 18 | fprintf(STD_OUT, "OpenCL Error %d: (%s: %d) %s\n", result, __FILE__, __LINE__, opencl_error_string(result)); \ 19 | return(0); \ 20 | } 21 | 22 | #define quit(arg) {fprintf(stderr, arg "\n");exit(1);} 23 | 24 | //We must export several functions, kernelLibCreate to return the kernel object, kernelLibQuerySettings to return some parameters. 25 | //Initialize is called after loading the library at very first, it can return 1 in case of error, terminate is used to clean up. 26 | //The others are used to provide some insight in good matrix sizes to caldgemm. 27 | extern "C" DLL_EXPORT cl_kernel kernelLibCreate(cl_context* context, int nDevices, cl_device_id* devices, int kernelType, int k, int betazero); 28 | extern "C" DLL_EXPORT void kernelLibQuerySettings(int* tiling_x, int* tiling_y, bool* transposeA, bool* transposeB, bool* texture_buffers, int* group_size_x, int* group_size_y, int* min_tile_size, int* min_k); 29 | extern "C" DLL_EXPORT void kernelLibTerminate(); 30 | extern "C" DLL_EXPORT size_t suggestedMaxHeight(); 31 | extern "C" DLL_EXPORT size_t getAutoHeight(size_t MaxGpuM, size_t MaxGpuN, int nDevices, size_t Width); 32 | extern "C" DLL_EXPORT void modHeight(size_t MOD_OVER, size_t MOD_GPU); 33 | extern "C" DLL_EXPORT int kernelLibInitialize(cl_platform_id platform); 34 | 35 | //The kernels can be subject to some optimizations, depending on the parameters: 36 | //betazero indicates that beta can be assumed zero, regardless of other parameters 37 | //kernelType: 38 | //0 - no further optimizations 39 | //1 - can assume alpha = 1 40 | //2 - can assume alpha = 1, beta = 0/1 depending on betazero, and k is fixed to the parameter passed as k 41 | //3 - not used 42 | //4 - can assume alpha = -1, beta = 0/1 depending on betazero and k is fixed to the parameter passed as k 43 | //5 44 | //6 45 | 46 | //kernelLibQuerySettings must return 47 | //The 
tiling size in x and y (defines how many work-items are started 48 | //transposeA and transposeB define whether the kernel expects A or B input matrices in transposed form or not 49 | //texture_buffers = 1 means input is read from images, 0 stands for standard buffers 50 | //group_size_x/y defines the work-group-size 51 | 52 | cl_program ocl_program, ocl_programx; 53 | 54 | const char* kernel_str = 55 | #include "kernel.cl" 56 | ; 57 | 58 | int program_initialized = 0; 59 | 60 | const char* opencl_error_string(int errorcode) 61 | { 62 | switch (errorcode) 63 | { 64 | case CL_SUCCESS: return "Success!"; 65 | case CL_DEVICE_NOT_FOUND: return "Device not found."; 66 | case CL_DEVICE_NOT_AVAILABLE: return "Device not available"; 67 | case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available"; 68 | case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure"; 69 | case CL_OUT_OF_RESOURCES: return "Out of resources"; 70 | case CL_OUT_OF_HOST_MEMORY: return "Out of host memory"; 71 | case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available"; 72 | case CL_MEM_COPY_OVERLAP: return "Memory copy overlap"; 73 | case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch"; 74 | case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported"; 75 | case CL_BUILD_PROGRAM_FAILURE: return "Program build failure"; 76 | case CL_MAP_FAILURE: return "Map failure"; 77 | case CL_INVALID_VALUE: return "Invalid value"; 78 | case CL_INVALID_DEVICE_TYPE: return "Invalid device type"; 79 | case CL_INVALID_PLATFORM: return "Invalid platform"; 80 | case CL_INVALID_DEVICE: return "Invalid device"; 81 | case CL_INVALID_CONTEXT: return "Invalid context"; 82 | case CL_INVALID_QUEUE_PROPERTIES: return "Invalid queue properties"; 83 | case CL_INVALID_COMMAND_QUEUE: return "Invalid command queue"; 84 | case CL_INVALID_HOST_PTR: return "Invalid host pointer"; 85 | case CL_INVALID_MEM_OBJECT: return "Invalid memory object"; 86 | case 
CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "Invalid image format descriptor"; 87 | case CL_INVALID_IMAGE_SIZE: return "Invalid image size"; 88 | case CL_INVALID_SAMPLER: return "Invalid sampler"; 89 | case CL_INVALID_BINARY: return "Invalid binary"; 90 | case CL_INVALID_BUILD_OPTIONS: return "Invalid build options"; 91 | case CL_INVALID_PROGRAM: return "Invalid program"; 92 | case CL_INVALID_PROGRAM_EXECUTABLE: return "Invalid program executable"; 93 | case CL_INVALID_KERNEL_NAME: return "Invalid kernel name"; 94 | case CL_INVALID_KERNEL_DEFINITION: return "Invalid kernel definition"; 95 | case CL_INVALID_KERNEL: return "Invalid kernel"; 96 | case CL_INVALID_ARG_INDEX: return "Invalid argument index"; 97 | case CL_INVALID_ARG_VALUE: return "Invalid argument value"; 98 | case CL_INVALID_ARG_SIZE: return "Invalid argument size"; 99 | case CL_INVALID_KERNEL_ARGS: return "Invalid kernel arguments"; 100 | case CL_INVALID_WORK_DIMENSION: return "Invalid work dimension"; 101 | case CL_INVALID_WORK_GROUP_SIZE: return "Invalid work group size"; 102 | case CL_INVALID_WORK_ITEM_SIZE: return "Invalid work item size"; 103 | case CL_INVALID_GLOBAL_OFFSET: return "Invalid global offset"; 104 | case CL_INVALID_EVENT_WAIT_LIST: return "Invalid event wait list"; 105 | case CL_INVALID_EVENT: return "Invalid event"; 106 | case CL_INVALID_OPERATION: return "Invalid operation"; 107 | case CL_INVALID_GL_OBJECT: return "Invalid OpenGL object"; 108 | case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size"; 109 | case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level"; 110 | default: return "Unknown Errorcode"; 111 | } 112 | } 113 | 114 | cl_kernel kernelLibCreate(cl_context* context, int nDevices, cl_device_id* devices, int kernelType, int k, int betazero) 115 | { 116 | cl_int ocl_error; 117 | if (program_initialized == 0) 118 | { 119 | ocl_program = clCreateProgramWithSource(*context, 1, &kernel_str, NULL, &ocl_error); 120 | CHKRET(ocl_error, "Error creating program object"); 121 | 
ocl_error = clBuildProgram(ocl_program, nDevices, devices, 0, NULL, NULL); 122 | if (ocl_error != CL_SUCCESS) 123 | { 124 | fprintf(STD_OUT, "OpenCL Error while building program: %d\n", ocl_error); 125 | fprintf(STD_OUT, "OpenCL Kernel:\n\n%s\n\n", kernel_str); 126 | char build_log[16384]; 127 | for (int i = 0;i < nDevices;i++) 128 | { 129 | clGetProgramBuildInfo(ocl_program, devices[i], CL_PROGRAM_BUILD_LOG, 16384, build_log, NULL); 130 | fprintf(STD_OUT, "Build Log (device %d):\n\n%s\n\n", i, build_log); 131 | } 132 | return(0); 133 | } 134 | program_initialized = 1; 135 | } 136 | cl_kernel tmp = clCreateKernel(ocl_program, "oclkernel", &ocl_error); 137 | CHKRET(ocl_error, "Error creating kernel"); 138 | 139 | return(tmp); 140 | } 141 | 142 | void kernelLibTerminate() 143 | { 144 | if (program_initialized) 145 | { 146 | clReleaseProgram(ocl_program); 147 | program_initialized = 0; 148 | } 149 | } 150 | 151 | void kernelLibQuerySettings(int* tiling_x, int* tiling_y, bool* transposeA, bool* transposeB, bool* texture_buffers, int* group_size_x, int* group_size_y, int* min_tile_size, int* min_k) 152 | { 153 | *group_size_x = *group_size_y = 8; //We start a grid with work-group-size 8x8 and in total m/tilingx x n/tiling_y work items 154 | *tiling_x = *tiling_y = 4; 155 | *texture_buffers = false; 156 | *transposeA = true; 157 | *transposeB = false; 158 | *min_tile_size = 32; 159 | *min_k = 4; 160 | } 161 | 162 | int kernelLibInitialize(cl_platform_id platform) 163 | { 164 | return(0); 165 | } 166 | 167 | size_t suggestedMaxHeight() 168 | { 169 | return(4096); 170 | } 171 | 172 | //Suggest different height parameters depending on Matrix Size 173 | size_t getAutoHeight(size_t MaxGpuM, size_t MaxGpuN, int nDevices, size_t Width) 174 | { 175 | //Do not provide standard values for other GPU types, we rely on caldgemm defaults by returning 0 176 | return 0; 177 | } 178 | 179 | void modHeight(size_t MOD_OVER, size_t MOD_GPU) 180 | { 181 | } 182 | 
-------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | David Rohr (drohr@jwdt.org) 2 | Matthias Bach (bach@compeng.uni-frankfurt.de) 3 | Matthias Kretz (kretz@compeng.uni-frankfurt.de) 4 | -------------------------------------------------------------------------------- /COPYING.LESSER: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 
31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. 
You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 
108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 
145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 
166 | -------------------------------------------------------------------------------- /ati_patch/10.10/install_org.sh: -------------------------------------------------------------------------------- 1 | sudo cp libaticaldd.so.orig /usr/lib64/libaticaldd.so 2 | -------------------------------------------------------------------------------- /ati_patch/10.10/install_patched.sh: -------------------------------------------------------------------------------- 1 | sudo cp libaticaldd.so /usr/lib64/libaticaldd.so 2 | -------------------------------------------------------------------------------- /ati_patch/10.10/libaticaldd.so.xdelta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidrohr/caldgemm/bc875b373f78b47c8e58a10353cce7ef751210a1/ati_patch/10.10/libaticaldd.so.xdelta -------------------------------------------------------------------------------- /ati_patch/10.9/fix.cpp: -------------------------------------------------------------------------------- 1 | extern void **ddi_interface; 2 | 3 | void fixCal() 4 | { 5 | unsigned char *func = ddi_interface[0xa8/8]; 6 | func += 0x7fffe591b631 - 0x7fffe591b560; 7 | if (func[0] == 0x74) { 8 | func[0] = 0xeb; 9 | fprintf(stderr, "Replaced je with jmpq\n"); 10 | } else { 11 | fprintf(stderr, "Did not find je at the expected position\n"); 12 | } 13 | } 14 | void fixCal() 15 | { 16 | fprintf(stderr, "x\n"); 17 | unsigned char *foo = (unsigned char *)(&calCtxRunProgram); 18 | unsigned char **bar = *(unsigned char ***)((size_t)(*(unsigned int *)(foo + 2)) + foo + 6); 19 | fprintf(stderr, "bar = %p, ddi_interface[?] 
= %p\n", bar, 20 | bar + (0x10f588 - 0x4220)/sizeof(void*)); 21 | unsigned char *func = *(bar + (0x10f588 - 0x4220)/sizeof(void*)); 22 | func += 0x7fffe591b631 - 0x7fffe591b560; 23 | fprintf(stderr, "Read jump\n"); 24 | if (func[0] == 0x74) { 25 | fprintf(stderr, "Replace je with jmpq\n"); 26 | func[0] = 0xeb; 27 | fprintf(stderr, "Replaced je with jmpq\n"); 28 | } else { 29 | fprintf(stderr, "Did not find je at the expected position\n"); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /ati_patch/10.9/install_org.sh: -------------------------------------------------------------------------------- 1 | sudo cp libaticaldd.so.orig /usr/lib64/libaticaldd.so 2 | -------------------------------------------------------------------------------- /ati_patch/10.9/install_patched.sh: -------------------------------------------------------------------------------- 1 | sudo cp libaticaldd.so /usr/lib64/libaticaldd.so 2 | -------------------------------------------------------------------------------- /ati_patch/10.9/libaticaldd.so.xdelta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidrohr/caldgemm/bc875b373f78b47c8e58a10353cce7ef751210a1/ati_patch/10.9/libaticaldd.so.xdelta -------------------------------------------------------------------------------- /cal_fake.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the CALDGEMM library. 3 | * 4 | * Copyright 2015: 5 | * - David Rohr (drohr@jwdt.org) 6 | * - Matthias Bach (bach@compeng.uni-frankfurt.de) 7 | * - Matthias Kretz (kretz@compeng.uni-frankfurt.de) 8 | * 9 | * This file is part of CALDGEMM. 
10 | * 11 | * CALDGEMM is free software: you can redistribute it and/or modify 12 | * it under the terms of the GNU Lesser General Public License as published by 13 | * the Free Software Foundation, either version 3 of the License, or 14 | * (at your option) any later version. 15 | * 16 | * CALDGEMM is distributed in the hope that it will be useful, 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | * GNU Lesser General Public License for more details. 20 | * 21 | * You should have received a copy of the GNU Lesser General Public License 22 | * along with CALDGEMM. If not, see <http://www.gnu.org/licenses/>. 23 | */ 24 | 25 | #ifndef CAL_FAKE_H 26 | #define CAL_FAKE_H 27 | 28 | #include "cmodules/timer.h" 29 | #ifdef _WIN32 30 | #include "cmodules/pthread_mutex_win32_wrapper.h" 31 | #else 32 | #include 33 | #endif 34 | #include 35 | 36 | #define NUM_FAKE_EVENTS 1000000 37 | #define NUM_FAKE_MEM 10000 38 | #define NUM_FAKE_MODULE 100 39 | #define NUM_FAKE_NAME 1000 40 | #define NUM_MODULE_NAMES 13 41 | 42 | #define CAL_FAKE_PASSTHROUGH 43 | #define CAL_FAKE_CHECKMEM 44 | //#define CAL_FAKE_VERBOSE 45 | //One slot of the fake event table: state flags, a timer with a random delay, the memory handles the event references, and the event handle used for the real CAL call in passthrough mode 46 | class cal_fake_event 47 | { 48 | public: 49 | HighResTimer timer; 50 | int initialized; 51 | int queried; 52 | int reused; 53 | double delay; 54 | int mems[NUM_MODULE_NAMES]; 55 | int nmems; 56 | CALevent through; 57 | 58 | cal_fake_event() {initialized = queried = reused = 0;} 59 | }; 60 | //Bookkeeping for one fake memory handle: released flag and the number of not yet queried events that reference it (active) 61 | class cal_fake_mem 62 | { 63 | public: 64 | int released; 65 | int active; 66 | 67 | CALmem through; 68 | }; 69 | //Bookkeeping for one fake module: released flag and the indices of the names obtained from it 70 | class cal_fake_module 71 | { 72 | public: 73 | int released; 74 | int nnames; 75 | int names[NUM_MODULE_NAMES]; 76 | 77 | CALmodule through; 78 | CALfunc throughFunc; 79 | }; 80 | //One fake name: index of the memory handle currently bound to it 81 | class cal_fake_name 82 | { 83 | public: 84 | int mem; 85 | 86 | CALname through; 87 | }; 88 | //Tracks the fake events, memory handles, modules and names that stand in for the CAL handles 89 | class cal_fake 90 | { 91 | public: 92 | cal_fake_event event[NUM_FAKE_EVENTS]; 93 | pthread_mutex_t mutex; 94 | 
int curevent; 95 | 96 | cal_fake_mem mem[NUM_FAKE_MEM]; 97 | int curmem; 98 | 99 | cal_fake_module module[NUM_FAKE_MODULE]; 100 | int curmodule; 101 | 102 | cal_fake_name name[NUM_FAKE_NAME]; 103 | int curname; 104 | 105 | cal_fake() 106 | { 107 | pthread_mutex_init(&mutex, NULL); 108 | curevent = 0; 109 | curmem = 0; 110 | curmodule = 0; 111 | curname = 0; 112 | } 113 | 114 | ~cal_fake() 115 | { 116 | pthread_mutex_destroy(&mutex); 117 | for (int i = 0;i < curevent;i++) 118 | { 119 | if (event[i].queried == 0) printf("Warning, event %d not queried\n", i); 120 | } 121 | } 122 | 123 | CALresult AddEvent(CALevent* pevent, bool lock = true) 124 | { 125 | #ifdef CAL_FAKE_VERBOSE 126 | fprintf(STD_OUT, "CREATE EVENT %d\n", curevent); 127 | #endif 128 | *pevent = curevent; 129 | if (lock) pthread_mutex_lock(&mutex); 130 | if (event[curevent].initialized && !event[curevent].queried) 131 | { 132 | printf("------------------------ Event reused before queried\n"); 133 | while (true); 134 | } 135 | if (event[curevent].initialized) event[curevent].reused = 1; 136 | event[curevent].initialized = 1; 137 | event[curevent].queried = 0; 138 | event[curevent].timer.Reset(); 139 | event[curevent].timer.Start(); 140 | event[curevent].delay = (rand() % 1000) / 100000.; 141 | event[curevent].nmems = 0; 142 | curevent = (curevent + 1) % NUM_FAKE_EVENTS; 143 | if (lock) pthread_mutex_unlock(&mutex); 144 | return(CAL_RESULT_OK); 145 | } 146 | 147 | CALresult QueryEvent(CALevent num) 148 | { 149 | #ifdef CAL_FAKE_VERBOSE 150 | fprintf(STD_OUT, "QUERY EVENT %d\n", num); 151 | #endif 152 | CALresult retVal; 153 | pthread_mutex_lock(&mutex); 154 | if (num >= NUM_FAKE_EVENTS) 155 | { 156 | printf("------------------------- Requested fake event with handle %d >= %d\n", num, NUM_FAKE_EVENTS); 157 | retVal = CAL_RESULT_BAD_HANDLE; 158 | } 159 | else if (event[num].initialized == 0) 160 | { 161 | printf("------------------------- Fake event with handle %d not initialized\n", num); 162 | retVal = 
CAL_RESULT_BAD_HANDLE; 163 | } 164 | else if (event[num].queried) 165 | { 166 | printf("------------------------- Fake event with handle %d already queried\n", num); 167 | retVal = CAL_RESULT_BAD_HANDLE; 168 | } 169 | else 170 | { 171 | event[num].timer.Stop(); 172 | #ifndef CAL_FAKE_PASSTHROUGH 173 | if (event[num].timer.GetElapsedTime() <= event[num].delay) 174 | { 175 | event[num].timer.Start(); 176 | retVal = CAL_RESULT_PENDING; 177 | } 178 | else 179 | #endif 180 | { 181 | event[num].queried = 1; 182 | for (int i = 0;i < event[num].nmems;i++) mem[event[num].mems[i]].active--; 183 | retVal = CAL_RESULT_OK; 184 | } 185 | } 186 | pthread_mutex_unlock(&mutex); 187 | if(retVal == CAL_RESULT_BAD_HANDLE) while(true); 188 | return(retVal); 189 | } 190 | //Prints every initialized, not yet queried event that references the memory handle mem 191 | void ListMemCollisions(int mem) 192 | { 193 | for (int i = 0;i < NUM_FAKE_EVENTS;i++) 194 | { 195 | if (event[i].initialized && !event[i].queried) 196 | { 197 | for (int j = 0;j < event[i].nmems;j++) 198 | { 199 | if (event[i].mems[j] == mem) 200 | { 201 | printf("Collision with event %d\n", i); 202 | } 203 | } 204 | } 205 | } 206 | } 207 | //Hands out the next fake memory handle in *m and resets its bookkeeping; never returns once NUM_FAKE_MEM handles were issued 208 | CALresult AddMemHandle(CALmem* m) 209 | { 210 | pthread_mutex_lock(&mutex); 211 | if (curmem == NUM_FAKE_MEM) 212 | { 213 | fprintf(stderr, "NUM_FAKE_MEM overflow\n"); 214 | while(true); 215 | } 216 | *m = curmem; 217 | mem[curmem].released = 0; 218 | mem[curmem].active = 0; 219 | curmem++; 220 | pthread_mutex_unlock(&mutex); 221 | return(CAL_RESULT_OK); 222 | } 223 | //Hands out the next fake module handle in *mod with an empty name list; never returns once NUM_FAKE_MODULE handles were issued 224 | CALresult AddModule(CALmodule* mod) 225 | { 226 | pthread_mutex_lock(&mutex); 227 | if (curmodule == NUM_FAKE_MODULE) 228 | { 229 | fprintf(stderr, "NUM_FAKE_MODULE overflow\n"); 230 | while(true); 231 | } 232 | *mod = curmodule; 233 | module[curmodule].released = 0; 234 | module[curmodule].nnames = 0; 235 | curmodule++; 236 | pthread_mutex_unlock(&mutex); 237 | return(CAL_RESULT_OK); 238 | } 239 | //Hands out the next fake name in *nam and appends it to the name list of module mod 240 | CALresult AddName(CALname* nam, CALmodule mod) 241 | { 242 | //printf("Giving name %d (mod %d)\n", 
curname, mod); 243 | pthread_mutex_lock(&mutex); 244 | if (curname == NUM_FAKE_NAME) 245 | { 246 | fprintf(stderr, "NUM_FAKE_NAME overflow\n"); 247 | while(true); 248 | } 249 | if (mod > (unsigned) curmodule) 250 | { 251 | fprintf(stderr, "Invalid Module\n"); 252 | while(true); 253 | } 254 | if (module[mod].nnames == NUM_MODULE_NAMES) 255 | { 256 | fprintf(stderr, "NUM_MODULE_NAMES overflow\n"); 257 | while(true); 258 | } 259 | *nam = curname; 260 | module[mod].names[module[mod].nnames] = curname; 261 | module[mod].nnames++; 262 | name[curname].mem = 0; 263 | curname++; 264 | pthread_mutex_unlock(&mutex); 265 | return(CAL_RESULT_OK); 266 | } 267 | 268 | CALresult FakeMemcpy(CALmem mem1, CALmem mem2, CALevent* ev, int allowOverlap = 0) 269 | { 270 | pthread_mutex_lock(&mutex); 271 | #ifdef CAL_FAKE_CHECKMEM 272 | if (allowOverlap == 0 && (mem[mem1].active || mem[mem2].active)) 273 | { 274 | fprintf(stderr, "Memory active when starting memcpy (src: %d, dst: %d)\n", mem[mem1].active, mem[mem2].active); 275 | while(true); 276 | } 277 | #endif 278 | AddEvent(ev, false); 279 | event[*ev].nmems = 2; 280 | event[*ev].mems[0] = mem1; 281 | event[*ev].mems[1] = mem2; 282 | mem[mem1].active++; 283 | mem[mem2].active++; 284 | pthread_mutex_unlock(&mutex); 285 | return(CAL_RESULT_OK); 286 | } 287 | 288 | CALresult FakeKernel(CALfunc func, CALevent* ev, int allowOverlap) 289 | { 290 | pthread_mutex_lock(&mutex); 291 | if (func > (unsigned) curmodule) 292 | { 293 | fprintf(stderr, "Invalid func/module"); 294 | while(true); 295 | } 296 | #ifdef CAL_FAKE_CHECKMEM 297 | for (int i = 0;i < module[func].nnames;i++) 298 | { 299 | if (i >= allowOverlap && mem[name[module[func].names[i]].mem].active) 300 | { 301 | fprintf(stderr, "Memory %d (of %d) active when starting kernel (allowed overlap %d)\n", i, module[func].nnames, allowOverlap); 302 | ListMemCollisions(name[module[func].names[i]].mem); 303 | while(true); 304 | } 305 | mem[name[module[func].names[i]].mem].active++; 306 | } 307 | 
#endif 308 | AddEvent(ev, false); 309 | event[*ev].nmems = module[func].nnames; 310 | for (int i = 0;i < module[func].nnames;i++) event[*ev].mems[i] = name[module[func].names[i]].mem; 311 | pthread_mutex_unlock(&mutex); 312 | return(CAL_RESULT_OK); 313 | } 314 | 315 | CALresult SetMem(CALname nam, CALmem m) 316 | { 317 | if (nam > (unsigned) curname || m > (unsigned) curmem) 318 | { 319 | fprintf(stderr, "Invalid name/mem\n"); 320 | while(true); 321 | } 322 | name[nam].mem = m; 323 | return(CAL_RESULT_OK); 324 | } 325 | 326 | CALresult GetFunc(CALfunc* fun, CALmodule mod) 327 | { 328 | *fun = mod; 329 | return(CAL_RESULT_OK); 330 | } 331 | 332 | CALresult ReleaseMem(int m) 333 | { 334 | mem[m].released = 1; 335 | return(CAL_RESULT_OK); 336 | } 337 | 338 | CALresult UnloadModule(int mod) 339 | { 340 | module[mod].released = 1; 341 | return(CAL_RESULT_OK); 342 | } 343 | }; 344 | 345 | cal_fake fake; 346 | 347 | #ifndef CAL_FAKE_PASSTHROUGH 348 | #define calCtxRunProgram(event, ctx, func, rect) fake.FakeKernel(func, event) 349 | #define calMemCopy(event, ctx, src, dest, flags) fake.FakeMemcpy(src, dest, event) 350 | #define calCtxIsEventDone(ctx, event) fake.QueryEvent(event) 351 | #define calCtxGetMem(mem, ctx, res) fake.AddMemHandle(mem) 352 | #define calCtxSetMem(ctx, name, mem) fake.SetMem(name, mem) 353 | #define calCtxReleaseMem(ctx, mem) fake.ReleaseMem(mem) 354 | #define calModuleLoad(module, ctx, image) fake.AddModule(module) 355 | #define calModuleUnload(ctx, module) fake.UnloadModule(module) 356 | #define calModuleGetName(name, ctx, module, string) fake.AddName(name, module) 357 | #define calModuleGetEntry(func, ctx, module, string) fake.GetFunc(func, module) 358 | #else 359 | 360 | static inline CALresult calCtxRunProgram_a(CALevent* event, CALcontext ctx, CALfunc func, CALdomain* rect) 361 | { 362 | fake.FakeKernel(func, event, 0); 363 | return(calCtxRunProgram(&fake.event[*event].through, ctx, fake.module[func].throughFunc, rect)); 364 | } 365 | 366 | 
static inline CALresult calMemCopy_a(CALevent* event, CALcontext ctx, CALmem src, CALmem dest, CALuint flags) 367 | { 368 | fake.FakeMemcpy(src, dest, event, 0); 369 | return(calMemCopy(&fake.event[*event].through, ctx, fake.mem[src].through, fake.mem[dest].through, flags)); 370 | } 371 | 372 | static inline CALresult calCtxRunProgram_b(CALevent* event, CALcontext ctx, CALfunc func, CALdomain* rect, int allowOverlap = 0) 373 | { 374 | fake.FakeKernel(func, event, allowOverlap); 375 | return(calCtxRunProgram(&fake.event[*event].through, ctx, fake.module[func].throughFunc, rect)); 376 | } 377 | 378 | static inline CALresult calMemCopy_b(CALevent* event, CALcontext ctx, CALmem src, CALmem dest, CALuint flags, int allowOverlap = 0) 379 | { 380 | fake.FakeMemcpy(src, dest, event, allowOverlap); 381 | return(calMemCopy(&fake.event[*event].through, ctx, fake.mem[src].through, fake.mem[dest].through, flags)); 382 | } 383 | 384 | static inline CALresult calCtxIsEventDone_a(CALcontext ctx, CALevent event) 385 | { 386 | CALresult retVal = calCtxIsEventDone(ctx, fake.event[event].through); 387 | if (retVal == CAL_RESULT_OK) fake.QueryEvent(event); 388 | return(retVal); 389 | } 390 | 391 | static inline CALresult calCtxGetMem_a(CALmem* mem, CALcontext ctx, CALresource res) 392 | { 393 | fake.AddMemHandle(mem); 394 | return(calCtxGetMem(&fake.mem[*mem].through, ctx, res)); 395 | } 396 | 397 | static inline CALresult calCtxSetMem_a(CALcontext ctx, CALname name, CALmem mem) 398 | { 399 | fake.SetMem(name, mem); 400 | return(calCtxSetMem(ctx, fake.name[name].through, fake.mem[mem].through)); 401 | } 402 | 403 | static inline CALresult calCtxReleaseMem_a(CALcontext ctx, CALmem mem) 404 | { 405 | fake.ReleaseMem(mem); 406 | return(calCtxReleaseMem(ctx, fake.mem[mem].through)); 407 | } 408 | 409 | static inline CALresult calModuleLoad_a(CALmodule* module, CALcontext ctx, CALimage image) 410 | { 411 | fake.AddModule(module); 412 | return(calModuleLoad(&fake.module[*module].through, 
ctx, image)); 413 | } 414 | 415 | static inline CALresult calModuleUnload_a(CALcontext ctx, CALmodule module) 416 | { 417 | fake.UnloadModule(module); 418 | return(calModuleUnload(ctx, fake.module[module].through)); 419 | } 420 | 421 | static inline CALresult calModuleGetName_a(CALname* name, CALcontext ctx, CALmodule module, const CALchar* symbolname) 422 | { 423 | fake.AddName(name, module); 424 | return(calModuleGetName(&fake.name[*name].through, ctx, fake.module[module].through, symbolname)); 425 | } 426 | 427 | static inline CALresult calModuleGetEntry_a(CALfunc* func, CALcontext ctx, CALmodule module, const CALchar* symbolname) 428 | { 429 | fake.GetFunc(func, module); 430 | return(calModuleGetEntry(&fake.module[module].throughFunc, ctx, fake.module[module].through, symbolname)); 431 | } 432 | 433 | #define calCtxRunProgram calCtxRunProgram_a 434 | #define calMemCopy calMemCopy_a 435 | #define calCtxIsEventDone calCtxIsEventDone_a 436 | #define calCtxGetMem calCtxGetMem_a 437 | #define calCtxSetMem calCtxSetMem_a 438 | #define calCtxReleaseMem calCtxReleaseMem_a 439 | #define calModuleLoad calModuleLoad_a 440 | #define calModuleUnload calModuleUnload_a 441 | #define calModuleGetName calModuleGetName_a 442 | #define calModuleGetEntry calModuleGetEntry_a 443 | 444 | #endif 445 | 446 | #endif -------------------------------------------------------------------------------- /cal_private_ext.h: -------------------------------------------------------------------------------- 1 | /* ============================================================ 2 | 3 | Copyright (c) 2007 Advanced Micro Devices, Inc. All rights reserved. 4 | 5 | Redistribution and use of this material is permitted under the following 6 | conditions: 7 | 8 | Redistributions must retain the above copyright notice and all terms of this 9 | license. 
10 | 11 | In no event shall anyone redistributing or accessing or using this material 12 | commence or participate in any arbitration or legal action relating to this 13 | material against Advanced Micro Devices, Inc. or any copyright holders or 14 | contributors. The foregoing shall survive any expiration or termination of 15 | this license or any agreement or access or use related to this material. 16 | 17 | ANY BREACH OF ANY TERM OF THIS LICENSE SHALL RESULT IN THE IMMEDIATE REVOCATION 18 | OF ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE THIS MATERIAL. 19 | 20 | THIS MATERIAL IS PROVIDED BY ADVANCED MICRO DEVICES, INC. AND ANY COPYRIGHT 21 | HOLDERS AND CONTRIBUTORS "AS IS" IN ITS CURRENT CONDITION AND WITHOUT ANY 22 | REPRESENTATIONS, GUARANTEE, OR WARRANTY OF ANY KIND OR IN ANY WAY RELATED TO 23 | SUPPORT, INDEMNITY, ERROR FREE OR UNINTERRUPTED OPERATION, OR THAT IT IS FREE 24 | FROM DEFECTS OR VIRUSES. ALL OBLIGATIONS ARE HEREBY DISCLAIMED - WHETHER 25 | EXPRESS, IMPLIED, OR STATUTORY - INCLUDING, BUT NOT LIMITED TO, ANY IMPLIED 26 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, 27 | ACCURACY, COMPLETENESS, OPERABILITY, QUALITY OF SERVICE, OR NON-INFRINGEMENT. 28 | IN NO EVENT SHALL ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR 29 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, PUNITIVE, 30 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT 31 | OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, REVENUE, DATA, OR PROFITS; OR 32 | BUSINESS INTERRUPTION) HOWEVER CAUSED OR BASED ON ANY THEORY OF LIABILITY 33 | ARISING IN ANY WAY RELATED TO THIS MATERIAL, EVEN IF ADVISED OF THE POSSIBILITY 34 | OF SUCH DAMAGE. THE ENTIRE AND AGGREGATE LIABILITY OF ADVANCED MICRO DEVICES, 35 | INC. AND ANY COPYRIGHT HOLDERS AND CONTRIBUTORS SHALL NOT EXCEED TEN DOLLARS 36 | (US $10.00). 
ANYONE REDISTRIBUTING OR ACCESSING OR USING THIS MATERIAL ACCEPTS 37 | THIS ALLOCATION OF RISK AND AGREES TO RELEASE ADVANCED MICRO DEVICES, INC. AND 38 | ANY COPYRIGHT HOLDERS AND CONTRIBUTORS FROM ANY AND ALL LIABILITIES, 39 | OBLIGATIONS, CLAIMS, OR DEMANDS IN EXCESS OF TEN DOLLARS (US $10.00). THE 40 | FOREGOING ARE ESSENTIAL TERMS OF THIS LICENSE AND, IF ANY OF THESE TERMS ARE 41 | CONSTRUED AS UNENFORCEABLE, FAIL IN ESSENTIAL PURPOSE, OR BECOME VOID OR 42 | DETRIMENTAL TO ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR 43 | CONTRIBUTORS FOR ANY REASON, THEN ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE 44 | THIS MATERIAL SHALL TERMINATE IMMEDIATELY. MOREOVER, THE FOREGOING SHALL 45 | SURVIVE ANY EXPIRATION OR TERMINATION OF THIS LICENSE OR ANY AGREEMENT OR 46 | ACCESS OR USE RELATED TO THIS MATERIAL. 47 | 48 | NOTICE IS HEREBY PROVIDED, AND BY REDISTRIBUTING OR ACCESSING OR USING THIS 49 | MATERIAL SUCH NOTICE IS ACKNOWLEDGED, THAT THIS MATERIAL MAY BE SUBJECT TO 50 | RESTRICTIONS UNDER THE LAWS AND REGULATIONS OF THE UNITED STATES OR OTHER 51 | COUNTRIES, WHICH INCLUDE BUT ARE NOT LIMITED TO, U.S. EXPORT CONTROL LAWS SUCH 52 | AS THE EXPORT ADMINISTRATION REGULATIONS AND NATIONAL SECURITY CONTROLS AS 53 | DEFINED THEREUNDER, AS WELL AS STATE DEPARTMENT CONTROLS UNDER THE U.S. 54 | MUNITIONS LIST. THIS MATERIAL MAY NOT BE USED, RELEASED, TRANSFERRED, IMPORTED, 55 | EXPORTED AND/OR RE-EXPORTED IN ANY MANNER PROHIBITED UNDER ANY APPLICABLE LAWS, 56 | INCLUDING U.S. EXPORT CONTROL LAWS REGARDING SPECIFICALLY DESIGNATED PERSONS, 57 | COUNTRIES AND NATIONALS OF COUNTRIES SUBJECT TO NATIONAL SECURITY CONTROLS. 58 | MOREOVER, THE FOREGOING SHALL SURVIVE ANY EXPIRATION OR TERMINATION OF ANY 59 | LICENSE OR AGREEMENT OR ACCESS OR USE RELATED TO THIS MATERIAL. 60 | 61 | NOTICE REGARDING THE U.S. 
GOVERNMENT AND DOD AGENCIES: This material is 62 | provided with "RESTRICTED RIGHTS" and/or "LIMITED RIGHTS" as applicable to 63 | computer software and technical data, respectively. Use, duplication, 64 | distribution or disclosure by the U.S. Government and/or DOD agencies is 65 | subject to the full extent of restrictions in all applicable regulations, 66 | including those found at FAR52.227 and DFARS252.227 et seq. and any successor 67 | regulations thereof. Use of this material by the U.S. Government and/or DOD 68 | agencies is acknowledgment of the proprietary rights of any copyright holders 69 | and contributors, including those of Advanced Micro Devices, Inc., as well as 70 | the provisions of FAR52.227-14 through 23 regarding privately developed and/or 71 | commercial computer software. 72 | 73 | This license forms the entire agreement regarding the subject matter hereof and 74 | supersedes all proposals and prior discussions and writings between the parties 75 | with respect thereto. This license does not affect any ownership, rights, title, 76 | or interest in, or relating to, this material. No terms of this license can be 77 | modified or waived, and no breach of this license can be excused, unless done 78 | so in a writing signed by all affected parties. Each term of this license is 79 | separately enforceable. If any term of this license is determined to be or 80 | becomes unenforceable or illegal, such term shall be reformed to the minimum 81 | extent necessary in order for this license to remain in effect in accordance 82 | with its terms as modified by such reformation. This license shall be governed 83 | by and construed in accordance with the laws of the State of Texas without 84 | regard to rules on conflicts of law of any state or jurisdiction or the United 85 | Nations Convention on the International Sale of Goods. 
All disputes arising out
of this license shall be subject to the jurisdiction of the federal and state
courts in Austin, Texas, and all defenses are hereby waived concerning personal
jurisdiction and venue of these courts.

============================================================ */

#ifndef __CAL_PRIVATE_EXT_H__
#define __CAL_PRIVATE_EXT_H__

#include "cal_ext.h"

#ifdef __cplusplus
extern "C" {
#endif

// Pointer-to-function form of the CAL calling convention; defined here only
// when no earlier header has provided it.
#ifndef CALAPIENTRYP
#define CALAPIENTRYP CALAPIENTRY *
#endif


// Identifier of the private extension declared in this header.
typedef enum calPrivateExtidEnum {
    CAL_PRIVATE_EXT_SYNC_OBJECT = 0x8009,
} calPrivateExtid;


// flags for calCtxWaitForEvents
typedef enum CALwaitTypeEnum
{
    CAL_WAIT_LOW_CPU_UTILIZATION = 0,
    CAL_WAIT_POLLING = 1,
} CALwaitType;

/**
 * @fn calCtxWaitForEvents(CALcontext ctx,
 *                         CALevent *events,
 *                         CALuint n,
 *                         CALuint flags)
 *
 * @brief wait until all programs referenced by event list have executed.
 *
 * @param ctx (in) - CAL context
 * @param events (in) - array of events
 * @param n (in) - number of events
 * @param flags (in) - currently unused.
 *
 * @return Returns CAL_RESULT_OK on success, CAL_RESULT_ERROR if there was an error.
 *
 * NOTE(review): CALwaitType above is described as the flags for this call,
 * while the @param text says flags is unused - confirm which one is current.
 */
typedef CALresult (CALAPIENTRYP PFNCALCTXWAITFOREVENTS) (CALcontext ctx, CALevent *events, CALuint n, CALuint flags);


#ifdef __cplusplus
}
#endif
#endif // __CAL_PRIVATE_EXT_H__



--------------------------------------------------------------------------------
/caldgemm.cl:
--------------------------------------------------------------------------------
/**
 * This file is part of the CALDGEMM library.
3 | * 4 | * Copyright 2015: 5 | * - David Rohr (drohr@jwdt.org) 6 | * - Matthias Bach (bach@compeng.uni-frankfurt.de) 7 | * - Matthias Kretz (kretz@compeng.uni-frankfurt.de) 8 | * 9 | * This file is part of CALDGEMM. 10 | * 11 | * CALDGEMM is free software: you can redistribute it and/or modify 12 | * it under the terms of the GNU Lesser General Public License as published by 13 | * the Free Software Foundation, either version 3 of the License, or 14 | * (at your option) any later version. 15 | * 16 | * CALDGEMM is distributed in the hope that it will be useful, 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | * GNU Lesser General Public License for more details. 20 | * 21 | * You should have received a copy of the GNU Lesser General Public License 22 | * along with CALDGEMM. If not, see . 23 | */ 24 | 25 | #define qon_mstr(a) #a 26 | #define qon_mxstr(a) qon_mstr(a) 27 | 28 | #ifdef OCL_USE_SIMPLE_BUFFERS 29 | 30 | #ifdef CALDGEMM_TRANSPOSED_B 31 | 32 | const char *caldgemm_opencl::OCLKernelName = 33 | OCL_KERNEL_PRE 34 | "//KERNEL TRANSPOSED B SIMPLE BUFFERS\n" 35 | "__kernel void oclkernel(__global double* C, __global const double* __restrict const A, __global const double* __restrict const B, int height1, int height2, int width, double alpha, double beta, int pitch, ulong offset)\n" 36 | "{\n" 37 | " int i, j, k;\n" 38 | " for (i = get_global_id(1);i < height2;i += get_global_size(1))\n" 39 | " {\n" 40 | " for (j = get_global_id(0);j < height1;j += get_global_size(0))\n" 41 | " {\n" 42 | " double addval = 0.;\n" 43 | #ifdef CALDGEMM_FORCE_K 44 | " for (k = 0;k < " qon_mxstr(CALDGEMM_FORCE_K) ";k++)\n" 45 | #else 46 | " for (k = 0;k < width;k++)\n" 47 | #endif 48 | " {\n" 49 | " addval += A[i * width + k] * B[j * width + k];\n" 50 | " }\n" 51 | #ifdef CALDGEMM_ALPHA1 52 | " C[offset + i * pitch + j] = beta * C[offset + i * pitch + j] + addval;\n" 53 | #else 54 | " 
C[offset + i * pitch + j] = beta * C[offset + i * pitch + j] + alpha * addval;\n" 55 | #endif 56 | " }\n" 57 | " }\n" 58 | "}\n" 59 | ; 60 | 61 | #else 62 | 63 | const char *caldgemm_opencl::OCLKernelName = 64 | OCL_KERNEL_PRE 65 | "//KERNEL TRANSPOSED A SIMPLE BUFFERS\n" 66 | "__kernel void oclkernel(__global double* C, __global const double* __restrict const A, __global const double* __restrict const B, int height1, int height2, int width, double alpha, double beta, int pitch, ulong offset)\n" 67 | "{\n" 68 | " int i, j, k;\n" 69 | " for (i = get_global_id(1);i < height2;i += get_global_size(1))\n" 70 | " {\n" 71 | " for (j = get_global_id(0);j < height1;j += get_global_size(0))\n" 72 | " {\n" 73 | " double addval = 0.;\n" 74 | #ifdef CALDGEMM_FORCE_K 75 | " for (k = 0;k < " qon_mxstr(CALDGEMM_FORCE_K) ";k++)\n" 76 | #else 77 | " for (k = 0;k < width;k++)\n" 78 | #endif 79 | " {\n" 80 | " addval += A[k * height2 + i] * B[k * height1 + j];\n" 81 | " }\n" 82 | #ifdef CALDGEMM_ALPHA1 83 | " C[offset + i * pitch + j] = beta * C[offset + i * pitch + j] + addval;\n" 84 | #else 85 | " C[offset + i * pitch + j] = beta * C[offset + i * pitch + j] + alpha * addval;\n" 86 | #endif 87 | " }\n" 88 | " }\n" 89 | "}\n" 90 | ; 91 | 92 | #endif 93 | 94 | 95 | #else //OCL_USE_SIMPLE_BUFFERS 96 | 97 | 98 | #ifdef CALDGEMM_TRANSPOSED_B 99 | 100 | const char *caldgemm_opencl::OCLKernelName = 101 | OCL_KERNEL_PRE 102 | "//KERNEL TRANSPOSED B TEXTURE BUFFERS\n" 103 | "union double_read {uint4 f; double2 d;};\n" 104 | "__kernel void oclkernel(__global double* C, image2d_t A, image2d_t B, int height1, int height2, int width, double alpha, double beta, int pitch, ulong offset)\n" 105 | "{\n" 106 | " const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" 107 | " int i, j, k;\n" 108 | " for (i = get_global_id(1);i < height2;i += get_global_size(1))\n" 109 | " {\n" 110 | " for (j = get_global_id(0);j < height1;j += get_global_size(0))\n" 111 | " 
{\n" 112 | " double addval = 0.;\n" 113 | #ifdef CALDGEMM_FORCE_K 114 | " for (k = 0;k < " qon_mxstr(CALDGEMM_FORCE_K) " / 2;k++)\n" 115 | #else 116 | " for (k = 0;k < width / 2;k++)\n" 117 | #endif 118 | " {\n" 119 | " float2 coord;\n" 120 | " union double_read tmp, tmp2;\n" 121 | " coord.x = k;\n" 122 | " coord.y = i;\n" 123 | " tmp.f = read_imageui(A, sampler, coord);\n" 124 | " coord.y = j;\n" 125 | " tmp2.f = read_imageui(B, sampler, coord);\n" 126 | " addval += tmp.d.x * tmp2.d.x + tmp.d.y * tmp2.d.y;\n" 127 | " }\n" 128 | #ifdef CALDGEMM_ALPHA1 129 | " C[offset + i * pitch + j] = beta * C[offset + i * pitch + j] + addval;\n" 130 | #else 131 | " C[offset + i * pitch + j] = beta * C[offset + i * pitch + j] + alpha * addval;\n" 132 | #endif 133 | " }\n" 134 | " }\n" 135 | "}\n" 136 | ; 137 | 138 | #elif defined(CALDGEMM_TRANSPOSED_A) 139 | 140 | #ifndef OCL_TILED_KERNEL 141 | 142 | const char *caldgemm_opencl::OCLKernelName = 143 | OCL_KERNEL_PRE 144 | "//KERNEL TRANSPOSED A TEXTURE BUFFERS\n" 145 | "union double_read {uint4 f; double2 d;};\n" 146 | "__kernel void oclkernel(__global double* C, image2d_t A, image2d_t B, int height1, int height2, int width, double alpha, double beta, int pitch, ulong offset)\n" 147 | "{\n" 148 | " const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" 149 | " int i, j, k;\n" 150 | " for (i = get_global_id(1);i < height2;i += get_global_size(1))\n" 151 | " {\n" 152 | " for (j = get_global_id(0);j < height1;j += get_global_size(0))\n" 153 | " {\n" 154 | " double addval = 0.;\n" 155 | #ifdef CALDGEMM_FORCE_K 156 | " for (k = 0;k < " qon_mxstr(CALDGEMM_FORCE_K) ";k++)\n" 157 | #else 158 | " for (k = 0;k < width;k++)\n" 159 | #endif 160 | " {\n" 161 | " float2 coord;\n" 162 | " union double_read tmp, tmp2;\n" 163 | " coord.x = i / 2;\n" 164 | " coord.y = k;\n" 165 | " tmp.f = read_imageui(A, sampler, coord);\n" 166 | " coord.x = j / 2;\n" 167 | " tmp2.f = read_imageui(B, sampler, coord);\n" 
168 | " double v1 = (i & 1) ? tmp.d.y : tmp.d.x, v2 = (j & 1) ? tmp2.d.y : tmp2.d.x;\n" 169 | " addval += v1 * v2;\n" 170 | " }\n" 171 | #ifdef CALDGEMM_ALPHA1 172 | " C[offset + i * pitch + j] = beta * C[offset + i * pitch + j] + addval;\n" 173 | #else 174 | " C[offset + i * pitch + j] = beta * C[offset + i * pitch + j] + alpha * addval;\n" 175 | #endif 176 | " }\n" 177 | " }\n" 178 | "}\n" 179 | ; 180 | 181 | #else 182 | 183 | const char *caldgemm_opencl::OCLKernelName = 184 | OCL_KERNEL_PRE 185 | "//KERNEL TRANSPOSED A TEXTURE BUFFERS TILED\n" 186 | "//#pragma OPENCL EXTENSION CP_FP_FMA\n" 187 | "union double_read {uint4 f; double2 d;};\n" 188 | "#define OCL_TILING_X " qon_mxstr(OCL_TILING_X) "\n" 189 | "#define OCL_TILING_Y " qon_mxstr(OCL_TILING_Y) "\n" 190 | "__kernel void oclkernel(__global double* C, image2d_t A, image2d_t B, int height1, int height2, int width, double alpha, double beta, int pitch, ulong offset)\n" 191 | "{\n" 192 | " const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" 193 | " int i, j, k, l, m;\n" 194 | " for (i = get_global_id(1) * OCL_TILING_Y;i < height2;i += get_global_size(1) * OCL_TILING_Y)\n" 195 | " {\n" 196 | " for (j = get_global_id(0) * OCL_TILING_X;j < height1;j += get_global_size(0) * OCL_TILING_X)\n" 197 | " {\n" 198 | " double addval[OCL_TILING_X][OCL_TILING_Y];\n" 199 | "#pragma unroll\n" 200 | " for (k = 0;k < OCL_TILING_X;k++) for (l = 0;l < OCL_TILING_Y;l++) addval[k][l] = 0.;\n" 201 | "#pragma unroll 1\n" 202 | #ifdef CALDGEMM_FORCE_K 203 | " for (k = 0;k < " qon_mxstr(CALDGEMM_FORCE_K) ";k++)\n" 204 | #else 205 | " for (k = 0;k < width;k++)\n" 206 | #endif 207 | " {\n" 208 | " float2 coord;\n" 209 | " union double_read tmp[OCL_TILING_X / 2], tmp2[OCL_TILING_Y / 2];\n" 210 | " coord.y = k;\n" 211 | "#pragma unroll\n" 212 | " for (l = 0;l < OCL_TILING_X / 2;l++)\n" 213 | " {\n" 214 | " coord.x = i / 2 + l;\n" 215 | " tmp[l].f = read_imageui(A, sampler, coord);\n" 216 | " 
}\n" 217 | " for (l = 0;l < OCL_TILING_Y / 2;l++)\n" 218 | " {\n" 219 | " coord.x = j / 2 + l;\n" 220 | " tmp2[l].f = read_imageui(B, sampler, coord);\n" 221 | " }\n" 222 | "#pragma unroll\n" 223 | " for (l = 0;l < OCL_TILING_X / 2;l++)\n" 224 | " {\n" 225 | "#pragma unroll\n" 226 | " for (m = 0;m < OCL_TILING_Y / 2;m++)\n" 227 | " {\n" 228 | " addval[2 * l][2 * m] = mad(tmp[l].d.x, tmp2[m].d.x, addval[2 * l][2 * m]);\n" 229 | " addval[2 * l + 1][2 * m] = mad(tmp[l].d.y, tmp2[m].d.x, addval[2 * l + 1][2 * m]);\n" 230 | " addval[2 * l][2 * m + 1] = mad(tmp[l].d.x, tmp2[m].d.y, addval[2 * l][2 * m + 1]);\n" 231 | " addval[2 * l + 1][2 * m + 1] = mad(tmp[l].d.y, tmp2[m].d.y, addval[2 * l + 1][2 * m + 1]);\n" 232 | 233 | " }\n" 234 | " }\n" 235 | " }\n" 236 | "#pragma unroll\n" 237 | " for (k = 0;k < OCL_TILING_X;k++)\n" 238 | " {\n" 239 | "#pragma unroll\n" 240 | " for (l = 0;l < OCL_TILING_Y;l++)\n" 241 | " {\n" 242 | #ifdef CALDGEMM_ALPHA1 243 | " C[offset + (i + k) * pitch + j + l] = beta * C[offset + (i + k) * pitch + j + l] + addval[k][l];\n" 244 | #else 245 | " C[offset + (i + k) * pitch + j + l] = beta * C[offset + (i + k) * pitch + j + l] + alpha * addval[k][l];\n" 246 | #endif 247 | " }\n" 248 | " }\n" 249 | " }\n" 250 | " }\n" 251 | "}\n" 252 | ; 253 | 254 | #endif 255 | #endif 256 | #endif 257 | -------------------------------------------------------------------------------- /caldgemm_adl.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the CALDGEMM library. 3 | * 4 | * Copyright 2015: 5 | * - David Rohr (drohr@jwdt.org) 6 | * - Matthias Bach (bach@compeng.uni-frankfurt.de) 7 | * - Matthias Kretz (kretz@compeng.uni-frankfurt.de) 8 | * 9 | * This file is part of CALDGEMM. 
10 | * 11 | * CALDGEMM is free software: you can redistribute it and/or modify 12 | * it under the terms of the GNU Lesser General Public License as published by 13 | * the Free Software Foundation, either version 3 of the License, or 14 | * (at your option) any later version. 15 | * 16 | * CALDGEMM is distributed in the hope that it will be useful, 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | * GNU Lesser General Public License for more details. 20 | * 21 | * You should have received a copy of the GNU Lesser General Public License 22 | * along with CALDGEMM. If not, see <http://www.gnu.org/licenses/>. 23 | */ 24 | 25 | #include "caldgemm_config_load.h" 26 | #include "cmodules/util_adl.cpp" 27 | -------------------------------------------------------------------------------- /caldgemm_cal.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Interface of the CALDGEMM library. 3 | * 4 | * Copyright 2015: 5 | * - David Rohr (drohr@jwdt.org) 6 | * - Matthias Bach (bach@compeng.uni-frankfurt.de) 7 | * - Matthias Kretz (kretz@compeng.uni-frankfurt.de) 8 | * 9 | * This file is part of CALDGEMM. 10 | * 11 | * CALDGEMM is free software: you can redistribute it and/or modify 12 | * it under the terms of the GNU Lesser General Public License as published by 13 | * the Free Software Foundation, either version 3 of the License, or 14 | * (at your option) any later version. 15 | * 16 | * CALDGEMM is distributed in the hope that it will be useful, 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | * GNU Lesser General Public License for more details. 20 | * 21 | * You should have received a copy of the GNU Lesser General Public License 22 | * along with CALDGEMM. If not, see <http://www.gnu.org/licenses/>. 
 */

#ifndef CALDGEMM_CAL_H
#define CALDGEMM_CAL_H

// NOTE(review): the operands of the bare #include lines below are missing in
// this copy of the file (angle-bracket header names appear to have been
// stripped); restore them from the original source before compiling.
#include
#include
#include
#include "cal_private_ext.h"

#include

#include "caldgemm.h"

/**
 * CAL backend of caldgemm: derives from caldgemm and overrides its virtual
 * backend hooks (runtime validation, device setup, kernel execution, result
 * fetching, ...). Only declarations live here; the definitions are not part of
 * this header.
 */
class caldgemm_cal : public caldgemm
{
public:
	caldgemm_cal();
	virtual ~caldgemm_cal();

	virtual double getMaxGPUTemperature();

private:
	int adl_util_initialized;
	virtual int UseOutputPthreads();
	virtual int UseInputPthreads();
	virtual int UseMutexPerDevice();

	unsigned int numInputs, numOutputs, numConstantBuffers;

	// Compile-time buffer counts. dwBuffersA / dwBuffersB depend on the
	// CALDGEMM_44, CALDGEMM_SINGLE_BUFFER, CALDGEMM_48 / CALDGEMM_84,
	// CALDGEMM_DOUBLE_BUFFERS and CALDGEMM_TRANSPOSED_A configuration macros.
	// The '&' between the defined() tests combines two 0/1 values, so it
	// evaluates the same as '&&' here.
#ifdef CALDGEMM_44
#ifdef CALDGEMM_SINGLE_BUFFER
	static const unsigned int dwBuffersA = 1;
#elif !defined(CALDGEMM_48) & !defined(CALDGEMM_DOUBLE_BUFFERS)
	static const unsigned int dwBuffersA = 2;
#else
	static const unsigned int dwBuffersA = 4;
#endif
#ifdef CALDGEMM_SINGLE_BUFFER
	static const unsigned int dwBuffersB = 1;
#elif !defined(CALDGEMM_84) & !defined(CALDGEMM_DOUBLE_BUFFERS)
	static const unsigned int dwBuffersB = 2;
#else
	static const unsigned int dwBuffersB = 4;
#endif
#else //CALDGEMM_44
#ifdef CALDGEMM_TRANSPOSED_A
	static const unsigned int dwBuffersA = 2;
#else
	static const unsigned int dwBuffersA = 8;
#endif
	static const unsigned int dwBuffersB = 2;
#endif //CALDGEMM_44

	// dwBuffersC is 1 with CALDGEMM_USE_MEMEXPORT and 8 otherwise.
#ifdef CALDGEMM_USE_MEMEXPORT
	static const unsigned int dwBuffersC = 1;
#else
	static const unsigned int dwBuffersC = 8;
#endif

	// Description of one buffer: a host pointer (viewable through several
	// element types via the anonymous union), its dimensions and the CAL
	// resource / memory handles that belong to it.
	struct BufferProperties
	{
		union
		{
			float* ptr_float;
			unsigned int* ptr_uint;
			int* ptr_int;
			double* ptr_double;
			char* ptr_char;
			void* ptr_void;
		};
		unsigned int Width;
		unsigned int Height;
		unsigned int VectorSize;
		unsigned int DataSize;

		bool CALMemory;
		CALresource res;
		CALmem mem;
		CALmem dstMem;
		unsigned int pitch;
		CALresource tmpres;
		CALmem tmpmem;

		BufferProperties* conversionBuffer;
	};

	// CALDGEMM_DIVBUFA is a macro defined elsewhere that is pasted into the
	// parameter lists of divideBuffer and DGEMM_prepare_backend; it may add
	// further parameters depending on the configuration.
	int divideBuffer(BufferProperties* dst, double* src, int width, int height, int gpu_width, int gpu_height, int pitch, int numBuffers, bool transpose CALDGEMM_DIVBUFA);
	int mergeBuffers(double* dst, BufferProperties* src, int width, int height, int gpu_width, int gpu_height, int pitch, int numBuffers);
	void checkCalPatch();
	void cal_init_constant_data(BufferProperties* &data, double alpha);
	virtual int DGEMM_prepare_backend(size_t k, int j, unsigned int num_device, bool prepareM, bool prepareN, bool buffersSufficiant, bool buffersSufficiant0 CALDGEMM_DIVBUFA);

	struct CALVersion {unsigned int major, minor, imp;};

	virtual int Initialize (bool nocalinit);
	int SetupKernel(const char* ILKernel, CALmodule* module, CALcontext* ctx, unsigned int device_num, bool disassemble = false);
	int RunProgram(CALcontext* ctx, CALmodule* module, unsigned int Width, unsigned int Height, CALevent* event);
	int CleanupData(CALcontext* ctx, CALresource* &resourceHandler, BufferProperties* &data, unsigned int numHandles, int nContext, unsigned int num_device);
	int Cleanup(CALdevice* device, CALcontext* ctx, CALmodule* module, CALresource* &resourceHandler, BufferProperties* &data, unsigned int numHandles, int nContext, unsigned int num_device);
	int SetupData(CALmodule* module, CALresource* &_Res, BufferProperties* &data, CALdevice* device, CALcontext* ctx, unsigned int numInputs, unsigned int numOutputs, unsigned int numConstantBuffers, CALname** ctxProgNames, int nContext, unsigned int num_device);
	int CopyDataFromGPU(int nDevice, CALresource* _Res, BufferProperties* data, unsigned int num, int nContext, size_t lastm, size_t lastn, int mustlock = 0);
	int CopyDataToGPU(int nDevice, CALresource* _Res, BufferProperties* data, unsigned int num, int nContext, bool constants, BufferProperties* dest_data = NULL);
	int ValidateCALRuntime();

	// Small container for the CAL events of one output buffer.
	// With CALDGEMM_QUERY_ALL_EVENTS it stores up to 13 events: GetNextEvent
	// returns the next free slot and terminates the process with
	// "Event buffer overflow" when all 13 are in use; Reset sets the counter
	// back to 0. nEvents has no initializer in this class.
	// Without the macro there is a single slot that GetNextEvent always
	// returns, and Reset does nothing.
	class eventCls
	{
	public:
#ifdef CALDGEMM_QUERY_ALL_EVENTS
		CALevent events[13];
		volatile int nEvents;
		inline CALevent* GetNextEvent()
		{
			if (nEvents == 13)
			{
				fprintf(STD_OUT, "Event buffer overflow\n");
				exit(1);
			}
			return(&events[nEvents++]);
		}
		inline void Reset() {nEvents = 0;}
#else
		CALevent events[1];
		static const int nEvents = 1;
		inline CALevent* GetNextEvent() {return(&events[0]);}
		inline void Reset() {};
#endif
	};

	// Function pointer with the signature declared in cal_private_ext.h.
	PFNCALCTXWAITFOREVENTS calCtxWaitForEvents;

	// Per-device state; max_devices, max_bbuffers, kernel_count and
	// obuffercount are not declared in this header (presumably inherited from
	// caldgemm - confirm).
	BufferProperties* datas[max_devices][max_bbuffers];
	CALdevice devices[max_devices];
	CALcontext ctxs[max_devices];
	CALresource* resourceHandlers[max_devices][max_bbuffers];
	CALmodule modules[max_devices][kernel_count];
	CALmodule modulesConvert[max_devices];
	CALmodule fakeModule;
	CALname *progNames[max_devices][kernel_count];
	CALname progNamesConvert[max_devices][2 * dwBuffersA];
	eventCls events[max_devices][obuffercount];
	unsigned int device_nums[max_devices];

	// Kernel source strings, defined outside this header.
	static const char *ILKernel, *ILKernelALPHA1, *ILKernelLinpack, *ILFakeKernel, *ILConvertKernel;

	virtual int ValidateRuntime();
	virtual int CheckDevices();
	virtual int InitDevices();
	virtual int ReinitDevices();
	virtual int InitConstantData(double alpha);
	virtual int ExecuteKernels(caldgemm::DGEMMPrepareAndExecuteTask& Task, int blockm, int blockn);
	virtual int ExitRuntime();
	virtual int ExitDevices();
	virtual int WaitForEvent(int, int, int lock = 0);
	virtual int FetchResult(int device, int j, int m, int n, int mustlock = 0);
	virtual int CheckDMAQueue(int device, int forcej = -1);
	virtual int RunMergeBuffers(double* dst, int device, int j, int width, int height, int gpu_width, int gpu_height, int pitch);
};

#endif
--------------------------------------------------------------------------------
/caldgemm_cblas_wrapper.h:
--------------------------------------------------------------------------------
/**
 * This file is part of the CALDGEMM library.
 *
 * Copyright 2015:
 *  - David Rohr (drohr@jwdt.org)
 *  - Matthias Bach (bach@compeng.uni-frankfurt.de)
 *  - Matthias Kretz (kretz@compeng.uni-frankfurt.de)
 *
 * This file is part of CALDGEMM.
 *
 * CALDGEMM is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CALDGEMM is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with CALDGEMM. If not, see <http://www.gnu.org/licenses/>.
23 | */ 24 | 25 | void cblas_dtrsma(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); 26 | void cblas_dgemva(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy); 27 | void cblas_dgemma(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); 28 | void cblas_daxpya(blasint n, double, double *x, blasint incx, double *y, blasint incy); 29 | void cblas_dscala(blasint N, double alpha, double *X, blasint incX); 30 | -------------------------------------------------------------------------------- /caldgemm_common.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the CALDGEMM library. 3 | * 4 | * Copyright 2015: 5 | * - David Rohr (drohr@jwdt.org) 6 | * - Matthias Bach (bach@compeng.uni-frankfurt.de) 7 | * - Matthias Kretz (kretz@compeng.uni-frankfurt.de) 8 | * 9 | * This file is part of CALDGEMM. 10 | * 11 | * CALDGEMM is free software: you can redistribute it and/or modify 12 | * it under the terms of the GNU Lesser General Public License as published by 13 | * the Free Software Foundation, either version 3 of the License, or 14 | * (at your option) any later version. 15 | * 16 | * CALDGEMM is distributed in the hope that it will be useful, 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | * GNU Lesser General Public License for more details. 20 | * 21 | * You should have received a copy of the GNU Lesser General Public License 22 | * along with CALDGEMM. If not, see . 
23 | */ 24 | 25 | #ifndef CALDGEMM_COMMON_H 26 | #define CALDGEMM_COMMON_H 27 | 28 | #include 29 | 30 | #ifdef _WIN32 31 | #define __INTRIN_H_ 32 | #define _Complex 33 | #ifndef __CUDA_ARCH__ 34 | #ifndef __restrict__ 35 | #define __restrict__ __restrict 36 | #endif 37 | #endif 38 | #endif 39 | 40 | #if !defined(_WIN32) & defined(USE_GOTO_BLAS) 41 | extern "C" { 42 | #define CBLAS 43 | #define ASSEMBLER 44 | #include 45 | #undef ASSEMBLER 46 | #include 47 | } 48 | #else 49 | 50 | #ifndef USE_GOTO_BLAS 51 | #include 52 | #endif 53 | 54 | extern "C" int get_num_procs(); 55 | static inline void caldgemm_goto_reserve_cpu(int, int) {} 56 | static inline void caldgemm_goto_reserve_cpus(int) {} 57 | 58 | typedef int blasint; 59 | extern "C" { 60 | #ifdef USE_MKL 61 | #include 62 | #else 63 | #include 64 | #endif 65 | } 66 | 67 | #ifndef _WIN32 68 | void goto_set_num_threads(int num); 69 | void caldgemm_goto_restrict_cpus(int); 70 | 71 | #ifdef USE_MKL 72 | #define CBLAS_ENUM 73 | #else 74 | #define CBLAS_ENUM enum 75 | #endif 76 | 77 | extern "C" { 78 | void cblas_dtrsma(CBLAS_ENUM CBLAS_ORDER Order, CBLAS_ENUM CBLAS_SIDE Side, CBLAS_ENUM CBLAS_UPLO Uplo, CBLAS_ENUM CBLAS_TRANSPOSE TransA, CBLAS_ENUM CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); 79 | void cblas_dgemva(CBLAS_ENUM CBLAS_ORDER order, CBLAS_ENUM CBLAS_TRANSPOSE trans, blasint m, blasint n, double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy); 80 | void cblas_dgemma(CBLAS_ENUM CBLAS_ORDER Order, CBLAS_ENUM CBLAS_TRANSPOSE TransA, CBLAS_ENUM CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); 81 | void cblas_daxpya(blasint n, double, double *x, blasint incx, double *y, blasint incy); 82 | void cblas_dscala(blasint N, double alpha, double *X, blasint incX); 83 | } 84 | #else 85 | static inline void 
goto_set_num_threads(int) {} 86 | static inline void caldgemm_goto_restrict_cpus(int) {} 87 | #endif 88 | 89 | #endif 90 | 91 | #ifndef _WIN32 92 | #define CAST_FOR_MMPREFETCH 93 | #else 94 | #define CAST_FOR_MMPREFETCH (char*) 95 | #endif 96 | 97 | #ifdef VTRACE 98 | #include 99 | #include 100 | extern pthread_mutex_t global_vt_mutex; 101 | #define VT_USER_START_A(a) {pthread_mutex_lock(&global_vt_mutex);VT_USER_START(a);pthread_mutex_unlock(&global_vt_mutex);} 102 | #define VT_USER_END_A(a) {pthread_mutex_lock(&global_vt_mutex);VT_USER_END(a);pthread_mutex_unlock(&global_vt_mutex);} 103 | #else 104 | #define VT_USER_START_A(a) 105 | #define VT_USER_END_A(a) 106 | #endif 107 | 108 | #define mcat(a, b) a ## b 109 | #define mxcat(a, b) mcat(a, b) 110 | 111 | #define str(s) xstr(s) 112 | #define xstr(s) #s 113 | 114 | #define PASS_ARG(arg) arg 115 | #define COMMA , 116 | #define EMPTY 117 | 118 | #define RED "\033[22;31m" 119 | #define BOLDRED "\033[1m\033[31m" 120 | #define BOLDBLACK "\033[1m\033[30m" 121 | #define RESET "\033[0m" 122 | 123 | #define COMPARE_GENERAL(a, b) ((a) != (b)) 124 | #define COMPARE_STRING(a, b) (strcmp(a, b)) 125 | 126 | #define PRINT_CONFIG_BASE(name1, type, type2, name2_old, name2_new, name2_conf, compare) \ 127 | { \ 128 | if (oldConfig) \ 129 | { \ 130 | if (compare((name2_old), (name2_new))) \ 131 | fprintf(STD_OUT, "%35s: " type " changed to " BOLDRED type RESET "\n", name1, (type2) (name2_old), (type2) (name2_new)); \ 132 | } \ 133 | else \ 134 | { \ 135 | fprintf(STD_OUT, "%35s: " type "\n", name1, (type2) name2_conf); \ 136 | } \ 137 | } 138 | 139 | #define PRINT_CONFIG_BASE_WRAP(name1, name2, name1param, type, type2, conf, hide1, hide2, hide1val, hide2val) \ 140 | { \ 141 | char tmpBuffer[256]; \ 142 | sprintf(tmpBuffer, str(name1) name1param); \ 143 | PRINT_CONFIG_BASE(tmpBuffer, type, type2, (hide1) ? (hide1val) : oldConfig->name2, (hide2) ? 
(hide2val) : newConfig->name2, conf->name2, COMPARE_GENERAL) \ 144 | } 145 | 146 | 147 | #define PRINT_CONFIG_BASE_THIS(name1, name2, name1param, type, type2, conf) \ 148 | { \ 149 | char tmpBuffer[256]; \ 150 | sprintf(tmpBuffer, str(name1) name1param); \ 151 | if (oldConfig == NULL) fprintf(STD_OUT, "%35s: " type "\n", tmpBuffer, (type2) conf->name2); \ 152 | } 153 | 154 | #define PRINT_CONFIG_INT(name) PRINT_CONFIG_BASE_WRAP(name, name, EMPTY, "%5d", int, myConfig, 0, 0, 0, 0) 155 | #define PRINT_CONFIG_CHAR(name) PRINT_CONFIG_BASE_WRAP(name, name, EMPTY, "%5c", char, myConfig, 0, 0, 0, 0) 156 | #define PRINT_CONFIG_DOUBLE(name) PRINT_CONFIG_BASE_WRAP(name, name, EMPTY, "%2.3f", double, myConfig, 0, 0, 0, 0) 157 | #define PRINT_CONFIG_STRING(name) \ 158 | { \ 159 | const char* strEmpty = ""; \ 160 | const char* str1 = (myConfig->name ? myConfig->name : strEmpty); \ 161 | const char* str2 = (oldConfig && oldConfig->name ? oldConfig->name : strEmpty); \ 162 | const char* str3 = (newConfig && newConfig->name ? newConfig->name : strEmpty); \ 163 | PRINT_CONFIG_BASE(str(name), "%5s", char*, str2, str3, str1, COMPARE_STRING) \ 164 | } 165 | 166 | #define PRINT_CONFIG_INT_THIS(name) PRINT_CONFIG_BASE_THIS(name, name, EMPTY, "%5d", int, this) 167 | 168 | #define PRINT_CONFIG_LOOP_INT(name, loopvar) \ 169 | { \ 170 | for (int i = 0;i < mymax(oldConfig ? oldConfig->loopvar : 0, newConfig->loopvar);i++) \ 171 | { \ 172 | PRINT_CONFIG_BASE_WRAP(name[%d], name[i], PASS_ARG(COMMA) i, "%5d", int, myConfig, oldConfig && oldConfig->loopvar <= i, newConfig->loopvar <= i, -1, -1) \ 173 | } \ 174 | } 175 | 176 | #define CALDGEMM_PREPARE_BACKEND_VARS1 \ 177 | size_t blockm, blockn; \ 178 | DGEMM_getblocks(k, blockm, blockn); \ 179 | const size_t HeightM = ((blockm == gpu_m / Config->Height) ? (gpu_m % Config->Height) : Config->Height); \ 180 | const size_t HeightN = ((blockn == gpu_n / Config->Height) ? 
(gpu_n % Config->Height) : Config->Height); 181 | 182 | #define CALDGEMM_PREPARE_BACKEND_VARS2 \ 183 | char myMat = iMat ? 'B' : 'A'; \ 184 | int& my_next_buffer = iMat ? next_buffer_B[num_device] : next_buffer_A[num_device]; \ 185 | int*& my_buffer_pointers = iMat ? buffer_pointers_B[num_device] : buffer_pointers_A[num_device]; \ 186 | size_t& myblock = iMat ? blockn : blockm; \ 187 | bool& myTranspose = iMat ? TransposeB : TransposeA; \ 188 | const bool myKernelTranspose = iMat ? KernelSettings.transposeB : KernelSettings.transposeA; \ 189 | const size_t& myHeight = iMat ? HeightN : HeightM; \ 190 | const size_t pitch = iMat ? B_pitch : A_pitch; \ 191 | double* src_ptr = iMat ? \ 192 | (B + blockn * Config->Height * (myTranspose ? B_pitch : 1)) : \ 193 | (A + blockm * Config->Height * (myTranspose ? 1 : A_pitch)); \ 194 | const bool access_bbuffers = (bool) (!DGEMM_favor_m && buffersSufficiant0) ^ (bool) iMat; \ 195 | const int destbuffer = access_bbuffers ? \ 196 | ((!iMat || buffersSufficiant) ? (my_buffer_pointers[myblock] % ((iMat || buffersSufficiant) ? bbuffers[num_device] : ibuffercount)) : (my_next_buffer % ibuffercount)) : \ 197 | my_next_buffer % ibuffercount; \ 198 | if (iMat) Timers.divideB++; else Timers.divideA++; 199 | 200 | #define PREALLOC_ALTERNATE_LOOKAHEAD 4 201 | 202 | #endif 203 | -------------------------------------------------------------------------------- /caldgemm_config.sample: -------------------------------------------------------------------------------- 1 | /** 2 | * Compile time configuration of the CALDGEMM library. 3 | * 4 | * Copyright 2015: 5 | * - David Rohr (drohr@jwdt.org) 6 | * - Matthias Bach (bach@compeng.uni-frankfurt.de) 7 | * - Matthias Kretz (kretz@compeng.uni-frankfurt.de) 8 | * 9 | * This file is part of CALDGEMM. 
10 | * 11 | * CALDGEMM is free software: you can redistribute it and/or modify 12 | * it under the terms of the GNU Lesser General Public License as published by 13 | * the Free Software Foundation, either version 3 of the License, or 14 | * (at your option) any later version. 15 | * 16 | * CALDGEMM is distributed in the hope that it will be useful, 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | * GNU Lesser General Public License for more details. 20 | * 21 | * You should have received a copy of the GNU Lesser General Public License 22 | * along with CALDGEMM. If not, see . 23 | */ 24 | 25 | //CAL DGEMM Kernel Settings 26 | #define CALDGEMM_TRANSPOSED_A //Use Kernel for transposed A Matrix 27 | //#define CALDGEMM_TRANSPOSED_B //Use Kernel for transposed B Matrix 28 | //#define CALDGEMM_88 //8x8 tiling (implies memexport) 29 | //#define CALDGEMM_84 //8x4 tiling (implies memexport) 30 | //#define CALDGEMM_48 //4x8 tiling (implies memexport) 31 | #define CALDGEMM_44 //4x4 tiling 32 | //#define CALDGEMM_USE_MEMEXPORT //Use Memexport for output instead of color buffers 33 | //#define CALDGEMM_COMPUTE_SHADER 64 //Use compute shader, define compute group size 34 | //#define CALDGEMM_DIAGONAL_TEXTURE //Alternate storage format, only valid for 4x4 kernel, obsolete 35 | #define CALDGEMM_DUAL_ENTRY //Unroll factor of 2 for 4x4 tiling 36 | //#define CALDGEMM_SINGLE_BUFFER //Use a single buffer, 4x4 tiling a transposed, experimental 37 | //#define CALDGEMM_SINGLE_BUFFER_IMPROVED //Alternative access scheme for single buffer, experimental 38 | //#define CALDGEMM_DUAL_BUFFERS //Double number of buffers, 4x4 tiling a transposed, experimental 39 | #define CALDGEMM_LATE_EXIT_CONDITION //Put exit condition at end of while loop 40 | #define CALDGEMM_SHIFT_TEXTURE 1 //Shift even numbered rows in texture by n pixels 41 | //#define CALDGEMM_44_BT_64 //64 bit DMA transfers for 4x4 B transposed 
kernel 42 | //#define CALDGEMM_44_BT_64_CONVERT //Perform 64 bit DMA transfer but transform to 128 bit for kernel input 43 | 44 | //Other Settings 45 | //#define TESTMODE //Activate Test Mode for debugging 46 | //#define CALDGEMM_LOOP_DETECTION //Enable loop detection 47 | //#define TEST_KERNEL 48 | //#define TEST_PARAMETERS 49 | //#define CALDGEMM_UNALIGNED_ADDRESSES 50 | 51 | #ifndef STD_OUT 52 | #define STD_OUT stdout //Output for all messages 53 | #endif 54 | 55 | #define CALDGEMM_OUTPUT_THREADS 1 //Number of Output threads 56 | #define CALDGEMM_OUTPUT_THREADS_SLOW 2 //Number of output threads when KeepBuffersMapped = false 57 | #define CALDGEMM_EXTRA_OUTPUT_THREADS_LINPACK 0 //Number of additional output threads when running in linpack mode 58 | #define REUSE_BBUFFERS //Allocate many BBuffers on the GPU so B is not necessarily retransferred, used for A as well 59 | //#define WASTE_MEMORY //Allocate extra memory before and after every memory segment allocated 60 | //#define CALDGEMM_BENCHMARK_KERNEL 1 61 | 62 | //#define DEBUG_MSG_ALLOCATION //Debug Messages concerning GPU buffer allocation when in Debug = true 63 | //#define DEBUG_MSG_TIMED //Add timestamps to all messages 64 | 65 | //#define CALDGEMM_SGEMM //Experimental SGEMM implementation (requires MemExport) 66 | //#define CALDGEMM_IGEMM //Experimental IGEMM implementation (Integer instead of single) (requires SGEMM) 67 | //#define CALDGEMM_BGEMM //Experimental 68 | 69 | #define CALDGEMM_MIN_TILE_DIM 32 //Tile Dimension must be multiple of this 70 | #define CALDGEMM_MIN_TILE_DIM2 128 //Min dimension of a tile 71 | #define CALDGEMM_MIN_CORRECTION_SIZE 768 //Min tile size used to calculate correction ratio for tile distribution 72 | 73 | //#define CALDGEMM_FORCE_K 16 //Force K Parameter to simulate different kernel performance 74 | 75 | #define _NO_AMD_CPU //Set to run on CPU without 3dnow (which nowadays also includes AMD CPUs) 76 | #define _NO_AVX //Do not use AVX instructions (Only relevant for OpenCL 
code atm) 77 | #define _NO_ADL //Do not use ADL library to read GPU temps 78 | //#define _NO_AFFINITY //Disable affinity setting 79 | //#define USE_OLD_HUGE_MALLOC //Use old method to allocate huge tables 80 | //#define VTRACE 81 | 82 | #define CALDGEMM_USE_VEC_MEMCPY_PREFETCH //Use prefetching in Divide / Merge Buffer 83 | #define CALDGEMM_STREAMING_STORES_DIVIDE //Use streaming stores in Divide Buffer 84 | #define CALDGEMM_STREAMING_STORES_MERGE //Use streaming stores in Merge buffer 85 | #define CALDGEMM_PREFETCH_MERGE_STORES //Use prefetching in Merge buffer even when using streaming stores 86 | #define CALDGEMM_MERGE_NOPS 20 //Add nops to slow down merge process freeing resources for other tasks 87 | //#define CALDGEMM_MERGE_FLUSH 88 | 89 | //#define CALDGEMM_LDAB_INC 1 //Inc for LDA and LDB to avoid bank conflicts 90 | //#define CALDGEMM_LDB_INC 0 //Override LDAB_INC for LDB 91 | //#define CALDGEMM_LDC_INC 0 //see above 92 | 93 | //#define CALDGEMM_DIVIDE_STATIC_BUFFER //Allocate tmpBuffer for divide statically once and for all 94 | #define CALDGEMM_DIVIDE_BLOCKING 128 //Blocking size for divideBuffer with SHIFT_TEXTURE = 1 (larger multiple of two) 95 | //#define CALDGEMM_DIVIDE_TRANSPOSE_TWOPHASE //Perform dividebuffer transposition in two phases such that fewer write combining buffers are used, Only works for 2 input buffers per matrix with A transposed! 
96 | #define CALDGEMM_TRANSPOSE_BLOCKING 8 //Blocking factor for the transposition (multiple of 2) 97 | 98 | //#define CALDGEMM_QUERY_ALL_EVENTS //Query for all events, not only the last one in a queue 99 | //#define CALDGEMM_USE_CAL_WAIT_FOR_EVENTS //Use different method for querying CAL events 100 | //#define CALDGEMM_USE_CAL_WAIT_FOR_EVENTS_NO_POLL //Do not use active wait to reduce CPU utilization 101 | 102 | //Settings for integrated OpenCL kernels, 3rd party kernels from -Ol library must override this 103 | #define OCL_TILING_X 4 104 | #define OCL_TILING_Y 4 105 | #define OCL_TILED_KERNEL 106 | #define OCL_USE_SIMPLE_BUFFERS 107 | #define OCL_GROUP_SIZE_X 8 108 | #define OCL_GROUP_SIZE_Y 8 109 | 110 | //Custom header files for optimized height parameters 111 | //#define CALDGEMM_CUSTOM_AUTO_HEIGHT "auto_height.h" //Can define a custom header file that is included in caldgemm, that handles autoheight feature 112 | //#define CALDGEMM_CUSTOM_HEIGHT_MOD "height_mod.h" //Same for posterior height adaptation 113 | 114 | //#define CALDGEMM_OPENCL_EMULATE_STRIDED //Emulate strided transfers in OpenCL via linear transfers 115 | #define CALDGEMM_OPENCL_USE_ORIGINAL_POINTERS //Use the original pointers returned by clEnqueueMapBuffer for the DMA transfers and supply an origin parameter for the correct offset 116 | #define CALDGEMM_OPENCL_PROFILED_PIPELINE 0 //Use a profiling command queue to get timing information in pipelined runs. 117 | -------------------------------------------------------------------------------- /caldgemm_config_load.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Utility header to complete configuration given in caldgemm_config.h 3 | * 4 | * Copyright 2015: 5 | * - David Rohr (drohr@jwdt.org) 6 | * - Matthias Bach (bach@compeng.uni-frankfurt.de) 7 | * - Matthias Kretz (kretz@compeng.uni-frankfurt.de) 8 | * 9 | * This file is part of CALDGEMM. 
10 | * 11 | * CALDGEMM is free software: you can redistribute it and/or modify 12 | * it under the terms of the GNU Lesser General Public License as published by 13 | * the Free Software Foundation, either version 3 of the License, or 14 | * (at your option) any later version. 15 | * 16 | * CALDGEMM is distributed in the hope that it will be useful, 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | * GNU Lesser General Public License for more details. 20 | * 21 | * You should have received a copy of the GNU Lesser General Public License 22 | * along with CALDGEMM. If not, see . 23 | */ 24 | 25 | #include "caldgemm_config.h" 26 | 27 | #ifdef CALDGEMM_COMPUTE_SHADER 28 | #define CALDGEMM_USE_MEMEXPORT 29 | #endif 30 | 31 | 32 | #ifdef CALDGEMM_44_BT_64 33 | #ifdef CALDGEMM_88 34 | #undef CALDGEMM_88 35 | #endif 36 | #ifdef CALDGEMM_84 37 | #undef CALDGEMM_84 38 | #endif 39 | #ifdef CALDGEMM_48 40 | #undef CALDGEMM_48 41 | #endif 42 | #ifdef CALDGEMM_TRANSPOSED_A 43 | #undef CALDGEMM_TRANSPOSED_A 44 | #endif 45 | #define CALDGEMM_TRANSPOSED_B 46 | #define CALDGEMM_44 47 | #ifndef CALDGEMM_44_BT_64_CONVERT 48 | #define CALDGEMM_44_BT_64_KERNEL 49 | #endif 50 | #endif 51 | 52 | #ifdef CALDGEMM_88 53 | #define CALDGEMM_84 54 | #define CALDGEMM_48 55 | #endif 56 | 57 | #if defined(CALDGEMM_84) | defined(CALDGEMM_48) 58 | #define CALDGEMM_44 59 | #define CALDGEMM_USE_MEMEXPORT 60 | #ifndef CALDGEMM_TRANSPOSED_A 61 | #define CALDGEMM_TRANSPOSED_A 62 | #warning Setting CALDGEMM_TRANSPOSED_A for 8x?/?x8 CAL tiling 63 | #endif 64 | #ifdef CALDGEMM_TRANSPOSED_B 65 | #warning Unsetting CALDGEMM_TRANSPOSED_B for 8x?/?x8 CAL tiling 66 | #undef CALDGEMM_TRANSPOSED_B 67 | #endif 68 | #endif 69 | 70 | #ifdef CALDGEMM_44 71 | #ifdef CALDGEMM_TRANSPOSED_B 72 | #ifdef CALDGEMM_TRANSPOSED_A 73 | #warning Unsetting CALDGEMM_TRANSPOSED_A for != 8x2 CAL tiling 74 | #undef CALDGEMM_TRANSPOSED_A 75 | 
#endif 76 | #else 77 | #ifndef CALDGEMM_TRANSPOSED_A 78 | #warning Setting CALDGEMM_TRANSPOSED_A for != 8x2 CAL tiling 79 | #define CALDGEMM_TRANSPOSED_A 80 | #endif 81 | #endif 82 | #endif 83 | 84 | #if defined(CALDGEMM_DIAGONAL_TEXTURE) & (!defined(CALDGEMM_44) | defined(CALDGEMM_84) | defined(CALDGEMM_48) | !defined(CALDGEMM_TRANSPOSED_A)) 85 | #undef CALDGEMM_DIAGONAL_TEXTURE 86 | #endif 87 | 88 | #if defined(CALDGEMM_DUAL_ENTRY) & (!defined(CALDGEMM_44) | defined(CALDGEMM_84) | defined(CALDGEMM_48) | !defined(CALDGEMM_TRANSPOSED_A)) 89 | #undef CALDGEMM_DUAL_ENTRY 90 | #endif 91 | 92 | #if defined(CALDGEMM_SINGLE_BUFFER) | defined(CALDGEMM_DOUBLE_BUFFERS) 93 | #if !defined(CALDGEMM_44) | defined(CALDGEMM_48) | defined(CALDGEMM_84) | !defined(CALDGEMM_DUAL_ENTRY) | defined(CALDGEMM_TRANSPOSED_B) 94 | #error Invalid options for CALDGEMM_SINGLE_BUFFER/CALDGEMM_DOUBLE_BUFFERS 95 | #endif 96 | #endif 97 | 98 | 99 | #if defined(CALDGEMM_48) | !defined(CALDGEMM_44) 100 | #define TILING_Y 8 101 | #else 102 | #define TILING_Y 4 103 | #endif 104 | 105 | #if defined(CALDGEMM_84) 106 | #define TILING_X 8 107 | #elif defined(CALDGEMM_44) 108 | #define TILING_X 4 109 | #else 110 | #define TILING_X 2 111 | #endif 112 | 113 | #ifdef CALDGEMM_LDAB_INC 114 | #define CALDGEMM_LDA_INC CALDGEMM_LDAB_INC 115 | #ifndef CALDGEMM_LDB_INC 116 | #define CALDGEMM_LDB_INC CALDGEMM_LDAB_INC 117 | #endif 118 | #endif 119 | 120 | #ifdef CALDGEMM_SHIFT_TEXTURE 121 | #if defined(CALDGEMM_LDA_INC) & CALDGEMM_LDA_INC < CALDGEMM_SHIFT_TEXTURE 122 | #undef CALDGEMM_LDA_INC 123 | #endif 124 | #if defined(CALDGEMM_LDB_INC) & CALDGEMM_LDB_INC < CALDGEMM_SHIFT_TEXTURE 125 | #undef CALDGEMM_LDB_INC 126 | #endif 127 | #ifndef CALDGEMM_LDA_INC 128 | #define CALDGEMM_LDA_INC CALDGEMM_SHIFT_TEXTURE 129 | #endif 130 | #ifndef CALDGEMM_LDB_INC 131 | #define CALDGEMM_LDB_INC CALDGEMM_SHIFT_TEXTURE 132 | #endif 133 | #endif 134 | 135 | #if defined(CALDGEMM_SINGLE_BUFFER_IMPROVED) & 
!defined(CALDGEMM_SINGLE_BUFFER) 136 | #undef CALDGEMM_SINGLE_BUFFER 137 | #endif 138 | 139 | #ifdef CALDGEMM_DIVIDE_STATIC_BUFFER 140 | #ifdef _WIN32 141 | #define CALDGEMM_DIVBUFA ,double* tmpBuffer 142 | #else 143 | #define CALDGEMM_DIVBUFA ,double* __restrict__ tmpBuffer 144 | #endif 145 | #define CALDGEMM_DIVBUFB , tmpBuffer 146 | #else 147 | #define CALDGEMM_DIVBUFA 148 | #define CALDGEMM_DIVBUFB 149 | #endif 150 | -------------------------------------------------------------------------------- /caldgemm_cpu.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * CPU side of CALDGEMM implementation. 3 | * 4 | * Copyright 2015: 5 | * - David Rohr (drohr@jwdt.org) 6 | * - Matthias Bach (bach@compeng.uni-frankfurt.de) 7 | * - Matthias Kretz (kretz@compeng.uni-frankfurt.de) 8 | * 9 | * This file is part of CALDGEMM. 10 | * 11 | * CALDGEMM is free software: you can redistribute it and/or modify 12 | * it under the terms of the GNU Lesser General Public License as published by 13 | * the Free Software Foundation, either version 3 of the License, or 14 | * (at your option) any later version. 15 | * 16 | * CALDGEMM is distributed in the hope that it will be useful, 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | * GNU Lesser General Public License for more details. 20 | * 21 | * You should have received a copy of the GNU Lesser General Public License 22 | * along with CALDGEMM. If not, see . 
23 | */ 24 | 25 | #include "caldgemm_cpu.h" 26 | /* Construction only forwards to the caldgemm base constructor. */ 27 | caldgemm_cpu::caldgemm_cpu() : caldgemm() 28 | { 29 | } 30 | /* Nothing to release here. */ 31 | caldgemm_cpu::~caldgemm_cpu() 32 | { 33 | } 34 | /* No-op wait: in debug mode logs that the wait is skipped (b is printed as the device, a as the obuffer); always returns 0. */ 35 | int caldgemm_cpu::WaitForEvent(int a, int b, int) 36 | { 37 | if (Config->Debug) fprintf(STD_OUT, "\tSkipping waiting for event from device %d obuffer %d...\n", b, a); 38 | return(0); 39 | } 40 | /* Prints the startup banner unless Config->Quiet, then sets nDevices and gpu_available to 0; nocalinit is not used. Always returns 0. */ 41 | int caldgemm_cpu::Initialize(bool nocalinit) 42 | { 43 | if (!Config->Quiet) fprintf(STD_OUT, "Initializing CALDGEMM (CPU Runtime)\n"); 44 | if (Config->Debug) fprintf(STD_OUT, "CALDGEMM_CPU Initialice\n"); 45 | 46 | nDevices = 0; 47 | gpu_available = 0; 48 | 49 | return(0); 50 | } 51 | /* Clears Config->GPU_C and applies SetDefaultKernelSettings(); always returns 0. */ 52 | int caldgemm_cpu::ValidateRuntime() 53 | { 54 | if (Config->Debug) fprintf(STD_OUT, "CALDGEMM_CPU ValidateRuntime\n"); 55 | Config->GPU_C = false; 56 | SetDefaultKernelSettings(); 57 | return(0); 58 | } 59 | /* Stub: emits a debug trace only and returns 0. */ 60 | int caldgemm_cpu::CheckDevices() 61 | { 62 | if (Config->Debug) fprintf(STD_OUT, "CALDGEMM_CPU CheckDevices\n"); 63 | return(0); 64 | } 65 | /* Stub: emits a debug trace only and returns 0. */ 66 | int caldgemm_cpu::InitDevices() 67 | { 68 | if (Config->Debug) fprintf(STD_OUT, "CALDGEMM_CPU InitDevices\n"); 69 | 70 | return(0); 71 | } 72 | /* Stub: emits a debug trace only and returns 0. */ 73 | int caldgemm_cpu::ReinitDevices() 74 | { 75 | if (Config->Debug) fprintf(STD_OUT, "CALDGEMM_CPU ReinitDevices\n"); 76 | return(0); 77 | } 78 | /* Stub: alpha is ignored; emits a debug trace only and returns 0. */ 79 | int caldgemm_cpu::InitConstantData(double alpha) 80 | { 81 | if (Config->Debug) fprintf(STD_OUT, "CALDGEMM_CPU InitConstantData\n"); 82 | return(0); 83 | } 84 | /* Not expected to be called for this backend: always prints an error and returns 1. */ 85 | int caldgemm_cpu::ExecuteKernels(caldgemm::DGEMMPrepareAndExecuteTask& Task, int blockm, int blockn) 86 | { 87 | if (Config->Debug) fprintf(STD_OUT, "CALDGEMM_CPU ExecuteKernels\n"); 88 | 89 | fprintf(STD_OUT, "Error: DGEMMPrepareAndExecuteTask shoul never be executed for CALDGEMM_CPU\n"); 90 | 91 | return(1); 92 | } 93 | /* Stub: emits a debug trace only and returns 0. */ 94 | int caldgemm_cpu::ExitRuntime() 95 | { 96 | if (Config->Debug) fprintf(STD_OUT, "CALDGEMM_CPU ExitRuntime\n"); 97 | 98 | return(0); 99 | } 100 | /* Stub: all arguments are ignored; emits a debug trace only and returns 0. */ 101 | int 
caldgemm_cpu::FetchResult(int device, int j, int m, int n, int mustlock) 102 | { 103 | if (Config->Debug) fprintf(STD_OUT, "CALDGEMM_CPU FetchResult\n"); 104 | return(0); 105 | } 106 | /* Stub: returns 0 without any output. */ 107 | int caldgemm_cpu::CheckDMAQueue(int device, int forcej) 108 | { 109 | return(0); 110 | } 111 | /* Stub: all arguments are ignored; emits a debug trace only and returns 0. */ 112 | int caldgemm_cpu::RunMergeBuffers(double* dst, int device, int j, int width, int height, int gpu_width, int gpu_height, int pitch) 113 | { 114 | if (Config->Debug) fprintf(STD_OUT, "CALDGEMM_CPU RunMergeBuffers\n"); 115 | return(0); 116 | } 117 | /* Stub: logs k, j and num_device in debug mode and returns 0; the remaining arguments are ignored. */ 118 | int caldgemm_cpu::DGEMM_prepare_backend(size_t k, int j, unsigned int num_device, bool prepareM, bool prepareN, bool buffersSufficiant, bool buffersSufficiant0 CALDGEMM_DIVBUFA) 119 | { 120 | if (Config->Debug) fprintf(STD_OUT, "CALDGEMM_CPU DGEMM_prepare k=%lld j=%d device=%d\n", (long long int) k, j, num_device); 121 | 122 | return(0); 123 | } 124 | /* Stub: emits a debug trace only and returns 0. */ 125 | int caldgemm_cpu::ExitDevices() 126 | { 127 | if (Config->Debug) fprintf(STD_OUT, "CALDGEMM_CPU ExitDevices\n"); 128 | 129 | return(0); 130 | } 131 | /* The three Use* queries below all return 0 for this backend. */ 132 | int caldgemm_cpu::UseOutputPthreads() {return(0);} 133 | int caldgemm_cpu::UseInputPthreads() {return(0);} 134 | int caldgemm_cpu::UseMutexPerDevice() {return(0);} 135 | -------------------------------------------------------------------------------- /caldgemm_cpu.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Interface of the CALDGEMM library. 3 | * 4 | * Copyright 2015: 5 | * - David Rohr (drohr@jwdt.org) 6 | * - Matthias Bach (bach@compeng.uni-frankfurt.de) 7 | * - Matthias Kretz (kretz@compeng.uni-frankfurt.de) 8 | * 9 | * This file is part of CALDGEMM. 10 | * 11 | * CALDGEMM is free software: you can redistribute it and/or modify 12 | * it under the terms of the GNU Lesser General Public License as published by 13 | * the Free Software Foundation, either version 3 of the License, or 14 | * (at your option) any later version. 
15 | * 16 | * CALDGEMM is distributed in the hope that it will be useful, 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | * GNU Lesser General Public License for more details. 20 | * 21 | * You should have received a copy of the GNU Lesser General Public License 22 | * along with CALDGEMM. If not, see . 23 | */ 24 | 25 | #ifndef caldgemm_cpu_H 26 | #define caldgemm_cpu_H 27 | 28 | #include "caldgemm.h" 29 | 30 | class caldgemm_cpu : public caldgemm 31 | { 32 | public: 33 | caldgemm_cpu(); 34 | virtual ~caldgemm_cpu(); 35 | 36 | private: 37 | virtual int UseOutputPthreads(); 38 | virtual int UseInputPthreads(); 39 | virtual int UseMutexPerDevice(); 40 | 41 | virtual int DGEMM_prepare_backend(size_t k, int j, unsigned int num_device, bool prepareM, bool prepareN, bool buffersSufficiant, bool buffersSufficiant0 CALDGEMM_DIVBUFA); 42 | virtual int Initialize (bool nocalinit); 43 | virtual int ValidateRuntime(); 44 | virtual int CheckDevices(); 45 | virtual int InitDevices(); 46 | virtual int ReinitDevices(); 47 | virtual int InitConstantData(double alpha); 48 | virtual int ExecuteKernels(caldgemm::DGEMMPrepareAndExecuteTask& Task, int blockm, int blockn); 49 | virtual int ExitRuntime(); 50 | virtual int ExitDevices(); 51 | virtual int WaitForEvent(int, int, int); 52 | virtual int FetchResult(int device, int j, int m, int n, int mustlock = 0); 53 | virtual int CheckDMAQueue(int device, int forcej = -1); 54 | virtual int RunMergeBuffers(double* dst, int device, int j, int width, int height, int gpu_width, int gpu_height, int pitch); 55 | }; 56 | 57 | #endif 58 | -------------------------------------------------------------------------------- /caldgemm_cuda.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Interface of the CALDGEMM library. 
/**
 * CUDA flavour of the caldgemm class.
 *
 * Derives from caldgemm and declares the set of virtual backend hooks together with the
 * CUDA-side state (streams, events, device buffer pointers). Only declarations are visible
 * in this header; the behaviour of the individual hooks is defined elsewhere
 * (presumably caldgemm_cuda.cu - confirm).
 */
class caldgemm_cuda : public caldgemm
{
public:
	caldgemm_cuda();
	virtual ~caldgemm_cuda();

private:
	//Capability queries, each returning an int (semantics of the value are defined by the implementation, not visible here)
	virtual int UseOutputPthreads();
	virtual int UseInputPthreads();
	virtual int UseMutexPerDevice();

	//Backend hooks for initialization, kernel execution, result transfer and shutdown
	virtual int DGEMM_prepare_backend(size_t k, int j, unsigned int num_device, bool prepareM, bool prepareN, bool buffersSufficiant, bool buffersSufficiant0 CALDGEMM_DIVBUFA);
	virtual int Initialize (bool nocalinit);
	virtual int ValidateRuntime();
	virtual int CheckDevices();
	virtual int InitDevices();
	virtual int ReinitDevices();
	virtual int InitConstantData(double alpha);
	virtual int ExecuteKernels(caldgemm::DGEMMPrepareAndExecuteTask& Task, int blockm, int blockn);
	virtual int ExitRuntime();
	virtual int ExitDevices();
	virtual int WaitForEvent(int, int, int);
	virtual int FetchResult(int device, int j, int m, int n, int mustlock = 0);
	virtual int CheckDMAQueue(int device, int forcej = -1);
	virtual int RunMergeBuffers(double* dst, int device, int j, int width, int height, int gpu_width, int gpu_height, int pitch);
	virtual int RunCALDGEMM_Init();
	virtual int RunCALDGEMM_Exit();

	//Host memory management hooks; nDoubles is a count of doubles, not bytes (per the parameter name - confirm in the implementation)
	virtual double* AllocMemory(size_t nDoubles, bool page_locked, bool huge_pages, bool gpuaccessible = false, bool interleave = false);
	virtual int FreeMemory(double* ptr, bool gpuaccessible = false);
	virtual int Preallocate();
	virtual int PreallocateFree();
	virtual int SimpleQueuingAvailable();

	//State of the "simple queue" mode: events are tracked per device, and per input / output buffer where indexed so
	void SetupSimpleQueue(size_t mb, size_t nb);
	struct caldgemm_cuda_simple_queue_event
	{
		cudaEvent_t event;
		int num_queue;
	};
	caldgemm_cuda_simple_queue_event* simple_queue_events[max_devices][2]; //2 for m and n direction
	bool* simple_queue_event_requested[max_devices][obuffercount][2];
	cudaEvent_t simple_queue_event_kernels[max_devices][ibuffercount][obuffercount];
	bool simple_queue_event_kernels_used[max_devices][ibuffercount][obuffercount];
	//Event plus a flag; the flag presumably records whether the event is currently in use (confirm)
	struct alternateSimpleQueueCBuffferEventStruct
	{
		cudaEvent_t event;
		bool used;
	};
	cudaEvent_t alternateSimpleQueueCopyCEvent[max_devices][obuffercount];
	alternateSimpleQueueCBuffferEventStruct alternateSimpleQueueCBuffferEvent[max_devices][obuffercount];
	cudaEvent_t alternateSimpleQueueEvent_tmp_abuffers[max_devices][obuffercount];
	cudaEvent_t alternateSimpleQueueEvent_tmp_bbuffers[max_devices][obuffercount];
	bool alternateSimpleQueueEvent_tmp_abuffers_used[max_devices][obuffercount];
	bool alternateSimpleQueueEvent_tmp_bbuffers_used[max_devices][obuffercount];
	cudaEvent_t alternateSimpleQueueTmpEvents[2];

	cudaEvent_t* AlternateLookaheadTilesRemainingSQ_events;
	virtual int CheckAlternateTilesRemainingSQ();
	qSem AlternateLookaheadDoneMutexSQ;

	//Per-device CUDA handles; the stream array holds obuffercount + 2 streams per device
	int cuda_devices[max_devices];
	cudaStream_t cuda_command_queues[max_devices][obuffercount + 2];
	//Untyped buffer pointers, one set per device (presumably device memory for the A / B / C matrices and temporaries - confirm)
	void* cuda_abuffers[max_devices][ibuffercount];
	void* cuda_bbuffers[max_devices][max_bbuffers];
	void* cuda_cbuffers[max_devices][obuffercount];
	void* cuda_tmp_abuffers[max_devices][obuffercount];
	void* cuda_tmp_bbuffers[max_devices][obuffercount];
	cudaEvent_t cuda_events[max_devices][obuffercount];
#ifdef CALDGEMM_CUDA_CUBLAS
	//Only present when the build defines CALDGEMM_CUDA_CUBLAS
	cublasHandle_t cublas_handles[max_devices];
#endif
	cudaEvent_t cuda_conversion_events[max_devices][2];
	int cuda_conversion_events_use[max_devices][2];

	int WaitForEventAndRelease(cudaEvent_t* pEvent);

	static const int GROUP_SIZE_X = 16, GROUP_SIZE_Y = 16, GROUP_COUNT_X = 16, GROUP_COUNT_Y = 16; //Group and block size for conversion kernels and for DGEMM kernel

	//Plain parameter bundle for a conversion kernel launch.
	//Note: the default constructor leaves all members uninitialized; the int width / height arguments are stored as size_t.
	struct conversionKernelTaskStruct
	{
		conversionKernelTaskStruct() {}
		conversionKernelTaskStruct(void* c1, void* c2, int c3, int c4, char c5) : dest_buffer_tmp(c1), dest_image(c2), arg_width(c3), arg_height(c4), myMat(c5) {}
		void* dest_buffer_tmp;
		void* dest_image;
		size_t arg_width;
		size_t arg_height;
		char myMat;
	};
};
/**
 * OpenCL flavour of the caldgemm class.
 *
 * Derives from caldgemm and declares the virtual backend hooks, a backend-specific
 * configuration class, and the OpenCL-side state (platform, context, queues, memory
 * objects, events, kernels). It also declares function pointers into an optional
 * external kernel library. Only declarations are visible in this header.
 */
class caldgemm_opencl : public caldgemm
{
public:
	caldgemm_opencl();
	virtual ~caldgemm_opencl();

	//Backend-specific configuration; Clone() returns a heap-allocated copy made with the copy constructor (caller owns it).
	//Note: kernelLib is a raw char*, so the implicitly generated copy duplicates only the pointer, not the string.
	class caldgemm_config_backend_opencl : public caldgemm_config_backend
	{
	public:
		virtual ~caldgemm_config_backend_opencl();
		caldgemm_config_backend_opencl();
		virtual int ParseBackendOptions(unsigned int argc, char** argv);
		virtual void printConfig(caldgemm_config_backend* oldConfig = NULL);
		virtual caldgemm_config_backend_opencl* Clone() const {return new caldgemm_config_backend_opencl(*this);}

		char* kernelLib;
		bool allowCPUDevice;
	};
	virtual caldgemm_config_backend* create_caldgemm_config_backend();

private:
	//Capability queries, each returning an int (semantics of the value are defined by the implementation, not visible here)
	virtual int UseOutputPthreads();
	virtual int UseInputPthreads();
	virtual int UseMutexPerDevice();
	virtual int AllowCPUFallback();
	virtual int SimpleQueuingAvailable();
	virtual int PipelinedModeAvailable();
	virtual int AsyncModeAvailable();

	//Backend hooks for initialization, kernel execution, result transfer and shutdown
	virtual int DGEMM_prepare_backend(size_t k, int j, unsigned int num_device, bool prepareM, bool prepareN, bool buffersSufficiant, bool buffersSufficiant0 CALDGEMM_DIVBUFA);
	virtual int Initialize (bool nocalinit);
	virtual int ValidateRuntime();
	virtual int CheckDevices();
	virtual int InitDevices();
	virtual int ReinitDevices();
	virtual int InitConstantData(double alpha);
	virtual int ExecuteKernels(caldgemm::DGEMMPrepareAndExecuteTask& Task, int blockm, int blockn);
	virtual int ExitRuntime();
	virtual int ExitDevices();
	virtual int WaitForEvent(int, int, int);
	virtual int FetchResult(int device, int j, int m, int n, int mustlock = 0);
	virtual int CheckDMAQueue(int device, int forcej = -1);
	virtual int RunMergeBuffers(double* dst, int device, int j, int width, int height, int gpu_width, int gpu_height, int pitch);
	virtual int RunCALDGEMM_Init();
	virtual int RunCALDGEMM_Exit();
	virtual int Preallocate();
	virtual int PreallocateFree();
	virtual int RunAsyncSingleTileDGEMM(const double* A, const double* B, double* C, double alpha, double beta, size_t m, size_t k, size_t n, size_t Apitch, size_t Bpitch, size_t Cpitch, bool orderColMajor, bool TransA, bool TransB);
	virtual int RunAsyncSingleTileDTRSM(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const size_t M, const size_t N, const double alpha, const double *A, const size_t lda, double *B, const size_t ldb);
	virtual int RunCALDGEMM_Finish();
	virtual int CheckParams();
	virtual int FinishDataInit();
	virtual void FinishDataFill();
	virtual int WaitForCALDGEMMProgress(size_t n);

	//Host memory management hooks; nDoubles is a count of doubles, not bytes (per the parameter name - confirm in the implementation)
	virtual double* AllocMemory(size_t nDoubles, bool page_locked, bool huge_pages, bool gpuaccessible = false, bool interleave = false);
	virtual int FreeMemory(double* ptr, bool gpuaccessible = false);

	virtual int CaldgemmCustomAutoHeight(size_t MaxGpuM, size_t MaxGpuN, int nDevices);
	virtual int CaldgemmCustomModHeight(size_t MOD_OVER, size_t MOD_GPU);

	void SetupSimpleQueue(size_t mb, size_t nb);

	//OpenCL handles. Command queues: at least 3 per device, more if obuffercount is larger.
	cl_platform_id ocl_platform;
	cl_device_id ocl_devices[max_devices + 1]; //+1 for cpu
	cl_context ocl_context;
	cl_command_queue ocl_command_queues[max_devices][obuffercount > 3 ? obuffercount : 3];
	cl_command_queue ocl_command_queue_cpu;
	//Memory objects; the meaning of the leading dimension of size 2 is not visible in this header (TODO document)
	cl_mem ocl_abuffers[2][max_devices][ibuffercount];
	cl_mem ocl_bbuffers[2][max_devices][max_bbuffers];
	cl_mem ocl_cbuffers[2][max_devices][obuffercount];
	//Temporary buffers are sized for the larger of ibuffercount and obuffercount
	cl_mem ocl_tmp_abuffers[2][max_devices][ibuffercount > obuffercount ? ibuffercount : obuffercount];
	cl_mem ocl_tmp_bbuffers[2][max_devices][ibuffercount > obuffercount ? ibuffercount : obuffercount];
	cl_mem ocl_tmp_cbuffers[max_devices][obuffercount];
	cl_event ocl_events[max_devices][obuffercount];
	cl_program ocl_program[5];
	cl_kernel ocl_kernel[max_devices][5];

	//Separate per-device queue, kernels and buffers (presumably used by the RunAsyncSingleTile* entry points - confirm)
	cl_command_queue ocl_async_queue[max_devices];
	cl_kernel ocl_async_kernel[max_devices][4];
	cl_mem ocl_async_buffers[max_devices][4];

	struct caldgemm_opencl_simple_queue_event
	{
		cl_event event;
		int num_queue;
	};

	//OpenCL extension of finishStruct carrying its own copy of the three marker event arrays declared below
	struct finishStructOpenCL : public finishStruct
	{
		virtual ~finishStructOpenCL() {}

		cl_event StartMarker[max_devices][obuffercount];
		cl_event MidMarker[max_devices][obuffercount];
		cl_event EndMarker[max_devices][obuffercount];

		bool MidMarkerDone, EndMarkerDone;
	};
	cl_event StartMarker[max_devices][obuffercount];
	cl_event MidMarker[max_devices][obuffercount];
	cl_event EndMarker[max_devices][obuffercount];
	bool MidMarkerCreated[max_devices][obuffercount];

	caldgemm_opencl_simple_queue_event* simple_queue_events[max_devices][2]; //2 for m and n direction
	bool* simple_queue_event_requested[max_devices][obuffercount][2];
	cl_event simple_queue_event_kernels[max_devices][ibuffercount][obuffercount];
	bool simple_queue_event_kernels_used[max_devices][ibuffercount][obuffercount];

	//Event plus bookkeeping flags; must_release presumably marks events this class has to release itself (confirm)
	struct alternateSimpleQueueCBuffferEventStruct
	{
		cl_event event;
		bool must_release;
		bool used;
	};

	//Plain parameter bundle for a conversion kernel launch; the default constructor leaves all members uninitialized
	struct conversionKernelTaskStruct
	{
		conversionKernelTaskStruct() {}
		conversionKernelTaskStruct(cl_mem c1, cl_mem* c2, int c3, int c4, int c5, cl_event* c6, cl_event c7, cl_event* c8, char c9) : dest_buffer_tmp(c1), dest_image(c2), arg_width(c3), arg_height(c4), arg_transpose(c5), ev(c6), ev2(c7), ev3(c8), myMat(c9) {}
		cl_mem dest_buffer_tmp;
		cl_mem* dest_image;
		int arg_width;
		int arg_height;
		int arg_transpose;
		cl_event* ev;
		cl_event ev2;
		cl_event* ev3;
		char myMat;
	};

	cl_event alternateSimpleQueueCopyCEvent[max_devices][obuffercount];
	alternateSimpleQueueCBuffferEventStruct alternateSimpleQueueCBuffferEvent[max_devices][obuffercount];
	cl_event alternateSimpleQueueEvent_tmp_abuffers[max_devices][obuffercount];
	cl_event alternateSimpleQueueEvent_tmp_bbuffers[max_devices][obuffercount];
	bool alternateSimpleQueueEvent_tmp_abuffers_used[max_devices][obuffercount];
	bool alternateSimpleQueueEvent_tmp_bbuffers_used[max_devices][obuffercount];

	bool pipelinedModeStartBarrierDone[max_devices][obuffercount];
	void pipelinedModeSetStartBarriers(unsigned int num_device, int j, int &nTransferEvents, cl_event* transferEvents, bool &freeTransferEvents);

	cl_event* AlternateLookaheadTilesRemainingSQ_events;
	virtual int CheckAlternateTilesRemainingSQ();
	qSem AlternateLookaheadDoneMutexSQ;

	//NOTE(review): the a / b pointer arrays are sized by ibuffercount while the matching cl_mem arrays above use
	//max(ibuffercount, obuffercount) - confirm that no index beyond ibuffercount is used with these
	double* ocl_tmp_abuffers_ptr[max_devices][ibuffercount];
	double* ocl_tmp_bbuffers_ptr[max_devices][ibuffercount];
	double* ocl_tmp_cbuffers_ptr[max_devices][obuffercount];

	cl_event ocl_conversion_events[max_devices][2];
	int ocl_conversion_events_use[max_devices][2];

	//Kernel source strings, defined outside this header
	static const char *OCLKernel, *OCLKernelALPHA1, *OCLKernelLinpack, *OCLConvertKernel, *OCLConvertKernelTex;

	int WaitForEventAndRelease(cl_event* pEvent, int lock = -1);
	int divideBuffer(double* src, size_t pitch_src, double* dest, size_t nSrcRows, size_t nSrcCols, bool transpose);

	static const int GROUP_SIZE_X = 16, GROUP_SIZE_Y = 16, GROUP_COUNT_X = 16, GROUP_COUNT_Y = 16; //Group size and count for conversion kernels.

	caldgemm_config_backend_opencl* config_backend;

	//Handle of an external kernel library plus function pointers into it (HINSTANCE is defined as void* on non-Windows builds above this class)
	HINSTANCE kernelLib;
	cl_kernel (*kernelLibCreate) (cl_context* context, int nDevices, cl_device_id* devices, int kernelType, int k, int betazero);
	void (*kernelLibQuerySettings) (int* tiling_x, int* tiling_y, bool* transposeA, bool* transposeB, bool* texture_buffers, int* group_size_x, int* group_size_y, int* min_tile_size, int* min_k);
	void (*kernelLibTerminate) ();
	size_t (*kernelLibSuggestedMaxHeight) ();
	size_t (*kernelLibGetAutoHeight) (size_t MaxGpuM, size_t MaxGpuN, int nDevices, size_t Width);
	void (*kernelLibModHeight) (size_t MOD_OVER, size_t MOD_GPU);
	int (*kernelLibInitialize) (cl_platform_id platform);

	cl_event last_device_kernel[max_devices];

public:
	static int GetMemoryInfo(cl_mem* mem, void** ptr, size_t* offset, const void* addr);

	//Associates a host pointer and size with an OpenCL memory object
	struct gpu_mem_struct_opencl
	{
		void* ptr;
		size_t size;
		cl_mem mem_obj;
	};
};
#else 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #endif 10 | #include 11 | #include "affinity.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "os_low_level_helper.h" 17 | 18 | #ifndef STD_OUT 19 | #define STD_OUT stdout 20 | #endif 21 | 22 | pid_t gettid() 23 | { 24 | #ifdef _WIN32 25 | return((pid_t) GetCurrentThreadId()); 26 | #else 27 | return((pid_t) syscall(SYS_gettid)); 28 | #endif 29 | } 30 | 31 | #ifdef _WIN32 32 | pid_t getpid() 33 | { 34 | return((pid_t) GetCurrentProcessId()); 35 | } 36 | #endif 37 | 38 | struct threadNameStruct 39 | { 40 | pid_t thread_id; 41 | std::string name; 42 | }; 43 | 44 | class lockClass 45 | { 46 | public: 47 | lockClass() {pthread_mutex_init(&lock, NULL);} 48 | ~lockClass() {pthread_mutex_destroy(&lock);threadNames.clear();} 49 | std::vector threadNames; 50 | pthread_mutex_t lock; 51 | }; 52 | 53 | static lockClass lockedVector; 54 | 55 | void setThreadName(const char* name) 56 | { 57 | threadNameStruct tmp; 58 | tmp.thread_id = gettid(); 59 | tmp.name = name; 60 | pthread_mutex_lock(&lockedVector.lock); 61 | lockedVector.threadNames.push_back(tmp); 62 | pthread_mutex_unlock(&lockedVector.lock); 63 | } 64 | 65 | const char* getThreadName(int tid, const char* defaultval) 66 | { 67 | #ifndef _WIN32 68 | if (tid == -1) tid = gettid(); 69 | for (size_t i = 0;i < lockedVector.threadNames.size();i++) 70 | { 71 | if (lockedVector.threadNames[i].thread_id == tid) 72 | { 73 | return(lockedVector.threadNames[i].name.c_str()); 74 | } 75 | } 76 | #endif 77 | return(defaultval); 78 | } 79 | 80 | void setUnknownNames(char* name) 81 | { 82 | pid_t pid = getpid(); 83 | #ifndef _WIN32 84 | char dirname[1024]; 85 | sprintf(dirname, "/proc/%d/task", (int) pid); 86 | DIR* dp = opendir(dirname); 87 | if (dp) 88 | { 89 | dirent* ent; 90 | while ((ent = readdir(dp)) != NULL) 91 | { 92 | pid_t tid = atoi(ent->d_name); 93 | if (tid != 0 && tid != pid) 94 | { 95 | int found = false; 96 | for (size_t i 
= 0;i < lockedVector.threadNames.size();i++) 97 | { 98 | if (lockedVector.threadNames[i].thread_id == tid) 99 | { 100 | found = true; 101 | } 102 | } 103 | if (found == false) 104 | { 105 | threadNameStruct tmp; 106 | tmp.thread_id = tid; 107 | tmp.name = name; 108 | lockedVector.threadNames.push_back(tmp); 109 | } 110 | } 111 | } 112 | } 113 | #endif 114 | } 115 | 116 | void setUnknownAffinity(int count, int* cores) 117 | { 118 | pid_t pid = getpid(); 119 | #ifndef _WIN32 120 | char dirname[1024]; 121 | sprintf(dirname, "/proc/%d/task", (int) pid); 122 | DIR* dp = opendir(dirname); 123 | if (dp) 124 | { 125 | dirent* ent; 126 | while ((ent = readdir(dp)) != NULL) 127 | { 128 | pid_t tid = atoi(ent->d_name); 129 | if (tid != 0 && tid != pid) 130 | { 131 | int found = false; 132 | for (size_t i = 0;i < lockedVector.threadNames.size();i++) 133 | { 134 | if (lockedVector.threadNames[i].thread_id == tid) 135 | { 136 | found = true; 137 | } 138 | } 139 | if (found == false) 140 | { 141 | cpu_set_t tmpset; 142 | CPU_ZERO(&tmpset); 143 | for (int i = 0;i < count;i++) CPU_SET(cores[i], &tmpset); 144 | sched_setaffinity(tid, sizeof(tmpset), &tmpset); 145 | } 146 | } 147 | } 148 | } 149 | #endif 150 | } 151 | 152 | void printThreadPinning() 153 | { 154 | pid_t pid = getpid(); 155 | #ifndef _WIN32 156 | char dirname[1024]; 157 | sprintf(dirname, "/proc/%d/task", (int) pid); 158 | DIR* dp = opendir(dirname); 159 | if (dp) 160 | { 161 | dirent* ent; 162 | fprintf(STD_OUT, "%12s", ""); 163 | for (int i = 0;i < get_number_of_cpu_cores();i++) 164 | { 165 | fprintf(STD_OUT, " %2d", i); 166 | } 167 | fprintf(STD_OUT, "\n"); 168 | 169 | while ((ent = readdir(dp)) != NULL) 170 | { 171 | pid_t tid = atoi(ent->d_name); 172 | if (tid != 0) 173 | { 174 | fprintf(STD_OUT, "Thread %5d", tid); 175 | cpu_set_t threadmask; 176 | sched_getaffinity(tid, sizeof(threadmask), &threadmask); 177 | for (int i = 0;i < get_number_of_cpu_cores();i++) 178 | { 179 | if (CPU_ISSET(i, &threadmask)) 180 | { 
181 | fprintf(STD_OUT, " X"); 182 | } 183 | else 184 | { 185 | fprintf(STD_OUT, " ."); 186 | } 187 | } 188 | fprintf(STD_OUT, " - "); 189 | const char* name = getThreadName(tid); 190 | fprintf(STD_OUT, "%s", name); 191 | if (CPU_COUNT(&threadmask) == 1) 192 | { 193 | for (int i = 0;i < get_number_of_cpu_cores();i++) 194 | { 195 | if (CPU_ISSET(i, &threadmask)) fprintf(STD_OUT, " - Pinned to core %d", i); 196 | } 197 | } 198 | char filename[1024]; 199 | sprintf(filename, "/proc/%d/task/%d/stat", (int) pid, (int) tid); 200 | FILE* fp = fopen(filename, "r"); 201 | if (fp != NULL) 202 | { 203 | char buffer[1024]; 204 | if (fgets(buffer, 1023, fp) == NULL) break; 205 | int count = 0; 206 | for (unsigned int i = 0;i < strlen(buffer);i++) 207 | { 208 | if (buffer[i] == ' ') 209 | { 210 | if (++count == 13) 211 | { 212 | int time; 213 | sscanf(&buffer[i + 1], "%d ", &time); 214 | fprintf(STD_OUT, " - Time: %d", time); 215 | break; 216 | } 217 | } 218 | } 219 | fclose(fp); 220 | } 221 | fprintf(STD_OUT, "\n"); 222 | } 223 | } 224 | closedir(dp); 225 | } 226 | #endif 227 | } 228 | -------------------------------------------------------------------------------- /cmodules/affinity.h: -------------------------------------------------------------------------------- 1 | #ifndef AFFINITY_H 2 | #define AFFINITY_H 3 | 4 | #ifdef _WIN32 5 | typedef HANDLE pid_t; 6 | #include "sched_affinity_win32_wrapper.h" 7 | #else 8 | #include 9 | #endif 10 | 11 | #ifdef __cplusplus 12 | extern "C" 13 | { 14 | #endif 15 | 16 | void setThreadName(const char* name); 17 | #ifdef __cplusplus 18 | const char* getThreadName(int tid = -1, const char* defaultval = "Unknown Thread"); 19 | #else 20 | const char* getThreadName(int tid, const char* defaultval); 21 | #endif 22 | void printThreadPinning(); 23 | void setUnknownNames(char* name); 24 | void setUnknownAffinity(int count, int* cores); 25 | 26 | inline int sched_setaffinity_set_core(int core) 27 | { 28 | cpu_set_t set; 29 | CPU_ZERO(&set); 30 | 
CPU_SET(core, &set); 31 | return sched_setaffinity(0, sizeof(set), &set); 32 | } 33 | 34 | pid_t gettid(); 35 | #ifdef _WIN32 36 | pid_t getpid(); 37 | #endif 38 | 39 | #ifdef __cplusplus 40 | } 41 | #endif 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /cmodules/get_private_profile.h: -------------------------------------------------------------------------------- 1 | #ifndef GET_PRIVATE_PROFILE_H 2 | #define GET_PRIVATE_PROFILE_H 3 | 4 | static inline longint GetPrivateProfileString(char* lpSectionName, char* lpKeyName, char* lpDefault, char* lpBuffer, DWORD size, char* configfile) 5 | { 6 | for (size_t i = 0;i < strlen(configfile);i++) if (configfile[i] == '\\') configfile[i] = '/'; 7 | FILE* cfgfile = fopen(configfile, "r"); 8 | if (cfgfile == NULL) 9 | { 10 | fprintf(stderr, "Error opening file %s\n", configfile); 11 | return(-1); 12 | } 13 | char linebuffer[1024]; 14 | bool correctsection = false; 15 | //fprintf(stderr, "Searching for %s in %s default %s\n", lpKeyName, lpSectionName, lpDefault); 16 | while (!feof(cfgfile)) 17 | { 18 | if (fgets(linebuffer, 1023, cfgfile) == NULL) break; 19 | if (linebuffer[0] == '[') 20 | { 21 | correctsection = strncmp(&linebuffer[1], lpSectionName, strlen(lpSectionName)) == 0 && linebuffer[strlen(lpSectionName) + 1] == ']'; 22 | continue; 23 | } 24 | if (!correctsection) continue; 25 | if (strncmp(linebuffer, lpKeyName, strlen(lpKeyName)) == 0) 26 | { 27 | char* tmpptr = &linebuffer[strlen(lpKeyName)]; 28 | while (*tmpptr == ' ') tmpptr++; 29 | if (*tmpptr != '=') continue; 30 | while (*(++tmpptr) == ' ') ; 31 | char* tmpptr2 = tmpptr; 32 | while (*tmpptr2 != 0 && *tmpptr2 != 10 && *tmpptr2 != 13) tmpptr2++; 33 | *tmpptr2 = 0; 34 | strncpy(lpBuffer, &tmpptr[0], size < strlen(tmpptr) ? size : strlen(tmpptr)); 35 | lpBuffer[size < strlen(tmpptr) ? 
size : strlen(tmpptr)] = 0; 36 | fclose(cfgfile); 37 | //fprintf(stderr, "Found: %s in %s: '%s'\n", lpKeyName, lpSectionName, lpBuffer); 38 | return(strlen(tmpptr)); 39 | } 40 | } 41 | if (lpDefault == NULL) *lpBuffer = 0; 42 | else 43 | { 44 | strncpy(lpBuffer, lpDefault, size < strlen(lpDefault) ? size : strlen(lpDefault)); 45 | lpBuffer[size < strlen(lpDefault) ? size : strlen(lpDefault)] = 0; 46 | } 47 | fclose(cfgfile); 48 | //fprintf(stderr, "Not found: %s in %s, using default: '%s' -> '%s'\n", lpKeyName, lpSectionName, lpDefault, lpBuffer); 49 | return(strlen(lpDefault)); 50 | } 51 | 52 | static inline longint GetPrivateProfileInt(char* lpSectionName, char* lpKeyName, int nDefault, char* configfile) 53 | { 54 | char linebuffer[16] = "0"; 55 | char strdefault[16]; 56 | sprintf(strdefault, "%d", nDefault); 57 | GetPrivateProfileString(lpSectionName, lpKeyName, strdefault, linebuffer, 15, configfile); 58 | return(atoi(linebuffer)); 59 | } 60 | 61 | static inline int GetPrivateProfileSectionNames(char* buffer, int buffersize, char* filename) 62 | { 63 | for (size_t i = 0;i < strlen(filename);i++) if (filename[i] == '\\') filename[i] = '/'; 64 | FILE* cfgfile = fopen(filename, "r"); 65 | if (cfgfile == NULL) 66 | { 67 | fprintf(stderr, "Error opening file %s\n", filename); 68 | return(-1); 69 | } 70 | char linebuffer[1024]; 71 | int nwritten = 0; 72 | while (!feof(cfgfile)) 73 | { 74 | if (fgets(linebuffer, 1023, cfgfile) == NULL) break; 75 | char* tmpptr = linebuffer; 76 | while (*tmpptr == ' ') tmpptr++; 77 | if (*tmpptr != '[') continue; 78 | char* sectptr = ++tmpptr; 79 | int section_len = 0; 80 | while (*tmpptr && *tmpptr != 10 && *tmpptr != 13) 81 | { 82 | if (*tmpptr != ']') 83 | { 84 | tmpptr++; 85 | section_len++; 86 | } 87 | else 88 | { 89 | if (nwritten + section_len + 2 < buffersize) 90 | { 91 | memcpy(&buffer[nwritten], sectptr, section_len); 92 | buffer[nwritten + section_len] = 0; 93 | buffer[nwritten + section_len + 1] = 0; 94 | } 95 | nwritten += 
/**
 * Poll stdin for a pending key press without consuming it.
 *
 * The terminal is switched to non-canonical mode with VMIN = 0 / VTIME = 1, so the read
 * waits at most one tenth of a second, and is restored afterwards. A character that was
 * read is pushed back with ungetc() so that a following getch() / getchar() returns it.
 *
 * @return 1 if a character is available, 0 otherwise.
 */
static inline int kbhit()
{
	const int fd = STDIN_FILENO;
	struct termios oterm;
	//If stdin is not a terminal there is nothing to reconfigure and nothing valid to restore
	const int isTerminal = (tcgetattr(fd, &oterm) == 0);
	if (isTerminal)
	{
		struct termios term = oterm;
		//Clear only canonical mode and echo, as getch() does. The former "& (!ICANON)" evaluated to
		//"& 0" and wiped every local flag, including ISIG (Ctrl-C handling).
		term.c_lflag &= ~(ICANON | ECHO);
		term.c_cc[VMIN] = 0;
		term.c_cc[VTIME] = 1;
		tcsetattr(fd, TCSANOW, &term);
	}
	const int c = getchar();
	if (isTerminal) tcsetattr(fd, TCSANOW, &oterm);
	if (c == EOF)
	{
		//The timeout is reported as end-of-file; reset the indicator so that later reads from stdin still work
		clearerr(stdin);
		return(0);
	}
	ungetc(c, stdin);
	return(1);
}
34 | if (retVal == WAIT_TIMEOUT) return(EBUSY); 35 | //printf("TRYLOCK %d\n", *mutex); 36 | if (retVal != WAIT_FAILED) return(0); 37 | return(1); 38 | } 39 | 40 | static inline int pthread_mutex_unlock(pthread_mutex_t *mutex) 41 | { 42 | //printf("UNLOCK %d\n", *mutex); 43 | return(ReleaseSemaphore(*mutex, 1, NULL) == 0); 44 | } 45 | 46 | static inline int pthread_mutex_destroy(pthread_mutex_t *mutex) 47 | { 48 | return(CloseHandle(*mutex) == 0); 49 | } 50 | 51 | static inline int pthread_create(pthread_t *thread, const void* attr, void *(*start_routine)(void*), void *arg) 52 | { 53 | return((*thread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) start_routine, arg, 0, NULL)) == 0); 54 | } 55 | 56 | static inline int pthread_exit(void* ret) 57 | { 58 | ExitThread((DWORD) (size_t) ret); 59 | } 60 | 61 | static inline int pthread_join(pthread_t thread, void** retval) 62 | { 63 | static DWORD ExitCode; 64 | while (GetExitCodeThread(thread, &ExitCode) == STILL_ACTIVE) Sleep(0); 65 | if (retval != NULL) *retval = (void*) &ExitCode; 66 | return(0); 67 | } 68 | 69 | static inline int sem_init(sem_t *sem, int pshared, unsigned int value) 70 | { 71 | *sem = CreateSemaphore(NULL, value, 1024, NULL); 72 | return((*sem) == NULL); 73 | } 74 | 75 | static inline int sem_destroy(sem_t *sem) 76 | { 77 | return(CloseHandle(*sem) == 0); 78 | } 79 | 80 | static inline int sem_wait(sem_t *sem) 81 | { 82 | return(WaitForSingleObject(*sem, INFINITE) == WAIT_FAILED); 83 | } 84 | 85 | static inline int sem_trywait(sem_t *sem) 86 | { 87 | DWORD retVal = WaitForSingleObject(*sem, 0); 88 | if (retVal == WAIT_TIMEOUT) return(EAGAIN); 89 | if (retVal != WAIT_FAILED) return(0); 90 | return(-1); 91 | } 92 | 93 | static inline int sem_post(sem_t *sem) 94 | { 95 | return(ReleaseSemaphore(*sem, 1, NULL) == 0); 96 | } 97 | 98 | #ifdef CMODULES_PTHREAD_BARRIERS 99 | 100 | /*typedef struct _RTL_BARRIER { 101 | DWORD Reserved1; 102 | DWORD Reserved2; 103 | ULONG_PTR Reserved3[2]; 104 | DWORD 
Reserved4; 105 | DWORD Reserved5; 106 | } RTL_BARRIER, *PRTL_BARRIER; 107 | 108 | typedef RTL_BARRIER SYNCHRONIZATION_BARRIER; 109 | typedef PRTL_BARRIER PSYNCHRONIZATION_BARRIER; 110 | typedef PRTL_BARRIER LPSYNCHRONIZATION_BARRIER; 111 | 112 | #define SYNCHRONIZATION_BARRIER_FLAGS_SPIN_ONLY 0x01 113 | #define SYNCHRONIZATION_BARRIER_FLAGS_BLOCK_ONLY 0x02 114 | #define SYNCHRONIZATION_BARRIER_FLAGS_NO_DELETE 0x04 115 | 116 | BOOL WINAPI EnterSynchronizationBarrier(_Inout_ LPSYNCHRONIZATION_BARRIER lpBarrier, _In_ DWORD dwFlags); 117 | BOOL WINAPI InitializeSynchronizationBarrier(_Out_ LPSYNCHRONIZATION_BARRIER lpBarrier, _In_ LONG lTotalThreads, _In_ LONG lSpinCount); 118 | BOOL WINAPI DeleteSynchronizationBarrier(_Inout_ LPSYNCHRONIZATION_BARRIER lpBarrier);*/ 119 | 120 | typedef SYNCHRONIZATION_BARRIER pthread_barrier_t; 121 | 122 | static inline int pthread_barrier_destroy(pthread_barrier_t* b) 123 | { 124 | return(DeleteSynchronizationBarrier(b) == 0); 125 | } 126 | 127 | static inline int pthread_barrier_init(pthread_barrier_t* b, void* attr, unsigned count) 128 | { 129 | return(InitializeSynchronizationBarrier(b, count, -1) == 0); 130 | } 131 | 132 | static inline int pthread_barrier_wait(pthread_barrier_t* b) 133 | { 134 | EnterSynchronizationBarrier(b, 0); 135 | return(0); 136 | } 137 | 138 | #endif 139 | 140 | #endif 141 | -------------------------------------------------------------------------------- /cmodules/qmalloc.cpp: -------------------------------------------------------------------------------- 1 | #include "qmalloc.h" 2 | 3 | #include 4 | #include 5 | 6 | #ifdef _WIN32 7 | #include 8 | #include 9 | #else //_WIN32 10 | #include 11 | #include 12 | #include 13 | #ifdef _NUMAIF_H 14 | #include 15 | #endif 16 | 17 | #ifndef MAP_HUGETLB 18 | #define MAP_HUGETLB 0x40000 /* arch specific */ 19 | #endif 20 | #ifndef MPOL_DEFAULT 21 | #define MPOL_DEFAULT 0 22 | #endif 23 | #ifndef MPOL_PREFERRED 24 | #define MPOL_PREFERRED 1 25 | #endif 26 | #ifndef 
MPOL_BIND 27 | #define MPOL_BIND 2 28 | #endif 29 | #ifndef MPOL_INTERLEAVE 30 | #define MPOL_INTERLEAVE 3 31 | #endif 32 | #endif //!_WIN32 33 | 34 | #ifndef STD_OUT 35 | #define STD_OUT stdout 36 | #endif 37 | 38 | int qmalloc::qMallocCount = 0; 39 | int qmalloc::qMallocUsed = 0; 40 | qmalloc::qMallocData* qmalloc::qMallocs = NULL; 41 | 42 | #ifdef _WIN32 43 | static void Privilege(TCHAR* pszPrivilege, BOOL bEnable) 44 | { 45 | HANDLE hToken; 46 | TOKEN_PRIVILEGES tp; 47 | BOOL status; 48 | DWORD error; 49 | 50 | if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken)) 51 | { 52 | fprintf(STD_OUT, "Error obtaining process token\n"); 53 | } 54 | 55 | if (!LookupPrivilegeValue(NULL, pszPrivilege, &tp.Privileges[0].Luid)) 56 | { 57 | fprintf(STD_OUT, "Error looking up priviledge value\n"); 58 | } 59 | 60 | tp.PrivilegeCount = 1; 61 | tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; 62 | 63 | status = AdjustTokenPrivileges(hToken, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0); 64 | 65 | error = GetLastError(); 66 | if (!status || (error != ERROR_SUCCESS)) 67 | { 68 | fprintf(STD_OUT, "Error obtaining Priviledge %d\n", GetLastError()); 69 | } 70 | 71 | CloseHandle(hToken); 72 | } 73 | #endif 74 | 75 | void* qmalloc::qMalloc(size_t size, bool huge, bool executable, bool locked, void* alloc_addr, int interleave) 76 | { 77 | int pagesize; 78 | void* addr; 79 | if (huge) 80 | { 81 | #ifdef _WIN32 82 | static int tokenObtained = 0; 83 | #ifdef _AMD64_ 84 | pagesize = GetLargePageMinimum(); 85 | #else 86 | pagesize = 1024 * 2048; 87 | #endif 88 | if (tokenObtained == 0) 89 | { 90 | fprintf(STD_OUT, "Obtaining security token\n"); 91 | Privilege(TEXT("SeLockMemoryPrivilege"), TRUE); 92 | tokenObtained = 1; 93 | } 94 | #else 95 | pagesize = 1024 * 2048; 96 | #endif 97 | } 98 | else 99 | { 100 | #ifdef _WIN32 101 | SYSTEM_INFO si; 102 | GetSystemInfo(&si); 103 | pagesize = si.dwPageSize; 104 | #else 105 | pagesize = sysconf(_SC_PAGESIZE); 106 
| #endif 107 | } 108 | if (size % pagesize) size += pagesize - size % pagesize; 109 | #ifdef _WIN32 110 | DWORD flags = MEM_COMMIT; 111 | if (huge) 112 | { 113 | flags |= MEM_LARGE_PAGES; 114 | } 115 | DWORD protect = PAGE_READWRITE; 116 | if (executable) 117 | { 118 | protect = PAGE_EXECUTE_READWRITE; 119 | } 120 | if (interleave) 121 | { 122 | fprintf(stderr, "Interleaved allocation not supported on Windows\n"); 123 | return(NULL); 124 | } 125 | if (alloc_addr != NULL) 126 | { 127 | if (VirtualAlloc(alloc_addr, size, (flags & ~MEM_COMMIT) | MEM_RESERVE, protect) != alloc_addr) 128 | { 129 | return(NULL); 130 | } 131 | } 132 | addr = VirtualAlloc(alloc_addr, size, flags, protect); 133 | #else 134 | int flags = MAP_ANONYMOUS | MAP_PRIVATE; 135 | int prot = PROT_READ | PROT_WRITE; 136 | if (huge) flags |= MAP_HUGETLB; 137 | if (executable) prot |= PROT_EXEC; 138 | if (locked) flags |= MAP_LOCKED; 139 | //unsigned long oldnodemask; 140 | //int oldpolicy; 141 | if (interleave && locked) //mmap will perform a memory lock, so we have to change memory policy beforehand 142 | { 143 | /* if (syscall(SYS_get_mempolicy, &oldpolicy, &oldnodemask, sizeof(oldnodemask) * 8, NULL, 0) != 0) 144 | { 145 | fprintf(stderr, "Error obtaining memory policy\n"); 146 | exit(1); 147 | }*/ 148 | unsigned long nodemask = 0xffffff; 149 | if (syscall(SYS_set_mempolicy, MPOL_INTERLEAVE, &nodemask, sizeof(nodemask) * 8) != 0) 150 | { 151 | fprintf(stderr, "Error setting memory policy\n"); 152 | } 153 | } 154 | addr = mmap(alloc_addr, size, prot, flags, 0, 0); 155 | if (addr == MAP_FAILED) addr = NULL; 156 | if (interleave) 157 | { 158 | if (locked) //Restore old memory policy 159 | { 160 | //syscall(SYS_set_mempolicy, oldpolicy, &oldnodemask, sizeof(oldnodemask) * 8); 161 | if (syscall(SYS_set_mempolicy, MPOL_DEFAULT, NULL) != 0) 162 | { 163 | fprintf(stderr, "Error setting memory policy\n"); 164 | } 165 | } 166 | else if (addr) //Set memory policy for region 167 | { 168 | #ifndef _NUMAIF_H 169 
| fprintf(stderr, "Interleaved memory can only be used with non-locked memory if numaif.h is present\n"); 170 | exit(1); 171 | #else 172 | unsigned long nodemask = 0xffffff; 173 | mbind(addr, size, MPOL_INTERLEAVE, &nodemask, sizeof(nodemask) * 8, 0); 174 | #endif 175 | } 176 | } 177 | #endif 178 | 179 | if (alloc_addr != NULL && addr != alloc_addr) 180 | { 181 | fprintf(stderr, "Could not allocate memory at desired address\n"); 182 | #ifdef _WIN32 183 | VirtualFree(addr, 0, MEM_RELEASE); 184 | #else 185 | munmap(addr, size); 186 | #endif 187 | return(NULL); 188 | } 189 | 190 | if (addr == NULL) 191 | { 192 | #ifdef _WIN32 193 | DWORD error = GetLastError(); 194 | #endif 195 | fprintf(stderr, "Failed to allocate memory\n"); 196 | return(NULL); 197 | } 198 | 199 | if (qMallocCount == qMallocUsed) 200 | { 201 | if (qMallocCount == 0) qMallocCount = 8; 202 | else if (qMallocCount < 1024) qMallocCount *= 2; 203 | else qMallocCount += 1024; 204 | if (qMallocUsed == 0) 205 | { 206 | qMallocs = (qMallocData*) malloc(qMallocCount * sizeof(qMallocData)); 207 | } 208 | else 209 | { 210 | qMallocs = (qMallocData*) realloc(qMallocs, qMallocCount * sizeof(qMallocData)); 211 | } 212 | } 213 | qMallocs[qMallocUsed].addr = addr; 214 | qMallocs[qMallocUsed].size = size; 215 | qMallocUsed++; 216 | 217 | #ifdef _WIN32 218 | if (locked) 219 | { 220 | size_t minp, maxp; 221 | HANDLE pid = GetCurrentProcess(); 222 | if (GetProcessWorkingSetSize(pid, (PSIZE_T) &minp, (PSIZE_T) &maxp) == 0) fprintf(STD_OUT, "Error getting minimum working set size\n"); 223 | if (SetProcessWorkingSetSize(pid, minp + size, maxp + size) == 0) fprintf(STD_OUT, "Error settings maximum working set size\n"); 224 | if (VirtualLock(addr, size) == 0) 225 | { 226 | fprintf(STD_OUT, "Error locking memory\n"); 227 | DWORD error = GetLastError(); 228 | VirtualFree(addr, 0, MEM_RELEASE); 229 | if (SetProcessWorkingSetSize(pid, minp, maxp) == 0) fprintf(STD_OUT, "Error settings maximum working set size\n"); 230 | addr = 
#include <string.h> //memcpy

// Returns true when val is a finite number, false for +/-infinity and NaN.
// A double is non-finite exactly when all eleven exponent bits (bits 52..62
// of the IEEE-754 binary64 encoding) are set, which is what the mask tests.
//
// The bit pattern is copied out with memcpy rather than read through a
// casted pointer: accessing a double through an unsigned long long lvalue
// violates the strict-aliasing rules, so an optimizing compiler is free to
// miscompile the cast-based form.
static inline bool qIsFinite(double val)
{
	const unsigned long long int exponentMask = 0x7FF0000000000000ULL;
	unsigned long long int ival;
	memcpy(&ival, &val, sizeof(ival));
	return((ival & exponentMask) != exponentMask);
}
#include 3 | #include 4 | #include 5 | #include 6 | 7 | qMultiAlloc::qMultiAlloc() 8 | { 9 | p = NULL; 10 | np = npalloc = 0; 11 | maxalign = 1024; 12 | ptr = NULL; 13 | } 14 | 15 | qMultiAlloc::~qMultiAlloc() 16 | { 17 | if (p) free(p); 18 | if (ptr) cudaFree(ptr); 19 | p = NULL; 20 | ptr = NULL; 21 | } 22 | 23 | void qMultiAlloc::AddAlloc(void** ptr, size_t size, size_t align) 24 | { 25 | if (np == npalloc) 26 | { 27 | if (npalloc == 0) 28 | { 29 | npalloc = 8; 30 | } 31 | else 32 | { 33 | npalloc *= 2; 34 | } 35 | 36 | if (p) 37 | { 38 | p = (ptr_struct*) realloc(p, npalloc * sizeof(ptr_struct)); 39 | } 40 | else 41 | { 42 | p = (ptr_struct*) malloc(npalloc * sizeof(ptr_struct)); 43 | } 44 | if (p == NULL) 45 | { 46 | printf("Memory Allocation Error\n"); 47 | exit(1); 48 | } 49 | } 50 | 51 | p[np].ptr = ptr; 52 | p[np].size = size; 53 | np++; 54 | if (align > maxalign) maxalign = align; 55 | } 56 | 57 | size_t qMultiAlloc::Allocate() 58 | { 59 | size_t size = 0; 60 | for (int i = 0;i < np;i++) 61 | { 62 | size += p[i].size; 63 | if (size % maxalign) size += maxalign - size % maxalign; 64 | } 65 | size += maxalign; 66 | 67 | checkCudaErrors(cudaMalloc(&ptr, size)); 68 | if (ptr == NULL) 69 | { 70 | np = 0; 71 | maxalign = 1024; 72 | return(0); 73 | } 74 | char* tmpp = (char*) ptr; 75 | for (int i = 0;i < np;i++) 76 | { 77 | if (((size_t) tmpp) % maxalign) tmpp += maxalign - ((size_t) tmpp) % maxalign; 78 | *p[i].ptr = tmpp; 79 | tmpp += p[i].size; 80 | } 81 | return(size); 82 | } 83 | 84 | void qMultiAlloc::Free() 85 | { 86 | cudaFree(ptr); 87 | ptr = 0; 88 | np = 0; 89 | maxalign = 1024; 90 | } 91 | -------------------------------------------------------------------------------- /cmodules/qmultialloc.h: -------------------------------------------------------------------------------- 1 | #ifndef QMULTIALLOC_H 2 | #define QMULTIALLOC_H 3 | #ifdef _WIN32 4 | #include 5 | #include 6 | #else 7 | #include 8 | #endif 9 | 10 | class qMultiAlloc 11 | { 12 | public: 13 | 
qMultiAlloc(); 14 | ~qMultiAlloc(); 15 | 16 | void AddAlloc(void** ptr, size_t size, size_t align = 1024); 17 | size_t Allocate(); 18 | void Free(); 19 | 20 | private: 21 | struct ptr_struct 22 | { 23 | void** ptr; 24 | size_t size; 25 | }; 26 | 27 | ptr_struct* p; 28 | int np; 29 | int npalloc; 30 | size_t maxalign; 31 | void* ptr; 32 | }; 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /cmodules/qsem.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "qsem.h" 5 | 6 | #ifndef STD_OUT 7 | #define STD_OUT stdout 8 | #endif 9 | 10 | qSem::qSem(int num) 11 | { 12 | max = num; 13 | if (sem_init(&sem, 0, num)) fprintf(STD_OUT, "Error initializing semaphore"); 14 | } 15 | 16 | qSem::~qSem() 17 | { 18 | if (sem_destroy(&sem)) fprintf(STD_OUT, "Error destroying semaphore"); 19 | } 20 | 21 | int qSem::Lock() 22 | { 23 | int retVal; 24 | if ((retVal = sem_wait(&sem))) fprintf(STD_OUT, "Error locking semaphore"); 25 | return(retVal); 26 | } 27 | 28 | int qSem::Unlock() 29 | { 30 | int retVal; 31 | if ((retVal = sem_post(&sem))) fprintf(STD_OUT, "Error unlocking semaphire"); 32 | return(retVal); 33 | } 34 | 35 | int qSem::Trylock() 36 | { 37 | int retVal = sem_trywait(&sem); 38 | if (retVal) 39 | { 40 | if (errno == EAGAIN) return(EBUSY); 41 | return(-1); 42 | } 43 | return(0); 44 | } 45 | 46 | #ifndef _WIN32 47 | int qSem::Query() 48 | { 49 | int value; 50 | if (sem_getvalue(&sem, &value) != 0) value = -1; 51 | return(value); 52 | } 53 | #endif 54 | -------------------------------------------------------------------------------- /cmodules/qsem.h: -------------------------------------------------------------------------------- 1 | #ifndef QSEM_H 2 | #define QSEM_H 3 | 4 | #ifdef _WIN32 5 | #include "pthread_mutex_win32_wrapper.h" 6 | #else 7 | #include 8 | #endif 9 | 10 | class qSem 11 | { 12 | public: 13 | qSem(int num = 1); 14 | ~qSem(); 15 | 
#ifndef QSWITCHTEMPLATE_H
#define QSWITCHTEMPLATE_H
// Q_SWITCH_TEMPLATE_BOOL(expr, varname, ...)
// Turns the runtime condition expr into a constant: the statement passed as
// the variadic argument is emitted twice, once in a scope where varname is
// `const int 1` (expr true) and once where it is `const int 0` (expr false).
// Presumably the point is to let the statement use varname as a template
// argument - confirm against the call sites.
// The statement is pasted verbatim into both branches, so it is compiled twice.
#define Q_SWITCH_TEMPLATE_BOOL(expr, varname, ...) \
	{ \
		if (expr) \
		{ \
			const int varname = 1; \
			__VA_ARGS__; \
		} \
		else \
		{ \
			const int varname = 0; \
			__VA_ARGS__; \
		} \
	}

// Q_SWITCH_TEMPLATE_CASE4(val, varname, ...)
// Same idea for a four-way selection: the statement is emitted once per case
// with varname defined as the constant 0, 1, 2 or 3 matching val.
// There is no default label, so for any other value of val the statement is
// not executed at all.
#define Q_SWITCH_TEMPLATE_CASE4(val, varname, ...) \
	switch (val) \
	{ \
		case 0: \
		{ \
			const int varname = 0; \
			__VA_ARGS__; \
			break; \
		} \
		case 1: \
		{ \
			const int varname = 1; \
			__VA_ARGS__; \
			break; \
		} \
		case 2: \
		{ \
			const int varname = 2; \
			__VA_ARGS__; \
			break; \
		} \
		case 3: \
		{ \
			const int varname = 3; \
			__VA_ARGS__; \
			break; \
		} \
	}


#endif
(S::*pFunc)(void*); 59 | }; 60 | 61 | template static void* qThreadWrapperCls(void* arg); 62 | 63 | template class qThreadCls 64 | { 65 | public: 66 | qThreadCls() {started = false;}; 67 | qThreadCls(S* pCls, void (S::*pFunc)(T*), int threadNum = 0, int pinCPU = -1) : threadParam() {started = false;SpawnThread(pCls, pFunc, threadNum, pinCPU);} 68 | 69 | void SpawnThread(S* pCls, void (S::*pFunc)(T*), int threadNum = 0, int pinCPU = -1, bool wait = true) 70 | { 71 | qThreadParamCls& XthreadParam = *((qThreadParamCls*) &this->threadParam); 72 | 73 | XthreadParam.pCls = pCls; 74 | XthreadParam.pFunc = (void (S::*)(void*)) pFunc; 75 | XthreadParam.threadNum = threadNum; 76 | XthreadParam.pinCPU = pinCPU; 77 | pthread_t thr; 78 | pthread_create(&thr, NULL, (void* (*) (void*)) &qThreadWrapperCls, &XthreadParam); 79 | if (wait) WaitForSpawn(); 80 | started = true; 81 | } 82 | 83 | void WaitForSpawn() 84 | { 85 | threadParam.threadMutex[1].Lock(); 86 | } 87 | 88 | ~qThreadCls() 89 | { 90 | if (started) 91 | { 92 | End(); 93 | } 94 | } 95 | 96 | void End() 97 | { 98 | qThreadParamCls& XthreadParam = *((qThreadParamCls*) &this->threadParam); 99 | 100 | XthreadParam.terminate = true; 101 | XthreadParam.threadMutex[0].Unlock(); 102 | XthreadParam.threadMutex[1].Lock(); 103 | started = false; 104 | } 105 | 106 | void Start() 107 | { 108 | threadParam.threadMutex[0].Unlock(); 109 | } 110 | 111 | void Sync() 112 | { 113 | threadParam.threadMutex[1].Lock(); 114 | } 115 | 116 | private: 117 | bool started; 118 | T threadParam; 119 | 120 | static void* qThreadWrapperCls(T* arg); 121 | }; 122 | 123 | template void* qThreadCls::qThreadWrapperCls(T* arg) 124 | { 125 | qThreadParamCls* const arg_A = (qThreadParamCls*) arg; 126 | if (arg_A->pinCPU != -1) 127 | { 128 | cpu_set_t tmp_mask; 129 | CPU_ZERO(&tmp_mask); 130 | CPU_SET(arg_A->pinCPU, &tmp_mask); 131 | sched_setaffinity(0, sizeof(tmp_mask), &tmp_mask); 132 | } 133 | 134 | void (S::*pFunc)(T*) = (void (S::*)(T*)) arg_A->pFunc; 135 
| (arg_A->pCls->*pFunc)(arg); 136 | 137 | arg_A->threadMutex[1].Unlock(); 138 | pthread_exit(NULL); 139 | return(NULL); 140 | } 141 | 142 | template class qThreadClsArray 143 | { 144 | public: 145 | qThreadClsArray() {pArray = NULL;nThreadsRunning = 0;} 146 | qThreadClsArray(int n, S* pCls, void (S::*pFunc)(T*), int threadNumOffset = 0, int* pinCPU = NULL) {pArray = NULL;nThreadsRunning = 0;SetNumberOfThreads(n, pCls, pFunc, threadNumOffset, pinCPU);} 147 | 148 | void SetNumberOfThreads(int n, S* pCls, void (S::*pFunc)(T*), int threadNumOffset = 0, int* pinCPU = NULL) 149 | { 150 | if (nThreadsRunning) 151 | { 152 | fprintf(STD_OUT, "Threads already started\n");throw(qThreadServerException()); 153 | } 154 | pArray = new qThreadCls[n]; 155 | nThreadsRunning = n; 156 | for (int i = 0;i < n;i++) 157 | { 158 | pArray[i].SpawnThread(pCls, pFunc, threadNumOffset + i, pinCPU == NULL ? -1 : pinCPU[i], false); 159 | } 160 | for (int i = 0;i < n;i++) 161 | { 162 | pArray[i].WaitForSpawn(); 163 | } 164 | } 165 | 166 | ~qThreadClsArray() 167 | { 168 | if (nThreadsRunning) 169 | { 170 | EndThreads(); 171 | } 172 | } 173 | 174 | void EndThreads() 175 | { 176 | delete[] pArray; 177 | nThreadsRunning = 0; 178 | } 179 | 180 | void Start() 181 | { 182 | for (int i = 0;i < nThreadsRunning;i++) pArray[i].Start(); 183 | } 184 | 185 | void Sync() 186 | { 187 | for (int i = 0;i < nThreadsRunning;i++) pArray[i].Sync(); 188 | } 189 | 190 | private: 191 | qThreadCls* pArray; 192 | int nThreadsRunning; 193 | }; 194 | 195 | #endif 196 | -------------------------------------------------------------------------------- /cmodules/timer.cpp: -------------------------------------------------------------------------------- 1 | #include "timer.h" 2 | #ifdef _WIN32 3 | #include 4 | #include 5 | #else 6 | #include 7 | #endif 8 | 9 | HighResTimer::HighResTimer() 10 | { 11 | ElapsedTime = 0; 12 | running = 0; 13 | } 14 | 15 | HighResTimer::~HighResTimer() {} 16 | 17 | void HighResTimer::Start() 18 | { 19 
| #ifdef _WIN32 20 | __int64 istart; 21 | QueryPerformanceCounter((LARGE_INTEGER*)&istart); 22 | StartTime = (double) istart; 23 | #else 24 | timespec tv; 25 | clock_gettime(CLOCK_REALTIME, &tv); 26 | StartTime = (double) tv.tv_sec * 1.0E9 + (double) tv.tv_nsec; 27 | #endif 28 | running = 1; 29 | } 30 | 31 | void HighResTimer::ResetStart() 32 | { 33 | ElapsedTime = 0; 34 | Start(); 35 | } 36 | 37 | void HighResTimer::Stop() 38 | { 39 | if (running == 0) return; 40 | running = 0; 41 | double EndTime = 0; 42 | #ifdef _WIN32 43 | __int64 iend; 44 | QueryPerformanceCounter((LARGE_INTEGER*) &iend); 45 | EndTime = (double) iend; 46 | #else 47 | timespec tv; 48 | clock_gettime(CLOCK_REALTIME, &tv); 49 | EndTime = (double) tv.tv_sec * 1.0E9 + (double) tv.tv_nsec; 50 | #endif 51 | ElapsedTime += EndTime - StartTime; 52 | } 53 | 54 | void HighResTimer::Reset() 55 | { 56 | ElapsedTime = 0; 57 | StartTime = 0; 58 | running = 0; 59 | } 60 | 61 | double HighResTimer::GetElapsedTime() 62 | { 63 | return ElapsedTime / Frequency; 64 | } 65 | 66 | double HighResTimer::GetCurrentElapsedTime() 67 | { 68 | if (running == 0) return(GetElapsedTime()); 69 | double CurrentTime = 0; 70 | #ifdef _WIN32 71 | __int64 iend; 72 | QueryPerformanceCounter((LARGE_INTEGER*) &iend); 73 | CurrentTime = (double) iend; 74 | #else 75 | timespec tv; 76 | clock_gettime(CLOCK_REALTIME, &tv); 77 | CurrentTime = (double) tv.tv_sec * 1.0E9 + (double) tv.tv_nsec; 78 | #endif 79 | return((CurrentTime - StartTime + ElapsedTime) / Frequency); 80 | } 81 | 82 | double HighResTimer::GetFrequency() 83 | { 84 | #ifdef _WIN32 85 | __int64 ifreq; 86 | QueryPerformanceFrequency((LARGE_INTEGER*)&ifreq); 87 | return((double) ifreq); 88 | #else 89 | return(1.0E9); 90 | #endif 91 | } 92 | 93 | double HighResTimer::Frequency = HighResTimer::GetFrequency(); -------------------------------------------------------------------------------- /cmodules/timer.h: 
#ifndef TIMER_H
#define TIMER_H

// Accumulating stopwatch built on the platform's high-resolution counter.
// Time is collected over any number of Start()/Stop() intervals and reported
// in seconds.
class HighResTimer {

public:
	HighResTimer();
	~HighResTimer();
	void Start();                    // Begin an interval; previously accumulated time is kept
	void Stop();                     // End the running interval and add it to the total; no-op if not running
	void Reset();                    // Stop and clear the accumulated time
	void ResetStart();               // Clear the accumulated time and begin a new interval
	double GetElapsedTime();         // Seconds accumulated over completed intervals only
	double GetCurrentElapsedTime();  // Seconds accumulated including the interval still running

private:
	static double Frequency;         // Counter ticks per second, set once from GetFrequency()
	static double GetFrequency();

	double ElapsedTime;              // Accumulated time in counter ticks (not seconds)
	double StartTime;                // Counter reading taken by the last Start()
	int running;                     // 1 while an interval is open, 0 otherwise
};

#endif
Add more if you use other ADL APIs 43 | typedef int ( *ADL_MAIN_CONTROL_CREATE )(ADL_MAIN_MALLOC_CALLBACK, int ); 44 | typedef int ( *ADL_MAIN_CONTROL_DESTROY )(); 45 | typedef int ( *ADL_ADAPTER_NUMBEROFADAPTERS_GET ) ( int* ); 46 | typedef int ( *ADL_ADAPTER_ADAPTERINFO_GET ) ( LPAdapterInfo, int ); 47 | typedef int ( *ADL_OVERDRIVE5_TEMPERATURE_GET ) ( int, int , ADLTemperature * ); 48 | typedef int ( *ADL_ADAPTER_ACTIVE_GET ) ( int, int* ); 49 | typedef int ( *ADL_ADAPTER_VIDEOBIOSINFO_GET ) ( int, ADLBiosInfo* ); 50 | typedef int ( *ADL_ADAPTER_ID_GET ) ( int, int* ); 51 | typedef int ( *ADL_OVERDRIVE5_POWERCONTROL_SET ) ( int, int ); 52 | 53 | // Memory allocation function 54 | void* __stdcall ADL_Main_Memory_Alloc ( int iSize ) 55 | { 56 | void* lpBuffer = malloc ( iSize ); 57 | return lpBuffer; 58 | } 59 | 60 | ADL_MAIN_CONTROL_CREATE ADL_Main_Control_Create; 61 | ADL_MAIN_CONTROL_DESTROY ADL_Main_Control_Destroy; 62 | ADL_ADAPTER_NUMBEROFADAPTERS_GET ADL_Adapter_NumberOfAdapters_Get; 63 | ADL_ADAPTER_ADAPTERINFO_GET ADL_Adapter_AdapterInfo_Get; 64 | ADL_OVERDRIVE5_TEMPERATURE_GET ADL_Overdrive5_Temperature_Get; 65 | ADL_ADAPTER_ACTIVE_GET ADL_Adapter_Active_Get; 66 | ADL_ADAPTER_VIDEOBIOSINFO_GET ADL_Adapter_VideoBiosInfo_Get; 67 | ADL_ADAPTER_ID_GET ADL_Adapter_ID_Get; 68 | ADL_OVERDRIVE5_POWERCONTROL_SET ADL_Overdrive5_PowerControl_Set; 69 | 70 | int nAdapters; 71 | int* nAdapterIndizes; 72 | #ifdef LINUX 73 | void *hDLL; // Handle to .so library 74 | #else 75 | HINSTANCE hDLL; 76 | #endif 77 | 78 | #ifndef LINUX 79 | void* dlsym(HINSTANCE lib, char* name) 80 | { 81 | return(GetProcAddress(lib, name)); 82 | } 83 | #endif 84 | 85 | int adl_temperature_check_init() 86 | { 87 | LPAdapterInfo lpAdapterInfo = NULL; 88 | int iNumberAdapters; 89 | #ifdef LINUX 90 | setenv("DISPLAY", ":0", 1); 91 | hDLL = dlopen( "libatiadlxx.so", RTLD_LAZY|RTLD_GLOBAL); 92 | #else 93 | hDLL = LoadLibrary( "atiadlxx.dll" ); 94 | #endif 95 | 96 | 97 | if (NULL == hDLL) 98 | { 99 
| printf("ADL library not found!\n"); 100 | return 0; 101 | } 102 | 103 | ADL_Main_Control_Create = (ADL_MAIN_CONTROL_CREATE) (size_t) dlsym(hDLL,"ADL_Main_Control_Create"); 104 | ADL_Main_Control_Destroy = (ADL_MAIN_CONTROL_DESTROY) (size_t) dlsym(hDLL,"ADL_Main_Control_Destroy"); 105 | ADL_Adapter_NumberOfAdapters_Get = (ADL_ADAPTER_NUMBEROFADAPTERS_GET) (size_t) dlsym(hDLL,"ADL_Adapter_NumberOfAdapters_Get"); 106 | ADL_Adapter_AdapterInfo_Get = (ADL_ADAPTER_ADAPTERINFO_GET) (size_t) dlsym(hDLL,"ADL_Adapter_AdapterInfo_Get"); 107 | ADL_Overdrive5_Temperature_Get = (ADL_OVERDRIVE5_TEMPERATURE_GET) (size_t) dlsym(hDLL,"ADL_Overdrive5_Temperature_Get"); 108 | ADL_Adapter_Active_Get = (ADL_ADAPTER_ACTIVE_GET) (size_t) dlsym(hDLL,"ADL_Adapter_Active_Get"); 109 | ADL_Adapter_VideoBiosInfo_Get = (ADL_ADAPTER_VIDEOBIOSINFO_GET) (size_t) dlsym(hDLL, "ADL_Adapter_VideoBiosInfo_Get"); 110 | ADL_Adapter_ID_Get = (ADL_ADAPTER_ID_GET) (size_t) dlsym(hDLL, "ADL_Adapter_ID_Get"); 111 | ADL_Overdrive5_PowerControl_Set = (ADL_OVERDRIVE5_POWERCONTROL_SET) (size_t) dlsym(hDLL, "ADL_Overdrive5_PowerControl_Set"); 112 | 113 | 114 | if ( NULL == ADL_Main_Control_Create || NULL == ADL_Main_Control_Destroy || NULL == ADL_Adapter_NumberOfAdapters_Get || NULL == ADL_Adapter_AdapterInfo_Get || NULL == ADL_Overdrive5_Temperature_Get || 115 | NULL == ADL_Adapter_Active_Get || NULL == ADL_Adapter_VideoBiosInfo_Get || NULL == ADL_Adapter_ID_Get || NULL == ADL_Overdrive5_PowerControl_Set ) 116 | { 117 | printf("ADL's API is missing!\n"); 118 | return 0; 119 | } 120 | 121 | // Initialize ADL. 
The second parameter is 1, which means: 122 | // retrieve adapter information only for adapters that are physically present and enabled in the system 123 | if ( ADL_OK != ADL_Main_Control_Create (ADL_Main_Memory_Alloc, 1) ) 124 | { 125 | printf("ADL Initialization Error!\n"); 126 | return 0; 127 | } 128 | 129 | // Obtain the number of adapters for the system 130 | if ( ADL_OK != ADL_Adapter_NumberOfAdapters_Get ( &iNumberAdapters ) ) 131 | { 132 | printf("Cannot get the number of adapters!\n"); 133 | return 0; 134 | } 135 | 136 | #ifdef VERBOSE 137 | printf("Number of adapters: %d\n", iNumberAdapters); 138 | #endif 139 | 140 | if (iNumberAdapters == 0) 141 | { 142 | printf("No Adapter found\n"); 143 | return(1); 144 | } 145 | 146 | lpAdapterInfo = (AdapterInfo*) malloc( sizeof(AdapterInfo) * iNumberAdapters); 147 | if (ADL_Adapter_AdapterInfo_Get(lpAdapterInfo, sizeof(AdapterInfo) * iNumberAdapters) != ADL_OK) 148 | { 149 | printf("Error getting adapter info\n"); 150 | return(1); 151 | } 152 | 153 | for (int j = 0;j < 2;j++) 154 | { 155 | nAdapters = 0; 156 | for ( int i = 0; i < iNumberAdapters; i++ ) 157 | { 158 | int status; 159 | if (ADL_Adapter_Active_Get(lpAdapterInfo[i].iAdapterIndex, &status) != ADL_OK) 160 | { 161 | printf("Error getting adapter status\n"); 162 | return(1); 163 | } 164 | if (status == ADL_TRUE) 165 | { 166 | if (j) 167 | { 168 | nAdapterIndizes[nAdapters] = lpAdapterInfo[i].iAdapterIndex; 169 | #ifdef VERBOSE 170 | ADLBiosInfo biosInfo; 171 | ADL_Adapter_VideoBiosInfo_Get(nAdapterIndizes[nAdapters], &biosInfo); 172 | int UID; 173 | ADL_Adapter_ID_Get(nAdapterIndizes[nAdapters], &UID); 174 | printf("Adapter %d (%s) Info: Bios %s %s %s, UID %d\n", nAdapters, lpAdapterInfo[i].strAdapterName, biosInfo.strPartNumber, biosInfo.strVersion, biosInfo.strDate, UID); 175 | #endif 176 | } 177 | nAdapters++; 178 | } 179 | } 180 | if (j == 0) nAdapterIndizes = new int[nAdapters]; 181 | } 182 | free(lpAdapterInfo); 183 | return(0); 184 | } 185 | 186 | 
int adl_temperature_check_run(double* max_temperature, int verbose) 187 | { 188 | *max_temperature = 0.; 189 | char tmpbuffer[128]; 190 | if (verbose) strcpy(tmpbuffer, "Temperatures:"); 191 | for (int i = 0;i < nAdapters;i++) 192 | { 193 | ADLTemperature temp; 194 | temp.iSize = sizeof(temp); 195 | if (ADL_Overdrive5_Temperature_Get(nAdapterIndizes[i], 0, &temp) != ADL_OK) 196 | { 197 | printf("Error reading temperature from adapter %d\n", i); 198 | return(1); 199 | } 200 | const double temperature = temp.iTemperature / 1000.; 201 | if (verbose) sprintf(tmpbuffer + strlen(tmpbuffer), " %f", temperature); 202 | if (temperature > *max_temperature) *max_temperature = temperature; 203 | } 204 | if (verbose) fprintf(STD_OUT, "%s\n", tmpbuffer); 205 | return(0); 206 | } 207 | 208 | int adl_temperature_check_exit() 209 | { 210 | ADL_Main_Control_Destroy (); 211 | #ifdef LINUX 212 | dlclose(hDLL); 213 | #else 214 | FreeLibrary(hDLL); 215 | #endif 216 | 217 | return(0); 218 | } 219 | 220 | int adl_powertune_set(int val) 221 | { 222 | for (int i = 0;i < nAdapters;i++) 223 | { 224 | if (ADL_Overdrive5_PowerControl_Set(nAdapterIndizes[i], val)) 225 | { 226 | printf("Error setting powertune to adapter %d (val %d)\n", i, val); 227 | } 228 | } 229 | return(0); 230 | } 231 | 232 | #ifdef MAINPROG 233 | int main (int argc, char** argv) 234 | { 235 | double temperature; 236 | if (adl_temperature_check_init()) 237 | { 238 | printf("Error initializing ADL\n"); 239 | return(1); 240 | } 241 | if (adl_temperature_check_run(&temperature, 1)) 242 | { 243 | printf("Error running ADL temperature check\n"); 244 | return(1); 245 | } 246 | printf("Maximum Temperature: %f\n", temperature); 247 | if (argc > 1) 248 | { 249 | adl_powertune_set(atoi(argv[1])); 250 | } 251 | if (adl_temperature_check_exit()) 252 | { 253 | printf("Error exiting ADL\n"); 254 | return(1); 255 | } 256 | } 257 | #endif 258 | 259 | #endif 260 | 
-------------------------------------------------------------------------------- /cmodules/util_adl.h: -------------------------------------------------------------------------------- 1 | /** 2 | * A wrapper for the C++ CALDGEMM 3 | * 4 | * Copyright 2010: 5 | * - David Rohr (drohr@jwdt.org) 6 | * - Matthias Bach (bach@compeng.uni-frankfurt.de) 7 | * - Matthias Kretz (kretz@compeng.uni-frankfurt.de) 8 | * 9 | * This file is part of HPL-GPU. 10 | * 11 | * HPL-GPU is free software: you can redistribute it and/or modify 12 | * it under the terms of the GNU General Public License as published by 13 | * the Free Software Foundation, either version 3 of the License, or 14 | * (at your option) any later version. 15 | * 16 | * HPL-GPU is distributed in the hope that it will be useful, 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | * GNU General Public License for more details. 20 | * 21 | * You should have received a copy of the GNU General Public License 22 | * along with HPL-GPU. If not, see . 23 | * 24 | * In addition to the rules layed out by the GNU General Public License 25 | * the following exception is granted: 26 | * 27 | * Use with the Original BSD License. 28 | * 29 | * Notwithstanding any other provision of the GNU General Public License 30 | * Version 3, you have permission to link or combine any covered work with 31 | * a work licensed under the 4-clause BSD license into a single combined 32 | * work, and to convey the resulting work. The terms of this License will 33 | * continue to apply to the part which is the covered work, but the special 34 | * requirements of the 4-clause BSD license, clause 3, concerning the 35 | * requirement of acknowledgement in advertising materials will apply to 36 | * the combination as such. 
# Disable GPU backends whose SDK is not available in the environment.
# The message must use the $(warning ...) function: a bare "warning ..." line
# is not a make directive and aborts parsing with "missing separator" as soon
# as the branch is actually taken.

# CAL backend needs the AMD APP SDK (AMDAPPSDKROOT).
ifeq ($(AMDAPPSDKROOT), )
ifeq ($(INCLUDE_CAL), 1)
$(warning CAL not found, disabling INCLUDE_CAL)
endif
INCLUDE_CAL = 0
endif

# CUDA and cuBLAS backends need a CUDA installation (CUDA_PATH).
ifeq ("$(CUDA_PATH)", "")
ifeq ($(INCLUDE_CUDA), 1)
$(warning CUDA not found, disabling INCLUDE_CUDA)
endif
INCLUDE_CUDA = 0
INCLUDE_CUBLAS = 0
endif
# Select include paths, libraries and defines for the BLAS library named by
# BLAS_BACKEND (GOTOBLAS, MKL or ACML). Any other value stops the build.
ifeq ($(BLAS_BACKEND), GOTOBLAS)
# GotoBLAS is linked as a static library file taken directly from GOTOBLAS_PATH.
INCLUDEPATHS += $(GOTOBLAS_PATH)
DEFINES += USE_GOTO_BLAS
ifeq ($(ARCH), i686-pc-cygwin)
EXTRAOBJFILES += $(GOTOBLAS_PATH)/libgoto2.lib
else
#LIBS += gfortran
EXTRAOBJFILES += $(GOTOBLAS_PATH)/libgoto2.a
endif
else
ifeq ($(BLAS_BACKEND), MKL)
# MKL is linked dynamically together with the Intel OpenMP runtime (iomp5).
INCLUDEPATHS += $(MKL_PATH)/include
LIBS += iomp5 mkl_intel_lp64 mkl_core mkl_intel_thread
LIBPATHS += $(MKL_PATH)/lib/intel64/
# ICC_PATH is optional; when set, its library directory is searched as well.
ifneq ($(ICC_PATH), )
LIBPATHS += $(ICC_PATH)/lib/intel64/
endif
DEFINES += USE_MKL
CONFIG_OPENMP = 1
else
ifeq ($(BLAS_BACKEND), ACML)
# ACML is used through the separate CBLAS interface library.
INCLUDEPATHS += $(CBLAS_PATH)/include
# NOTE(review): $(CBLAS_PATH)/include is added as a *library* search path here - looks unintended, confirm.
LIBPATHS += $(ACML_PATH)/lib $(CBLAS_PATH)/include
LIBS += acml_mp
EXTRAOBJFILES += $(CBLAS_PATH)/lib/cblas_LINUX.a
CONFIG_OPENMP = 1
LIBS += gfortran
else
$(error No valid BLAS_BACKEND selected)
endif
endif
endif
Available options: MKL, ACML, GOTOBLAS 2 | BLAS_BACKEND = MKL 3 | 4 | #Select which GPU backends are compiled. (The CPU backend is always compiled) 5 | INCLUDE_OPENCL = 1 6 | INCLUDE_CAL = 1 7 | INCLUDE_CUDA = 1 8 | INCLUDE_CUBLAS = 1 9 | 10 | #Other Config options 11 | #Use link time optimization 12 | CONFIG_LTO = 1 13 | 14 | #Mark CONFIGURED = 1 to enable compilation 15 | CONFIGURED = 0 16 | -------------------------------------------------------------------------------- /config_options_load.mak: -------------------------------------------------------------------------------- 1 | include $(CALDGEMM_MAKE_DIR_PRE)config_options.mak 2 | 3 | ifeq ($(AMDAPPSDKROOT), ) 4 | ifeq ($(INCLUDE_CAL), 1) 5 | $(warning CAL not found, disabling INCLUDE_CAL) 6 | endif 7 | INCLUDE_CAL = 0 8 | endif 9 | 10 | ifeq ("$(CUDA_PATH)", "") 11 | ifeq ($(INCLUDE_CUDA), 1) 12 | $(warning CUDA not found, disabling INCLUDE_CUDA) 13 | endif 14 | INCLUDE_CUDA = 0 15 | INCLUDE_CUBLAS = 0 16 | endif 17 | -------------------------------------------------------------------------------- /cudakernel.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of the CALDGEMM library. 3 | * 4 | * Copyright 2015: 5 | * - David Rohr (drohr@jwdt.org) 6 | * - Matthias Bach (bach@compeng.uni-frankfurt.de) 7 | * - Matthias Kretz (kretz@compeng.uni-frankfurt.de) 8 | * 9 | * This file is part of CALDGEMM. 10 | * 11 | * CALDGEMM is free software: you can redistribute it and/or modify 12 | * it under the terms of the GNU Lesser General Public License as published by 13 | * the Free Software Foundation, either version 3 of the License, or 14 | * (at your option) any later version. 15 | * 16 | * CALDGEMM is distributed in the hope that it will be useful, 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | * GNU Lesser General Public License for more details. 
20 | * 21 | * You should have received a copy of the GNU Lesser General Public License 22 | * along with CALDGEMM. If not, see <http://www.gnu.org/licenses/>. 23 | */ 24 | 25 | __global__ void CUDAKernelName(double* C, double* A, double* B, size_t height1, size_t height2, size_t width, double Alpha, double Beta, size_t pitch) 26 | { 27 | for (int j = blockIdx.y * blockDim.y + threadIdx.y;j < height2;j += blockDim.y * gridDim.y) 28 | { 29 | for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < height1;i += blockDim.x * gridDim.x) 30 | { 31 | double addval = 0; 32 | #ifdef CALDGEMM_FORCE_K 33 | for (int k = 0;k < CALDGEMM_FORCE_K;k++) 34 | #else 35 | for (int k = 0;k < width;k++) 36 | #endif 37 | { 38 | addval += A[j * width + k] * B[i * width + k]; 39 | } 40 | double* destptr = &C[j * pitch + i]; 41 | *destptr = Alpha * addval + Beta * *destptr; 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /environment/caldgemm_setenv.sh.sample: -------------------------------------------------------------------------------- 1 | if [ "0$CALDGEMM_ENVIRONMENT" = "01" ]; then #Environment was already set up by an earlier run of this script, do nothing 2 | return 0 2>/dev/null || exit 0; #Leave a sourced script via return, fall back to exit when executed directly 3 | fi 4 | 5 | ###Paths to required software 6 | #Path to AMD APP SDK 7 | export AMDAPPSDKROOT=$HOME/AMD-APP-SDK-v2.9-RC-lnx64 8 | #Path to OpenMPI 9 | export OPENMPI_PATH=/opt/openmpi183 10 | #Base path to Intel software 11 | export INTELPATH=$HOME/intel 12 | #Path to the Intel MKL BLAS library (usually inside $INTELPATH) 13 | export MKL_PATH=$INTELPATH/mkl 14 | #Path to libiomp (as part of ICC or standalone) 15 | export ICC_PATH=$INTELPATH 16 | #Path to Intel TBB library inside MKL (Comment out if you want to use MKL TBB instead of having hpl-gpu compile tbb) 17 | #export TBB_PATH=$INTELPATH/tbb 18 | #Path to the GotoBLAS BLAS library 19 | export GOTOBLAS_PATH=$HOME/GotoBLAS2 20 | #Path to AMD ACML BLAS library 21 | export ACML_PATH=$HOME/acml/gfortran64_mp 22 | #Path to the CBLAS interface (required for ACML BLAS library) 23 | export CBLAS_PATH=$HOME/CBLAS 24 | #Path
to NVIDIA CUDA SDK 25 | export CUDA_PATH=/usr/local/cuda 26 | 27 | ###Add all required paths to $LD_LIBRARY_PATH 28 | #We want to use the most recent AMD OpenCL library. Usually this comes with the driver. If the SDK is newer than the driver, uncomment the next line. 29 | #export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$AMDAPPSDKROOT/lib/x86_64 30 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_PATH/lib64 31 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$ICC_PATH/lib/intel64:$MKL_PATH/lib/intel64 32 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$ACML_PATH/lib 33 | export LD_LIBRARY_PATH=$OPENMPI_PATH/lib:$LD_LIBRARY_PATH 34 | #We need one library path at the very beginning that overrides all others for preloading libraries 35 | export LD_LIBRARY_PATH=$HOME/lib:$LD_LIBRARY_PATH 36 | 37 | ###Add OpenMPI to $PATH for mpirun command 38 | export PATH=$OPENMPI_PATH/bin:$PATH 39 | 40 | ###Set some environment variables for AMD GPUs and for Headless X Setup 41 | export GPU_FORCE_64BIT_PTR=1 42 | export GPU_NUM_COMPUTE_RINGS=1 43 | export DISPLAY=:0 44 | export COMPUTE=:0 45 | 46 | ###Set correct ulimits for memory allocation 47 | ulimit -v unlimited 48 | ulimit -m unlimited 49 | ulimit -l unlimited 50 | 51 | #We do need to set these values only once 52 | export CALDGEMM_ENVIRONMENT=1 53 | -------------------------------------------------------------------------------- /gcc_patch/libgomp.patch: -------------------------------------------------------------------------------- 1 | --- ../gcc-4.6.1/libgomp/team.c 2009-05-20 22:54:45.000000000 +0200 2 | +++ libgomp/team.c 2012-07-30 11:35:18.742162635 +0200 3 | @@ -29,6 +29,8 @@ 4 | #include 5 | #include 6 | 7 | +#define GOMP_KEEP_THREAD ((void*) (size_t) -1) 8 | + 9 | /* This attribute contains PTHREAD_CREATE_DETACHED. 
*/ 10 | pthread_attr_t gomp_thread_attr; 11 | 12 | @@ -116,11 +118,18 @@ 13 | gomp_team_barrier_wait (&team->barrier); 14 | gomp_finish_task (task); 15 | 16 | - gomp_barrier_wait (&pool->threads_dock); 17 | + do 18 | + { 19 | + gomp_barrier_wait (&pool->threads_dock); 20 | + local_fn = thr->fn; 21 | + thr->fn = NULL; 22 | + if (local_fn == GOMP_KEEP_THREAD) 23 | + { 24 | + gomp_team_barrier_wait(&thr->ts.team->barrier); 25 | + } 26 | + } while (local_fn == GOMP_KEEP_THREAD); 27 | 28 | - local_fn = thr->fn; 29 | local_data = thr->data; 30 | - thr->fn = NULL; 31 | } 32 | while (local_fn); 33 | } 34 | @@ -258,7 +267,7 @@ 35 | struct gomp_task_icv *icv; 36 | bool nested; 37 | struct gomp_thread_pool *pool; 38 | - unsigned i, n, old_threads_used = 0; 39 | + unsigned i, j, n, old_threads_used = 0; 40 | pthread_attr_t thread_attr, *attr; 41 | 42 | thr = gomp_thread (); 43 | @@ -346,6 +355,16 @@ 44 | nthr->data = data; 45 | team->ordered_release[i] = &nthr->release; 46 | } 47 | + if (nthreads < old_threads_used) 48 | + { 49 | + for (j = i;j < old_threads_used;j++) 50 | + { 51 | + nthr = pool->threads[j]; 52 | + nthr->fn = GOMP_KEEP_THREAD; 53 | + nthr->ts.team = team; 54 | + } 55 | + gomp_barrier_reinit (&team->barrier, old_threads_used); 56 | + } 57 | 58 | if (i == nthreads) 59 | goto do_release; 60 | @@ -434,7 +453,7 @@ 61 | that should arrive back at the end of this team. The extra 62 | threads should be exiting. Note that we arrange for this test 63 | to never be true for nested teams. 
*/ 64 | - if (__builtin_expect (nthreads < old_threads_used, 0)) 65 | + if (nested && __builtin_expect (nthreads < old_threads_used, 0)) 66 | { 67 | long diff = (long) nthreads - (long) old_threads_used; 68 | 69 | @@ -448,6 +467,7 @@ 70 | gomp_mutex_unlock (&gomp_remaining_threads_lock); 71 | #endif 72 | } 73 | + if (!nested && old_threads_used > nthreads) pool->threads_used = old_threads_used; 74 | } 75 | 76 | 77 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | include makefiles/makefile -------------------------------------------------------------------------------- /makefiles/as: -------------------------------------------------------------------------------- 1 | /cygdrive/c/utility/cygwin/bin/x86_64-w64-mingw32-as -------------------------------------------------------------------------------- /makefiles/callvc.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | IF '%2' == '0' CALL %1 3 | IF '%2' == '1' CALL %1 > nul 4 | SET VCEXECUTE= 5 | SET BAT_SPECIAL_CHAR=0 6 | 7 | :CHECKNEXT 8 | IF '%3' == '' GOTO EXECUTE 9 | IF '%3' == 'BAT_SPECIAL_EQ' GOTO SPECIAL_EQ 10 | IF '%3' == 'BAT_SPECIAL_KOMMA' GOTO SPECIAL_KOMMA 11 | IF '%BAT_SPECIAL_CHAR%' == '0' GOTO INSERT_SPACE 12 | SET BAT_SPECIAL_CHAR=0 13 | SET VCEXECUTE=%VCEXECUTE%%3 14 | GOTO DO_SHIFT 15 | 16 | :INSERT_SPACE 17 | SET BAT_SPECIAL_CHAR=0 18 | SET VCEXECUTE=%VCEXECUTE% %3 19 | GOTO DO_SHIFT 20 | 21 | :SPECIAL_EQ 22 | SET BAT_SPECIAL_CHAR=1 23 | SET VCEXECUTE=%VCEXECUTE%= 24 | GOTO DO_SHIFT 25 | 26 | :SPECIAL_KOMMA 27 | SET BAT_SPECIAL_CHAR=1 28 | SET VCEXECUTE=%VCEXECUTE%, 29 | 30 | :DO_SHIFT 31 | SHIFT 32 | GOTO CHECKNEXT 33 | 34 | :EXECUTE 35 | %VCEXECUTE% 36 | IF '%ERRORLEVEL%' == '0' GOTO ALLOK 37 | ECHO ERROR 38 | EXIT 1 39 | :ALLOK 40 | SET VCEXECUTE= 41 | 
-------------------------------------------------------------------------------- /makefiles/config.mak.sample: -------------------------------------------------------------------------------- 1 | #make command line options: 2 | #VERBOSE=1 //Do not hide command lines 3 | #CHECK_DEPENDENCIES=0 //Do not perform dependency file creation 4 | 5 | include config_options.mak #will be autocreated by config_options.sample 6 | 7 | INTELARCH = Host 8 | GCCARCH = native 9 | MSVCFAVOR = INTEL64 10 | CUDAVERSION = 10 13 20 30 35 11 | CUDAREGS = 64 12 | ARCHBITS = 64 13 | 14 | HIDEECHO = @ 15 | #HIDEECHO = - 16 | CC_x86_64-pc-linux-gnu = GCC 17 | CC_i686-pc-cygwin = ICC 18 | 19 | EXTRAFLAGSGCC = -Weffc++ 20 | EXTRAOBJFILES = objfile.obj 21 | EXTRAFLAGSLINK = 22 | EXTRADEPS = non_obj_file_required.dat 23 | 24 | INCLUDEPATHS = include subdir/include 25 | DEFINES = SAMPLE_DEFINE=1 26 | LIBPATHS = /usr/local/lib/sample 27 | LIBS = sample 28 | 29 | #CONFIG_STATIC = 1 30 | 31 | CONFIG_DIRECTX = 1 32 | CONFIG_OPENCL = 1 33 | CONFIG_OPENGL = 0 34 | CONFIG_OPENMP = 0 35 | CONFIG_GDB = 1 #by default 36 | CONFIG_CAL = 0 37 | #CONFIG_X11 = 38 | CONFIG_CUDA = 0 39 | CONFIG_CUBLAS = 0 40 | #CONFIG_VIDEO_EDIT = 41 | CONFIG_OPENCL_VERSION = All 42 | #Options are: AMD, NVIDIA, Intel, All, empty defaults to all 43 | CONFIG_QT = 0 44 | CONFIG_CPP11 = 0 45 | 46 | include config_common.mak 47 | 48 | TARGET = ca 49 | SUBTARGETS = libAliHLTTPCCAGPUSA 50 | SUBTARGETS_CLEAN = libAliHLTTPCCAGPUSA.* 51 | #TARGETPATH = ../ 52 | 53 | TARGETTYPE = LIB 54 | WORKPATHSUFFIX = $(TARGETTYPE)_$(TARGET) 55 | 56 | CPPFILES = sample.cpp 57 | CXXFILES = subdir/sample.cxx \ 58 | subdir/sample2.cxx 59 | ASMFILES = sample.asm 60 | CUFILES = sample.cu 61 | RESOURCEFILES = myresource.dat 62 | CLFILES = opencl_file.cl 63 | CFILES = sample.c 64 | 65 | QTFILES = 66 | QTCPPFILES = 67 | 68 | OPENCL_OPTIONS = -x clc++ 69 | 70 | #Set all compiler flags to optimized/debug or set compiler flags individually 71 | COMPILER_FLAGS = OPT 72
| #INTELFLAGSUSE = $(INTELFLAGSOPT) 73 | #VSNETFLAGSUSE = $(VSNETFLAGSOPT) 74 | #GCCFLAGSUSE = $(GCCFLAGSOPT) 75 | #NVCCFLAGSUSE = $(NVCCFLAGSOPT) 76 | 77 | CONFIG_LTO = 1 78 | CONFIG_CUDA_DC = 1 79 | -------------------------------------------------------------------------------- /makefiles/i686-pc-cygwin.mak: -------------------------------------------------------------------------------- 1 | #Set these Compiler Paths and Variables to your needs! 2 | VSPATH := ${VS120COMNTOOLS}../.. 3 | VSPATH10 := ${VS100COMNTOOLS}../.. 4 | VSPATH9 := ${VS90COMNTOOLS}../.. 5 | VSPATH6 := c:/Utility/Speeches/Visual Studio 6 6 | ICCPATH := ${ICPP_COMPILER14} 7 | VECTORCPATH := c:/Utility/speeches/Codeplay 8 | WINPATH := /cygdrive/c/Windows 9 | CUDAPATH := $(CUDA_PATH)/ 10 | AMDPATH := $(AMDAPPSDKROOT)/ 11 | CUDASDKPATH := $(CUDAPATH) 12 | DIRECTXPATH := $(DXSDK_DIR) 13 | QTPATH := C:/Utility/Speeches/Qt/5.2.1/msvc2012_64_opengl 14 | 15 | ifeq ($(GCC32), ) 16 | GCC32 = i686-pc-mingw32-c++.exe 17 | endif 18 | ifeq ($(GCC64), ) 19 | GCC64 = x86_64-w64-mingw32-c++.exe -B makefiles -w 20 | endif 21 | 22 | ICCPATH32 = $(ICCPATH)bin/ia32 23 | ICCPATH64 = $(ICCPATH)bin/intel64 24 | 25 | ICC32 = $(HIDEECHOA) $(CALLVC) "$(ICCPATH)bin/iclvars_ia32.bat" $(HIDEVARS) "$(ICCPATH32)/icl.exe" 26 | ICC64 = $(HIDEECHOA) $(CALLVC) "$(ICCPATH)bin/iclvars_intel64.bat" $(HIDEVARS) "$(ICCPATH64)/icl.exe" 27 | MSCC32 = $(HIDEECHOA) $(CALLVC) "$(VSPATH)/vc/bin/vcvars32.bat" $(HIDEVARS) "$(VSPATH)/vc/bin/cl.exe" 28 | MSCC64 = $(HIDEECHOA) $(CALLVC) "$(VSPATH)/vc/bin/amd64/vcvars64.bat" $(HIDEVARS) "$(VSPATH)/vc/bin/amd64/cl.exe" 29 | MSCC1032 = $(HIDEECHOA) $(CALLVC) "$(VSPATH10)/vc/bin/vcvars32.bat" $(HIDEVARS) "$(VSPATH10)/vc/bin/cl.exe" 30 | MSCC1064 = $(HIDEECHOA) $(CALLVC) "$(VSPATH10)/vc/bin/amd64/vcvars64.bat" $(HIDEVARS) "$(VSPATH10)/vc/bin/amd64/cl.exe" 31 | MASM32 = $(HIDEECHOA) $(CALLVC) "$(VSPATH)/vc/bin/vcvars32.bat" $(HIDEVARS) "$(VSPATH)/vc/bin/ml.exe" 32 | MASM64 = $(HIDEECHOA) $(CALLVC) 
"$(VSPATH)/vc/bin/amd64/vcvars64.bat" $(HIDEVARS) "$(VSPATH)/vc/bin/amd64/ml64.exe" 33 | VCC32 = $(HIDEECHOA) "$(VECTORCPATH)/vectorc86.exe" 34 | 35 | MSLINK32GCC = $(HIDEECHOA) $(CALLVC) "$(ICCPATH)bin/iclvars_ia32.bat" $(HIDEVARS) "$(VSPATH8)/VC/bin/link.exe" 36 | MSLINK32 = $(HIDEECHOA) $(CALLVC) "$(VSPATH)/vc/bin/vcvars32.bat" $(HIDEVARS) "$(VSPATH)/VC/bin/link.exe" 37 | MSLINK64 = $(HIDEECHOA) $(CALLVC) "$(VSPATH)/vc/bin/amd64/vcvars64.bat" $(HIDEVARS) "$(VSPATH)/VC/bin/amd64/link.exe" 38 | ICCLINK32 = $(HIDEECHOA) $(CALLVC) "$(ICCPATH)bin/iclvars_ia32.bat" $(HIDEVARS) "$(ICCPATH32)/xilink.exe" -quseenv 39 | ICCLINK64 = $(HIDEECHOA) $(CALLVC) "$(ICCPATH)bin/iclvars_intel64.bat" $(HIDEVARS) "$(ICCPATH64)/xilink.exe" -quseenv 40 | 41 | #Linker Optionss 42 | LINKFLAGSCOMMON = /fixed:no /nologo /subsystem:console /incremental:no /debug $(MULTITHREADLIBS) /MANIFEST:NO $(HOARD) /pdb:"$(WORKPATH)/$(TARGET).pdb" 43 | LINKFLAGS32 = $(LINKFLAGSCOMMON) /machine:I386 44 | LINKFLAGS64 = $(LINKFLAGSCOMMON) /machine:X64 45 | 46 | #Common Compiler Options 47 | PREHEADER = /Fp"$@.pch" /Fd"$@.pdb" 48 | CFLAGSCOMMON = $(PREHEADER) /nologo /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /W3 $(MULTITHREAD) 49 | CFLAGS32 = $(CFLAGSCOMMON) 50 | CFLAGS64 = $(CFLAGSCOMMON) /D "_WIN64" /D "_AMD64_" /D "_X64_" 51 | DEBUGFLAGS = /EHs /Zi /Od /D "DEBUG_RUNTIME" 52 | 53 | GCCFLAGS32 += -mrtd 54 | 55 | INTELQPROF = 56 | #/Qprof_gen, /Qprof_use 57 | 58 | #Intel Compiler Options 59 | INTELFLAGSOPT = /Oa /Ow /Qansi-alias /Ob2 /Ot /Oi /GA /G7 /O3 /Ox /Qvec_report0 /Qopt-prefetch /Q$(INTELARCH) /Gs0 /debug:minimal 60 | # /Qguide /Qopt-report:2 /Qvec-report:5 61 | ifeq ($(CONFIG_LTO), 1) 62 | INTELFLAGSOPT += /Qipo 63 | INTELLINKIPO = /Qipo-c /Qipo-fo 64 | else 65 | INTELFLAGSOPT += /Qip 66 | endif 67 | INTELFLAGSDBG = /Od /Zi 68 | INTELFLAGSBASE = /EHsc /D "INTEL_RUNTIME" /Qprof_dir$(WORKPATH) $(MULTITHREAD) $(INTELQPROF) 69 | INTELFLAGSCOMMON = $(INTELFLAGSBASE) $(INTELFLAGSUSE) 70 | INTELFLAGS32 = 
$(INTELFLAGSCOMMON) /Oy /Gr 71 | INTELFLAGS64 = $(INTELFLAGSCOMMON) 72 | # /Zd /Zi /Qvec_report0 73 | 74 | #VectorC Compiler Options 75 | VECTORCOPTIMIZED = /ssecalls /optimize 10 /max /target p4 /autoinline 4096 /vc /Ob2 /Oi /Ot 76 | VECTORCSTANDARD = /optimize 0 /novectors /vc /Ob0 77 | VECTORCFLAGS = /nologo /noprogress /vserror /cpp /mslibs $(VECTORCSTANDARD) /c /D "VECTORC_RUNTIME" $(MULTITHREAD) /I"$(VSPATH6)/VC98/include" $(VC8INCLUDES) 78 | 79 | #Visual Studio Compiler Options 80 | VSNETFLAGSOPT = /EHs /O2 /Ox /Oi /Ot /Oy /GA /Ob2 /Zi /Qfast_transcendentals $(MSOPENMP) 81 | VSNETFLAGSDBG = /Od /Zi 82 | VSNETFLAGSCOMMON = /D "VSNET_RUNTIME" $(VSNETFLAGSUSE) $(EXTRAFLAGSMSCC) /EHsc 83 | VSNETFLAGS32 = $(VSNETFLAGSCOMMON) 84 | VSNETFLAGS64 = $(VSNETFLAGSCOMMON) /favor:$(MSVCFAVOR) 85 | 86 | ifeq ("$(CONFIG_OPENMP)", "1") 87 | INTELFLAGSCOMMON += /Qopenmp 88 | VSNETFLAGSCOMMON += /openmp 89 | endif 90 | 91 | ifeq ($(GCCARCH), ) 92 | GCCARCHA = -march=native -msse4.2 93 | else 94 | GCCARCHA = -march=$(GCCARCH) -msse4.2 95 | endif 96 | 97 | #Compilation Output Control 98 | ifneq ("$(VERBOSE)", "1") 99 | HIDEECHOB = @ 100 | ifndef HIDEECHO 101 | HIDEECHOA = @ 102 | else ifneq ($(HIDEECHOA), "-") 103 | HIDEECHOA = $(HIDEECHO) 104 | endif 105 | endif 106 | ifndef HIDEVARS 107 | HIDEVARS = 1 108 | endif 109 | 110 | CALLVC = $(HIDEECHOA) cmd /C "makefiles\callvc.bat" 111 | 112 | PATH := /bin:/usr/bin:$(WINPATH):$(WINPATH)/system32:$(PATH) 113 | 114 | ifeq ($(ARCHBITS), 64) 115 | ICC = $(ICC64) $(INTELFLAGS64) $(CFLAGS64) 116 | CCDBG = $(ICC64) $(INTELFLAGSBASE) $(INTELFLAGSDBG) $(CFLAGS64) $(DEBUGFLAGS) 117 | ICCLINK = $(ICCLINK64) $(LINKFLAGS64) 118 | MSCC = $(MSCC64) $(VSNETFLAGS64) $(CFLAGS64) 119 | MSLINK = $(MSLINK64) $(LINKFLAGS64) 120 | GCC = $(GCC64) $(GCCFLAGS64) $(GCCFLAGSCOMMON) $(GCCFLAGSUSE) 121 | MASM = $(MASM64) 122 | CCCUDA = $(MSCC64) /TP $(VSNETFLAGS64) $(CFLAGS64) 123 | LIBPATHAMD = /LIBPATH:"$(AMDPATH)lib" /LIBPATH:"$(AMDPATH)lib/x86_64" 124 | 
LIBPATHCUDA = /LIBPATH:"$(CUDAPATH)lib/x64" /LIBPATH:"$(CUDASDKPATH)common/lib/x64" 125 | LIBPATHDIRECTX = /LIBPATH:"$(DIRECTXPATH)lib/x64" 126 | LIBPATHSUSE = /LIBPATH:"$(ICCPATH)compiler/lib/intel64" 127 | LINKFLAGSARCH = /machine:X64 128 | else 129 | ICC = $(ICC32) $(INTELFLAGS32) $(CFLAGS32) 130 | CCDBG = $(MSCC32) $(CFLAGS32) $(DEBUGFLAGS) 131 | ICCLINK = $(ICCLINK32) $(LINKFLAGS32) 132 | MSCC = $(MSCC32) $(VSNETFLAGS32) $(CFLAGS32) /Gr 133 | MSLINK = $(MSLINK32) $(LINKFLAGS32) 134 | MSLINKGCC = $(MSLINK32GCC) $(LINKFLAGS32) 135 | VCC = $(VCC32) /outfile $@ $(VECTORCFLAGS) $(CFLAGS32) 136 | GCC = $(GCC32) $(GCCFLAGS32) $(GCCFLAGSCOMMON) $(GCCFLAGSUSE) 137 | MASM = $(MASM32) 138 | CCCUDA = $(MSCC1032) $(VSNETFLAGS32) $(CFLAGS32) /TP /Gd 139 | LIBPATHAMD = /LIBPATH:"$(AMDPATH)lib" /LIBPATH:"$(AMDPATH)lib/x86" 140 | LIBPATHCUDA = /LIBPATH:"$(CUDAPATH)lib/win32" /LIBPATH:"$(CUDASDKPATH)common/lib/Win32" 141 | LIBPATHDIRECTX = /LIBPATH:"$(DIRECTXPATH)lib/x86" 142 | LIBPATHSUSE = /LIBPATH:"$(ICCPATH)compiler/lib/ia32" 143 | LINKFLAGSARCH = /machine:I386 144 | endif 145 | QTUIC = $(QTPATH)/bin/uic.exe 146 | QTMOC = $(QTPATH)/bin/moc.exe 147 | 148 | LIBPATHSUSE += $(LIBPATHS:%=/LIBPATH:%) 149 | 150 | ifeq ($(CC_i686-pc-cygwin), ICC) 151 | CC = $(ICC) 152 | ifeq ($(CPPFILES_GCC), ) 153 | LINK = $(ICCLINK) 154 | else 155 | LINK = $(MSLINK) 156 | endif 157 | else ifeq ($(CC_i686-pc-cygwin), GCC) 158 | CC = $(GCC) 159 | LINK = $(GCC) 160 | else 161 | CC = $(MSCC) 162 | LINK = $(MSLINK) 163 | endif 164 | GCC3264 = $(GCC) 165 | CC_SELECTED = $(CC_i686-pc-cygwin) 166 | 167 | ASM = $(MASM) 168 | ASMPRE = $(MSCC32) 169 | NVCC = $(HIDEECHOA) $(CALLVC) "$(VSPATH10)/vc/bin/vcvars32.bat" $(HIDEVARS) "$(CUDAPATH)bin/nvcc" 170 | 171 | MULTITHREADGCC = -mthreads -D_MT 172 | 173 | LIBSUSE = kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib 174 | 175 | ifneq ($(CPPFILES_GCC), ) 176 | GCCLIBPATH 
:= $(shell cygpath -m `$(GCC) -print-libgcc-file-name | sed -e s/libgcc.a//` `$(GCC) -print-sysroot`/mingw/lib) 177 | LIBSUSE += $(GCCLIBPATH:%=/LIBPATH:"%") libgcc.a libstdc++.a libmingw32.a libgcc_eh.a 178 | #libmingwex.a 179 | #libmsvcrt.a 180 | #libgcov.a libmingwex.a 181 | endif 182 | 183 | OPENCLLIB = OpenCL.lib 184 | ifeq ("$(CONFIG_OPENCL)", "1") 185 | LIBSUSE += $(OPENCLLIB) 186 | endif 187 | 188 | ifeq ("$(CONFIG_CAL)", "1") 189 | ifeq ($(ARCHBITS), 64) 190 | LIBSUSE += aticalcl64.lib aticalrt64.lib 191 | else 192 | LIBSUSE += aticalcl.lib aticalrt.lib 193 | endif 194 | endif 195 | 196 | ifeq ("$(CONFIG_DIRECTX)", "1") 197 | LIBSUSE += ddraw.lib dxguid.lib dxerr.lib 198 | COMMONINCLUDEPATHS += "$(DIRECTXPATH)include" 199 | LIBPATHSUSE += $(LIBPATHDIRECTX) 200 | endif 201 | 202 | ifeq ("$(CONFIG_VIDEO_EDIT)", "1") 203 | LIBSUSE += amstrmid.lib msacm32.lib vfw32.lib winmm.lib 204 | endif 205 | 206 | ifeq ("$(CONFIG_OPENGL)", "1") 207 | LIBSUSE += opengl32.lib glu32.lib 208 | endif 209 | 210 | ifeq ("$(CONFIG_QT)", "1") 211 | LIBSUSE += Qt5Gui.lib Qt5Core.lib Qt5Widgets.lib 212 | COMMONINCLUDEPATHS += $(QTPATH)/include $(QTPATH)/include/QtGui $(QTPATH)/include/QtCore $(QTPATH)/include/QtWidgets $(WORKPATH)/qt 213 | LIBPATHSUSE += /LIBPATH:$(QTPATH)/lib 214 | endif 215 | 216 | LIBSUSE += $(LIBS:%=%.lib) 217 | 218 | ifeq ($(TARGETTYPE), LIB) 219 | LINKTARGETTYPE = /DLL 220 | EXECUTABLE = $(TARGET).dll 221 | else 222 | LINKTARGETTYPE = 223 | EXECUTABLE = $(TARGET).exe 224 | endif 225 | 226 | ifeq ("$(CONFIG_OPENCL)", "1") 227 | ifeq ("$(CONFIG_OPENCL_VERSION)", "AMD") 228 | COMMONINCLUDEPATHS += "$(AMDPATH)include" 229 | LIBPATHSUSE += $(LIBPATHAMD) 230 | endif 231 | ifeq ("$(CONFIG_OPENCL_VERSION)", "NVIDIA") 232 | COMMONINCLUDEPATHS += "$(CUDAPATH)include" 233 | LIBPATHSUSE += $(LIBPATHCUDA) 234 | endif 235 | ifeq ("$(CONFIG_OPENCL_VERSION)", "Intel") 236 | #COMMONINCLUDEPATHS += "" 237 | endif 238 | ifeq ("$(CONFIG_OPENCL_VERSION)", "All") 239 | 
COMMONINCLUDEPATHS += "$(AMDPATH)include" 240 | COMMONINCLUDEPATHS += "$(CUDAPATH)include" 241 | #COMMONINCLUDEPATHS += "" 242 | LIBPATHSUSE += $(LIBPATHAMD) 243 | endif 244 | endif 245 | 246 | ifeq ("$(CONFIG_CUDA)", "1") 247 | COMMONINCLUDEPATHS += "$(CUDAPATH)include" "$(CUDASDKPATH)common/inc" 248 | LIBPATHSUSE += $(LIBPATHCUDA) 249 | ifneq ($(CUFILES), ) 250 | LIBSUSE += cudart.lib cuda.lib 251 | ifeq ($(CONFIG_CUDA_DC), 1) 252 | LIBSUSE += cudadevrt.lib 253 | endif 254 | ifeq ("$(CONFIG_OPENGL)", "1") 255 | ifeq ($(ARCHBITS), 64) 256 | LIBSUSE += freeglut.lib glew64.lib 257 | else 258 | LIBSUSE += freeglut.lib glew32.lib 259 | endif 260 | endif 261 | endif 262 | endif 263 | 264 | ifeq ("$(CONFIG_CAL)", "1") 265 | COMMONINCLUDEPATHS += "$(AMDPATH)/include/CAL" 266 | LIBPATHSUSE += $(LIBPATHAMD) 267 | endif 268 | 269 | ifeq ($(CC_i686-pc-cygwin), GCC) 270 | COMPILEOUTPUT = -o $@ 271 | LINKOUTPUT = -o $@ 272 | COMPILEONLY = -c 273 | ASMONLY = -c 274 | PRECOMPILEONLY = -x c++ -E 275 | INCLUDEPATHSUSE = $(GCCINCLUDEPATHS) 276 | DEFINESUSE = $(GCCDEFINES) 277 | else 278 | INCLUDEPATHSUSE = $(VSINCLUDEPATHS) 279 | DEFINESUSE = $(VSDEFINES) 280 | COMPILEOUTPUTBASE = /Fo 281 | COMPILEOUTPUT = $(COMPILEOUTPUTBASE)"$@" 282 | LINKOUTPUT = /Out:"$@" 283 | COMPILEONLY = /c 284 | ASMONLY = /c 285 | PRECOMPILEONLY = /EP 286 | endif 287 | OBJ = obj 288 | 289 | DEFINESARCH = "WIN32" 290 | 291 | NVCCARCHS := `for i in $(CUDAVERSION); do echo -n -gencode arch BAT_SPECIAL_EQ compute_$$i BAT_SPECIAL_KOMMA code BAT_SPECIAL_EQ sm_$$i\ ;done` 292 | NVCC_GREP = "^#line\|^$$" 293 | -------------------------------------------------------------------------------- /makefiles/i686-pc-linux-gnu.mak: -------------------------------------------------------------------------------- 1 | CC_i686-pc-linux-gnu = $(CC_x86_64-pc-linux-gnu) 2 | ALLDEP += makefiles/x86_64-pc-linux-gnu.mak 3 | include makefiles/x86_64-pc-linux-gnu.mak 4 | 
-------------------------------------------------------------------------------- /makefiles/include.S: -------------------------------------------------------------------------------- 1 | .global FILENAMEMOD 2 | .global FILENAMEMOD_size 3 | .section .data 4 | FILENAMEMOD: 5 | .incbin "FILENAMENORMAL" 6 | 1: 7 | .byte 0 8 | FILENAMEMOD_size: 9 | .int 1b - FILENAMEMOD 10 | -------------------------------------------------------------------------------- /makefiles/makefile: -------------------------------------------------------------------------------- 1 | all : all_tmp 2 | 3 | ARCH := $(shell sort <<< $$MACHTYPE) 4 | ARCHCHK := $(shell if [ -a makefiles/$(ARCH).mak ]; then echo -n 1; else echo -n 0; fi) 5 | ifeq ($(ARCHCHK), 1) 6 | else 7 | $(warning Unknown Architecture: $(ARCH) $(ARCHCHK), defaulting to x86_64-pc-linux-gnu) 8 | ARCH := x86_64-pc-linux-gnu 9 | endif 10 | ifeq ($(ARCH), i686-pc-cygwin) 11 | ARCH_CYGWIN := 1 12 | endif 13 | ifeq ($(ARCH), x86_64-unknown-cygwin) 14 | ARCH_CYGWIN := 1 15 | endif 16 | 17 | ARCHFILE = $(ARCH).mak 18 | 19 | ifeq ($(CONFIGFILE), ) 20 | CONFIGFILE = config.mak 21 | CLEANRELEASEDIR = release 22 | endif 23 | 24 | ifeq ($(BUILDSCRIPT), ) 25 | BUILDSCRIPT = build.sh 26 | endif 27 | 28 | #GCC Compiler Options 29 | GCCFLAGSOPT = -O3 $(GCCFLAGSARCH) -fweb -frename-registers -minline-all-stringops -mfpmath=sse -ftracer -funroll-loops -fpeel-loops -fprefetch-loop-arrays -ffast-math -fno-stack-protector 30 | CLANGFLAGSOPT = -O3 $(GCCFLAGSARCH) -minline-all-stringops -mfpmath=sse -funroll-loops -ffast-math -fno-stack-protector $(EXTRAFLAGSCLANG) 31 | CLANGFLAGSDBG = $(GCCFLAGSDBG) $(EXTRAFLAGSCLANG) 32 | #-fgcse-sm -fgcse-las -fmodulo-sched -fipa-pta -floop-interchange -floop-block 33 | GCCFLAGSDBG = -O0 $(GCCFLAGSARCH) 34 | GCCFLAGSCOMMON = $(MULTITHREADGCC) -pipe -DGCC_RUNTIME $(GCCPROF) $(EXTRAFLAGSGCC) -Wall -Wno-write-strings 35 | CLANGFLAGSCOMMON = $(MULTITHREADGCC) -pipe -DGCC_RUNTIME -DCLANG_RUNTIME $(GCCPROF) $(EXTRAFLAGSGCC) 
-Wall -Wno-write-strings 36 | GCCFLAGS32 = -m32 37 | GCCFLAGS64 = -m64 -D"_AMD64_" -D"_X64_" 38 | 39 | GCCPROF = 40 | #-fprofile-arcs, -fbranch-probabilities 41 | 42 | #Multithread Options 43 | MULTITHREAD = /MT 44 | MULTITHREADLIBS = /nodefaultlib:libc.lib 45 | 46 | NVCCFLAGSOPT = --use_fast_math --maxrregcount $(CUDAREGS) -O4 -Xptxas -v -Xptxas -O4 -Xcompiler -O4 -m$(ARCHBITS) $(NVCCARCHS) 47 | NVCCFLAGSDBG = --maxrregcount $(CUDAREGS) -Xptxas -v -Xptxas -O0 -O0 -m$(ARCHBITS) $(NVCCARCHS) 48 | 49 | TARGETTYPE = EXECUTABLE 50 | 51 | WORKPATHSUFFIX = $(TARGETTYPE)_$(TARGET) 52 | include $(CONFIGFILE) 53 | WORKPATH = release/$(ARCH)_$(ARCHBITS)$(WORKPATHSUFFIX) 54 | ifeq ($(CONFIG_OPENCL_VERSION), ) 55 | CONFIG_OPENCL_VERSION = All 56 | endif 57 | ifeq ($(COMPILER_FLAGS), DBG) 58 | INTELFLAGSUSE = $(INTELFLAGSDBG) 59 | VSNETFLAGSUSE = $(VSNETFLAGSDBG) 60 | GCCFLAGSUSE = $(GCCFLAGSDBG) 61 | CLANGFLAGSUSE = $(CLANGFLAGSDBG) 62 | NVCCFLAGSUSE = $(NVCCFLAGSDBG) 63 | OPENCL_DEF_OPTIONS = "-O0 -g" 64 | CONFIG_LTO = 0 65 | endif 66 | ifeq ($(COMPILER_FLAGS), OPT) 67 | INTELFLAGSUSE = $(INTELFLAGSOPT) 68 | VSNETFLAGSUSE = $(VSNETFLAGSOPT) 69 | GCCFLAGSUSE = $(GCCFLAGSOPT) 70 | CLANGFLAGSUSE = $(CLANGFLAGSOPT) 71 | NVCCFLAGSUSE = $(NVCCFLAGSOPT) 72 | OPENCL_DEF_OPTIONS = "-O3" 73 | endif 74 | ifeq ($(GCCCUDA), ) 75 | GCCCUDA = $(GCC3264) 76 | endif 77 | 78 | MKDIR = $(HIDEECHOB) mkdir -p `echo $@ | sed 's,/[a-zA-Z0-9._-]*$$,,'` `echo $@ | sed 's,/[a-zA-Z0-9._-]*$$,,'` 79 | 80 | ALLDEP += makefiles/makefile $(CONFIGFILE) makefiles/$(ARCHFILE) config.mak 81 | include makefiles/$(ARCHFILE) 82 | GCCFLAGSARCH += $(GCCARCHA) 83 | ifeq ($(CONFIG_CPP11), 1) 84 | GCCFLAGSCOMMON += -std=c++11 85 | CLANGFLAGSCOMMON += -std=c++11 86 | endif 87 | ifneq ($(CONFIG_CPP), ) 88 | GCCFLAGSCOMMON += -std=$(CONFIG_CPP) 89 | CLANGFLAGSCOMMON += -std=$(CONFIG_CPP) 90 | endif 91 | ifeq ($(CONFIG_OPENMP), 1) 92 | GCCFLAGSCOMMON += -fopenmp 93 | CLANGFLAGSCOMMON += -fopenmp 94 | GCCLINK += -fopenmp 
95 | endif 96 | 97 | ifeq ($(TARGETPATH), ) 98 | COPIED_EXECUTABLE = $(EXECUTABLE) 99 | TARGETPATH = . 100 | else 101 | COPIED_EXECUTABLE = $(TARGETPATH)$(EXECUTABLE) 102 | $(COPIED_EXECUTABLE) : $(EXECUTABLE) 103 | cp $(EXECUTABLE) $(COPIED_EXECUTABLE) 104 | endif 105 | 106 | all_tmp: $(SUBTARGETS:%=subbuild/%.mak) $(COPIED_EXECUTABLE) 107 | 108 | main: $(COPIED_EXECUTABLE) 109 | 110 | run : all 111 | cd $(TARGETPATH) && ./$(EXECUTABLE) 112 | 113 | 114 | subbuild/%.mak: 115 | +$(MAKE) CONFIGFILE=config_`echo $@ | sed s,subbuild/,,` BUILDSCRIPT=config_`echo $@ | sed s,subbuild/,, | sed s,mak,sh,` -f makefile 116 | 117 | CUDAINCLUDEPATHS = $(INCLUDEPATHSUSE:%=--compiler-options %) 118 | CUDADEFINES = $(DEFINESUSE:%=--compiler-options %) 119 | 120 | DEPENDS := $(CUFILES:%.cu=$(WORKPATH)/cu/%.d) $(CLFILES:%.cl=$(WORKPATH)/cl/%.d) $(ASMFILES:%.asm=$(WORKPATH)/asm/%.d) $(CPPFILES_DBG:%.cpp=$(WORKPATH)/dbg/%.d) $(CPPFILES_VCC:%.cpp=$(WORKPATH)/vcc/%.d) \ 121 | $(CPPFILES:%.cpp=$(WORKPATH)/cpp/%.d) $(CXXFILES:%.cxx=$(WORKPATH)/cxx/%.d) $(CFILES:%.c=$(WORKPATH)/c/%.d) \ 122 | $(CPPFILES_MSCC:%.cpp=$(WORKPATH)/mscc/%.d) $(CPPFILES_CLANG:%.cpp=$(WORKPATH)/clang/%.d) $(CPPFILES_ICC:%.cpp=$(WORKPATH)/icc/%.d) $(CPPFILES_GCC:%.cpp=$(WORKPATH)/gcc/%.d) $(QTFILES:%.ui=$(WORKPATH)/qt/%.qtd) 123 | CPPFILES += $(QTFILES:%.ui=$(WORKPATH)/cpp/%.$(OBJ)) $(QTCPPFILES) 124 | 125 | OBJFILES := $(CUFILES:%.cu=$(WORKPATH)/cu/%.$(OBJ)) $(ASMFILES:%.asm=$(WORKPATH)/asm/%.$(OBJ)) $(CPPFILES_DBG:%.cpp=$(WORKPATH)/dbg/%.$(OBJ)) $(CPPFILES_VCC:%.cpp=$(WORKPATH)/vcc/%.$(OBJ)) $(RESOURCEFILES:%=$(WORKPATH)/res/%.$(OBJ)) $(CLFILES:%.cl=$(WORKPATH)/cl/%.$(OBJ)) $(QTFILES:%.ui=$(WORKPATH)/qt/%_moc.$(OBJ)) $(QTCPPFILES:%.cpp=$(WORKPATH)/qt/%_moccpp.$(OBJ)) 126 | 127 | CPPFILES_ICC := $(CPPFILES_ICC:%.cpp=$(WORKPATH)/icc/%.$(OBJ)) 128 | CPPFILES_GCC := $(CPPFILES_GCC:%.cpp=$(WORKPATH)/gcc/%.$(OBJ)) 129 | CPPFILES_MSCC := $(CPPFILES_MSCC:%.cpp=$(WORKPATH)/mscc/%.$(OBJ)) 130 | CPPFILES_CLANG := 
$(CPPFILES_CLANG:%.cpp=$(WORKPATH)/clang/%.$(OBJ)) 131 | ifeq ($(CC_SELECTED), ICC) 132 | CPPFILES_ICC += $(CPPFILES:%.cpp=$(WORKPATH)/cpp/%.$(OBJ)) $(CXXFILES:%.cxx=$(WORKPATH)/cxx/%.$(OBJ)) $(CFILES:%.c=$(WORKPATH)/c/%.$(OBJ)) 133 | CCWITHLINK = /link 134 | endif 135 | ifeq ($(CC_SELECTED), MSCC) 136 | CPPFILES_MSCC += $(CPPFILES:%.cpp=$(WORKPATH)/cpp/%.$(OBJ)) $(CXXFILES:%.cxx=$(WORKPATH)/cxx/%.$(OBJ)) $(CFILES:%.c=$(WORKPATH)/c/%.$(OBJ)) 137 | CCWITHLINK = /link 138 | endif 139 | ifeq ($(CC_SELECTED), GCC) 140 | CPPFILES_GCC += $(CPPFILES:%.cpp=$(WORKPATH)/cpp/%.$(OBJ)) $(CXXFILES:%.cxx=$(WORKPATH)/cxx/%.$(OBJ)) $(CFILES:%.c=$(WORKPATH)/c/%.$(OBJ)) 141 | endif 142 | ifeq ($(CC_SELECTED), clang) 143 | CPPFILES_CLANG += $(CPPFILES:%.cpp=$(WORKPATH)/cpp/%.$(OBJ)) $(CXXFILES:%.cxx=$(WORKPATH)/cxx/%.$(OBJ)) $(CFILES:%.c=$(WORKPATH)/c/%.$(OBJ)) 144 | endif 145 | 146 | ifeq ($(CONFIG_LTO), 1) 147 | ifneq ("0$(CPPFILES_ICC)", "0") 148 | OBJFILES += $(WORKPATH)/make_lto_icc/icclto_$(TARGET).$(OBJ) 149 | endif 150 | OBJFILES += $(CPPFILES_MSCC) $(CPPFILES_GCC) $(CPPFILES_CLANG) 151 | else 152 | OBJFILES += $(CPPFILES_ICC) $(CPPFILES_MSCC) $(CPPFILES_GCC) $(CPPFILES_CLANG) 153 | endif 154 | 155 | ifneq ($(CONFIG_GDB), 0) 156 | GCCFLAGSOPT += -ggdb 157 | GCCFLAGSDBG += -ggdb 158 | CLANGFLAGSOPT += -ggdb 159 | endif 160 | 161 | ifeq ($(CONFIG_CUDA_DC), 1) 162 | OBJFILES += $(WORKPATH)/make_cuda_device_link/dl_$(TARGET).$(OBJ) 163 | NVCCFLAGSDC = -dc 164 | endif 165 | 166 | ifeq ($(ARCH_CYGWIN), 1) 167 | ifeq ($(INCLUDEPATHS)$(COMMONINCLUDEPATHS), ) 168 | GCCINCLUDEPATHS = 169 | else 170 | GCCINCLUDEPATHSA := $(INCLUDEPATHS) $(COMMONINCLUDEPATHS) 171 | GCCINCLUDEPATHSB := $(shell cygpath -u $(GCCINCLUDEPATHSA)) 172 | GCCINCLUDEPATHS := $(GCCINCLUDEPATHSB:%=-I%) 173 | endif 174 | else 175 | GCCINCLUDEPATHS := $(INCLUDEPATHS:%=-I%) $(COMMONINCLUDEPATHS:%=-I%) $(INCLUDEPATHSSYSTEM:%=-isystem %) 176 | endif 177 | VSINCLUDEPATHS := $(INCLUDEPATHS:%=/I%) 
$(COMMONINCLUDEPATHS:%=/I%) 178 | 179 | ifeq ($(ARCHBITS), 64) 180 | DEFINES += _64BIT 181 | endif 182 | GCCDEFINES := $(DEFINES:%=-D%) $(DEFINESARCH:%=-D%) 183 | VSDEFINESTMP := $(DEFINES:%=/D%) $(DEFINESARCH:%=/D%) 184 | VSDEFINES := $(subst =, BAT_SPECIAL_EQ ,$(VSDEFINESTMP)) 185 | 186 | LIBFILES = $(LIBSUSE) 187 | 188 | .SECONDARY: $(CUFILES:%.cu=$(WORKPATH)/cu/%.cpp) $(ASMFILES:%.asm=$(WORKPATH)/asm/%.cpp) $(RESOURCEFILES:%=$(WORKPATH)/res/%.$(OBJ)) $(CLFILES:%.cl=$(WORKPATH)/cl/%.$(OBJ)) $(QTFILES:%.ui=$(WORKPATH)/qt/%_moc.$(OBJ)) $(QTFILES:%.ui=$(WORKPATH)/qt/%_moc.cpp) $(QTFILES:%.ui=$(WORKPATH)/qt/%_ui.h) $(CLFILES:%.cl=$(WORKPATH)/cl/%.clbin) 189 | 190 | $(EXECUTABLE): $(EXTRADEPS) $(OBJFILES) $(EXTRAOBJFILES) $(ALLDEP) 191 | $(LINK) $(EXTRAFLAGSLINK) $(LIBPATHSUSE) $(OBJFILES) $(EXTRAOBJFILES) $(LIBFILES) $(LINKFLAGSUSE) $(LINKOUTPUT) $(LINKTARGETTYPE) 192 | $(HIDEECHOA) if [ -e "$(BUILDSCRIPT)" ]; then ./$(BUILDSCRIPT); fi 193 | 194 | $(WORKPATH)/cpp/%.$(OBJ): %.cpp $(ALLDEP) 195 | $(MKDIR) 196 | $(CC) $(INCLUDEPATHSUSE) $(DEFINESUSE) $(FILEFLAGS$<) $(COMPILEONLY) $< $(COMPILEOUTPUT) 197 | 198 | $(WORKPATH)/cxx/%.$(OBJ): %.cxx $(ALLDEP) 199 | $(CC) $(INCLUDEPATHSUSE) $(DEFINESUSE) $(FILEFLAGS$<) $(COMPILEONLY) $< $(COMPILEOUTPUT) 200 | 201 | $(WORKPATH)/c/%.$(OBJ): %.c $(ALLDEP) 202 | $(CC) $(INCLUDEPATHSUSE) $(DEFINESUSE) $(FILEFLAGS$<) $(COMPILEONLY) $< $(COMPILEOUTPUT) 203 | 204 | $(WORKPATH)/cu/%.$(OBJ): $(WORKPATH)/cu/%.cpp $(ALLDEP) 205 | $(CCCUDA) $(INCLUDEPATHSUSE) $(DEFINESUSE) $(FILEFLAGS$<) $(COMPILEONLY) $< $(COMPILEOUTPUT) 206 | 207 | $(WORKPATH)/cu/%.cpp: %.cu $(ALLDEP) 208 | $(NVCC) $(NVCCFLAGSUSE) $(NVCCFLAGSDC) $(CUDAINCLUDEPATHS) $(CUDADEFINES) $(FILEFLAGS$<) --cuda --output-file "$@" $< 209 | $(HIDEECHOA) cat $@ | grep -v NVCC_GREP | sed "s/#pragma detect_mismatch(\"_MSC_VER\", \"1600\")//g" > $@.tmp 210 | $(HIDEECHOA) mv -f $@.tmp $@ 211 | -if [ -e "$<.$(ARCH).patch" ]; then patch -r /dev/null -s --no-backup-if-mismatch -i 
$<.$(ARCH).patch $@; fi 212 | 213 | $(WORKPATH)/asm/%.$(OBJ): $(WORKPATH)/asm/%.asm $(ALLDEP) 214 | $(ASM) $(COMPILEOUTPUT) $(ASMONLY) $< 215 | 216 | $(WORKPATH)/asm/%.asm: %.asm $(ALLDEP) 217 | $(CC) $(PRECOMPILEONLY) $(FILEFLAGS$<) $(DEFINESUSE) $< > $@ 218 | 219 | $(WORKPATH)/dbg/%.$(OBJ): %.cpp $(ALLDEP) 220 | $(CCDBG) $(INCLUDEPATHSUSE) $(DEFINESUSE) $(FILEFLAGS$<) $(COMPILEONLY) $< $(COMPILEOUTPUT) 221 | $(WORKPATH)/icc/%.$(OBJ): %.cpp $(ALLDEP) 222 | $(ICC) $(INCLUDEPATHSUSE) $(DEFINESUSE) $(FILEFLAGS$<) $(COMPILEONLY) $< $(COMPILEOUTPUT) 223 | $(WORKPATH)/vcc/%.$(OBJ): %.cpp $(ALLDEP) 224 | $(VCC) $(INCLUDEPATHSUSE) $(DEFINESUSE) $(FILEFLAGS$<) $(COMPILEONLY) $< 225 | $(WORKPATH)/gcc/%.$(OBJ): %.cpp $(ALLDEP) 226 | $(GCC) $(GCCINCLUDEPATHS) $(GCCDEFINES) $(FILEFLAGS$<) -c $< -o $@ 227 | $(WORKPATH)/mscc/%.$(OBJ): %.cpp $(ALLDEP) 228 | $(MSCC) $(INCLUDEPATHSUSE) $(DEFINESUSE) $(FILEFLAGS$<) $(COMPILEONLY) $< $(COMPILEOUTPUT) 229 | $(WORKPATH)/clang/%.$(OBJ): %.cpp $(ALLDEP) 230 | $(CLANG) $(GCCINCLUDEPATHS) $(GCCDEFINES) $(FILEFLAGS$<) -c $< -o $@ 231 | 232 | 233 | $(WORKPATH)/qt/%_ui.h: %.ui $(ALLDEP) 234 | $(MKDIR) 235 | $(QTUIC) -o $@ $< 236 | 237 | $(WORKPATH)/qt/%_moc.cpp: %.h $(WORKPATH)/qt/%_ui.h $(ALLDEP) 238 | $(QTMOC) -DQT_NO_DEBUG -DQT_GUI_LIB -DQT_CORE_LIB -DQT_SHARED -o $@ $< 239 | 240 | $(WORKPATH)/qt/%_moccpp.cpp: %.h $(ALLDEP) 241 | $(QTMOC) -DQT_NO_DEBUG -DQT_GUI_LIB -DQT_CORE_LIB -DQT_SHARED -o $@ $< 242 | 243 | $(WORKPATH)/qt/%.$(OBJ): $(WORKPATH)/qt/%.cpp $(ALLDEP) 244 | $(CC) $(INCLUDEPATHSUSE) $(DEFINESUSE) $(FILEFLAGS$<) $(COMPILEONLY) $< $(COMPILEOUTPUT) 245 | 246 | $(WORKPATH)/_makefiles_support_opencl.$(OBJ): makefiles/makefile_opencl_compiler.cpp $(ALLDEP) 247 | $(HIDEECHOB) mkdir -p $(WORKPATH) 248 | $(CC) $(INCLUDEPATHSUSE) $(COMPILEONLY) $< $(COMPILEOUTPUT) 249 | $(WORKPATH)/_makefiles_support_opencl.exe: $(WORKPATH)/_makefiles_support_opencl.$(OBJ) $(ALLDEP) 250 | $(LINK) $(LIBPATHSUSE) $< $(LINKFLAGSUSE) $(LINKOUTPUT) 
$(LINKFLAGSARCH) $(OPENCLLIB) 251 | 252 | $(WORKPATH)/cl/%.clbin: %.cl $(WORKPATH)/_makefiles_support_opencl.exe $(ALLDEP) 253 | $(HIDEECHOA) $(OPENCL_ENVIRONMENT) ./$(WORKPATH)/_makefiles_support_opencl.exe -output-file $@ $< -- -I. $(GCCINCLUDEPATHS) $(GCCDEFINES) $(OPENCL_DEF_OPTIONS) $(OPENCL_OPTIONS) > /dev/null 254 | 255 | $(WORKPATH)/cl/%.$(OBJ): $(WORKPATH)/cl/%.clbin $(ALLDEP) 256 | @echo $< 257 | $(HIDEECHOA) sed -e "s&FILENAMENORMAL&$<&g" -e "s&FILENAMEMOD&_makefile_opencl_program_`echo $< | sed -e "s&$(WORKPATH)/cl/&&g" -e "s&\.clbin&\.cl&g" -e "s&\.&_&g" -e "s&/&_&g" -e "s& &_&g"`&g" makefiles/include.S | $(GCC3264) -c -x assembler -o $@ - 258 | $(WORKPATH)/res/%.$(OBJ): % $(ALLDEP) 259 | @echo $< 260 | $(MKDIR) 261 | $(HIDEECHOA) sed -e "s&FILENAMENORMAL&$<&g" -e "s&FILENAMEMOD&_resource_`echo $< | sed -e "s&\.&_&g" -e "s&/&_&g" -e "s& &_&g"`&g" makefiles/include.S | $(GCC3264) -c -x assembler -o $@ - 262 | 263 | 264 | 265 | $(WORKPATH)/make_lto_icc/icclto_$(TARGET).$(OBJ): $(CPPFILES_ICC) $(ALLDEP) 266 | $(HIDEECHOA) mkdir -p $(WORKPATH)/make_lto_icc 267 | $(ICCLINK) $(INTELLINKIPO)$@ $(CPPFILES_ICC) 268 | 269 | $(WORKPATH)/make_cuda_device_link/dl_$(TARGET).$(OBJ): $(CUFILES:%.cu=$(WORKPATH)/cu/%.$(OBJ)) $(ALLDEP) 270 | $(HIDEECHOA) mkdir -p $(WORKPATH)/make_cuda_device_link 271 | $(NVCC) $(NVCCFLAGSUSE) -dlink $(CUFILES:%.cu=$(WORKPATH)/cu/%.$(OBJ)) -o $@ -lcudadevrt 272 | 273 | clean: $(SUBTARGETS:%=subclean/%.mak) 274 | rm -Rf *.plg *.dpi *.exp *.lib $(EXECUTABLE) $(COPIED_EXECUTABLE) x64/release/* *.cubin *.gpu *.ptx *.linkinfo *.ii cuda.compute_* $(DEPENDS) $(OBJFILES) $(CUFILES:%.cu=$(WORKPATH)/cu/%.cpp) $(ASMFILES:%.asm=$(WORKPATH)/asm/%.asm) $(CLEANRELEASEDIR) $(SUBTARGETS_CLEAN) 275 | 276 | subclean/%.mak: 277 | +export CONFIGFILE=config_`echo $@ | sed s,subclean/,,` && $(MAKE) -f makefile clean 278 | 279 | SAVEDIR = releases/`date +%F`-BUILD-`cat buildnr` 280 | backup: 281 | mkdir $(SAVEDIR) 282 | cp *.cpp *.h makefile buildnr *.sh *.bat 
*.conf *.cu $(SAVEDIR) 283 | 284 | #Dependencies 285 | 286 | DEPMKDIR = $(MKDIR) && 287 | DEPGCC = $(GCC3264) $(GCCFLAGSARCH) $(GCCINCLUDEPATHS) $(GCCDEFINES) -x c++ -MM $< 288 | DEPSED1 = sed -e ':a;N;$$!ba;s/\n/ /g' | sed -e 289 | DEPSED2 = -e 's,\\,/,g' \ 290 | -e 's,[cC]:/,/cygdrive/c/,g' > \ 291 | $@; 292 | 293 | 294 | $(WORKPATH)/qt/%.qtd: %.cpp %.ui $(WORKPATH)/qt/%_ui.h $(ALLDEP) 295 | $(DEPMKDIR) $(DEPGCC) | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/cpp/$*.$(OBJ) $@ : ,g' $(DEPSED2) 296 | 297 | $(WORKPATH)/cl/%.d: %.cl $(ALLDEP) 298 | $(DEPMKDIR) $(DEPGCC) -I$(GCCPATH)/include -D__OPENCL__ | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/cl/$*.clbin $(WORKPATH)/cl/$*.cl $@ : ,g' $(DEPSED2) 299 | 300 | $(WORKPATH)/cu/%.d: %.cu $(ALLDEP) 301 | $(DEPMKDIR) $(DEPGCC) -I$(GCCPATH)/include -D_MSC_VER=1700 -D__CUDACC__ | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/cu/$*.$(OBJ) $(WORKPATH)/cu/$*.cpp $@ : ,g' $(DEPSED2) 302 | 303 | $(WORKPATH)/asm/%.d: %.asm $(ALLDEP) 304 | $(DEPMKDIR) $(DEPGCC) | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/asm/$*.$(OBJ) $(WORKPATH)/asm/$*.asm $@ : ,g' $(DEPSED2) 305 | 306 | $(WORKPATH)/cpp/%.d: %.cpp $(ALLDEP) 307 | $(DEPMKDIR) $(DEPGCC) | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/cpp/$*.$(OBJ) $@ : ,g' $(DEPSED2) 308 | 309 | $(WORKPATH)/cxx/%.d: %.cxx $(ALLDEP) 310 | $(DEPMKDIR) $(DEPGCC) | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/cxx/$*.$(OBJ) $@ : ,g' $(DEPSED2) 311 | 312 | $(WORKPATH)/c/%.d: %.c $(ALLDEP) 313 | $(DEPMKDIR) $(DEPGCC) | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/c/$*.$(OBJ) $@ : ,g' $(DEPSED2) 314 | 315 | $(WORKPATH)/dbg/%.d: %.cpp $(ALLDEP) 316 | $(DEPMKDIR) $(DEPGCC) -DDEBUG_RUNTIME | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/dbg/$*.$(OBJ) $@ : ,g' $(DEPSED2) 317 | 318 | $(WORKPATH)/icc/%.d: %.cpp $(ALLDEP) 319 | $(DEPMKDIR) $(DEPGCC) -DINTEL_RUNTIME | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/icc/$*.$(OBJ) $@ : ,g' $(DEPSED2) 320 | 321 | $(WORKPATH)/vcc/%.d: %.cpp 
$(ALLDEP) 322 | $(DEPMKDIR) $(DEPGCC) -DVECTORC_RUNTIME | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/vcc/$*.$(OBJ) $@ : ,g' $(DEPSED2) 323 | 324 | $(WORKPATH)/gcc/%.d: %.cpp $(ALLDEP) 325 | $(DEPMKDIR) $(DEPGCC) -DGCC_RUNTIME | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/gcc/$*.$(OBJ) $@ : ,g' $(DEPSED2) 326 | 327 | $(WORKPATH)/mscc/%.d: %.cpp $(ALLDEP) 328 | $(DEPMKDIR) $(DEPGCC) -DVSNET_RUNTIME | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/mscc/$*.$(OBJ) $@ : ,g' $(DEPSED2) 329 | 330 | $(WORKPATH)/clang/%.d: %.cpp $(ALLDEP) 331 | $(DEPMKDIR) $(DEPGCC) -DCLANG_RUNTIME | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/clang/$*.$(OBJ) $@ : ,g' $(DEPSED2) 332 | 333 | ifneq ($(CHECK_DEPENDENCIES), 0) 334 | include $(DEPENDS) 335 | endif 336 | -------------------------------------------------------------------------------- /makefiles/makefile_opencl_compiler.cpp: -------------------------------------------------------------------------------- 1 | #define _CRT_SECURE_NO_WARNINGS 2 | #include "CL/opencl.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "opencl_compiler_structs.h" 10 | 11 | #define quit(arg) {fprintf(stderr, arg "\n");return(1);} 12 | #define DEFAULT_OPENCL_COMPILER_OPTIONS "" 13 | #define DEFAULT_OUTPUT_FILE "opencl.out" 14 | 15 | int main(int argc, char** argv) 16 | { 17 | const char* output_file = DEFAULT_OUTPUT_FILE; 18 | std::string compiler_options = DEFAULT_OPENCL_COMPILER_OPTIONS; 19 | std::vector files; 20 | 21 | printf("Passing command line options:\n"); 22 | bool add_option = false; 23 | for (int i = 1;i < argc;i++) 24 | { 25 | if (add_option) 26 | { 27 | compiler_options += " "; 28 | compiler_options += argv[i]; 29 | } 30 | else if (strcmp(argv[i], "--") == 0) 31 | { 32 | add_option = true; 33 | } 34 | else if (strcmp(argv[i], "-output-file") == 0) 35 | { 36 | if (++i >= argc) quit("Output file name missing"); 37 | output_file = argv[i]; 38 | } 39 | else 40 | { 41 | fprintf(stderr, "%s\n", 
argv[i]); 42 | files.push_back(argv[i]); 43 | } 44 | } 45 | 46 | cl_int ocl_error; 47 | cl_uint num_platforms; 48 | if (clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS) quit("Error getting OpenCL Platform Count"); 49 | if (num_platforms == 0) quit("No OpenCL Platform found"); 50 | printf("%d OpenCL Platforms found\n", num_platforms); 51 | 52 | //Query platforms 53 | cl_platform_id* platforms = new cl_platform_id[num_platforms]; 54 | if (platforms == NULL) quit("Memory allocation error"); 55 | if (clGetPlatformIDs(num_platforms, platforms, NULL) != CL_SUCCESS) quit("Error getting OpenCL Platforms"); 56 | 57 | cl_platform_id platform; 58 | bool found = false; 59 | 60 | _makefiles_opencl_platform_info pinfo; 61 | for (unsigned int i_platform = 0;i_platform < num_platforms;i_platform++) 62 | { 63 | clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_PROFILE, 64, pinfo.platform_profile, NULL); 64 | clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_VERSION, 64, pinfo.platform_version, NULL); 65 | clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_NAME, 64, pinfo.platform_name, NULL); 66 | clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_VENDOR, 64, pinfo.platform_vendor, NULL); 67 | printf("Available Platform %d: (%s %s) %s %s\n", i_platform, pinfo.platform_profile, pinfo.platform_version, pinfo.platform_vendor, pinfo.platform_name); 68 | if (strcmp(pinfo.platform_vendor, "Advanced Micro Devices, Inc.") == 0) 69 | { 70 | found = true; 71 | printf("AMD OpenCL Platform found (%d)\n", i_platform); 72 | platform = platforms[i_platform]; 73 | break; 74 | } 75 | } 76 | if (found == false) 77 | { 78 | quit("Did not find AMD OpenCL Platform"); 79 | } 80 | 81 | if (clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &pinfo.count) != CL_SUCCESS) 82 | { 83 | quit("Error getting OPENCL Device Count"); 84 | } 85 | 86 | //Query devices 87 | cl_device_id* devices = new cl_device_id[pinfo.count]; 88 | if (devices == NULL) quit("Memory allocation error"); 89 | if 
(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, pinfo.count, devices, NULL) != CL_SUCCESS) quit("Error getting OpenCL devices"); 90 | 91 | _makefiles_opencl_device_info dinfo; 92 | cl_device_type device_type; 93 | cl_uint freq, shaders; 94 | 95 | printf("Available OPENCL devices:\n"); 96 | for (unsigned int i = 0;i < pinfo.count;i++) 97 | { 98 | printf("Examining device %d\n", i); 99 | 100 | clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 64, dinfo.device_name, NULL); 101 | clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, 64, dinfo.device_vendor, NULL); 102 | clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL); 103 | clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(freq), &freq, NULL); 104 | clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(shaders), &shaders, NULL); 105 | clGetDeviceInfo(devices[i], CL_DEVICE_ADDRESS_BITS, sizeof(dinfo.nbits), &dinfo.nbits, NULL); 106 | printf("Found Device %d: %s %s (Frequency %d, Shaders %d, %d bit)\n", i, dinfo.device_vendor, dinfo.device_name, (int) freq, (int) shaders, (int) dinfo.nbits); 107 | } 108 | 109 | if (files.size() == 0) 110 | { 111 | quit("Syntax: opencl [-output-file OUTPUT_FILE] FILE1 [FILE2] ... [FILEn] [-- COMPILER_OPTION_1] [COMPILER_OPTION_2] ... 
[COMPILER_OPTION_N]"); 112 | } 113 | 114 | char** buffers = (char**) malloc(files.size() * sizeof(char*)); 115 | if (buffers == NULL) quit("Memory allocation error\n"); 116 | for (unsigned int i = 0;i < files.size();i++) 117 | { 118 | printf("Reading source file %s\n", files[i]); 119 | FILE* fp = fopen(files[i], "rb"); 120 | if (fp == NULL) 121 | { 122 | printf("Cannot open %s\n", files[i]); 123 | return(1); 124 | } 125 | fseek(fp, 0, SEEK_END); 126 | size_t file_size = ftell(fp); 127 | fseek(fp, 0, SEEK_SET); 128 | 129 | buffers[i] = (char*) malloc(file_size + 1); 130 | if (buffers[i] == NULL) 131 | { 132 | quit("Memory allocation error"); 133 | } 134 | if (fread(buffers[i], 1, file_size, fp) != file_size) 135 | { 136 | quit("Error reading file"); 137 | } 138 | buffers[i][file_size] = 0; 139 | fclose(fp); 140 | } 141 | 142 | printf("Creating OpenCL Context\n"); 143 | //Create OpenCL context 144 | cl_context context = clCreateContext(NULL, pinfo.count, devices, NULL, NULL, &ocl_error); 145 | if (ocl_error != CL_SUCCESS) quit("Error creating OpenCL context"); 146 | 147 | printf("Creating OpenCL Program Object\n"); 148 | //Create OpenCL program object 149 | cl_program program = clCreateProgramWithSource(context, (cl_uint) files.size(), (const char**) buffers, NULL, &ocl_error); 150 | if (ocl_error != CL_SUCCESS) quit("Error creating program object"); 151 | 152 | printf("Compiling OpenCL Program\n"); 153 | //Compile program 154 | ocl_error = clBuildProgram(program, pinfo.count, devices, compiler_options.c_str(), NULL, NULL); 155 | if (ocl_error != CL_SUCCESS) 156 | { 157 | fprintf(stderr, "OpenCL Error while building program: %d (Compiler options: %s)\n", ocl_error, compiler_options.c_str()); 158 | fprintf(stderr, "OpenCL Kernel:\n\n"); 159 | for (unsigned int i = 0;i < files.size();i++) 160 | { 161 | printf("%s\n\n", buffers[i]); 162 | } 163 | 164 | for (unsigned int i = 0;i < pinfo.count;i++) 165 | { 166 | cl_build_status status; 167 | clGetProgramBuildInfo(program, 
devices[i], CL_PROGRAM_BUILD_STATUS, sizeof(status), &status, NULL); 168 | if (status == CL_BUILD_ERROR) 169 | { 170 | size_t log_size; 171 | clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); 172 | char* build_log = (char*) malloc(log_size + 1); 173 | if (build_log == NULL) quit("Memory allocation error"); 174 | clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); 175 | fprintf(stderr, "Build Log (device %d):\n\n%s\n\n", i, build_log); 176 | free(build_log); 177 | } 178 | } 179 | } 180 | for (unsigned int i = 0;i < files.size();i++) 181 | { 182 | free(buffers[i]); 183 | } 184 | free(buffers); 185 | if (ocl_error != CL_SUCCESS) return(1); 186 | 187 | printf("Obtaining program binaries\n"); 188 | size_t* binary_sizes = (size_t*) malloc(pinfo.count * sizeof(size_t)); 189 | if (binary_sizes == NULL) quit("Memory allocation error"); 190 | clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, pinfo.count * sizeof(size_t), binary_sizes, NULL); 191 | char** binary_buffers = (char**) malloc(pinfo.count * sizeof(char*)); 192 | if (binary_buffers == NULL) quit("Memory allocation error"); 193 | for (unsigned int i = 0;i < pinfo.count;i++) 194 | { 195 | printf("Binary size for device %d: %d\n", i, (int) binary_sizes[i]); 196 | binary_buffers[i] = (char*) malloc(binary_sizes[i]); 197 | memset(binary_buffers[i], 0, binary_sizes[i]); 198 | if (binary_buffers[i] == NULL) quit("Memory allocation error"); 199 | } 200 | clGetProgramInfo(program, CL_PROGRAM_BINARIES, pinfo.count * sizeof(char*), binary_buffers, NULL); 201 | 202 | printf("Programs obtained successfully, cleaning up opencl\n"); 203 | clReleaseProgram(program); 204 | clReleaseContext(context); 205 | 206 | printf("Writing binaries to file (%s)\n", output_file); 207 | FILE* fp; 208 | fp = fopen(output_file, "w+b"); 209 | if (fp == NULL) quit("Error opening output file\n"); 210 | const char* magic_bytes = "QOCLPB"; 211 | fwrite(magic_bytes, 1, 
strlen(magic_bytes) + 1, fp); 212 | fwrite(&pinfo, 1, sizeof(pinfo), fp); 213 | for (unsigned int i = 0;i < pinfo.count;i++) 214 | { 215 | clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 64, dinfo.device_name, NULL); 216 | clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, 64, dinfo.device_vendor, NULL); 217 | dinfo.binary_size = binary_sizes[i]; 218 | fwrite(&dinfo, 1, sizeof(dinfo), fp); 219 | fwrite(binary_buffers[i], 1, binary_sizes[i], fp); 220 | } 221 | fclose(fp); 222 | 223 | printf("All done, cleaning up remaining buffers\n"); 224 | for (unsigned int i = 0;i < pinfo.count;i++) 225 | { 226 | free(binary_buffers[i]); 227 | } 228 | free(binary_sizes); 229 | free(binary_buffers); 230 | 231 | return(0); 232 | } -------------------------------------------------------------------------------- /makefiles/opencl_compiler_structs.h: -------------------------------------------------------------------------------- 1 | struct _makefiles_opencl_platform_info 2 | { 3 | char platform_profile[64]; 4 | char platform_version[64]; 5 | char platform_name[64]; 6 | char platform_vendor[64]; 7 | cl_uint count; 8 | }; 9 | 10 | struct _makefiles_opencl_device_info 11 | { 12 | char device_name[64]; 13 | char device_vendor[64]; 14 | cl_uint nbits; 15 | size_t binary_size; 16 | }; 17 | -------------------------------------------------------------------------------- /makefiles/opencl_obtain_program.h: -------------------------------------------------------------------------------- 1 | #ifndef MAKEFILES_OPENCL_OBTAIN_PROGRAMH 2 | #define MAKEFILES_OPENCL_OBTAIN_PROGRAMH 3 | 4 | #include 5 | #include 6 | #include "opencl_compiler_structs.h" 7 | 8 | static int _makefiles_opencl_obtain_program_helper(cl_context context, cl_uint num_devices, cl_device_id* devices, cl_program* program, char* binaries) 9 | { 10 | const char* magic_bytes = "QOCLPB"; 11 | if (strncmp(magic_bytes, binaries, strlen(magic_bytes)) != 0) 12 | { 13 | printf("Internal error accessing opencl program\n"); 14 | return(1); 15 | } 
16 | char* current_ptr = binaries + strlen(magic_bytes) + 1; 17 | _makefiles_opencl_platform_info* pinfo = (_makefiles_opencl_platform_info*) current_ptr; 18 | current_ptr += sizeof(_makefiles_opencl_platform_info); 19 | 20 | if (num_devices != pinfo->count) 21 | { 22 | printf("Number of devices differs from number of devices in opencl program\n"); 23 | return(1); 24 | } 25 | //printf("Obtaining program for OpenCL Platform: (%s %s) %s %s\n", pinfo->platform_profile, pinfo->platform_version, pinfo->platform_vendor, pinfo->platform_name); 26 | 27 | std::vector program_sizes(pinfo->count); 28 | std::vector program_binaries(pinfo->count); 29 | 30 | for (unsigned int i = 0;i < pinfo->count;i++) 31 | { 32 | char device_name[64], device_vendor[64]; 33 | cl_uint nbits; 34 | clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 64, device_name, NULL); 35 | clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, 64, device_vendor, NULL); 36 | clGetDeviceInfo(devices[i], CL_DEVICE_ADDRESS_BITS, sizeof(nbits), &nbits, NULL); 37 | _makefiles_opencl_device_info* dinfo = (_makefiles_opencl_device_info*) current_ptr; 38 | if (strcmp(device_name, dinfo->device_name) != 0 || strcmp(device_vendor, dinfo->device_vendor) != 0) 39 | { 40 | printf("Device list is different to device list from opencl program\n"); 41 | return(1); 42 | } 43 | if (nbits != dinfo->nbits) 44 | { 45 | printf("Pointer size of device and stored device binary differs\n"); 46 | return(1); 47 | } 48 | current_ptr += sizeof(_makefiles_opencl_device_info); 49 | //printf("Device %d: %s %s (size %lld)\n", i, dinfo->device_vendor, dinfo->device_name, (long long int) dinfo->binary_size); 50 | program_sizes[i] = dinfo->binary_size; 51 | program_binaries[i] = current_ptr; 52 | current_ptr += dinfo->binary_size; 53 | } 54 | 55 | cl_int return_status[pinfo->count]; 56 | cl_int ocl_error; 57 | *program = clCreateProgramWithBinary(context, num_devices, devices, program_sizes.data(), (const unsigned char**) program_binaries.data(), return_status, 
&ocl_error); 58 | 59 | if (ocl_error != CL_SUCCESS) 60 | { 61 | printf("Error loading program\n"); 62 | return(1); 63 | } 64 | 65 | for (unsigned int i = 0;i < pinfo->count;i++) 66 | { 67 | if (return_status[i] != CL_SUCCESS) 68 | { 69 | printf("Error loading program for device %d\n", i); 70 | clReleaseProgram(*program); 71 | return(1); 72 | } 73 | } 74 | 75 | ocl_error = clBuildProgram(*program, num_devices, devices, "", NULL, NULL); 76 | if (ocl_error != CL_SUCCESS) 77 | { 78 | printf("Error building program\n"); 79 | clReleaseProgram(*program); 80 | return(1); 81 | } 82 | 83 | return(0); 84 | } 85 | 86 | #endif 87 | -------------------------------------------------------------------------------- /makefiles/x86_64-pc-linux-gnu.mak: -------------------------------------------------------------------------------- 1 | CUDAPATH = $(CUDA_PATH) 2 | CUDASDKPATH = $(CUDAPATH)/sdk 3 | AMDPATH = $(AMDAPPSDKROOT) 4 | INTELPATH := $(shell which icc 2> /dev/null | sed "s,/bin/.*/icc$$,,") 5 | ifeq ($(INTELPATH), ) 6 | INTELPATH = /opt/intel/compilers_and_libraries_2016.2.181/linux 7 | endif 8 | 9 | GCC3264 = c++ 10 | CLANG3264 = clang++ 11 | ICC32 = $(INTELPATH)/bin/ia32/icc 12 | ICC64 = $(INTELPATH)/bin/intel64/icc 13 | 14 | #Intel Compiler Options 15 | INTELFLAGSOPT = -O3 -fno-alias -fno-fnalias -x$(INTELARCH) -unroll -unroll-aggressive -g0 16 | ifeq ($(CONFIG_LTO), 1) 17 | INTELFLAGSOPT += -ipo 18 | INTELLINKIPO = -ipo-c -ipo-fo 19 | else 20 | INTELFLAGSOPT += -ip 21 | endif 22 | INTELFLAGSDBG = -O0 -g 23 | INTELFLAGSCOMMON = -DINTEL_RUNTIME $(INTELFLAGSUSE) -fasm-blocks 24 | INTELFLAGS32 = $(INTELFLAGSCOMMON) -m32 25 | INTELFLAGS64 = $(INTELFLAGSCOMMON) -m64 -D_AMD64_ 26 | 27 | ifeq ($(GCCARCH), ) 28 | GCCARCHA = -march=native -msse4.2 -m$(ARCHBITS) 29 | else 30 | GCCARCHA = -march=$(GCCARCH) -msse4.2 -m$(ARCHBITS) 31 | endif 32 | 33 | ifeq ("$(CONFIG_OPENMP)", "1") 34 | INTELFLAGSCOMMON += -qopenmp 35 | ifneq ("0$(CPPFILES_ICC)", "0") 36 | LIBSUSE += -liomp5 37 | endif 
# (continues the CONFIG_OPENMP conditional opened earlier in this file)
ifeq ($(CC_SELECTED), ICC)
LIBSUSE += -liomp5
endif
endif

#GCC link flags
LINKFLAGSCOMMON = -Wall
ifeq ($(CONFIG_STATIC), 1)
    LINKFLAGSCOMMON += -static
endif
ifneq ($(CONFIG_GDB), 0)
    LINKFLAGSCOMMON += -ggdb
endif
LINKFLAGS32 = -m32 $(LINKFLAGSCOMMON)
LINKFLAGS64 = -m64 $(LINKFLAGSCOMMON)

#Compilation Output Control
ifneq ("$(VERBOSE)", "1")
    HIDEECHOB = @
endif

#Per-bitness tool selection. ICCLINK uses the Intel driver in both branches and the same
#-qopenmp spelling as INTELFLAGSCOMMON (a GNU driver would read "-openmp" as "-o penmp").
ifeq ($(ARCHBITS), 64)
    ASM = yasm -f elf64
    ICC = $(ICC64) $(INTELFLAGS64) $(CFLAGS64) $(COMPILETARGETTYPE)
    GCC = $(GCC3264) $(GCCFLAGS64) $(GCCFLAGSCOMMON) $(GCCFLAGSUSE) $(COMPILETARGETTYPE)
    CCDBG = $(GCC3264) $(GCCFLAGS64) $(GCCFLAGSCOMMON) $(GCCFLAGSDBG) $(COMPILETARGETTYPE) -DDEBUG_RUNTIME
    GCCLINK = $(GCC3264) $(LINKFLAGS64)
    ICCLINK = $(ICC64) $(LINKFLAGS64) -qopenmp
    CUDALIBPATH = $(CUDAPATH)/lib64
    AMDLIBPATH = $(AMDPATH)/lib/x86_64
    INTELLIBPATH = $(INTELPATH)/compiler/lib/intel64
    CLANG = $(CLANG3264) $(GCCFLAGS64) $(CLANGFLAGSCOMMON) $(CLANGFLAGSUSE) $(COMPILETARGETTYPE)
else
    ASM = yasm -f elf32
    ICC = $(ICC32) $(INTELFLAGS32) $(CFLAGS32) $(COMPILETARGETTYPE)
    GCC = $(GCC3264) $(GCCFLAGS32) $(GCCFLAGSCOMMON) $(GCCFLAGSUSE) $(COMPILETARGETTYPE)
    CCDBG = $(GCC3264) $(GCCFLAGS32) $(GCCFLAGSCOMMON) $(GCCFLAGSDBG) $(COMPILETARGETTYPE) -DDEBUG_RUNTIME
    GCCLINK = $(GCC3264) $(LINKFLAGS32)
    ICCLINK = $(ICC32) $(LINKFLAGS32) -qopenmp
    CUDALIBPATH = $(CUDAPATH)/lib
    AMDLIBPATH = $(AMDPATH)/lib/x86
    INTELLIBPATH = $(INTELPATH)/compiler/lib/ia32
    CLANG = $(CLANG3264) $(GCCFLAGS32) $(CLANGFLAGSCOMMON) $(CLANGFLAGSUSE) $(COMPILETARGETTYPE)
endif
QTUIC = uic
QTMOC = moc

ifeq ($(TARGETTYPE), LIB)
    LINKTARGETTYPE = -shared
    COMPILETARGETTYPE = -fPIC
    EXECUTABLE = $(TARGET).so
else
    LINKTARGETTYPE =
    COMPILETARGETTYPE =
    EXECUTABLE = $(TARGET)
endif
LIBGLIBC =

LIBSUSE += $(LIBGLIBC) -lrt -ldl -lpthread

#Default compiler / linker, selected through CC_x86_64-pc-linux-gnu (ICC, clang, anything else: GCC)
ifeq ($(CC_x86_64-pc-linux-gnu), ICC)
    CC = $(ICC)
    LINK = $(ICCLINK)
else
    ifeq ($(CC_x86_64-pc-linux-gnu), clang)
        CC = $(CLANG)
        LINK = $(CLANG)
    else
        CC = $(GCC)
        LINK = $(GCCLINK)
    endif
    ifneq ($(CPPFILES_ICC), )
        LIBSUSE += -lintlc -lsvml -limf -lirc
    endif
endif
CC_SELECTED = $(CC_x86_64-pc-linux-gnu)

CCCUDA = $(GCC) -x c++ -Wno-effc++
ASMPRE = $(GCC3264)
NVCC = $(CUDAPATH)/bin/nvcc --compiler-bindir $(GCCCUDA)

COMMONINCLUDEPATHS =
LIBPATHSUSE =

ifneq ($(CUFILES), )
    LIBSUSE += -lcudart -lcuda
    ifeq ($(CONFIG_CUDA_DC), 1)
        LIBSUSE += -lcudadevrt
    endif
    ifeq ($(CONFIG_CUBLAS), 1)
        LIBSUSE += -lcublas
    endif
endif
#$(CUDASDKPATH)/C/lib/libcutil.a

OPENCLLIB = -lOpenCL
ifeq ("$(CONFIG_OPENCL)", "1")
    LIBSUSE += $(OPENCLLIB)
endif
ifeq ("$(CONFIG_CAL)", "1")
    LIBSUSE += -laticalcl -laticalrt
    COMMONINCLUDEPATHS += $(AMDPATH)/include/CAL
    LIBPATHSUSE += -L$(AMDLIBPATH)
endif
ifeq ("$(CONFIG_OPENGL)", "1")
    LIBSUSE += -lGL -lGLU -lglut -lGLEW
endif
ifeq ("$(CONFIG_X11)", "1")
    LIBSUSE += -lX11
endif

ifeq ("$(CONFIG_QT)", "1")
    LIBSUSE += -lQtGui -lQtCore
    COMMONINCLUDEPATHS += /usr/include/qt4 /usr/include/qt4/QtGui /usr/include/qt4/QtCore /usr/include/qt4/QtWidgets $(WORKPATH)/qt
    ifeq ($(ARCHBITS), 64)
        LIBPATHSUSE += -L/usr/lib/qt4
    else
        LIBPATHSUSE += -L/usr/lib32/qt4
    endif
endif

LIBSUSE += $(LIBS:%=-l%)

#OpenCL SDK selection: include paths, plus the AMD library path where the AMD SDK is used
ifeq ("$(CONFIG_OPENCL)", "1")
    ifeq ("$(CONFIG_OPENCL_VERSION)", "AMD")
        COMMONINCLUDEPATHS += "$(AMDPATH)/include"
        LIBPATHSUSE += -L$(AMDLIBPATH)
    endif
    ifeq ("$(CONFIG_OPENCL_VERSION)", "NVIDIA")
        COMMONINCLUDEPATHS += "$(CUDAPATH)/include"
    endif
    ifeq ("$(CONFIG_OPENCL_VERSION)", "Intel")
        #COMMONINCLUDEPATHS += ""
    endif
    ifeq ("$(CONFIG_OPENCL_VERSION)", "All")
        COMMONINCLUDEPATHS += "$(AMDPATH)/include"
        COMMONINCLUDEPATHS += "$(CUDAPATH)/include"
        LIBPATHSUSE += -L$(AMDLIBPATH)
    endif
endif

ifeq ("$(CONFIG_CUDA)", "1")
    COMMONINCLUDEPATHS += "$(CUDAPATH)/include"
    COMMONINCLUDEPATHS += "$(CUDASDKPATH)/common/inc"
    LIBPATHSUSE += -L$(CUDALIBPATH)
endif

INCLUDEPATHSUSE = $(GCCINCLUDEPATHS)
DEFINESUSE = $(GCCDEFINES)

LIBPATHSUSE += -L$(INTELLIBPATH) $(LIBPATHS:%=-L%)

NVCCARCHS := `for i in $(CUDAVERSION); do echo -n -gencode arch=compute_$$i,code=sm_$$i\ ;done`
NVCC_GREP = "^#line\|^$$\|^# [0-9]* "

COMPILEOUTPUTBASE = -o
COMPILEOUTPUT = $(COMPILEOUTPUTBASE) $@
LINKOUTPUT = -o $@
COMPILEONLY = -c
ASMONLY =
PRECOMPILEONLY = -x c++ -E
OPTIONINCLUDEPATH = -I
OBJ = o

--------------------------------------------------------------------------------
/makefiles/x86_64-unknown-cygwin.mak:
--------------------------------------------------------------------------------
# 64 bit cygwin reuses the 32 bit cygwin configuration.
# NOTE(review): the variable below is spelled CC_x64_64-..., while the Linux file uses the
# CC_x86_64-... spelling - confirm which name i686-pc-cygwin.mak actually reads.
CC_x64_64-unknown-cygwin = $(CC_i686-pc-cygwin)
ALLDEP += makefiles/i686-pc-cygwin.mak
include makefiles/i686-pc-cygwin.mak

--------------------------------------------------------------------------------
/memtest/.gitignore:
--------------------------------------------------------------------------------
mem

--------------------------------------------------------------------------------
/memtest/build.sh:
--------------------------------------------------------------------------------
# Builds the 'mem' test binary. Libraries come after the sources that use them so that
# linkers which resolve symbols strictly left to right (or use --as-needed) keep them.
c++ -m64 -o mem -L"$AMDAPPSDKROOT/lib/x86_64" -I"$AMDAPPSDKROOT/include" mem.cpp ../cmodules/timer.cpp -lOpenCL -lrt

--------------------------------------------------------------------------------
/memtest/cmd:
-------------------------------------------------------------------------------- 1 | ./mem -g -2 -c -1 -x -z -l -lh 3072 -lw 3072 -lx 20 -ly 20 -a -u 2 | -------------------------------------------------------------------------------- /memtest/info.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for i in "ulimit -m" "ulimit -v" "ulimit -l" "clinfo" "dmesg" "cat /var/log/messages"; do 3 | echo $i 4 | $i | tail -n 1000 5 | done 6 | -------------------------------------------------------------------------------- /memtest/timer.cpp: -------------------------------------------------------------------------------- 1 | #include "timer.h" 2 | #ifdef _WIN32 3 | #include 4 | #include 5 | #else 6 | #include 7 | #endif 8 | 9 | HighResTimer::HighResTimer() 10 | { 11 | ElapsedTime = 0; 12 | } 13 | 14 | HighResTimer::~HighResTimer() {} 15 | 16 | void HighResTimer::Start() 17 | { 18 | #ifdef _WIN32 19 | __int64 istart; 20 | QueryPerformanceCounter((LARGE_INTEGER*)&istart); 21 | StartTime = (double) istart; 22 | #else 23 | timespec tv; 24 | clock_gettime(CLOCK_REALTIME, &tv); 25 | StartTime = (double) tv.tv_sec * 1.0E9 + (double) tv.tv_nsec; 26 | #endif 27 | } 28 | 29 | void HighResTimer::Stop() 30 | { 31 | double EndTime = 0; 32 | #ifdef _WIN32 33 | __int64 iend; 34 | QueryPerformanceCounter((LARGE_INTEGER*) &iend); 35 | EndTime = (double) iend; 36 | #else 37 | timespec tv; 38 | clock_gettime(CLOCK_REALTIME, &tv); 39 | EndTime = (double) tv.tv_sec * 1.0E9 + (double) tv.tv_nsec; 40 | #endif 41 | ElapsedTime += EndTime - StartTime; 42 | } 43 | 44 | void HighResTimer::Reset() 45 | { 46 | ElapsedTime = 0; 47 | StartTime = 0; 48 | } 49 | 50 | double HighResTimer::GetElapsedTime() 51 | { 52 | return ElapsedTime / Frequency; 53 | } 54 | 55 | double HighResTimer::GetFrequency() 56 | { 57 | #ifdef _WIN32 58 | __int64 ifreq; 59 | QueryPerformanceFrequency((LARGE_INTEGER*)&ifreq); 60 | return((double) ifreq); 61 | #else 62 | 
return(1.0E9); 63 | #endif 64 | } 65 | 66 | double HighResTimer::Frequency = HighResTimer::GetFrequency(); -------------------------------------------------------------------------------- /memtest/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef TIMER_H 2 | #define TIMER_H 3 | 4 | class HighResTimer { 5 | 6 | public: 7 | HighResTimer(); 8 | ~HighResTimer(); 9 | void Start(); 10 | void Stop(); 11 | void Reset(); 12 | double GetElapsedTime(); 13 | 14 | private: 15 | static double Frequency; 16 | static double GetFrequency(); 17 | 18 | double ElapsedTime; 19 | double StartTime; 20 | }; 21 | 22 | #endif 23 | --------------------------------------------------------------------------------