├── .gitignore
├── 3rd_party_dgemm_kernels
    └── amd_dgemm_2015_08_05
    │   └── amddgemm.so
├── 3rd_party_template
    ├── config.mak
    ├── kernel.cl
    ├── makefile
    ├── makefiles
    └── template.cpp
├── AUTHORS
├── COPYING
├── COPYING.LESSER
├── README
├── ati_patch
    ├── 10.10
    │   ├── install_org.sh
    │   ├── install_patched.sh
    │   └── libaticaldd.so.xdelta
    └── 10.9
    │   ├── fix.cpp
    │   ├── foo
    │   ├── foo.s
    │   ├── install_org.sh
    │   ├── install_patched.sh
    │   └── libaticaldd.so.xdelta
├── benchmark.cpp
├── cal_fake.h
├── cal_private_ext.h
├── caldgemm.cl
├── caldgemm.cpp
├── caldgemm.h
├── caldgemm.il
├── caldgemm_adl.cpp
├── caldgemm_cal.cpp
├── caldgemm_cal.h
├── caldgemm_cblas_wrapper.h
├── caldgemm_common.h
├── caldgemm_config.sample
├── caldgemm_config_load.h
├── caldgemm_cpu.cpp
├── caldgemm_cpu.h
├── caldgemm_cuda.cu
├── caldgemm_cuda.h
├── caldgemm_opencl.cpp
├── caldgemm_opencl.h
├── caldgemm_parse_parameters.h
├── cmodules
    ├── affinity.cpp
    ├── affinity.h
    ├── get_private_profile.h
    ├── linux_helpers.h
    ├── os_low_level_helper.h
    ├── pthread_mutex_win32_wrapper.h
    ├── qmalloc.cpp
    ├── qmalloc.h
    ├── qmath.h
    ├── qmultialloc.cpp
    ├── qmultialloc.h
    ├── qsem.cpp
    ├── qsem.h
    ├── sched_affinity_win32_wrapper.h
    ├── switchtemplate.h
    ├── threadserver.cpp
    ├── threadserver.h
    ├── timer.cpp
    ├── timer.h
    ├── util_adl.cpp
    └── util_adl.h
├── config.mak
├── config_options.sample
├── config_options_load.mak
├── cudakernel.cu
├── environment
    └── caldgemm_setenv.sh.sample
├── gcc_patch
    └── libgomp.patch
├── gotoblas_patch
    └── gotoblas.patch
├── makefile
├── makefiles
    ├── as
    ├── callvc.bat
    ├── config.mak.sample
    ├── i686-pc-cygwin.mak
    ├── i686-pc-linux-gnu.mak
    ├── include.S
    ├── makefile
    ├── makefile_opencl_compiler.cpp
    ├── opencl_compiler_structs.h
    ├── opencl_obtain_program.h
    ├── x86_64-pc-linux-gnu.mak
    └── x86_64-unknown-cygwin.mak
└── memtest
    ├── .gitignore
    ├── build.sh
    ├── cmd
    ├── info.sh
    ├── mem.cpp
    ├── timer.cpp
    └── timer.h


/.gitignore:
--------------------------------------------------------------------------------
 1 | release
 2 | dgemm_bench.exe
 3 | dgemm_bench
 4 | .svn
 5 | vcproject
 6 | *.o
 7 | caldgemm_config.h
 8 | config_options.mak
 9 | 3rd_party_template/dgemm_template.so
10 | amd_dgemm*
11 | 


--------------------------------------------------------------------------------
/3rd_party_dgemm_kernels/amd_dgemm_2015_08_05/amddgemm.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidrohr/caldgemm/bc875b373f78b47c8e58a10353cce7ef751210a1/3rd_party_dgemm_kernels/amd_dgemm_2015_08_05/amddgemm.so


--------------------------------------------------------------------------------
/3rd_party_template/config.mak:
--------------------------------------------------------------------------------
 1 | INTELARCH					= SSE4.2
 2 | CUDAVERSION					= 20
 3 | CUDAREGS					= 64
 4 | ARCHBITS					= 64
 5 | 
 6 | HIDEECHO					= @
 7 | CC_x86_64-pc-linux-gnu		= GCC
 8 | CC_i686-pc-cygwin			= ICC
 9 | 
10 | TARGET						= dgemm_template
11 | 
12 | INTELFLAGSUSE				= $(INTELFLAGSOPT)
13 | VSNETFLAGSUSE				= $(VSNETFLAGSOPT)
14 | GCCFLAGSUSE					= $(GCCFLAGSOPT)
15 | NVCCFLAGSUSE				= $(NVCCFLAGSOPT)
16 | 
17 | TARGETTYPE					= LIB
18 | 
19 | CPPFILES					= template.cpp
20 | 
21 | CONFIG_OPENCL				= 1


--------------------------------------------------------------------------------
/3rd_party_template/kernel.cl:
--------------------------------------------------------------------------------
 1 | "__kernel void oclkernel(__global double* C, __global const double* __restrict const A, __global const double* __restrict const B, int height1, int height2, int width, double alpha, double beta, int pitch, ulong offset)\n"
 2 | "//input parameters are standard DGEMM parameters (however in modified order, width = k, height1 = m, height2 = n, pitch = LDC)\n"
 3 | "//matrices area assumed in row-major (you can get col-major by swapping A and B (and m,n)\n"
 4 | "//there is no transposition parameters, the kernel can assume the best settings for optimal performance, the library must export the required options, caldgemm will tread the rest\n"
 5 | "//LDA and LDB parameters are not present, they are as small as possible and hence equal m, n, k\n"
 6 | "{\n"
 7 | "	int i, j, k;\n"
 8 | "	for (i = get_global_id(1);i < height2;i += get_global_size(1))\n"
 9 | "	{\n"
10 | "		for (j = get_global_id(0);j < height1;j += get_global_size(0))\n"
11 | "		{\n"
12 | "			double addval = 0.;\n"
13 | "			for (k = 0;k < width;k++)\n"
14 | "			{\n"
15 | "				addval += A[k * height2 + i] * B[k * height1 + j];\n"
16 | "			}\n"
17 | "			C[offset + i * pitch + j] = beta * C[offset + i * pitch + j] + alpha * addval;\n"
18 | "		}\n"
19 | "	}\n"
20 | "}\n"
21 | ;
22 | 


--------------------------------------------------------------------------------
/3rd_party_template/makefile:
--------------------------------------------------------------------------------
1 | include makefiles/makefile


--------------------------------------------------------------------------------
/3rd_party_template/makefiles:
--------------------------------------------------------------------------------
1 | ../makefiles


--------------------------------------------------------------------------------
/3rd_party_template/template.cpp:
--------------------------------------------------------------------------------
  1 | #include <CL/opencl.h>
  2 | #include <stdio.h>
  3 | 
  4 | #define STD_OUT stdout
  5 | 
  6 | #if defined(_WIN32) || defined(__WIN32) //_WIN32 is the macro MSVC-compatible compilers predefine; __WIN32 is kept for toolchains that only provide that spelling
  7 | #define DLL_EXPORT __declspec(dllexport)
  8 | #else
  9 | #define DLL_EXPORT
 10 | #endif
 11 | 
 12 | #define ERRRET(...) {fprintf(STD_OUT, __VA_ARGS__);fprintf(STD_OUT, "\n");return(1);} //Prints the formatted message plus a newline to STD_OUT and returns 1 from the enclosing function
 13 | #define CHKRET(result, ...) /* If result is not CL_SUCCESS: prints the message and the decoded OpenCL error to STD_OUT, then returns 0 from the enclosing function */ \
 14 | 	if (result != CL_SUCCESS) \
 15 | 	{ \
 16 | 		fprintf(STD_OUT, __VA_ARGS__); \
 17 | 		fprintf(STD_OUT, ":\n"); \
 18 | 		fprintf(STD_OUT, "OpenCL Error %d: (%s: %d) %s\n", result, __FILE__, __LINE__, opencl_error_string(result)); \
 19 | 		return(0); \
 20 | 	}
 21 | 
 22 | #define quit(arg) {fprintf(stderr, arg "\n");exit(1);} //Prints arg (must be a string literal, it is concatenated with "\n") to stderr and exits the process with code 1
 23 | 
 24 | //We must export several functions: kernelLibCreate returns the kernel object, kernelLibQuerySettings returns some parameters.
 25 | //kernelLibInitialize is called first, right after the library is loaded, and can return 1 in case of error; kernelLibTerminate is used to clean up.
 26 | //The others provide caldgemm with some insight into good matrix sizes.
 27 | extern "C" DLL_EXPORT cl_kernel kernelLibCreate(cl_context* context, int nDevices, cl_device_id* devices, int kernelType, int k, int betazero); 
 28 | extern "C" DLL_EXPORT void kernelLibQuerySettings(int* tiling_x, int* tiling_y, bool* transposeA, bool* transposeB, bool* texture_buffers, int* group_size_x, int* group_size_y, int* min_tile_size, int* min_k);
 29 | extern "C" DLL_EXPORT void kernelLibTerminate();
 30 | extern "C" DLL_EXPORT size_t suggestedMaxHeight();
 31 | extern "C" DLL_EXPORT size_t getAutoHeight(size_t MaxGpuM, size_t MaxGpuN, int nDevices, size_t Width);
 32 | extern "C" DLL_EXPORT void modHeight(size_t MOD_OVER, size_t MOD_GPU);
 33 | extern "C" DLL_EXPORT int kernelLibInitialize(cl_platform_id platform);
 34 | 
 35 | //The kernels can be subject to some optimizations, depending on the parameters:
 36 | //betazero indicates that beta can be assumed zero, regardless of other parameters
 37 | //kernelType:
 38 | //0 - no further optimizations
 39 | //1 - can assume alpha = 1
 40 | //2 - can assume alpha = 1, beta = 0/1 depending on betazero, and k is fixed to the parameter passed as k
 41 | //3 - not used
 42 | //4 - can assume alpha = -1, beta = 0/1 depending on betazero and k is fixed to the parameter passed as k
 43 | //5
 44 | //6
 45 | 
 46 | //kernelLibQuerySettings must return
 47 | //The tiling size in x and y (defines how many work-items are started)
 48 | //transposeA and transposeB define whether the kernel expects A or B input matrices in transposed form or not
 49 | //texture_buffers = 1 means input is read from images, 0 stands for standard buffers
 50 | //group_size_x/y defines the work-group-size
 51 | 
 52 | cl_program ocl_program, ocl_programx;
 53 | 
 54 | const char* kernel_str =
 55 | #include "kernel.cl"
 56 | ;
 57 | 
 58 | int program_initialized = 0;
 59 | 
 60 | const char* opencl_error_string(int errorcode) //Translates an OpenCL status code into a constant, human-readable description
 61 | {
 62 | 	static const struct {int code; const char* text;} known_codes[] = { //One entry per recognized status code, scanned linearly below
 63 | 		{CL_SUCCESS,                            "Success!"},
 64 | 		{CL_DEVICE_NOT_FOUND,                   "Device not found."},
 65 | 		{CL_DEVICE_NOT_AVAILABLE,               "Device not available"},
 66 | 		{CL_COMPILER_NOT_AVAILABLE,             "Compiler not available"},
 67 | 		{CL_MEM_OBJECT_ALLOCATION_FAILURE,      "Memory object allocation failure"},
 68 | 		{CL_OUT_OF_RESOURCES,                   "Out of resources"},
 69 | 		{CL_OUT_OF_HOST_MEMORY,                 "Out of host memory"},
 70 | 		{CL_PROFILING_INFO_NOT_AVAILABLE,       "Profiling information not available"},
 71 | 		{CL_MEM_COPY_OVERLAP,                   "Memory copy overlap"},
 72 | 		{CL_IMAGE_FORMAT_MISMATCH,              "Image format mismatch"},
 73 | 		{CL_IMAGE_FORMAT_NOT_SUPPORTED,         "Image format not supported"},
 74 | 		{CL_BUILD_PROGRAM_FAILURE,              "Program build failure"},
 75 | 		{CL_MAP_FAILURE,                        "Map failure"},
 76 | 		{CL_INVALID_VALUE,                      "Invalid value"},
 77 | 		{CL_INVALID_DEVICE_TYPE,                "Invalid device type"},
 78 | 		{CL_INVALID_PLATFORM,                   "Invalid platform"},
 79 | 		{CL_INVALID_DEVICE,                     "Invalid device"},
 80 | 		{CL_INVALID_CONTEXT,                    "Invalid context"},
 81 | 		{CL_INVALID_QUEUE_PROPERTIES,           "Invalid queue properties"},
 82 | 		{CL_INVALID_COMMAND_QUEUE,              "Invalid command queue"},
 83 | 		{CL_INVALID_HOST_PTR,                   "Invalid host pointer"},
 84 | 		{CL_INVALID_MEM_OBJECT,                 "Invalid memory object"},
 85 | 		{CL_INVALID_IMAGE_FORMAT_DESCRIPTOR,    "Invalid image format descriptor"},
 86 | 		{CL_INVALID_IMAGE_SIZE,                 "Invalid image size"},
 87 | 		{CL_INVALID_SAMPLER,                    "Invalid sampler"},
 88 | 		{CL_INVALID_BINARY,                     "Invalid binary"},
 89 | 		{CL_INVALID_BUILD_OPTIONS,              "Invalid build options"},
 90 | 		{CL_INVALID_PROGRAM,                    "Invalid program"},
 91 | 		{CL_INVALID_PROGRAM_EXECUTABLE,         "Invalid program executable"},
 92 | 		{CL_INVALID_KERNEL_NAME,                "Invalid kernel name"},
 93 | 		{CL_INVALID_KERNEL_DEFINITION,          "Invalid kernel definition"},
 94 | 		{CL_INVALID_KERNEL,                     "Invalid kernel"},
 95 | 		{CL_INVALID_ARG_INDEX,                  "Invalid argument index"},
 96 | 		{CL_INVALID_ARG_VALUE,                  "Invalid argument value"},
 97 | 		{CL_INVALID_ARG_SIZE,                   "Invalid argument size"},
 98 | 		{CL_INVALID_KERNEL_ARGS,                "Invalid kernel arguments"},
 99 | 		{CL_INVALID_WORK_DIMENSION,             "Invalid work dimension"},
100 | 		{CL_INVALID_WORK_GROUP_SIZE,            "Invalid work group size"},
101 | 		{CL_INVALID_WORK_ITEM_SIZE,             "Invalid work item size"},
102 | 		{CL_INVALID_GLOBAL_OFFSET,              "Invalid global offset"},
103 | 		{CL_INVALID_EVENT_WAIT_LIST,            "Invalid event wait list"},
104 | 		{CL_INVALID_EVENT,                      "Invalid event"},
105 | 		{CL_INVALID_OPERATION,                  "Invalid operation"},
106 | 		{CL_INVALID_GL_OBJECT,                  "Invalid OpenGL object"},
107 | 		{CL_INVALID_BUFFER_SIZE,                "Invalid buffer size"},
108 | 		{CL_INVALID_MIP_LEVEL,                  "Invalid mip-map level"},
109 | 	};
110 | 	for (unsigned int i = 0;i < sizeof(known_codes) / sizeof(known_codes[0]);i++) if (known_codes[i].code == errorcode) return known_codes[i].text;
111 | 	return "Unknown Errorcode"; //Fallback for every code that is not listed in the table
112 | }
113 | 
114 | cl_kernel kernelLibCreate(cl_context* context, int nDevices, cl_device_id* devices, int kernelType, int k, int betazero)
115 | {
116 | 	cl_int ocl_error;
117 | 	if (program_initialized == 0)
118 | 	{
119 | 		ocl_program = clCreateProgramWithSource(*context, 1, &kernel_str, NULL, &ocl_error);
120 | 		CHKRET(ocl_error, "Error creating program object");
121 | 		ocl_error = clBuildProgram(ocl_program, nDevices, devices, 0, NULL, NULL);
122 | 		if (ocl_error != CL_SUCCESS)
123 | 		{
124 | 			fprintf(STD_OUT, "OpenCL Error while building program: %d\n", ocl_error);
125 | 			fprintf(STD_OUT, "OpenCL Kernel:\n\n%s\n\n", kernel_str);
126 | 			char build_log[16384];
127 | 			for (int i = 0;i < nDevices;i++)
128 | 			{
129 | 				clGetProgramBuildInfo(ocl_program, devices[i], CL_PROGRAM_BUILD_LOG, 16384, build_log, NULL);
130 | 				fprintf(STD_OUT, "Build Log (device %d):\n\n%s\n\n", i, build_log);
131 | 			}
132 | 			return(0);
133 | 		}
134 | 		program_initialized = 1;
135 | 	}
136 | 	cl_kernel tmp = clCreateKernel(ocl_program, "oclkernel", &ocl_error);
137 | 	CHKRET(ocl_error, "Error creating kernel");
138 | 	
139 | 	return(tmp);
140 | }
141 | 
142 | void kernelLibTerminate()
143 | {
144 | 	//Nothing to clean up unless kernelLibCreate has successfully built the program
145 | 	if (!program_initialized) return;
146 | 	clReleaseProgram(ocl_program);
147 | 	//Clearing the flag makes the next kernelLibCreate call build the program again
148 | 	program_initialized = 0;
149 | }
150 | 
151 | void kernelLibQuerySettings(int* tiling_x, int* tiling_y, bool* transposeA, bool* transposeB, bool* texture_buffers, int* group_size_x, int* group_size_y, int* min_tile_size, int* min_k)
152 | {
153 | 	const int work_group_edge = 8, tile_edge = 4; //The grid uses 8x8 work-groups and m/tiling_x x n/tiling_y work-items in total
154 | 	*group_size_y = work_group_edge; *group_size_x = work_group_edge;
155 | 	*tiling_y = tile_edge; *tiling_x = tile_edge;
156 | 	*texture_buffers = false; //Inputs are read from standard buffers, not from images
157 | 	*transposeA = true; *transposeB = false; //The kernel expects A in transposed form and B untransposed
158 | 	*min_tile_size = 32;
159 | 	*min_k = 4;
160 | }
161 | 
162 | int kernelLibInitialize(cl_platform_id platform) //Nothing needs to be set up in this template, the platform is not used
163 | {
164 | 	return 0; //Always reports success (1 would signal an error)
165 | }
166 | 
167 | size_t suggestedMaxHeight() //Maximum height this library suggests to caldgemm
168 | {
169 | 	return 4096;
170 | }
171 | 
172 | //Hook for proposing a height that depends on the matrix size; this template proposes none
173 | size_t getAutoHeight(size_t MaxGpuM, size_t MaxGpuN, int nDevices, size_t Width)
174 | {
175 | 	const size_t use_caldgemm_default = 0; //No GPU-specific values are provided here, 0 leaves the choice to the caldgemm defaults
176 | 	return use_caldgemm_default;
177 | }
178 | 
179 | void modHeight(size_t MOD_OVER, size_t MOD_GPU) //Exported hook that this template leaves empty; both arguments are ignored
180 | {
181 | }
182 | 


--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | David Rohr (drohr@jwdt.org)
2 | Matthias Bach (bach@compeng.uni-frankfurt.de)
3 | Matthias Kretz (kretz@compeng.uni-frankfurt.de)
4 | 


--------------------------------------------------------------------------------
/COPYING.LESSER:
--------------------------------------------------------------------------------
  1 |                    GNU LESSER GENERAL PUBLIC LICENSE
  2 |                        Version 3, 29 June 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 | 
  9 |   This version of the GNU Lesser General Public License incorporates
 10 | the terms and conditions of version 3 of the GNU General Public
 11 | License, supplemented by the additional permissions listed below.
 12 | 
 13 |   0. Additional Definitions.
 14 | 
 15 |   As used herein, "this License" refers to version 3 of the GNU Lesser
 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
 17 | General Public License.
 18 | 
 19 |   "The Library" refers to a covered work governed by this License,
 20 | other than an Application or a Combined Work as defined below.
 21 | 
 22 |   An "Application" is any work that makes use of an interface provided
 23 | by the Library, but which is not otherwise based on the Library.
 24 | Defining a subclass of a class defined by the Library is deemed a mode
 25 | of using an interface provided by the Library.
 26 | 
 27 |   A "Combined Work" is a work produced by combining or linking an
 28 | Application with the Library.  The particular version of the Library
 29 | with which the Combined Work was made is also called the "Linked
 30 | Version".
 31 | 
 32 |   The "Minimal Corresponding Source" for a Combined Work means the
 33 | Corresponding Source for the Combined Work, excluding any source code
 34 | for portions of the Combined Work that, considered in isolation, are
 35 | based on the Application, and not on the Linked Version.
 36 | 
 37 |   The "Corresponding Application Code" for a Combined Work means the
 38 | object code and/or source code for the Application, including any data
 39 | and utility programs needed for reproducing the Combined Work from the
 40 | Application, but excluding the System Libraries of the Combined Work.
 41 | 
 42 |   1. Exception to Section 3 of the GNU GPL.
 43 | 
 44 |   You may convey a covered work under sections 3 and 4 of this License
 45 | without being bound by section 3 of the GNU GPL.
 46 | 
 47 |   2. Conveying Modified Versions.
 48 | 
 49 |   If you modify a copy of the Library, and, in your modifications, a
 50 | facility refers to a function or data to be supplied by an Application
 51 | that uses the facility (other than as an argument passed when the
 52 | facility is invoked), then you may convey a copy of the modified
 53 | version:
 54 | 
 55 |    a) under this License, provided that you make a good faith effort to
 56 |    ensure that, in the event an Application does not supply the
 57 |    function or data, the facility still operates, and performs
 58 |    whatever part of its purpose remains meaningful, or
 59 | 
 60 |    b) under the GNU GPL, with none of the additional permissions of
 61 |    this License applicable to that copy.
 62 | 
 63 |   3. Object Code Incorporating Material from Library Header Files.
 64 | 
 65 |   The object code form of an Application may incorporate material from
 66 | a header file that is part of the Library.  You may convey such object
 67 | code under terms of your choice, provided that, if the incorporated
 68 | material is not limited to numerical parameters, data structure
 69 | layouts and accessors, or small macros, inline functions and templates
 70 | (ten or fewer lines in length), you do both of the following:
 71 | 
 72 |    a) Give prominent notice with each copy of the object code that the
 73 |    Library is used in it and that the Library and its use are
 74 |    covered by this License.
 75 | 
 76 |    b) Accompany the object code with a copy of the GNU GPL and this license
 77 |    document.
 78 | 
 79 |   4. Combined Works.
 80 | 
 81 |   You may convey a Combined Work under terms of your choice that,
 82 | taken together, effectively do not restrict modification of the
 83 | portions of the Library contained in the Combined Work and reverse
 84 | engineering for debugging such modifications, if you also do each of
 85 | the following:
 86 | 
 87 |    a) Give prominent notice with each copy of the Combined Work that
 88 |    the Library is used in it and that the Library and its use are
 89 |    covered by this License.
 90 | 
 91 |    b) Accompany the Combined Work with a copy of the GNU GPL and this license
 92 |    document.
 93 | 
 94 |    c) For a Combined Work that displays copyright notices during
 95 |    execution, include the copyright notice for the Library among
 96 |    these notices, as well as a reference directing the user to the
 97 |    copies of the GNU GPL and this license document.
 98 | 
 99 |    d) Do one of the following:
100 | 
101 |        0) Convey the Minimal Corresponding Source under the terms of this
102 |        License, and the Corresponding Application Code in a form
103 |        suitable for, and under terms that permit, the user to
104 |        recombine or relink the Application with a modified version of
105 |        the Linked Version to produce a modified Combined Work, in the
106 |        manner specified by section 6 of the GNU GPL for conveying
107 |        Corresponding Source.
108 | 
109 |        1) Use a suitable shared library mechanism for linking with the
110 |        Library.  A suitable mechanism is one that (a) uses at run time
111 |        a copy of the Library already present on the user's computer
112 |        system, and (b) will operate properly with a modified version
113 |        of the Library that is interface-compatible with the Linked
114 |        Version.
115 | 
116 |    e) Provide Installation Information, but only if you would otherwise
117 |    be required to provide such information under section 6 of the
118 |    GNU GPL, and only to the extent that such information is
119 |    necessary to install and execute a modified version of the
120 |    Combined Work produced by recombining or relinking the
121 |    Application with a modified version of the Linked Version. (If
122 |    you use option 4d0, the Installation Information must accompany
123 |    the Minimal Corresponding Source and Corresponding Application
124 |    Code. If you use option 4d1, you must provide the Installation
125 |    Information in the manner specified by section 6 of the GNU GPL
126 |    for conveying Corresponding Source.)
127 | 
128 |   5. Combined Libraries.
129 | 
130 |   You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 | 
136 |    a) Accompany the combined library with a copy of the same work based
137 |    on the Library, uncombined with any other library facilities,
138 |    conveyed under the terms of this License.
139 | 
140 |    b) Give prominent notice with the combined library that part of it
141 |    is a work based on the Library, and explaining where to find the
142 |    accompanying uncombined form of the same work.
143 | 
144 |   6. Revised Versions of the GNU Lesser General Public License.
145 | 
146 |   The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 | 
151 |   Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 | 
161 |   If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 | 


--------------------------------------------------------------------------------
/ati_patch/10.10/install_org.sh:
--------------------------------------------------------------------------------
1 | sudo cp libaticaldd.so.orig /usr/lib64/libaticaldd.so
2 | 


--------------------------------------------------------------------------------
/ati_patch/10.10/install_patched.sh:
--------------------------------------------------------------------------------
1 | sudo cp libaticaldd.so /usr/lib64/libaticaldd.so
2 | 


--------------------------------------------------------------------------------
/ati_patch/10.10/libaticaldd.so.xdelta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidrohr/caldgemm/bc875b373f78b47c8e58a10353cce7ef751210a1/ati_patch/10.10/libaticaldd.so.xdelta


--------------------------------------------------------------------------------
/ati_patch/10.9/fix.cpp:
--------------------------------------------------------------------------------
 1 | extern void **ddi_interface;
 2 | 
 3 | void fixCal() //Overwrites one byte of code reached through ddi_interface: 0x74 becomes 0xeb. NOTE(review): this file defines fixCal() a second time further down - only one of the two can be compiled
 4 | {
 5 |     unsigned char *func = ddi_interface[0xa8/8]; //Table entry at byte offset 0xa8 (the /8 presumably assumes 8-byte pointers - confirm)
 6 |     func += 0x7fffe591b631 - 0x7fffe591b560; //Distance of the target byte from the entry, written as the difference of two absolute addresses
 7 |     if (func[0] == 0x74) { //Patch only when the expected byte (the "je" named in the messages) is found
 8 |         func[0] = 0xeb;
 9 |         fprintf(stderr, "Replaced je with jmpq\n");
10 |     } else {
11 |         fprintf(stderr, "Did not find je at the expected position\n");
12 |     }
13 | }
14 | void fixCal() //Variant that locates the table starting from the address of calCtxRunProgram and logs each step. NOTE(review): same name as the definition above - only one of the two can be compiled
15 | {
16 |     fprintf(stderr, "x\n");
17 |     unsigned char *foo = (unsigned char *)(&calCtxRunProgram);
18 |     unsigned char **bar = *(unsigned char ***)((size_t)(*(unsigned int *)(foo + 2)) + foo + 6); //Reads a 32-bit value at foo + 2, adds it to foo + 6 and loads a pointer from there (presumably decoding a 6-byte indirect jump stub - confirm)
19 |     fprintf(stderr, "bar = %p, ddi_interface[?] = %p\n", bar,
20 |             bar + (0x10f588 - 0x4220)/sizeof(void*));
21 |     unsigned char *func = *(bar + (0x10f588 - 0x4220)/sizeof(void*)); //Table entry a fixed number of pointer slots after bar
22 |     func += 0x7fffe591b631 - 0x7fffe591b560; //Same byte offset into the function as in the first variant
23 |     fprintf(stderr, "Read jump\n");
24 |     if (func[0] == 0x74) { //Patch only when the expected byte (the "je" named in the messages) is found
25 |         fprintf(stderr, "Replace je with jmpq\n");
26 |         func[0] = 0xeb;
27 |         fprintf(stderr, "Replaced je with jmpq\n");
28 |     } else {
29 |         fprintf(stderr, "Did not find je at the expected position\n");
30 |     }
31 | }
32 | 


--------------------------------------------------------------------------------
/ati_patch/10.9/install_org.sh:
--------------------------------------------------------------------------------
1 | sudo cp libaticaldd.so.orig /usr/lib64/libaticaldd.so
2 | 


--------------------------------------------------------------------------------
/ati_patch/10.9/install_patched.sh:
--------------------------------------------------------------------------------
1 | sudo cp libaticaldd.so /usr/lib64/libaticaldd.so
2 | 


--------------------------------------------------------------------------------
/ati_patch/10.9/libaticaldd.so.xdelta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidrohr/caldgemm/bc875b373f78b47c8e58a10353cce7ef751210a1/ati_patch/10.9/libaticaldd.so.xdelta


--------------------------------------------------------------------------------
/cal_fake.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the CALDGEMM library.
  3 |  *
  4 |  * Copyright 2015:
  5 |  *  - David Rohr (drohr@jwdt.org)
  6 |  *  - Matthias Bach (bach@compeng.uni-frankfurt.de)
  7 |  *  - Matthias Kretz (kretz@compeng.uni-frankfurt.de)
  8 |  *
  9 |  * This file is part of CALDGEMM.
 10 |  *
 11 |  * CALDGEMM is free software: you can redistribute it and/or modify
 12 |  * it under the terms of the GNU Lesser General Public License as published by
 13 |  * the Free Software Foundation, either version 3 of the License, or
 14 |  * (at your option) any later version.
 15 |  *
 16 |  * CALDGEMM is distributed in the hope that it will be useful,
 17 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 18 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 19 |  * GNU Lesser General Public License for more details.
 20 |  *
 21 |  * You should have received a copy of the GNU Lesser General Public License
 22 |  * along with CALDGEMM.  If not, see <http://www.gnu.org/licenses/>.
 23 |  */
 24 | 
 25 | #ifndef CAL_FAKE_H
 26 | #define CAL_FAKE_H
 27 | 
 28 | #include "cmodules/timer.h"
 29 | #ifdef _WIN32
 30 | #include "cmodules/pthread_mutex_win32_wrapper.h"
 31 | #else
 32 | #include <pthread.h>
 33 | #endif
 34 | #include <cal.h>
 35 | 
 36 | #define NUM_FAKE_EVENTS 1000000
 37 | #define NUM_FAKE_MEM 10000
 38 | #define NUM_FAKE_MODULE 100
 39 | #define NUM_FAKE_NAME 1000
 40 | #define NUM_MODULE_NAMES 13
 41 | 
 42 | #define CAL_FAKE_PASSTHROUGH
 43 | #define CAL_FAKE_CHECKMEM
 44 | //#define CAL_FAKE_VERBOSE
 45 | 
 46 | class cal_fake_event //Bookkeeping record behind one fake event handle
 47 | {
 48 | public:
 49 | 	HighResTimer timer; //Restarted by cal_fake::AddEvent, stopped by cal_fake::QueryEvent
 50 | 	int initialized; //Nonzero once AddEvent has handed this slot out
 51 | 	int queried; //Nonzero once QueryEvent has completed for this slot
 52 | 	int reused; //Set by AddEvent when a slot is handed out again
 53 | 	double delay; //Random delay chosen by AddEvent
 54 | 	int mems[NUM_MODULE_NAMES]; //Fake memory handles attached to this event
 55 | 	int nmems; //Number of used entries in mems
 56 | 	CALevent through; //presumably the real CAL event in passthrough mode - TODO confirm
 57 | 
 58 | 	cal_fake_event() : initialized(0), queried(0), reused(0) {} //Only the three state flags start at zero, the other members are not set here
 59 | };
 60 | 
 61 | class cal_fake_mem //Bookkeeping record behind one fake memory handle
 62 | {
 63 | public:
 64 | 	int released; //Reset to 0 by cal_fake::AddMemHandle
 65 | 	int active; //Reset to 0 by AddMemHandle; QueryEvent decrements it for every event that lists this memory
 66 | 	
 67 | 	CALmem through; //presumably the real CAL memory handle in passthrough mode - TODO confirm
 68 | };
 69 | 
 70 | class cal_fake_module //Bookkeeping record behind one fake module handle
 71 | {
 72 | public:
 73 | 	int released; //Reset to 0 by cal_fake::AddModule
 74 | 	int nnames; //Reset to 0 by AddModule; presumably the number of used entries in names - TODO confirm
 75 | 	int names[NUM_MODULE_NAMES]; //Room for up to NUM_MODULE_NAMES entries
 76 | 	
 77 | 	CALmodule through; //presumably the real CAL module in passthrough mode - TODO confirm
 78 | 	CALfunc throughFunc; //presumably the real CAL function entry in passthrough mode - TODO confirm
 79 | };
 80 | 
 81 | class cal_fake_name //Bookkeeping record behind one fake name handle
 82 | {
 83 | public:
 84 | 	int mem; //presumably the fake memory handle bound to this name - TODO confirm
 85 | 	
 86 | 	CALname through; //presumably the real CAL name in passthrough mode - TODO confirm
 87 | };
 88 | 
 89 | class cal_fake
 90 | {
 91 | public:
 92 | 	cal_fake_event event[NUM_FAKE_EVENTS];
 93 | 	pthread_mutex_t mutex;
 94 | 	int curevent;
 95 | 
 96 | 	cal_fake_mem mem[NUM_FAKE_MEM];
 97 | 	int curmem;
 98 | 
 99 | 	cal_fake_module module[NUM_FAKE_MODULE];
100 | 	int curmodule;
101 | 
102 | 	cal_fake_name name[NUM_FAKE_NAME];
103 | 	int curname;
104 | 
105 | 	cal_fake() :
106 | 		curevent(0),
107 | 		curmem(0),
108 | 		curmodule(0),
109 | 		curname(0)
110 | 	{
111 | 		pthread_mutex_init(&mutex, NULL); //Default mutex attributes; the four cursors above all start at slot 0
112 | 	}
113 | 
114 | 	~cal_fake()
115 | 	{
116 | 		//The mutex is not needed for the report below
117 | 		pthread_mutex_destroy(&mutex);
118 | 		//Warn about every slot below the current cursor whose event was never queried
119 | 		for (int slot = 0;slot < curevent;slot++)
120 | 			if (!event[slot].queried) printf("Warning, event %d not queried\n", slot);
121 | 	}
122 | 
123 | 	CALresult AddEvent(CALevent* pevent, bool lock = true) //Hands out the slot at the current cursor as a new event handle in *pevent, arms its timer and random delay, and returns CAL_RESULT_OK
124 | 	{
125 | 		if (lock) pthread_mutex_lock(&mutex); //Taken before curevent is read, so concurrent callers can never receive the same handle; lock = false skips the mutex (presumably for callers that already hold it - TODO confirm)
126 | #ifdef CAL_FAKE_VERBOSE
127 | 		fprintf(STD_OUT, "CREATE EVENT %d\n", curevent);
128 | #endif
129 | 		*pevent = curevent;
130 | 		if (event[curevent].initialized && !event[curevent].queried)
131 | 		{
132 | 			printf("------------------------ Event reused before queried\n");
133 | 			while (true); //The thread stays here for good, the mutex is not released
134 | 		}
135 | 		if (event[curevent].initialized) event[curevent].reused = 1;
136 | 		event[curevent].initialized = 1;
137 | 		event[curevent].queried = 0;
138 | 		event[curevent].timer.Reset();
139 | 		event[curevent].timer.Start();
140 | 		event[curevent].delay = (rand() % 1000) / 100000.; //Random delay below 0.01, compared against the timer by QueryEvent when passthrough is disabled
141 | 		event[curevent].nmems = 0;
142 | 		curevent = (curevent + 1) % NUM_FAKE_EVENTS; //The cursor wraps around, so slots are reused after NUM_FAKE_EVENTS events
143 | 		if (lock) pthread_mutex_unlock(&mutex);
144 | 		return(CAL_RESULT_OK);
145 | 	}
146 | 
147 | 	CALresult QueryEvent(CALevent num) //Marks event num as queried and returns CAL_RESULT_OK (or CAL_RESULT_PENDING without passthrough); a bad handle is reported and then never returns
148 | 	{
149 | #ifdef CAL_FAKE_VERBOSE
150 | 		fprintf(STD_OUT, "QUERY EVENT %d\n", num);
151 | #endif
152 | 		CALresult retVal;
153 | 		pthread_mutex_lock(&mutex);
154 | 		if (num >= NUM_FAKE_EVENTS) //Handle lies outside the event table
155 | 		{
156 | 			printf("------------------------- Requested fake event with handle %d >= %d\n", num, NUM_FAKE_EVENTS);
157 | 			retVal = CAL_RESULT_BAD_HANDLE;
158 | 		}
159 | 		else if (event[num].initialized == 0) //Slot was never handed out by AddEvent
160 | 		{
161 | 			printf("------------------------- Fake event with handle %d not initialized\n", num);
162 | 			retVal = CAL_RESULT_BAD_HANDLE;
163 | 		}
164 | 		else if (event[num].queried) //Slot was already completed by an earlier query
165 | 		{
166 | 			printf("------------------------- Fake event with handle %d already queried\n", num);
167 | 			retVal = CAL_RESULT_BAD_HANDLE;
168 | 		}
169 | 		else
170 | 		{
171 | 			event[num].timer.Stop();
172 | #ifndef CAL_FAKE_PASSTHROUGH
173 | 			if (event[num].timer.GetElapsedTime() <= event[num].delay) //Without passthrough the event stays pending until its random delay has elapsed
174 | 			{
175 | 				event[num].timer.Start();
176 | 				retVal = CAL_RESULT_PENDING;
177 | 			}
178 | 			else
179 | #endif
180 | 			{
181 | 				event[num].queried = 1;
182 | 				for (int i = 0;i < event[num].nmems;i++) mem[event[num].mems[i]].active--; //Drop one active count from every memory object attached to this event
183 | 				retVal = CAL_RESULT_OK;
184 | 			}
185 | 		}
186 | 		pthread_mutex_unlock(&mutex);
187 | 		if(retVal == CAL_RESULT_BAD_HANDLE) while(true); //A bad handle never returns: the thread stays here, after the mutex has been released
188 | 		return(retVal);
189 | 	}
190 | 	
191 | 	void ListMemCollisions(int mem)
192 | 	{
193 | 	    for (int i = 0;i < NUM_FAKE_EVENTS;i++)
194 | 	    {
195 | 		if (event[i].initialized && !event[i].queried)
196 | 		{
197 | 		    for (int j = 0;j < event[i].nmems;j++)
198 | 		    {
199 | 			if (event[i].mems[j] == mem)
200 | 			{
201 | 			    printf("Collision with event %d\n", i);
202 | 			}
203 | 		    }
204 | 		}
205 | 	    }
206 | 	}
207 | 
208 | 	CALresult AddMemHandle(CALmem* m)
209 | 	{
210 | 		pthread_mutex_lock(&mutex);
211 | 		if (curmem == NUM_FAKE_MEM)
212 | 		{
213 | 			fprintf(stderr, "NUM_FAKE_MEM overflow\n");
214 | 			while(true);
215 | 		}
216 | 		*m = curmem;
217 | 		mem[curmem].released = 0;
218 | 		mem[curmem].active = 0;
219 | 		curmem++;
220 | 		pthread_mutex_unlock(&mutex);
221 | 		return(CAL_RESULT_OK);
222 | 	}
223 | 
224 | 	CALresult AddModule(CALmodule* mod)
225 | 	{
226 | 		pthread_mutex_lock(&mutex);
227 | 		if (curmodule == NUM_FAKE_MODULE)
228 | 		{
229 | 			fprintf(stderr, "NUM_FAKE_MODULE overflow\n");
230 | 			while(true);
231 | 		}
232 | 		*mod = curmodule;
233 | 		module[curmodule].released = 0;
234 | 		module[curmodule].nnames = 0;
235 | 		curmodule++;
236 | 		pthread_mutex_unlock(&mutex);
237 | 		return(CAL_RESULT_OK);
238 | 	}
239 | 
240 | 	CALresult AddName(CALname* nam, CALmodule mod)
241 | 	{
242 | 		//printf("Giving name %d (mod %d)\n", curname, mod);
243 | 		pthread_mutex_lock(&mutex);
244 | 		if (curname == NUM_FAKE_NAME)
245 | 		{
246 | 			fprintf(stderr, "NUM_FAKE_NAME overflow\n");
247 | 			while(true);
248 | 		}
249 | 		if (mod > (unsigned) curmodule)
250 | 		{
251 | 			fprintf(stderr, "Invalid Module\n");
252 | 			while(true);
253 | 		}
254 | 		if (module[mod].nnames == NUM_MODULE_NAMES)
255 | 		{
256 | 			fprintf(stderr, "NUM_MODULE_NAMES overflow\n");
257 | 			while(true);
258 | 		}
259 | 		*nam = curname;
260 | 		module[mod].names[module[mod].nnames] = curname;
261 | 		module[mod].nnames++;
262 | 		name[curname].mem = 0;
263 | 		curname++;
264 | 		pthread_mutex_unlock(&mutex);
265 | 		return(CAL_RESULT_OK);
266 | 	}
267 | 
268 | 	CALresult FakeMemcpy(CALmem mem1, CALmem mem2, CALevent* ev, int allowOverlap = 0)
269 | 	{
270 | 		pthread_mutex_lock(&mutex);
271 | #ifdef CAL_FAKE_CHECKMEM
272 | 		if (allowOverlap == 0 && (mem[mem1].active || mem[mem2].active))
273 | 		{
274 | 			fprintf(stderr, "Memory active when starting memcpy (src: %d, dst: %d)\n", mem[mem1].active, mem[mem2].active);
275 | 			while(true);
276 | 		}
277 | #endif
278 | 		AddEvent(ev, false);
279 | 		event[*ev].nmems = 2;
280 | 		event[*ev].mems[0] = mem1;
281 | 		event[*ev].mems[1] = mem2;
282 | 		mem[mem1].active++;
283 | 		mem[mem2].active++;
284 | 		pthread_mutex_unlock(&mutex);
285 | 		return(CAL_RESULT_OK);
286 | 	}
287 | 
288 | 	CALresult FakeKernel(CALfunc func, CALevent* ev, int allowOverlap)
289 | 	{
290 | 		pthread_mutex_lock(&mutex);
291 | 		if (func > (unsigned) curmodule)
292 | 		{
293 | 			fprintf(stderr, "Invalid func/module");
294 | 			while(true);
295 | 		}
296 | #ifdef CAL_FAKE_CHECKMEM
297 | 		for (int i = 0;i < module[func].nnames;i++)
298 | 		{
299 | 			if (i >= allowOverlap && mem[name[module[func].names[i]].mem].active)
300 | 			{
301 | 				fprintf(stderr, "Memory %d (of %d) active when starting kernel (allowed overlap %d)\n", i, module[func].nnames, allowOverlap);
302 | 				ListMemCollisions(name[module[func].names[i]].mem);
303 | 				while(true);
304 | 			}
305 | 			mem[name[module[func].names[i]].mem].active++;
306 | 		}
307 | #endif
308 | 		AddEvent(ev, false);
309 | 		event[*ev].nmems = module[func].nnames;
310 | 		for (int i = 0;i < module[func].nnames;i++) event[*ev].mems[i] = name[module[func].names[i]].mem;
311 | 		pthread_mutex_unlock(&mutex);
312 | 		return(CAL_RESULT_OK);
313 | 	}
314 | 
315 | 	CALresult SetMem(CALname nam, CALmem m)
316 | 	{
317 | 		if (nam > (unsigned) curname || m > (unsigned) curmem)
318 | 		{
319 | 			fprintf(stderr, "Invalid name/mem\n");
320 | 			while(true);
321 | 		}
322 | 		name[nam].mem = m;
323 | 		return(CAL_RESULT_OK);
324 | 	}
325 | 
326 | 	CALresult GetFunc(CALfunc* fun, CALmodule mod)
327 | 	{
328 | 		*fun = mod;
329 | 		return(CAL_RESULT_OK);
330 | 	}
331 | 
332 | 	CALresult ReleaseMem(int m)
333 | 	{
334 | 		mem[m].released = 1;
335 | 		return(CAL_RESULT_OK);
336 | 	}
337 | 
338 | 	CALresult UnloadModule(int mod)
339 | 	{
340 | 		module[mod].released = 1;
341 | 		return(CAL_RESULT_OK);
342 | 	}
343 | };
344 | 
// Single tracker instance used by all cal* replacement macros and wrappers below.
// NOTE(review): this is a non-inline global definition; if this file is a header
// included from more than one translation unit it would be defined multiple times - confirm.
cal_fake fake;
346 | 
347 | #ifndef CAL_FAKE_PASSTHROUGH
// Pure fake mode: every intercepted CAL entry point is replaced by the matching
// bookkeeping call on the global tracker; the real CAL function is never invoked
// and the ctx / rect / flags / image / string arguments are dropped.
// FakeKernel takes (func, event, allowOverlap); 0 is passed explicitly so that no
// leading name is exempt from the busy check.
#define calCtxRunProgram(event, ctx, func, rect) fake.FakeKernel(func, event, 0)
#define calMemCopy(event, ctx, src, dest, flags) fake.FakeMemcpy(src, dest, event)
#define calCtxIsEventDone(ctx, event) fake.QueryEvent(event)
#define calCtxGetMem(mem, ctx, res) fake.AddMemHandle(mem)
#define calCtxSetMem(ctx, name, mem) fake.SetMem(name, mem)
#define calCtxReleaseMem(ctx, mem) fake.ReleaseMem(mem)
#define calModuleLoad(module, ctx, image) fake.AddModule(module)
#define calModuleUnload(ctx, module) fake.UnloadModule(module)
#define calModuleGetName(name, ctx, module, string) fake.AddName(name, module)
#define calModuleGetEntry(func, ctx, module, string) fake.GetFunc(func, module)
358 | #else
359 | 
360 | static inline CALresult calCtxRunProgram_a(CALevent* event, CALcontext ctx, CALfunc func, CALdomain* rect)
361 | {
362 | 	fake.FakeKernel(func, event, 0);
363 | 	return(calCtxRunProgram(&fake.event[*event].through, ctx, fake.module[func].throughFunc, rect));
364 | }
365 | 
366 | static inline CALresult calMemCopy_a(CALevent* event, CALcontext ctx, CALmem src, CALmem dest, CALuint flags)
367 | {
368 | 	fake.FakeMemcpy(src, dest, event, 0);
369 | 	return(calMemCopy(&fake.event[*event].through, ctx, fake.mem[src].through, fake.mem[dest].through, flags));
370 | }
371 | 
372 | static inline CALresult calCtxRunProgram_b(CALevent* event, CALcontext ctx, CALfunc func, CALdomain* rect, int allowOverlap = 0)
373 | {
374 | 	fake.FakeKernel(func, event, allowOverlap);
375 | 	return(calCtxRunProgram(&fake.event[*event].through, ctx, fake.module[func].throughFunc, rect));
376 | }
377 | 
378 | static inline CALresult calMemCopy_b(CALevent* event, CALcontext ctx, CALmem src, CALmem dest, CALuint flags, int allowOverlap = 0)
379 | {
380 | 	fake.FakeMemcpy(src, dest, event, allowOverlap);
381 | 	return(calMemCopy(&fake.event[*event].through, ctx, fake.mem[src].through, fake.mem[dest].through, flags));
382 | }
383 | 
384 | static inline CALresult calCtxIsEventDone_a(CALcontext ctx, CALevent event)
385 | {
386 | 	CALresult retVal = calCtxIsEventDone(ctx, fake.event[event].through);
387 | 	if (retVal == CAL_RESULT_OK) fake.QueryEvent(event);
388 | 	return(retVal);
389 | }
390 | 
391 | static inline CALresult calCtxGetMem_a(CALmem* mem, CALcontext ctx, CALresource res)
392 | {
393 | 	fake.AddMemHandle(mem);
394 | 	return(calCtxGetMem(&fake.mem[*mem].through, ctx, res));
395 | }
396 | 
397 | static inline CALresult calCtxSetMem_a(CALcontext ctx, CALname name, CALmem mem)
398 | {
399 | 	fake.SetMem(name, mem);
400 | 	return(calCtxSetMem(ctx, fake.name[name].through, fake.mem[mem].through));
401 | }
402 | 
403 | static inline CALresult calCtxReleaseMem_a(CALcontext ctx, CALmem mem)
404 | {
405 | 	fake.ReleaseMem(mem);
406 | 	return(calCtxReleaseMem(ctx, fake.mem[mem].through));
407 | }
408 | 
409 | static inline CALresult calModuleLoad_a(CALmodule* module, CALcontext ctx, CALimage image)
410 | {
411 | 	fake.AddModule(module);
412 | 	return(calModuleLoad(&fake.module[*module].through, ctx, image));
413 | }
414 | 
415 | static inline CALresult calModuleUnload_a(CALcontext ctx, CALmodule module)
416 | {
417 | 	fake.UnloadModule(module);
418 | 	return(calModuleUnload(ctx, fake.module[module].through));
419 | }
420 | 
421 | static inline CALresult calModuleGetName_a(CALname* name, CALcontext ctx, CALmodule module, const CALchar* symbolname)
422 | {
423 | 	fake.AddName(name, module);
424 | 	return(calModuleGetName(&fake.name[*name].through, ctx, fake.module[module].through, symbolname));
425 | }
426 | 
427 | static inline CALresult calModuleGetEntry_a(CALfunc* func, CALcontext ctx, CALmodule module, const CALchar* symbolname)
428 | {
429 | 	fake.GetFunc(func, module);
430 | 	return(calModuleGetEntry(&fake.module[module].throughFunc, ctx, fake.module[module].through, symbolname));
431 | }
432 | 
// Pass-through mode: from here on, every use of these CAL entry points is
// redirected to the *_a wrapper defined above, which updates the fake tracker
// and then calls the real function. The wrappers themselves are defined before
// these macros, so inside them the names still refer to the real CAL functions.
#define calCtxRunProgram calCtxRunProgram_a
#define calMemCopy calMemCopy_a
#define calCtxIsEventDone calCtxIsEventDone_a
#define calCtxGetMem calCtxGetMem_a
#define calCtxSetMem calCtxSetMem_a
#define calCtxReleaseMem calCtxReleaseMem_a
#define calModuleLoad calModuleLoad_a
#define calModuleUnload calModuleUnload_a
#define calModuleGetName calModuleGetName_a
#define calModuleGetEntry calModuleGetEntry_a
443 | 
444 | #endif
445 | 
446 | #endif


--------------------------------------------------------------------------------
/cal_private_ext.h:
--------------------------------------------------------------------------------
  1 | /* ============================================================
  2 | 
  3 | Copyright (c) 2007 Advanced Micro Devices, Inc.  All rights reserved.
  4 | 
  5 | Redistribution and use of this material is permitted under the following
  6 | conditions:
  7 | 
  8 | Redistributions must retain the above copyright notice and all terms of this
  9 | license.
 10 | 
 11 | In no event shall anyone redistributing or accessing or using this material
 12 | commence or participate in any arbitration or legal action relating to this
 13 | material against Advanced Micro Devices, Inc. or any copyright holders or
 14 | contributors. The foregoing shall survive any expiration or termination of
 15 | this license or any agreement or access or use related to this material.
 16 | 
 17 | ANY BREACH OF ANY TERM OF THIS LICENSE SHALL RESULT IN THE IMMEDIATE REVOCATION
 18 | OF ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE THIS MATERIAL.
 19 | 
 20 | THIS MATERIAL IS PROVIDED BY ADVANCED MICRO DEVICES, INC. AND ANY COPYRIGHT
 21 | HOLDERS AND CONTRIBUTORS "AS IS" IN ITS CURRENT CONDITION AND WITHOUT ANY
 22 | REPRESENTATIONS, GUARANTEE, OR WARRANTY OF ANY KIND OR IN ANY WAY RELATED TO
 23 | SUPPORT, INDEMNITY, ERROR FREE OR UNINTERRUPTED OPERATION, OR THAT IT IS FREE
 24 | FROM DEFECTS OR VIRUSES.  ALL OBLIGATIONS ARE HEREBY DISCLAIMED - WHETHER
 25 | EXPRESS, IMPLIED, OR STATUTORY - INCLUDING, BUT NOT LIMITED TO, ANY IMPLIED
 26 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE,
 27 | ACCURACY, COMPLETENESS, OPERABILITY, QUALITY OF SERVICE, OR NON-INFRINGEMENT.
 28 | IN NO EVENT SHALL ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR
 29 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, PUNITIVE,
 30 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 31 | OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, REVENUE, DATA, OR PROFITS; OR
 32 | BUSINESS INTERRUPTION) HOWEVER CAUSED OR BASED ON ANY THEORY OF LIABILITY
 33 | ARISING IN ANY WAY RELATED TO THIS MATERIAL, EVEN IF ADVISED OF THE POSSIBILITY
 34 | OF SUCH DAMAGE. THE ENTIRE AND AGGREGATE LIABILITY OF ADVANCED MICRO DEVICES,
 35 | INC. AND ANY COPYRIGHT HOLDERS AND CONTRIBUTORS SHALL NOT EXCEED TEN DOLLARS
 36 | (US $10.00). ANYONE REDISTRIBUTING OR ACCESSING OR USING THIS MATERIAL ACCEPTS
 37 | THIS ALLOCATION OF RISK AND AGREES TO RELEASE ADVANCED MICRO DEVICES, INC. AND
 38 | ANY COPYRIGHT HOLDERS AND CONTRIBUTORS FROM ANY AND ALL LIABILITIES,
 39 | OBLIGATIONS, CLAIMS, OR DEMANDS IN EXCESS OF TEN DOLLARS (US $10.00). THE
 40 | FOREGOING ARE ESSENTIAL TERMS OF THIS LICENSE AND, IF ANY OF THESE TERMS ARE
 41 | CONSTRUED AS UNENFORCEABLE, FAIL IN ESSENTIAL PURPOSE, OR BECOME VOID OR
 42 | DETRIMENTAL TO ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR
 43 | CONTRIBUTORS FOR ANY REASON, THEN ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE
 44 | THIS MATERIAL SHALL TERMINATE IMMEDIATELY. MOREOVER, THE FOREGOING SHALL
 45 | SURVIVE ANY EXPIRATION OR TERMINATION OF THIS LICENSE OR ANY AGREEMENT OR
 46 | ACCESS OR USE RELATED TO THIS MATERIAL.
 47 | 
 48 | NOTICE IS HEREBY PROVIDED, AND BY REDISTRIBUTING OR ACCESSING OR USING THIS
 49 | MATERIAL SUCH NOTICE IS ACKNOWLEDGED, THAT THIS MATERIAL MAY BE SUBJECT TO
 50 | RESTRICTIONS UNDER THE LAWS AND REGULATIONS OF THE UNITED STATES OR OTHER
 51 | COUNTRIES, WHICH INCLUDE BUT ARE NOT LIMITED TO, U.S. EXPORT CONTROL LAWS SUCH
 52 | AS THE EXPORT ADMINISTRATION REGULATIONS AND NATIONAL SECURITY CONTROLS AS
 53 | DEFINED THEREUNDER, AS WELL AS STATE DEPARTMENT CONTROLS UNDER THE U.S.
 54 | MUNITIONS LIST. THIS MATERIAL MAY NOT BE USED, RELEASED, TRANSFERRED, IMPORTED,
 55 | EXPORTED AND/OR RE-EXPORTED IN ANY MANNER PROHIBITED UNDER ANY APPLICABLE LAWS,
 56 | INCLUDING U.S. EXPORT CONTROL LAWS REGARDING SPECIFICALLY DESIGNATED PERSONS,
 57 | COUNTRIES AND NATIONALS OF COUNTRIES SUBJECT TO NATIONAL SECURITY CONTROLS.
 58 | MOREOVER, THE FOREGOING SHALL SURVIVE ANY EXPIRATION OR TERMINATION OF ANY
 59 | LICENSE OR AGREEMENT OR ACCESS OR USE RELATED TO THIS MATERIAL.
 60 | 
 61 | NOTICE REGARDING THE U.S. GOVERNMENT AND DOD AGENCIES: This material is
 62 | provided with "RESTRICTED RIGHTS" and/or "LIMITED RIGHTS" as applicable to
 63 | computer software and technical data, respectively. Use, duplication,
 64 | distribution or disclosure by the U.S. Government and/or DOD agencies is
 65 | subject to the full extent of restrictions in all applicable regulations,
 66 | including those found at FAR52.227 and DFARS252.227 et seq. and any successor
 67 | regulations thereof. Use of this material by the U.S. Government and/or DOD
 68 | agencies is acknowledgment of the proprietary rights of any copyright holders
 69 | and contributors, including those of Advanced Micro Devices, Inc., as well as
 70 | the provisions of FAR52.227-14 through 23 regarding privately developed and/or
 71 | commercial computer software.
 72 | 
 73 | This license forms the entire agreement regarding the subject matter hereof and
 74 | supersedes all proposals and prior discussions and writings between the parties
 75 | with respect thereto. This license does not affect any ownership, rights, title,
 76 | or interest in, or relating to, this material. No terms of this license can be
 77 | modified or waived, and no breach of this license can be excused, unless done
 78 | so in a writing signed by all affected parties. Each term of this license is
 79 | separately enforceable. If any term of this license is determined to be or
 80 | becomes unenforceable or illegal, such term shall be reformed to the minimum
 81 | extent necessary in order for this license to remain in effect in accordance
 82 | with its terms as modified by such reformation. This license shall be governed
 83 | by and construed in accordance with the laws of the State of Texas without
 84 | regard to rules on conflicts of law of any state or jurisdiction or the United
 85 | Nations Convention on the International Sale of Goods. All disputes arising out
 86 | of this license shall be subject to the jurisdiction of the federal and state
 87 | courts in Austin, Texas, and all defenses are hereby waived concerning personal
 88 | jurisdiction and venue of these courts.
 89 | 
 90 | ============================================================ */
 91 | 
 92 | #ifndef __CAL_PRIVATE_EXT_H__
 93 | #define __CAL_PRIVATE_EXT_H__
 94 | 
 95 | #include "cal_ext.h"
 96 | 
 97 | #ifdef __cplusplus
 98 | extern "C" {
 99 | #endif
100 | 
101 | #ifndef CALAPIENTRYP
102 | #define CALAPIENTRYP CALAPIENTRY *
103 | #endif
104 | 
105 | 
// Identifier of a private (non-public) CAL extension.
// NOTE(review): presumably used like the public extension ids from cal_ext.h
// to look up the sync-object extension - confirm against the caller.
typedef enum calPrivateExtidEnum {
    CAL_PRIVATE_EXT_SYNC_OBJECT       = 0x8009,
} calPrivateExtid;
109 | 
110 | 
// flags for calCtxWaitForEvents
// NOTE(review): the function documentation in this file describes the flags
// parameter as "currently unused"; whether these values have any effect
// cannot be told from this header - confirm.
typedef enum CALwaitTypeEnum
{
    CAL_WAIT_LOW_CPU_UTILIZATION    = 0,
    CAL_WAIT_POLLING                = 1,
} CALwaitType;
117 | 
118 | /**
119 |  * @fn calCtxWaitForEvents(CALcontext ctx,
120 |  *                         CALevent *events,
121 |  *                         CALuint n,
122 |  *                         CALuint flags)
123 |  *
124 |  * @brief wait until all programs referenced by event list have executed.
125 |  *
126 |  * @param ctx (in)    - CAL context
127 |  * @param events (in) - array of events
128 |  * @param n (in)      - number of events
129 |  * @param flags (in)  - currently unused.
130 |  *
131 |  * @return Returns CAL_RESULT_OK on success, CAL_RESULT_ERROR if there was an error.
132 |  *
133 |  */
134 | typedef CALresult (CALAPIENTRYP PFNCALCTXWAITFOREVENTS) (CALcontext ctx, CALevent *events, CALuint n, CALuint flags);
135 | 
136 | 
137 | #ifdef __cplusplus
138 | }
139 | #endif
140 | #endif // __CAL_PRIVATE_EXT_H__
141 | 
142 | 
143 | 
144 | 


--------------------------------------------------------------------------------
/caldgemm.cl:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the CALDGEMM library.
  3 |  *
  4 |  * Copyright 2015:
  5 |  *  - David Rohr (drohr@jwdt.org)
  6 |  *  - Matthias Bach (bach@compeng.uni-frankfurt.de)
  7 |  *  - Matthias Kretz (kretz@compeng.uni-frankfurt.de)
  8 |  *
  9 |  * This file is part of CALDGEMM.
 10 |  *
 11 |  * CALDGEMM is free software: you can redistribute it and/or modify
 12 |  * it under the terms of the GNU Lesser General Public License as published by
 13 |  * the Free Software Foundation, either version 3 of the License, or
 14 |  * (at your option) any later version.
 15 |  *
 16 |  * CALDGEMM is distributed in the hope that it will be useful,
 17 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 18 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 19 |  * GNU Lesser General Public License for more details.
 20 |  *
 21 |  * You should have received a copy of the GNU Lesser General Public License
 22 |  * along with CALDGEMM.  If not, see <http://www.gnu.org/licenses/>.
 23 |  */
 24 | 
 25 | #define qon_mstr(a) #a
 26 | #define qon_mxstr(a) qon_mstr(a)
 27 | 
 28 | #ifdef OCL_USE_SIMPLE_BUFFERS
 29 | 
 30 | #ifdef CALDGEMM_TRANSPOSED_B
 31 | 
// OpenCL source (selected when OCL_USE_SIMPLE_BUFFERS and CALDGEMM_TRANSPOSED_B
// are defined), prefixed with OCL_KERNEL_PRE.
// Each work-item strides over (i, j) with i < height2 on dimension 1 and
// j < height1 on dimension 0 and computes
//   C[offset + i * pitch + j] = beta * C[...] + alpha * sum_k A[i * width + k] * B[j * width + k]
// CALDGEMM_FORCE_K replaces the runtime loop bound 'width' by a compile-time
// constant; CALDGEMM_ALPHA1 drops the multiplication by alpha.
const char *caldgemm_opencl::OCLKernelName =
OCL_KERNEL_PRE
"//KERNEL TRANSPOSED B SIMPLE BUFFERS\n"
"__kernel void oclkernel(__global double* C, __global const double* __restrict const A, __global const double* __restrict const B, int height1, int height2, int width, double alpha, double beta, int pitch, ulong offset)\n"
"{\n"
"	int i, j, k;\n"
"	for (i = get_global_id(1);i < height2;i += get_global_size(1))\n"
"	{\n"
"		for (j = get_global_id(0);j < height1;j += get_global_size(0))\n"
"		{\n"
"			double addval = 0.;\n"
#ifdef CALDGEMM_FORCE_K
"			for (k = 0;k < " qon_mxstr(CALDGEMM_FORCE_K) ";k++)\n"
#else
"			for (k = 0;k < width;k++)\n"
#endif
"			{\n"
"				addval += A[i * width + k] * B[j * width + k];\n"
"			}\n"
#ifdef CALDGEMM_ALPHA1
"			C[offset + i * pitch + j] = beta * C[offset + i * pitch + j] + addval;\n"
#else
"			C[offset + i * pitch + j] = beta * C[offset + i * pitch + j] + alpha * addval;\n"
#endif
"		}\n"
"	}\n"
"}\n"
;
 60 | 
 61 | #else
 62 | 
// OpenCL source (selected when OCL_USE_SIMPLE_BUFFERS is defined and
// CALDGEMM_TRANSPOSED_B is not), prefixed with OCL_KERNEL_PRE.
// Same (i, j) work distribution and C update as the transposed-B variant, but
// the inner product reads A[k * height2 + i] and B[k * height1 + j].
// CALDGEMM_FORCE_K replaces the runtime loop bound 'width' by a compile-time
// constant; CALDGEMM_ALPHA1 drops the multiplication by alpha.
const char *caldgemm_opencl::OCLKernelName =
OCL_KERNEL_PRE
"//KERNEL TRANSPOSED A SIMPLE BUFFERS\n"
"__kernel void oclkernel(__global double* C, __global const double* __restrict const A, __global const double* __restrict const B, int height1, int height2, int width, double alpha, double beta, int pitch, ulong offset)\n"
"{\n"
"	int i, j, k;\n"
"	for (i = get_global_id(1);i < height2;i += get_global_size(1))\n"
"	{\n"
"		for (j = get_global_id(0);j < height1;j += get_global_size(0))\n"
"		{\n"
"			double addval = 0.;\n"
#ifdef CALDGEMM_FORCE_K
"			for (k = 0;k < " qon_mxstr(CALDGEMM_FORCE_K) ";k++)\n"
#else
"			for (k = 0;k < width;k++)\n"
#endif
"			{\n"
"				addval += A[k * height2 + i] * B[k * height1 + j];\n"
"			}\n"
#ifdef CALDGEMM_ALPHA1
"			C[offset + i * pitch + j] = beta * C[offset + i * pitch + j] + addval;\n"
#else
"			C[offset + i * pitch + j] = beta * C[offset + i * pitch + j] + alpha * addval;\n"
#endif
"		}\n"
"	}\n"
"}\n"
;
 91 | 
 92 | #endif
 93 | 
 94 | 
 95 | #else //OCL_USE_SIMPLE_BUFFERS
 96 | 
 97 | 
 98 | #ifdef CALDGEMM_TRANSPOSED_B
 99 | 
// OpenCL source (image inputs, CALDGEMM_TRANSPOSED_B defined), prefixed with
// OCL_KERNEL_PRE.
// A and B are read with read_imageui; each uint4 texel is reinterpreted as a
// double2 through the double_read union, so one texel carries two consecutive
// k values and the k loop runs to width / 2 (or CALDGEMM_FORCE_K / 2).
// A is sampled at (k, i), B at (k, j); the C update is
//   C[offset + i * pitch + j] = beta * C[...] + alpha * addval
// with the alpha factor dropped when CALDGEMM_ALPHA1 is defined.
const char *caldgemm_opencl::OCLKernelName =
OCL_KERNEL_PRE
"//KERNEL TRANSPOSED B TEXTURE BUFFERS\n"
"union double_read {uint4 f; double2 d;};\n"
"__kernel void oclkernel(__global double* C, image2d_t A, image2d_t B, int height1, int height2, int width, double alpha, double beta, int pitch, ulong offset)\n"
"{\n"
"	const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
"	int i, j, k;\n"
"	for (i = get_global_id(1);i < height2;i += get_global_size(1))\n"
"	{\n"
"		for (j = get_global_id(0);j < height1;j += get_global_size(0))\n"
"		{\n"
"			double addval = 0.;\n"
#ifdef CALDGEMM_FORCE_K
"			for (k = 0;k < " qon_mxstr(CALDGEMM_FORCE_K) " / 2;k++)\n"
#else
"			for (k = 0;k < width / 2;k++)\n"
#endif
"			{\n"
"				float2 coord;\n"
"				union double_read tmp, tmp2;\n"
"				coord.x = k;\n"
"				coord.y = i;\n"
"				tmp.f = read_imageui(A, sampler, coord);\n"
"				coord.y = j;\n"
"				tmp2.f = read_imageui(B, sampler, coord);\n"
"				addval += tmp.d.x * tmp2.d.x + tmp.d.y * tmp2.d.y;\n"
"			}\n"
#ifdef CALDGEMM_ALPHA1
"			C[offset + i * pitch + j] = beta * C[offset + i * pitch + j] + addval;\n"
#else
"			C[offset + i * pitch + j] = beta * C[offset + i * pitch + j] + alpha * addval;\n"
#endif
"		}\n"
"	}\n"
"}\n"
;
137 | 
138 | #elif defined(CALDGEMM_TRANSPOSED_A)
139 | 
140 | #ifndef OCL_TILED_KERNEL
141 | 
// OpenCL source (image inputs, CALDGEMM_TRANSPOSED_A defined, OCL_TILED_KERNEL
// not defined), prefixed with OCL_KERNEL_PRE.
// Each uint4 texel is reinterpreted as a double2 through the double_read
// union. A is sampled at (i / 2, k) and B at (j / 2, k); the parity of i and j
// selects the .x or .y half of the texel. One output element is produced per
// (i, j) pair:
//   C[offset + i * pitch + j] = beta * C[...] + alpha * addval
// with the alpha factor dropped when CALDGEMM_ALPHA1 is defined.
const char *caldgemm_opencl::OCLKernelName =
OCL_KERNEL_PRE
"//KERNEL TRANSPOSED A TEXTURE BUFFERS\n"
"union double_read {uint4 f; double2 d;};\n"
"__kernel void oclkernel(__global double* C, image2d_t A, image2d_t B, int height1, int height2, int width, double alpha, double beta, int pitch, ulong offset)\n"
"{\n"
"	const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
"	int i, j, k;\n"
"	for (i = get_global_id(1);i < height2;i += get_global_size(1))\n"
"	{\n"
"		for (j = get_global_id(0);j < height1;j += get_global_size(0))\n"
"		{\n"
"			double addval = 0.;\n"
#ifdef CALDGEMM_FORCE_K
"			for (k = 0;k < " qon_mxstr(CALDGEMM_FORCE_K) ";k++)\n"
#else
"			for (k = 0;k < width;k++)\n"
#endif
"			{\n"
"				float2 coord;\n"
"				union double_read tmp, tmp2;\n"
"				coord.x = i / 2;\n"
"				coord.y = k;\n"
"				tmp.f = read_imageui(A, sampler, coord);\n"
"				coord.x = j / 2;\n"
"				tmp2.f = read_imageui(B, sampler, coord);\n"
"				double v1 = (i & 1) ? tmp.d.y : tmp.d.x, v2 = (j & 1) ? tmp2.d.y : tmp2.d.x;\n"
"				addval += v1 * v2;\n"
"			}\n"
#ifdef CALDGEMM_ALPHA1
"			C[offset + i * pitch + j] = beta * C[offset + i * pitch + j] + addval;\n"
#else
"			C[offset + i * pitch + j] = beta * C[offset + i * pitch + j] + alpha * addval;\n"
#endif
"		}\n"
"	}\n"
"}\n"
;
180 | 
181 | #else
182 | 
// OpenCL source (image inputs, CALDGEMM_TRANSPOSED_A and OCL_TILED_KERNEL
// defined), prefixed with OCL_KERNEL_PRE.
// Each work-item accumulates an addval[OCL_TILING_X][OCL_TILING_Y] tile; the
// tile sizes are pasted into the kernel source as #defines. Per k step it
// loads OCL_TILING_X / 2 texels of A starting at x = i / 2 and
// OCL_TILING_Y / 2 texels of B starting at x = j / 2 (each texel holds two
// doubles) and updates the tile with mad(). The tile is then written to
// C[offset + (i + k) * pitch + j + l], scaled by alpha unless CALDGEMM_ALPHA1
// is defined.
// NOTE(review): i advances by OCL_TILING_Y and j by OCL_TILING_X, while the
// first tile index (taken from A / row i + k) ranges over OCL_TILING_X and the
// second (taken from B / column j + l) over OCL_TILING_Y. These agree only
// when OCL_TILING_X == OCL_TILING_Y - confirm that non-square tiles are never
// configured.
// NOTE(review): the B load loop is the only inner loop here that is not
// preceded by "#pragma unroll" - confirm whether that is intentional.
const char *caldgemm_opencl::OCLKernelName =
OCL_KERNEL_PRE
"//KERNEL TRANSPOSED A TEXTURE BUFFERS TILED\n"
"//#pragma OPENCL EXTENSION CP_FP_FMA\n"
"union double_read {uint4 f; double2 d;};\n"
"#define OCL_TILING_X " qon_mxstr(OCL_TILING_X) "\n"
"#define OCL_TILING_Y " qon_mxstr(OCL_TILING_Y) "\n"
"__kernel void oclkernel(__global double* C, image2d_t A, image2d_t B, int height1, int height2, int width, double alpha, double beta, int pitch, ulong offset)\n"
"{\n"
"	const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
"	int i, j, k, l, m;\n"
"	for (i = get_global_id(1) * OCL_TILING_Y;i < height2;i += get_global_size(1) * OCL_TILING_Y)\n"
"	{\n"
"		for (j = get_global_id(0) * OCL_TILING_X;j < height1;j += get_global_size(0) * OCL_TILING_X)\n"
"		{\n"
"			double addval[OCL_TILING_X][OCL_TILING_Y];\n"
"#pragma unroll\n"
"			for (k = 0;k < OCL_TILING_X;k++) for (l = 0;l < OCL_TILING_Y;l++) addval[k][l] = 0.;\n"
"#pragma unroll 1\n"
#ifdef CALDGEMM_FORCE_K
"			for (k = 0;k < " qon_mxstr(CALDGEMM_FORCE_K) ";k++)\n"
#else
"			for (k = 0;k < width;k++)\n"
#endif
"			{\n"
"				float2 coord;\n"
"				union double_read tmp[OCL_TILING_X / 2], tmp2[OCL_TILING_Y / 2];\n"
"				coord.y = k;\n"
"#pragma unroll\n"
"				for (l = 0;l < OCL_TILING_X / 2;l++)\n"
"				{\n"
"					coord.x = i / 2 + l;\n"
"					tmp[l].f = read_imageui(A, sampler, coord);\n"
"				}\n"
"				for (l = 0;l < OCL_TILING_Y / 2;l++)\n"
"				{\n"
"					coord.x = j / 2 + l;\n"
"					tmp2[l].f = read_imageui(B, sampler, coord);\n"
"				}\n"
"#pragma unroll\n"
"				for (l = 0;l < OCL_TILING_X / 2;l++)\n"
"				{\n"
"#pragma unroll\n"
"					for (m = 0;m < OCL_TILING_Y / 2;m++)\n"
"					{\n"
"						addval[2 * l][2 * m] = mad(tmp[l].d.x, tmp2[m].d.x, addval[2 * l][2 * m]);\n"
"						addval[2 * l + 1][2 * m] = mad(tmp[l].d.y, tmp2[m].d.x, addval[2 * l + 1][2 * m]);\n"
"						addval[2 * l][2 * m + 1] = mad(tmp[l].d.x, tmp2[m].d.y, addval[2 * l][2 * m + 1]);\n"
"						addval[2 * l + 1][2 * m + 1] = mad(tmp[l].d.y, tmp2[m].d.y, addval[2 * l + 1][2 * m + 1]);\n"

"					}\n"
"				}\n"
"			}\n"
"#pragma unroll\n"
"			for (k = 0;k < OCL_TILING_X;k++)\n"
"			{\n"
"#pragma unroll\n"
"				for (l = 0;l < OCL_TILING_Y;l++)\n"
"				{\n"
#ifdef CALDGEMM_ALPHA1
"					C[offset + (i + k) * pitch + j + l] = beta * C[offset + (i + k) * pitch + j + l] + addval[k][l];\n"
#else
"					C[offset + (i + k) * pitch + j + l] = beta * C[offset + (i + k) * pitch + j + l] + alpha * addval[k][l];\n"
#endif
"				}\n"
"			}\n"
"		}\n"
"	}\n"
"}\n"
;
253 | 
254 | #endif
255 | #endif
256 | #endif
257 | 


--------------------------------------------------------------------------------
/caldgemm_adl.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This file is part of the CALDGEMM library.
 3 |  *
 4 |  * Copyright 2015:
 5 |  *  - David Rohr (drohr@jwdt.org)
 6 |  *  - Matthias Bach (bach@compeng.uni-frankfurt.de)
 7 |  *  - Matthias Kretz (kretz@compeng.uni-frankfurt.de)
 8 |  *
 9 |  * This file is part of CALDGEMM.
10 |  *
11 |  * CALDGEMM is free software: you can redistribute it and/or modify
12 |  * it under the terms of the GNU Lesser General Public License as published by
13 |  * the Free Software Foundation, either version 3 of the License, or
14 |  * (at your option) any later version.
15 |  *
16 |  * CALDGEMM is distributed in the hope that it will be useful,
17 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 |  * GNU Lesser General Public License for more details.
20 |  *
21 |  * You should have received a copy of the GNU Lesser General Public License
22 |  * along with CALDGEMM.  If not, see <http://www.gnu.org/licenses/>.
23 |  */
24 | 
25 | #include "caldgemm_config_load.h"
26 | #include "cmodules/util_adl.cpp"
27 | 


--------------------------------------------------------------------------------
/caldgemm_cal.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Interface of the CALDGEMM library.
  3 |  *
  4 |  * Copyright 2015:
  5 |  *  - David Rohr (drohr@jwdt.org)
  6 |  *  - Matthias Bach (bach@compeng.uni-frankfurt.de)
  7 |  *  - Matthias Kretz (kretz@compeng.uni-frankfurt.de)
  8 |  *
  9 |  * This file is part of CALDGEMM.
 10 |  *
 11 |  * CALDGEMM is free software: you can redistribute it and/or modify
 12 |  * it under the terms of the GNU Lesser General Public License as published by
 13 |  * the Free Software Foundation, either version 3 of the License, or
 14 |  * (at your option) any later version.
 15 |  *
 16 |  * CALDGEMM is distributed in the hope that it will be useful,
 17 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 18 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 19 |  * GNU Lesser General Public License for more details.
 20 |  *
 21 |  * You should have received a copy of the GNU Lesser General Public License
 22 |  * along with CALDGEMM.  If not, see <http://www.gnu.org/licenses/>.
 23 |  */
 24 | 
 25 | #ifndef CALDGEMM_CAL_H
 26 | #define CALDGEMM_CAL_H
 27 | 
 28 | #include <cal.h>
 29 | #include <cal_ext.h>
 30 | #include <calcl.h>
 31 | #include "cal_private_ext.h"
 32 | 
 33 | #include <emmintrin.h>
 34 | 
 35 | #include "caldgemm.h"
 36 | 
 37 | class caldgemm_cal : public caldgemm //Backend class working on CAL handles (CALdevice, CALcontext, CALmodule, ...); derives from caldgemm
 38 | {
 39 | public:
 40 | 	caldgemm_cal();
 41 | 	virtual ~caldgemm_cal();
 42 | 
 43 | 	virtual double getMaxGPUTemperature();
 44 | 
 45 | private:
 46 | 	int adl_util_initialized; //NOTE(review): presumably records whether the ADL helper was set up - confirm in caldgemm_cal.cpp
 47 | 	virtual int UseOutputPthreads();
 48 | 	virtual int UseInputPthreads();
 49 | 	virtual int UseMutexPerDevice();
 50 | 
 51 | 	unsigned int numInputs, numOutputs, numConstantBuffers;
 52 | 	//Compile-time buffer counts dwBuffersA / dwBuffersB / dwBuffersC, selected by the tiling and buffering macros
 53 | #ifdef CALDGEMM_44
 54 | #ifdef CALDGEMM_SINGLE_BUFFER
 55 | 	static const unsigned int dwBuffersA = 1;
 56 | #elif !defined(CALDGEMM_48) & !defined(CALDGEMM_DOUBLE_BUFFERS)
 57 | 	static const unsigned int dwBuffersA = 2;
 58 | #else
 59 | 	static const unsigned int dwBuffersA = 4;
 60 | #endif
 61 | #ifdef CALDGEMM_SINGLE_BUFFER
 62 | 	static const unsigned int dwBuffersB = 1;
 63 | #elif !defined(CALDGEMM_84) & !defined(CALDGEMM_DOUBLE_BUFFERS)
 64 | 	static const unsigned int dwBuffersB = 2;
 65 | #else
 66 | 	static const unsigned int dwBuffersB = 4;
 67 | #endif
 68 | #else //CALDGEMM_44
 69 | #ifdef CALDGEMM_TRANSPOSED_A
 70 | 	static const unsigned int dwBuffersA = 2;
 71 | #else
 72 | 	static const unsigned int dwBuffersA = 8;
 73 | #endif
 74 | 	static const unsigned int dwBuffersB = 2;
 75 | #endif //CALDGEMM_44
 76 | 
 77 | #ifdef CALDGEMM_USE_MEMEXPORT
 78 | 	static const unsigned int dwBuffersC = 1;
 79 | #else
 80 | 	static const unsigned int dwBuffersC = 8;
 81 | #endif
 82 | 	//Description of one buffer: typed views of its pointer, its dimensions and the CAL resource / memory handles stored with it
 83 | 	struct BufferProperties
 84 | 	{
 85 | 		union //All members alias the same pointer value
 86 | 		{
 87 | 			float*  ptr_float;
 88 | 			unsigned int*   ptr_uint;
 89 | 			int*    ptr_int;
 90 | 			double* ptr_double;
 91 | 			char*   ptr_char;
 92 | 			void*   ptr_void;
 93 | 		};
 94 | 		unsigned int Width;
 95 | 		unsigned int Height;
 96 | 		unsigned int VectorSize;
 97 | 		unsigned int DataSize;
 98 | 
 99 | 		bool CALMemory;
100 | 		CALresource res;
101 | 		CALmem mem;
102 | 		CALmem dstMem;
103 | 		unsigned int pitch;
104 | 		CALresource tmpres;
105 | 		CALmem tmpmem;
106 | 		
107 | 		BufferProperties* conversionBuffer;
108 | 	};
109 | 	//CALDGEMM_DIVBUFA optionally appends a tmpBuffer parameter (see caldgemm_config_load.h)
110 | 	int divideBuffer(BufferProperties* dst, double* src, int width, int height, int gpu_width, int gpu_height, int pitch, int numBuffers, bool transpose CALDGEMM_DIVBUFA);
111 | 	int mergeBuffers(double* dst, BufferProperties* src, int width, int height, int gpu_width, int gpu_height, int pitch, int numBuffers);
112 | 	void checkCalPatch();
113 | 	void cal_init_constant_data(BufferProperties* &data, double alpha);
114 | 	virtual int DGEMM_prepare_backend(size_t k, int j, unsigned int num_device, bool prepareM, bool prepareN, bool buffersSufficiant, bool buffersSufficiant0 CALDGEMM_DIVBUFA);
115 | 
116 | 	struct CALVersion {unsigned int major, minor, imp;};
117 | 
118 | 	virtual	int Initialize (bool nocalinit);
119 | 	int SetupKernel(const char* ILKernel, CALmodule* module, CALcontext* ctx, unsigned int device_num, bool disassemble = false);
120 | 	int RunProgram(CALcontext* ctx, CALmodule* module, unsigned int Width, unsigned int Height, CALevent* event);
121 | 	int CleanupData(CALcontext* ctx, CALresource* &resourceHandler, BufferProperties* &data, unsigned int numHandles, int nContext, unsigned int num_device);
122 | 	int Cleanup(CALdevice* device, CALcontext* ctx, CALmodule* module, CALresource* &resourceHandler, BufferProperties* &data, unsigned int numHandles, int nContext, unsigned int num_device);
123 | 	int SetupData(CALmodule* module, CALresource* &_Res, BufferProperties* &data, CALdevice* device, CALcontext* ctx, unsigned int numInputs, unsigned int numOutputs, unsigned int numConstantBuffers, CALname** ctxProgNames, int nContext, unsigned int num_device);
124 | 	int CopyDataFromGPU(int nDevice, CALresource* _Res, BufferProperties* data, unsigned int num, int nContext, size_t lastm, size_t lastn, int mustlock = 0);
125 | 	int CopyDataToGPU(int nDevice, CALresource* _Res, BufferProperties* data, unsigned int num, int nContext, bool constants, BufferProperties* dest_data = NULL);
126 | 	int ValidateCALRuntime();
127 | 	//Event storage; one instance exists per [device][obuffercount] entry of the events member below
128 | 	class eventCls
129 | 	{
130 | 	public:
131 | #ifdef CALDGEMM_QUERY_ALL_EVENTS
132 | 		CALevent events[13];
133 | 		volatile int nEvents; //Number of slots handed out since the last Reset(); not initialized by this class itself
134 | 		inline CALevent* GetNextEvent() //Returns the next unused slot; prints a message and exits the process once all 13 slots are taken
135 | 		{
136 | 			if (nEvents == 13)
137 | 			{
138 | 				fprintf(STD_OUT, "Event buffer overflow\n");
139 | 				exit(1);
140 | 			}
141 | 			return(&events[nEvents++]);
142 | 		}
143 | 		inline void Reset() {nEvents = 0;}
144 | #else
145 | 		CALevent events[1];
146 | 		static const int nEvents = 1;
147 | 		inline CALevent* GetNextEvent() {return(&events[0]);} //Single slot: every call returns the same event
148 | 		inline void Reset() {}; //Nothing to reset with a single slot
149 | #endif
150 | 	};
151 | 
152 | 	PFNCALCTXWAITFOREVENTS calCtxWaitForEvents;
153 | 	//Per-device state, indexed by device number (and buffer / kernel index where two-dimensional)
154 | 	BufferProperties* datas[max_devices][max_bbuffers];
155 | 	CALdevice devices[max_devices];
156 | 	CALcontext ctxs[max_devices];
157 | 	CALresource* resourceHandlers[max_devices][max_bbuffers];
158 | 	CALmodule modules[max_devices][kernel_count];
159 | 	CALmodule modulesConvert[max_devices];
160 | 	CALmodule fakeModule;
161 | 	CALname *progNames[max_devices][kernel_count];
162 | 	CALname progNamesConvert[max_devices][2 * dwBuffersA];
163 | 	eventCls events[max_devices][obuffercount];
164 | 	unsigned int device_nums[max_devices];
165 | 
166 | 	static const char *ILKernel, *ILKernelALPHA1, *ILKernelLinpack, *ILFakeKernel, *ILConvertKernel;
167 | 	//Virtual backend hooks; NOTE(review): presumably overriding the interface declared in caldgemm.h - confirm there
168 | 	virtual int ValidateRuntime();
169 | 	virtual int CheckDevices();
170 | 	virtual int InitDevices();
171 | 	virtual int ReinitDevices();
172 | 	virtual int InitConstantData(double alpha);
173 | 	virtual int ExecuteKernels(caldgemm::DGEMMPrepareAndExecuteTask& Task, int blockm, int blockn);
174 | 	virtual int ExitRuntime();
175 | 	virtual int ExitDevices();
176 | 	virtual int WaitForEvent(int, int, int lock = 0);
177 | 	virtual int FetchResult(int device, int j, int m, int n, int mustlock = 0);
178 | 	virtual int CheckDMAQueue(int device, int forcej = -1);
179 | 	virtual int RunMergeBuffers(double* dst, int device, int j, int width, int height, int gpu_width, int gpu_height, int pitch);
180 | };
181 | 
182 | #endif
183 | 


--------------------------------------------------------------------------------
/caldgemm_cblas_wrapper.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This file is part of the CALDGEMM library.
 3 |  *
 4 |  * Copyright 2015:
 5 |  *  - David Rohr (drohr@jwdt.org)
 6 |  *  - Matthias Bach (bach@compeng.uni-frankfurt.de)
 7 |  *  - Matthias Kretz (kretz@compeng.uni-frankfurt.de)
 8 |  *
 9 |  * This file is part of CALDGEMM.
10 |  *
11 |  * CALDGEMM is free software: you can redistribute it and/or modify
12 |  * it under the terms of the GNU Lesser General Public License as published by
13 |  * the Free Software Foundation, either version 3 of the License, or
14 |  * (at your option) any later version.
15 |  *
16 |  * CALDGEMM is distributed in the hope that it will be useful,
17 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 |  * GNU Lesser General Public License for more details.
20 |  *
21 |  * You should have received a copy of the GNU Lesser General Public License
22 |  * along with CALDGEMM.  If not, see <http://www.gnu.org/licenses/>.
23 |  */
24 | //Prototypes of the cblas_*a routines. NOTE(review): the parameter lists follow the standard cblas_dtrsm / dgemv / dgemm / daxpy / dscal calls (without const qualifiers); the implementation is not visible here - confirm
25 | void cblas_dtrsma(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
26 | void cblas_dgemva(enum CBLAS_ORDER order,  enum CBLAS_TRANSPOSE trans,  blasint m, blasint n, double alpha, double  *a, blasint lda,  double  *x, blasint incx,  double beta,  double  *y, blasint incy);
27 | void cblas_dgemma(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
28 | void cblas_daxpya(blasint n, double, double *x, blasint incx, double *y, blasint incy);
29 | void cblas_dscala(blasint N, double alpha, double *X, blasint incX);
30 | 


--------------------------------------------------------------------------------
/caldgemm_common.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This file is part of the CALDGEMM library.
  3 |  *
  4 |  * Copyright 2015:
  5 |  *  - David Rohr (drohr@jwdt.org)
  6 |  *  - Matthias Bach (bach@compeng.uni-frankfurt.de)
  7 |  *  - Matthias Kretz (kretz@compeng.uni-frankfurt.de)
  8 |  *
  9 |  * This file is part of CALDGEMM.
 10 |  *
 11 |  * CALDGEMM is free software: you can redistribute it and/or modify
 12 |  * it under the terms of the GNU Lesser General Public License as published by
 13 |  * the Free Software Foundation, either version 3 of the License, or
 14 |  * (at your option) any later version.
 15 |  *
 16 |  * CALDGEMM is distributed in the hope that it will be useful,
 17 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 18 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 19 |  * GNU Lesser General Public License for more details.
 20 |  *
 21 |  * You should have received a copy of the GNU Lesser General Public License
 22 |  * along with CALDGEMM.  If not, see <http://www.gnu.org/licenses/>.
 23 |  */
 24 | 
 25 | #ifndef CALDGEMM_COMMON_H
 26 | #define CALDGEMM_COMMON_H
 27 | 
 28 | #include <signal.h>
 29 | 
 30 | #ifdef _WIN32 //Adjustments that only apply to Windows builds
 31 | #define __INTRIN_H_
 32 | #define _Complex
 33 | #ifndef __CUDA_ARCH__
 34 | #ifndef __restrict__
 35 | #define __restrict__ __restrict //Provide the __restrict__ spelling in terms of __restrict when it is not already a macro
 36 | #endif
 37 | #endif
 38 | #endif
 39 | 
 40 | #if !defined(_WIN32) & defined(USE_GOTO_BLAS) //GotoBLAS on non-Windows: all declarations come from the GotoBLAS headers
 41 | extern "C" {
 42 | #define CBLAS
 43 | #define ASSEMBLER
 44 | #include <common_linux.h>
 45 | #undef ASSEMBLER
 46 | #include <common.h>
 47 | }
 48 | #else //Every other configuration: declare what the GotoBLAS headers would have provided
 49 | 
 50 | #ifndef USE_GOTO_BLAS
 51 | #include <omp.h>
 52 | #endif
 53 | 
 54 | extern "C" int get_num_procs();
 55 | static inline void caldgemm_goto_reserve_cpu(int, int) {} //Empty stand-in
 56 | static inline void caldgemm_goto_reserve_cpus(int) {} //Empty stand-in
 57 | 
 58 | typedef int blasint;
 59 | extern "C" {
 60 | #ifdef USE_MKL
 61 | #include <mkl_cblas.h>
 62 | #else
 63 | #include <cblas.h>
 64 | #endif
 65 | }
 66 | 
 67 | #ifndef _WIN32
 68 | void goto_set_num_threads(int num);
 69 | void caldgemm_goto_restrict_cpus(int);
 70 | 
 71 | #ifdef USE_MKL //CBLAS_ENUM expands to nothing with MKL and to the enum keyword otherwise
 72 | #define CBLAS_ENUM
 73 | #else
 74 | #define CBLAS_ENUM enum
 75 | #endif
 76 | 
 77 | extern "C" {
 78 | void cblas_dtrsma(CBLAS_ENUM CBLAS_ORDER Order, CBLAS_ENUM CBLAS_SIDE Side, CBLAS_ENUM CBLAS_UPLO Uplo, CBLAS_ENUM CBLAS_TRANSPOSE TransA, CBLAS_ENUM CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
 79 | void cblas_dgemva(CBLAS_ENUM CBLAS_ORDER order,  CBLAS_ENUM CBLAS_TRANSPOSE trans,  blasint m, blasint n, double alpha, double  *a, blasint lda,  double  *x, blasint incx,  double beta,  double  *y, blasint incy);
 80 | void cblas_dgemma(CBLAS_ENUM CBLAS_ORDER Order, CBLAS_ENUM CBLAS_TRANSPOSE TransA, CBLAS_ENUM CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
 81 | void cblas_daxpya(blasint n, double, double *x, blasint incx, double *y, blasint incy);
 82 | void cblas_dscala(blasint N, double alpha, double *X, blasint incX);
 83 | }
 84 | #else //Windows: the thread-control helpers become empty inline functions
 85 | static inline void goto_set_num_threads(int) {}
 86 | static inline void caldgemm_goto_restrict_cpus(int) {}
 87 | #endif
 88 | 
 89 | #endif
 90 | 
 91 | #ifndef _WIN32 //CAST_FOR_MMPREFETCH is empty except on Windows, where it expands to a (char*) cast
 92 | #define CAST_FOR_MMPREFETCH
 93 | #else
 94 | #define CAST_FOR_MMPREFETCH (char*)
 95 | #endif
 96 | 
 97 | #ifdef VTRACE //With VTRACE the VT_USER_START / VT_USER_END calls are issued while holding global_vt_mutex; without it both macros expand to nothing
 98 | #include <vt_user.h>
 99 | #include <pthread.h>
100 | extern pthread_mutex_t global_vt_mutex;
101 | #define VT_USER_START_A(a) {pthread_mutex_lock(&global_vt_mutex);VT_USER_START(a);pthread_mutex_unlock(&global_vt_mutex);}
102 | #define VT_USER_END_A(a) {pthread_mutex_lock(&global_vt_mutex);VT_USER_END(a);pthread_mutex_unlock(&global_vt_mutex);}
103 | #else
104 | #define VT_USER_START_A(a)
105 | #define VT_USER_END_A(a)
106 | #endif
107 | 
108 | #define mcat(a, b) a ## b //Pastes the two tokens exactly as written
109 | #define mxcat(a, b) mcat(a, b) //Pastes the two tokens after macro-expanding the arguments
110 | 
111 | #define str(s) xstr(s) //Stringifies s after macro expansion
112 | #define xstr(s) #s //Stringifies s exactly as written
113 | 
114 | #define PASS_ARG(arg) arg
115 | #define COMMA , //Lets a comma travel through a macro argument without splitting it
116 | #define EMPTY
117 | 
118 | #define RED "\033[22;31m" //ANSI escape sequences used to color terminal output
119 | #define BOLDRED "\033[1m\033[31m"
120 | #define BOLDBLACK "\033[1m\033[30m"
121 | #define RESET "\033[0m"
122 | 
123 | #define COMPARE_GENERAL(a, b) ((a) != (b)) //Nonzero when the two values differ
124 | #define COMPARE_STRING(a, b) (strcmp(a, b)) //Nonzero when the two strings differ
125 | //PRINT_CONFIG_BASE: if oldConfig is set, prints "old changed to new" for name1 only when compare(old, new) is nonzero (new value in bold red); otherwise prints the current value name2_conf. Expects oldConfig and STD_OUT in scope.
126 | #define PRINT_CONFIG_BASE(name1, type, type2, name2_old, name2_new, name2_conf, compare) \
127 | 	{ \
128 | 		if (oldConfig) \
129 | 		{ \
130 | 			if (compare((name2_old), (name2_new))) \
131 | 				fprintf(STD_OUT, "%35s: " type " changed to " BOLDRED type RESET "\n", name1, (type2) (name2_old), (type2) (name2_new)); \
132 | 		} \
133 | 		else \
134 | 		{ \
135 | 		     fprintf(STD_OUT, "%35s: " type "\n", name1, (type2) name2_conf); \
136 | 		} \
137 | 	}
138 | //PRINT_CONFIG_BASE_WRAP: formats the label from the stringified name1 (name1param may append sprintf arguments) into a 256 byte buffer and forwards to PRINT_CONFIG_BASE; when hide1 / hide2 is true, hide1val / hide2val replaces the old / new value
139 | #define PRINT_CONFIG_BASE_WRAP(name1, name2, name1param, type, type2, conf, hide1, hide2, hide1val, hide2val) \
140 | 	{ \
141 | 		char tmpBuffer[256]; \
142 | 		sprintf(tmpBuffer, str(name1) name1param); \
143 | 		PRINT_CONFIG_BASE(tmpBuffer, type, type2, (hide1) ? (hide1val) : oldConfig->name2, (hide2) ? (hide2val) : newConfig->name2, conf->name2, COMPARE_GENERAL) \
144 | 	}
145 | 
146 | //PRINT_CONFIG_BASE_THIS: prints conf->name2 under the stringified name1, but only when oldConfig is NULL (no comparison output)
147 | #define PRINT_CONFIG_BASE_THIS(name1, name2, name1param, type, type2, conf) \
148 | 	{ \
149 | 		char tmpBuffer[256]; \
150 | 		sprintf(tmpBuffer, str(name1) name1param); \
151 | 		if (oldConfig == NULL) fprintf(STD_OUT, "%35s: " type "\n", tmpBuffer, (type2) conf->name2); \
152 | 	}
153 | //Typed front ends: INT / CHAR / DOUBLE forward to PRINT_CONFIG_BASE_WRAP with a matching printf format; STRING maps NULL strings to "" and compares via COMPARE_STRING
154 | #define PRINT_CONFIG_INT(name) PRINT_CONFIG_BASE_WRAP(name, name, EMPTY, "%5d", int, myConfig, 0, 0, 0, 0)
155 | #define PRINT_CONFIG_CHAR(name) PRINT_CONFIG_BASE_WRAP(name, name, EMPTY, "%5c", char, myConfig, 0, 0, 0, 0)
156 | #define PRINT_CONFIG_DOUBLE(name) PRINT_CONFIG_BASE_WRAP(name, name, EMPTY, "%2.3f", double, myConfig, 0, 0, 0, 0)
157 | #define PRINT_CONFIG_STRING(name) \
158 | 	{ \
159 | 		const char* strEmpty = ""; \
160 | 		const char* str1 = (myConfig->name ? myConfig->name : strEmpty); \
161 | 		const char* str2 = (oldConfig && oldConfig->name ? oldConfig->name : strEmpty); \
162 | 		const char* str3 = (newConfig && newConfig->name ? newConfig->name : strEmpty); \
163 | 		PRINT_CONFIG_BASE(str(name), "%5s", char*, str2, str3, str1, COMPARE_STRING) \
164 | 	}
165 | 
166 | #define PRINT_CONFIG_INT_THIS(name) PRINT_CONFIG_BASE_THIS(name, name, EMPTY, "%5d", int, this) //Prints the integer member name of the current object
167 | //PRINT_CONFIG_LOOP_INT: prints name[i] for i up to the larger of the old / new loopvar; entries at or beyond a config's own loopvar are reported as -1
168 | #define PRINT_CONFIG_LOOP_INT(name, loopvar) \
169 | 	{ \
170 | 		for (int i = 0;i < mymax(oldConfig ? oldConfig->loopvar : 0, newConfig->loopvar);i++) \
171 | 		{ \
172 | 			PRINT_CONFIG_BASE_WRAP(name[%d], name[i], PASS_ARG(COMMA) i, "%5d", int, myConfig, oldConfig && oldConfig->loopvar <= i, newConfig->loopvar <= i, -1, -1) \
173 | 		} \
174 | 	}
175 | //CALDGEMM_PREPARE_BACKEND_VARS1: declares blockm / blockn (filled by DGEMM_getblocks for k) and HeightM / HeightN, which are the remainder gpu_m % Height (gpu_n % Height) for the block with index gpu_m / Height (gpu_n / Height) and Config->Height for all others
176 | #define CALDGEMM_PREPARE_BACKEND_VARS1 \
177 | 	size_t blockm, blockn; \
178 | 	DGEMM_getblocks(k, blockm, blockn); \
179 | 	const size_t HeightM = ((blockm == gpu_m / Config->Height) ? (gpu_m % Config->Height) : Config->Height); \
180 | 	const size_t HeightN = ((blockn == gpu_n / Config->Height) ? (gpu_n % Config->Height) : Config->Height);
181 | //CALDGEMM_PREPARE_BACKEND_VARS2: binds the per-matrix variables (matrix A when iMat is 0, matrix B otherwise), computes the source pointer and destination buffer index, and increments Timers.divideA or Timers.divideB. Expects the names declared by CALDGEMM_PREPARE_BACKEND_VARS1 plus iMat and num_device in scope.
182 | #define CALDGEMM_PREPARE_BACKEND_VARS2 \
183 | 	char myMat                = iMat ? 'B'                           : 'A'; \
184 | 	int& my_next_buffer       = iMat ? next_buffer_B[num_device]     : next_buffer_A[num_device]; \
185 | 	int*& my_buffer_pointers  = iMat ? buffer_pointers_B[num_device] : buffer_pointers_A[num_device]; \
186 | 	size_t& myblock           = iMat ? blockn : blockm; \
187 | 	bool& myTranspose         = iMat ? TransposeB : TransposeA; \
188 | 	const bool myKernelTranspose = iMat ? KernelSettings.transposeB : KernelSettings.transposeA; \
189 | 	const size_t& myHeight    = iMat ? HeightN : HeightM; \
190 | 	const size_t pitch        = iMat ? B_pitch : A_pitch; \
191 | 	double* src_ptr           = iMat ? \
192 | 		(B + blockn * Config->Height * (myTranspose ? B_pitch : 1)) : \
193 | 		(A + blockm * Config->Height * (myTranspose ? 1 : A_pitch)); \
194 | 	const bool access_bbuffers = (bool) (!DGEMM_favor_m && buffersSufficiant0) ^ (bool) iMat; \
195 | 	const int destbuffer       = access_bbuffers ? \
196 | 		((!iMat || buffersSufficiant) ? (my_buffer_pointers[myblock] % ((iMat || buffersSufficiant) ? bbuffers[num_device] : ibuffercount)) : (my_next_buffer % ibuffercount)) : \
197 | 		my_next_buffer % ibuffercount; \
198 | 	if (iMat) Timers.divideB++; else Timers.divideA++;
199 | 
200 | #define PREALLOC_ALTERNATE_LOOKAHEAD 4
201 | 
202 | #endif
203 | 


--------------------------------------------------------------------------------
/caldgemm_config.sample:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Compile time configuration of the CALDGEMM library.
  3 |  *
  4 |  * Copyright 2015:
  5 |  *  - David Rohr (drohr@jwdt.org)
  6 |  *  - Matthias Bach (bach@compeng.uni-frankfurt.de)
  7 |  *  - Matthias Kretz (kretz@compeng.uni-frankfurt.de)
  8 |  *
  9 |  * This file is part of CALDGEMM.
 10 |  *
 11 |  * CALDGEMM is free software: you can redistribute it and/or modify
 12 |  * it under the terms of the GNU Lesser General Public License as published by
 13 |  * the Free Software Foundation, either version 3 of the License, or
 14 |  * (at your option) any later version.
 15 |  *
 16 |  * CALDGEMM is distributed in the hope that it will be useful,
 17 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 18 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 19 |  * GNU Lesser General Public License for more details.
 20 |  *
 21 |  * You should have received a copy of the GNU Lesser General Public License
 22 |  * along with CALDGEMM.  If not, see <http://www.gnu.org/licenses/>.
 23 |  */
 24 | 
 25 | //CAL DGEMM Kernel Settings
 26 | #define CALDGEMM_TRANSPOSED_A					//Use Kernel for transposed A Matrix
 27 | //#define CALDGEMM_TRANSPOSED_B					//Use Kernel for transposed B Matrix
 28 | //#define CALDGEMM_88							//8x8 tiling (implies memexport)
 29 | //#define CALDGEMM_84							//8x4 tiling (implies memexport)
 30 | //#define CALDGEMM_48							//4x8 tiling (implies memexport)
 31 | #define CALDGEMM_44								//4x4 tiling
 32 | //#define CALDGEMM_USE_MEMEXPORT				//Use Memexport for output instead of color buffers
 33 | //#define CALDGEMM_COMPUTE_SHADER 64			//Use compute shader, define compute group size
 34 | //#define CALDGEMM_DIAGONAL_TEXTURE				//Alternate storage format, only valid for 4x4 kernel, obsolete
 35 | #define CALDGEMM_DUAL_ENTRY						//Unroll factor of 2 for 4x4 tiling
 36 | //#define CALDGEMM_SINGLE_BUFFER				//Use a single buffer, 4x4 tiling a transposed, experimental
 37 | //#define CALDGEMM_SINGLE_BUFFER_IMPROVED		//Alternative access scheme for single buffer, experimental
 38 | //#define CALDGEMM_DOUBLE_BUFFERS					//Double number of buffers, 4x4 tiling a transposed, experimental
 39 | #define CALDGEMM_LATE_EXIT_CONDITION			//Put exit condition at end of while loop
 40 | #define CALDGEMM_SHIFT_TEXTURE 1				//Shift even numbered rows in texture by n pixels
 41 | //#define CALDGEMM_44_BT_64						//64 bit DMA transfers for 4x4 B transposed kernel
 42 | //#define CALDGEMM_44_BT_64_CONVERT				//Perform 64 bit DMA transfer but transform to 128 bit for kernel input
 43 | 
 44 | //Other Settings
 45 | //#define TESTMODE								//Activate Test Mode for debugging
 46 | //#define CALDGEMM_LOOP_DETECTION				//Enable loop detection
 47 | //#define TEST_KERNEL
 48 | //#define TEST_PARAMETERS
 49 | //#define CALDGEMM_UNALIGNED_ADDRESSES
 50 | 
 51 | #ifndef STD_OUT
 52 | #define STD_OUT stdout							//Output for all messages
 53 | #endif
 54 | 
 55 | #define CALDGEMM_OUTPUT_THREADS 1				//Number of Output threads
 56 | #define CALDGEMM_OUTPUT_THREADS_SLOW 2			//Number of output threads when KeepBuffersMapped = false
 57 | #define CALDGEMM_EXTRA_OUTPUT_THREADS_LINPACK 0	//Number of additional output threads when running in linpack mode
 58 | #define REUSE_BBUFFERS							//Allocate many BBuffers on the GPU so B is not necessarily retransferred, used for A as well
 59 | //#define WASTE_MEMORY							//Allocate extra memory before and after every memory segment allocated
 60 | //#define CALDGEMM_BENCHMARK_KERNEL 1
 61 | 
 62 | //#define DEBUG_MSG_ALLOCATION					//Debug Messages considering GPU buffer allocation when in Debug = true
 63 | //#define DEBUG_MSG_TIMED						//Add timestamps to all messages
 64 | 
 65 | //#define CALDGEMM_SGEMM						//Experimental SGEMM implementation (requires MemExport)
 66 | //#define CALDGEMM_IGEMM						//Experimental IGEMM implementation (Integer instead of single) (requires SGEMM)
 67 | //#define CALDGEMM_BGEMM						//Experimental
 68 | 
 69 | #define CALDGEMM_MIN_TILE_DIM 32                              //Tile Dimension must be multiple of this
 70 | #define CALDGEMM_MIN_TILE_DIM2 128                            //Min dimension of a tile
 71 | #define CALDGEMM_MIN_CORRECTION_SIZE 768                      //Min tile size used to calculate correction ratio for tile distribution
 72 | 
 73 | //#define CALDGEMM_FORCE_K 16					//Force K Parameter to simulate different kernel performance
 74 | 
 75 | #define _NO_AMD_CPU								//Set to run on CPU without 3dnow (which nowadays also include AMD CPUs)
 76 | #define _NO_AVX									//Do not use AVX instructions (Only relevant for OpenCL code atm)
 77 | #define _NO_ADL									//Do not use ADL library to read GPU temps
 78 | //#define _NO_AFFINITY							//Disable affinity setting
 79 | //#define USE_OLD_HUGE_MALLOC					//Use old method to allocate huge tables
 80 | //#define VTRACE
 81 | 
 82 | #define CALDGEMM_USE_VEC_MEMCPY_PREFETCH		//Use prefetching in Divide / Merge Buffer
 83 | #define CALDGEMM_STREAMING_STORES_DIVIDE		//Use streaming stores in Divide Buffer
 84 | #define CALDGEMM_STREAMING_STORES_MERGE			//Use streaming stores in Merge buffer
 85 | #define CALDGEMM_PREFETCH_MERGE_STORES			//Use prefetching in Merge buffer even when using streaming stores
 86 | #define CALDGEMM_MERGE_NOPS 20					//Add nops to slow down merge process freeing resources for other tasks
 87 | //#define CALDGEMM_MERGE_FLUSH				
 88 | 
 89 | //#define CALDGEMM_LDAB_INC 1					//Inc for LDA and LDB to avoid bank conflicts
 90 | //#define CALDGEMM_LDB_INC 0					//Override LDAB_INC for LDB
 91 | //#define CALDGEMM_LDC_INC 0					//see above
 92 | 
 93 | //#define CALDGEMM_DIVIDE_STATIC_BUFFER			//Allocate tmpBuffer for divide statically once and for all
 94 | #define CALDGEMM_DIVIDE_BLOCKING 128			//Blocking size for divideBuffer with SHIFT_TEXTURE = 1 (larger multiple of two)
 95 | //#define CALDGEMM_DIVIDE_TRANSPOSE_TWOPHASE	//Perform dividebuffer transposition in two phases such that fewer write combining buffers are used, Only works for 2 input buffers per matrix with A transposed!
 96 | #define CALDGEMM_TRANSPOSE_BLOCKING 8			//Blocking factor for the transposition (multiple of 2)
 97 | 
 98 | //#define CALDGEMM_QUERY_ALL_EVENTS				//Query for all events, not only the last one in a queue
 99 | //#define CALDGEMM_USE_CAL_WAIT_FOR_EVENTS		//Use different method for querying CAL events
100 | //#define CALDGEMM_USE_CAL_WAIT_FOR_EVENTS_NO_POLL	//Do not use active wait to reduce CPU utilization
101 | 
102 | //Settings for integrated OpenCL kernels, 3rd party kernels from -Ol library must override this
103 | #define OCL_TILING_X 4
104 | #define OCL_TILING_Y 4
105 | #define OCL_TILED_KERNEL
106 | #define OCL_USE_SIMPLE_BUFFERS
107 | #define OCL_GROUP_SIZE_X 8
108 | #define OCL_GROUP_SIZE_Y 8
109 | 
110 | //Custom header files for optimized height parameters
111 | //#define CALDGEMM_CUSTOM_AUTO_HEIGHT "auto_height.h"		//Can define a custom header file that is included in caldgemm, that handles autoheight feature
112 | //#define CALDGEMM_CUSTOM_HEIGHT_MOD "height_mod.h"		//Same for posterior height adaptation
113 | 
114 | //#define CALDGEMM_OPENCL_EMULATE_STRIDED		//Emulate strided transfers in OpenCL via linear transfers
115 | #define CALDGEMM_OPENCL_USE_ORIGINAL_POINTERS	//Use the original pointers returned by clEnqueueMapBuffer for the DMA transfers and supply an origin parameter for the correct offset
116 | #define CALDGEMM_OPENCL_PROFILED_PIPELINE 0	//Use a profiling command queue to get timing information in pipelined runs.
117 | 


--------------------------------------------------------------------------------
/caldgemm_config_load.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Utility header to complete configuration given in caldgemm_config.h
  3 |  *
  4 |  * Copyright 2015:
  5 |  *  - David Rohr (drohr@jwdt.org)
  6 |  *  - Matthias Bach (bach@compeng.uni-frankfurt.de)
  7 |  *  - Matthias Kretz (kretz@compeng.uni-frankfurt.de)
  8 |  *
  9 |  * This file is part of CALDGEMM.
 10 |  *
 11 |  * CALDGEMM is free software: you can redistribute it and/or modify
 12 |  * it under the terms of the GNU Lesser General Public License as published by
 13 |  * the Free Software Foundation, either version 3 of the License, or
 14 |  * (at your option) any later version.
 15 |  *
 16 |  * CALDGEMM is distributed in the hope that it will be useful,
 17 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 18 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 19 |  * GNU Lesser General Public License for more details.
 20 |  *
 21 |  * You should have received a copy of the GNU Lesser General Public License
 22 |  * along with CALDGEMM.  If not, see <http://www.gnu.org/licenses/>.
 23 |  */
 24 | 
 25 | #include "caldgemm_config.h"
 26 | 
 27 | #ifdef CALDGEMM_COMPUTE_SHADER //Selecting the compute shader also enables memexport output
 28 | #define CALDGEMM_USE_MEMEXPORT
 29 | #endif
 30 | 
 31 | 
 32 | #ifdef CALDGEMM_44_BT_64 //Forces 4x4 tiling with B transposed: the 88 / 84 / 48 settings and TRANSPOSED_A are removed
 33 | #ifdef CALDGEMM_88
 34 | #undef CALDGEMM_88
 35 | #endif
 36 | #ifdef CALDGEMM_84
 37 | #undef CALDGEMM_84
 38 | #endif
 39 | #ifdef CALDGEMM_48
 40 | #undef CALDGEMM_48
 41 | #endif
 42 | #ifdef CALDGEMM_TRANSPOSED_A
 43 | #undef CALDGEMM_TRANSPOSED_A
 44 | #endif
 45 | #define CALDGEMM_TRANSPOSED_B
 46 | #define CALDGEMM_44
 47 | #ifndef CALDGEMM_44_BT_64_CONVERT //Without the CONVERT variant, CALDGEMM_44_BT_64_KERNEL is set
 48 | #define CALDGEMM_44_BT_64_KERNEL
 49 | #endif
 50 | #endif
 51 | 
 52 | #ifdef CALDGEMM_88 //8x8 tiling is expressed as 8x4 and 4x8 together
 53 | #define CALDGEMM_84
 54 | #define CALDGEMM_48
 55 | #endif
 56 | 
 57 | #if defined(CALDGEMM_84) | defined(CALDGEMM_48) //Any 8x tiling builds on the 4x4 path, uses memexport, requires A transposed and B not transposed
 58 | #define CALDGEMM_44
 59 | #define CALDGEMM_USE_MEMEXPORT
 60 | #ifndef CALDGEMM_TRANSPOSED_A
 61 | #define CALDGEMM_TRANSPOSED_A
 62 | #warning Setting CALDGEMM_TRANSPOSED_A for 8x?/?x8 CAL tiling
 63 | #endif
 64 | #ifdef CALDGEMM_TRANSPOSED_B
 65 | #warning Unsetting CALDGEMM_TRANSPOSED_B for 8x?/?x8 CAL tiling
 66 | #undef CALDGEMM_TRANSPOSED_B
 67 | #endif
 68 | #endif
 69 | 
 70 | #ifdef CALDGEMM_44 //With 4x4 tiling exactly one of TRANSPOSED_A / TRANSPOSED_B stays set; TRANSPOSED_B wins when given
 71 | #ifdef CALDGEMM_TRANSPOSED_B
 72 | #ifdef CALDGEMM_TRANSPOSED_A
 73 | #warning Unsetting CALDGEMM_TRANSPOSED_A for != 8x2 CAL tiling
 74 | #undef CALDGEMM_TRANSPOSED_A
 75 | #endif
 76 | #else
 77 | #ifndef CALDGEMM_TRANSPOSED_A
 78 | #warning Setting CALDGEMM_TRANSPOSED_A for != 8x2 CAL tiling
 79 | #define CALDGEMM_TRANSPOSED_A
 80 | #endif
 81 | #endif
 82 | #endif
 83 | 
 84 | #if defined(CALDGEMM_DIAGONAL_TEXTURE) & (!defined(CALDGEMM_44) | defined(CALDGEMM_84) | defined(CALDGEMM_48) | !defined(CALDGEMM_TRANSPOSED_A)) //DIAGONAL_TEXTURE is silently dropped unless plain 4x4 tiling with A transposed is active
 85 | #undef CALDGEMM_DIAGONAL_TEXTURE
 86 | #endif
 87 | 
 88 | #if defined(CALDGEMM_DUAL_ENTRY) & (!defined(CALDGEMM_44) | defined(CALDGEMM_84) | defined(CALDGEMM_48) | !defined(CALDGEMM_TRANSPOSED_A)) //Same condition for DUAL_ENTRY
 89 | #undef CALDGEMM_DUAL_ENTRY
 90 | #endif
 91 | 
 92 | #if defined(CALDGEMM_SINGLE_BUFFER) | defined(CALDGEMM_DOUBLE_BUFFERS) //These two need plain 4x4 tiling with DUAL_ENTRY and without TRANSPOSED_B; anything else stops the build
 93 | #if !defined(CALDGEMM_44) | defined(CALDGEMM_48) | defined(CALDGEMM_84) | !defined(CALDGEMM_DUAL_ENTRY) | defined(CALDGEMM_TRANSPOSED_B)
 94 | #error Invalid options for CALDGEMM_SINGLE_BUFFER/CALDGEMM_DOUBLE_BUFFERS
 95 | #endif
 96 | #endif
 97 | 
 98 | 
 99 | #if defined(CALDGEMM_48) | !defined(CALDGEMM_44) //TILING_Y is 8 for the 4x8 setting or when 4x4 tiling is not selected, 4 otherwise
100 | #define TILING_Y 8
101 | #else
102 | #define TILING_Y 4
103 | #endif
104 | 
105 | #if defined(CALDGEMM_84) //TILING_X is 8 for the 8x4 setting, 4 for the remaining 4x4 based settings, 2 otherwise
106 | #define TILING_X 8
107 | #elif defined(CALDGEMM_44)
108 | #define TILING_X 4
109 | #else
110 | #define TILING_X 2
111 | #endif
112 | 
113 | #ifdef CALDGEMM_LDAB_INC //LDAB_INC sets LDA_INC and, unless LDB_INC was given explicitly, LDB_INC as well
114 | #define CALDGEMM_LDA_INC CALDGEMM_LDAB_INC
115 | #ifndef CALDGEMM_LDB_INC
116 | #define CALDGEMM_LDB_INC CALDGEMM_LDAB_INC
117 | #endif
118 | #endif
119 | 
120 | #ifdef CALDGEMM_SHIFT_TEXTURE //With texture shifting both increments end up at least CALDGEMM_SHIFT_TEXTURE: smaller or missing values are replaced by it
121 | #if defined(CALDGEMM_LDA_INC) & CALDGEMM_LDA_INC < CALDGEMM_SHIFT_TEXTURE
122 | #undef CALDGEMM_LDA_INC
123 | #endif
124 | #if defined(CALDGEMM_LDB_INC) & CALDGEMM_LDB_INC < CALDGEMM_SHIFT_TEXTURE
125 | #undef CALDGEMM_LDB_INC
126 | #endif
127 | #ifndef CALDGEMM_LDA_INC
128 | #define CALDGEMM_LDA_INC CALDGEMM_SHIFT_TEXTURE
129 | #endif
130 | #ifndef CALDGEMM_LDB_INC
131 | #define CALDGEMM_LDB_INC CALDGEMM_SHIFT_TEXTURE
132 | #endif
133 | #endif
134 | 
135 | #if defined(CALDGEMM_SINGLE_BUFFER_IMPROVED) & !defined(CALDGEMM_SINGLE_BUFFER) //The improved access scheme only applies on top of CALDGEMM_SINGLE_BUFFER
136 | #undef CALDGEMM_SINGLE_BUFFER_IMPROVED //Drop the dependent option (undefining the absent CALDGEMM_SINGLE_BUFFER here would have no effect)
137 | #endif
138 | 
139 | #ifdef CALDGEMM_DIVIDE_STATIC_BUFFER //CALDGEMM_DIVBUFA appends a tmpBuffer parameter to a parameter list and CALDGEMM_DIVBUFB the matching argument; both are empty without the static buffer
140 | #ifdef _WIN32
141 | #define CALDGEMM_DIVBUFA ,double* tmpBuffer
142 | #else
143 | #define CALDGEMM_DIVBUFA ,double* __restrict__ tmpBuffer
144 | #endif
145 | #define CALDGEMM_DIVBUFB , tmpBuffer
146 | #else
147 | #define CALDGEMM_DIVBUFA
148 | #define CALDGEMM_DIVBUFB
149 | #endif
150 | 


--------------------------------------------------------------------------------
/caldgemm_cpu.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * CPU side of CALDGEMM implementation.
  3 |  *
  4 |  * Copyright 2015:
  5 |  *  - David Rohr (drohr@jwdt.org)
  6 |  *  - Matthias Bach (bach@compeng.uni-frankfurt.de)
  7 |  *  - Matthias Kretz (kretz@compeng.uni-frankfurt.de)
  8 |  *
  9 |  * This file is part of CALDGEMM.
 10 |  *
 11 |  * CALDGEMM is free software: you can redistribute it and/or modify
 12 |  * it under the terms of the GNU Lesser General Public License as published by
 13 |  * the Free Software Foundation, either version 3 of the License, or
 14 |  * (at your option) any later version.
 15 |  *
 16 |  * CALDGEMM is distributed in the hope that it will be useful,
 17 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 18 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 19 |  * GNU Lesser General Public License for more details.
 20 |  *
 21 |  * You should have received a copy of the GNU Lesser General Public License
 22 |  * along with CALDGEMM.  If not, see <http://www.gnu.org/licenses/>.
 23 |  */
 24 | 
 25 | #include "caldgemm_cpu.h"
 26 | 
 27 | caldgemm_cpu::caldgemm_cpu() : caldgemm() //Only runs the base class constructor, no state of its own
 28 | {
 29 | }
 30 | 
 31 | caldgemm_cpu::~caldgemm_cpu() //Nothing to release in this class
 32 | {
 33 | }
 34 | 
 35 | int caldgemm_cpu::WaitForEvent(int a, int b, int) //Never waits: only logs the skipped wait (b = device, a = obuffer) in debug mode, always returns 0
 36 | {
 37 | 	if (Config->Debug) {fprintf(STD_OUT, "\tSkipping waiting for event from device %d obuffer %d...\n", b, a);}
 38 | 	return 0;
 39 | }
 40 | 
 41 | int caldgemm_cpu::Initialize(bool nocalinit) //Announces the CPU runtime and records that no GPU device is in use; nocalinit is ignored; always returns 0
 42 | {
 43 | 	if (!Config->Quiet) fprintf(STD_OUT, "Initializing CALDGEMM (CPU Runtime)\n");
 44 | 	if (Config->Debug) fprintf(STD_OUT, "CALDGEMM_CPU Initialize\n"); //Debug tag spelled like the function name, as in the other methods of this file
 45 | 
 46 | 	nDevices = 0;
 47 | 	gpu_available = 0;
 48 | 
 49 | 	return(0);
 50 | }
 51 | 
 52 | int caldgemm_cpu::ValidateRuntime() //Clears Config->GPU_C, applies the default kernel settings and returns 0
 53 | {
 54 | 	if (Config->Debug) {fprintf(STD_OUT, "CALDGEMM_CPU ValidateRuntime\n");}
 55 | 	Config->GPU_C = false;
 56 | 	SetDefaultKernelSettings();
 57 | 	return 0;
 58 | }
 59 | 
 60 | int caldgemm_cpu::CheckDevices() //Nothing to check: logs in debug mode and returns 0
 61 | {
 62 | 	if (Config->Debug) {fprintf(STD_OUT, "CALDGEMM_CPU CheckDevices\n");}
 63 | 	return 0;
 64 | }
 65 | 
 66 | int caldgemm_cpu::InitDevices() //Nothing to initialize: logs in debug mode and returns 0
 67 | {
 68 | 	if (Config->Debug)
 69 | 		fprintf(STD_OUT, "CALDGEMM_CPU InitDevices\n");
 70 | 	return 0;
 71 | }
 72 | 
 73 | int caldgemm_cpu::ReinitDevices() //Nothing to reinitialize: logs in debug mode and returns 0
 74 | {
 75 | 	if (Config->Debug) {fprintf(STD_OUT, "CALDGEMM_CPU ReinitDevices\n");}
 76 | 	return 0;
 77 | }
 78 | 
 79 | int caldgemm_cpu::InitConstantData(double alpha) //alpha is not used here: logs in debug mode and returns 0
 80 | {
 81 | 	if (Config->Debug) {fprintf(STD_OUT, "CALDGEMM_CPU InitConstantData\n");}
 82 | 	return 0;
 83 | }
 84 | 
 85 | int caldgemm_cpu::ExecuteKernels(caldgemm::DGEMMPrepareAndExecuteTask& Task, int blockm, int blockn)
 86 | {
 87 | 	if (Config->Debug) fprintf(STD_OUT, "CALDGEMM_CPU ExecuteKernels\n");
 88 | 
 89 | 	fprintf(STD_OUT, "Error: DGEMMPrepareAndExecuteTask shoul never be executed for CALDGEMM_CPU\n");
 90 | 
 91 | 	return(1);
 92 | }
 93 | 
 94 | int caldgemm_cpu::ExitRuntime() //Nothing to shut down: logs in debug mode and returns 0
 95 | {
 96 | 	if (Config->Debug)
 97 | 		fprintf(STD_OUT, "CALDGEMM_CPU ExitRuntime\n");
 98 | 	return 0;
 99 | }
100 | 
101 | int caldgemm_cpu::FetchResult(int device, int j, int m, int n, int mustlock) //No arguments are used: logs in debug mode and returns 0
102 | {
103 | 	if (Config->Debug) {fprintf(STD_OUT, "CALDGEMM_CPU FetchResult\n");}
104 | 	return 0;
105 | }
106 | 
107 | int caldgemm_cpu::CheckDMAQueue(int device, int forcej) //Does nothing and returns 0; both arguments are unused
108 | {
109 | 	return 0;
110 | }
111 | 
112 | int caldgemm_cpu::RunMergeBuffers(double* dst, int device, int j, int width, int height, int gpu_width, int gpu_height, int pitch) //Merges nothing: logs in debug mode and returns 0
113 | {
114 | 	if (Config->Debug) {fprintf(STD_OUT, "CALDGEMM_CPU RunMergeBuffers\n");}
115 | 	return 0;
116 | }
117 | 
118 | int caldgemm_cpu::DGEMM_prepare_backend(size_t k, int j, unsigned int num_device, bool prepareM, bool prepareN, bool buffersSufficiant, bool buffersSufficiant0 CALDGEMM_DIVBUFA)
119 | {
120 | 	if (Config->Debug) fprintf(STD_OUT, "CALDGEMM_CPU DGEMM_prepare k=%lld j=%d device=%d\n", (long long int) k, j, num_device);
121 | 	
122 | 	return(0);
123 | }
124 | 
125 | int caldgemm_cpu::ExitDevices()
126 | {
127 | 	if (Config->Debug) fprintf(STD_OUT, "CALDGEMM_CPU ExitDevices\n");
128 | 
129 | 	return(0);
130 | }
131 | 
132 | int caldgemm_cpu::UseOutputPthreads() {return(0);}
133 | int caldgemm_cpu::UseInputPthreads() {return(0);}
134 | int caldgemm_cpu::UseMutexPerDevice() {return(0);}
135 | 


--------------------------------------------------------------------------------
/caldgemm_cpu.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Interface of the CALDGEMM library.
 3 |  *
 4 |  * Copyright 2015:
 5 |  *  - David Rohr (drohr@jwdt.org)
 6 |  *  - Matthias Bach (bach@compeng.uni-frankfurt.de)
 7 |  *  - Matthias Kretz (kretz@compeng.uni-frankfurt.de)
 8 |  *
 9 |  * This file is part of CALDGEMM.
10 |  *
11 |  * CALDGEMM is free software: you can redistribute it and/or modify
12 |  * it under the terms of the GNU Lesser General Public License as published by
13 |  * the Free Software Foundation, either version 3 of the License, or
14 |  * (at your option) any later version.
15 |  *
16 |  * CALDGEMM is distributed in the hope that it will be useful,
17 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 |  * GNU Lesser General Public License for more details.
20 |  *
21 |  * You should have received a copy of the GNU Lesser General Public License
22 |  * along with CALDGEMM.  If not, see <http://www.gnu.org/licenses/>.
23 |  */
24 | 
25 | #ifndef caldgemm_cpu_H
26 | #define caldgemm_cpu_H
27 | 
28 | #include "caldgemm.h"
29 | 
// CPU flavour of the caldgemm class.
// The class adds no data members of its own; it only declares a constructor,
// a virtual destructor and a set of private virtual hooks. The definitions
// live outside this header.
class caldgemm_cpu : public caldgemm
{
public:
	caldgemm_cpu();
	virtual ~caldgemm_cpu();

private:
	// Threading / locking policy queries (each returns an int flag).
	virtual int UseOutputPthreads();
	virtual int UseInputPthreads();
	virtual int UseMutexPerDevice();

	// Backend hooks. CALDGEMM_DIVBUFA is a macro that is expanded inside the
	// parameter list; its definition is not part of this header.
	virtual int DGEMM_prepare_backend(size_t k, int j, unsigned int num_device, bool prepareM, bool prepareN, bool buffersSufficiant, bool buffersSufficiant0 CALDGEMM_DIVBUFA);
	virtual	int Initialize (bool nocalinit);
	virtual int ValidateRuntime();
	virtual int CheckDevices();
	virtual int InitDevices();
	virtual int ReinitDevices();
	virtual int InitConstantData(double alpha);
	virtual int ExecuteKernels(caldgemm::DGEMMPrepareAndExecuteTask& Task, int blockm, int blockn);
	virtual int ExitRuntime();
	virtual int ExitDevices();
	virtual int WaitForEvent(int, int, int);
	// mustlock defaults to 0 and forcej defaults to -1 when omitted by the caller.
	virtual int FetchResult(int device, int j, int m, int n, int mustlock = 0);
	virtual int CheckDMAQueue(int device, int forcej = -1);
	virtual int RunMergeBuffers(double* dst, int device, int j, int width, int height, int gpu_width, int gpu_height, int pitch);
};
56 | 
57 | #endif
58 | 


--------------------------------------------------------------------------------
/caldgemm_cuda.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Interface of the CALDGEMM library.
  3 |  *
  4 |  * Copyright 2015:
  5 |  *  - David Rohr (drohr@jwdt.org)
  6 |  *  - Matthias Bach (bach@compeng.uni-frankfurt.de)
  7 |  *  - Matthias Kretz (kretz@compeng.uni-frankfurt.de)
  8 |  *
  9 |  * This file is part of CALDGEMM.
 10 |  *
 11 |  * CALDGEMM is free software: you can redistribute it and/or modify
 12 |  * it under the terms of the GNU Lesser General Public License as published by
 13 |  * the Free Software Foundation, either version 3 of the License, or
 14 |  * (at your option) any later version.
 15 |  *
 16 |  * CALDGEMM is distributed in the hope that it will be useful,
 17 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 18 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 19 |  * GNU Lesser General Public License for more details.
 20 |  *
 21 |  * You should have received a copy of the GNU Lesser General Public License
 22 |  * along with CALDGEMM.  If not, see <http://www.gnu.org/licenses/>.
 23 |  */
 24 | 
 25 | #ifndef caldgemm_cuda_H
 26 | #define caldgemm_cuda_H
 27 | 
 28 | #include <cuda.h>
 29 | #include <cuda_runtime_api.h>
 30 | #ifdef CALDGEMM_CUDA_CUBLAS
 31 | #include <cublas_v2.h>
 32 | #endif
 33 | #include "caldgemm.h"
 34 | 
// CUDA flavour of the caldgemm class.
// This header declares the private virtual hooks plus the per-device arrays of
// CUDA streams, events and buffer pointers. All member functions are defined
// outside this header.
class caldgemm_cuda : public caldgemm
{
public:
	caldgemm_cuda();
	virtual ~caldgemm_cuda();

private:
	// Threading / locking policy queries (each returns an int flag).
	virtual int UseOutputPthreads();
	virtual int UseInputPthreads();
	virtual int UseMutexPerDevice();

	// Backend hooks. CALDGEMM_DIVBUFA is a macro that is expanded inside the
	// parameter list; its definition is not part of this header.
	virtual int DGEMM_prepare_backend(size_t k, int j, unsigned int num_device, bool prepareM, bool prepareN, bool buffersSufficiant, bool buffersSufficiant0 CALDGEMM_DIVBUFA);
	virtual	int Initialize (bool nocalinit);
	virtual int ValidateRuntime();
	virtual int CheckDevices();
	virtual int InitDevices();
	virtual int ReinitDevices();
	virtual int InitConstantData(double alpha);
	virtual int ExecuteKernels(caldgemm::DGEMMPrepareAndExecuteTask& Task, int blockm, int blockn);
	virtual int ExitRuntime();
	virtual int ExitDevices();
	virtual int WaitForEvent(int, int, int);
	virtual int FetchResult(int device, int j, int m, int n, int mustlock = 0);
	virtual int CheckDMAQueue(int device, int forcej = -1);
	virtual int RunMergeBuffers(double* dst, int device, int j, int width, int height, int gpu_width, int gpu_height, int pitch);
	virtual int RunCALDGEMM_Init();
	virtual int RunCALDGEMM_Exit();

	// Memory management hooks. nDoubles is a count of doubles, not bytes
	// (as the parameter name indicates).
	virtual double* AllocMemory(size_t nDoubles, bool page_locked, bool huge_pages, bool gpuaccessible = false, bool interleave = false);
	virtual int FreeMemory(double* ptr, bool gpuaccessible = false);
	virtual int Preallocate();
	virtual int PreallocateFree();
	virtual int SimpleQueuingAvailable();
	
	// Bookkeeping for the "simple queue" mode.
	// NOTE(review): mb / nb look like block counts in m and n direction — confirm in the .cu file.
	void SetupSimpleQueue(size_t mb, size_t nb);
	// A CUDA event together with the index of a queue.
	struct caldgemm_cuda_simple_queue_event
	{
		cudaEvent_t event;
		int num_queue;
	};
	caldgemm_cuda_simple_queue_event* simple_queue_events[max_devices][2]; //2 for m and n direction
	bool* simple_queue_event_requested[max_devices][obuffercount][2];
	cudaEvent_t simple_queue_event_kernels[max_devices][ibuffercount][obuffercount];
	bool simple_queue_event_kernels_used[max_devices][ibuffercount][obuffercount];
	// A CUDA event plus a flag telling whether the event is in use.
	struct alternateSimpleQueueCBuffferEventStruct
	{
		cudaEvent_t event;
		bool used;
	};
	cudaEvent_t alternateSimpleQueueCopyCEvent[max_devices][obuffercount];
	alternateSimpleQueueCBuffferEventStruct alternateSimpleQueueCBuffferEvent[max_devices][obuffercount];
	cudaEvent_t alternateSimpleQueueEvent_tmp_abuffers[max_devices][obuffercount];
	cudaEvent_t alternateSimpleQueueEvent_tmp_bbuffers[max_devices][obuffercount];
	bool alternateSimpleQueueEvent_tmp_abuffers_used[max_devices][obuffercount];
	bool alternateSimpleQueueEvent_tmp_bbuffers_used[max_devices][obuffercount];
	cudaEvent_t alternateSimpleQueueTmpEvents[2];
	
	// Heap-allocated event array (raw pointer; allocation and release are not visible here).
	cudaEvent_t* AlternateLookaheadTilesRemainingSQ_events;
	virtual int CheckAlternateTilesRemainingSQ();
	qSem AlternateLookaheadDoneMutexSQ;

	// Per-device CUDA handles: device ids, streams, and untyped buffer pointers
	// for the A, B and C buffers and the temporary A/B buffers.
	int cuda_devices[max_devices];
	cudaStream_t cuda_command_queues[max_devices][obuffercount + 2];
	void* cuda_abuffers[max_devices][ibuffercount];
	void* cuda_bbuffers[max_devices][max_bbuffers];
	void* cuda_cbuffers[max_devices][obuffercount];
	void* cuda_tmp_abuffers[max_devices][obuffercount];
	void* cuda_tmp_bbuffers[max_devices][obuffercount];
	cudaEvent_t cuda_events[max_devices][obuffercount];
	// One cuBLAS handle per device, only present when built with CALDGEMM_CUDA_CUBLAS.
#ifdef CALDGEMM_CUDA_CUBLAS
        cublasHandle_t cublas_handles[max_devices];
#endif
	cudaEvent_t cuda_conversion_events[max_devices][2];
	int cuda_conversion_events_use[max_devices][2];

	int WaitForEventAndRelease(cudaEvent_t* pEvent);

	static const int GROUP_SIZE_X = 16, GROUP_SIZE_Y = 16, GROUP_COUNT_X = 16, GROUP_COUNT_Y = 16;	//Group and block size for conversion kernels and for DGEMM kernel
	
	// Parameter bundle describing one conversion kernel invocation.
	struct conversionKernelTaskStruct
	{
		// The default constructor leaves all members uninitialised.
		conversionKernelTaskStruct() {}
		// c3 / c4 are ints that are widened into the size_t width / height members.
		conversionKernelTaskStruct(void* c1, void* c2, int c3, int c4, char c5) : dest_buffer_tmp(c1), dest_image(c2), arg_width(c3), arg_height(c4), myMat(c5) {}
		void* dest_buffer_tmp;
		void* dest_image;
		size_t arg_width;
		size_t arg_height;
		char myMat;
	};
};
125 | 
126 | #endif
127 | 


--------------------------------------------------------------------------------
/caldgemm_opencl.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Interface of the CALDGEMM library.
  3 |  *
  4 |  * Copyright 2015:
  5 |  *  - David Rohr (drohr@jwdt.org)
  6 |  *  - Matthias Bach (bach@compeng.uni-frankfurt.de)
  7 |  *  - Matthias Kretz (kretz@compeng.uni-frankfurt.de)
  8 |  *
  9 |  * This file is part of CALDGEMM.
 10 |  *
 11 |  * CALDGEMM is free software: you can redistribute it and/or modify
 12 |  * it under the terms of the GNU Lesser General Public License as published by
 13 |  * the Free Software Foundation, either version 3 of the License, or
 14 |  * (at your option) any later version.
 15 |  *
 16 |  * CALDGEMM is distributed in the hope that it will be useful,
 17 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 18 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 19 |  * GNU Lesser General Public License for more details.
 20 |  *
 21 |  * You should have received a copy of the GNU Lesser General Public License
 22 |  * along with CALDGEMM.  If not, see <http://www.gnu.org/licenses/>.
 23 |  */
 24 | 
 25 | #ifndef CALDGEMM_OPENCL_H
 26 | #define CALDGEMM_OPENCL_H
 27 | 
 28 | #include <CL/opencl.h>
 29 | 
 30 | #include "caldgemm.h"
 31 | 
 32 | #if !defined(CALDGEMM_TRANSPOSED_A) & !defined(CALDGEMM_TRANSPOSED_B)
 33 | #error You must either defined CALDGEMM_TRANSPOSED_A or CALDGEMM_TRANSPOSED_B for the OpenCL backend
 34 | #endif
 35 | 
 36 | #ifndef _WIN32
 37 | #define HINSTANCE void*
 38 | #endif
 39 | 
// OpenCL flavour of the caldgemm class.
// This header declares a backend-specific configuration class, the private
// virtual hooks, the per-device OpenCL handles, and function pointers that are
// filled from an optional external kernel library. All member functions except
// the inline Clone() and the trivial struct constructors are defined elsewhere.
class caldgemm_opencl : public caldgemm
{
public:
	caldgemm_opencl();
	virtual ~caldgemm_opencl();

	// Backend configuration specific to OpenCL.
	class caldgemm_config_backend_opencl : public caldgemm_config_backend
	{
	public:
		virtual ~caldgemm_config_backend_opencl();
		caldgemm_config_backend_opencl();
		virtual int ParseBackendOptions(unsigned int argc, char** argv);
		virtual void printConfig(caldgemm_config_backend* oldConfig = NULL);
		// Returns a heap-allocated copy made with the implicit copy constructor.
		// NOTE(review): no copy constructor is declared here, so kernelLib (a raw
		// char*) is copied as a pointer — confirm ownership against the destructor.
		virtual caldgemm_config_backend_opencl* Clone() const {return new caldgemm_config_backend_opencl(*this);}

		char* kernelLib;
		bool allowCPUDevice;
	};
	virtual caldgemm_config_backend* create_caldgemm_config_backend();
	
private:
	// Capability / policy queries (each returns an int flag).
	virtual int UseOutputPthreads();
	virtual int UseInputPthreads();
	virtual int UseMutexPerDevice();
	virtual int AllowCPUFallback();
	virtual int SimpleQueuingAvailable();
	virtual int PipelinedModeAvailable();
	virtual int AsyncModeAvailable();

	// Backend hooks. CALDGEMM_DIVBUFA is a macro that is expanded inside the
	// parameter list; its definition is not part of this header.
	virtual int DGEMM_prepare_backend(size_t k, int j, unsigned int num_device, bool prepareM, bool prepareN, bool buffersSufficiant, bool buffersSufficiant0 CALDGEMM_DIVBUFA);
	virtual	int Initialize (bool nocalinit);
	virtual int ValidateRuntime();
	virtual int CheckDevices();
	virtual int InitDevices();
	virtual int ReinitDevices();
	virtual int InitConstantData(double alpha);
	virtual int ExecuteKernels(caldgemm::DGEMMPrepareAndExecuteTask& Task, int blockm, int blockn);
	virtual int ExitRuntime();
	virtual int ExitDevices();
	virtual int WaitForEvent(int, int, int);
	virtual int FetchResult(int device, int j, int m, int n, int mustlock = 0);
	virtual int CheckDMAQueue(int device, int forcej = -1);
	virtual int RunMergeBuffers(double* dst, int device, int j, int width, int height, int gpu_width, int gpu_height, int pitch);
	virtual int RunCALDGEMM_Init();
	virtual int RunCALDGEMM_Exit();
	virtual int Preallocate();
	virtual int PreallocateFree();
	virtual int RunAsyncSingleTileDGEMM(const double* A, const double* B, double* C, double alpha, double beta, size_t m, size_t k, size_t n, size_t Apitch, size_t Bpitch, size_t Cpitch, bool orderColMajor, bool TransA, bool TransB);
	virtual int RunAsyncSingleTileDTRSM(const CBLAS_ORDER Order, const CBLAS_SIDE Side, const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, const size_t M, const size_t N, const double alpha, const double *A, const size_t lda, double *B, const size_t ldb);
	virtual int RunCALDGEMM_Finish();
	virtual int CheckParams();
	virtual int FinishDataInit();
	virtual void FinishDataFill();
	virtual int WaitForCALDGEMMProgress(size_t n);

	// Memory management hooks. nDoubles is a count of doubles, not bytes
	// (as the parameter name indicates).
	virtual double* AllocMemory(size_t nDoubles, bool page_locked, bool huge_pages, bool gpuaccessible = false, bool interleave = false);
	virtual int FreeMemory(double* ptr, bool gpuaccessible = false);
	
	virtual int CaldgemmCustomAutoHeight(size_t MaxGpuM, size_t MaxGpuN, int nDevices);
	virtual int CaldgemmCustomModHeight(size_t MOD_OVER, size_t MOD_GPU);

	void SetupSimpleQueue(size_t mb, size_t nb);

	// OpenCL platform, devices, context, queues, buffers, programs and kernels.
	// NOTE(review): the leading [2] dimension of the buffer arrays is not
	// explained in this header — confirm its meaning in caldgemm_opencl.cpp.
	cl_platform_id ocl_platform;
	cl_device_id ocl_devices[max_devices + 1]; //+1 for cpu
	cl_context ocl_context;
	// At least three queues per device, more if obuffercount exceeds three.
	cl_command_queue ocl_command_queues[max_devices][obuffercount > 3 ? obuffercount : 3];
	cl_command_queue ocl_command_queue_cpu;
	cl_mem ocl_abuffers[2][max_devices][ibuffercount];
	cl_mem ocl_bbuffers[2][max_devices][max_bbuffers];
	cl_mem ocl_cbuffers[2][max_devices][obuffercount];
	// Sized by the larger of ibuffercount and obuffercount.
	cl_mem ocl_tmp_abuffers[2][max_devices][ibuffercount > obuffercount ? ibuffercount : obuffercount];
	cl_mem ocl_tmp_bbuffers[2][max_devices][ibuffercount > obuffercount ? ibuffercount : obuffercount];
	cl_mem ocl_tmp_cbuffers[max_devices][obuffercount];
	cl_event ocl_events[max_devices][obuffercount];
	cl_program ocl_program[5];
	cl_kernel ocl_kernel[max_devices][5];

	// Handles used by the asynchronous single-tile entry points.
	cl_command_queue ocl_async_queue[max_devices];
	cl_kernel ocl_async_kernel[max_devices][4];
	cl_mem ocl_async_buffers[max_devices][4];

	// An OpenCL event together with the index of a queue.
	struct caldgemm_opencl_simple_queue_event
	{
		cl_event event;
		int num_queue;
	};
	
	// OpenCL extension of finishStruct carrying per-device, per-output-buffer marker events.
	struct finishStructOpenCL : public finishStruct
	{
		virtual ~finishStructOpenCL() {}
		
		cl_event StartMarker[max_devices][obuffercount];
		cl_event MidMarker[max_devices][obuffercount];
		cl_event EndMarker[max_devices][obuffercount];
		
		bool MidMarkerDone, EndMarkerDone;
	};
	// Marker events held by the class itself, with the same layout as in finishStructOpenCL.
	cl_event StartMarker[max_devices][obuffercount];
	cl_event MidMarker[max_devices][obuffercount];
	cl_event EndMarker[max_devices][obuffercount];
	bool MidMarkerCreated[max_devices][obuffercount];

	caldgemm_opencl_simple_queue_event* simple_queue_events[max_devices][2]; //2 for m and n direction
	bool* simple_queue_event_requested[max_devices][obuffercount][2];
	cl_event simple_queue_event_kernels[max_devices][ibuffercount][obuffercount];
	bool simple_queue_event_kernels_used[max_devices][ibuffercount][obuffercount];
	
	// An OpenCL event plus two flags: whether it must be released and whether it is in use.
	struct alternateSimpleQueueCBuffferEventStruct
	{
		cl_event event;
		bool must_release;
		bool used;
	};

	// Parameter bundle describing one conversion kernel invocation.
	struct conversionKernelTaskStruct
	{
		// The default constructor leaves all members uninitialised.
		conversionKernelTaskStruct() {}
		conversionKernelTaskStruct(cl_mem c1, cl_mem* c2, int c3, int c4, int c5, cl_event* c6, cl_event c7, cl_event* c8, char c9) : dest_buffer_tmp(c1), dest_image(c2), arg_width(c3), arg_height(c4), arg_transpose(c5), ev(c6), ev2(c7), ev3(c8), myMat(c9) {}
		cl_mem dest_buffer_tmp;
		cl_mem* dest_image;
		int arg_width;
		int arg_height;
		int arg_transpose;
		cl_event* ev;
		cl_event ev2;
		cl_event* ev3;
		char myMat;
	};
	
	cl_event alternateSimpleQueueCopyCEvent[max_devices][obuffercount];
	alternateSimpleQueueCBuffferEventStruct alternateSimpleQueueCBuffferEvent[max_devices][obuffercount];
	cl_event alternateSimpleQueueEvent_tmp_abuffers[max_devices][obuffercount];
	cl_event alternateSimpleQueueEvent_tmp_bbuffers[max_devices][obuffercount];
	bool alternateSimpleQueueEvent_tmp_abuffers_used[max_devices][obuffercount];
	bool alternateSimpleQueueEvent_tmp_bbuffers_used[max_devices][obuffercount];
	
	bool pipelinedModeStartBarrierDone[max_devices][obuffercount];
	// nTransferEvents and freeTransferEvents are passed by reference and may be modified by the callee.
	void pipelinedModeSetStartBarriers(unsigned int num_device, int j, int &nTransferEvents, cl_event* transferEvents, bool &freeTransferEvents);

	// Heap-allocated event array (raw pointer; allocation and release are not visible here).
	cl_event* AlternateLookaheadTilesRemainingSQ_events;
	virtual int CheckAlternateTilesRemainingSQ();
	qSem AlternateLookaheadDoneMutexSQ;

	// Host-side double pointers associated with the temporary A/B/C buffers.
	double* ocl_tmp_abuffers_ptr[max_devices][ibuffercount];
	double* ocl_tmp_bbuffers_ptr[max_devices][ibuffercount];
	double* ocl_tmp_cbuffers_ptr[max_devices][obuffercount];

	cl_event ocl_conversion_events[max_devices][2];
	int ocl_conversion_events_use[max_devices][2];

	// Kernel source strings; they are only declared here and defined in another translation unit.
	static const char *OCLKernel, *OCLKernelALPHA1, *OCLKernelLinpack, *OCLConvertKernel, *OCLConvertKernelTex;

	int WaitForEventAndRelease(cl_event* pEvent, int lock = -1);
	int divideBuffer(double* src, size_t pitch_src, double* dest, size_t nSrcRows, size_t nSrcCols, bool transpose);

	static const int GROUP_SIZE_X = 16, GROUP_SIZE_Y = 16, GROUP_COUNT_X = 16, GROUP_COUNT_Y = 16; //Group size and count for conversion kernels.

	caldgemm_config_backend_opencl* config_backend;

	// Handle of an external kernel library and the entry points looked up in it.
	// On non-Windows builds HINSTANCE is #defined to void* by the enclosing header.
	HINSTANCE kernelLib;
	cl_kernel (*kernelLibCreate) (cl_context* context, int nDevices, cl_device_id* devices, int kernelType, int k, int betazero);
	void (*kernelLibQuerySettings) (int* tiling_x, int* tiling_y, bool* transposeA, bool* transposeB, bool* texture_buffers, int* group_size_x, int* group_size_y, int* min_tile_size, int* min_k);
	void (*kernelLibTerminate) ();
	size_t (*kernelLibSuggestedMaxHeight) ();
	size_t (*kernelLibGetAutoHeight) (size_t MaxGpuM, size_t MaxGpuN, int nDevices, size_t Width);
	void (*kernelLibModHeight) (size_t MOD_OVER, size_t MOD_GPU);
	int (*kernelLibInitialize) (cl_platform_id platform);
	
	cl_event last_device_kernel[max_devices];

public:
	// NOTE(review): presumably maps a host address to its cl_mem object, base
	// pointer and offset — confirm against the definition.
	static int GetMemoryInfo(cl_mem* mem, void** ptr, size_t* offset, const void* addr);

	// Describes one memory region: host pointer, size and the matching cl_mem object.
	struct gpu_mem_struct_opencl
	{
		void* ptr;
		size_t size;
		cl_mem mem_obj;
	};
};
221 | 
222 | #endif
223 | 


--------------------------------------------------------------------------------
/cmodules/affinity.cpp:
--------------------------------------------------------------------------------
  1 | #ifdef _WIN32
  2 | #include "pthread_mutex_win32_wrapper.h"
  3 | #else
  4 | #include <sys/types.h>
  5 | #include <sys/syscall.h>
  6 | #include <syscall.h>
  7 | #include <dirent.h>
  8 | #include <pthread.h>
  9 | #endif
 10 | #include <vector>
 11 | #include "affinity.h"
 12 | #include <string>
 13 | #include <stdio.h>
 14 | #include <stdlib.h>
 15 | #include <string.h>
 16 | #include "os_low_level_helper.h"
 17 | 
 18 | #ifndef STD_OUT
 19 | #define STD_OUT stdout
 20 | #endif
 21 | 
 22 | pid_t gettid()
 23 | {
 24 | #ifdef _WIN32
 25 | 	return((pid_t) GetCurrentThreadId());
 26 | #else
 27 | 	return((pid_t) syscall(SYS_gettid));
 28 | #endif
 29 | }
 30 | 
 31 | #ifdef _WIN32
 32 | pid_t getpid()
 33 | {
 34 | 	return((pid_t) GetCurrentProcessId());
 35 | }
 36 | #endif
 37 | 
// One entry of the thread-name registry: associates a thread id with a
// human-readable name.
struct threadNameStruct
{
	pid_t thread_id;	// id as returned by gettid(), or a tid parsed from /proc/<pid>/task
	std::string name;	// owned copy of the name
};
 43 | 
 44 | class lockClass
 45 | {
 46 | public:
 47 |     lockClass() {pthread_mutex_init(&lock, NULL);}
 48 |     ~lockClass() {pthread_mutex_destroy(&lock);threadNames.clear();}
 49 |     std::vector<threadNameStruct> threadNames;
 50 |     pthread_mutex_t lock;
 51 | };
 52 | 
 53 | static lockClass lockedVector;
 54 | 
 55 | void setThreadName(const char* name)
 56 | {
 57 | 	threadNameStruct tmp;
 58 | 	tmp.thread_id = gettid();
 59 | 	tmp.name = name;
 60 | 	pthread_mutex_lock(&lockedVector.lock);
 61 | 	lockedVector.threadNames.push_back(tmp);
 62 | 	pthread_mutex_unlock(&lockedVector.lock);
 63 | }
 64 | 
 65 | const char* getThreadName(int tid, const char* defaultval)
 66 | {
 67 | #ifndef _WIN32
 68 | 	if (tid == -1) tid = gettid();
 69 | 	for (size_t i = 0;i < lockedVector.threadNames.size();i++)
 70 | 	{
 71 | 		if (lockedVector.threadNames[i].thread_id == tid)
 72 | 		{
 73 | 			return(lockedVector.threadNames[i].name.c_str());
 74 | 		}
 75 | 	}
 76 | #endif
 77 | 	return(defaultval);
 78 | }
 79 | 
 80 | void setUnknownNames(char* name)
 81 | {
 82 | 	pid_t pid = getpid();
 83 | #ifndef _WIN32
 84 | 	char dirname[1024];
 85 | 	sprintf(dirname, "/proc/%d/task", (int) pid);
 86 | 	DIR* dp = opendir(dirname);
 87 | 	if (dp)
 88 | 	{
 89 | 		dirent* ent;
 90 | 		while ((ent = readdir(dp)) != NULL)
 91 | 		{
 92 | 			pid_t tid = atoi(ent->d_name);
 93 | 			if (tid != 0 && tid != pid)
 94 | 			{
 95 | 				int found = false;
 96 | 				for (size_t i = 0;i < lockedVector.threadNames.size();i++)
 97 | 				{
 98 | 					if (lockedVector.threadNames[i].thread_id == tid)
 99 | 					{
100 | 						found = true;
101 | 					}
102 | 				}
103 | 				if (found == false)
104 | 				{
105 | 					threadNameStruct tmp;
106 | 					tmp.thread_id = tid;
107 | 					tmp.name = name;
108 | 					lockedVector.threadNames.push_back(tmp);
109 | 				}
110 | 			}
111 | 		}
112 | 	}
113 | #endif
114 | }
115 | 
116 | void setUnknownAffinity(int count, int* cores)
117 | {
118 | 	pid_t pid = getpid();
119 | #ifndef _WIN32
120 | 	char dirname[1024];
121 | 	sprintf(dirname, "/proc/%d/task", (int) pid);
122 | 	DIR* dp = opendir(dirname);
123 | 	if (dp)
124 | 	{
125 | 		dirent* ent;
126 | 		while ((ent = readdir(dp)) != NULL)
127 | 		{
128 | 			pid_t tid = atoi(ent->d_name);
129 | 			if (tid != 0 && tid != pid)
130 | 			{
131 | 				int found = false;
132 | 				for (size_t i = 0;i < lockedVector.threadNames.size();i++)
133 | 				{
134 | 					if (lockedVector.threadNames[i].thread_id == tid)
135 | 					{
136 | 						found = true;
137 | 					}
138 | 				}
139 | 				if (found == false)
140 | 				{
141 | 					cpu_set_t tmpset;
142 | 					CPU_ZERO(&tmpset);
143 | 					for (int i = 0;i < count;i++) CPU_SET(cores[i], &tmpset);
144 | 					sched_setaffinity(tid, sizeof(tmpset), &tmpset);
145 | 				}
146 | 			}
147 | 		}
148 | 	}
149 | #endif
150 | }
151 | 
152 | void printThreadPinning()
153 | {
154 | 	pid_t pid = getpid();
155 | #ifndef _WIN32
156 | 	char dirname[1024];
157 | 	sprintf(dirname, "/proc/%d/task", (int) pid);
158 | 	DIR* dp = opendir(dirname);
159 | 	if (dp)
160 | 	{
161 | 		dirent* ent;
162 | 		fprintf(STD_OUT, "%12s", "");
163 | 		for (int i = 0;i < get_number_of_cpu_cores();i++)
164 | 		{
165 | 			fprintf(STD_OUT, " %2d", i);
166 | 		}
167 | 		fprintf(STD_OUT, "\n");
168 | 		
169 | 		while ((ent = readdir(dp)) != NULL)
170 | 		{
171 | 			pid_t tid = atoi(ent->d_name);
172 | 			if (tid != 0)
173 | 			{
174 | 				fprintf(STD_OUT, "Thread %5d", tid);
175 | 				cpu_set_t threadmask;
176 | 				sched_getaffinity(tid, sizeof(threadmask), &threadmask);
177 | 				for (int i = 0;i < get_number_of_cpu_cores();i++)
178 | 				{
179 | 					if (CPU_ISSET(i, &threadmask))
180 | 					{
181 | 						fprintf(STD_OUT, "  X");
182 | 					}
183 | 					else
184 | 					{
185 | 						fprintf(STD_OUT, "  .");
186 | 					}
187 | 				}
188 | 				fprintf(STD_OUT, " - ");
189 | 				const char* name = getThreadName(tid);
190 | 				fprintf(STD_OUT, "%s", name);
191 | 				if (CPU_COUNT(&threadmask) == 1)
192 | 				{
193 | 					for (int i = 0;i < get_number_of_cpu_cores();i++)
194 | 					{
195 | 						if (CPU_ISSET(i, &threadmask)) fprintf(STD_OUT, " - Pinned to core %d", i);
196 | 					}
197 | 				}
198 | 				char filename[1024];
199 | 				sprintf(filename, "/proc/%d/task/%d/stat", (int) pid, (int) tid);
200 | 				FILE* fp = fopen(filename, "r");
201 | 				if (fp != NULL)
202 | 				{
203 | 					char buffer[1024];
204 | 					if (fgets(buffer, 1023, fp) == NULL) break;
205 | 					int count = 0;
206 | 					for (unsigned int i = 0;i < strlen(buffer);i++)
207 | 					{
208 | 						if (buffer[i] == ' ')
209 | 						{
210 | 							if (++count == 13)
211 | 							{
212 | 								int time;
213 | 								sscanf(&buffer[i + 1], "%d ", &time);
214 | 								fprintf(STD_OUT, " - Time: %d", time);
215 | 								break;
216 | 							}
217 | 						}
218 | 					}
219 | 					fclose(fp);
220 | 				}
221 | 				fprintf(STD_OUT, "\n");
222 | 			}
223 | 		}
224 | 		closedir(dp);
225 | 	}
226 | #endif
227 | }
228 | 


--------------------------------------------------------------------------------
/cmodules/affinity.h:
--------------------------------------------------------------------------------
 1 | #ifndef AFFINITY_H
 2 | #define AFFINITY_H
 3 | 
 4 | #ifdef _WIN32
 5 | typedef HANDLE pid_t;
 6 | #include "sched_affinity_win32_wrapper.h"
 7 | #else
 8 | #include <sched.h>
 9 | #endif
10 | 
11 | #ifdef __cplusplus
12 | extern "C"
13 | {
14 | #endif
15 | 
16 | void setThreadName(const char* name);
17 | #ifdef __cplusplus
18 | const char* getThreadName(int tid = -1, const char* defaultval = "Unknown Thread");
19 | #else
20 | const char* getThreadName(int tid, const char* defaultval);
21 | #endif
22 | void printThreadPinning();
23 | void setUnknownNames(char* name);
24 | void setUnknownAffinity(int count, int* cores);
25 | 
/* Restricts the calling thread (pid argument 0) to the single core `core`.
 * Returns whatever sched_setaffinity returns. */
inline int sched_setaffinity_set_core(int core)
{
	cpu_set_t singleCoreMask;
	int result;

	CPU_ZERO(&singleCoreMask);
	CPU_SET(core, &singleCoreMask);
	result = sched_setaffinity(0, sizeof(singleCoreMask), &singleCoreMask);
	return result;
}
33 | 
34 | pid_t gettid();
35 | #ifdef _WIN32
36 | pid_t getpid();
37 | #endif
38 | 
39 | #ifdef __cplusplus
40 | }
41 | #endif
42 | 
43 | #endif
44 | 


--------------------------------------------------------------------------------
/cmodules/get_private_profile.h:
--------------------------------------------------------------------------------
  1 | #ifndef GET_PRIVATE_PROFILE_H
  2 | #define GET_PRIVATE_PROFILE_H
  3 | 
  4 | static inline longint GetPrivateProfileString(char* lpSectionName, char* lpKeyName, char* lpDefault, char* lpBuffer, DWORD size, char* configfile)
  5 | {
  6 | 	for (size_t i = 0;i < strlen(configfile);i++) if (configfile[i] == '\\') configfile[i] = '/';
  7 | 	FILE* cfgfile = fopen(configfile, "r");
  8 | 	if (cfgfile == NULL)
  9 | 	{
 10 | 		fprintf(stderr, "Error opening file %s\n", configfile);
 11 | 		return(-1);
 12 | 	}
 13 | 	char linebuffer[1024];
 14 | 	bool correctsection = false;
 15 | 	//fprintf(stderr, "Searching for %s in %s default %s\n", lpKeyName, lpSectionName, lpDefault);
 16 | 	while (!feof(cfgfile))
 17 | 	{
 18 | 		if (fgets(linebuffer, 1023, cfgfile) == NULL) break;
 19 | 		if (linebuffer[0] == '[')
 20 | 		{
 21 | 			correctsection = strncmp(&linebuffer[1], lpSectionName, strlen(lpSectionName)) == 0 && linebuffer[strlen(lpSectionName) + 1] == ']';
 22 | 			continue;
 23 | 		}
 24 | 		if (!correctsection) continue;
 25 | 		if (strncmp(linebuffer, lpKeyName, strlen(lpKeyName)) == 0)
 26 | 		{
 27 | 			char* tmpptr = &linebuffer[strlen(lpKeyName)];
 28 | 			while (*tmpptr == ' ') tmpptr++;
 29 | 			if (*tmpptr != '=') continue;
 30 | 			while (*(++tmpptr) == ' ') ;
 31 | 			char* tmpptr2 = tmpptr;
 32 | 			while (*tmpptr2 != 0 && *tmpptr2 != 10 && *tmpptr2 != 13) tmpptr2++;
 33 | 			*tmpptr2 = 0;
 34 | 			strncpy(lpBuffer, &tmpptr[0], size < strlen(tmpptr) ? size : strlen(tmpptr));
 35 | 			lpBuffer[size < strlen(tmpptr) ? size : strlen(tmpptr)] = 0;
 36 | 			fclose(cfgfile);
 37 | 			//fprintf(stderr, "Found: %s in %s: '%s'\n", lpKeyName, lpSectionName, lpBuffer);
 38 | 			return(strlen(tmpptr));
 39 | 		}
 40 | 	}
 41 | 	if (lpDefault == NULL) *lpBuffer = 0;
 42 | 	else
 43 | 	{
 44 | 		strncpy(lpBuffer, lpDefault, size < strlen(lpDefault) ? size : strlen(lpDefault));
 45 | 		lpBuffer[size < strlen(lpDefault) ? size : strlen(lpDefault)] = 0;
 46 | 	}
 47 | 	fclose(cfgfile);
 48 | 	//fprintf(stderr, "Not found: %s in %s, using default: '%s' -> '%s'\n", lpKeyName, lpSectionName, lpDefault, lpBuffer);
 49 | 	return(strlen(lpDefault));
 50 | }
 51 | 
 52 | static inline longint GetPrivateProfileInt(char* lpSectionName, char* lpKeyName, int nDefault, char* configfile)
 53 | {
 54 | 	char linebuffer[16] = "0";
 55 | 	char strdefault[16];
 56 | 	sprintf(strdefault, "%d", nDefault);
 57 | 	GetPrivateProfileString(lpSectionName, lpKeyName, strdefault, linebuffer, 15, configfile);
 58 | 	return(atoi(linebuffer));
 59 | }
 60 | 
 61 | static inline int GetPrivateProfileSectionNames(char* buffer, int buffersize, char* filename)
 62 | {
 63 | 	for (size_t i = 0;i < strlen(filename);i++) if (filename[i] == '\\') filename[i] = '/';
 64 | 	FILE* cfgfile = fopen(filename, "r");
 65 | 	if (cfgfile == NULL)
 66 | 	{
 67 | 		fprintf(stderr, "Error opening file %s\n", filename);
 68 | 		return(-1);
 69 | 	}
 70 | 	char linebuffer[1024];
 71 | 	int nwritten = 0;
 72 | 	while (!feof(cfgfile))
 73 | 	{
 74 | 		if (fgets(linebuffer, 1023, cfgfile) == NULL) break;
 75 | 		char* tmpptr = linebuffer;
 76 | 		while (*tmpptr == ' ') tmpptr++;
 77 | 		if (*tmpptr != '[') continue;
 78 | 		char* sectptr = ++tmpptr;
 79 | 		int section_len = 0;
 80 | 		while (*tmpptr && *tmpptr != 10 && *tmpptr != 13)
 81 | 		{
 82 | 			if (*tmpptr != ']')
 83 | 			{
 84 | 				tmpptr++;
 85 | 				section_len++;
 86 | 			}
 87 | 			else
 88 | 			{
 89 | 				if (nwritten + section_len + 2 < buffersize)
 90 | 				{
 91 | 					memcpy(&buffer[nwritten], sectptr, section_len);
 92 | 					buffer[nwritten + section_len] = 0;
 93 | 					buffer[nwritten + section_len + 1] = 0;
 94 | 				}
 95 | 				nwritten += section_len + 1;
 96 | 				break;
 97 | 			}
 98 | 		}
 99 | 	}
100 | 	return(nwritten);
101 | }
102 | 
103 | #endif
104 | 


--------------------------------------------------------------------------------
/cmodules/linux_helpers.h:
--------------------------------------------------------------------------------
 1 | #ifndef LINUX_HELPERS_H
 2 | #define LINUX_HELPERS_H
 3 | 
 4 | #include <termios.h>
 5 | #include <unistd.h>
 6 | 
 7 | static inline int getch()
 8 | {
 9 | 	static struct termios oldt, newt;
10 | 	tcgetattr( STDIN_FILENO, &oldt);
11 | 	newt = oldt;
12 | 	newt.c_lflag &= ~(ICANON|ECHO);
13 | 	tcsetattr( STDIN_FILENO, TCSANOW, &newt);
14 | 	int retVal = getchar();
15 | 	tcsetattr( STDIN_FILENO, TCSANOW, &oldt);
16 | 	return(retVal);
17 | }
18 | 
// Reports whether a character is available on stdin without consuming it.
//
// The terminal is switched to non-canonical mode with VMIN = 0 and VTIME = 1,
// so the read gives up after at most one tenth of a second. A character that
// was read is pushed back with ungetc. The previous terminal attributes are
// restored before returning.
//
// Returns 1 if a character is pending, 0 otherwise.
static inline int kbhit()
{
   struct termios term, oterm;
   int fd = 0;
   int c = 0;
   tcgetattr(fd, &oterm);
   term = oterm;
   // Only canonical mode and echo are switched off, matching getch(). The
   // former "& (!ICANON)" was a logical NOT, i.e. "& 0", and cleared every
   // local-mode flag (including signal generation) during the poll.
   term.c_lflag &= ~(ICANON | ECHO);
   term.c_cc[VMIN] = 0;
   term.c_cc[VTIME] = 1;
   tcsetattr(fd, TCSANOW, &term);
   c = getchar();
   tcsetattr(fd, TCSANOW, &oterm);
   if (c != EOF)
   {
      ungetc(c, stdin);
   }
   return ((c != EOF) ? 1 : 0);
}
36 | 
// Win32-style Sleep() for Linux: suspends the caller for msecs milliseconds via usleep().
static void inline Sleep(int msecs)
{
	const int usecsPerMsec = 1000;
	usleep(msecs * usecsPerMsec);
}
41 | 
42 | #endif
43 | 


--------------------------------------------------------------------------------
/cmodules/os_low_level_helper.h:
--------------------------------------------------------------------------------
 1 | #ifndef OS_LOW_LEVEL_HELPER_H
 2 | #define OS_LOW_LEVEL_HELPER_H
 3 | 
 4 | #ifndef _WIN32
 5 | #include <syscall.h>
 6 | #include <unistd.h> 
 7 | #endif
 8 | 
 9 | inline int get_number_of_cpu_cores()
10 | {
11 | #ifdef _WIN32
12 | 	SYSTEM_INFO info;
13 | 	GetSystemInfo(&info);
14 | 	return(info.dwNumberOfProcessors);
15 | #else
16 | 	return(sysconf(_SC_NPROCESSORS_ONLN));
17 | #endif
18 | }
19 | 
// Returns the regular (non-huge) page size in bytes: dwPageSize from GetSystemInfo()
// on Windows, getpagesize() elsewhere.
inline int get_standard_page_size()
{
#ifndef _WIN32
	const int pageBytes = getpagesize();
	return(pageBytes);
#else
	SYSTEM_INFO sysInfo;
	GetSystemInfo(&sysInfo);
	return(sysInfo.dwPageSize);
#endif
}
30 | 
31 | #endif
32 | 


--------------------------------------------------------------------------------
/cmodules/pthread_mutex_win32_wrapper.h:
--------------------------------------------------------------------------------
  1 | #ifndef PTHREAD_MUTEX_WIN32_WRAPPER_H
  2 | #define PTHREAD_MUTEX_WIN32_WRAPPER_H
  3 | 
  4 | #include <windows.h>
  5 | #include <winbase.h>
  6 | typedef HANDLE pthread_mutex_t;
  7 | typedef HANDLE pthread_t;
  8 | typedef HANDLE sem_t;
  9 | 
 10 | #ifndef EBUSY
 11 | #define EBUSY WAIT_TIMEOUT
 12 | #endif
 13 | 
 14 | #ifndef EAGAIN
 15 | #define EAGAIN WAIT_TIMEOUT
 16 | #endif
 17 | 
 18 | static inline int pthread_mutex_init(pthread_mutex_t *mutex, const void* attr)
 19 | {
 20 | 	*mutex = CreateSemaphore(NULL, 1, 1, NULL);
 21 | 	//printf("INIT %d\n", *mutex);
 22 | 	return((*mutex) == NULL);
 23 | }
 24 | 
 25 | static inline int pthread_mutex_lock(pthread_mutex_t *mutex)
 26 | {
 27 | 	//printf("LOCK %d\n", *mutex);
 28 | 	return(WaitForSingleObject(*mutex, INFINITE) == WAIT_FAILED);
 29 | }
 30 | 
 31 | static inline int pthread_mutex_trylock(pthread_mutex_t *mutex)
 32 | {
 33 | 	DWORD retVal = WaitForSingleObject(*mutex, 0);
 34 | 	if (retVal == WAIT_TIMEOUT) return(EBUSY);
 35 | 	//printf("TRYLOCK %d\n", *mutex);
 36 | 	if (retVal != WAIT_FAILED) return(0);
 37 | 	return(1);
 38 | }
 39 | 
 40 | static inline int pthread_mutex_unlock(pthread_mutex_t *mutex)
 41 | {
 42 | 	//printf("UNLOCK %d\n", *mutex);
 43 | 	return(ReleaseSemaphore(*mutex, 1, NULL) == 0);
 44 | }
 45 | 
 46 | static inline int pthread_mutex_destroy(pthread_mutex_t *mutex)
 47 | {
 48 | 	return(CloseHandle(*mutex) == 0);
 49 | }
 50 | 
 51 | static inline int pthread_create(pthread_t *thread, const void* attr, void *(*start_routine)(void*), void *arg)
 52 | {
 53 | 	return((*thread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) start_routine, arg, 0, NULL)) == 0);
 54 | }
 55 | 
 56 | static inline int pthread_exit(void* ret)
 57 | {
 58 | 	ExitThread((DWORD) (size_t) ret);
 59 | }
 60 | 
 61 | static inline int pthread_join(pthread_t thread, void** retval)
 62 | {
 63 | 	static DWORD ExitCode;
 64 | 	while (GetExitCodeThread(thread, &ExitCode) == STILL_ACTIVE) Sleep(0);
 65 | 	if (retval != NULL) *retval = (void*) &ExitCode;
 66 | 	return(0);
 67 | }
 68 | 
 69 | static inline int sem_init(sem_t *sem, int pshared, unsigned int value)
 70 | {
 71 | 	*sem = CreateSemaphore(NULL, value, 1024, NULL);
 72 | 	return((*sem) == NULL);
 73 | }
 74 | 
 75 | static inline int sem_destroy(sem_t *sem)
 76 | {
 77 | 	return(CloseHandle(*sem) == 0);
 78 | }
 79 | 
 80 | static inline int sem_wait(sem_t *sem)
 81 | {
 82 | 	return(WaitForSingleObject(*sem, INFINITE) == WAIT_FAILED);
 83 | }
 84 | 
 85 | static inline int sem_trywait(sem_t *sem)
 86 | {
 87 | 	DWORD retVal = WaitForSingleObject(*sem, 0);
 88 | 	if (retVal == WAIT_TIMEOUT) return(EAGAIN);
 89 | 	if (retVal != WAIT_FAILED) return(0);
 90 | 	return(-1);
 91 | }
 92 | 
 93 | static inline int sem_post(sem_t *sem)
 94 | {
 95 | 	return(ReleaseSemaphore(*sem, 1, NULL) == 0);
 96 | }
 97 | 
 98 | #ifdef CMODULES_PTHREAD_BARRIERS
 99 | 
100 | /*typedef struct _RTL_BARRIER {                       
101 |             DWORD Reserved1;                        
102 |             DWORD Reserved2;                        
103 |             ULONG_PTR Reserved3[2];                 
104 |             DWORD Reserved4;                        
105 |             DWORD Reserved5;                        
106 | } RTL_BARRIER, *PRTL_BARRIER;  
107 | 
108 | typedef RTL_BARRIER SYNCHRONIZATION_BARRIER;
109 | typedef PRTL_BARRIER PSYNCHRONIZATION_BARRIER;
110 | typedef PRTL_BARRIER LPSYNCHRONIZATION_BARRIER;
111 | 
112 | #define SYNCHRONIZATION_BARRIER_FLAGS_SPIN_ONLY  0x01
113 | #define SYNCHRONIZATION_BARRIER_FLAGS_BLOCK_ONLY 0x02
114 | #define SYNCHRONIZATION_BARRIER_FLAGS_NO_DELETE  0x04
115 | 
116 | BOOL WINAPI EnterSynchronizationBarrier(_Inout_ LPSYNCHRONIZATION_BARRIER lpBarrier, _In_ DWORD dwFlags);
117 | BOOL WINAPI InitializeSynchronizationBarrier(_Out_ LPSYNCHRONIZATION_BARRIER lpBarrier, _In_ LONG lTotalThreads, _In_ LONG lSpinCount);
118 | BOOL WINAPI DeleteSynchronizationBarrier(_Inout_ LPSYNCHRONIZATION_BARRIER lpBarrier);*/
119 | 
120 | typedef SYNCHRONIZATION_BARRIER pthread_barrier_t;
121 | 
122 | static inline int pthread_barrier_destroy(pthread_barrier_t* b)
123 | {
124 | 	return(DeleteSynchronizationBarrier(b) == 0);
125 | }
126 | 
127 | static inline int pthread_barrier_init(pthread_barrier_t* b, void* attr, unsigned count)
128 | {
129 | 	return(InitializeSynchronizationBarrier(b, count, -1) == 0);
130 | }
131 | 
132 | static inline int pthread_barrier_wait(pthread_barrier_t* b)
133 | {
134 | 	EnterSynchronizationBarrier(b, 0);
135 | 	return(0);
136 | }
137 | 
138 | #endif
139 | 
140 | #endif
141 | 


--------------------------------------------------------------------------------
/cmodules/qmalloc.cpp:
--------------------------------------------------------------------------------
  1 | #include "qmalloc.h"
  2 | 
  3 | #include <stdio.h>
  4 | #include <string.h>
  5 | 
  6 | #ifdef _WIN32
  7 | #include <windows.h>
  8 | #include <winbase.h>
  9 | #else //_WIN32
 10 | #include <unistd.h>
 11 | #include <sys/mman.h>
 12 | #include <syscall.h>
 13 | #ifdef _NUMAIF_H
 14 | #include <numaif.h>
 15 | #endif
 16 | 
 17 | #ifndef MAP_HUGETLB
 18 | #define MAP_HUGETLB 0x40000 /* arch specific */
 19 | #endif
 20 | #ifndef MPOL_DEFAULT
 21 | #define MPOL_DEFAULT 0
 22 | #endif
 23 | #ifndef MPOL_PREFERRED
 24 | #define MPOL_PREFERRED 1
 25 | #endif
 26 | #ifndef MPOL_BIND
 27 | #define MPOL_BIND 2
 28 | #endif
 29 | #ifndef MPOL_INTERLEAVE
 30 | #define MPOL_INTERLEAVE 3
 31 | #endif
 32 | #endif //!_WIN32
 33 | 
 34 | #ifndef STD_OUT
 35 | #define STD_OUT stdout
 36 | #endif
 37 | 
//Bookkeeping shared by all qMalloc/qFree calls:
int qmalloc::qMallocCount = 0;	//number of entries the qMallocs table has room for
int qmalloc::qMallocUsed = 0;	//number of entries currently in use
qmalloc::qMallocData* qmalloc::qMallocs = NULL;	//table of live allocations (address and size)
 41 | 
 42 | #ifdef _WIN32
 43 | static void Privilege(TCHAR* pszPrivilege, BOOL bEnable)
 44 | {
 45 | 	HANDLE           hToken;
 46 | 	TOKEN_PRIVILEGES tp;
 47 | 	BOOL             status;
 48 | 	DWORD            error;
 49 | 
 50 | 	if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken))
 51 | 	{
 52 | 		fprintf(STD_OUT, "Error obtaining process token\n");
 53 | 	}
 54 | 
 55 | 	if (!LookupPrivilegeValue(NULL, pszPrivilege, &tp.Privileges[0].Luid))
 56 | 	{
 57 | 		fprintf(STD_OUT, "Error looking up priviledge value\n");
 58 | 	}
 59 | 
 60 | 	tp.PrivilegeCount = 1;
 61 | 	tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
 62 | 
 63 | 	status = AdjustTokenPrivileges(hToken, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0);
 64 | 
 65 | 	error = GetLastError();
 66 | 	if (!status || (error != ERROR_SUCCESS))
 67 | 	{
 68 | 		fprintf(STD_OUT, "Error obtaining Priviledge %d\n", GetLastError());
 69 | 	}
 70 | 
 71 | 	CloseHandle(hToken);
 72 | }
 73 | #endif
 74 | 
 75 | void* qmalloc::qMalloc(size_t size, bool huge, bool executable, bool locked, void* alloc_addr, int interleave)
 76 | {
 77 | 	int pagesize;
 78 | 	void* addr;
 79 | 	if (huge)
 80 | 	{
 81 | #ifdef _WIN32
 82 | 		static int tokenObtained = 0;
 83 | #ifdef _AMD64_
 84 | 		pagesize = GetLargePageMinimum();
 85 | #else
 86 | 		pagesize = 1024 * 2048;
 87 | #endif
 88 | 		if (tokenObtained == 0)
 89 | 		{
 90 | 			fprintf(STD_OUT, "Obtaining security token\n");
 91 | 			Privilege(TEXT("SeLockMemoryPrivilege"), TRUE);
 92 | 			tokenObtained = 1;
 93 | 		}
 94 | #else
 95 | 		pagesize = 1024 * 2048;
 96 | #endif
 97 | 	}
 98 | 	else
 99 | 	{
100 | #ifdef _WIN32
101 | 		SYSTEM_INFO si;
102 | 		GetSystemInfo(&si);
103 | 		pagesize = si.dwPageSize;
104 | #else
105 | 		pagesize = sysconf(_SC_PAGESIZE);
106 | #endif
107 | 	}
108 | 	if (size % pagesize) size += pagesize - size % pagesize;
109 | #ifdef _WIN32
110 | 	DWORD flags = MEM_COMMIT;
111 | 	if (huge)
112 | 	{
113 | 		flags |= MEM_LARGE_PAGES;
114 | 	}
115 | 	DWORD protect = PAGE_READWRITE;
116 | 	if (executable)
117 | 	{
118 | 		protect = PAGE_EXECUTE_READWRITE;
119 | 	}
120 | 	if (interleave)
121 | 	{
122 | 		fprintf(stderr, "Interleaved allocation not supported on Windows\n");
123 | 		return(NULL);
124 | 	}
125 | 	if (alloc_addr != NULL)
126 | 	{
127 | 		if (VirtualAlloc(alloc_addr, size, (flags & ~MEM_COMMIT) | MEM_RESERVE, protect) != alloc_addr)
128 | 		{
129 | 			return(NULL);
130 | 		}
131 | 	}
132 | 	addr = VirtualAlloc(alloc_addr, size, flags, protect);
133 | #else
134 | 	int flags = MAP_ANONYMOUS | MAP_PRIVATE;
135 | 	int prot = PROT_READ | PROT_WRITE;
136 | 	if (huge) flags |= MAP_HUGETLB;
137 | 	if (executable) prot |= PROT_EXEC;
138 | 	if (locked) flags |= MAP_LOCKED;
139 | 	//unsigned long oldnodemask;
140 | 	//int oldpolicy;
141 | 	if (interleave && locked) //mmap will perform a memory lock, so we have to change memory policy beforehand
142 | 	{
143 | /*		if (syscall(SYS_get_mempolicy, &oldpolicy, &oldnodemask, sizeof(oldnodemask) * 8, NULL, 0) != 0)
144 | 		{
145 | 		    fprintf(stderr, "Error obtaining memory policy\n");
146 | 		    exit(1);
147 | 		}*/
148 | 		unsigned long nodemask = 0xffffff;
149 | 		if (syscall(SYS_set_mempolicy, MPOL_INTERLEAVE, &nodemask, sizeof(nodemask) * 8) != 0)
150 | 		{
151 | 		    fprintf(stderr, "Error setting memory policy\n");
152 | 		}
153 | 	}
154 | 	addr = mmap(alloc_addr, size, prot, flags, 0, 0);
155 | 	if (addr == MAP_FAILED) addr = NULL;
156 | 	if (interleave)
157 | 	{
158 | 		if (locked)	//Restore old memory policy
159 | 		{
160 | 			//syscall(SYS_set_mempolicy, oldpolicy, &oldnodemask, sizeof(oldnodemask) * 8);
161 | 			if (syscall(SYS_set_mempolicy, MPOL_DEFAULT, NULL) != 0)
162 | 			{
163 | 			    fprintf(stderr, "Error setting memory policy\n");
164 | 			}
165 | 		}
166 | 		else if (addr) //Set memory policy for region
167 | 		{
168 | #ifndef _NUMAIF_H
169 | 			fprintf(stderr, "Interleaved memory can only be used with non-locked memory if numaif.h is present\n");
170 | 			exit(1);
171 | #else
172 | 			unsigned long nodemask = 0xffffff;
173 | 			mbind(addr, size, MPOL_INTERLEAVE, &nodemask, sizeof(nodemask) * 8, 0);
174 | #endif
175 | 		}
176 | 	}
177 | #endif
178 | 
179 | 	if (alloc_addr != NULL && addr != alloc_addr)
180 | 	{
181 | 		fprintf(stderr, "Could not allocate memory at desired address\n");
182 | #ifdef _WIN32
183 | 		VirtualFree(addr, 0, MEM_RELEASE);
184 | #else
185 | 		munmap(addr, size);
186 | #endif
187 | 		return(NULL);
188 | 	}
189 | 
190 | 	if (addr == NULL)
191 | 	{
192 | #ifdef _WIN32
193 | 		DWORD error = GetLastError();
194 | #endif
195 | 		fprintf(stderr, "Failed to allocate memory\n");
196 | 		return(NULL);
197 | 	}
198 | 	
199 | 	if (qMallocCount == qMallocUsed)
200 | 	{
201 | 		if (qMallocCount == 0) qMallocCount = 8;
202 | 		else if (qMallocCount < 1024) qMallocCount *= 2;
203 | 		else qMallocCount += 1024;
204 | 		if (qMallocUsed == 0)
205 | 		{
206 | 			qMallocs = (qMallocData*) malloc(qMallocCount * sizeof(qMallocData));
207 | 		}
208 | 		else
209 | 		{
210 | 			qMallocs = (qMallocData*) realloc(qMallocs, qMallocCount * sizeof(qMallocData));
211 | 		}
212 | 	}
213 | 	qMallocs[qMallocUsed].addr = addr;
214 | 	qMallocs[qMallocUsed].size = size;
215 | 	qMallocUsed++;
216 | 
217 | #ifdef _WIN32
218 | 	if (locked)
219 | 	{
220 | 		size_t minp, maxp;
221 | 		HANDLE pid = GetCurrentProcess();
222 | 		if (GetProcessWorkingSetSize(pid, (PSIZE_T) &minp, (PSIZE_T) &maxp) == 0) fprintf(STD_OUT, "Error getting minimum working set size\n");
223 | 		if (SetProcessWorkingSetSize(pid, minp + size, maxp + size) == 0) fprintf(STD_OUT, "Error settings maximum working set size\n");
224 | 		if (VirtualLock(addr, size) == 0)
225 | 		{
226 | 			fprintf(STD_OUT, "Error locking memory\n");
227 | 			DWORD error = GetLastError();
228 | 			VirtualFree(addr, 0, MEM_RELEASE);
229 | 			if (SetProcessWorkingSetSize(pid, minp, maxp) == 0) fprintf(STD_OUT, "Error settings maximum working set size\n");
230 | 			addr = NULL;
231 | 		}
232 | 	}
233 | #endif
234 | 
235 | 	return(addr);
236 | }
237 | 
238 | int qmalloc::qFree(void* ptr)
239 | {
240 | 	for (int i = 0;i < qMallocUsed;i++)
241 | 	{
242 | 		if (qMallocs[i].addr == ptr)
243 | 		{
244 | #ifdef _WIN32
245 | 			if (VirtualFree(ptr, 0, MEM_RELEASE) == 0) return(1);
246 | #else
247 | 			if (munmap(ptr, qMallocs[i].size)) return(1);
248 | #endif
249 | 			qMallocUsed--;
250 | 			if (i < qMallocUsed) memcpy(&qMallocs[i], &qMallocs[qMallocUsed], sizeof(qMallocData));
251 | 			if (qMallocUsed == 0)
252 | 			{
253 | 				free(qMallocs);
254 | 				qMallocCount = 0;
255 | 			}
256 | 			return(0);
257 | 		}
258 | 	}
259 | 	return(1);
260 | }


--------------------------------------------------------------------------------
/cmodules/qmalloc.h:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | 
 3 | #ifndef QMALLOC_H
 4 | #define QMALLOC_H
 5 | 
 6 | #if !defined(_WIN32) & !defined(__cdecl)
 7 | #define __cdecl
 8 | #endif
 9 | 
10 | class qmalloc
11 | {
12 | public:
13 | 	static void* __cdecl qMalloc(size_t size, bool huge, bool executable, bool locked, void* alloc_addr = NULL, int interleave = false);
14 | 	static int __cdecl qFree(void* ptr);
15 | 
16 | private:	
17 | 	static int qMallocCount;
18 | 	static int qMallocUsed;
19 | 	struct qMallocData
20 | 	{
21 | 		void* addr;
22 | 		size_t size;
23 | 	};
24 | 	static qMallocData* qMallocs;
25 | };
26 | 
27 | #endif
28 | 


--------------------------------------------------------------------------------
/cmodules/qmath.h:
--------------------------------------------------------------------------------
// Returns true unless all exponent bits of the IEEE-754 double are set, i.e. false for
// +/- infinity and NaN. Works on the raw bit pattern of val.
static inline bool qIsFinite(double val)
{
	const unsigned long long int exponentMask = 0x7FF0000000000000;
	const unsigned long long int bits = *((unsigned long long int*) &val);
	const bool exponentAllOnes = (bits & exponentMask) == exponentMask;
	return (!exponentAllOnes);
}


--------------------------------------------------------------------------------
/cmodules/qmultialloc.cpp:
--------------------------------------------------------------------------------
 1 | #include "qmultialloc.h"
 2 | #include <stdlib.h>
 3 | #include <stdio.h>
 4 | #include <cuda_runtime.h>
 5 | #include <helper_cuda.h>
 6 | 
 7 | qMultiAlloc::qMultiAlloc()
 8 | {
 9 | 	p = NULL;
10 | 	np = npalloc = 0;
11 | 	maxalign = 1024;
12 | 	ptr = NULL;
13 | }
14 | 
15 | qMultiAlloc::~qMultiAlloc()
16 | {
17 | 	if (p) free(p);
18 | 	if (ptr) cudaFree(ptr);
19 | 	p = NULL;
20 | 	ptr = NULL;
21 | }
22 | 
23 | void qMultiAlloc::AddAlloc(void** ptr, size_t size, size_t align)
24 | {
25 | 	if (np == npalloc)
26 | 	{
27 | 		if (npalloc == 0)
28 | 		{
29 | 			npalloc = 8;
30 | 		}
31 | 		else
32 | 		{
33 | 			npalloc *= 2;
34 | 		}
35 | 		
36 | 		if (p)
37 | 		{
38 | 			p = (ptr_struct*) realloc(p, npalloc * sizeof(ptr_struct));
39 | 		}
40 | 		else
41 | 		{
42 | 			p = (ptr_struct*) malloc(npalloc * sizeof(ptr_struct));
43 | 		}
44 | 		if (p == NULL)
45 | 		{
46 | 			printf("Memory Allocation Error\n");
47 | 			exit(1);
48 | 		}
49 | 	}
50 | 
51 | 	p[np].ptr = ptr;
52 | 	p[np].size = size;
53 | 	np++;
54 | 	if (align > maxalign) maxalign = align;
55 | }
56 | 
57 | size_t qMultiAlloc::Allocate()
58 | {
59 | 	size_t size = 0;
60 | 	for (int i = 0;i < np;i++)
61 | 	{
62 | 		size += p[i].size;
63 | 		if (size % maxalign) size += maxalign - size % maxalign;
64 | 	}
65 | 	size += maxalign;
66 | 
67 | 	checkCudaErrors(cudaMalloc(&ptr, size));
68 | 	if (ptr == NULL)
69 | 	{
70 | 		np = 0;
71 | 		maxalign = 1024;
72 | 		return(0);
73 | 	}
74 | 	char* tmpp = (char*) ptr;
75 | 	for (int i = 0;i < np;i++)
76 | 	{
77 | 		if (((size_t) tmpp) % maxalign) tmpp += maxalign - ((size_t) tmpp) % maxalign;
78 | 		*p[i].ptr = tmpp;
79 | 		tmpp += p[i].size;
80 | 	}
81 | 	return(size);
82 | }
83 | 
84 | void qMultiAlloc::Free()
85 | {
86 | 	cudaFree(ptr);
87 | 	ptr = 0;
88 | 	np = 0;
89 | 	maxalign = 1024;
90 | }
91 | 


--------------------------------------------------------------------------------
/cmodules/qmultialloc.h:
--------------------------------------------------------------------------------
 1 | #ifndef QMULTIALLOC_H
 2 | #define QMULTIALLOC_H
 3 | #ifdef _WIN32
 4 | #include <windows.h>
 5 | #include <winbase.h>
 6 | #else
 7 | #include <stddef.h>
 8 | #endif
 9 | 
// Collects several buffer requests and serves them from a single allocation:
// register each buffer with AddAlloc(), then call Allocate() once to obtain the memory
// and fill in all registered pointers.
class qMultiAlloc
{
public:
	qMultiAlloc();
	~qMultiAlloc();

	// Register a buffer of `size` bytes; *ptr is written by Allocate()
	void AddAlloc(void** ptr, size_t size, size_t align = 1024);
	// Allocate one block for all registered buffers; returns its size, 0 on failure
	size_t Allocate();
	// Release the block and clear the list of registered buffers
	void Free();

private:
	// One registered request
	struct ptr_struct
	{
		void** ptr;	// where to store the address of the sub-buffer
		size_t size;	// requested size in bytes
	};

	ptr_struct* p;	// request list
	int np;	// requests in use
	int npalloc;	// requests the list has room for
	size_t maxalign;	// alignment applied to every sub-buffer
	void* ptr;	// the single underlying block
};
33 | 
34 | #endif
35 | 


--------------------------------------------------------------------------------
/cmodules/qsem.cpp:
--------------------------------------------------------------------------------
 1 | #include <errno.h>
 2 | #include <stdio.h>
 3 | 
 4 | #include "qsem.h"
 5 | 
 6 | #ifndef STD_OUT
 7 | #define STD_OUT stdout
 8 | #endif
 9 | 
10 | qSem::qSem(int num)
11 | {
12 | 	max = num;
13 | 	if (sem_init(&sem, 0, num)) fprintf(STD_OUT, "Error initializing semaphore");
14 | }
15 | 
16 | qSem::~qSem()
17 | {
18 | 	if (sem_destroy(&sem)) fprintf(STD_OUT, "Error destroying semaphore");
19 | }
20 | 
21 | int qSem::Lock()
22 | {
23 | 	int retVal;
24 | 	if ((retVal = sem_wait(&sem))) fprintf(STD_OUT, "Error locking semaphore");
25 | 	return(retVal);
26 | }
27 | 
28 | int qSem::Unlock()
29 | {
30 | 	int retVal;
31 | 	if ((retVal = sem_post(&sem))) fprintf(STD_OUT, "Error unlocking semaphire");
32 | 	return(retVal);
33 | }
34 | 
35 | int qSem::Trylock()
36 | {
37 | 	int retVal = sem_trywait(&sem);
38 | 	if (retVal)
39 | 	{
40 | 		if (errno == EAGAIN) return(EBUSY);
41 | 		return(-1);
42 | 	}
43 | 	return(0);
44 | }
45 | 
46 | #ifndef _WIN32
47 | int qSem::Query()
48 | {
49 | 	int value;
50 | 	if (sem_getvalue(&sem, &value) != 0) value = -1;
51 | 	return(value);
52 | }
53 | #endif
54 | 


--------------------------------------------------------------------------------
/cmodules/qsem.h:
--------------------------------------------------------------------------------
 1 | #ifndef QSEM_H
 2 | #define QSEM_H
 3 | 
 4 | #ifdef _WIN32
 5 | #include "pthread_mutex_win32_wrapper.h"
 6 | #else
 7 | #include <semaphore.h>
 8 | #endif
 9 | 
// Thin wrapper around a counting semaphore (sem_t).
class qSem
{
public:
	// num is the initial count; the default of 1 makes the object usable like a mutex
	qSem(int num = 1);
	~qSem();

	int Lock();	// take one count, blocking; 0 on success
	int Unlock();	// add one count; 0 on success
	int Trylock();	// non-blocking Lock(); 0 on success, EBUSY if the count is zero, -1 on error
	int Query();	// current count, -1 on error (qsem.cpp defines this only when _WIN32 is not set)

private:
	int max;	// initial count passed to the constructor
	sem_t sem;
};
25 | 
26 | class qSignal
27 | {
28 | private:
29 | 	qSem sem;
30 | 
31 | public:
32 | 	qSignal() : sem(0) {}
33 | 	void Wait() {sem.Lock();}
34 | 	void Signal() {sem.Unlock();}
35 | };
36 | 
37 | #endif
38 | 


--------------------------------------------------------------------------------
/cmodules/sched_affinity_win32_wrapper.h:
--------------------------------------------------------------------------------
 1 | #ifndef SCHED_AFFINITY_WIN32_WRAPPER_H
 2 | #define SCHED_AFFINITY_WIN32_WRAPPER_H
 3 | 
 4 | typedef __int64 cpu_set_t;
 5 | typedef HANDLE pid_t;
 6 | 
 7 | static inline int CPU_ISSET(__int64 cpu, cpu_set_t *set)
 8 | {
 9 | 	return((*set & ((__int64) 1 << cpu)) ? 1 : 0);
10 | }
11 | 
12 | static inline int sched_setaffinity(pid_t pid, unsigned int cpusetsize, cpu_set_t *mask)
13 | {
14 | 	return(0);
15 | }
16 | static inline int sched_getaffinity(pid_t pid, unsigned int cpusetsize, cpu_set_t *mask)
17 | {
18 | 	return(0);
19 | }
20 | static inline void CPU_CLR(__int64 cpu, cpu_set_t *set)
21 | {
22 | 	*set &= (~((__int64) 1 << cpu));
23 | }
24 | 
25 | static inline void CPU_SET(__int64 cpu, cpu_set_t *set)
26 | {
27 | 	*set |= ((__int64) 1 << cpu);
28 | }
29 | 
30 | static inline void CPU_ZERO(cpu_set_t *set)
31 | {
32 | 	*set = 0;
33 | }
34 | 
35 | #endif


--------------------------------------------------------------------------------
/cmodules/switchtemplate.h:
--------------------------------------------------------------------------------
 1 | #ifndef QSWITCHTEMPLATE_H
 2 | #define QSWITCHTEMPLATE_H
// Q_SWITCH_TEMPLATE_BOOL(expr, varname, statement...)
// Turns a run-time boolean into a compile-time constant: expr is evaluated once and the
// statement(s) given as variadic arguments are expanded twice, once in a scope where
// `varname` is a `const int` equal to 1 (expr true) and once where it is 0 (expr false).
// Inside the statement, varname can therefore be used where a constant is required,
// e.g. as a template argument.
#define Q_SWITCH_TEMPLATE_BOOL(expr, varname, ...) \
	{ \
		if (expr) \
		{ \
			const int varname = 1; \
			__VA_ARGS__; \
		} \
		else \
		{ \
			const int varname = 0; \
			__VA_ARGS__; \
		} \
	}
16 | 
// Q_SWITCH_TEMPLATE_CASE4(val, varname, statement...)
// Same idea as Q_SWITCH_TEMPLATE_BOOL for a value in the range 0..3: the statement(s)
// are expanded once per case with `varname` declared as the matching `const int`.
// There is no default case, so any other value of val executes nothing.
// The expansion is a bare switch statement (no enclosing braces).
#define Q_SWITCH_TEMPLATE_CASE4(val, varname, ...) \
	switch (val) \
	{ \
	case 0: \
	{ \
		const int varname = 0; \
		__VA_ARGS__; \
		break; \
	} \
	case 1: \
	{ \
		const int varname = 1; \
		__VA_ARGS__; \
		break; \
	} \
	case 2: \
	{ \
		const int varname = 2; \
		__VA_ARGS__; \
		break; \
	} \
	case 3: \
	{ \
		const int varname = 3; \
		__VA_ARGS__; \
		break; \
	} \
	}
45 | 
46 | 
47 | #endif


--------------------------------------------------------------------------------
/cmodules/threadserver.cpp:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | 
3 | #ifndef STD_OUT
4 | #define STD_OUT stdout
5 | #endif
6 | 
7 | #include "threadserver.h"
8 | 


--------------------------------------------------------------------------------
/cmodules/threadserver.h:
--------------------------------------------------------------------------------
  1 | #ifndef THREADSERVER_H
  2 | #define THREADSERVER_H
  3 | 
  4 | #ifdef _WIN32
  5 | #include "pthread_mutex_win32_wrapper.h"
  6 | #include "sched_affinity_win32_wrapper.h"
  7 | #else
  8 | #include <pthread.h>
  9 | #include <sched.h>
 10 | #endif
 11 | #include "qsem.h"
 12 | 
// Empty tag type thrown by the thread server classes in this header
// (see qThreadClsArray::SetNumberOfThreads).
class qThreadServerException
{

};
 17 | 
 18 | template <class S, class T> class qThreadCls;
 19 | 
 20 | class qThreadParam
 21 | {
 22 | 	template <class S, class T> friend class qThreadCls;
 23 | 
 24 | public:
 25 | 	qThreadParam()
 26 | 	{
 27 | 		for (int i = 0;i < 2;i++) threadMutex[i].Lock();
 28 | 		terminate = false;
 29 | 		pinCPU = -1;
 30 | 	}
 31 | 
 32 | 	~qThreadParam()
 33 | 	{
 34 | 		for (int i = 0;i < 2;i++) threadMutex[i].Unlock();
 35 | 	}
 36 | 
 37 | 	bool WaitForTask()
 38 | 	{
 39 | 		threadMutex[1].Unlock();
 40 | 		threadMutex[0].Lock();
 41 | 		return(!terminate);
 42 | 	}
 43 | 
 44 | 	int threadNum;
 45 | 
 46 | protected:
 47 | 	int pinCPU;
 48 | 	qSem threadMutex[2];
 49 | 	volatile bool terminate;
 50 | };
 51 | 
// qThreadParam extended by the object and member function the worker thread runs.
// Both fields are filled in by qThreadCls::SpawnThread.
template <class S> class qThreadParamCls : public qThreadParam
{
	template <class SS, class TT> friend class qThreadCls;
	
private:
	S* pCls;	// object the member function is invoked on
	void (S::*pFunc)(void*);	// member function, stored with its parameter type erased to void*
};
 60 | 
 61 | template <class S, class T> static void* qThreadWrapperCls(void* arg);
 62 | 
 63 | template <class S, class T> class qThreadCls
 64 | {
 65 | public:
 66 | 	qThreadCls() {started = false;};
 67 | 	qThreadCls(S* pCls, void (S::*pFunc)(T*), int threadNum = 0, int pinCPU = -1) : threadParam() {started = false;SpawnThread(pCls, pFunc, threadNum, pinCPU);}
 68 | 
 69 | 	void SpawnThread(S* pCls, void (S::*pFunc)(T*), int threadNum = 0, int pinCPU = -1, bool wait = true)
 70 | 	{
 71 | 		qThreadParamCls<S>& XthreadParam = *((qThreadParamCls<S>*) &this->threadParam);
 72 | 
 73 | 		XthreadParam.pCls = pCls;
 74 | 		XthreadParam.pFunc = (void (S::*)(void*)) pFunc;
 75 | 		XthreadParam.threadNum = threadNum;
 76 | 		XthreadParam.pinCPU = pinCPU;
 77 | 		pthread_t thr;
 78 | 		pthread_create(&thr, NULL, (void* (*) (void*)) &qThreadWrapperCls, &XthreadParam);
 79 | 		if (wait) WaitForSpawn();
 80 | 		started = true;
 81 | 	}
 82 | 	
 83 | 	void WaitForSpawn()
 84 | 	{
 85 | 		threadParam.threadMutex[1].Lock();
 86 | 	}
 87 | 
 88 | 	~qThreadCls()
 89 | 	{
 90 | 		if (started)
 91 | 		{
 92 | 			End();
 93 | 		}
 94 | 	}
 95 | 
 96 | 	void End()
 97 | 	{
 98 | 		qThreadParamCls<S>& XthreadParam = *((qThreadParamCls<S>*) &this->threadParam);
 99 | 	
100 | 		XthreadParam.terminate = true;
101 | 		XthreadParam.threadMutex[0].Unlock();
102 | 		XthreadParam.threadMutex[1].Lock();
103 | 		started = false;
104 | 	}
105 | 
106 | 	void Start()
107 | 	{
108 | 		threadParam.threadMutex[0].Unlock();
109 | 	}
110 | 
111 | 	void Sync()
112 | 	{
113 | 		threadParam.threadMutex[1].Lock();
114 | 	}
115 | 		
116 | private:
117 | 	bool started;
118 | 	T threadParam;
119 | 	
120 | 	static void* qThreadWrapperCls(T* arg);
121 | };
122 | 
123 | template <class S, class T> void* qThreadCls<S, T>::qThreadWrapperCls(T* arg)
124 | {
125 | 	qThreadParamCls<S>* const arg_A = (qThreadParamCls<S>*) arg;
126 | 	if (arg_A->pinCPU != -1)
127 | 	{
128 | 		cpu_set_t tmp_mask;
129 | 		CPU_ZERO(&tmp_mask);
130 | 		CPU_SET(arg_A->pinCPU, &tmp_mask);
131 | 		sched_setaffinity(0, sizeof(tmp_mask), &tmp_mask);
132 | 	}
133 | 
134 | 	void (S::*pFunc)(T*) = (void (S::*)(T*)) arg_A->pFunc;
135 | 	(arg_A->pCls->*pFunc)(arg);
136 | 
137 | 	arg_A->threadMutex[1].Unlock();
138 | 	pthread_exit(NULL);
139 | 	return(NULL);
140 | }
141 | 
142 | template <class S, class T> class qThreadClsArray
143 | {
144 | public:
145 | 	qThreadClsArray() {pArray = NULL;nThreadsRunning = 0;}
146 | 	qThreadClsArray(int n, S* pCls, void (S::*pFunc)(T*), int threadNumOffset = 0, int* pinCPU = NULL) {pArray = NULL;nThreadsRunning = 0;SetNumberOfThreads(n, pCls, pFunc, threadNumOffset, pinCPU);}
147 | 
148 | 	void SetNumberOfThreads(int n, S* pCls, void (S::*pFunc)(T*), int threadNumOffset = 0, int* pinCPU = NULL)
149 | 	{
150 | 		if (nThreadsRunning)
151 | 		{
152 | 			fprintf(STD_OUT, "Threads already started\n");throw(qThreadServerException());
153 | 		}
154 | 		pArray = new qThreadCls<S, T>[n];
155 | 		nThreadsRunning = n;
156 | 		for (int i = 0;i < n;i++)
157 | 		{
158 | 			pArray[i].SpawnThread(pCls, pFunc, threadNumOffset + i, pinCPU == NULL ? -1 : pinCPU[i], false);
159 | 		}
160 | 		for (int i = 0;i < n;i++)
161 | 		{
162 | 			pArray[i].WaitForSpawn();
163 | 		}
164 | 	}
165 | 
166 | 	~qThreadClsArray()
167 | 	{
168 | 		if (nThreadsRunning)
169 | 		{
170 | 			EndThreads();
171 | 		}
172 | 	}
173 | 
174 | 	void EndThreads()
175 | 	{
176 | 		delete[] pArray;
177 | 		nThreadsRunning = 0;
178 | 	}
179 | 
180 | 	void Start()
181 | 	{
182 | 		for (int i = 0;i < nThreadsRunning;i++) pArray[i].Start();
183 | 	}
184 | 
185 | 	void Sync()
186 | 	{
187 | 		for (int i = 0;i < nThreadsRunning;i++) pArray[i].Sync();
188 | 	}
189 | 
190 | private:
191 | 	qThreadCls<S, T>* pArray;
192 | 	int nThreadsRunning;
193 | };
194 | 
195 | #endif
196 | 


--------------------------------------------------------------------------------
/cmodules/timer.cpp:
--------------------------------------------------------------------------------
 1 | #include "timer.h"
 2 | #ifdef _WIN32
 3 | #include <windows.h>
 4 | #include <winbase.h>
 5 | #else
 6 | #include <time.h>
 7 | #endif
 8 | 
 9 | HighResTimer::HighResTimer()
10 | {
11 | 	ElapsedTime = 0;
12 | 	running = 0;
13 | }
14 | 
15 | HighResTimer::~HighResTimer() {}
16 | 
17 | void HighResTimer::Start()
18 | {
19 | #ifdef _WIN32
20 | 	__int64 istart;
21 | 	QueryPerformanceCounter((LARGE_INTEGER*)&istart);
22 | 	StartTime = (double) istart;
23 | #else
24 | 	timespec tv;
25 | 	clock_gettime(CLOCK_REALTIME, &tv);
26 | 	StartTime = (double) tv.tv_sec * 1.0E9 + (double) tv.tv_nsec;
27 | #endif
28 | 	running = 1;
29 | }
30 | 
31 | void HighResTimer::ResetStart()
32 | {
33 | 	ElapsedTime = 0;
34 | 	Start();
35 | }
36 | 
37 | void HighResTimer::Stop()
38 | {
39 | 	if (running == 0) return;
40 | 	running = 0;
41 | 	double EndTime = 0;
42 | #ifdef _WIN32
43 | 	__int64 iend;
44 | 	QueryPerformanceCounter((LARGE_INTEGER*) &iend);
45 | 	EndTime = (double) iend;
46 | #else
47 | 	timespec tv;
48 | 	clock_gettime(CLOCK_REALTIME, &tv);
49 | 	EndTime = (double) tv.tv_sec * 1.0E9 + (double) tv.tv_nsec;
50 | #endif
51 | 	ElapsedTime += EndTime - StartTime;
52 | }
53 | 
54 | void HighResTimer::Reset()
55 | {
56 | 	ElapsedTime = 0;
57 | 	StartTime = 0;
58 | 	running = 0;
59 | }
60 | 
61 | double HighResTimer::GetElapsedTime()
62 | {
63 | 	return ElapsedTime / Frequency;
64 | }
65 | 
66 | double HighResTimer::GetCurrentElapsedTime()
67 | {
68 | 	if (running == 0) return(GetElapsedTime());
69 | 	double CurrentTime = 0;
70 | #ifdef _WIN32
71 | 	__int64 iend;
72 | 	QueryPerformanceCounter((LARGE_INTEGER*) &iend);
73 | 	CurrentTime = (double) iend;
74 | #else
75 | 	timespec tv;
76 | 	clock_gettime(CLOCK_REALTIME, &tv);
77 | 	CurrentTime = (double) tv.tv_sec * 1.0E9 + (double) tv.tv_nsec;
78 | #endif
79 | 	return((CurrentTime - StartTime + ElapsedTime) / Frequency);
80 | }
81 | 
82 | double HighResTimer::GetFrequency()
83 | {
84 | #ifdef _WIN32
85 | 	__int64 ifreq;
86 | 	QueryPerformanceFrequency((LARGE_INTEGER*)&ifreq);
87 | 	return((double) ifreq);
88 | #else
89 | 	return(1.0E9);
90 | #endif
91 | }
92 | 
//Ticks per second, determined once during static initialization
double HighResTimer::Frequency = HighResTimer::GetFrequency();


--------------------------------------------------------------------------------
/cmodules/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef TIMER_H
 2 | #define TIMER_H
 3 | 
// Stopwatch that accumulates time over any number of Start()/Stop() intervals.
class HighResTimer {

public:
	HighResTimer();
	~HighResTimer();
	void Start();	// begin (or resume) measuring
	void Stop();	// end the current interval and add it to the total
	void Reset();	// stop and clear the total
	void ResetStart();	// clear the total and start measuring
	double GetElapsedTime();	// total of all completed intervals, in seconds
	double GetCurrentElapsedTime();	// as above, plus the interval in progress if running

private:
	static double Frequency;	// time stamp ticks per second
	static double GetFrequency();

	double ElapsedTime;	// accumulated ticks
	double StartTime;	// time stamp taken by the last Start()
	int running;	// 1 between Start() and Stop(), else 0
}; 
24 | 
25 | #endif
26 | 


--------------------------------------------------------------------------------
/cmodules/util_adl.cpp:
--------------------------------------------------------------------------------
  1 | ///
  2 | ///  Copyright (c) 2008 - 2009 Advanced Micro Devices, Inc.
  3 |  
  4 | ///  THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND,
  5 | ///  EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED
  6 | ///  WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
  7 | 
  8 | /// \file main.c
  9 | /// \brief C/C++ ADL sample application
 10 | ///
 11 | /// Demonstrates some basic ADL functions - create, destroy, obtaining adapter and display information.
 12 | /// If the display capabilities allow, increases, decreases and restores the brightness of each display
 13 | 
 14 | #ifndef _NO_ADL
 15 | 
 16 | #ifdef _WIN32
 17 | #define WINDOWS
 18 | #else
 19 | #define LINUX
 20 | #endif
 21 | 
 22 | #include "../../ADL/include/adl_sdk.h"
 23 | #ifdef LINUX
 24 | #include <dlfcn.h>	//dlopen, dlsym, dlclose
 25 | #include <unistd.h>	//sleep
 26 | #else
 27 | #include <windows.h>
 28 | #include <winbase.h>
 29 | #endif
 30 | #include <stdlib.h>	
 31 | #include <string.h>	//strcpy, strlen
 32 | #include <stdio.h>
 33 | 
 34 | #ifndef MAINPROG
 35 | #include "util_adl.h"
 36 | #endif
 37 | 
 38 | #ifndef STD_OUT
 39 | #define STD_OUT stdout
 40 | #endif
 41 | 
 42 | // Function pointer types for the ADL entry points that adl_temperature_check_init() resolves at run time via dlsym(). Add more if you use other ADL APIs
 43 | typedef int ( *ADL_MAIN_CONTROL_CREATE )(ADL_MAIN_MALLOC_CALLBACK, int );
 44 | typedef int ( *ADL_MAIN_CONTROL_DESTROY )();
 45 | typedef int ( *ADL_ADAPTER_NUMBEROFADAPTERS_GET ) ( int* );
 46 | typedef int ( *ADL_ADAPTER_ADAPTERINFO_GET ) ( LPAdapterInfo, int );
 47 | typedef int ( *ADL_OVERDRIVE5_TEMPERATURE_GET ) ( int, int , ADLTemperature * );
 48 | typedef int ( *ADL_ADAPTER_ACTIVE_GET ) ( int, int* );
 49 | typedef int ( *ADL_ADAPTER_VIDEOBIOSINFO_GET ) ( int, ADLBiosInfo* );
 50 | typedef int ( *ADL_ADAPTER_ID_GET ) ( int, int* );
 51 | typedef int ( *ADL_OVERDRIVE5_POWERCONTROL_SET ) ( int, int );	 
 52 | 
 53 | // Memory allocation callback handed to ADL_Main_Control_Create: returns malloc(iSize) unchanged, so NULL is passed through on failure.
 54 | void* __stdcall ADL_Main_Memory_Alloc ( int iSize )
 55 | {
 56 |     void* lpBuffer = malloc ( iSize );
 57 |     return lpBuffer;
 58 | }
 59 | // ADL entry points; all of them are assigned from dlsym() lookups in adl_temperature_check_init().
 60 | ADL_MAIN_CONTROL_CREATE          ADL_Main_Control_Create;
 61 | ADL_MAIN_CONTROL_DESTROY         ADL_Main_Control_Destroy;
 62 | ADL_ADAPTER_NUMBEROFADAPTERS_GET ADL_Adapter_NumberOfAdapters_Get;
 63 | ADL_ADAPTER_ADAPTERINFO_GET      ADL_Adapter_AdapterInfo_Get;
 64 | ADL_OVERDRIVE5_TEMPERATURE_GET   ADL_Overdrive5_Temperature_Get;
 65 | ADL_ADAPTER_ACTIVE_GET           ADL_Adapter_Active_Get;
 66 | ADL_ADAPTER_VIDEOBIOSINFO_GET    ADL_Adapter_VideoBiosInfo_Get;
 67 | ADL_ADAPTER_ID_GET               ADL_Adapter_ID_Get;
 68 | ADL_OVERDRIVE5_POWERCONTROL_SET  ADL_Overdrive5_PowerControl_Set;
 69 | // Number of active adapters and their ADL adapter indices (array allocated with new[] in adl_temperature_check_init()).
 70 | int nAdapters;
 71 | int* nAdapterIndizes;
 72 | #ifdef LINUX
 73 | void *hDLL;		// Handle to .so library
 74 | #else
 75 | HINSTANCE hDLL;	// Handle to the ADL DLL
 76 | #endif
 77 | // Non-Linux builds only: dlsym() look-alike on top of GetProcAddress, so the symbol lookups in adl_temperature_check_init() need no #ifdef.
 78 | #ifndef LINUX
 79 | void* dlsym(HINSTANCE lib, char* name)
 80 | {
 81 | 	return(GetProcAddress(lib, name));
 82 | }
 83 | #endif
 84 | // Loads the ADL library, resolves the required entry points, initializes ADL and records the indices of all active adapters in nAdapterIndizes / nAdapters.
 85 | int adl_temperature_check_init()
 86 | {
 87 |     LPAdapterInfo     lpAdapterInfo = NULL;
 88 |     int  iNumberAdapters;
 89 | #ifdef LINUX
 90 |     setenv("DISPLAY", ":0", 1);	// overrides any existing DISPLAY; presumably ADL needs the local X server -- TODO confirm
 91 | 	hDLL = dlopen( "libatiadlxx.so", RTLD_LAZY|RTLD_GLOBAL);
 92 | #else
 93 | 	hDLL = LoadLibrary( "atiadlxx.dll" );
 94 | #endif
 95 |     // Return value: 0 on success, but also 0 for the early failures below (library or API missing, ADL init or adapter count query failing); 1 for the later failures.
 96 |     // NOTE(review): the mixed failure codes look unintentional -- confirm what callers rely on before changing them.
 97 |         if (NULL == hDLL)
 98 |         {
 99 |             printf("ADL library not found!\n");
100 |             return 0;
101 |         }
102 | 
103 |         ADL_Main_Control_Create = (ADL_MAIN_CONTROL_CREATE) (size_t) dlsym(hDLL,"ADL_Main_Control_Create");
104 |         ADL_Main_Control_Destroy = (ADL_MAIN_CONTROL_DESTROY) (size_t) dlsym(hDLL,"ADL_Main_Control_Destroy");
105 |         ADL_Adapter_NumberOfAdapters_Get = (ADL_ADAPTER_NUMBEROFADAPTERS_GET) (size_t) dlsym(hDLL,"ADL_Adapter_NumberOfAdapters_Get");
106 |         ADL_Adapter_AdapterInfo_Get = (ADL_ADAPTER_ADAPTERINFO_GET) (size_t) dlsym(hDLL,"ADL_Adapter_AdapterInfo_Get");
107 |         ADL_Overdrive5_Temperature_Get = (ADL_OVERDRIVE5_TEMPERATURE_GET) (size_t) dlsym(hDLL,"ADL_Overdrive5_Temperature_Get");
108 |         ADL_Adapter_Active_Get = (ADL_ADAPTER_ACTIVE_GET) (size_t) dlsym(hDLL,"ADL_Adapter_Active_Get");
109 |         ADL_Adapter_VideoBiosInfo_Get = (ADL_ADAPTER_VIDEOBIOSINFO_GET) (size_t) dlsym(hDLL, "ADL_Adapter_VideoBiosInfo_Get");
110 |         ADL_Adapter_ID_Get = (ADL_ADAPTER_ID_GET) (size_t) dlsym(hDLL, "ADL_Adapter_ID_Get");
111 | 		ADL_Overdrive5_PowerControl_Set = (ADL_OVERDRIVE5_POWERCONTROL_SET) (size_t) dlsym(hDLL, "ADL_Overdrive5_PowerControl_Set");
112 |         
113 |         // Every entry point is mandatory: give up if a single lookup failed.
114 | 		if ( NULL == ADL_Main_Control_Create || NULL == ADL_Main_Control_Destroy || NULL == ADL_Adapter_NumberOfAdapters_Get || NULL == ADL_Adapter_AdapterInfo_Get || NULL == ADL_Overdrive5_Temperature_Get ||
115 | 			NULL == ADL_Adapter_Active_Get || NULL == ADL_Adapter_VideoBiosInfo_Get || NULL == ADL_Adapter_ID_Get || NULL == ADL_Overdrive5_PowerControl_Set )
116 | 		{
117 | 			printf("ADL's API is missing!\n");
118 | 			return 0;
119 | 		}
120 | 
121 |         // Initialize ADL. The second parameter is 1, which means:
122 |         // retrieve adapter information only for adapters that are physically present and enabled in the system
123 |         if ( ADL_OK != ADL_Main_Control_Create (ADL_Main_Memory_Alloc, 1) )
124 | 	{
125 | 		printf("ADL Initialization Error!\n");
126 | 		return 0;
127 | 	}
128 | 
129 |         // Obtain the number of adapters for the system
130 |         if ( ADL_OK != ADL_Adapter_NumberOfAdapters_Get ( &iNumberAdapters ) )
131 | 	{
132 | 		printf("Cannot get the number of adapters!\n");
133 | 		return 0;
134 | 	}
135 | 
136 | #ifdef VERBOSE
137 | 	printf("Number of adapters: %d\n", iNumberAdapters);
138 | #endif
139 | 		
140 | 	if (iNumberAdapters == 0)
141 | 	{
142 | 		printf("No Adapter found\n");
143 | 		return(1);
144 | 	}
145 | 	// Fetch the info records of all adapters; a failed allocation is reported like a failed query.
146 | 	lpAdapterInfo = (AdapterInfo*) malloc( sizeof(AdapterInfo) * iNumberAdapters);
147 | 	if (lpAdapterInfo == NULL || ADL_Adapter_AdapterInfo_Get(lpAdapterInfo, sizeof(AdapterInfo) * iNumberAdapters) != ADL_OK)
148 | 	{
149 | 		printf("Error getting adapter info\n");
150 | 		free(lpAdapterInfo); return(1);	// do not leak the info array on the error path (free(NULL) is a no-op)
151 | 	}
152 | 	// Two passes over the adapters: pass 0 only counts the active ones so nAdapterIndizes can be sized, pass 1 stores their indices.
153 | 	for (int j = 0;j < 2;j++)
154 | 	{
155 | 		nAdapters = 0;
156 | 		for ( int i = 0; i < iNumberAdapters; i++ )
157 | 		{
158 | 			int status;
159 | 			if (ADL_Adapter_Active_Get(lpAdapterInfo[i].iAdapterIndex, &status) != ADL_OK)
160 | 			{
161 | 				printf("Error getting adapter status\n");
162 | 				free(lpAdapterInfo); return(1);	// do not leak the info array on the error path
163 | 			}
164 | 			if (status == ADL_TRUE)
165 | 			{
166 | 				if (j)
167 | 				{
168 | 					nAdapterIndizes[nAdapters] = lpAdapterInfo[i].iAdapterIndex;
169 | #ifdef VERBOSE
170 | 					ADLBiosInfo biosInfo;
171 | 					ADL_Adapter_VideoBiosInfo_Get(nAdapterIndizes[nAdapters], &biosInfo);
172 | 					int UID;
173 | 					ADL_Adapter_ID_Get(nAdapterIndizes[nAdapters], &UID);
174 | 					printf("Adapter %d (%s) Info: Bios %s %s %s, UID %d\n", nAdapters, lpAdapterInfo[i].strAdapterName, biosInfo.strPartNumber, biosInfo.strVersion, biosInfo.strDate, UID);
175 | #endif
176 | 				}
177 | 				nAdapters++;
178 | 			}
179 | 		}
180 | 		if (j == 0) nAdapterIndizes = new int[nAdapters];	// sized by the count from pass 0
181 | 	}
182 | 	free(lpAdapterInfo);
183 | 	return(0);
184 | }
185 | // Reads the temperature of every adapter recorded by adl_temperature_check_init() and stores the maximum in *max_temperature (raw ADL value / 1000; presumably degrees Celsius -- TODO confirm). With verbose != 0 the individual values are printed to STD_OUT as one line. Returns 0 on success, 1 if a read fails.
186 | int adl_temperature_check_run(double* max_temperature, int verbose)
187 | {
188 | 	*max_temperature = 0.;
189 | 	char tmpbuffer[128];
190 | 	if (verbose) strcpy(tmpbuffer, "Temperatures:");
191 | 	for (int i = 0;i < nAdapters;i++)
192 | 	{
193 | 		ADLTemperature temp;
194 | 		temp.iSize = sizeof(temp);
195 | 		if (ADL_Overdrive5_Temperature_Get(nAdapterIndizes[i], 0, &temp) != ADL_OK)
196 | 		{
197 | 			printf("Error reading temperature from adapter %d\n", i);
198 | 			return(1);
199 | 		}
200 | 		const double temperature = temp.iTemperature / 1000.;
201 | 		if (verbose && strlen(tmpbuffer) + 32 <= sizeof(tmpbuffer)) sprintf(tmpbuffer + strlen(tmpbuffer), " %f", temperature);	// append only while at least 32 bytes remain (a " %f" of an int / 1000 stays well below that); further values are dropped from the printed line instead of overflowing the buffer
202 | 		if (temperature > *max_temperature) *max_temperature = temperature;
203 |         }
204 |         if (verbose) fprintf(STD_OUT, "%s\n", tmpbuffer);
205 |         return(0);
206 | }
207 | // Shuts ADL down, unloads the library and releases the adapter index array. Safe to call after adl_temperature_check_init() bailed out early (library or symbols missing) and safe to call repeatedly. Always returns 0.
208 | int adl_temperature_check_exit()
209 | {
210 |     if (ADL_Main_Control_Destroy != NULL) { ADL_Main_Control_Destroy (); ADL_Main_Control_Destroy = NULL; }	// the pointer is NULL when the library or the symbol was never found
211 | #ifdef LINUX
212 |     if (hDLL != NULL) { dlclose(hDLL); hDLL = NULL; }
213 | #else
214 | 	if (hDLL != NULL) { FreeLibrary(hDLL); hDLL = NULL; }
215 | #endif
216 |     delete[] nAdapterIndizes; nAdapterIndizes = NULL; nAdapters = 0;	// allocated with new[] in init; zero the count so later run / powertune calls touch no stale entry points
217 |     return(0);
218 | }
219 | // Applies the PowerTune value val to every adapter recorded by adl_temperature_check_init(). Failures are only reported via printf; the function always returns 0.
220 | int adl_powertune_set(int val)
221 | {
222 | 	for (int i = 0;i < nAdapters;i++)
223 | 	{
224 | 		if (ADL_Overdrive5_PowerControl_Set(nAdapterIndizes[i], val))	// any non-zero result is treated as an error
225 | 		{
226 | 			printf("Error setting powertune to adapter %d (val %d)\n", i, val);
227 | 		}
228 | 	}
229 | 	return(0);
230 | }
231 | // Stand-alone driver, compiled only with MAINPROG: prints the adapter temperatures and their maximum, optionally applies argv[1] as PowerTune value.
232 | #ifdef MAINPROG
233 | int main (int argc, char** argv)
234 | {
235 | 	double temperature;
236 | 	if (adl_temperature_check_init())
237 | 	{
238 | 		printf("Error initializing ADL\n");
239 | 		return(1);
240 | 	}
241 | 	if (adl_temperature_check_run(&temperature, 1))	// verbose = 1: also print the per-adapter values
242 | 	{
243 | 		printf("Error running ADL temperature check\n");
244 | 		return(1);
245 | 	}
246 | 	printf("Maximum Temperature: %f\n", temperature);
247 | 	if (argc > 1)	// optional first argument: PowerTune value for all adapters
248 | 	{
249 | 		adl_powertune_set(atoi(argv[1]));
250 | 	}
251 | 	if (adl_temperature_check_exit())
252 | 	{
253 | 		printf("Error exiting ADL\n");
254 | 		return(1);
255 | 	}
256 | }
257 | #endif
258 | 
259 | #endif
260 | 


--------------------------------------------------------------------------------
/cmodules/util_adl.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * C interface to the ADL temperature and PowerTune helpers (util_adl.cpp)
 3 |  *
 4 |  * Copyright 2010:
 5 |  *  - David Rohr (drohr@jwdt.org)
 6 |  *  - Matthias Bach (bach@compeng.uni-frankfurt.de)
 7 |  *  - Matthias Kretz (kretz@compeng.uni-frankfurt.de)
 8 |  *
 9 |  * This file is part of HPL-GPU.
10 |  *
11 |  * HPL-GPU is free software: you can redistribute it and/or modify
12 |  * it under the terms of the GNU General Public License as published by
13 |  * the Free Software Foundation, either version 3 of the License, or
14 |  * (at your option) any later version.
15 |  *
16 |  * HPL-GPU is distributed in the hope that it will be useful,
17 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 |  * GNU General Public License for more details.
20 |  *
21 |  * You should have received a copy of the GNU General Public License
22 |  * along with HPL-GPU.  If not, see <http://www.gnu.org/licenses/>.
23 |  *
24 |  * In addition to the rules layed out by the GNU General Public License
25 |  * the following exception is granted:
26 |  *
27 |  * Use with the Original BSD License.
28 |  *
29 |  * Notwithstanding any other provision of the GNU General Public License
30 |  * Version 3, you have permission to link or combine any covered work with
31 |  * a work licensed under the 4-clause BSD license into a single combined
32 |  * work, and to convey the resulting work.  The terms of this License will
33 |  * continue to apply to the part which is the covered work, but the special
34 |  * requirements of the 4-clause BSD license, clause 3, concerning the
35 |  * requirement of acknowledgement in advertising materials will apply to
36 |  * the combination as such.
37 |  */
38 | 
39 | #ifndef UTIL_ADL_H
40 | #define UTIL_ADL_H
41 | 
42 | #ifdef __cplusplus
43 | extern "C"
44 | {
45 | #endif
46 | // Entry points implemented in util_adl.cpp; C linkage when included from C++.
47 | int adl_temperature_check_init();	// load the ADL library and record the active adapters
48 | int adl_temperature_check_run(double*, int);	// (out: maximum temperature, verbose flag); returns 0 on success, 1 if a read fails
49 | int adl_temperature_check_exit();	// shut ADL down and unload the library; returns 0
50 | int adl_powertune_set(int);	// set the PowerTune value on all recorded adapters; always returns 0
51 | 
52 | #ifdef __cplusplus
53 | }
54 | #endif
55 | 
56 | #endif
57 | 
58 | 
59 | 


--------------------------------------------------------------------------------
/config.mak:
--------------------------------------------------------------------------------
  1 | include config_options_load.mak
  2 | 
  3 | INTELARCH					= SSE4.2
  4 | CUDAVERSION					= 35 61
  5 | CUDAREGS					= 255
  6 | ARCHBITS					= 64
  7 | 
  8 | HIDEECHO					= @
  9 | CC_x86_64-pc-linux-gnu		= GCC
 10 | CC_i686-pc-cygwin			= ICC
 11 | 
 12 | TARGET						= dgemm_bench
 13 | 
 14 | LIBS						= 
 15 | LIBPATHS					= 
 16 | 
 17 | LIBS						=
 18 | EXTRAOBJFILES				=
 19 | 
 20 | CONFIG_STATIC				= 0
 21 | ifeq ($(CONFIG_LTO), 1)
 22 | EXTRAFLAGSGCC				= -flto
 23 | EXTRAFLAGSLINK				= -flto
 24 | endif
 25 | 
 26 | CPPFILES					= caldgemm.cpp benchmark.cpp cmodules/timer.cpp cmodules/qmalloc.cpp caldgemm_cpu.cpp cmodules/affinity.cpp cmodules/threadserver.cpp cmodules/qsem.cpp caldgemm_adl.cpp
 27 | CXXFILES					=
 28 | ASMFILES					=
 29 | CUFILES						=
 30 | 
 31 | COMPILER_FLAGS				= OPT
 32 | #Without the AMD APP SDK (AMDAPPSDKROOT empty) the CAL backend cannot be built: warn if it was requested, then disable it
 33 | ifeq ($(AMDAPPSDKROOT), )
 34 | ifeq ($(INCLUDE_CAL), 1)
 35 | $(warning CAL not found, disabling INCLUDE_CAL)
 36 | endif
 37 | INCLUDE_CAL					= 0
 38 | endif
 39 | #Same for CUDA and CUBLAS when CUDA_PATH is empty. $(warning ...) is required here: a bare "warning ..." line is a make syntax error
 40 | ifeq ("$(CUDA_PATH)", "")
 41 | ifeq ($(INCLUDE_CUDA), 1)
 42 | $(warning CUDA not found, disabling INCLUDE_CUDA)
 43 | endif
 44 | INCLUDE_CUDA				= 0
 45 | INCLUDE_CUBLAS				= 0
 46 | endif
 47 | #Everything below requires CONFIGURED = 1 (set in config_options.mak); otherwise make stops with the error at the end of this block
 48 | ifeq ($(CONFIGURED), 1)
 49 | #CUDA backend, optionally with CUBLAS
 50 | ifeq ($(INCLUDE_CUDA), 1)
 51 | CONFIG_CUDA					= 1
 52 | CUFILES						+= caldgemm_cuda.cu
 53 | DEFINES						+= CALDGEMM_CUDA
 54 | ifeq ($(INCLUDE_CUBLAS), 1)
 55 | CONFIG_CUBLAS					= 1
 56 | DEFINES						+= CALDGEMM_CUDA_CUBLAS
 57 | endif
 58 | endif
 59 | #OpenCL backend
 60 | ifeq ($(INCLUDE_OPENCL), 1)
 61 | CONFIG_OPENCL				= 1
 62 | CPPFILES					+= caldgemm_opencl.cpp
 63 | DEFINES						+= CALDGEMM_OPENCL
 64 | endif
 65 | #CAL backend
 66 | ifeq ($(INCLUDE_CAL), 1)
 67 | CONFIG_CAL					= 1
 68 | CPPFILES					+= caldgemm_cal.cpp
 69 | DEFINES						+= CALDGEMM_CAL
 70 | endif
 71 | #BLAS backend: GOTOBLAS, MKL or ACML; anything else is an error. NOTE(review): the ACML branch adds $(CBLAS_PATH)/include to LIBPATHS, which looks like a copy/paste slip -- confirm
 72 | ifeq ($(BLAS_BACKEND), GOTOBLAS)
 73 | INCLUDEPATHS				+= $(GOTOBLAS_PATH)
 74 | DEFINES						+= USE_GOTO_BLAS
 75 | ifeq ($(ARCH), i686-pc-cygwin)
 76 | EXTRAOBJFILES				+= $(GOTOBLAS_PATH)/libgoto2.lib
 77 | else
 78 | #LIBS						+= gfortran
 79 | EXTRAOBJFILES				+= $(GOTOBLAS_PATH)/libgoto2.a
 80 | endif
 81 | else
 82 | ifeq ($(BLAS_BACKEND), MKL)
 83 | INCLUDEPATHS				+= $(MKL_PATH)/include
 84 | LIBS						+= iomp5 mkl_intel_lp64 mkl_core mkl_intel_thread
 85 | LIBPATHS					+= $(MKL_PATH)/lib/intel64/
 86 | ifneq ($(ICC_PATH), )
 87 | LIBPATHS					+= $(ICC_PATH)/lib/intel64/
 88 | endif
 89 | DEFINES						+= USE_MKL
 90 | CONFIG_OPENMP				= 1
 91 | else
 92 | ifeq ($(BLAS_BACKEND), ACML)
 93 | INCLUDEPATHS				+= $(CBLAS_PATH)/include
 94 | LIBPATHS				+= $(ACML_PATH)/lib $(CBLAS_PATH)/include
 95 | LIBS					+= acml_mp
 96 | EXTRAOBJFILES				+= $(CBLAS_PATH)/lib/cblas_LINUX.a
 97 | CONFIG_OPENMP				= 1
 98 | LIBS						+= gfortran
 99 | else
100 | $(error No valid BLAS_BACKEND selected)
101 | endif
102 | endif
103 | endif
104 | #The VampirTrace include directory of OpenMPI is always added to the include paths
105 | INCLUDEPATHS				+= $(OPENMPI_PATH)/include/vampirtrace
106 | 
107 | else
108 | $(error Not configured yet, adapt config_options.mak!)
109 | endif
110 | #caldgemm_config.h is created from its sample when it does not exist yet
111 | caldgemm_config.h:
112 | 							cp caldgemm_config.sample caldgemm_config.h
113 | #Both generated configuration files are added to ALLDEP
114 | ALLDEP						+= caldgemm_config.h config_options.mak
115 | #config_options.mak is created from its sample when it does not exist yet
116 | config_options.mak:
117 | 							cp config_options.sample config_options.mak 
118 | #Per-file extra compiler flags: disable the strict-aliasing warning for these sources
119 | FILEFLAGSbenchmark.cpp			= -Wno-strict-aliasing
120 | FILEFLAGScaldgemm.cpp			= -Wno-strict-aliasing
121 | FILEFLAGScaldgemm_cal.cpp			= -Wno-strict-aliasing
122 | FILEFLAGScaldgemm_opencl.cpp			= -Wno-strict-aliasing
123 | 


--------------------------------------------------------------------------------
/config_options.sample:
--------------------------------------------------------------------------------
 1 | #Select BLAS Backend to use. Available options: MKL, ACML, GOTOBLAS
 2 | BLAS_BACKEND				= MKL
 3 | 
 4 | #Select which GPU backends are compiled. (The CPU backend is always compiled)
 5 | INCLUDE_OPENCL				= 1
 6 | INCLUDE_CAL				= 1
 7 | INCLUDE_CUDA				= 1
 8 | INCLUDE_CUBLAS				= 1
 9 | 
10 | #Other Config options
11 | #Use link time optimization
12 | CONFIG_LTO				= 1
13 | 
14 | #Mark CONFIGURED = 1 to enable compilation
15 | CONFIGURED				= 0
16 | 


--------------------------------------------------------------------------------
/config_options_load.mak:
--------------------------------------------------------------------------------
 1 | include $(CALDGEMM_MAKE_DIR_PRE)config_options.mak
 2 | #Without the AMD APP SDK (AMDAPPSDKROOT empty) force INCLUDE_CAL to 0, warning if it had been requested
 3 | ifeq ($(AMDAPPSDKROOT), )
 4 | ifeq ($(INCLUDE_CAL), 1)
 5 | $(warning CAL not found, disabling INCLUDE_CAL)
 6 | endif
 7 | INCLUDE_CAL				= 0
 8 | endif
 9 | #Without CUDA (CUDA_PATH empty) force INCLUDE_CUDA and INCLUDE_CUBLAS to 0, warning if CUDA had been requested
10 | ifeq ("$(CUDA_PATH)", "")
11 | ifeq ($(INCLUDE_CUDA), 1)
12 | $(warning CUDA not found, disabling INCLUDE_CUDA)
13 | endif
14 | INCLUDE_CUDA				= 0
15 | INCLUDE_CUBLAS				= 0
16 | endif
17 | 


--------------------------------------------------------------------------------
/cudakernel.cu:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This file is part of the CALDGEMM library.
 3 |  *
 4 |  * Copyright 2015:
 5 |  *  - David Rohr (drohr@jwdt.org)
 6 |  *  - Matthias Bach (bach@compeng.uni-frankfurt.de)
 7 |  *  - Matthias Kretz (kretz@compeng.uni-frankfurt.de)
 8 |  *
 9 |  * This file is part of CALDGEMM.
10 |  *
11 |  * CALDGEMM is free software: you can redistribute it and/or modify
12 |  * it under the terms of the GNU Lesser General Public License as published by
13 |  * the Free Software Foundation, either version 3 of the License, or
14 |  * (at your option) any later version.
15 |  *
16 |  * CALDGEMM is distributed in the hope that it will be useful,
17 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 |  * GNU Lesser General Public License for more details.
20 |  *
21 |  * You should have received a copy of the GNU Lesser General Public License
22 |  * along with CALDGEMM.  If not, see <http://www.gnu.org/licenses/>.
23 |  */
24 | // DGEMM kernel without tiling or shared memory: C[j*pitch+i] = Alpha * sum_k(A[j*width+k] * B[i*width+k]) + Beta * C[j*pitch+i] for j < height2 and i < height1, i.e. rows of A times rows of B.
25 | __global__ void CUDAKernelName(double* C, double* A, double* B, size_t height1, size_t height2, size_t width, double Alpha, double Beta, size_t pitch)
26 | {
27 | 	for (int j = blockIdx.y * blockDim.y + threadIdx.y;j < height2;j += blockDim.y * gridDim.y)	// grid-stride loop over the rows j of C (y dimension)
28 | 	{
29 | 		for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < height1;i += blockDim.x * gridDim.x)	// grid-stride loop over the columns i of C (x dimension)
30 | 		{
31 | 			double addval = 0;
32 | #ifdef CALDGEMM_FORCE_K
33 | 			for (int k = 0;k < CALDGEMM_FORCE_K;k++)	// compile-time trip count; the row stride below is still the run-time width
34 | #else
35 | 			for (int k = 0;k < width;k++)
36 | #endif
37 | 			{
38 | 				addval += A[j * width + k] * B[i * width + k];	// both operands are read along rows of length width
39 | 			}
40 | 			double* destptr = &C[j * pitch + i];	// C uses pitch as its row stride
41 | 			*destptr = Alpha * addval + Beta * *destptr;
42 | 		}
43 | 	}
44 | }	// NOTE(review): i, j and k are int while the bounds are size_t -- fine for sizes below 2^31, confirm that this is guaranteed
45 | 


--------------------------------------------------------------------------------
/environment/caldgemm_setenv.sh.sample:
--------------------------------------------------------------------------------
 1 | if [ "0$CALDGEMM_ENVIRONMENT" = "01" ]; then	# already set up ("0" + exported flag "1"): skip, so the paths below are not added a second time
 2 |     return 0 2>/dev/null || exit 0	# 'return' when this file is sourced (an 'exit' would close the calling shell); 'exit' only when it is executed directly
 3 | fi
 4 | 
 5 | ###Paths to required software
 6 | #Path to AMD APP SDK
 7 | export AMDAPPSDKROOT=$HOME/AMD-APP-SDK-v2.9-RC-lnx64
 8 | #Path to OpenMPI
 9 | export OPENMPI_PATH=/opt/openmpi183
10 | #Base path to Intel software
11 | export INTELPATH=$HOME/intel
12 | #Path to the Intel MKL BLAS library (usually inside $INTELPATH)
13 | export MKL_PATH=$INTELPATH/mkl
14 | #Path to libiomp (as part of ICC or standalone)
15 | export ICC_PATH=$INTELPATH
16 | #Path to the Intel TBB library (Uncomment if you want to use the Intel-provided TBB instead of having hpl-gpu compile tbb)
17 | #export TBB_PATH=$INTELPATH/tbb
18 | #Path to the GotoBLAS BLAS library
19 | export GOTOBLAS_PATH=$HOME/GotoBLAS2
20 | #Path to AMD ACML BLAS library
21 | export ACML_PATH=$HOME/acml/gfortran64_mp
22 | #Path to the CBLAS interface (required for the ACML BLAS library)
23 | export CBLAS_PATH=$HOME/CBLAS
24 | #Path to NVIDIA CUDA SDK
25 | export CUDA_PATH=/usr/local/cuda
26 | 
27 | ###Add all required paths to $LD_LIBRARY_PATH
28 | #We want to use the most recent AMD OpenCL library. Usually this comes with the driver. If the SDK is newer than the driver, uncomment the next line.
29 | #export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$AMDAPPSDKROOT/lib/x86_64
30 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_PATH/lib64
31 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$ICC_PATH/lib/intel64:$MKL_PATH/lib/intel64
32 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$ACML_PATH/lib
33 | export LD_LIBRARY_PATH=$OPENMPI_PATH/lib:$LD_LIBRARY_PATH
34 | #We need one library path at the very beginning that overrides all others for preloading libraries
35 | export LD_LIBRARY_PATH=$HOME/lib:$LD_LIBRARY_PATH
36 | 
37 | ###Add OpenMPI to $PATH for mpirun command
38 | export PATH=$OPENMPI_PATH/bin:$PATH
39 | 
40 | ###Set some environment variables for AMD GPUs and for Headless X Setup
41 | export GPU_FORCE_64BIT_PTR=1
42 | export GPU_NUM_COMPUTE_RINGS=1
43 | export DISPLAY=:0
44 | export COMPUTE=:0
45 | 
46 | ###Set correct ulimits for memory allocation
47 | ulimit -v unlimited
48 | ulimit -m unlimited
49 | ulimit -l unlimited
50 | 
51 | #We need to set these values only once; the flag below is what the check at the top of this file tests
52 | export CALDGEMM_ENVIRONMENT=1
53 | 


--------------------------------------------------------------------------------
/gcc_patch/libgomp.patch:
--------------------------------------------------------------------------------
 1 | --- ../gcc-4.6.1/libgomp/team.c	2009-05-20 22:54:45.000000000 +0200
 2 | +++ libgomp/team.c	2012-07-30 11:35:18.742162635 +0200
 3 | @@ -29,6 +29,8 @@
 4 |  #include <stdlib.h>
 5 |  #include <string.h>
 6 |  
 7 | +#define GOMP_KEEP_THREAD ((void*) (size_t) -1)
 8 | +
 9 |  /* This attribute contains PTHREAD_CREATE_DETACHED.  */
10 |  pthread_attr_t gomp_thread_attr;
11 |  
12 | @@ -116,11 +118,18 @@
13 |  	  gomp_team_barrier_wait (&team->barrier);
14 |  	  gomp_finish_task (task);
15 |  
16 | -	  gomp_barrier_wait (&pool->threads_dock);
17 | +	  do
18 | +	  {
19 | +	    gomp_barrier_wait (&pool->threads_dock);
20 | +	    local_fn = thr->fn;
21 | +	    thr->fn = NULL;
22 | +	    if (local_fn == GOMP_KEEP_THREAD)
23 | +	    {
24 | +		    gomp_team_barrier_wait(&thr->ts.team->barrier);
25 | +	    }
26 | +	  } while (local_fn == GOMP_KEEP_THREAD);
27 |  
28 | -	  local_fn = thr->fn;
29 |  	  local_data = thr->data;
30 | -	  thr->fn = NULL;
31 |  	}
32 |        while (local_fn);
33 |      }
34 | @@ -258,7 +267,7 @@
35 |    struct gomp_task_icv *icv;
36 |    bool nested;
37 |    struct gomp_thread_pool *pool;
38 | -  unsigned i, n, old_threads_used = 0;
39 | +  unsigned i, j, n, old_threads_used = 0;
40 |    pthread_attr_t thread_attr, *attr;
41 |  
42 |    thr = gomp_thread ();
43 | @@ -346,6 +355,16 @@
44 |  	  nthr->data = data;
45 |  	  team->ordered_release[i] = &nthr->release;
46 |  	}
47 | +      if (nthreads < old_threads_used)
48 | +        {
49 | +          for (j = i;j < old_threads_used;j++)
50 | +	    {
51 | +	      nthr = pool->threads[j];
52 | +	      nthr->fn = GOMP_KEEP_THREAD;
53 | +	      nthr->ts.team = team;
54 | +	    }
55 | +	  gomp_barrier_reinit (&team->barrier, old_threads_used);
56 | +	}
57 |  
58 |        if (i == nthreads)
59 |  	goto do_release;
60 | @@ -434,7 +453,7 @@
61 |       that should arrive back at the end of this team.  The extra
62 |       threads should be exiting.  Note that we arrange for this test
63 |       to never be true for nested teams.  */
64 | -  if (__builtin_expect (nthreads < old_threads_used, 0))
65 | +  if (nested && __builtin_expect (nthreads < old_threads_used, 0))
66 |      {
67 |        long diff = (long) nthreads - (long) old_threads_used;
68 |  
69 | @@ -448,6 +467,7 @@
70 |        gomp_mutex_unlock (&gomp_remaining_threads_lock);
71 |  #endif
72 |      }
73 | +    if (!nested && old_threads_used > nthreads) pool->threads_used = old_threads_used;
74 |  }
75 |  
76 |  
77 | 


--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
1 | include makefiles/makefile


--------------------------------------------------------------------------------
/makefiles/as:
--------------------------------------------------------------------------------
1 | /cygdrive/c/utility/cygwin/bin/x86_64-w64-mingw32-as


--------------------------------------------------------------------------------
/makefiles/callvc.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | IF '%2' == '0' CALL %1
 3 | IF '%2' == '1' CALL %1 > nul
 4 | SET VCEXECUTE=
 5 | SET BAT_SPECIAL_CHAR=0
 6 | REM Arguments: 1 = environment batch file to call first, 2 = 0 to show or 1 to hide its output, 3 and following = tokens of the command, which is rebuilt in VCEXECUTE and then executed
 7 | :CHECKNEXT
 8 | IF '%3' == '' GOTO EXECUTE
 9 | IF '%3' == 'BAT_SPECIAL_EQ' GOTO SPECIAL_EQ
10 | IF '%3' == 'BAT_SPECIAL_KOMMA' GOTO SPECIAL_KOMMA
11 | IF '%BAT_SPECIAL_CHAR%' == '0' GOTO INSERT_SPACE
12 | SET BAT_SPECIAL_CHAR=0
13 | SET VCEXECUTE=%VCEXECUTE%%3
14 | GOTO DO_SHIFT
15 | REM Normal token that does not follow a placeholder: append it with a separating space
16 | :INSERT_SPACE
17 | SET BAT_SPECIAL_CHAR=0
18 | SET VCEXECUTE=%VCEXECUTE% %3
19 | GOTO DO_SHIFT
20 | REM Placeholder BAT_SPECIAL_EQ: append an equals sign, the next token is then glued on without a space
21 | :SPECIAL_EQ
22 | SET BAT_SPECIAL_CHAR=1
23 | SET VCEXECUTE=%VCEXECUTE%=
24 | GOTO DO_SHIFT
25 | REM Placeholder BAT_SPECIAL_KOMMA: append a comma, the next token is then glued on without a space
26 | :SPECIAL_KOMMA
27 | SET BAT_SPECIAL_CHAR=1
28 | SET VCEXECUTE=%VCEXECUTE%,
29 | REM SPECIAL_KOMMA falls through to DO_SHIFT, which advances to the next token
30 | :DO_SHIFT
31 | SHIFT
32 | GOTO CHECKNEXT
33 | REM All tokens consumed: run the assembled command and exit with code 1 if it left a non-zero ERRORLEVEL
34 | :EXECUTE
35 | %VCEXECUTE%
36 | IF '%ERRORLEVEL%' == '0' GOTO ALLOK
37 | ECHO ERROR
38 | EXIT 1
39 | :ALLOK
40 | SET VCEXECUTE=
41 | 


--------------------------------------------------------------------------------
/makefiles/config.mak.sample:
--------------------------------------------------------------------------------
 1 | #make command line options:
 2 | #VERBOSE=1 //Do not hide command lines
 3 | #CHECK_DEPENDENCIES=0 //Do not perform dependency file creation
 4 | 
 5 | include config_options.mak  #will be autocreated by config_options.sample
 6 | 
 7 | INTELARCH					= Host
 8 | GCCARCH						= native
 9 | MSVCFAVOR					= INTEL64
10 | CUDAVERSION					= 10 13 20 30 35
11 | CUDAREGS					= 64
12 | ARCHBITS					= 64
13 | 
14 | HIDEECHO					= @
15 | #HIDEECHO					= -
16 | CC_x86_64-pc-linux-gnu		= GCC
17 | CC_i686-pc-cygwin			= ICC
18 | 
19 | EXTRAFLAGSGCC				= -Weffc++
20 | EXTRAOBJFILES				= objfile.obj
21 | EXTRAFLAGSLINK				= 
22 | EXTRADEPS					= non_obj_file_required.dat
23 | 
24 | INCLUDEPATHS				= include subdir/include
25 | DEFINES						= SAMPLE_DEFINE=1
26 | LIBPATHS					= /usr/local/lib/sample
27 | LIBS						= sample
28 | 
29 | #CONFIG_STATIC				= 1
30 | #Feature switches. Keep comments off the assignment lines: make keeps the blanks in front of a trailing '#' as part of the value
31 | CONFIG_DIRECTX				= 1
32 | CONFIG_OPENCL				= 1
33 | CONFIG_OPENGL				= 0
34 | CONFIG_OPENMP				= 0
35 | CONFIG_GDB					= 1
36 | CONFIG_CAL					= 0
37 | #CONFIG_X11					=
38 | CONFIG_CUDA					= 0
39 | CONFIG_CUBLAS					= 0
40 | #CONFIG_VIDEO_EDIT			=
41 | CONFIG_OPENCL_VERSION		= All
42 | #CONFIG_OPENCL_VERSION options are: AMD, NVIDIA, Intel, All, empty defaults to all. CONFIG_GDB is 1 by default
43 | CONFIG_QT					= 0
44 | CONFIG_CPP11				= 0
45 | 
46 | include						config_common.mak
47 | 
48 | TARGET						= ca
49 | SUBTARGETS					= libAliHLTTPCCAGPUSA
50 | SUBTARGETS_CLEAN				= libAliHLTTPCCAGPUSA.*
51 | #TARGETPATH					= ../
52 | #Type of the build target and the suffix of the work path derived from it. Note $(TARGET): a bare $TARGET would expand as $(T) followed by the text ARGET
53 | TARGETTYPE					= LIB
54 | WORKPATHSUFFIX				= $(TARGETTYPE)_$(TARGET)
55 | 
56 | CPPFILES					= sample.cpp
57 | CXXFILES					= subdir/sample.cxx \
58 | 								subdir/sample2.cxx
59 | ASMFILES					= sample.asm
60 | CUFILES						= sample.cu
61 | RESOURCEFILES				= myresource.dat
62 | CLFILES						= opencl_file.cl
63 | CFILES						= sample.c
64 | 
65 | QTFILES						= 
66 | QTCPPFILES					=
67 | 
68 | OPENCL_OPTIONS				= -x clc++
69 | 
70 | #Set all compiler flags to optimized/debug or set compiler flags individually
71 | COMPILER_FLAGS				= OPT
72 | #INTELFLAGSUSE				= $(INTELFLAGSOPT)
73 | #VSNETFLAGSUSE				= $(VSNETFLAGSOPT)
74 | #GCCFLAGSUSE				= $(GCCFLAGSOPT)
75 | #NVCCFLAGSUSE				= $(NVCCFLAGSOPT)
76 | 
77 | CONFIG_LTO					= 1
78 | CONFIG_CUDA_DC				= 1
79 | 


--------------------------------------------------------------------------------
/makefiles/i686-pc-cygwin.mak:
--------------------------------------------------------------------------------
  1 | #Set these Compiler Paths and Variables to your needs! 
  2 | VSPATH							:= ${VS120COMNTOOLS}../..
  3 | VSPATH10						:= ${VS100COMNTOOLS}../..
  4 | VSPATH9							:= ${VS90COMNTOOLS}../..
  5 | VSPATH6							:= c:/Utility/Speeches/Visual Studio 6
  6 | ICCPATH							:= ${ICPP_COMPILER14}
  7 | VECTORCPATH						:= c:/Utility/speeches/Codeplay
  8 | WINPATH							:= /cygdrive/c/Windows
  9 | CUDAPATH						:= $(CUDA_PATH)/
 10 | AMDPATH							:= $(AMDAPPSDKROOT)/
 11 | CUDASDKPATH						:= $(CUDAPATH)
 12 | DIRECTXPATH						:= $(DXSDK_DIR)
 13 | QTPATH							:= C:/Utility/Speeches/Qt/5.2.1/msvc2012_64_opengl
 14 | 
 15 | ifeq ($(GCC32), )
 16 | GCC32							= i686-pc-mingw32-c++.exe
 17 | endif
 18 | ifeq ($(GCC64), )
 19 | GCC64							= x86_64-w64-mingw32-c++.exe -B makefiles -w
 20 | endif
 21 | 
 22 | ICCPATH32						= $(ICCPATH)bin/ia32
 23 | ICCPATH64						= $(ICCPATH)bin/intel64
 24 | 
 25 | ICC32							= $(HIDEECHOA) $(CALLVC) "$(ICCPATH)bin/iclvars_ia32.bat" $(HIDEVARS) "$(ICCPATH32)/icl.exe"
 26 | ICC64							= $(HIDEECHOA) $(CALLVC) "$(ICCPATH)bin/iclvars_intel64.bat" $(HIDEVARS) "$(ICCPATH64)/icl.exe"
 27 | MSCC32							= $(HIDEECHOA) $(CALLVC) "$(VSPATH)/vc/bin/vcvars32.bat" $(HIDEVARS) "$(VSPATH)/vc/bin/cl.exe"
 28 | MSCC64							= $(HIDEECHOA) $(CALLVC) "$(VSPATH)/vc/bin/amd64/vcvars64.bat" $(HIDEVARS) "$(VSPATH)/vc/bin/amd64/cl.exe"
 29 | MSCC1032						= $(HIDEECHOA) $(CALLVC) "$(VSPATH10)/vc/bin/vcvars32.bat" $(HIDEVARS) "$(VSPATH10)/vc/bin/cl.exe"
 30 | MSCC1064						= $(HIDEECHOA) $(CALLVC) "$(VSPATH10)/vc/bin/amd64/vcvars64.bat" $(HIDEVARS) "$(VSPATH10)/vc/bin/amd64/cl.exe"
 31 | MASM32							= $(HIDEECHOA) $(CALLVC) "$(VSPATH)/vc/bin/vcvars32.bat" $(HIDEVARS) "$(VSPATH)/vc/bin/ml.exe"
 32 | MASM64							= $(HIDEECHOA) $(CALLVC) "$(VSPATH)/vc/bin/amd64/vcvars64.bat" $(HIDEVARS) "$(VSPATH)/vc/bin/amd64/ml64.exe"
 33 | VCC32							= $(HIDEECHOA) "$(VECTORCPATH)/vectorc86.exe"
 34 | #Linker commands, each started through $(CALLVC) with the matching environment batch file. NOTE(review): MSLINK32GCC uses $(VSPATH8), which is not defined in this file -- confirm it is provided elsewhere
 35 | MSLINK32GCC						= $(HIDEECHOA) $(CALLVC) "$(ICCPATH)bin/iclvars_ia32.bat" $(HIDEVARS) "$(VSPATH8)/VC/bin/link.exe"
 36 | MSLINK32						= $(HIDEECHOA) $(CALLVC) "$(VSPATH)/vc/bin/vcvars32.bat" $(HIDEVARS) "$(VSPATH)/VC/bin/link.exe"
 37 | MSLINK64						= $(HIDEECHOA) $(CALLVC) "$(VSPATH)/vc/bin/amd64/vcvars64.bat" $(HIDEVARS) "$(VSPATH)/VC/bin/amd64/link.exe"
 38 | ICCLINK32						= $(HIDEECHOA) $(CALLVC) "$(ICCPATH)bin/iclvars_ia32.bat" $(HIDEVARS) "$(ICCPATH32)/xilink.exe" -quseenv
 39 | ICCLINK64						= $(HIDEECHOA) $(CALLVC) "$(ICCPATH)bin/iclvars_intel64.bat" $(HIDEVARS) "$(ICCPATH64)/xilink.exe" -quseenv
 40 | 
 41 | #Linker Options
 42 | LINKFLAGSCOMMON					= /fixed:no /nologo /subsystem:console /incremental:no /debug $(MULTITHREADLIBS) /MANIFEST:NO $(HOARD) /pdb:"$(WORKPATH)/$(TARGET).pdb"
 43 | LINKFLAGS32						= $(LINKFLAGSCOMMON) /machine:I386
 44 | LINKFLAGS64						= $(LINKFLAGSCOMMON) /machine:X64
 45 | 
 46 | #Common Compiler Options
 47 | PREHEADER						= /Fp"$@.pch" /Fd"$@.pdb"
 48 | CFLAGSCOMMON					= $(PREHEADER) /nologo /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /W3 $(MULTITHREAD)
 49 | CFLAGS32						= $(CFLAGSCOMMON)
 50 | CFLAGS64						= $(CFLAGSCOMMON) /D "_WIN64" /D "_AMD64_" /D "_X64_" 
 51 | DEBUGFLAGS						= /EHs /Zi /Od /D "DEBUG_RUNTIME"
 52 | 
 53 | GCCFLAGS32						+= -mrtd
 54 | 
 55 | INTELQPROF						= 
 56 | #/Qprof_gen, /Qprof_use
 57 | 
 58 | #Intel Compiler Options
 59 | INTELFLAGSOPT					= /Oa /Ow /Qansi-alias /Ob2 /Ot /Oi /GA /G7 /O3 /Ox /Qvec_report0 /Qopt-prefetch /Q$(INTELARCH) /Gs0 /debug:minimal
 60 | # /Qguide  /Qopt-report:2 /Qvec-report:5
 61 | ifeq ($(CONFIG_LTO), 1)
 62 | INTELFLAGSOPT					+= /Qipo
 63 | INTELLINKIPO					= /Qipo-c /Qipo-fo
 64 | else
 65 | INTELFLAGSOPT					+= /Qip
 66 | endif
 67 | INTELFLAGSDBG					= /Od /Zi
 68 | INTELFLAGSBASE					= /EHsc /D "INTEL_RUNTIME" /Qprof_dir$(WORKPATH) $(MULTITHREAD) $(INTELQPROF)
 69 | INTELFLAGSCOMMON				= $(INTELFLAGSBASE) $(INTELFLAGSUSE)
 70 | INTELFLAGS32					= $(INTELFLAGSCOMMON) /Oy /Gr
 71 | INTELFLAGS64					= $(INTELFLAGSCOMMON)
 72 | # /Zd /Zi /Qvec_report0 
 73 | 
 74 | #VectorC Compiler Options
 75 | VECTORCOPTIMIZED				= /ssecalls /optimize 10 /max /target p4 /autoinline 4096 /vc /Ob2 /Oi /Ot
 76 | VECTORCSTANDARD					= /optimize 0 /novectors /vc /Ob0
 77 | VECTORCFLAGS					= /nologo /noprogress /vserror /cpp /mslibs $(VECTORCSTANDARD) /c /D "VECTORC_RUNTIME" $(MULTITHREAD) /I"$(VSPATH6)/VC98/include" $(VC8INCLUDES)
 78 | 
 79 | #Visual Studio Compiler Options
 80 | VSNETFLAGSOPT					= /EHs /O2 /Ox /Oi /Ot /Oy /GA /Ob2 /Zi /Qfast_transcendentals $(MSOPENMP)
 81 | VSNETFLAGSDBG					= /Od /Zi
 82 | VSNETFLAGSCOMMON				= /D "VSNET_RUNTIME" $(VSNETFLAGSUSE) $(EXTRAFLAGSMSCC) /EHsc
 83 | VSNETFLAGS32					= $(VSNETFLAGSCOMMON)
 84 | VSNETFLAGS64					= $(VSNETFLAGSCOMMON) /favor:$(MSVCFAVOR)
 85 | 
 86 | ifeq ("$(CONFIG_OPENMP)", "1")
 87 | INTELFLAGSCOMMON				+= /Qopenmp
 88 | VSNETFLAGSCOMMON				+= /openmp
 89 | endif
 90 | 
 91 | ifeq ($(GCCARCH), )
 92 | GCCARCHA						= -march=native -msse4.2
 93 | else
 94 | GCCARCHA						= -march=$(GCCARCH) -msse4.2
 95 | endif
 96 | 
 97 | #Compilation Output Control
 98 | ifneq ("$(VERBOSE)", "1")
 99 | HIDEECHOB						= @
100 | ifndef HIDEECHO
101 | HIDEECHOA						= @
102 | else ifneq ($(HIDEECHOA), "-")
103 | HIDEECHOA						= $(HIDEECHO)
104 | endif
105 | endif
106 | ifndef HIDEVARS
107 | HIDEVARS						= 1
108 | endif
109 | 
110 | CALLVC							= $(HIDEECHOA) cmd /C "makefiles\callvc.bat"
111 | 
112 | PATH							:= /bin:/usr/bin:$(WINPATH):$(WINPATH)/system32:$(PATH)
113 | 
114 | ifeq ($(ARCHBITS), 64)
115 | ICC								= $(ICC64) $(INTELFLAGS64) $(CFLAGS64)
116 | CCDBG							= $(ICC64) $(INTELFLAGSBASE) $(INTELFLAGSDBG) $(CFLAGS64) $(DEBUGFLAGS)
117 | ICCLINK							= $(ICCLINK64) $(LINKFLAGS64)
118 | MSCC							= $(MSCC64) $(VSNETFLAGS64) $(CFLAGS64)
119 | MSLINK							= $(MSLINK64) $(LINKFLAGS64)
120 | GCC								= $(GCC64) $(GCCFLAGS64) $(GCCFLAGSCOMMON) $(GCCFLAGSUSE)
121 | MASM							= $(MASM64)
122 | CCCUDA							= $(MSCC64) /TP $(VSNETFLAGS64) $(CFLAGS64)
123 | LIBPATHAMD						= /LIBPATH:"$(AMDPATH)lib" /LIBPATH:"$(AMDPATH)lib/x86_64"
124 | LIBPATHCUDA						= /LIBPATH:"$(CUDAPATH)lib/x64" /LIBPATH:"$(CUDASDKPATH)common/lib/x64"
125 | LIBPATHDIRECTX					= /LIBPATH:"$(DIRECTXPATH)lib/x64"
126 | LIBPATHSUSE						= /LIBPATH:"$(ICCPATH)compiler/lib/intel64"
127 | LINKFLAGSARCH					= /machine:X64
128 | else
129 | ICC								= $(ICC32) $(INTELFLAGS32) $(CFLAGS32)
130 | CCDBG							= $(MSCC32) $(CFLAGS32) $(DEBUGFLAGS)
131 | ICCLINK							= $(ICCLINK32) $(LINKFLAGS32)
132 | MSCC							= $(MSCC32) $(VSNETFLAGS32) $(CFLAGS32) /Gr
133 | MSLINK							= $(MSLINK32) $(LINKFLAGS32)
134 | MSLINKGCC						= $(MSLINK32GCC) $(LINKFLAGS32)
135 | VCC								= $(VCC32) /outfile $@ $(VECTORCFLAGS) $(CFLAGS32)
136 | GCC								= $(GCC32) $(GCCFLAGS32) $(GCCFLAGSCOMMON) $(GCCFLAGSUSE)
137 | MASM							= $(MASM32)
138 | CCCUDA							= $(MSCC1032) $(VSNETFLAGS32) $(CFLAGS32) /TP /Gd
139 | LIBPATHAMD						= /LIBPATH:"$(AMDPATH)lib" /LIBPATH:"$(AMDPATH)lib/x86"
140 | LIBPATHCUDA						= /LIBPATH:"$(CUDAPATH)lib/win32" /LIBPATH:"$(CUDASDKPATH)common/lib/Win32"
141 | LIBPATHDIRECTX					= /LIBPATH:"$(DIRECTXPATH)lib/x86"
142 | LIBPATHSUSE						= /LIBPATH:"$(ICCPATH)compiler/lib/ia32"
143 | LINKFLAGSARCH					= /machine:I386
144 | endif
145 | QTUIC							= $(QTPATH)/bin/uic.exe
146 | QTMOC							= $(QTPATH)/bin/moc.exe
147 | 
148 | LIBPATHSUSE						+= $(LIBPATHS:%=/LIBPATH:%)
149 | 
150 | ifeq ($(CC_i686-pc-cygwin), ICC)
151 | CC								= $(ICC)
152 | ifeq ($(CPPFILES_GCC), )
153 | LINK							= $(ICCLINK)
154 | else
155 | LINK							= $(MSLINK)
156 | endif
157 | else ifeq ($(CC_i686-pc-cygwin), GCC)
158 | CC								= $(GCC)
159 | LINK							= $(GCC)
160 | else
161 | CC								= $(MSCC)
162 | LINK							= $(MSLINK)
163 | endif
164 | GCC3264							= $(GCC)
165 | CC_SELECTED						= $(CC_i686-pc-cygwin)
166 | 
167 | ASM								= $(MASM)
168 | ASMPRE							= $(MSCC32)
169 | NVCC							= $(HIDEECHOA) $(CALLVC) "$(VSPATH10)/vc/bin/vcvars32.bat" $(HIDEVARS) "$(CUDAPATH)bin/nvcc"
170 | 
171 | MULTITHREADGCC					= -mthreads -D_MT
172 | 
173 | LIBSUSE							= kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib
174 | 
175 | ifneq ($(CPPFILES_GCC), )
176 | GCCLIBPATH						:= $(shell cygpath -m `$(GCC) -print-libgcc-file-name | sed -e s/libgcc.a//` `$(GCC) -print-sysroot`/mingw/lib)
177 | LIBSUSE							+= $(GCCLIBPATH:%=/LIBPATH:"%") libgcc.a libstdc++.a libmingw32.a libgcc_eh.a
178 | #libmingwex.a
179 | #libmsvcrt.a
180 | #libgcov.a libmingwex.a
181 | endif
182 | 
183 | OPENCLLIB						= OpenCL.lib
184 | ifeq ("$(CONFIG_OPENCL)", "1")
185 | LIBSUSE							+= $(OPENCLLIB)
186 | endif
187 | 
188 | ifeq ("$(CONFIG_CAL)", "1")
189 | ifeq ($(ARCHBITS), 64)
190 | LIBSUSE							+= aticalcl64.lib aticalrt64.lib
191 | else
192 | LIBSUSE							+= aticalcl.lib aticalrt.lib
193 | endif
194 | endif
195 | 
196 | ifeq ("$(CONFIG_DIRECTX)", "1")
197 | LIBSUSE							+= ddraw.lib dxguid.lib dxerr.lib
198 | COMMONINCLUDEPATHS				+= "$(DIRECTXPATH)include"
199 | LIBPATHSUSE						+= $(LIBPATHDIRECTX)
200 | endif
201 | 
202 | ifeq ("$(CONFIG_VIDEO_EDIT)", "1")
203 | LIBSUSE							+= amstrmid.lib msacm32.lib vfw32.lib winmm.lib
204 | endif
205 | 
206 | ifeq ("$(CONFIG_OPENGL)", "1")
207 | LIBSUSE							+= opengl32.lib glu32.lib
208 | endif
209 | 
210 | ifeq ("$(CONFIG_QT)", "1")
211 | LIBSUSE							+= Qt5Gui.lib Qt5Core.lib Qt5Widgets.lib
212 | COMMONINCLUDEPATHS				+= $(QTPATH)/include $(QTPATH)/include/QtGui $(QTPATH)/include/QtCore $(QTPATH)/include/QtWidgets $(WORKPATH)/qt
213 | LIBPATHSUSE						+= /LIBPATH:$(QTPATH)/lib
214 | endif
215 | 
216 | LIBSUSE							+= $(LIBS:%=%.lib)
217 | 
218 | ifeq ($(TARGETTYPE), LIB)
219 | LINKTARGETTYPE					= /DLL
220 | EXECUTABLE						= $(TARGET).dll
221 | else
222 | LINKTARGETTYPE					=
223 | EXECUTABLE						= $(TARGET).exe
224 | endif
225 | 
226 | ifeq ("$(CONFIG_OPENCL)", "1")
227 | ifeq ("$(CONFIG_OPENCL_VERSION)", "AMD")
228 | COMMONINCLUDEPATHS				+= "$(AMDPATH)include"
229 | LIBPATHSUSE						+= $(LIBPATHAMD)
230 | endif
231 | ifeq ("$(CONFIG_OPENCL_VERSION)", "NVIDIA")
232 | COMMONINCLUDEPATHS				+= "$(CUDAPATH)include"
233 | LIBPATHSUSE						+= $(LIBPATHCUDA)
234 | endif
235 | ifeq ("$(CONFIG_OPENCL_VERSION)", "Intel")
236 | #COMMONINCLUDEPATHS				+= ""
237 | endif
238 | ifeq ("$(CONFIG_OPENCL_VERSION)", "All")
239 | COMMONINCLUDEPATHS				+= "$(AMDPATH)include"
240 | COMMONINCLUDEPATHS				+= "$(CUDAPATH)include"
241 | #COMMONINCLUDEPATHS				+= ""
242 | LIBPATHSUSE						+= $(LIBPATHAMD)
243 | endif
244 | endif
245 | 
246 | ifeq ("$(CONFIG_CUDA)", "1")
247 | COMMONINCLUDEPATHS				+= "$(CUDAPATH)include" "$(CUDASDKPATH)common/inc"
248 | LIBPATHSUSE						+= $(LIBPATHCUDA)
249 | ifneq ($(CUFILES), )
250 | LIBSUSE							+= cudart.lib cuda.lib
251 | ifeq ($(CONFIG_CUDA_DC), 1)
252 | LIBSUSE							+= cudadevrt.lib
253 | endif
254 | ifeq ("$(CONFIG_OPENGL)", "1")
255 | ifeq ($(ARCHBITS), 64)
256 | LIBSUSE							+= freeglut.lib glew64.lib
257 | else
258 | LIBSUSE							+= freeglut.lib glew32.lib
259 | endif
260 | endif
261 | endif
262 | endif
263 | 
264 | ifeq ("$(CONFIG_CAL)", "1")
265 | COMMONINCLUDEPATHS				+= "$(AMDPATH)/include/CAL"
266 | LIBPATHSUSE						+= $(LIBPATHAMD)
267 | endif
268 | 
269 | ifeq ($(CC_i686-pc-cygwin), GCC)
270 | COMPILEOUTPUT					= -o $@
271 | LINKOUTPUT						= -o $@
272 | COMPILEONLY						= -c
273 | ASMONLY							= -c
274 | PRECOMPILEONLY					= -x c++ -E
275 | INCLUDEPATHSUSE					= $(GCCINCLUDEPATHS)
276 | DEFINESUSE						= $(GCCDEFINES)
277 | else
278 | INCLUDEPATHSUSE					= $(VSINCLUDEPATHS)
279 | DEFINESUSE						= $(VSDEFINES)
280 | COMPILEOUTPUTBASE				= /Fo
281 | COMPILEOUTPUT					= $(COMPILEOUTPUTBASE)"$@"
282 | LINKOUTPUT						= /Out:"$@"
283 | COMPILEONLY						= /c
284 | ASMONLY							= /c
285 | PRECOMPILEONLY					= /EP 
286 | endif
287 | OBJ								= obj
288 | 
289 | DEFINESARCH						= "WIN32"
290 | #NVCCARCHS: backquoted shell loop emitting one -gencode option per entry of CUDAVERSION; BAT_SPECIAL_EQ / BAT_SPECIAL_KOMMA stand in for '=' and ',' (presumably substituted back by makefiles/callvc.bat - TODO confirm). NVCC_GREP: grep pattern matching lines that start with #line and empty lines.
291 | NVCCARCHS						:= `for i in $(CUDAVERSION); do echo -n -gencode arch BAT_SPECIAL_EQ compute_$$i BAT_SPECIAL_KOMMA code BAT_SPECIAL_EQ sm_$$i\ ;done`
292 | NVCC_GREP						= "^#line\|^$$"
293 | 


--------------------------------------------------------------------------------
/makefiles/i686-pc-linux-gnu.mak:
--------------------------------------------------------------------------------
1 | CC_i686-pc-linux-gnu				= $(CC_x86_64-pc-linux-gnu)
2 | ALLDEP								+= makefiles/x86_64-pc-linux-gnu.mak
3 | include makefiles/x86_64-pc-linux-gnu.mak
4 | #The i686 Linux setup reuses the x86_64 Linux one: the compiler selection is forwarded, the x86_64 file is included, and that file is added to ALLDEP (the common prerequisite list of the build rules).


--------------------------------------------------------------------------------
/makefiles/include.S:
--------------------------------------------------------------------------------
 1 | 	.global FILENAMEMOD	# exported symbol: start of the embedded file contents
 2 | 	.global FILENAMEMOD_size	# exported symbol: length of the embedded contents
 3 | 	.section .data
 4 | FILENAMEMOD:
 5 | 	.incbin "FILENAMENORMAL"	# raw bytes of the file; the build rules rewrite both placeholder names with sed before assembling
 6 | 1:	# local label marking the end of the embedded bytes
 7 | 	.byte 0	# zero byte appended after the data, not counted in the size below
 8 | FILENAMEMOD_size:
 9 | 	.int 1b - FILENAMEMOD	# size = end label minus start label
10 | 


--------------------------------------------------------------------------------
/makefiles/makefile:
--------------------------------------------------------------------------------
  1 | all								: all_tmp
  2 | #Architecture detection: ARCH is taken from the MACHTYPE variable of the shell; if makefiles/<ARCH>.mak does not exist a warning is printed and x86_64-pc-linux-gnu is used instead. ARCH_CYGWIN is set to 1 for the two Cygwin machine types.
  3 | ARCH							:= $(shell sort <<< $$MACHTYPE)
  4 | ARCHCHK							:= $(shell if [ -a makefiles/$(ARCH).mak ]; then echo -n 1; else echo -n 0; fi)
  5 | ifeq ($(ARCHCHK), 1)
  6 | else
  7 | $(warning Unknown Architecture: $(ARCH) $(ARCHCHK), defaulting to x86_64-pc-linux-gnu)
  8 | ARCH							:= x86_64-pc-linux-gnu
  9 | endif
 10 | ifeq ($(ARCH), i686-pc-cygwin)
 11 | ARCH_CYGWIN						:= 1
 12 | endif
 13 | ifeq ($(ARCH), x86_64-unknown-cygwin)
 14 | ARCH_CYGWIN						:= 1
 15 | endif
 16 | 
 17 | ARCHFILE						= $(ARCH).mak
 18 | 
 19 | ifeq ($(CONFIGFILE), )
 20 | CONFIGFILE						= config.mak
 21 | CLEANRELEASEDIR					= release
 22 | endif
 23 | 
 24 | ifeq ($(BUILDSCRIPT), )
 25 | BUILDSCRIPT						= build.sh
 26 | endif
 27 | 
 28 | #GCC Compiler Options
 29 | GCCFLAGSOPT						= -O3 $(GCCFLAGSARCH) -fweb -frename-registers -minline-all-stringops -mfpmath=sse -ftracer -funroll-loops -fpeel-loops -fprefetch-loop-arrays -ffast-math -fno-stack-protector
 30 | CLANGFLAGSOPT					= -O3 $(GCCFLAGSARCH) -minline-all-stringops -mfpmath=sse -funroll-loops -ffast-math -fno-stack-protector $(EXTRAFLAGSCLANG)
 31 | CLANGFLAGSDBG					= $(GCCFLAGSDBG) $(EXTRAFLAGSCLANG)
 32 | #-fgcse-sm -fgcse-las -fmodulo-sched -fipa-pta -floop-interchange -floop-block 
 33 | GCCFLAGSDBG						= -O0 $(GCCFLAGSARCH)
 34 | GCCFLAGSCOMMON					= $(MULTITHREADGCC) -pipe -DGCC_RUNTIME $(GCCPROF) $(EXTRAFLAGSGCC) -Wall -Wno-write-strings
 35 | CLANGFLAGSCOMMON				= $(MULTITHREADGCC) -pipe -DGCC_RUNTIME -DCLANG_RUNTIME $(GCCPROF) $(EXTRAFLAGSGCC) -Wall -Wno-write-strings
 36 | GCCFLAGS32						= -m32
 37 | GCCFLAGS64						= -m64 -D"_AMD64_" -D"_X64_"
 38 | 
 39 | GCCPROF							= 
 40 | #Possible GCCPROF settings, e.g.: -fprofile-arcs, -fbranch-probabilities
 41 | 
 42 | #Multithread Options
 43 | MULTITHREAD						= /MT
 44 | MULTITHREADLIBS					= /nodefaultlib:libc.lib
 45 | 
 46 | NVCCFLAGSOPT					= --use_fast_math --maxrregcount $(CUDAREGS) -O4 -Xptxas -v -Xptxas -O4 -Xcompiler -O4 -m$(ARCHBITS) $(NVCCARCHS)
 47 | NVCCFLAGSDBG					= --maxrregcount $(CUDAREGS) -Xptxas -v -Xptxas -O0 -O0 -m$(ARCHBITS) $(NVCCARCHS)
 48 | 
 49 | TARGETTYPE						= EXECUTABLE
 50 | 
 51 | WORKPATHSUFFIX					= $(TARGETTYPE)_$(TARGET)
 52 | include							$(CONFIGFILE)
 53 | WORKPATH						= release/$(ARCH)_$(ARCHBITS)$(WORKPATHSUFFIX)
 54 | ifeq ($(CONFIG_OPENCL_VERSION), )
 55 | CONFIG_OPENCL_VERSION			= All
 56 | endif
 57 | ifeq ($(COMPILER_FLAGS), DBG)
 58 | INTELFLAGSUSE					= $(INTELFLAGSDBG)
 59 | VSNETFLAGSUSE					= $(VSNETFLAGSDBG)
 60 | GCCFLAGSUSE						= $(GCCFLAGSDBG)
 61 | CLANGFLAGSUSE					= $(CLANGFLAGSDBG)
 62 | NVCCFLAGSUSE					= $(NVCCFLAGSDBG)
 63 | OPENCL_DEF_OPTIONS				= "-O0 -g"
 64 | CONFIG_LTO						= 0
 65 | endif
 66 | ifeq ($(COMPILER_FLAGS), OPT)
 67 | INTELFLAGSUSE					= $(INTELFLAGSOPT)
 68 | VSNETFLAGSUSE					= $(VSNETFLAGSOPT)
 69 | GCCFLAGSUSE						= $(GCCFLAGSOPT)
 70 | CLANGFLAGSUSE					= $(CLANGFLAGSOPT)
 71 | NVCCFLAGSUSE					= $(NVCCFLAGSOPT)
 72 | OPENCL_DEF_OPTIONS				= "-O3"
 73 | endif
 74 | ifeq ($(GCCCUDA), )
 75 | GCCCUDA						= $(GCC3264)
 76 | endif
 77 | #MKDIR creates the directory part of the current target before a recipe writes into it (recipe use only, it relies on the automatic variable $(@D)). Using $(@D) instead of a sed expression keeps this correct for file names with characters outside [a-zA-Z0-9._-].
 78 | MKDIR							= $(HIDEECHOB) mkdir -p "$(@D)"
 79 | 
 80 | ALLDEP							+= makefiles/makefile $(CONFIGFILE) makefiles/$(ARCHFILE) config.mak
 81 | include							makefiles/$(ARCHFILE)
 82 | GCCFLAGSARCH					+= $(GCCARCHA)
 83 | ifeq ($(CONFIG_CPP11), 1)
 84 | GCCFLAGSCOMMON						+= -std=c++11
 85 | CLANGFLAGSCOMMON					+= -std=c++11
 86 | endif
 87 | ifneq ($(CONFIG_CPP), )
 88 | GCCFLAGSCOMMON						+= -std=$(CONFIG_CPP)
 89 | CLANGFLAGSCOMMON					+= -std=$(CONFIG_CPP)
 90 | endif
 91 | ifeq ($(CONFIG_OPENMP), 1)
 92 | GCCFLAGSCOMMON						+= -fopenmp
 93 | CLANGFLAGSCOMMON					+= -fopenmp
 94 | GCCLINK							+= -fopenmp
 95 | endif
 96 | 
 97 | ifeq ($(TARGETPATH), )
 98 | COPIED_EXECUTABLE				= $(EXECUTABLE)
 99 | TARGETPATH					= .
100 | else
101 | COPIED_EXECUTABLE				= $(TARGETPATH)$(EXECUTABLE)
102 | $(COPIED_EXECUTABLE)				: $(EXECUTABLE)
103 | 						cp $(EXECUTABLE) $(COPIED_EXECUTABLE)
104 | endif
105 | 
106 | all_tmp:						$(SUBTARGETS:%=subbuild/%.mak) $(COPIED_EXECUTABLE)
107 | 
108 | main:							$(COPIED_EXECUTABLE)
109 | 
110 | run								: all
111 | 								cd $(TARGETPATH) && ./$(EXECUTABLE)
112 | 
113 | #Sub-builds: every entry NAME of SUBTARGETS is built by a recursive make with CONFIGFILE=config_NAME.mak and BUILDSCRIPT=config_NAME.sh. Both names are derived from the pattern stem, so a NAME that itself contains "mak" is not mangled.
114 | subbuild/%.mak:
115 | 								+$(MAKE) CONFIGFILE=config_$*.mak BUILDSCRIPT=config_$*.sh -f makefile
116 | 
117 | CUDAINCLUDEPATHS				= $(INCLUDEPATHSUSE:%=--compiler-options %)
118 | CUDADEFINES						= $(DEFINESUSE:%=--compiler-options %)
119 | 
120 | DEPENDS							:= $(CUFILES:%.cu=$(WORKPATH)/cu/%.d) $(CLFILES:%.cl=$(WORKPATH)/cl/%.d) $(ASMFILES:%.asm=$(WORKPATH)/asm/%.d) $(CPPFILES_DBG:%.cpp=$(WORKPATH)/dbg/%.d) $(CPPFILES_VCC:%.cpp=$(WORKPATH)/vcc/%.d) \
121 | 									$(CPPFILES:%.cpp=$(WORKPATH)/cpp/%.d) $(CXXFILES:%.cxx=$(WORKPATH)/cxx/%.d) $(CFILES:%.c=$(WORKPATH)/c/%.d) \
122 | 									$(CPPFILES_MSCC:%.cpp=$(WORKPATH)/mscc/%.d) $(CPPFILES_CLANG:%.cpp=$(WORKPATH)/clang/%.d) $(CPPFILES_ICC:%.cpp=$(WORKPATH)/icc/%.d) $(CPPFILES_GCC:%.cpp=$(WORKPATH)/gcc/%.d) $(QTFILES:%.ui=$(WORKPATH)/qt/%.qtd)
123 | CPPFILES						+= $(QTFILES:%.ui=$(WORKPATH)/cpp/%.$(OBJ)) $(QTCPPFILES)
124 | 
125 | OBJFILES						:= $(CUFILES:%.cu=$(WORKPATH)/cu/%.$(OBJ)) $(ASMFILES:%.asm=$(WORKPATH)/asm/%.$(OBJ)) $(CPPFILES_DBG:%.cpp=$(WORKPATH)/dbg/%.$(OBJ)) $(CPPFILES_VCC:%.cpp=$(WORKPATH)/vcc/%.$(OBJ)) $(RESOURCEFILES:%=$(WORKPATH)/res/%.$(OBJ)) $(CLFILES:%.cl=$(WORKPATH)/cl/%.$(OBJ)) $(QTFILES:%.ui=$(WORKPATH)/qt/%_moc.$(OBJ)) $(QTCPPFILES:%.cpp=$(WORKPATH)/qt/%_moccpp.$(OBJ))
126 | 
127 | CPPFILES_ICC					:= $(CPPFILES_ICC:%.cpp=$(WORKPATH)/icc/%.$(OBJ))
128 | CPPFILES_GCC					:= $(CPPFILES_GCC:%.cpp=$(WORKPATH)/gcc/%.$(OBJ))
129 | CPPFILES_MSCC					:= $(CPPFILES_MSCC:%.cpp=$(WORKPATH)/mscc/%.$(OBJ))
130 | CPPFILES_CLANG					:= $(CPPFILES_CLANG:%.cpp=$(WORKPATH)/clang/%.$(OBJ))
131 | ifeq ($(CC_SELECTED), ICC)
132 | CPPFILES_ICC					+= $(CPPFILES:%.cpp=$(WORKPATH)/cpp/%.$(OBJ)) $(CXXFILES:%.cxx=$(WORKPATH)/cxx/%.$(OBJ)) $(CFILES:%.c=$(WORKPATH)/c/%.$(OBJ)) 
133 | CCWITHLINK						= /link
134 | endif
135 | ifeq ($(CC_SELECTED), MSCC)
136 | CPPFILES_MSCC					+= $(CPPFILES:%.cpp=$(WORKPATH)/cpp/%.$(OBJ)) $(CXXFILES:%.cxx=$(WORKPATH)/cxx/%.$(OBJ)) $(CFILES:%.c=$(WORKPATH)/c/%.$(OBJ)) 
137 | CCWITHLINK						= /link
138 | endif
139 | ifeq ($(CC_SELECTED), GCC)
140 | CPPFILES_GCC					+= $(CPPFILES:%.cpp=$(WORKPATH)/cpp/%.$(OBJ)) $(CXXFILES:%.cxx=$(WORKPATH)/cxx/%.$(OBJ)) $(CFILES:%.c=$(WORKPATH)/c/%.$(OBJ)) 
141 | endif
142 | ifeq ($(CC_SELECTED), clang)
143 | CPPFILES_CLANG					+= $(CPPFILES:%.cpp=$(WORKPATH)/cpp/%.$(OBJ)) $(CXXFILES:%.cxx=$(WORKPATH)/cxx/%.$(OBJ)) $(CFILES:%.c=$(WORKPATH)/c/%.$(OBJ)) 
144 | endif
145 | 
146 | ifeq ($(CONFIG_LTO), 1)
147 | ifneq ("0$(CPPFILES_ICC)", "0")
148 | OBJFILES						+= $(WORKPATH)/make_lto_icc/icclto_$(TARGET).$(OBJ)
149 | endif
150 | OBJFILES						+= $(CPPFILES_MSCC) $(CPPFILES_GCC) $(CPPFILES_CLANG)
151 | else
152 | OBJFILES						+= $(CPPFILES_ICC) $(CPPFILES_MSCC) $(CPPFILES_GCC) $(CPPFILES_CLANG)
153 | endif
154 | 
155 | ifneq ($(CONFIG_GDB), 0)
156 | GCCFLAGSOPT					+= -ggdb
157 | GCCFLAGSDBG					+= -ggdb
158 | CLANGFLAGSOPT					+= -ggdb
159 | endif
160 | 
161 | ifeq ($(CONFIG_CUDA_DC), 1)
162 | OBJFILES						+= $(WORKPATH)/make_cuda_device_link/dl_$(TARGET).$(OBJ)
163 | NVCCFLAGSDC						= -dc
164 | endif
165 | 
166 | ifeq ($(ARCH_CYGWIN), 1)
167 | ifeq ($(INCLUDEPATHS)$(COMMONINCLUDEPATHS), )
168 | GCCINCLUDEPATHS					=
169 | else
170 | GCCINCLUDEPATHSA				:= $(INCLUDEPATHS) $(COMMONINCLUDEPATHS)
171 | GCCINCLUDEPATHSB				:= $(shell cygpath -u $(GCCINCLUDEPATHSA))
172 | GCCINCLUDEPATHS					:= $(GCCINCLUDEPATHSB:%=-I%)
173 | endif
174 | else
175 | GCCINCLUDEPATHS					:= $(INCLUDEPATHS:%=-I%) $(COMMONINCLUDEPATHS:%=-I%) $(INCLUDEPATHSSYSTEM:%=-isystem %)
176 | endif
177 | VSINCLUDEPATHS					:= $(INCLUDEPATHS:%=/I%) $(COMMONINCLUDEPATHS:%=/I%)
178 | 
179 | ifeq ($(ARCHBITS), 64)
180 | DEFINES							+= _64BIT
181 | endif
182 | GCCDEFINES						:= $(DEFINES:%=-D%) $(DEFINESARCH:%=-D%)
183 | VSDEFINESTMP					:= $(DEFINES:%=/D%) $(DEFINESARCH:%=/D%)
184 | VSDEFINES						:= $(subst =, BAT_SPECIAL_EQ ,$(VSDEFINESTMP))
185 | 
186 | LIBFILES						= $(LIBSUSE)
187 | 
188 | .SECONDARY:						$(CUFILES:%.cu=$(WORKPATH)/cu/%.cpp) $(ASMFILES:%.asm=$(WORKPATH)/asm/%.cpp) $(RESOURCEFILES:%=$(WORKPATH)/res/%.$(OBJ)) $(CLFILES:%.cl=$(WORKPATH)/cl/%.$(OBJ)) $(QTFILES:%.ui=$(WORKPATH)/qt/%_moc.$(OBJ)) $(QTFILES:%.ui=$(WORKPATH)/qt/%_moc.cpp) $(QTFILES:%.ui=$(WORKPATH)/qt/%_ui.h) $(CLFILES:%.cl=$(WORKPATH)/cl/%.clbin)
189 | #Final link step: all object files, extra objects and libraries are passed to $(LINK); afterwards $(BUILDSCRIPT) is executed if such a file exists.
190 | $(EXECUTABLE):					$(EXTRADEPS) $(OBJFILES) $(EXTRAOBJFILES) $(ALLDEP)
191 | 								$(LINK) $(EXTRAFLAGSLINK) $(LIBPATHSUSE) $(OBJFILES) $(EXTRAOBJFILES) $(LIBFILES) $(LINKFLAGSUSE) $(LINKOUTPUT) $(LINKTARGETTYPE)
192 | 								$(HIDEECHOA) if [ -e "$(BUILDSCRIPT)" ]; then ./$(BUILDSCRIPT); fi
193 | #Object rules: each source is compiled into $(WORKPATH)/<kind>/ with the compiler belonging to that kind ($(CC) for cpp, cxx and c). A variable named FILEFLAGS<source file> can supply extra flags for a single file.
194 | $(WORKPATH)/cpp/%.$(OBJ):		%.cpp $(ALLDEP)
195 | 								$(MKDIR)
196 | 								$(CC) $(INCLUDEPATHSUSE) $(DEFINESUSE) $(FILEFLAGS$<) $(COMPILEONLY) $< $(COMPILEOUTPUT)
197 | 							
198 | $(WORKPATH)/cxx/%.$(OBJ):		%.cxx $(ALLDEP)
199 | 								$(CC) $(INCLUDEPATHSUSE) $(DEFINESUSE) $(FILEFLAGS$<) $(COMPILEONLY) $< $(COMPILEOUTPUT)
200 | 							
201 | $(WORKPATH)/c/%.$(OBJ):			%.c $(ALLDEP)
202 | 								$(CC) $(INCLUDEPATHSUSE) $(DEFINESUSE) $(FILEFLAGS$<) $(COMPILEONLY) $< $(COMPILEOUTPUT)
203 | 
204 | $(WORKPATH)/cu/%.$(OBJ):		$(WORKPATH)/cu/%.cpp $(ALLDEP)
205 | 								$(CCCUDA) $(INCLUDEPATHSUSE) $(DEFINESUSE) $(FILEFLAGS$<) $(COMPILEONLY) $< $(COMPILEOUTPUT)
206 | #CUDA sources: nvcc --cuda turns X.cu into host C++ in $(WORKPATH)/cu/X.cpp. The output is then filtered with grep -v using the NVCC_GREP pattern from the architecture file, the _MSC_VER detect_mismatch pragma is removed, and an optional X.cu.$(ARCH).patch is applied (failures of the patch step are ignored).
207 | $(WORKPATH)/cu/%.cpp:			%.cu $(ALLDEP)
208 | 								$(NVCC) $(NVCCFLAGSUSE) $(NVCCFLAGSDC) $(CUDAINCLUDEPATHS) $(CUDADEFINES) $(FILEFLAGS$<) --cuda --output-file "$@" $<
209 | 								$(HIDEECHOA) cat $@ | grep -v $(NVCC_GREP) | sed "s/#pragma detect_mismatch(\"_MSC_VER\", \"1600\")//g" > $@.tmp
210 | 								$(HIDEECHOA) mv -f $@.tmp $@
211 | 								-if [ -e "$<.$(ARCH).patch" ]; then patch -r /dev/null -s --no-backup-if-mismatch -i $<.$(ARCH).patch $@; fi
212 | 
213 | $(WORKPATH)/asm/%.$(OBJ):		$(WORKPATH)/asm/%.asm $(ALLDEP)
214 | 								$(ASM) $(COMPILEOUTPUT) $(ASMONLY) $<
215 | 							
216 | $(WORKPATH)/asm/%.asm:			%.asm $(ALLDEP)
217 | 								$(CC) $(PRECOMPILEONLY) $(FILEFLAGS$<) $(DEFINESUSE) $< > $@
218 | 
219 | $(WORKPATH)/dbg/%.$(OBJ):		%.cpp $(ALLDEP)
220 | 								$(CCDBG) $(INCLUDEPATHSUSE) $(DEFINESUSE) $(FILEFLAGS$<) $(COMPILEONLY) $< $(COMPILEOUTPUT)
221 | $(WORKPATH)/icc/%.$(OBJ):		%.cpp $(ALLDEP)
222 | 								$(ICC) $(INCLUDEPATHSUSE) $(DEFINESUSE) $(FILEFLAGS$<) $(COMPILEONLY) $< $(COMPILEOUTPUT)
223 | $(WORKPATH)/vcc/%.$(OBJ):		%.cpp $(ALLDEP)
224 | 								$(VCC) $(INCLUDEPATHSUSE) $(DEFINESUSE) $(FILEFLAGS$<) $(COMPILEONLY) $<
225 | $(WORKPATH)/gcc/%.$(OBJ):		%.cpp $(ALLDEP)
226 | 								$(GCC) $(GCCINCLUDEPATHS) $(GCCDEFINES) $(FILEFLAGS$<) -c $< -o $@
227 | $(WORKPATH)/mscc/%.$(OBJ):		%.cpp $(ALLDEP)
228 | 								$(MSCC) $(INCLUDEPATHSUSE) $(DEFINESUSE) $(FILEFLAGS$<) $(COMPILEONLY) $< $(COMPILEOUTPUT)
229 | $(WORKPATH)/clang/%.$(OBJ):		%.cpp $(ALLDEP)
230 | 								$(CLANG) $(GCCINCLUDEPATHS) $(GCCDEFINES) $(FILEFLAGS$<) -c $< -o $@
231 | 
232 | #Qt support: $(QTUIC) generates X_ui.h from X.ui, $(QTMOC) generates X_moc.cpp / X_moccpp.cpp from X.h, and the generated sources in $(WORKPATH)/qt are compiled with $(CC).
233 | $(WORKPATH)/qt/%_ui.h:			%.ui $(ALLDEP)
234 | 								$(MKDIR)
235 | 								$(QTUIC) -o $@ $<
236 | 
237 | $(WORKPATH)/qt/%_moc.cpp:		%.h $(WORKPATH)/qt/%_ui.h $(ALLDEP)
238 | 								$(QTMOC) -DQT_NO_DEBUG -DQT_GUI_LIB -DQT_CORE_LIB -DQT_SHARED -o $@ $<
239 | 
240 | $(WORKPATH)/qt/%_moccpp.cpp:		%.h $(ALLDEP)
241 | 								$(QTMOC) -DQT_NO_DEBUG -DQT_GUI_LIB -DQT_CORE_LIB -DQT_SHARED -o $@ $<
242 | 								
243 | $(WORKPATH)/qt/%.$(OBJ):	$(WORKPATH)/qt/%.cpp $(ALLDEP)
244 | 								$(CC) $(INCLUDEPATHSUSE) $(DEFINESUSE) $(FILEFLAGS$<) $(COMPILEONLY) $< $(COMPILEOUTPUT)
245 | 
246 | $(WORKPATH)/_makefiles_support_opencl.$(OBJ):	makefiles/makefile_opencl_compiler.cpp $(ALLDEP)
247 | 								$(HIDEECHOB) mkdir -p $(WORKPATH)
248 | 								$(CC) $(INCLUDEPATHSUSE) $(COMPILEONLY) $< $(COMPILEOUTPUT)
249 | $(WORKPATH)/_makefiles_support_opencl.exe:	$(WORKPATH)/_makefiles_support_opencl.$(OBJ) $(ALLDEP)
250 | 								$(LINK) $(LIBPATHSUSE) $< $(LINKFLAGSUSE) $(LINKOUTPUT) $(LINKFLAGSARCH) $(OPENCLLIB)
251 | #OpenCL kernels: X.cl is compiled ahead of time into $(WORKPATH)/cl/X.clbin by the helper program built from makefiles/makefile_opencl_compiler.cpp; everything after "--" is handed to the helper as OpenCL compiler options, and its standard output is discarded.
252 | $(WORKPATH)/cl/%.clbin:			%.cl $(WORKPATH)/_makefiles_support_opencl.exe $(ALLDEP)
253 | 								$(HIDEECHOA) $(OPENCL_ENVIRONMENT) ./$(WORKPATH)/_makefiles_support_opencl.exe -output-file $@ $< -- -I. $(GCCINCLUDEPATHS) $(GCCDEFINES) $(OPENCL_DEF_OPTIONS) $(OPENCL_OPTIONS) > /dev/null
254 | #Binary embedding: makefiles/include.S is instantiated with sed (FILENAMENORMAL becomes the file path, FILENAMEMOD the symbol name) and assembled by $(GCC3264). The symbol is _makefile_opencl_program_<name> for OpenCL binaries (path below $(WORKPATH)/cl/, with .clbin turned back into .cl) and _resource_<name> for resource files; in <name> every '.', '/' and ' ' is replaced by '_'.
255 | $(WORKPATH)/cl/%.$(OBJ):		$(WORKPATH)/cl/%.clbin $(ALLDEP)
256 | 								@echo $<
257 | 								$(HIDEECHOA) sed -e "s&FILENAMENORMAL&$<&g" -e "s&FILENAMEMOD&_makefile_opencl_program_`echo $< | sed -e "s&$(WORKPATH)/cl/&&g" -e "s&\.clbin&\.cl&g" -e "s&\.&_&g" -e "s&/&_&g" -e "s& &_&g"`&g" makefiles/include.S | $(GCC3264) -c -x assembler -o $@ -
258 | $(WORKPATH)/res/%.$(OBJ):		% $(ALLDEP)
259 | 								@echo $<
260 | 								$(MKDIR)
261 | 								$(HIDEECHOA) sed -e "s&FILENAMENORMAL&$<&g" -e "s&FILENAMEMOD&_resource_`echo $< | sed -e "s&\.&_&g" -e "s&/&_&g" -e "s& &_&g"`&g" makefiles/include.S | $(GCC3264) -c -x assembler -o $@ -
262 | 
263 | 
264 | #Intel link-time optimization: when CONFIG_LTO is 1 the objects in CPPFILES_ICC are not linked directly; $(ICCLINK) with $(INTELLINKIPO) first combines them into the single object icclto_$(TARGET), which is what OBJFILES then contains.
265 | $(WORKPATH)/make_lto_icc/icclto_$(TARGET).$(OBJ):	$(CPPFILES_ICC) $(ALLDEP)
266 | 								$(HIDEECHOA) mkdir -p $(WORKPATH)/make_lto_icc
267 | 								$(ICCLINK) $(INTELLINKIPO)$@ $(CPPFILES_ICC)
268 | #CUDA device link: with CONFIG_CUDA_DC set to 1, nvcc -dlink is run over the objects of all CUFILES and the resulting dl_$(TARGET) object is added to OBJFILES.
269 | $(WORKPATH)/make_cuda_device_link/dl_$(TARGET).$(OBJ):	$(CUFILES:%.cu=$(WORKPATH)/cu/%.$(OBJ)) $(ALLDEP)
270 | 								$(HIDEECHOA) mkdir -p $(WORKPATH)/make_cuda_device_link
271 | 								$(NVCC) $(NVCCFLAGSUSE) -dlink $(CUFILES:%.cu=$(WORKPATH)/cu/%.$(OBJ)) -o $@ -lcudadevrt
272 | 							
273 | clean:							$(SUBTARGETS:%=subclean/%.mak)
274 | 								rm -Rf *.plg *.dpi *.exp *.lib $(EXECUTABLE) $(COPIED_EXECUTABLE) x64/release/* *.cubin *.gpu *.ptx *.linkinfo *.ii cuda.compute_* $(DEPENDS) $(OBJFILES) $(CUFILES:%.cu=$(WORKPATH)/cu/%.cpp) $(ASMFILES:%.asm=$(WORKPATH)/asm/%.asm) $(CLEANRELEASEDIR) $(SUBTARGETS_CLEAN)
275 | 							
276 | subclean/%.mak:
277 | 								+export CONFIGFILE=config_`echo $@ | sed s,subclean/,,` && $(MAKE) -f makefile clean
278 | 
279 | SAVEDIR							= releases/`date +%F`-BUILD-`cat buildnr`							
280 | backup:						
281 | 								mkdir $(SAVEDIR)
282 | 								cp *.cpp *.h makefile buildnr *.sh *.bat *.conf *.cu $(SAVEDIR)
283 | 
284 | #Dependencies: every source file gets a .d file (.qtd for Qt forms) produced by running $(GCC3264) with -MM; these files are included at the end of this makefile unless CHECK_DEPENDENCIES is 0.
285 | #DEPSED1 joins the compiler output into a single line and opens the sed call whose first expression (given by each rule) rewrites the target name; DEPSED2 appends expressions turning backslashes into slashes and c:/ into /cygdrive/c/, and redirects the result into the dependency file.
286 | DEPMKDIR						= $(MKDIR) &&
287 | DEPGCC							= $(GCC3264) $(GCCFLAGSARCH) $(GCCINCLUDEPATHS) $(GCCDEFINES) -x c++ -MM $<
288 | DEPSED1							= sed -e ':a;N;$$!ba;s/\n/ /g' | sed -e
289 | DEPSED2							= -e 's,\\,/,g' \
290 | 								-e 's,[cC]:/,/cygdrive/c/,g' > \
291 | 								$@;
292 | 
293 | 
294 | $(WORKPATH)/qt/%.qtd:			%.cpp %.ui $(WORKPATH)/qt/%_ui.h $(ALLDEP)
295 | 								$(DEPMKDIR) $(DEPGCC) | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/cpp/$*.$(OBJ) $@ : ,g' $(DEPSED2)
296 | 							
297 | $(WORKPATH)/cl/%.d:				%.cl $(ALLDEP)
298 | 								$(DEPMKDIR) $(DEPGCC) -I$(GCCPATH)/include -D__OPENCL__ | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/cl/$*.clbin $(WORKPATH)/cl/$*.cl $@ : ,g' $(DEPSED2)
299 | 							
300 | $(WORKPATH)/cu/%.d:				%.cu $(ALLDEP)
301 | 								$(DEPMKDIR) $(DEPGCC) -I$(GCCPATH)/include -D_MSC_VER=1700 -D__CUDACC__ | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/cu/$*.$(OBJ) $(WORKPATH)/cu/$*.cpp $@ : ,g' $(DEPSED2)
302 | 							
303 | $(WORKPATH)/asm/%.d:			%.asm $(ALLDEP)
304 | 								$(DEPMKDIR) $(DEPGCC) | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/asm/$*.$(OBJ) $(WORKPATH)/asm/$*.asm $@ : ,g' $(DEPSED2)
305 | 
306 | $(WORKPATH)/cpp/%.d:			%.cpp $(ALLDEP)
307 | 								$(DEPMKDIR) $(DEPGCC) | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/cpp/$*.$(OBJ) $@ : ,g' $(DEPSED2)
308 | 							
309 | $(WORKPATH)/cxx/%.d:			%.cxx $(ALLDEP)
310 | 								$(DEPMKDIR) $(DEPGCC) | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/cxx/$*.$(OBJ) $@ : ,g' $(DEPSED2)
311 | 
312 | $(WORKPATH)/c/%.d:				%.c $(ALLDEP)
313 | 								$(DEPMKDIR) $(DEPGCC) | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/c/$*.$(OBJ) $@ : ,g' $(DEPSED2)
314 | 							
315 | $(WORKPATH)/dbg/%.d:			%.cpp $(ALLDEP)
316 | 								$(DEPMKDIR) $(DEPGCC) -DDEBUG_RUNTIME | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/dbg/$*.$(OBJ) $@ : ,g' $(DEPSED2)
317 | 
318 | $(WORKPATH)/icc/%.d:			%.cpp $(ALLDEP)
319 | 								$(DEPMKDIR) $(DEPGCC) -DINTEL_RUNTIME | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/icc/$*.$(OBJ) $@ : ,g' $(DEPSED2)
320 | 							
321 | $(WORKPATH)/vcc/%.d:			%.cpp $(ALLDEP)
322 | 								$(DEPMKDIR) $(DEPGCC) -DVECTORC_RUNTIME | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/vcc/$*.$(OBJ) $@ : ,g' $(DEPSED2)
323 | 							
324 | $(WORKPATH)/gcc/%.d:			%.cpp $(ALLDEP)
325 | 								$(DEPMKDIR) $(DEPGCC) -DGCC_RUNTIME | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/gcc/$*.$(OBJ) $@ : ,g' $(DEPSED2)
326 | 							
327 | $(WORKPATH)/mscc/%.d:			%.cpp $(ALLDEP)
328 | 								$(DEPMKDIR) $(DEPGCC) -DVSNET_RUNTIME | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/mscc/$*.$(OBJ) $@ : ,g' $(DEPSED2)
329 | 
330 | $(WORKPATH)/clang/%.d:			%.cpp $(ALLDEP)
331 | 								$(DEPMKDIR) $(DEPGCC) -DCLANG_RUNTIME | $(DEPSED1) 's,^[a-zA-Z0-9._-]*[ ]*:,$(WORKPATH)/clang/$*.$(OBJ) $@ : ,g' $(DEPSED2)
332 | 
333 | ifneq ($(CHECK_DEPENDENCIES), 0)
334 | include $(DEPENDS)							
335 | endif
336 | 


--------------------------------------------------------------------------------
/makefiles/makefile_opencl_compiler.cpp:
--------------------------------------------------------------------------------
  1 | #define _CRT_SECURE_NO_WARNINGS
  2 | #include "CL/opencl.h"
  3 | #include <stdlib.h>
  4 | #include <stdio.h>
  5 | #include <string.h>
  6 | #include <string>
  7 | #include <vector>
  8 | 
  9 | #include "opencl_compiler_structs.h"
 10 | 
 11 | #define quit(arg) {fprintf(stderr, arg "\n");return(1);} // prints arg (must be a string literal, it is concatenated with "\n") to stderr and returns 1 from the enclosing function
 12 | #define DEFAULT_OPENCL_COMPILER_OPTIONS "" // initial OpenCL build option string; arguments after "--" on the command line are appended to it
 13 | #define DEFAULT_OUTPUT_FILE "opencl.out" // output file name used when no -output-file argument is given
 14 | 
 15 | int main(int argc, char** argv)
 16 | {
 17 | 	const char* output_file = DEFAULT_OUTPUT_FILE;
 18 | 	std::string compiler_options = DEFAULT_OPENCL_COMPILER_OPTIONS;
 19 | 	std::vector<char*> files;
 20 | 
 21 | 	printf("Passing command line options:\n");
 22 | 	bool add_option = false;
 23 | 	for (int i = 1;i < argc;i++)
 24 | 	{
 25 | 		if (add_option)
 26 | 		{
 27 | 			compiler_options += " ";
 28 | 			compiler_options += argv[i];
 29 | 		}
 30 | 		else if (strcmp(argv[i], "--") == 0)
 31 | 		{
 32 | 			add_option = true;
 33 | 		}
 34 | 		else if (strcmp(argv[i], "-output-file") == 0)
 35 | 		{
 36 | 			if (++i >= argc) quit("Output file name missing");
 37 | 			output_file = argv[i];
 38 | 		}
 39 | 		else
 40 | 		{
 41 | 			fprintf(stderr, "%s\n", argv[i]);
 42 | 			files.push_back(argv[i]);
 43 | 		}
 44 | 	}
 45 | 	
 46 | 	cl_int ocl_error;
 47 | 	cl_uint num_platforms;
 48 | 	if (clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS) quit("Error getting OpenCL Platform Count");
 49 | 	if (num_platforms == 0) quit("No OpenCL Platform found");
 50 | 	printf("%d OpenCL Platforms found\n", num_platforms);
 51 | 	
 52 | 	//Query platforms
 53 | 	cl_platform_id* platforms = new cl_platform_id[num_platforms];
 54 | 	if (platforms == NULL) quit("Memory allocation error");
 55 | 	if (clGetPlatformIDs(num_platforms, platforms, NULL) != CL_SUCCESS) quit("Error getting OpenCL Platforms");
 56 | 
 57 | 	cl_platform_id platform;
 58 | 	bool found = false;
 59 | 
 60 | 	_makefiles_opencl_platform_info pinfo;
 61 | 	for (unsigned int i_platform = 0;i_platform < num_platforms;i_platform++)
 62 | 	{
 63 | 		clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_PROFILE, 64, pinfo.platform_profile, NULL);
 64 | 		clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_VERSION, 64, pinfo.platform_version, NULL);
 65 | 		clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_NAME, 64, pinfo.platform_name, NULL);
 66 | 		clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_VENDOR, 64, pinfo.platform_vendor, NULL);
 67 | 		printf("Available Platform %d: (%s %s) %s %s\n", i_platform, pinfo.platform_profile, pinfo.platform_version, pinfo.platform_vendor, pinfo.platform_name);
 68 | 		if (strcmp(pinfo.platform_vendor, "Advanced Micro Devices, Inc.") == 0)
 69 | 		{
 70 | 			found = true;
 71 | 			printf("AMD OpenCL Platform found (%d)\n", i_platform);
 72 | 			platform = platforms[i_platform];
 73 | 			break;
 74 | 		}
 75 | 	}
 76 | 	if (found == false)
 77 | 	{
 78 | 		quit("Did not find AMD OpenCL Platform");
 79 | 	}
 80 | 
 81 | 	if (clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &pinfo.count) != CL_SUCCESS)
 82 | 	{
 83 | 		quit("Error getting OPENCL Device Count");
 84 | 	}
 85 | 
 86 | 	//Query devices
 87 | 	cl_device_id* devices = new cl_device_id[pinfo.count];
 88 | 	if (devices == NULL) quit("Memory allocation error");
 89 | 	if (clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, pinfo.count, devices, NULL) != CL_SUCCESS) quit("Error getting OpenCL devices"); 
 90 | 
 91 | 	_makefiles_opencl_device_info dinfo;
 92 | 	cl_device_type device_type;
 93 | 	cl_uint freq, shaders;
 94 | 
 95 | 	printf("Available OPENCL devices:\n");
 96 | 	for (unsigned int i = 0;i < pinfo.count;i++)
 97 | 	{
 98 | 		printf("Examining device %d\n", i);
 99 | 
100 | 		clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 64, dinfo.device_name, NULL);
101 | 		clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, 64, dinfo.device_vendor, NULL);
102 | 		clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL);
103 | 		clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(freq), &freq, NULL);
104 | 		clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(shaders), &shaders, NULL);
105 | 		clGetDeviceInfo(devices[i], CL_DEVICE_ADDRESS_BITS, sizeof(dinfo.nbits), &dinfo.nbits, NULL);
106 | 		printf("Found Device %d: %s %s (Frequency %d, Shaders %d, %d bit)\n", i, dinfo.device_vendor, dinfo.device_name, (int) freq, (int) shaders, (int) dinfo.nbits);
107 | 	}
108 | 
109 | 	if (files.size() == 0)
110 | 	{
111 | 		quit("Syntax: opencl [-output-file OUTPUT_FILE] FILE1 [FILE2] ... [FILEn] [-- COMPILER_OPTION_1] [COMPILER_OPTION_2] ... [COMPILER_OPTION_N]");
112 | 	}
113 | 
114 | 	char** buffers = (char**) malloc(files.size() * sizeof(char*));
115 | 	if (buffers == NULL) quit("Memory allocation error\n");
116 | 	for (unsigned int i = 0;i < files.size();i++)
117 | 	{
118 | 		printf("Reading source file %s\n", files[i]);
119 | 		FILE* fp = fopen(files[i], "rb");
120 | 		if (fp == NULL)
121 | 		{
122 | 			printf("Cannot open %s\n", files[i]);
123 | 			return(1);
124 | 		}
125 | 		fseek(fp, 0, SEEK_END);
126 | 		size_t file_size = ftell(fp);
127 | 		fseek(fp, 0, SEEK_SET);
128 | 
129 | 		buffers[i] = (char*) malloc(file_size + 1);
130 | 		if (buffers[i] == NULL)
131 | 		{
132 | 			quit("Memory allocation error");
133 | 		}
134 | 		if (fread(buffers[i], 1, file_size, fp) != file_size)
135 | 		{
136 | 			quit("Error reading file");
137 | 		}
138 | 		buffers[i][file_size] = 0;
139 | 		fclose(fp);
140 | 	}
141 | 
142 | 	printf("Creating OpenCL Context\n");
143 | 	//Create OpenCL context
144 | 	cl_context context = clCreateContext(NULL, pinfo.count, devices, NULL, NULL, &ocl_error);
145 | 	if (ocl_error != CL_SUCCESS) quit("Error creating OpenCL context");
146 | 
147 | 	printf("Creating OpenCL Program Object\n");
148 | 	//Create OpenCL program object
149 | 	cl_program program = clCreateProgramWithSource(context, (cl_uint) files.size(), (const char**) buffers, NULL, &ocl_error);
150 | 	if (ocl_error != CL_SUCCESS) quit("Error creating program object");
151 | 
152 | 	printf("Compiling OpenCL Program\n");
153 | 	//Compile program
154 | 	ocl_error = clBuildProgram(program, pinfo.count, devices, compiler_options.c_str(), NULL, NULL);
155 | 	if (ocl_error != CL_SUCCESS)
156 | 	{
157 | 		fprintf(stderr, "OpenCL Error while building program: %d (Compiler options: %s)\n", ocl_error, compiler_options.c_str());
158 | 		fprintf(stderr, "OpenCL Kernel:\n\n");
159 | 		for (unsigned int i = 0;i < files.size();i++)
160 | 		{
161 | 			printf("%s\n\n", buffers[i]);
162 | 		}
163 | 		
164 | 		for (unsigned int i = 0;i < pinfo.count;i++)
165 | 		{
166 | 			cl_build_status status;
167 | 			clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_STATUS, sizeof(status), &status, NULL);
168 | 			if (status == CL_BUILD_ERROR)
169 | 			{
170 | 				size_t log_size;
171 | 				clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
172 | 				char* build_log = (char*) malloc(log_size + 1);
173 | 				if (build_log == NULL) quit("Memory allocation error");
174 | 				clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
175 | 				fprintf(stderr, "Build Log (device %d):\n\n%s\n\n", i, build_log);
176 | 				free(build_log);
177 | 			}
178 | 		}
179 | 	}
180 | 	for (unsigned int i = 0;i < files.size();i++)
181 | 	{
182 | 		free(buffers[i]);
183 | 	}
184 | 	free(buffers);
185 | 	if (ocl_error != CL_SUCCESS) return(1);
186 | 
187 | 	printf("Obtaining program binaries\n");
188 | 	size_t* binary_sizes = (size_t*) malloc(pinfo.count * sizeof(size_t));
189 | 	if (binary_sizes == NULL) quit("Memory allocation error");
190 | 	clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, pinfo.count * sizeof(size_t), binary_sizes, NULL);
191 | 	char** binary_buffers = (char**) malloc(pinfo.count * sizeof(char*));
192 | 	if (binary_buffers == NULL) quit("Memory allocation error");
193 | 	for (unsigned int i = 0;i < pinfo.count;i++)
194 | 	{
195 | 		printf("Binary size for device %d: %d\n", i, (int) binary_sizes[i]);
196 | 		binary_buffers[i] = (char*) malloc(binary_sizes[i]);
197 | 		memset(binary_buffers[i], 0, binary_sizes[i]);
198 | 		if (binary_buffers[i] == NULL) quit("Memory allocation error");
199 | 	}
200 | 	clGetProgramInfo(program, CL_PROGRAM_BINARIES, pinfo.count * sizeof(char*), binary_buffers, NULL);
201 | 
202 | 	printf("Programs obtained successfully, cleaning up opencl\n");
203 | 	clReleaseProgram(program);
204 | 	clReleaseContext(context);
205 | 
206 | 	printf("Writing binaries to file (%s)\n", output_file);
207 | 	FILE* fp;
208 | 	fp = fopen(output_file, "w+b");
209 | 	if (fp == NULL) quit("Error opening output file\n");
210 | 	const char* magic_bytes = "QOCLPB";
211 | 	fwrite(magic_bytes, 1, strlen(magic_bytes) + 1, fp);
212 | 	fwrite(&pinfo, 1, sizeof(pinfo), fp);
213 | 	for (unsigned int i = 0;i < pinfo.count;i++)
214 | 	{
215 | 		clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 64, dinfo.device_name, NULL);
216 | 		clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, 64, dinfo.device_vendor, NULL);
217 | 		dinfo.binary_size = binary_sizes[i];
218 | 		fwrite(&dinfo, 1, sizeof(dinfo), fp);
219 | 		fwrite(binary_buffers[i], 1, binary_sizes[i], fp);
220 | 	}
221 | 	fclose(fp);
222 | 
223 | 	printf("All done, cleaning up remaining buffers\n");
224 | 	for (unsigned int i = 0;i < pinfo.count;i++)
225 | 	{
226 | 		free(binary_buffers[i]);
227 | 	}
228 | 	free(binary_sizes);
229 | 	free(binary_buffers);
230 | 
231 | 	return(0);
232 | }


--------------------------------------------------------------------------------
/makefiles/opencl_compiler_structs.h:
--------------------------------------------------------------------------------
//Platform header record of a precompiled OpenCL program file.
//The compiler tool writes this struct as raw bytes (fwrite of the whole object) directly behind the
//"QOCLPB" magic string and the loader reads it back from the raw bytes, so the order and size of
//the fields define the file layout and must not be changed without regenerating all program files.
struct _makefiles_opencl_platform_info
{
	//Fixed size text fields describing the OpenCL platform (presumably filled from clGetPlatformInfo - confirm in the compiler tool)
	char platform_profile[64];
	char platform_version[64];
	char platform_name[64];
	char platform_vendor[64];
	//Number of devices, i.e. number of device records (each followed by its binary) stored after this header
	cl_uint count;
};
 9 | 
//Per device record of a precompiled OpenCL program file.
//Written as raw bytes once per device, immediately followed by binary_size bytes of device binary.
struct _makefiles_opencl_device_info
{
	//CL_DEVICE_NAME and CL_DEVICE_VENDOR, both queried with a 64 byte buffer
	char device_name[64];
	char device_vendor[64];
	//CL_DEVICE_ADDRESS_BITS of the device the binary was built for
	cl_uint nbits;
	//Length in bytes of the binary that follows this record
	//NOTE(review): size_t differs between 32 and 64 bit builds, so the file layout presumably depends on the word size of the tool that wrote it - confirm
	size_t binary_size;
};
17 | 


--------------------------------------------------------------------------------
/makefiles/opencl_obtain_program.h:
--------------------------------------------------------------------------------
 1 | #ifndef MAKEFILES_OPENCL_OBTAIN_PROGRAMH
 2 | #define MAKEFILES_OPENCL_OBTAIN_PROGRAMH
 3 | 
 4 | #include <CL/opencl.h>
 5 | #include <vector>
 6 | #include "opencl_compiler_structs.h"
 7 | 
 8 | static int _makefiles_opencl_obtain_program_helper(cl_context context, cl_uint num_devices, cl_device_id* devices, cl_program* program, char* binaries)
 9 | {
10 | 	const char* magic_bytes = "QOCLPB";
11 | 	if (strncmp(magic_bytes, binaries, strlen(magic_bytes)) != 0)
12 | 	{
13 | 		printf("Internal error accessing opencl program\n");
14 | 		return(1);
15 | 	}
16 | 	char* current_ptr = binaries + strlen(magic_bytes) + 1;
17 | 	_makefiles_opencl_platform_info* pinfo = (_makefiles_opencl_platform_info*) current_ptr;
18 | 	current_ptr += sizeof(_makefiles_opencl_platform_info);
19 | 
20 | 	if (num_devices != pinfo->count)
21 | 	{
22 | 		printf("Number of devices differs from number of devices in opencl program\n");
23 | 		return(1);
24 | 	}
25 | 	//printf("Obtaining program for OpenCL Platform: (%s %s) %s %s\n", pinfo->platform_profile, pinfo->platform_version, pinfo->platform_vendor, pinfo->platform_name);
26 | 
27 | 	std::vector<size_t> program_sizes(pinfo->count);
28 | 	std::vector<char*> program_binaries(pinfo->count);
29 | 
30 | 	for (unsigned int i = 0;i < pinfo->count;i++)
31 | 	{
32 | 		char device_name[64], device_vendor[64];
33 | 		cl_uint nbits;
34 | 		clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 64, device_name, NULL);
35 | 		clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, 64, device_vendor, NULL);
36 | 		clGetDeviceInfo(devices[i], CL_DEVICE_ADDRESS_BITS, sizeof(nbits), &nbits, NULL);
37 | 		_makefiles_opencl_device_info* dinfo = (_makefiles_opencl_device_info*) current_ptr;
38 | 		if (strcmp(device_name, dinfo->device_name) != 0 || strcmp(device_vendor, dinfo->device_vendor) != 0)
39 | 		{
40 | 			printf("Device list is different to device list from opencl program\n");
41 | 			return(1);
42 | 		}
43 | 		if (nbits != dinfo->nbits)
44 | 		{
45 | 			printf("Pointer size of device and stored device binary differs\n");
46 | 			return(1);
47 | 		}
48 | 		current_ptr += sizeof(_makefiles_opencl_device_info);
49 | 		//printf("Device %d: %s %s (size %lld)\n", i, dinfo->device_vendor, dinfo->device_name, (long long int) dinfo->binary_size);
50 | 		program_sizes[i] = dinfo->binary_size;
51 | 		program_binaries[i] = current_ptr;
52 | 		current_ptr += dinfo->binary_size;
53 | 	}
54 | 
55 | 	cl_int return_status[pinfo->count];
56 | 	cl_int ocl_error;
57 | 	*program = clCreateProgramWithBinary(context, num_devices, devices, program_sizes.data(), (const unsigned char**) program_binaries.data(), return_status, &ocl_error);
58 | 
59 | 	if (ocl_error != CL_SUCCESS)
60 | 	{
61 | 		printf("Error loading program\n");
62 | 		return(1);
63 | 	}
64 | 
65 | 	for (unsigned int i = 0;i < pinfo->count;i++)
66 | 	{
67 | 		if (return_status[i] != CL_SUCCESS)
68 | 		{
69 | 			printf("Error loading program for device %d\n", i);
70 | 			clReleaseProgram(*program);
71 | 			return(1);
72 | 		}
73 | 	}
74 | 
75 | 	ocl_error = clBuildProgram(*program, num_devices, devices, "", NULL, NULL);
76 | 	if (ocl_error != CL_SUCCESS)
77 | 	{
78 | 		printf("Error building program\n");
79 | 		clReleaseProgram(*program);
80 | 		return(1);
81 | 	}
82 | 
83 | 	return(0);
84 | }
85 | 
86 | #endif
87 | 


--------------------------------------------------------------------------------
/makefiles/x86_64-pc-linux-gnu.mak:
--------------------------------------------------------------------------------
#Installation roots of the vendor SDKs, taken from CUDA_PATH and AMDAPPSDKROOT (presumably environment variables)
CUDAPATH						= $(CUDA_PATH)
CUDASDKPATH						= $(CUDAPATH)/sdk
AMDPATH							= $(AMDAPPSDKROOT)
#Derive the Intel installation root from the icc found in PATH by cutting off the trailing /bin/.../icc part
INTELPATH						:= $(shell which icc 2> /dev/null | sed "s,/bin/.*/icc$$,,")
#No icc in PATH: fall back to a fixed default installation directory
ifeq ($(INTELPATH), )
INTELPATH						= /opt/intel/compilers_and_libraries_2016.2.181/linux
endif

#Compiler executables, GCC3264 and CLANG3264 are used for both the 32 and the 64 bit configuration below
GCC3264							= c++
CLANG3264						= clang++
ICC32							= $(INTELPATH)/bin/ia32/icc
ICC64							= $(INTELPATH)/bin/intel64/icc
 13 | 
#Intel Compiler Options
#Flags for optimized builds, INTELARCH is passed to -x to select the instruction set
INTELFLAGSOPT					= -O3 -fno-alias -fno-fnalias -x$(INTELARCH) -unroll -unroll-aggressive -g0
#CONFIG_LTO=1 enables interprocedural optimization across files (-ipo), otherwise only within each file (-ip)
ifeq ($(CONFIG_LTO), 1)
INTELFLAGSOPT					+= -ipo
INTELLINKIPO					= -ipo-c -ipo-fo
else
INTELFLAGSOPT					+= -ip
endif
#Flags for debug builds
INTELFLAGSDBG					= -O0 -g
#Flags shared by both word sizes, INTELFLAGSUSE is expected to be set by the including makefile
INTELFLAGSCOMMON				= -DINTEL_RUNTIME $(INTELFLAGSUSE) -fasm-blocks
INTELFLAGS32					= $(INTELFLAGSCOMMON) -m32
INTELFLAGS64					= $(INTELFLAGSCOMMON) -m64 -D_AMD64_
 26 | 
 27 | ifeq ($(GCCARCH), )
 28 | GCCARCHA							= -march=native -msse4.2 -m$(ARCHBITS)
 29 | else
 30 | GCCARCHA							= -march=$(GCCARCH) -msse4.2 -m$(ARCHBITS)
 31 | endif
 32 | 
 33 | ifeq ("$(CONFIG_OPENMP)", "1")
 34 | INTELFLAGSCOMMON					+= -qopenmp
 35 | ifneq ("0$(CPPFILES_ICC)", "0")
 36 | LIBSUSE							+= -liomp5
 37 | endif
 38 | ifeq ($(CC_SELECTED), ICC)
 39 | LIBSUSE							+= -liomp5
 40 | endif
 41 | endif
 42 | 
#GCC link flags
LINKFLAGSCOMMON					= -Wall
#CONFIG_STATIC=1 requests a statically linked result
ifeq ($(CONFIG_STATIC), 1)
LINKFLAGSCOMMON					+= -static
endif
#Debug information is added unless CONFIG_GDB is explicitly set to 0 (an unset CONFIG_GDB also enables it)
ifneq ($(CONFIG_GDB), 0)
LINKFLAGSCOMMON					+= -ggdb
endif
LINKFLAGS32						= -m32 $(LINKFLAGSCOMMON)
LINKFLAGS64						= -m64 $(LINKFLAGSCOMMON)

#Compilation Output Control
#HIDEECHOB expands to the make "do not echo this command" prefix unless VERBOSE=1 is given
ifneq ("$(VERBOSE)", "1")
HIDEECHOB						= @
endif
 58 | 
 59 | ifeq ($(ARCHBITS), 64)
 60 | ASM								= yasm -f elf64
 61 | ICC								= $(ICC64) $(INTELFLAGS64) $(CFLAGS64) $(COMPILETARGETTYPE)
 62 | GCC								= $(GCC3264) $(GCCFLAGS64) $(GCCFLAGSCOMMON) $(GCCFLAGSUSE) $(COMPILETARGETTYPE)
 63 | CCDBG							= $(GCC3264) $(GCCFLAGS64) $(GCCFLAGSCOMMON) $(GCCFLAGSDBG) $(COMPILETARGETTYPE) -DDEBUG_RUNTIME
 64 | GCCLINK							= $(GCC3264) $(LINKFLAGS64)
 65 | ICCLINK							= $(ICC64) $(LINKFLAGS64) -openmp
 66 | CUDALIBPATH						= $(CUDAPATH)/lib64
 67 | AMDLIBPATH						= $(AMDPATH)/lib/x86_64
 68 | INTELLIBPATH					= $(INTELPATH)/compiler/lib/intel64
 69 | CLANG							= $(CLANG3264) $(GCCFLAGS64) $(CLANGFLAGSCOMMON) $(CLANGFLAGSUSE) $(COMPILETARGETTYPE)
 70 | else
 71 | ASM								= yasm -f elf32
 72 | ICC								= $(ICC32) $(INTELFLAGS32) $(CFLAGS32) $(COMPILETARGETTYPE)
 73 | GCC								= $(GCC3264) $(GCCFLAGS32) $(GCCFLAGSCOMMON) $(GCCFLAGSUSE) $(COMPILETARGETTYPE)
 74 | CCDBG							= $(GCC3264) $(GCCFLAGS32) $(GCCFLAGSCOMMON) $(GCCFLAGSDBG) $(COMPILETARGETTYPE) -DDEBUG_RUNTIME
 75 | GCCLINK							= $(GCC3264) $(LINKFLAGS32)
 76 | ICCLINK							= $(GCC3264) $(LINKFLAGS32) -openmp
 77 | CUDALIBPATH						= $(CUDAPATH)/lib
 78 | AMDLIBPATH						= $(AMDPATH)/lib/x86
 79 | INTELLIBPATH					= $(INTELPATH)/compiler/lib/ia32
 80 | CLANG							= $(CLANG3264) $(GCCFLAGS32) $(CLANGFLAGSCOMMON) $(CLANGFLAGSUSE) $(COMPILETARGETTYPE)
 81 | endif
#Qt code generators
QTUIC							= uic
QTMOC							= moc

#TARGETTYPE=LIB builds a shared object from position independent code, everything else builds a plain executable
ifeq ($(TARGETTYPE), LIB)
LINKTARGETTYPE					= -shared
COMPILETARGETTYPE				= -fPIC
EXECUTABLE						= $(TARGET).so
else
LINKTARGETTYPE					=
COMPILETARGETTYPE				=
EXECUTABLE						= $(TARGET)
endif
LIBGLIBC						=

#System libraries that are always linked
LIBSUSE							+= $(LIBGLIBC) -lrt -ldl -lpthread
 97 | 
#Select the compiler and linker command from CC_x86_64-pc-linux-gnu: ICC, clang, or GCC for every other value
ifeq ($(CC_x86_64-pc-linux-gnu), ICC)
CC								= $(ICC)
LINK							= $(ICCLINK)
else
ifeq ($(CC_x86_64-pc-linux-gnu), clang)
CC								= $(CLANG)
LINK							= $(CLANG)
else
CC								= $(GCC)
LINK							= $(GCCLINK)
endif
#Not linking with icc but some files are compiled with it: add the Intel runtime libraries explicitly
ifneq ($(CPPFILES_ICC), )
LIBSUSE							+= -lintlc -lsvml -limf -lirc
endif
endif
CC_SELECTED						= $(CC_x86_64-pc-linux-gnu)

#Helper compilers: host side compilation for CUDA, assembler preprocessing, and nvcc with GCCCUDA as its host compiler
CCCUDA							= $(GCC) -x c++ -Wno-effc++
ASMPRE							= $(GCC3264)
NVCC							= $(CUDAPATH)/bin/nvcc --compiler-bindir $(GCCCUDA)
118 | 
#Start with empty include and library search path lists, the sections below append to them
COMMONINCLUDEPATHS				=
LIBPATHSUSE						=

#CUDA runtime libraries, only needed when the project contains .cu files
ifneq ($(CUFILES), )
LIBSUSE							+= -lcudart -lcuda
ifeq ($(CONFIG_CUDA_DC), 1)
LIBSUSE							+= -lcudadevrt
endif
ifeq ($(CONFIG_CUBLAS), 1)
LIBSUSE							+= -lcublas
endif
endif
#$(CUDASDKPATH)/C/lib/libcutil.a

#Optional libraries, each one enabled by its CONFIG_* switch
OPENCLLIB						= -lOpenCL
ifeq ("$(CONFIG_OPENCL)", "1")
LIBSUSE							+= $(OPENCLLIB)
endif
ifeq ("$(CONFIG_CAL)", "1")
LIBSUSE							+= -laticalcl -laticalrt
COMMONINCLUDEPATHS				+= $(AMDPATH)/include/CAL
LIBPATHSUSE						+= -L$(AMDLIBPATH)
endif
ifeq ("$(CONFIG_OPENGL)", "1")
LIBSUSE							+= -lGL -lGLU -lglut -lGLEW
endif
ifeq ("$(CONFIG_X11)", "1")
LIBSUSE							+= -lX11
endif

#Qt 4 libraries and headers at fixed system locations
ifeq ("$(CONFIG_QT)", "1")
LIBSUSE							+= -lQtGui -lQtCore
COMMONINCLUDEPATHS				+= /usr/include/qt4 /usr/include/qt4/QtGui /usr/include/qt4/QtCore /usr/include/qt4/QtWidgets $(WORKPATH)/qt
ifeq ($(ARCHBITS), 64)
LIBPATHSUSE						+= -L/usr/lib/qt4
else
LIBPATHSUSE						+= -L/usr/lib32/qt4
endif
endif

#Project specific libraries from LIBS, each name is turned into a -l option
LIBSUSE							+= $(LIBS:%=-l%)
160 | 
#Include and library search paths for the OpenCL implementation selected with CONFIG_OPENCL_VERSION (AMD, NVIDIA, Intel or All)
ifeq ("$(CONFIG_OPENCL)", "1")
ifeq ("$(CONFIG_OPENCL_VERSION)", "AMD")
COMMONINCLUDEPATHS				+= "$(AMDPATH)/include"
#The library path has to be appended to LIBPATHSUSE (as in the All case), a bare -L... line is not valid makefile syntax
LIBPATHSUSE						+= -L$(AMDLIBPATH)
endif
ifeq ("$(CONFIG_OPENCL_VERSION)", "NVIDIA")
COMMONINCLUDEPATHS				+= "$(CUDAPATH)/include"
endif
ifeq ("$(CONFIG_OPENCL_VERSION)", "Intel")
#COMMONINCLUDEPATHS				+= ""
endif
ifeq ("$(CONFIG_OPENCL_VERSION)", "All")
COMMONINCLUDEPATHS				+= "$(AMDPATH)/include"
COMMONINCLUDEPATHS				+= "$(CUDAPATH)/include"
LIBPATHSUSE						+= -L$(AMDLIBPATH)
endif
endif
178 | 
#CUDA toolkit and SDK headers plus the CUDA library directory
ifeq ("$(CONFIG_CUDA)", "1")
COMMONINCLUDEPATHS				+= "$(CUDAPATH)/include"
COMMONINCLUDEPATHS				+= "$(CUDASDKPATH)/common/inc"
LIBPATHSUSE						+= -L$(CUDALIBPATH)
endif

#This platform takes its include paths and defines from the GCC specific lists
INCLUDEPATHSUSE					= $(GCCINCLUDEPATHS)
DEFINESUSE						= $(GCCDEFINES)

#The Intel library directory is always searched, followed by the project specific LIBPATHS
LIBPATHSUSE						+= -L$(INTELLIBPATH) $(LIBPATHS:%=-L%)

#Shell snippet (backticks, so it runs when the variable is used in a command) that emits one -gencode option per entry of CUDAVERSION
NVCCARCHS						:= `for i in $(CUDAVERSION); do echo -n -gencode arch=compute_$$i,code=sm_$$i\ ;done`
#grep pattern matching #line directives, empty lines and preprocessor line markers
NVCC_GREP						= "^#line\|^$$\|^# [0-9]* "

#Command line option spellings of this toolchain, used by the generic makefile
COMPILEOUTPUTBASE				= -o
COMPILEOUTPUT					= $(COMPILEOUTPUTBASE) $@
LINKOUTPUT						= -o $@
COMPILEONLY						= -c
ASMONLY							=
PRECOMPILEONLY					= -x c++ -E
OPTIONINCLUDEPATH				= -I
OBJ								= o
201 | 


--------------------------------------------------------------------------------
/makefiles/x86_64-unknown-cygwin.mak:
--------------------------------------------------------------------------------
#The 64 bit cygwin target reuses the complete 32 bit cygwin configuration
#NOTE(review): the variable below is spelled x64_64 while this file is named x86_64-unknown-cygwin, which looks like a typo - confirm which name the main makefile reads
CC_x64_64-unknown-cygwin				= $(CC_i686-pc-cygwin)
#Rebuild when the included configuration changes
ALLDEP									+= makefiles/i686-pc-cygwin.mak
include makefiles/i686-pc-cygwin.mak
4 | 


--------------------------------------------------------------------------------
/memtest/.gitignore:
--------------------------------------------------------------------------------
1 | mem
2 | 


--------------------------------------------------------------------------------
/memtest/build.sh:
--------------------------------------------------------------------------------
# Builds the memory test binary "mem" against the OpenCL library of the AMD APP SDK.
# Libraries are listed after the source files that use them: the linker resolves symbols from left to
# right, so with --as-needed or static libraries a library named before its users is dropped.
# The SDK paths are quoted so that an installation directory containing spaces still works.
c++ -m64 -o mem -I"$AMDAPPSDKROOT/include" mem.cpp ../cmodules/timer.cpp -L"$AMDAPPSDKROOT/lib/x86_64" -lOpenCL -lrt
2 | 


--------------------------------------------------------------------------------
/memtest/cmd:
--------------------------------------------------------------------------------
1 | ./mem -g -2 -c -1 -x -z -l -lh 3072 -lw 3072 -lx 20 -ly 20 -a -u
2 | 


--------------------------------------------------------------------------------
/memtest/info.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Prints the name and the last 1000 output lines of a fixed list of diagnostic commands:
# three ulimit queries, clinfo, dmesg and the content of /var/log/messages.
commands=("ulimit -m" "ulimit -v" "ulimit -l" "clinfo" "dmesg" "cat /var/log/messages")
for cmd in "${commands[@]}"; do
    # Deliberately unquoted: the entry has to be split into the command and its arguments
    echo $cmd
    $cmd | tail -n 1000
done
6 | 


--------------------------------------------------------------------------------
/memtest/timer.cpp:
--------------------------------------------------------------------------------
 1 | #include "timer.h"
 2 | #ifdef _WIN32
 3 | #include <windows.h>
 4 | #include <winbase.h>
 5 | #else
 6 | #include <time.h>
 7 | #endif
 8 | 
 9 | HighResTimer::HighResTimer()
10 | {
11 | 	ElapsedTime = 0;
12 | }
13 | 
14 | HighResTimer::~HighResTimer() {}
15 | 
16 | void HighResTimer::Start()
17 | {
18 | #ifdef _WIN32
19 | 	__int64 istart;
20 | 	QueryPerformanceCounter((LARGE_INTEGER*)&istart);
21 | 	StartTime = (double) istart;
22 | #else
23 | 	timespec tv;
24 | 	clock_gettime(CLOCK_REALTIME, &tv);
25 | 	StartTime = (double) tv.tv_sec * 1.0E9 + (double) tv.tv_nsec;
26 | #endif
27 | }
28 | 
29 | void HighResTimer::Stop()
30 | {
31 | 	double EndTime = 0;
32 | #ifdef _WIN32
33 | 	__int64 iend;
34 | 	QueryPerformanceCounter((LARGE_INTEGER*) &iend);
35 | 	EndTime = (double) iend;
36 | #else
37 | 	timespec tv;
38 | 	clock_gettime(CLOCK_REALTIME, &tv);
39 | 	EndTime = (double) tv.tv_sec * 1.0E9 + (double) tv.tv_nsec;
40 | #endif
41 | 	ElapsedTime += EndTime - StartTime;
42 | }
43 | 
44 | void HighResTimer::Reset()
45 | {
46 | 	ElapsedTime = 0;
47 | 	StartTime = 0;
48 | }
49 | 
50 | double HighResTimer::GetElapsedTime()
51 | {
52 | 	return ElapsedTime / Frequency;
53 | }
54 | 
55 | double HighResTimer::GetFrequency()
56 | {
57 | #ifdef _WIN32
58 | 	__int64 ifreq;
59 | 	QueryPerformanceFrequency((LARGE_INTEGER*)&ifreq);
60 | 	return((double) ifreq);
61 | #else
62 | 	return(1.0E9);
63 | #endif
64 | }
65 | 
66 | double HighResTimer::Frequency = HighResTimer::GetFrequency();


--------------------------------------------------------------------------------
/memtest/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef TIMER_H
 2 | #define TIMER_H
 3 | 
//Stopwatch style timer: accumulates the time spent between Start() / Stop() pairs
class HighResTimer {

public:
	HighResTimer();
	~HighResTimer();
	//Stores the current timestamp as the beginning of an interval
	void Start();
	//Adds the time since the last Start() to the accumulated time
	void Stop();
	//Clears the accumulated time and the stored start timestamp
	void Reset();
	//Accumulated time divided by Frequency, i.e. in seconds
	double GetElapsedTime();

private:
	//Timestamp units per second, initialized once from GetFrequency()
	static double Frequency;
	static double GetFrequency();

	//Sum of all completed intervals, in raw timestamp units
	double ElapsedTime;
	//Timestamp taken by the last Start(), in raw timestamp units
	double StartTime;
}; 
21 | 
22 | #endif
23 | 


--------------------------------------------------------------------------------