├── Hip ├── .gitignore ├── Kernels │ ├── Pad.py │ ├── Pool.py │ ├── Costs.py │ ├── MatVec.py │ ├── Memory.py │ ├── PRelu.py │ ├── Embedder.py │ ├── RadixSort.py │ ├── Upsample.py │ └── CTC.py ├── Wrappers │ ├── RocBlas.py │ └── MIOpenNorm.py ├── Benchmarks │ └── ConvSpeed.py ├── Source │ └── Build.py ├── GPUArray.py ├── Utils.py └── CheckInstall.py ├── Cuda ├── Source │ ├── .gitignore │ └── TraceMalloc │ │ └── .gitignore ├── .gitignore ├── Wrappers │ └── CuDnnMemory.py └── Benchmarks │ └── ConvSpeed.py ├── Intel ├── Libs │ └── .gitignore ├── ThirdParty │ └── finddnnl.py ├── Benchmarks │ └── ConvSpeed.py └── Wrappers │ └── DNNLBlas.py ├── .gitignore ├── Compiler ├── TestData │ └── .gitignore ├── Codegen │ ├── PyDefines │ │ ├── Generate.py │ │ └── PyDefines.h │ ├── Vector │ │ ├── TVector.h │ │ ├── Generate.py │ │ └── TVector.c │ ├── Map │ │ ├── TMap.h │ │ └── Generate.py │ ├── Malloc │ │ ├── TMalloc.h │ │ ├── TMalloc.c │ │ ├── Generate.py │ │ └── TMallocTest.c │ └── Tree │ │ ├── TTree.h │ │ └── Generate.py └── Compilers │ ├── NVCC.py │ └── GCC.py ├── Converter ├── TestData │ └── .gitignore ├── MXNet │ └── .gitignore ├── OpenVINO │ ├── TestData │ │ └── .gitignore │ ├── .gitignore │ ├── Tests │ │ ├── ResNet50Test.py │ │ ├── Common.py │ │ └── GraphTest.py │ ├── Source │ │ └── Build.py │ └── VINOEngine.py ├── TensorRT │ ├── TestData │ │ └── .gitignore │ ├── .gitignore │ ├── Tests │ │ ├── UNetTest.py │ │ ├── WaveToLetterTest.py │ │ ├── ResNet50Test.py │ │ ├── Common.py │ │ ├── GraphTest.py │ │ └── MnistLenetTest.py │ ├── Source │ │ └── Plugins.cpp │ └── DataCalibrator.py ├── Caffe │ ├── .gitignore │ └── ConvertBlob.py └── Examples │ ├── NiN.py │ ├── VGG.py │ ├── Inception.py │ ├── ResNet.py │ └── Common.py ├── TestData ├── .gitignore ├── test.tar └── test.zip ├── requirements.txt ├── MANIFEST.in ├── Handlers └── __init__.py ├── Containers └── __init__.py ├── Datasets ├── __init__.py ├── DataLoader.py ├── TarLoader.py ├── ZipLoader.py ├── PathLoader.py └── MnistLoader.py ├── Cost ├── __init__.py ├── MSE.py ├── Abs.py ├── SmoothL1.py └── Cost.py ├── Optimizers ├── __init__.py ├── Hooks.py ├── SGD.py ├── AdaGrad.py ├── MomentumSGD.py ├── NesterovSGD.py ├── RMSProp.py ├── AdaDelta.py ├── SMORMS3.py └── Adam.py ├── TestLib ├── NormFilters.py ├── OptimizeNet.py ├── CnnMnistLenet.py ├── GradientCheck.py ├── RnnIMDBTrain.py ├── BiRnnIMDBTrain.py ├── CnnIMDBTrain.py ├── ResumeTrain.py ├── EncoderTrain.py ├── MultiGPUMnist.py ├── MultiGPUCifar10.py └── CnnCifar10Simple.py ├── Models └── Nets │ ├── __init__.py │ └── LeNet.py ├── Transformers ├── Transformer.py ├── Generator.py └── Serial.py ├── Modules ├── Identity.py ├── LRN.py ├── Flatten.py ├── Add.py ├── Penalty.py ├── Pool1D.py ├── Replicate.py ├── Mul.py ├── Pool2D.py ├── CrossMapLRN.py ├── MulAddConst.py ├── Gelu.py ├── Slice.py ├── Glue.py ├── AvgPool2D.py ├── Transpose.py ├── MapLRN.py ├── MaxPool2D.py ├── Pool3D.py ├── SoftMax.py ├── SwapAxes.py ├── AvgPool3D.py ├── AvgPool1D.py ├── MaxPool3D.py └── SpatialTf.py ├── Backend ├── Kernels │ ├── Embedder.py │ ├── Pad.py │ ├── PRelu.py │ ├── Pool.py │ ├── Upsample.py │ ├── MatVec.py │ └── Costs.py └── Memory.py ├── Config.py ├── Variable.py ├── README.md └── CPU ├── Wrappers └── NumpyBlas.py └── Kernels └── Upsample2D.py /Hip/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | -------------------------------------------------------------------------------- /Cuda/Source/.gitignore: 
-------------------------------------------------------------------------------- 1 | *.gen.c 2 | *.gen.h 3 | -------------------------------------------------------------------------------- /Intel/Libs/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | 3 | .idea 4 | .DS_Store 5 | -------------------------------------------------------------------------------- /Compiler/TestData/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /Converter/TestData/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /Cuda/Source/TraceMalloc/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /Converter/MXNet/.gitignore: -------------------------------------------------------------------------------- 1 | *.params 2 | *.json 3 | 4 | *.hdf 5 | -------------------------------------------------------------------------------- /Converter/OpenVINO/TestData/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /Converter/TensorRT/TestData/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /TestData/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | 3 | !test.tar 4 | !test.zip 5 | 6 | !.gitignore 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | h5py 3 | Pillow 4 | graphviz 5 | colorama 6 | pybind11 7 | -------------------------------------------------------------------------------- /TestData/test.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/puzzlelib/PuzzleLib/HEAD/TestData/test.tar -------------------------------------------------------------------------------- /TestData/test.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/puzzlelib/PuzzleLib/HEAD/TestData/test.zip -------------------------------------------------------------------------------- /Cuda/.gitignore: -------------------------------------------------------------------------------- 1 | *.exp 2 | *.lib 3 | *.ilk 4 | 5 | *.idb 6 | *.pdb 7 | 8 | *.obj 9 | *.pyd 10 | 11 | *.o 12 | *.so 13 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | global-exclude .gitignore 2 | 3 | exclude MANIFEST.in 4 | 5 | include *.py 6 | include LICENSE 7 | include requirements.txt 
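The Codegen template headers that appear later in this dump (TVector.h, TMap.h, TMalloc.h, TTree.h) are not valid C on their own: their $T/$K/$V/${NAME} placeholders follow Python string.Template syntax, and the Generate.py scripts next to them presumably substitute concrete names and types to produce the *.gen.h/*.gen.c files that the Cuda/Source/.gitignore above excludes (PyDefines/Generate.py below, for instance, emits PyDefines.gen.h). A minimal, hypothetical sketch of one such instantiation, assuming plain string.Template semantics and made-up names:

from string import Template

# Hypothetical sketch, not a file from this repository: instantiate the
# TVector.h template as a vector of ints named "IntVector".
with open("Compiler/Codegen/Vector/TVector.h") as f:
	template = Template(f.read())

header = template.substitute(NAME="IntVector", T="int", HEADER_PREAMBULE="")

# The .gen.h suffix matches the pattern ignored by Cuda/Source/.gitignore.
with open("IntVector.gen.h", "w") as f:
	f.write(header)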
-------------------------------------------------------------------------------- /Converter/OpenVINO/.gitignore: -------------------------------------------------------------------------------- 1 | *.exp 2 | *.lib 3 | *.ilk 4 | 5 | *.idb 6 | *.pdb 7 | 8 | *.obj 9 | *.pyd 10 | 11 | *.o 12 | *.so 13 | -------------------------------------------------------------------------------- /Converter/Caffe/.gitignore: -------------------------------------------------------------------------------- 1 | *.whl 2 | *.proto 3 | *.exe 4 | 5 | caffe_pb2.py 6 | 7 | *.caffemodel 8 | *.binaryproto 9 | 10 | *.pkl 11 | *.hdf 12 | -------------------------------------------------------------------------------- /Converter/TensorRT/.gitignore: -------------------------------------------------------------------------------- 1 | *.json 2 | 3 | *.exp 4 | *.lib 5 | *.ilk 6 | 7 | *.idb 8 | *.pdb 9 | 10 | *.obj 11 | *.pyd 12 | 13 | *.o 14 | *.so 15 | -------------------------------------------------------------------------------- /Handlers/__init__.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Handlers.Calculator import Calculator 2 | from PuzzleLib.Handlers.Trainer import Trainer 3 | from PuzzleLib.Handlers.Validator import Validator 4 | -------------------------------------------------------------------------------- /Hip/Kernels/Pad.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.Pad import backendTest 2 | 3 | 4 | def unittest(): 5 | from PuzzleLib.Hip import Backend 6 | backendTest(Backend) 7 | 8 | 9 | if __name__ == "__main__": 10 | unittest() 11 | -------------------------------------------------------------------------------- /Hip/Kernels/Pool.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.Pool import backendTest 2 | 3 | 4 | def unittest(): 5 | from PuzzleLib.Hip import Backend 6 | backendTest(Backend) 7 | 8 | 9 | if __name__ == "__main__": 10 | unittest() 11 | -------------------------------------------------------------------------------- /Containers/__init__.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Containers.Container import Container 2 | from PuzzleLib.Containers.Graph import Graph 3 | from PuzzleLib.Containers.Parallel import Parallel 4 | from PuzzleLib.Containers.Sequential import Sequential 5 | -------------------------------------------------------------------------------- /Hip/Kernels/Costs.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.Costs import backendTest 2 | 3 | 4 | def unittest(): 5 | from PuzzleLib.Hip import Backend 6 | backendTest(Backend) 7 | 8 | 9 | if __name__ == "__main__": 10 | unittest() 11 | -------------------------------------------------------------------------------- /Hip/Kernels/MatVec.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.MatVec import backendTest 2 | 3 | 4 | def unittest(): 5 | from PuzzleLib.Hip import Backend 6 | backendTest(Backend) 7 | 8 | 9 | if __name__ == "__main__": 10 | unittest() 11 | -------------------------------------------------------------------------------- /Hip/Kernels/Memory.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.Memory import backendTest 2 | 3 | 4 | def unittest(): 5 | from 
PuzzleLib.Hip import Backend 6 | backendTest(Backend) 7 | 8 | 9 | if __name__ == "__main__": 10 | unittest() 11 | -------------------------------------------------------------------------------- /Hip/Kernels/PRelu.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.PRelu import backendTest 2 | 3 | 4 | def unittest(): 5 | from PuzzleLib.Hip import Backend 6 | backendTest(Backend) 7 | 8 | 9 | if __name__ == "__main__": 10 | unittest() 11 | -------------------------------------------------------------------------------- /Hip/Kernels/Embedder.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.Embedder import backendTest 2 | 3 | 4 | def unittest(): 5 | from PuzzleLib.Hip import Backend 6 | backendTest(Backend) 7 | 8 | 9 | if __name__ == "__main__": 10 | unittest() 11 | -------------------------------------------------------------------------------- /Hip/Kernels/RadixSort.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.RadixSort import backendTest 2 | 3 | 4 | def unittest(): 5 | from PuzzleLib.Hip import Backend 6 | backendTest(Backend) 7 | 8 | 9 | if __name__ == "__main__": 10 | unittest() 11 | -------------------------------------------------------------------------------- /Hip/Kernels/Upsample.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.Upsample import backendTest 2 | 3 | 4 | def unittest(): 5 | from PuzzleLib.Hip import Backend 6 | backendTest(Backend) 7 | 8 | 9 | if __name__ == "__main__": 10 | unittest() 11 | -------------------------------------------------------------------------------- /Hip/Wrappers/RocBlas.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Wrappers.CuBlas import backendTest 2 | 3 | 4 | def unittest(): 5 | from PuzzleLib.Hip import Backend 6 | backendTest(Backend) 7 | 8 | 9 | if __name__ == "__main__": 10 | unittest() 11 | -------------------------------------------------------------------------------- /Compiler/Codegen/PyDefines/Generate.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PuzzleLib.Compiler.Toolchain import copySource 3 | 4 | 5 | def generatePyDefines(path): 6 | dirname = os.path.dirname(__file__) 7 | copySource(os.path.join(dirname, "PyDefines.h"), os.path.join(path, "PyDefines.gen.h")) 8 | -------------------------------------------------------------------------------- /Datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Datasets.Cifar10Loader import Cifar10Loader 2 | from PuzzleLib.Datasets.IMDBLoader import IMDBLoader 3 | from PuzzleLib.Datasets.MnistLoader import MnistLoader 4 | from PuzzleLib.Datasets.PathLoader import PathLoader 5 | from PuzzleLib.Datasets.SmallNorbLoader import SmallNorbLoader 6 | from PuzzleLib.Datasets.TarLoader import TarLoader 7 | from PuzzleLib.Datasets.ZipLoader import ZipLoader 8 | -------------------------------------------------------------------------------- /Datasets/DataLoader.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | class DataLoader: 5 | def __init__(self, datanames=None, cachename=None): 6 | self.cachename = cachename 7 | 8 | if datanames is None: 9 | self.datanames = ["data"] 10 | else: 11 | if 
isinstance(datanames, list) or isinstance(datanames, tuple): 12 | self.datanames = datanames 13 | else: 14 | self.datanames = [datanames] 15 | 16 | 17 | def clear(self): 18 | if os.path.exists(self.cachename): 19 | os.remove(self.cachename) 20 | -------------------------------------------------------------------------------- /Cost/__init__.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cost.Abs import Abs 2 | from PuzzleLib.Cost.BCE import BCE 3 | from PuzzleLib.Cost.CrossEntropy import CrossEntropy 4 | from PuzzleLib.Cost.CTC import CTC 5 | from PuzzleLib.Cost.Hinge import Hinge 6 | from PuzzleLib.Cost.KLDivergence import KLDivergence 7 | from PuzzleLib.Cost.L1Hinge import L1Hinge 8 | from PuzzleLib.Cost.MSE import MSE 9 | from PuzzleLib.Cost.Multi import Multi 10 | from PuzzleLib.Cost.SmoothL1 import SmoothL1 11 | from PuzzleLib.Cost.SVM import SVM 12 | -------------------------------------------------------------------------------- /Optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Optimizers.AdaDelta import AdaDelta 2 | from PuzzleLib.Optimizers.AdaGrad import AdaGrad 3 | from PuzzleLib.Optimizers.Adam import Adam 4 | from PuzzleLib.Optimizers.MomentumSGD import MomentumSGD 5 | from PuzzleLib.Optimizers.NesterovSGD import NesterovSGD 6 | from PuzzleLib.Optimizers.RMSProp import RMSProp 7 | from PuzzleLib.Optimizers.RMSPropGraves import RMSPropGraves 8 | from PuzzleLib.Optimizers.SGD import SGD 9 | from PuzzleLib.Optimizers.SMORMS3 import SMORMS3 10 | -------------------------------------------------------------------------------- /Hip/Benchmarks/ConvSpeed.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Hip.Backend import getBackend 4 | from PuzzleLib.Cuda.Benchmarks.ConvSpeed import timeConv 5 | 6 | 7 | def main(): 8 | datashape = (128, 32, 64, 64) 9 | Wshape = (64, 32, 11, 11) 10 | 11 | stride, pad, dilation, groups = 1, 0, 1, datashape[1] // Wshape[1] 12 | 13 | backend = getBackend(initmode=1) 14 | timeConv(backend, datashape, Wshape, np.float32, stride, pad, dilation, groups) 15 | 16 | 17 | if __name__ == "__main__": 18 | main() 19 | -------------------------------------------------------------------------------- /Optimizers/Hooks.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend.Kernels.ElementWise import weightDecayKer 4 | 5 | 6 | class Hook: 7 | def __call__(self, var, state, stream=None): 8 | raise NotImplementedError() 9 | 10 | 11 | class WeightDecay(Hook): 12 | def __init__(self, rate): 13 | self.rate = rate 14 | 15 | 16 | def __call__(self, var, state, stream=None): 17 | assert var.grad.dtype == np.float32 18 | if var.wc > 0.0: 19 | weightDecayKer(var.grad, var.data, self.rate * var.wc, stream=stream) 20 | -------------------------------------------------------------------------------- /TestLib/NormFilters.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Backend import gpuarray 2 | 3 | from PuzzleLib.Modules import SubtractMean, LCN 4 | from PuzzleLib.Visual import loadImage, showImage 5 | 6 | 7 | def main(): 8 | subtractMean = SubtractMean(size=7) 9 | lcn = LCN(N=7) 10 | 11 | img = gpuarray.to_gpu(loadImage("../TestData/Bench.png")) 12 | 13 | subtractMean(img) 14 | showImage(subtractMean.data.get(), 
"../TestData/ResultSubtractNorm.png") 15 | 16 | lcn(img) 17 | showImage(lcn.data.get(), "../TestData/ResultLCN.png") 18 | 19 | 20 | if __name__ == "__main__": 21 | main() 22 | -------------------------------------------------------------------------------- /Models/Nets/__init__.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Models.Nets.Inception import loadInceptionBN, loadInceptionV3 2 | from PuzzleLib.Models.Nets.LeNet import loadLeNet 3 | from PuzzleLib.Models.Nets.MiniYolo import loadMiniYolo 4 | from PuzzleLib.Models.Nets.NiN import loadNiNImageNet 5 | from PuzzleLib.Models.Nets.OpenPoseCOCO import loadCOCO 6 | from PuzzleLib.Models.Nets.OpenPoseMPI import loadMPI 7 | from PuzzleLib.Models.Nets.ResNet import loadResNet 8 | from PuzzleLib.Models.Nets.UNet import loadUNet 9 | from PuzzleLib.Models.Nets.VGG import loadVGG 10 | from PuzzleLib.Models.Nets.WaveToLetter import loadW2L 11 | -------------------------------------------------------------------------------- /Transformers/Transformer.py: -------------------------------------------------------------------------------- 1 | class Transformer: 2 | def __call__(self, batch, threadidx): 3 | return batch 4 | 5 | 6 | def unittest(): 7 | from PuzzleLib.Transformers.Merger import Merger 8 | from PuzzleLib.Datasets.ZipLoader import ZipLoader 9 | 10 | zipfile = ZipLoader() 11 | data1 = zipfile.load("../TestData/test.zip") 12 | data2 = zipfile.load("../TestData/test.zip") 13 | 14 | with Merger([data1, data2]) as merger: 15 | merger.addTransformer(Transformer()) 16 | 17 | for _ in range(10): 18 | merger.prepareData(chunksize=4, permutate=False) 19 | merger.getData() 20 | 21 | 22 | if __name__ == "__main__": 23 | unittest() 24 | -------------------------------------------------------------------------------- /Cuda/Wrappers/CuDnnMemory.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.Memory import transposeTest, moveAxisTest, swapAxesTest, depthConcatTest 2 | 3 | 4 | def unittest(): 5 | from PuzzleLib.Cuda import Backend 6 | backendTest(Backend) 7 | 8 | 9 | def backendTest(Backend): 10 | for deviceIdx in range(Backend.getDeviceCount()): 11 | bnd = Backend.getBackend(deviceIdx, initmode=1) 12 | 13 | for dtype, _ in bnd.dtypesSupported(): 14 | transposeTest(bnd, bnd.dnn, dtype) 15 | moveAxisTest(bnd, bnd.dnn, dtype) 16 | swapAxesTest(bnd, bnd.dnn, dtype) 17 | depthConcatTest(bnd, bnd.dnn, dtype) 18 | 19 | 20 | if __name__ == "__main__": 21 | unittest() 22 | -------------------------------------------------------------------------------- /Compiler/Codegen/Vector/TVector.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | $HEADER_PREAMBULE 7 | typedef struct $NAME 8 | { 9 | $T *ptr; 10 | size_t size, capacity; 11 | } 12 | $NAME; 13 | 14 | 15 | void ${NAME}_init($NAME *self); 16 | void ${NAME}_dealloc($NAME *self); 17 | 18 | void ${NAME}_reserve($NAME *self, size_t capacity); 19 | void ${NAME}_append($NAME *self, $T elem); 20 | void ${NAME}_appendEmpty($NAME *self); 21 | bool ${NAME}_pop($NAME *self, $T *elem); 22 | void ${NAME}_clear($NAME *self); 23 | bool ${NAME}_get($NAME *self, size_t index, $T *elem); 24 | bool ${NAME}_set($NAME *self, size_t index, $T elem); 25 | -------------------------------------------------------------------------------- /Hip/Kernels/CTC.py: 
-------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.CTC import CTCModule, backendTest 2 | 3 | 4 | class HipCTCModule(CTCModule): 5 | @staticmethod 6 | def generateConfig(backend): 7 | return [ 8 | (backend.warpSize, 1), 9 | (backend.warpSize * 2, 1), 10 | (backend.warpSize, 3), 11 | (backend.warpSize * 2, 2), 12 | (backend.warpSize, 6), 13 | (backend.warpSize * 2, 4), 14 | (backend.warpSize, 9), 15 | (backend.warpSize * 2, 6), 16 | (backend.warpSize * 2, 9), 17 | (backend.warpSize * 2, 10) 18 | ] 19 | 20 | 21 | def unittest(): 22 | from PuzzleLib.Hip import Backend 23 | backendTest(Backend, HipCTCModule) 24 | 25 | 26 | if __name__ == "__main__": 27 | unittest() 28 | -------------------------------------------------------------------------------- /Compiler/Codegen/Map/TMap.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <stddef.h> 4 | #include <stdbool.h> 5 | 6 | $HEADER_PREAMBULE 7 | struct ${NAME}_Bucket; 8 | 9 | 10 | typedef struct ${NAME}_Bucket 11 | { 12 | $K key; 13 | $V value; 14 | 15 | struct ${NAME}_Bucket *next; 16 | } 17 | ${NAME}_Bucket; 18 | 19 | 20 | typedef struct $NAME 21 | { 22 | ${NAME}_Bucket **ptr; 23 | size_t size, log2capacity; 24 | } 25 | $NAME; 26 | 27 | 28 | void ${NAME}_init($NAME *self); 29 | void ${NAME}_dealloc($NAME *self); 30 | 31 | bool ${NAME}_insert($NAME *self, $K key, $V value); 32 | bool ${NAME}_delete($NAME *self, $K key); 33 | bool ${NAME}_get($NAME *self, $K key, $V *value); 34 | void ${NAME}_clear($NAME *self); 35 | -------------------------------------------------------------------------------- /Converter/Examples/NiN.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | Config.globalEvalMode = True 3 | 4 | from PuzzleLib.Backend import gpuarray 5 | from PuzzleLib.Models.Nets import loadNiNImageNet 6 | 7 | from PuzzleLib.Converter.Examples.Common import loadSample, loadLabels, showLabelResults 8 | 9 | 10 | def main(): 11 | net = loadNiNImageNet(modelpath="../TestData/nin_imagenet.hdf") 12 | 13 | sample = loadSample("../TestData/barometer.jpg") 14 | labels = loadLabels(synpath="../TestData/synsets.txt", wordpath="../TestData/synset_words.txt") 15 | 16 | res = net(gpuarray.to_gpu(sample)).get().reshape(-1) 17 | showLabelResults(res, labels, header="NiN") 18 | 19 | 20 | if __name__ == "__main__": 21 | main() 22 | -------------------------------------------------------------------------------- /Transformers/Generator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Transformers.Provider import Provider 4 | from PuzzleLib.Transformers.Transformer import Transformer 5 | 6 | 7 | class Generator(Provider): 8 | def getNextChunk(self, chunksize, **kwargs): 9 | return None 10 | 11 | 12 | class TestGenTransformer(Transformer): 13 | def __call__(self, batch, threadidx): 14 | return np.random.randn(10, 3, 4, 4).astype(np.float32) 15 | 16 | 17 | def unittest(): 18 | with Generator(numofthreads=4) as generator: 19 | generator.addTransformer(TestGenTransformer()) 20 | 21 | generator.prepareData() 22 | assert generator.getData().shape == (40, 3, 4, 4) 23 | 24 | 25 | if __name__ == "__main__": 26 | unittest() 27 | -------------------------------------------------------------------------------- /Compiler/Codegen/Malloc/TMalloc.h: -------------------------------------------------------------------------------- 1 |
#pragma once 2 | 3 | #include <stdlib.h> 4 | #include <stdbool.h> 5 | 6 | 7 | #if defined(ENABLE_TRACE_MALLOC) 8 | #define TRACE_MALLOC(size) ${NAME}_malloc(size, __FILE__, __LINE__) 9 | #define TRACE_FREE(ptr) ${NAME}_free(ptr) 10 | 11 | #else 12 | #define TRACE_MALLOC(size) malloc(size) 13 | #define TRACE_FREE(ptr) free(ptr) 14 | 15 | #endif 16 | 17 | 18 | void *${NAME}_malloc(size_t size, const char *file, int line); 19 | void ${NAME}_free(void *ptr); 20 | 21 | size_t ${NAME}_traceLeaks(void); 22 | 23 | bool ${NAME}_Iterator_init(void); 24 | void ${NAME}_Iterator_dealloc(void); 25 | 26 | bool ${NAME}_Iterator_move(void); 27 | void ${NAME}_Iterator_item(size_t *size, const char **file, int *line); 28 | -------------------------------------------------------------------------------- /Converter/TensorRT/Tests/UNetTest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Models.Nets.UNet import loadUNet 5 | 6 | from PuzzleLib.Converter.TensorRT.Tests.Common import benchModels 7 | from PuzzleLib.Converter.TensorRT.BuildRTEngine import buildRTEngine 8 | 9 | 10 | def main(): 11 | net = loadUNet(None) 12 | data = gpuarray.to_gpu(np.random.randn(1, 1, 256, 256).astype(np.float32)) 13 | 14 | engine = buildRTEngine(net, inshape=data.shape, savepath="../TestData") 15 | 16 | net.evalMode() 17 | outdata = net(data) 18 | 19 | enginedata = engine(data) 20 | 21 | assert np.allclose(outdata.get(), enginedata.get()) 22 | benchModels(net, engine, data) 23 | 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /Datasets/TarLoader.py: -------------------------------------------------------------------------------- 1 | import tarfile 2 | 3 | from PuzzleLib.Datasets.InputLoader import InputLoader 4 | 5 | 6 | class TarLoader(InputLoader): 7 | def checkInput(self, archivename): 8 | if not tarfile.is_tarfile(archivename): 9 | raise RuntimeError("'%s' is not tar file" % archivename) 10 | 11 | 12 | def openInput(self, archivename): 13 | return tarfile.open(archivename) 14 | 15 | 16 | def loadFilelist(self, archive): 17 | return [file for file in archive.getnames() if any([file.lower().endswith(ext) for ext in self.exts])] 18 | 19 | 20 | def openFile(self, archive, file): 21 | return archive.extractfile(file) 22 | 23 | 24 | def unittest(): 25 | loader = TarLoader() 26 | loader.load("../TestData/test.tar", maxsamples=5, filepacksize=3) 27 | loader.clear() 28 | 29 | 30 | if __name__ == "__main__": 31 | unittest() 32 | -------------------------------------------------------------------------------- /Datasets/ZipLoader.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | 3 | from PuzzleLib.Datasets.InputLoader import InputLoader 4 | 5 | 6 | class ZipLoader(InputLoader): 7 | def checkInput(self, archivename): 8 | if not zipfile.is_zipfile(archivename): 9 | raise RuntimeError("'%s' is not zip file" % archivename) 10 | 11 | 12 | def openInput(self, archivename): 13 | return zipfile.ZipFile(archivename) 14 | 15 | 16 | def loadFilelist(self, archive): 17 | return [file for file in archive.namelist() if any([file.lower().endswith(ext) for ext in self.exts])] 18 | 19 | 20 | def openFile(self, archive, file): 21 | return archive.open(file) 22 | 23 | 24 | def unittest(): 25 | loader = ZipLoader() 26 | loader.load("../TestData/test.zip", maxsamples=5, filepacksize=3) 27 | loader.clear() 28 | 29 | 30
| if __name__ == "__main__": 31 | unittest() 32 | -------------------------------------------------------------------------------- /Converter/TensorRT/Tests/WaveToLetterTest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | Config.globalEvalMode = True 5 | 6 | from PuzzleLib.Backend import gpuarray 7 | from PuzzleLib.Models.Nets.WaveToLetter import loadW2L 8 | 9 | from PuzzleLib.Converter.TensorRT.Tests.Common import benchModels 10 | from PuzzleLib.Converter.TensorRT.BuildRTEngine import buildRTEngine 11 | 12 | 13 | def main(): 14 | inmaps = 161 15 | net = loadW2L(None, inmaps, nlabels=29) 16 | 17 | data = gpuarray.to_gpu(np.random.randn(1, inmaps, 200).astype(np.float32)) 18 | engine = buildRTEngine(net, inshape=data.shape, savepath="../TestData") 19 | 20 | net.evalMode() 21 | outdata = net(data) 22 | 23 | enginedata = engine(data) 24 | 25 | assert np.allclose(outdata.get(), enginedata.get(), atol=1e-7) 26 | benchModels(net, engine, data) 27 | 28 | 29 | if __name__ == "__main__": 30 | main() 31 | -------------------------------------------------------------------------------- /Cuda/Benchmarks/ConvSpeed.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PuzzleLib.Cuda.Backend import getBackend 3 | 4 | 5 | def main(): 6 | datashape = (128, 32, 64, 64) 7 | Wshape = (64, 32, 11, 11) 8 | 9 | stride, pad, dilation, groups = 1, 0, 1, datashape[1] // Wshape[1] 10 | timeConv(getBackend(initmode=1), datashape, Wshape, np.float32, stride, pad, dilation, groups) 11 | 12 | 13 | def timeConv(backend, datashape, Wshape, dtype, stride, pad, dilation, groups): 14 | fwdResults, bwdDataResults, bwdFilterResults = backend.convNdbenchmark( 15 | datashape, Wshape, dtype, stride, pad, dilation, groups 16 | ) 17 | 18 | print("Forward results:") 19 | for res in fwdResults: 20 | print(res) 21 | 22 | print("\nBackward filter results:") 23 | for res in bwdFilterResults: 24 | print(res) 25 | 26 | print("\nBackward data results:") 27 | for res in bwdDataResults: 28 | print(res) 29 | 30 | 31 | if __name__ == "__main__": 32 | main() 33 | -------------------------------------------------------------------------------- /Modules/Identity.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Modules.Module import Module 5 | 6 | 7 | class Identity(Module): 8 | def __init__(self, name=None): 9 | super().__init__(name) 10 | 11 | self.movesData = True 12 | self.movesGrad = True 13 | 14 | 15 | def updateData(self, data): 16 | self.data = data 17 | 18 | 19 | def updateGrad(self, grad): 20 | self.grad = grad 21 | 22 | 23 | def dataShapeFrom(self, shape): 24 | return shape 25 | 26 | 27 | def gradShapeFrom(self, shape): 28 | return shape 29 | 30 | 31 | def calcMode(self, T): 32 | self.calctype = T 33 | 34 | 35 | def unittest(): 36 | data = gpuarray.to_gpu(np.random.normal(0.0, 0.01, (10, 3, 40, 40)).astype(np.float32)) 37 | 38 | identity = Identity() 39 | identity(data) 40 | 41 | assert np.allclose(data.get(), identity.data.get()) 42 | 43 | 44 | if __name__ == "__main__": 45 | unittest() 46 | -------------------------------------------------------------------------------- /Converter/TensorRT/Tests/ResNet50Test.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | Config.globalEvalMode = 
True 3 | 4 | from PuzzleLib.Backend import gpuarray 5 | from PuzzleLib.Models.Nets.ResNet import loadResNet 6 | 7 | from PuzzleLib.Converter.Examples.Common import loadResNetSample, loadLabels 8 | 9 | from PuzzleLib.Converter.TensorRT.Tests.Common import scoreModels, benchModels 10 | from PuzzleLib.Converter.TensorRT.BuildRTEngine import buildRTEngine 11 | 12 | 13 | def main(): 14 | net = loadResNet(modelpath="../../TestData/ResNet-50-model.hdf", layers="50") 15 | 16 | data = gpuarray.to_gpu(loadResNetSample(net, "../../TestData/tarantula.jpg")) 17 | labels = loadLabels(synpath="../../TestData/synsets.txt", wordpath="../../TestData/synset_words.txt") 18 | 19 | engine = buildRTEngine(net, inshape=data.shape, savepath="../TestData") 20 | 21 | scoreModels(net, engine, data, labels) 22 | benchModels(net, engine, data) 23 | 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /Modules/LRN.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Modules.Module import ModuleError, Module 2 | 3 | 4 | class LRN(Module): 5 | def __init__(self, N=5, alpha=1e-4, beta=0.75, K=2.0, name=None): 6 | super().__init__(name) 7 | self.registerBlueprint(locals()) 8 | 9 | self.N = N 10 | self.alpha = alpha 11 | self.beta = beta 12 | self.K = K 13 | 14 | self.workspace = None 15 | 16 | 17 | def dataShapeFrom(self, shape): 18 | return shape 19 | 20 | 21 | def checkDataShape(self, shape): 22 | if len(shape) != 4: 23 | raise ModuleError("Data must be 4d tensor") 24 | 25 | 26 | def gradShapeFrom(self, shape): 27 | return shape 28 | 29 | 30 | def checkGradShape(self, shape): 31 | if len(shape) != 4: 32 | raise ModuleError("Grad must be 4d tensor") 33 | 34 | 35 | def updateData(self, data): 36 | raise NotImplementedError() 37 | 38 | 39 | def updateGrad(self, grad): 40 | raise NotImplementedError() 41 | 42 | 43 | def reset(self): 44 | super().reset() 45 | self.workspace = None 46 | -------------------------------------------------------------------------------- /Converter/OpenVINO/Tests/ResNet50Test.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | 3 | Config.backend = Config.Backend.intel 4 | Config.globalEvalMode = True 5 | 6 | from PuzzleLib.Backend import gpuarray 7 | from PuzzleLib.Models.Nets.ResNet import loadResNet 8 | 9 | from PuzzleLib.Converter.Examples.Common import loadResNetSample, loadLabels 10 | 11 | from PuzzleLib.Converter.OpenVINO.Tests.Common import scoreModels, benchModels 12 | from PuzzleLib.Converter.OpenVINO.BuildVINOEngine import buildVINOEngine 13 | 14 | 15 | def main(): 16 | net = loadResNet(modelpath="../../TestData/ResNet-50-model.hdf", layers="50") 17 | 18 | data = gpuarray.to_gpu(loadResNetSample(net, "../../TestData/tarantula.jpg")) 19 | labels = loadLabels(synpath="../../TestData/synsets.txt", wordpath="../../TestData/synset_words.txt") 20 | 21 | engine = buildVINOEngine(net, inshape=data.shape, savepath="../TestData") 22 | 23 | scoreModels(net, engine, data, labels) 24 | benchModels(net, engine, data) 25 | 26 | 27 | if __name__ == "__main__": 28 | main() 29 | -------------------------------------------------------------------------------- /Converter/OpenVINO/Tests/Common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend.gpuarray import timeKernel 4 | 5 | 6 | def scoreModels(net, engine, data, labels): 
7 | hostNetData = net(data).get() 8 | hostEngineData = engine(data).get() 9 | 10 | assert np.allclose(hostNetData, hostEngineData, atol=1e-6) 11 | 12 | printResults(hostNetData, labels, "Net") 13 | printResults(hostEngineData, labels, "Engine") 14 | 15 | 16 | def printResults(probs, labels, name): 17 | probs = probs.flatten() 18 | 19 | idx = (-probs).argsort()[:5] 20 | print("%s top-5 predictions: " % name) 21 | 22 | for i in range(5): 23 | print("#%s %s (prob=%s)" % (i, labels[idx[i]], probs[idx[i]])) 24 | 25 | 26 | def benchModels(net, engine, data): 27 | net.optimizeForShape(data.shape) 28 | 29 | nettime = timeKernel(net, args=(data, ), looplength=100, log=False, normalize=True) 30 | enginetime = timeKernel(engine, args=(data, ), looplength=100, log=False, normalize=True) 31 | 32 | print("Net time: host=%.10f" % nettime) 33 | print("Engine time: host=%.10f" % enginetime) 34 | -------------------------------------------------------------------------------- /Backend/Kernels/Embedder.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | 3 | 4 | embed = None 5 | embedBackwardParams = None 6 | 7 | 8 | def autoinit(): 9 | if not Config.shouldInit(): 10 | return 11 | 12 | if Config.backend == Config.Backend.cuda: 13 | initCuda() 14 | elif Config.backend == Config.Backend.hip: 15 | initHip() 16 | elif Config.isCPUBased(Config.backend): 17 | initCPU() 18 | else: 19 | raise Config.ConfigError(Config.backend) 20 | 21 | 22 | def initCuda(): 23 | from PuzzleLib.Cuda import Backend 24 | initGPU(Backend) 25 | 26 | 27 | def initHip(): 28 | from PuzzleLib.Hip import Backend 29 | initGPU(Backend) 30 | 31 | 32 | def initGPU(Backend): 33 | backend = Backend.getBackend(Config.deviceIdx, initmode=2, logger=Config.getLogger()) 34 | memoryPool, embedmod = backend.memoryPool, backend.embedmod 35 | 36 | def wrapEmbed(data, W): 37 | return embedmod.embed(data, W, memoryPool) 38 | 39 | global embed, embedBackwardParams 40 | embed = wrapEmbed 41 | embedBackwardParams = embedmod.embedBackwardParams 42 | 43 | 44 | def initCPU(): 45 | pass 46 | 47 | 48 | autoinit() 49 | -------------------------------------------------------------------------------- /Intel/ThirdParty/finddnnl.py: -------------------------------------------------------------------------------- 1 | import sys, os, ctypes 2 | 3 | 4 | def findDNNL(): 5 | versions = ["1.91", "1.2", "1.1"] 6 | 7 | if sys.platform == "linux": 8 | libnames = ["libdnnl.so.%s" % v for v in versions] 9 | libnames += ["/usr/local/lib/%s" % libname for libname in libnames] 10 | 11 | elif sys.platform == "darwin": 12 | libnames = ["/usr/local/lib/libdnnl.%s.dylib" % v for v in versions] 13 | 14 | elif sys.platform == "win32": 15 | libpaths = [ 16 | os.environ.get("DNNL_PATH", ""), 17 | os.path.normpath(os.path.join(os.path.dirname(__file__), "../Libs/")) 18 | ] 19 | 20 | libnames = [os.path.join(libpath, "dnnl.dll") for libpath in libpaths] 21 | 22 | else: 23 | raise RuntimeError("Unsupported platform for dnnl") 24 | 25 | cloader = ctypes.windll if sys.platform == "win32" else ctypes.cdll 26 | 27 | for libname in libnames: 28 | try: 29 | clib = cloader.LoadLibrary(libname) 30 | 31 | except OSError: 32 | pass 33 | 34 | else: 35 | return libname, clib 36 | 37 | raise OSError("dnnl library not found (searched for following version(s): %s)" % versions) 38 | -------------------------------------------------------------------------------- /Converter/TensorRT/Tests/Common.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend.gpuarray import timeKernel 4 | 5 | 6 | def scoreModels(net, engine, data, labels): 7 | hostNetData = net(data).get() 8 | hostEngineData = engine(data).get() 9 | 10 | assert np.allclose(hostNetData, hostEngineData) 11 | 12 | printResults(hostNetData, labels, "Net") 13 | printResults(hostEngineData, labels, "Engine") 14 | 15 | 16 | def printResults(probs, labels, name): 17 | probs = probs.flatten() 18 | 19 | idx = (-probs).argsort()[:5] 20 | print("%s top-5 predictions: " % name) 21 | 22 | for i in range(5): 23 | print("#%s %s (prob=%s)" % (i, labels[idx[i]], probs[idx[i]])) 24 | 25 | 26 | def benchModels(net, engine, data): 27 | net.optimizeForShape(data.shape) 28 | 29 | nettime = timeKernel(net, args=(data, ), looplength=100, log=False, normalize=True) 30 | enginetime = timeKernel(engine, args=(data, ), looplength=100, log=False, normalize=True) 31 | 32 | print("Net time: device=%.10f host=%.10f" % (nettime[0], nettime[1])) 33 | print("Engine time: device=%.10f host=%.10f" % (enginetime[0], enginetime[1])) 34 | -------------------------------------------------------------------------------- /Converter/Caffe/ConvertBlob.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | import numpy as np 4 | import h5py 5 | 6 | 7 | def saveAttr(data, name, filename): 8 | hdf = h5py.File(filename, mode="a") 9 | 10 | modelname = next(iter(hdf["links"].keys())).split(sep=".")[0] 11 | 12 | attrGrpName = "attrs.%s" % modelname 13 | 14 | attrGrp = hdf.require_group(attrGrpName) 15 | attrGrp.create_dataset("%s.%s" % (modelname, name), data=data) 16 | 17 | 18 | def main(): 19 | binaryname = "ResNet_mean.binaryproto" 20 | modelname = "ResNet-50-model.hdf" 21 | attrName = "mean" 22 | 23 | subprocess.check_call(["protoc", "--proto_path", ".", "--python_out", ".", "caffe.proto"]) 24 | print("Compiled caffe.proto") 25 | 26 | from PuzzleLib.Converter.Caffe import caffe_pb2 27 | blob = caffe_pb2.BlobProto() 28 | 29 | msg = open(binaryname, "rb").read() 30 | 31 | print("Started parsing binaryproto %s ..." 
% binaryname) 32 | blob.ParseFromString(msg) 33 | 34 | data = np.array(blob.data, dtype=np.float32).reshape((1, blob.channels, blob.height, blob.width)) 35 | saveAttr(data, attrName, modelname) 36 | 37 | 38 | if __name__ == "__main__": 39 | main() 40 | -------------------------------------------------------------------------------- /Intel/Benchmarks/ConvSpeed.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Intel.Wrappers import DNNL 2 | 3 | 4 | def main(): 5 | datashape = (16, 32, 64, 64) 6 | Wshape = (64, 32, 3, 3) 7 | 8 | stride, pad = 1, 0 9 | timeConv(datashape, Wshape, stride, pad) 10 | 11 | 12 | def timeConv(datashape, Wshape, stride, pad): 13 | fwdResults, bwdFilterResults, bwdDataResults = DNNL.convNdbenchmark(datashape, Wshape, stride, pad) 14 | 15 | formatstr = "%-40s %-25s %-28s" 16 | 17 | print("Forward results:") 18 | for res in fwdResults: 19 | print(formatstr % ( 20 | "Algo %s" % res.algo, "time %.6f secs" % res.time, "memory %.6f mbytes" % (res.memory / 1024**2) 21 | )) 22 | 23 | print("\nBackward filter results:") 24 | for res in bwdFilterResults: 25 | print(formatstr % ( 26 | "Algo %s" % res.algo, "time %.6f secs" % res.time, "memory %.6f mbytes" % (res.memory / 1024**2) 27 | )) 28 | 29 | print("\nBackward data results:") 30 | for res in bwdDataResults: 31 | print(formatstr % ( 32 | "Algo %s" % DNNL.ConvAlgo(res.algo), "time %.6f secs" % res.time, 33 | "memory %.6f mbytes" % (res.memory / 1024**2) 34 | )) 35 | 36 | 37 | if __name__ == "__main__": 38 | main() 39 | -------------------------------------------------------------------------------- /Config.py: -------------------------------------------------------------------------------- 1 | import sys, multiprocessing, logging 2 | from enum import Enum 3 | 4 | 5 | class ConfigError(Exception): 6 | pass 7 | 8 | 9 | class Backend(Enum): 10 | cuda = 0 11 | hip = 1 12 | cpu = 2 13 | intel = 3 14 | 15 | 16 | backend = Backend.cuda 17 | deviceIdx = 0 18 | 19 | 20 | allowMultiContext = False 21 | systemLog = False 22 | logger = None 23 | 24 | 25 | libname = "PuzzleLib" 26 | 27 | 28 | globalEvalMode = False 29 | disableDtypeShapeChecks = False 30 | disableModuleCompatChecks = False 31 | verifyData = False 32 | showWarnings = True 33 | 34 | 35 | def isCPUBased(bnd): 36 | return bnd in {Backend.cpu, Backend.intel} 37 | 38 | 39 | def shouldInit(): 40 | return multiprocessing.current_process().name == "MainProcess" or allowMultiContext 41 | 42 | 43 | def getLogger(): 44 | global logger 45 | 46 | if logger is not None: 47 | return logger 48 | 49 | logger = logging.getLogger(libname) 50 | logger.setLevel(logging.DEBUG if systemLog else logging.INFO) 51 | 52 | handler = logging.StreamHandler(stream=sys.stdout) 53 | handler.setFormatter(logging.Formatter("[%(name)s] %(message)s")) 54 | 55 | logger.addHandler(handler) 56 | return logger 57 | -------------------------------------------------------------------------------- /Converter/Examples/VGG.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | Config.globalEvalMode = True 3 | 4 | from PuzzleLib.Backend import gpuarray 5 | from PuzzleLib.Models.Nets.VGG import loadVGG 6 | 7 | from PuzzleLib.Converter.Examples.Common import loadVGGSample, loadLabels, showLabelResults 8 | 9 | 10 | def main(): 11 | vgg16Test() 12 | vgg19Test() 13 | 14 | 15 | def vgg16Test(): 16 | net = loadVGG(modelpath="../TestData/VGG_ILSVRC_16_layers.hdf", layers="16") 17 | 18 | sample = 
loadVGGSample("../TestData/tarantula.jpg") 19 | labels = loadLabels(synpath="../TestData/synsets.txt", wordpath="../TestData/synset_words.txt") 20 | 21 | res = net(gpuarray.to_gpu(sample)).get().reshape(-1) 22 | showLabelResults(res, labels, header=net.name) 23 | 24 | 25 | def vgg19Test(): 26 | net = loadVGG(modelpath="../TestData/VGG_ILSVRC_19_layers.hdf", layers="19") 27 | 28 | sample = loadVGGSample("../TestData/tarantula.jpg") 29 | labels = loadLabels(synpath="../TestData/synsets.txt", wordpath="../TestData/synset_words.txt") 30 | 31 | res = net(gpuarray.to_gpu(sample)).get().reshape(-1) 32 | showLabelResults(res, labels, header=net.name) 33 | 34 | 35 | if __name__ == "__main__": 36 | main() 37 | -------------------------------------------------------------------------------- /Hip/Source/Build.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Source.Build import prepareCompiler, generateTemplates, collectCoreSources, collectLibSources 2 | 3 | 4 | def buildDriver(debugmode, verbose): 5 | cc = prepareCompiler(debugmode, verbose) 6 | prepareHip(cc) 7 | 8 | generateTemplates(path="../../Cuda/Source") 9 | 10 | driver = "../Driver" + cc.pydext 11 | cc.build(driver, collectSources(path="../../Cuda/Source")).clearPath("..") 12 | 13 | return driver 14 | 15 | 16 | def prepareHip(cc): 17 | cc.cppMode(True).addDefine("__HIP_PLATFORM_HCC__") 18 | cc.cflags.extend(["-x", "c++"]) 19 | 20 | cc.addLibrary( 21 | "hip", 22 | [ 23 | ".", "/opt/rocm/hsa/include", "/opt/rocm/hip/include", 24 | "/opt/rocm/hiprand/include", "/opt/rocm/rocrand/include", 25 | "/opt/rocm/rocblas/include", "/opt/rocm/miopen/include" 26 | ], 27 | ["/opt/rocm/hip/lib", "/opt/rocm/hiprand/lib", "/opt/rocm/rocblas/lib", "/opt/rocm/miopen/lib"], 28 | ["hip_hcc", "hiprtc", "hiprand", "rocblas"] 29 | ) 30 | 31 | 32 | def collectSources(path): 33 | return collectCoreSources(path) + collectLibSources(path) 34 | 35 | 36 | def main(): 37 | return buildDriver(debugmode=0, verbose=2) 38 | 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /TestLib/OptimizeNet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Models.Nets.VGG import loadVGG 5 | 6 | from PuzzleLib.Optimizers import SGD 7 | from PuzzleLib.Cost import CrossEntropy 8 | from PuzzleLib.Handlers import Trainer 9 | 10 | 11 | def main(): 12 | net = loadVGG(None, "16") 13 | 14 | batchsize = 16 15 | size = (batchsize, 3, 224, 224) 16 | 17 | batch = np.random.normal(size=size).astype(dtype=np.float32) 18 | batch = gpuarray.to_gpu(batch) 19 | 20 | labels = np.random.randint(low=0, high=1000, size=(batchsize, ), dtype=np.int32) 21 | labels = gpuarray.to_gpu(labels) 22 | 23 | optimizer = SGD() 24 | optimizer.setupOn(net) 25 | 26 | cost = CrossEntropy(maxlabels=1000) 27 | trainer = Trainer(net, cost, optimizer) 28 | 29 | print("Started benchmarking %s ..." 
% net.name) 30 | gpuarray.timeKernel( 31 | trainer.train, args=(batch, labels), looplength=100, logname="Before optimizing %s" % net.name, normalize=True 32 | ) 33 | 34 | net.optimizeForShape(size) 35 | gpuarray.timeKernel( 36 | trainer.train, args=(batch, labels), looplength=100, logname="After optimizing %s" % net.name, normalize=True 37 | ) 38 | 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /Compiler/Codegen/Tree/TTree.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <stddef.h> 4 | #include <stdbool.h> 5 | 6 | $HEADER_PREAMBULE 7 | struct ${NAME}_Node; 8 | 9 | 10 | typedef struct ${NAME}_Node 11 | { 12 | bool red; 13 | struct ${NAME}_Node *links[2]; 14 | 15 | $K key; 16 | $V value; 17 | } 18 | ${NAME}_Node; 19 | 20 | 21 | typedef struct $NAME 22 | { 23 | ${NAME}_Node *root; 24 | size_t size; 25 | } 26 | $NAME; 27 | 28 | 29 | void ${NAME}_init($NAME *self); 30 | void ${NAME}_dealloc($NAME *self); 31 | bool ${NAME}_validate($NAME *self); 32 | 33 | bool ${NAME}_insert($NAME *self, $K key, $V value); 34 | bool ${NAME}_delete($NAME *self, $K key); 35 | bool ${NAME}_get($NAME *self, $K key, $V *value); 36 | void ${NAME}_clear($NAME *self); 37 | 38 | 39 | typedef struct ${NAME}_Iterator 40 | { 41 | $NAME *map; 42 | ${NAME}_Node *node; 43 | 44 | ${NAME}_Node *path[16 * sizeof(size_t)]; 45 | size_t top; 46 | } 47 | ${NAME}_Iterator; 48 | 49 | 50 | bool ${NAME}_Iterator_init(${NAME}_Iterator *self, $NAME *map, bool atLeft); 51 | void ${NAME}_Iterator_dealloc(${NAME}_Iterator *self); 52 | 53 | bool ${NAME}_Iterator_move(${NAME}_Iterator *self, bool toRight); 54 | void ${NAME}_Iterator_item(${NAME}_Iterator *self, $K *key, $V *value); 55 | -------------------------------------------------------------------------------- /Converter/Examples/Inception.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | Config.globalEvalMode = True 3 | 4 | from PuzzleLib.Backend import gpuarray 5 | from PuzzleLib.Models.Nets.Inception import loadInceptionBN, loadInceptionV3 6 | 7 | from PuzzleLib.Converter.Examples.Common import loadVGGSample, loadLabels, loadV3Labels, showLabelResults 8 | 9 | 10 | def main(): 11 | inceptionBNTest() 12 | inceptionV3Test() 13 | 14 | 15 | def inceptionBNTest(): 16 | net = loadInceptionBN(modelpath="../TestData/Inception-BN-0126.hdf") 17 | 18 | sample = loadVGGSample("../TestData/tarantula.jpg") 19 | labels = loadLabels(synpath="../TestData/synsets.txt", wordpath="../TestData/synset_words.txt") 20 | 21 | res = net(gpuarray.to_gpu(sample)).get().reshape(-1) 22 | showLabelResults(res, labels, header=net.name) 23 | 24 | 25 | def inceptionV3Test(): 26 | net = loadInceptionV3(modelpath="../TestData/Inception-7-0001.hdf") 27 | 28 | sample = loadVGGSample("../TestData/tarantula.jpg", shape=(299, 299), normalize=True) 29 | labels = loadV3Labels(filename="../TestData/synset_inception_v3.txt") 30 | 31 | res = net(gpuarray.to_gpu(sample)).get().reshape(-1) 32 | showLabelResults(res, labels, header=net.name) 33 | 34 | 35 | if __name__ == "__main__": 36 | main() 37 | -------------------------------------------------------------------------------- /Models/Nets/LeNet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Containers.Sequential import Sequential 5 | 6 | from
PuzzleLib.Modules.Conv2D import Conv2D 7 | from PuzzleLib.Modules.MaxPool2D import MaxPool2D 8 | from PuzzleLib.Modules.Activation import Activation, relu 9 | from PuzzleLib.Modules.Flatten import Flatten 10 | from PuzzleLib.Modules.Linear import Linear 11 | 12 | 13 | def loadLeNet(modelpath, initscheme="none", name="lenet-5-like"): 14 | net = Sequential(name=name) 15 | 16 | net.append(Conv2D(1, 16, 3, initscheme=initscheme)) 17 | net.append(MaxPool2D()) 18 | net.append(Activation(relu)) 19 | 20 | net.append(Conv2D(16, 32, 4, initscheme=initscheme)) 21 | net.append(MaxPool2D()) 22 | net.append(Activation(relu)) 23 | 24 | net.append(Flatten()) 25 | net.append(Linear(32 * 5 * 5, 1024, initscheme=initscheme)) 26 | net.append(Activation(relu)) 27 | 28 | net.append(Linear(1024, 10, initscheme=initscheme)) 29 | 30 | if modelpath is not None: 31 | net.load(modelpath) 32 | 33 | return net 34 | 35 | 36 | def unittest(): 37 | data = gpuarray.to_gpu(np.random.randn(1, 1, 28, 28).astype(np.float32)) 38 | 39 | net = loadLeNet(None) 40 | net(data) 41 | 42 | 43 | if __name__ == "__main__": 44 | unittest() 45 | -------------------------------------------------------------------------------- /Modules/Flatten.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Modules.Module import Module 5 | 6 | 7 | class Flatten(Module): 8 | def __init__(self, name=None): 9 | super().__init__(name) 10 | 11 | self.movesData = True 12 | self.movesGrad = True 13 | 14 | self.inshape = None 15 | 16 | 17 | def updateData(self, data): 18 | self.inshape = data.shape 19 | self.data = data.reshape(data.shape[0], int(np.prod(data.shape[1:]))) 20 | 21 | 22 | def updateGrad(self, grad): 23 | self.grad = grad.reshape(self.inshape) 24 | 25 | 26 | def dataShapeFrom(self, shape): 27 | return shape[0], int(np.prod(shape[1:])) 28 | 29 | 30 | def gradShapeFrom(self, shape): 31 | return (shape[0], ) + self.inshape[1:] 32 | 33 | 34 | def calcMode(self, T): 35 | self.calctype = T 36 | 37 | 38 | def unittest(): 39 | data = gpuarray.to_gpu(np.random.randn(10, 10, 10, 10).astype(np.float32)) 40 | 41 | flatten = Flatten() 42 | flatten(data) 43 | 44 | shape = (10, 1000) 45 | assert flatten.data.shape == shape 46 | 47 | grad = gpuarray.to_gpu(np.random.randn(*flatten.data.shape).astype(np.float32)) 48 | flatten.backward(grad) 49 | 50 | assert flatten.grad.shape == data.shape 51 | 52 | 53 | if __name__ == "__main__": 54 | unittest() 55 | -------------------------------------------------------------------------------- /Converter/TensorRT/Tests/GraphTest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | 5 | from PuzzleLib.Containers import Graph 6 | from PuzzleLib.Modules import Linear, Activation, relu, Add 7 | 8 | from PuzzleLib.Converter.TensorRT.Tests.Common import benchModels 9 | from PuzzleLib.Converter.TensorRT.BuildRTEngine import buildRTEngine 10 | 11 | 12 | def main(): 13 | batchsize, insize = 16, 1000 14 | 15 | inNode = Linear(insize, 1000, name="linear1").node() 16 | node = Activation(relu, name="relu1").node(inNode) 17 | 18 | node1 = Linear(1000, 800, name="linear2").node(node) 19 | node1 = Activation(relu, name="relu2").node(node1) 20 | 21 | node2 = Linear(1000, 800, name="linear3").node(node) 22 | node2 = Activation(relu, name="relu3").node(node2) 23 | 24 | outNode = Add(name="add").node(node1, node2) 25 | 26 | 
graph = Graph(inputs=inNode, outputs=outNode, name="graph") 27 | engine = buildRTEngine(graph, (batchsize, insize), savepath="../TestData") 28 | 29 | data = gpuarray.to_gpu(np.random.randn(batchsize, insize).astype(np.float32)) 30 | 31 | outdata = graph(data) 32 | enginedata = engine(data) 33 | 34 | assert np.allclose(outdata.get(), enginedata.get(), atol=1e-6) 35 | benchModels(graph, engine, data) 36 | 37 | 38 | if __name__ == "__main__": 39 | main() 40 | -------------------------------------------------------------------------------- /Variable.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | from PuzzleLib.Backend import gpuarray 3 | 4 | 5 | class Variable: 6 | index = 0 7 | 8 | 9 | def __init__(self, data, name=None, withgrad=True, grad=None, updater=None, postUpdater=None): 10 | if name is None: 11 | self.name = str(type(self).index) 12 | type(self).index += 1 13 | else: 14 | self.name = name 15 | 16 | self.data = data 17 | self.updater = updater 18 | 19 | if updater is not None: 20 | return 21 | 22 | self.postUpdater = postUpdater 23 | self.grad = None 24 | 25 | if grad is not None: 26 | self.grad = grad 27 | 28 | elif withgrad and not Config.globalEvalMode: 29 | self.grad = gpuarray.zeros(shape=self.data.shape, dtype=self.data.dtype) 30 | 31 | self.learnRate, self.momRate = 1.0, 1.0 32 | self.wc = 0.0 33 | 34 | 35 | @property 36 | def hasUpdater(self): 37 | return self.updater is not None 38 | 39 | 40 | @property 41 | def hasPostUpdater(self): 42 | return self.postUpdater is not None 43 | 44 | 45 | def update(self, learnRate): 46 | self.updater(self, learnRate) 47 | 48 | 49 | def postUpdate(self): 50 | self.postUpdater(self) 51 | 52 | 53 | def set(self, variable): 54 | self.data.set(variable.data) 55 | 56 | if self.grad is not None: 57 | self.grad.set(variable.grad) 58 | -------------------------------------------------------------------------------- /Compiler/Compilers/NVCC.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from PuzzleLib.Compiler.Compilers.GCC import GCCLike 4 | from PuzzleLib.Compiler.Compilers.MSVC import MSVC 5 | 6 | 7 | class NVCC(GCCLike): 8 | cc = "nvcc" 9 | 10 | 11 | def __init__(self, verbose=0, forPython=False): 12 | super().__init__(verbose) 13 | cflags = MSVC.cflags if sys.platform == "win32" else self.cflags 14 | 15 | self.cflags = [flag for cflag in cflags for flag in ["-Xcompiler", cflag]] 16 | self.cpp = True 17 | 18 | if not forPython: 19 | self.ldflags = [] 20 | 21 | 22 | def cppMode(self, enabled): 23 | assert False 24 | 25 | 26 | def fullCFlags(self, asObject, debug=True, optimize=True): 27 | oflags = self.fullCppFlags() 28 | 29 | if debug and self.debuglevel > 0: 30 | oflags.extend(["-g", "-G" if self.debuglevel >= 3 else "-lineinfo"]) 31 | 32 | if optimize and self.optlevel > 0: 33 | oflags.append("-O3" if self.optlevel >= 3 else "-O%s" % self.optlevel) 34 | 35 | if self.optlevel >= 3: 36 | oflags.append("-use_fast_math") 37 | 38 | return self.cflags + oflags + ["-I%s" % idir for idir in self.includeDirs] + (["-c"] if asObject else []) 39 | 40 | 41 | def fullCppFlags(self): 42 | return [] if sys.platform == "win32" else ["-std=c++14"] 43 | 44 | 45 | def outFlags(self, extfile): 46 | return ["-o", extfile] 47 | -------------------------------------------------------------------------------- /TestLib/CnnMnistLenet.py: -------------------------------------------------------------------------------- 1 | import numpy as 
np 2 | 3 | from PuzzleLib.Datasets import MnistLoader 4 | from PuzzleLib.Visual import showFilters 5 | from PuzzleLib.Handlers import Trainer, Validator 6 | from PuzzleLib.Optimizers import MomentumSGD 7 | from PuzzleLib.Cost import CrossEntropy 8 | 9 | from PuzzleLib.Models.Nets.LeNet import loadLeNet 10 | 11 | 12 | def main(): 13 | mnist = MnistLoader() 14 | data, labels = mnist.load(path="../TestData/") 15 | data, labels = data[:], labels[:] 16 | print("Loaded mnist") 17 | 18 | np.random.seed(1234) 19 | net = loadLeNet(None, initscheme=None) 20 | 21 | optimizer = MomentumSGD() 22 | optimizer.setupOn(net, useGlobalState=True) 23 | optimizer.learnRate = 0.1 24 | optimizer.momRate = 0.9 25 | 26 | cost = CrossEntropy(maxlabels=10) 27 | trainer = Trainer(net, cost, optimizer) 28 | validator = Validator(net, cost) 29 | 30 | for i in range(15): 31 | trainer.trainFromHost( 32 | data[:60000], labels[:60000], macroBatchSize=60000, 33 | onMacroBatchFinish=lambda train: print("Train error: %s" % train.cost.getMeanError()) 34 | ) 35 | print("Accuracy: %s" % (1.0 - validator.validateFromHost(data[60000:], labels[60000:], macroBatchSize=10000))) 36 | 37 | optimizer.learnRate *= 0.9 38 | 39 | showFilters(net[0].W.get(), "../TestData/conv1.png") 40 | showFilters(net[3].W.get(), "../TestData/conv2.png") 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /Converter/OpenVINO/Tests/GraphTest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | 5 | Config.backend = Config.Backend.intel 6 | Config.globalEvalMode = True 7 | 8 | from PuzzleLib.Backend import gpuarray 9 | 10 | from PuzzleLib.Containers import Graph 11 | from PuzzleLib.Modules import Linear, Activation, relu, Add 12 | 13 | from PuzzleLib.Converter.OpenVINO.Tests.Common import benchModels 14 | from PuzzleLib.Converter.OpenVINO.BuildVINOEngine import buildVINOEngine 15 | 16 | 17 | def main(): 18 | batchsize, insize = 16, 1000 19 | 20 | inNode = Linear(insize, 1000, name="linear1").node() 21 | node = Activation(relu, name="relu1").node(inNode) 22 | 23 | node1 = Linear(1000, 800, name="linear2").node(node) 24 | node1 = Activation(relu, name="relu2").node(node1) 25 | 26 | node2 = Linear(1000, 800, name="linear3").node(node) 27 | node2 = Activation(relu, name="relu3").node(node2) 28 | 29 | outNode = Add(name="add").node(node1, node2) 30 | 31 | graph = Graph(inputs=inNode, outputs=outNode, name="graph") 32 | 33 | data = gpuarray.to_gpu(np.random.randn(batchsize, insize).astype(np.float32)) 34 | 35 | engine = buildVINOEngine(graph, (batchsize, insize), savepath="../TestData") 36 | 37 | outdata = graph(data) 38 | enginedata = engine(data) 39 | 40 | assert np.allclose(outdata.get(), enginedata.get()) 41 | benchModels(graph, engine, data) 42 | 43 | 44 | if __name__ == "__main__": 45 | main() 46 | -------------------------------------------------------------------------------- /Optimizers/SGD.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | 5 | from PuzzleLib.Backend import gpuarray 6 | from PuzzleLib.Backend.Kernels.ElementWise import toVectorAddVectorKer 7 | 8 | from PuzzleLib.Optimizers.Optimizer import Optimizer, trainSimpleTest, trainHardTest 9 | 10 | 11 | class SGD(Optimizer): 12 | def __init__(self, learnRate=1e-3, nodeinfo=None): 13 | super().__init__(nodeinfo) 14 | 
self.setAttr("learnRate", learnRate) 15 | 16 | 17 | def updateVar(self, var, state, stream=None): 18 | toVectorAddVectorKer(var.data.dtype)(var.data, var.grad, self.learnRate * var.learnRate, stream=stream) 19 | 20 | 21 | def unittest(): 22 | for dtype, atol in gpuarray.dtypesSupported(): 23 | calcTest(dtype, atol) 24 | trainSimpleTest(SGD, dtype, learnRate=1e-1) 25 | 26 | if Config.backend == Config.Backend.cuda: 27 | trainHardTest(SGD, dtype, learnRate=1e-1) 28 | 29 | 30 | def calcTest(dtype, atol): 31 | lr = 0.01 32 | shape = (11, 13) 33 | 34 | hostW, hostDw = np.random.randn(*shape).astype(dtype), np.random.randn(*shape).astype(dtype) 35 | 36 | w, dw = gpuarray.to_gpu(hostW), gpuarray.to_gpu(hostDw) 37 | toVectorAddVectorKer(w.dtype)(w, dw, lr) 38 | 39 | hostW, hostDw = hostW.astype(np.float32), hostDw.astype(np.float32) 40 | 41 | hostW += lr * hostDw 42 | hostW, hostDw = hostW.astype(dtype), hostDw.astype(dtype) 43 | 44 | assert np.allclose(hostW, w.get(), atol=atol) 45 | 46 | 47 | if __name__ == "__main__": 48 | unittest() 49 | -------------------------------------------------------------------------------- /Converter/TensorRT/Source/Plugins.cpp: -------------------------------------------------------------------------------- 1 | #include "Plugins.h" 2 | 3 | 4 | PuzzlePlugin::PuzzlePlugin() {} 5 | 6 | PuzzlePlugin::PuzzlePlugin(const void *serialData, size_t serialLength) 7 | { 8 | (void)serialLength; 9 | const char *buffer = static_cast(serialData); 10 | 11 | readValue(buffer, m_inshape); 12 | readValue(buffer, m_outshape); 13 | readValue(buffer, m_datatype); 14 | } 15 | 16 | size_t PuzzlePlugin::getSerializationSize() const 17 | { 18 | return sizeof(m_inshape) + sizeof(m_outshape) + sizeof(m_datatype); 19 | } 20 | 21 | void PuzzlePlugin::serialize(void *serialData) const 22 | { 23 | char *buffer = static_cast(serialData); 24 | 25 | writeValue(buffer, m_inshape); 26 | writeValue(buffer, m_outshape); 27 | writeValue(buffer, m_datatype); 28 | } 29 | 30 | void PuzzlePlugin::setPluginNamespace(const char *pluginNamespace) { m_ns = std::string(pluginNamespace); } 31 | const char *PuzzlePlugin::getPluginNamespace() const { return m_ns.c_str(); } 32 | 33 | 34 | const char *PuzzlePluginCreator::getPluginVersion() const { return version; } 35 | void PuzzlePluginCreator::setPluginNamespace(const char *pluginNamespace) { m_ns = std::string(pluginNamespace); } 36 | const char *PuzzlePluginCreator::getPluginNamespace() const { return m_ns.c_str(); } 37 | 38 | 39 | const char *PuzzlePluginCreator::version = "1"; 40 | const char *PuzzlePluginCreator::reflectPad1DName = "reflectpad1d"; 41 | const char *PuzzlePluginCreator::instNorm2DName = "instnorm2d"; 42 | -------------------------------------------------------------------------------- /Hip/GPUArray.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Cuda.GPUArray import extendGPUArray, arithmTest 4 | 5 | from PuzzleLib.Hip import Driver as HipDriver 6 | from PuzzleLib.Hip.SourceModule import HipEltwiseKernel, HipEltHalf2Kernel, HipReductionKernel 7 | 8 | 9 | HipGPUArray = extendGPUArray(HipDriver, HipEltwiseKernel, HipEltHalf2Kernel, HipReductionKernel) 10 | 11 | 12 | def unittest(): 13 | from PuzzleLib.Hip import Backend 14 | 15 | for deviceIdx in range(Backend.getDeviceCount()): 16 | bnd = Backend.getBackend(deviceIdx) 17 | 18 | for dtype, _ in bnd.dtypesSupported(): 19 | arithmTest(bnd, dtype) 20 | memoryTest(bnd, dtype) 21 | 22 | 23 | def memoryTest(bnd, dtype): 
24 | hostA = np.random.randn(10, 10).astype(dtype) 25 | a = bnd.GPUArray.toGpu(hostA) 26 | 27 | b = a[:, :6] 28 | hostB = hostA[:, :6] 29 | 30 | assert np.allclose(hostB.reshape((2, 5, 6)), b.reshape(2, 5, 6).get()) 31 | assert np.allclose(hostB.reshape((5, 2, 3, 2)), b.reshape(5, 2, 3, 2).get()) 32 | assert np.allclose(hostB.reshape((10, 1, 6)), b.reshape(10, 1, 6).get()) 33 | 34 | hostA = np.random.randn(10, 10, 10).astype(dtype) 35 | a = bnd.GPUArray.toGpu(hostA) 36 | 37 | b = a[:, :, :6] 38 | assert np.allclose(hostA[:, :, :6], b.get()) 39 | 40 | hostB = np.random.randn(*b.shape).astype(dtype) 41 | b.set(hostB) 42 | assert np.allclose(hostB, b.get()) 43 | 44 | hostB = b.get() 45 | b = a[:, :6, :6] 46 | assert np.allclose(hostB[:, :6, :6], b.get()) 47 | 48 | 49 | if __name__ == "__main__": 50 | unittest() 51 | -------------------------------------------------------------------------------- /Backend/Kernels/Pad.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | 3 | 4 | reflectpad1d = None 5 | reflectpad1dBackward = None 6 | 7 | reflectpad2d = None 8 | reflectpad2dBackward = None 9 | 10 | 11 | def autoinit(): 12 | if not Config.shouldInit(): 13 | return 14 | 15 | if Config.backend == Config.Backend.cuda: 16 | initCuda() 17 | elif Config.backend == Config.Backend.hip: 18 | initHip() 19 | elif Config.isCPUBased(Config.backend): 20 | initCPU() 21 | else: 22 | raise Config.ConfigError(Config.backend) 23 | 24 | 25 | def initCuda(): 26 | from PuzzleLib.Cuda import Backend 27 | initGPU(Backend) 28 | 29 | 30 | def initHip(): 31 | from PuzzleLib.Hip import Backend 32 | initGPU(Backend) 33 | 34 | 35 | def initGPU(Backend): 36 | backend = Backend.getBackend(Config.deviceIdx, initmode=2, logger=Config.getLogger()) 37 | memoryPool, padmod = backend.memoryPool, backend.padmod 38 | 39 | def wrapReflectPad(data, pad): 40 | return padmod.reflectpad(data, pad, memoryPool) 41 | 42 | def wrapReflectPadBackward(grad, pad): 43 | return padmod.reflectpadBackward(grad, pad, memoryPool) 44 | 45 | global reflectpad1d, reflectpad1dBackward, reflectpad2d, reflectpad2dBackward 46 | reflectpad1d = reflectpad2d = wrapReflectPad 47 | reflectpad1dBackward = reflectpad2dBackward = wrapReflectPadBackward 48 | 49 | 50 | def initCPU(): 51 | from PuzzleLib.CPU.Kernels import Pad 52 | 53 | global reflectpad1d 54 | reflectpad1d = Pad.reflectpad1d 55 | 56 | global reflectpad2d 57 | reflectpad2d = Pad.reflectpad2d 58 | 59 | 60 | autoinit() 61 | -------------------------------------------------------------------------------- /Backend/Kernels/PRelu.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | 3 | 4 | prelu = None 5 | preluBackwardData = None 6 | preluBackwardParams = None 7 | 8 | 9 | def autoinit(): 10 | if not Config.shouldInit(): 11 | return 12 | 13 | if Config.backend == Config.Backend.cuda: 14 | initCuda() 15 | elif Config.backend == Config.Backend.hip: 16 | initHip() 17 | elif Config.isCPUBased(Config.backend): 18 | initCPU() 19 | else: 20 | raise Config.ConfigError(Config.backend) 21 | 22 | 23 | def initCuda(): 24 | from PuzzleLib.Cuda import Backend 25 | initGPU(Backend) 26 | 27 | 28 | def initHip(): 29 | from PuzzleLib.Hip import Backend 30 | initGPU(Backend) 31 | 32 | 33 | def initGPU(Backend): 34 | backend = Backend.getBackend(Config.deviceIdx, initmode=2, logger=Config.getLogger()) 35 | memoryPool, prelumod = backend.memoryPool, backend.prelumod 36 | 37 | def wrapPRelu(data, 
slopes, inplace, sharedMaps): 38 | return prelumod.prelu(data, slopes, inplace, sharedMaps, memoryPool) 39 | 40 | def wrapPReluBackwardData(grad, slopes, indata, sharedMaps): 41 | return prelumod.preluBackwardData(grad, slopes, indata, sharedMaps, memoryPool) 42 | 43 | def wrapPReluBackwardParams(indata, outgrad, sharedMaps): 44 | return prelumod.preluBackwardParams(indata, outgrad, sharedMaps, memoryPool) 45 | 46 | global prelu, preluBackwardData, preluBackwardParams 47 | prelu = wrapPRelu 48 | preluBackwardData = wrapPReluBackwardData 49 | preluBackwardParams = wrapPReluBackwardParams 50 | 51 | 52 | def initCPU(): 53 | pass 54 | 55 | 56 | autoinit() 57 | -------------------------------------------------------------------------------- /Datasets/PathLoader.py: -------------------------------------------------------------------------------- 1 | import os, shutil, zipfile 2 | 3 | from PuzzleLib.Datasets.InputLoader import InputLoader 4 | 5 | 6 | class PathLoader(InputLoader): 7 | def __init__(self, onFile=None, exts=None, dataname=None, cachename=None, onFileList=None, doOpen=True): 8 | super().__init__(onFile, exts, dataname, cachename, onFileList) 9 | self.doOpen = doOpen 10 | 11 | 12 | class Path: 13 | def __init__(self, path): 14 | self.path = path 15 | 16 | 17 | def __enter__(self): 18 | return self 19 | 20 | 21 | def __exit__(self, exc_type, exc_val, exc_tb): 22 | pass 23 | 24 | 25 | def checkInput(self, path): 26 | if not os.path.exists(path): 27 | raise RuntimeError("Path '%s' does not exist" % path) 28 | 29 | 30 | def openInput(self, path): 31 | return self.Path(path) 32 | 33 | 34 | def loadFilelist(self, path): 35 | lst = [] 36 | 37 | for dirpath, dirnames, filenames in os.walk(path.path): 38 | lst.extend([file for file in filenames if any([file.lower().endswith(ext) for ext in self.exts])]) 39 | 40 | return lst 41 | 42 | 43 | def openFile(self, path, file): 44 | fullname = os.path.join(path.path, file) 45 | return open(fullname, mode="rb") if self.doOpen else fullname 46 | 47 | 48 | def unittest(): 49 | zipname = "../TestData/test.zip" 50 | path = os.path.splitext(zipname)[0] 51 | 52 | zipfile.ZipFile(zipname).extractall(path) 53 | 54 | loader = PathLoader() 55 | loader.load(path) 56 | loader.clear() 57 | 58 | shutil.rmtree(path) 59 | 60 | 61 | if __name__ == "__main__": 62 | unittest() 63 | -------------------------------------------------------------------------------- /Compiler/Codegen/Malloc/TMalloc.c: -------------------------------------------------------------------------------- 1 | #undef NDEBUG 2 | #include <assert.h> 3 | 4 | #include "AllocTree.gen.h" 5 | #include "$HEADER_NAME" 6 | 7 | 8 | static AllocTree allocTree; 9 | static AllocTree_Iterator allocIterator; 10 | 11 | 12 | void *${NAME}_malloc(size_t size, const char *file, int line) 13 | { 14 | void *ptr = malloc(size); 15 | 16 | Allocation alloc; 17 | alloc.size = size; 18 | alloc.file = file; 19 | alloc.line = line; 20 | 21 | bool inserted = AllocTree_insert(&allocTree, ptr, alloc); 22 | assert(inserted); 23 | 24 | return ptr; 25 | } 26 | 27 | 28 | void ${NAME}_free(void *ptr) 29 | { 30 | if (ptr != NULL) 31 | { 32 | Allocation alloc; 33 | 34 | bool found = AllocTree_get(&allocTree, ptr, &alloc); 35 | assert(found); 36 | 37 | bool deleted = AllocTree_delete(&allocTree, ptr); 38 | assert(deleted); 39 | } 40 | 41 | free(ptr); 42 | } 43 | 44 | 45 | size_t ${NAME}_traceLeaks(void) 46 | { 47 | return allocTree.size; 48 | } 49 | 50 | 51 | bool ${NAME}_Iterator_init(void) 52 | { 53 | return AllocTree_Iterator_init(&allocIterator,
&allocTree, true); 54 | } 55 | 56 | 57 | void ${NAME}_Iterator_dealloc(void) 58 | { 59 | AllocTree_Iterator_dealloc(&allocIterator); 60 | } 61 | 62 | 63 | bool ${NAME}_Iterator_move(void) 64 | { 65 | return AllocTree_Iterator_move(&allocIterator, true); 66 | } 67 | 68 | 69 | void ${NAME}_Iterator_item(size_t *size, const char **file, int *line) 70 | { 71 | void *ptr; 72 | Allocation alloc; 73 | 74 | AllocTree_Iterator_item(&allocIterator, &ptr, &alloc); 75 | 76 | *size = alloc.size; 77 | *file = alloc.file; 78 | *line = alloc.line; 79 | } 80 | -------------------------------------------------------------------------------- /Converter/Examples/ResNet.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | Config.globalEvalMode = True 3 | 4 | from PuzzleLib.Backend import gpuarray 5 | from PuzzleLib.Models.Nets.ResNet import loadResNet 6 | 7 | from PuzzleLib.Converter.Examples.Common import loadResNetSample, loadLabels, showLabelResults 8 | 9 | 10 | def main(): 11 | resNet50Test() 12 | resNet101Test() 13 | resNet152Test() 14 | 15 | 16 | def resNet50Test(): 17 | net = loadResNet(modelpath="../TestData/ResNet-50-model.hdf", layers="50") 18 | 19 | sample = loadResNetSample(net, "../TestData/tarantula.jpg") 20 | labels = loadLabels(synpath="../TestData/synsets.txt", wordpath="../TestData/synset_words.txt") 21 | 22 | res = net(gpuarray.to_gpu(sample)).get().reshape(-1) 23 | showLabelResults(res, labels, header=net.name) 24 | 25 | 26 | def resNet101Test(): 27 | net = loadResNet(modelpath="../TestData/ResNet-101-model.hdf", layers="101") 28 | 29 | sample = loadResNetSample(net, "../TestData/tarantula.jpg") 30 | labels = loadLabels(synpath="../TestData/synsets.txt", wordpath="../TestData/synset_words.txt") 31 | 32 | res = net(gpuarray.to_gpu(sample)).get().reshape(-1) 33 | showLabelResults(res, labels, header=net.name) 34 | 35 | 36 | def resNet152Test(): 37 | net = loadResNet(modelpath="../TestData/ResNet-152-model.hdf", layers="152") 38 | 39 | sample = loadResNetSample(net, "../TestData/tarantula.jpg") 40 | labels = loadLabels(synpath="../TestData/synsets.txt", wordpath="../TestData/synset_words.txt") 41 | 42 | res = net(gpuarray.to_gpu(sample)).get().reshape(-1) 43 | showLabelResults(res, labels, header=net.name) 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PuzzleLib 2 | 3 | [PuzzleLib](https://puzzlelib.org) is a high level Deep Learning framework with CPU (Intel/AMD) and GPU (NVIDIA/AMD) support. The library is written in Python, it is modular and dynamic. 4 | 5 | ## Installation 6 | 7 | For detailed instructions on how to install the library and dependencies, see the documentation for [Windows](https://puzzlelib.org/ru/documentation/general/installation/windows/) and [Linux/macOS](https://puzzlelib.org/ru/documentation/general/installation/linux/) installation. 8 | 9 | PuzzleLib supports: 10 | 11 | * NVIDIA GPU (CUDA backend); 12 | * AMD GPU (ROCm backend); 13 | * Intel CPU (mkl-dnn backend); 14 | * AMD CPU (numpy backend). 15 | 16 | 17 | ## Documentation 18 | 19 | https://puzzlelib.org 20 | 21 | 22 | ## License 23 | 24 | [Apache License 2.0](LICENSE) 25 | 26 | ___ 27 | # PuzzleLib 28 | 29 | [PuzzleLib](https://puzzlelib.org) - это библиотека для построения нейронных сетей с поддержкой вычислений на CPU (Intel/AMD) и GPU (NVIDIA/AMD). 
-------------------------------------------------------------------------------- /TestLib/GradientCheck.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | 5 | from PuzzleLib.Containers import Sequential 6 | from PuzzleLib.Modules import Conv2D, AvgPool2D, BatchNorm2D, Activation, relu, Flatten 7 | from PuzzleLib.Cost import BCE 8 | 9 | 10 | def buildNet(): 11 | net = Sequential(name="test-net") 12 | 13 | net.append(Conv2D(1, 2, 3, wscale=1.0, initscheme="gaussian")) 14 | net.append(AvgPool2D(2, 2)) 15 | 16 | net.append(BatchNorm2D(2)) 17 | net.append(Activation(relu)) 18 | 19 | net.append(Conv2D(2, 1, 2, wscale=1.0, initscheme="gaussian")) 20 | net.append(Flatten()) 21 | 22 | return net 23 | 24 | 25 | def gradientCheck(mod, data, target, cost, h=1e-3): 26 | vartable = mod.getVarTable() 27 | 28 | mod(data) 29 | error, grad = cost(mod.data, target) 30 | mod.backward(grad, updGrad=False) 31 | 32 | for var in vartable.keys(): 33 | w = var.data.get() 34 | dw = -var.grad.get() 35 | 36 | for i in range(w.ravel().shape[0]): 37 | wph = np.copy(w) 38 | wmh = np.copy(w) 39 | 40 | wph.ravel()[i] = w.ravel()[i] + h 41 | var.data.set(wph) 42 | yph, _ = cost(mod(data), target) 43 | 44 | wmh.ravel()[i] = w.ravel()[i] - h 45 | var.data.set(wmh) 46 | ymh, _ = cost(mod(data), target) 47 | 48 | host = (yph - ymh) / (2.0 * h) 49 | dev = dw.ravel()[i] 50 | var.data.set(w) 51 | 52 | print(abs((host - dev) / (dev + h))) 53 | 54 | 55 | def main(): 56 | net = buildNet() 57 | cost = BCE() 58 | 59 | data = gpuarray.to_gpu(np.random.randn(1, 1, 6, 6).astype(np.float32)) 60 | target = gpuarray.to_gpu(np.random.randint(0, 2, size=(1, ))) 61 | 62 | gradientCheck(net, data, target, cost) 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /Backend/Kernels/Pool.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | 3 | 4 | maxpool2d = None 5 | maxpool2dBackward = None 6 | maxunpool2d = None 7 | maxunpool2dBackward = None 8 | 9 | 10 | def autoinit(): 11 | if not Config.shouldInit(): 12 | return 13 | 14 | if Config.backend == Config.Backend.cuda: 15 | initCuda() 16 | elif Config.backend == Config.Backend.hip: 17 | initHip() 18 | elif Config.isCPUBased(Config.backend): 19 | initCPU() 20 | else: 21 | raise Config.ConfigError(Config.backend) 22 | 23 | 24 | def initCuda(): 25 | from PuzzleLib.Cuda import Backend 26 | initGPU(Backend) 27 | 28 | 29 | def initHip(): 30 | from PuzzleLib.Hip import Backend 31 | initGPU(Backend) 32 | 33 | 34 | def initGPU(Backend): 35 | backend = Backend.getBackend(Config.deviceIdx, initmode=2, logger=Config.getLogger()) 36 | memoryPool, poolmod = backend.memoryPool, backend.poolmod
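# As in Pad.py and PRelu.py above, the wrappers below close over the chosen
# backend's memory pool, so callers see plain kernel signatures while all
# allocations still go through the per-device pool.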
37 | 38 | def wrapMaxPool2d(data, size, stride, pad): 39 | return poolmod.maxpool2d(data, size, stride, pad, memoryPool) 40 | 41 | def wrapMaxPool2dBackward(grad, origshape, mask, size, stride, pad): 42 | return poolmod.maxpool2dBackward(grad, origshape, mask, size, stride, pad, memoryPool) 43 | 44 | global maxpool2d, maxpool2dBackward 45 | maxpool2d = wrapMaxPool2d 46 | maxpool2dBackward = wrapMaxPool2dBackward 47 | 48 | def wrapMaxUnpool2d(data, origshape, mask): 49 | return poolmod.maxunpool2d(data, origshape, mask, memoryPool) 50 | 51 | def wrapMaxUnpool2dBackward(grad, poolshape, mask): 52 | return poolmod.maxunpool2dBackward(grad, poolshape, mask, memoryPool) 53 | 54 | global maxunpool2d, maxunpool2dBackward 55 | maxunpool2d = wrapMaxUnpool2d 56 | maxunpool2dBackward = wrapMaxUnpool2dBackward 57 | 58 | 59 | def initCPU(): 60 | pass 61 | 62 | 63 | autoinit() 64 | -------------------------------------------------------------------------------- /Converter/TensorRT/DataCalibrator.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | 3 | import numpy as np 4 | 5 | from PuzzleLib.Backend import gpuarray 6 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 7 | from PuzzleLib.Converter.TensorRT import Driver 8 | 9 | 10 | class CalibratorError(Exception): 11 | pass 12 | 13 | 14 | class DataCalibrator(Driver.ICalibrator): 15 | def __init__(self, data, batchsize=100, cachename=None): 16 | super().__init__("" if cachename is None else cachename) 17 | 18 | if data is None: 19 | if cachename is None: 20 | raise CalibratorError("Invalid calibration cache file") 21 | 22 | self.nbatches = 0 23 | 24 | else: 25 | if data.shape[0] % batchsize != 0: 26 | raise CalibratorError("TensorRT calibration engine requires data size to be divisible by batch size") 27 | 28 | if data.dtype != np.float32: 29 | raise CalibratorError("Invalid data type") 30 | 31 | self.nbatches = data.shape[0] // batchsize 32 | 33 | self.data = data 34 | self.idx = 0 35 | 36 | self.batchsize = batchsize 37 | self.batch = None 38 | 39 | 40 | def getDataShape(self): 41 | return self.data.shape[1:] 42 | 43 | 44 | def getBatchSize(self): 45 | return self.batchsize 46 | 47 | 48 | def getBatch(self, bindings, names): 49 | assert len(bindings) == 1 and len(names) == 1 50 | 51 | if self.idx >= self.nbatches: 52 | return False 53 | 54 | self.batch = gpuarray.to_gpu( 55 | self.data[self.idx * self.batchsize:(self.idx + 1) * self.batchsize], allocator=memPool 56 | ) 57 | 58 | ptr = ctypes.cast(bindings[0], ctypes.POINTER(ctypes.c_void_p)) 59 | ptr.contents.value = self.batch.ptr 60 | 61 | print("Sending batch #%s out of %s" % (self.idx + 1, self.nbatches)) 62 | self.idx += 1 63 | 64 | return True 65 | -------------------------------------------------------------------------------- /TestLib/RnnIMDBTrain.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Backend import Dnn 2 | 3 | from PuzzleLib.Containers import Sequential 4 | from PuzzleLib.Modules import Embedder, SwapAxes, RNN, Linear 5 | 6 | from PuzzleLib.Datasets import IMDBLoader 7 | from PuzzleLib.Handlers import Trainer, Validator 8 | from PuzzleLib.Optimizers import Adam 9 | from PuzzleLib.Cost import BCE 10 | 11 | 12 | def buildNet(numwords, maxlen, hintBatchsize): 13 | seq = Sequential() 14 | seq.append(Embedder(numwords, maxlen, 128, initscheme="uniform", wscale=0.05, learnable=True)) 15 | 16 | seq.append(SwapAxes(0, 1)) 17 | seq.append(RNN(128, 128, mode="lstm", 
dropout=0.2, hintBatchSize=hintBatchsize)) 18 | 19 | seq.append(Linear(128, 1)) 20 | return seq 21 | 22 | 23 | def main(): 24 | hintBatchsize, batchsize = (40, 40) if Dnn.deviceSupportsBatchHint() else (None, 32) 25 | numwords, maxlen = 20000, 80 26 | 27 | imdb = IMDBLoader(numwords=numwords, maxlen=maxlen) 28 | data, labels, _ = imdb.load(path="../TestData/") 29 | data, labels = data[:], labels[:] 30 | print("Loaded IMDB") 31 | 32 | net = buildNet(numwords, maxlen, hintBatchsize) 33 | 34 | optimizer = Adam(alpha=1e-3) 35 | optimizer.setupOn(net, useGlobalState=True) 36 | 37 | cost = BCE() 38 | trainer = Trainer(net, cost, optimizer, batchsize=batchsize) 39 | validator = Validator(net, cost, batchsize=batchsize) 40 | 41 | print("Started training ...") 42 | 43 | for i in range(15): 44 | trainer.trainFromHost( 45 | data[:25000], labels[:25000], macroBatchSize=25000, 46 | onMacroBatchFinish=lambda train: print("Train error: %s" % train.cost.getMeanError()) 47 | ) 48 | print("Accuracy: %s" % (1.0 - validator.validateFromHost(data[25000:], labels[25000:], macroBatchSize=25000))) 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /Converter/Examples/Common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Visual 4 | 5 | 6 | def loadV3Labels(filename): 7 | with open(filename) as f: 8 | synsets = f.readlines() 9 | synsets = [line.strip() for line in synsets] 10 | 11 | labels = {} 12 | for i, synset in enumerate(synsets): 13 | labels[i] = synset 14 | 15 | return labels 16 | 17 | 18 | def loadLabels(synpath, wordpath): 19 | with open(synpath) as f: 20 | synsets = f.readlines() 21 | synsets = [line.strip() for line in synsets] 22 | 23 | with open(wordpath) as f: 24 | lines = f.readlines() 25 | lines = [line.strip() for line in lines] 26 | 27 | words = {} 28 | for line in lines: 29 | tags = line.split(sep=" ", maxsplit=1) 30 | words[tags[0]] = tags[1] 31 | 32 | labels = {} 33 | for i, synset in enumerate(synsets): 34 | labels[i] = words[synset] 35 | 36 | return labels 37 | 38 | 39 | def showLabelResults(res, labels, limit=5, header=""): 40 | idx = (-res).argsort()[:limit] 41 | 42 | print("%sTop-%s predictions:" % ("%s " % header if len(header) > 0 else "", limit)) 43 | for i in range(limit): 44 | print("#%s %s (prob=%s)" % (i + 1, labels[idx[i]], res[idx[i]])) 45 | 46 | 47 | def loadVGGSample(filename, shape=None, normalize=False): 48 | meanPixel = np.array([103.939, 116.779, 123.68], dtype=np.float32).reshape((1, 3, 1, 1)) 49 | sample = loadSample(filename, shape) - meanPixel 50 | 51 | return sample * (2.0 / 255.0) - 1.0 if normalize else sample 52 | 53 | 54 | def loadResNetSample(net, filename, shape=None): 55 | mean = net.getAttr("mean") 56 | return loadSample(filename, shape) - mean 57 | 58 | 59 | def loadSample(filename, shape=None): 60 | return np.ascontiguousarray( 61 | Visual.loadImage(filename, shape, normalize=False, contiguous=False)[:, ::-1, :, :], 62 | dtype=np.float32 63 | ) 64 | -------------------------------------------------------------------------------- /Backend/Kernels/Upsample.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | 3 | 4 | upsample2d = None 5 | upsample2dBackward = None 6 | 7 | upsample3d = None 8 | upsample3dBackward = None 9 | 10 | 11 | def autoinit(): 12 | if not Config.shouldInit(): 13 | return 14 | 15 | if 
Config.backend == Config.Backend.cuda: 16 | initCuda() 17 | elif Config.backend == Config.Backend.hip: 18 | initHip() 19 | elif Config.isCPUBased(Config.backend): 20 | initCPU() 21 | else: 22 | raise Config.ConfigError(Config.backend) 23 | 24 | 25 | def initCuda(): 26 | from PuzzleLib.Cuda import Backend 27 | initGPU(Backend) 28 | 29 | 30 | def initHip(): 31 | from PuzzleLib.Hip import Backend 32 | initGPU(Backend) 33 | 34 | 35 | def initGPU(Backend): 36 | backend = Backend.getBackend(Config.deviceIdx, initmode=2, logger=Config.getLogger()) 37 | memoryPool, upsamplemod = backend.memoryPool, backend.upsamplemod 38 | 39 | def wrapUpsample2d(data, scale, mode): 40 | return upsamplemod.upsample2d(data, scale, mode, memoryPool) 41 | 42 | def wrapUpsample2dBackward(grad, scale, mode): 43 | return upsamplemod.upsample2dBackward(grad, scale, mode, memoryPool) 44 | 45 | global upsample2d, upsample2dBackward 46 | upsample2d = wrapUpsample2d 47 | upsample2dBackward = wrapUpsample2dBackward 48 | 49 | def wrapUpsample3d(data, scale, mode): 50 | return upsamplemod.upsample3d(data, scale, mode, memoryPool) 51 | 52 | def wrapUpsample3dBackward(grad, scale, mode): 53 | return upsamplemod.upsample3dBackward(grad, scale, mode, memoryPool) 54 | 55 | global upsample3d, upsample3dBackward 56 | upsample3d = wrapUpsample3d 57 | upsample3dBackward = wrapUpsample3dBackward 58 | 59 | 60 | def initCPU(): 61 | from PuzzleLib.CPU.Kernels import Upsample2D 62 | 63 | global upsample2d 64 | upsample2d = Upsample2D.upsample2d 65 | 66 | 67 | autoinit() 68 | -------------------------------------------------------------------------------- /Intel/Wrappers/DNNLBlas.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.CPU.CPUArray import CPUArray 4 | from PuzzleLib.Intel.ThirdParty import libdnnl 5 | 6 | 7 | def mulMatrixOnMatrix(A, B, out=None, transpA=False, transpB=False, alpha=1.0, beta=0.0): 8 | assert not (transpA and transpB) 9 | assert A.ndim == 2 and B.ndim == 2 10 | 11 | assert A.dtype == B.dtype and A.dtype == np.float32 12 | assert A.flags.c_contiguous and B.flags.c_contiguous 13 | 14 | if transpA: 15 | assert A.shape[0] == B.shape[0] 16 | shape = (A.shape[1], B.shape[1]) 17 | elif transpB: 18 | assert A.shape[1] == B.shape[1] 19 | shape = (A.shape[0], B.shape[0]) 20 | else: 21 | assert A.shape[1] == B.shape[0] 22 | shape = (A.shape[0], B.shape[1]) 23 | 24 | if out is None: 25 | out = CPUArray.empty(shape, dtype=np.float32) 26 | 27 | if transpA: 28 | k, m = A.shape 29 | n = B.shape[1] 30 | libdnnl.dnnl_sgemm('t', 'n', m, n, k, alpha, A.ptr, m, B.ptr, n, beta, out.ptr, n) 31 | elif transpB: 32 | m, k = A.shape 33 | n = B.shape[0] 34 | libdnnl.dnnl_sgemm('n', 't', m, n, k, alpha, A.ptr, k, B.ptr, k, beta, out.ptr, n) 35 | else: 36 | m, k = A.shape 37 | n = B.shape[1] 38 | libdnnl.dnnl_sgemm('n', 'n', m, n, k, alpha, A.ptr, k, B.ptr, n, beta, out.ptr, n) 39 | 40 | return out 41 | 42 | 43 | def unittest(): 44 | A = CPUArray.toDevice(np.random.randn(5, 3).astype(np.float32)) 45 | B = CPUArray.toDevice(np.random.randn(3, 4).astype(np.float32)) 46 | 47 | C = mulMatrixOnMatrix(A, B) 48 | assert np.allclose(np.dot(A.get(), B.get()), C.get()) 49 | 50 | F = mulMatrixOnMatrix(B, C, transpB=True) 51 | assert np.allclose(np.dot(B.get(), C.get().T), F.get()) 52 | 53 | G = mulMatrixOnMatrix(F, B, transpA=True) 54 | assert np.allclose(np.dot(F.get().T, B.get()), G.get()) 55 | 56 | 57 | if __name__ == "__main__": 58 | unittest() 59 | 
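# A shape sketch of the dispatch above (illustrative values): with A of shape
# (5, 3) and transpA=True, the 't' branch reads k, m = 5, 3 and passes lda = m,
# the row stride of the k x m row-major buffer, so dnnl_sgemm computes A^T * B.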
-------------------------------------------------------------------------------- /Modules/Add.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray, Blas 4 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 5 | 6 | from PuzzleLib.Modules.Module import ModuleError, Module 7 | 8 | 9 | class Add(Module): 10 | def __init__(self, name=None): 11 | super().__init__(name) 12 | self.movesGrad = True 13 | 14 | 15 | def updateData(self, data): 16 | firstdata = data[0] 17 | 18 | self.data = gpuarray.empty(firstdata.shape, dtype=firstdata.dtype, allocator=memPool) 19 | self.data.fill(0) 20 | 21 | for dat in data: 22 | Blas.toVectorAddVector(self.data.ravel(), dat.ravel()) 23 | 24 | 25 | def updateGrad(self, grad): 26 | self.grad = [grad] * len(self.inData) 27 | 28 | 29 | def checkDataShape(self, shapes): 30 | for shape in shapes: 31 | if shape != shapes[0]: 32 | raise ModuleError("Shape %s is not equal to initial shape %s" % (shape, shapes[0])) 33 | 34 | 35 | def dataShapeFrom(self, shape): 36 | return shape[0] 37 | 38 | 39 | def gradShapeFrom(self, shape): 40 | return [shape] * len(self.inData) 41 | 42 | 43 | def calcMode(self, T): 44 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 45 | 46 | if T not in dtypes: 47 | raise ModuleError("Unsupported dtype %s" % T) 48 | 49 | self.calctype = T 50 | 51 | 52 | def unittest(): 53 | for dtype, _ in gpuarray.dtypesSupported(): 54 | addTest(dtype) 55 | 56 | 57 | def addTest(dtype): 58 | hostData1 = np.random.randn(2, 5, 5).astype(dtype) 59 | hostData2 = np.random.randn(*hostData1.shape).astype(dtype) 60 | 61 | data1, data2 = gpuarray.to_gpu(hostData1), gpuarray.to_gpu(hostData2) 62 | 63 | add = Add() 64 | add.calcMode(dtype) 65 | 66 | add([data1, data2]) 67 | assert np.allclose(hostData1 + hostData2, add.data.get()) 68 | 69 | 70 | if __name__ == "__main__": 71 | unittest() 72 | -------------------------------------------------------------------------------- /Compiler/Codegen/Malloc/Generate.py: -------------------------------------------------------------------------------- 1 | import os 2 | from string import Template 3 | 4 | from PuzzleLib.Compiler.Codegen.Tree.Generate import generateTree 5 | from PuzzleLib.Compiler.Toolchain import createTemplateNames, writeTemplates, buildTemplateTest 6 | 7 | 8 | def generateMalloc(name=None, filename=None): 9 | treename = generateTree( 10 | name="AllocTree", K="VoidPtr", V="Allocation", 11 | headerPreambule= 12 | """ 13 | typedef void *VoidPtr; 14 | 15 | 16 | typedef struct Allocation 17 | { 18 | size_t size; 19 | const char *file; 20 | int line; 21 | } 22 | Allocation; 23 | """, 24 | filename=os.path.join(os.path.dirname(filename), "AllocTree") 25 | ) 26 | 27 | name = "TraceMalloc" if name is None else name 28 | 29 | filename = name if filename is None else filename 30 | headername, bodyname = createTemplateNames(filename) 31 | 32 | dirname = os.path.dirname(__file__) 33 | 34 | with open(os.path.join(dirname, "TMalloc.h"), mode="r", encoding="utf-8") as f: 35 | header = Template(f.read()).substitute(NAME=name) 36 | 37 | with open(os.path.join(dirname, "TMalloc.c"), mode="r", encoding="utf-8") as f: 38 | body = Template(f.read()).substitute(HEADER_NAME=os.path.basename(headername), NAME=name) 39 | 40 | writeTemplates([ 41 | (header, headername), 42 | (body, bodyname) 43 | ]) 44 | 45 | return [bodyname, treename] 46 | 47 | 48 | def unittest(): 49 | TraceMalloc = buildTemplateTest( 50 | 
name="TraceMalloc", bindingName="TMallocTest.c", path="../../TestData", generator=generateMalloc, 51 | defines=["ENABLE_TRACE_MALLOC"] 52 | ) 53 | 54 | ptr = TraceMalloc.malloc(16) 55 | 56 | leaks = TraceMalloc.traceLeaks() 57 | assert len(leaks) == 1 58 | 59 | TraceMalloc.free(ptr) 60 | 61 | leaks = TraceMalloc.traceLeaks() 62 | assert len(leaks) == 0 63 | 64 | 65 | if __name__ == "__main__": 66 | unittest() 67 | -------------------------------------------------------------------------------- /Modules/Penalty.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | import numpy as np 4 | 5 | from PuzzleLib.Backend import gpuarray, Blas 6 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 7 | from PuzzleLib.Backend.Kernels.ElementWise import l1penaltyKer 8 | 9 | from PuzzleLib.Modules.Module import Module 10 | 11 | 12 | class PenaltyMode(str, Enum): 13 | l1 = "l1" 14 | l2 = "l2" 15 | 16 | 17 | class Penalty(Module): 18 | def __init__(self, mode="l1", weight=1e-2, name=None): 19 | super().__init__(name) 20 | self.registerBlueprint(locals()) 21 | 22 | self.gradUsesOutData = True 23 | self.movesData = True 24 | 25 | self.mode = PenaltyMode(mode) 26 | self.weight = weight 27 | 28 | 29 | def updateData(self, data): 30 | self.data = data 31 | 32 | 33 | def updateGrad(self, grad): 34 | if self.mode == PenaltyMode.l1: 35 | self.grad = gpuarray.empty(grad.shape, dtype=grad.dtype, allocator=memPool) 36 | l1penaltyKer(self.grad, grad, self.data, self.weight / grad.shape[0]) 37 | 38 | elif self.mode == PenaltyMode.l2: 39 | self.grad = Blas.addVectorToVector( 40 | grad.ravel(), self.data.ravel(), alpha=1.0, beta=-self.weight / grad.shape[0] 41 | ).reshape(grad.shape) 42 | 43 | else: 44 | raise NotImplementedError(self.mode) 45 | 46 | 47 | def dataShapeFrom(self, shape): 48 | return shape 49 | 50 | 51 | def gradShapeFrom(self, shape): 52 | return shape 53 | 54 | 55 | def unittest(): 56 | data = gpuarray.to_gpu(np.random.randn(10, 50).astype(np.float32)) 57 | 58 | penalty = Penalty() 59 | penalty(data) 60 | 61 | grad = gpuarray.to_gpu(np.random.randn(10, 50).astype(np.float32)) 62 | penalty.backward(grad) 63 | 64 | hostGrad = grad.get() - penalty.weight * np.sign(data.get()) / data.shape[0] 65 | assert np.allclose(hostGrad, penalty.grad.get()) 66 | 67 | 68 | if __name__ == "__main__": 69 | unittest() 70 | -------------------------------------------------------------------------------- /Converter/OpenVINO/Source/Build.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | import pybind11 3 | 4 | from PuzzleLib.Compiler.Toolchain import guessToolchain 5 | 6 | 7 | def buildDriver(debugmode=0): 8 | cc = prepareCompiler(debugmode=debugmode) 9 | 10 | driver = "../Driver%s" % cc.pydext 11 | cc.build(driver, sourcefiles="./Driver.cpp").clearPath("..") 12 | 13 | return driver 14 | 15 | 16 | def findLibraryPath(): 17 | OPENVINO_PATH = os.environ.get("OPENVINO_PATH", None) 18 | 19 | if OPENVINO_PATH is None: 20 | if sys.platform == "linux": 21 | OPENVINO_PATH = "/opt/intel/openvino" 22 | 23 | elif sys.platform == "win32": 24 | raise OSError("OpenVINO path needs to be specified in the system variables as OPENVINO_PATH") 25 | 26 | else: 27 | raise NotImplementedError(sys.platform) 28 | 29 | return OPENVINO_PATH 30 | 31 | 32 | def prepareCompiler(debugmode=0): 33 | level, debuglevel = (0, 3) if debugmode > 0 else (4, 0) 34 | 35 | cc = 
guessToolchain(verbose=2).withOptimizationLevel(level=level, debuglevel=debuglevel).cppMode(True) 36 | OPENVINO_PATH = findLibraryPath() 37 | 38 | if sys.platform == "linux": 39 | cc.includeDirs.append(pybind11.get_include(user=True)) 40 | 41 | cc.addLibrary( 42 | "openvino", 43 | [os.path.join(OPENVINO_PATH, "inference_engine/include")], 44 | [os.path.join(OPENVINO_PATH, "inference_engine/lib/intel64")], 45 | ["inference_engine"] 46 | ) 47 | 48 | elif sys.platform == "win32": 49 | cc.addLibrary( 50 | "openvino", 51 | [os.path.join(OPENVINO_PATH, "inference_engine/include")], 52 | [os.path.join(OPENVINO_PATH, "inference_engine/lib/intel64/Release")], 53 | ["inference_engine"] 54 | ) 55 | 56 | else: 57 | raise NotImplementedError(sys.platform) 58 | 59 | return cc 60 | 61 | 62 | def main(): 63 | return buildDriver(debugmode=0) 64 | 65 | 66 | if __name__ == "__main__": 67 | main() 68 | -------------------------------------------------------------------------------- /Cost/MSE.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray, Blas 4 | from PuzzleLib.Cost.Cost import Cost 5 | 6 | 7 | class MSE(Cost): 8 | def calcGrad(self, pred, target): 9 | c = 1.0 / np.prod(target.shape) 10 | grad = Blas.addVectorToVector(target.ravel(), pred.ravel(), alpha=c, beta=-c) 11 | grad = grad.reshape(pred.shape) 12 | 13 | return grad 14 | 15 | 16 | def calcError(self, pred, target): 17 | self.devErr.fill( 18 | Blas.dot(self.grad.ravel(), self.grad.ravel()) * np.prod(self.grad.shape) * self.grad.shape[0] / 2.0 19 | ) 20 | self.accumErr += self.devErr 21 | 22 | 23 | def calcVal(self, pred, target): 24 | diff = Blas.addVectorToVector(target.ravel(), pred.ravel(), alpha=1.0, beta=-1.0) 25 | error = Blas.dot(diff, diff) / (2.0 * np.prod(target.shape)) 26 | 27 | return error 28 | 29 | 30 | def checkDataShape(self, pred, target): 31 | assert pred.shape[1:] == target.shape[1:] 32 | 33 | 34 | def checkValDataShape(self, pred, target): 35 | assert pred.shape[1:] == target.shape[1:] 36 | 37 | 38 | def unittest(): 39 | errorTest() 40 | valTest() 41 | 42 | 43 | def errorTest(): 44 | pred = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 45 | target = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 46 | 47 | mse = MSE() 48 | mse(pred, target) 49 | 50 | assert np.isclose(mse.error, np.linalg.norm(target.get() - pred.get())**2 / (2.0 * np.prod(target.shape))) 51 | 52 | 53 | def valTest(): 54 | pred = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 55 | target = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 56 | 57 | mse = MSE() 58 | error = mse.validate(pred, target) 59 | 60 | assert np.isclose(error, np.linalg.norm(target.get() - pred.get())**2 / (2.0 * np.prod(target.shape))) 61 | 62 | 63 | if __name__ == "__main__": 64 | unittest() 65 | -------------------------------------------------------------------------------- /TestLib/BiRnnIMDBTrain.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Backend import Dnn 2 | 3 | from PuzzleLib.Containers import Sequential 4 | from PuzzleLib.Modules import Embedder, SwapAxes, RNN, Concat, Dropout, Linear 5 | 6 | from PuzzleLib.Datasets import IMDBLoader 7 | from PuzzleLib.Handlers import Trainer, Validator 8 | from PuzzleLib.Optimizers import Adam 9 | from PuzzleLib.Cost import BCE 10 | 11 | 12 | def buildNet(numwords, maxlen, hintBatchsize): 13 | seq = Sequential() 14 | 
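	# Embedder yields (batch, maxlen, 128); SwapAxes makes the sequence
	# time-major for the RNN, the bidirectional LSTM emits 64 features per
	# direction, and Concat joins them into the 128 inputs of the Linear head.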
seq.append(Embedder(numwords, maxlen, 128, initscheme="uniform", wscale=0.05, learnable=True)) 15 | 16 | seq.append(SwapAxes(0, 1)) 17 | seq.append(RNN(128, 64, mode="lstm", direction="bi", hintBatchSize=hintBatchsize)) 18 | 19 | seq.append(Concat(axis=1)) 20 | seq.append(Dropout(p=0.5)) 21 | 22 | seq.append(Linear(128, 1)) 23 | return seq 24 | 25 | 26 | def main(): 27 | hintBatchsize, batchsize = (40, 40) if Dnn.deviceSupportsBatchHint() else (None, 32) 28 | numwords, maxlen = 20000, 100 29 | 30 | imdb = IMDBLoader(numwords=numwords, maxlen=maxlen) 31 | data, labels, _ = imdb.load(path="../TestData/") 32 | data, labels = data[:], labels[:] 33 | print("Loaded IMDB") 34 | 35 | net = buildNet(numwords, maxlen, hintBatchsize) 36 | 37 | optimizer = Adam(alpha=1e-3) 38 | optimizer.setupOn(net, useGlobalState=True) 39 | 40 | cost = BCE() 41 | trainer = Trainer(net, cost, optimizer, batchsize=batchsize) 42 | validator = Validator(net, cost, batchsize=batchsize) 43 | 44 | print("Started training ...") 45 | 46 | for i in range(15): 47 | trainer.trainFromHost( 48 | data[:25000], labels[:25000], macroBatchSize=25000, 49 | onMacroBatchFinish=lambda train: print("Train error: %s" % train.cost.getMeanError()) 50 | ) 51 | print("Accuracy: %s" % (1.0 - validator.validateFromHost(data[25000:], labels[25000:], macroBatchSize=25000))) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /TestLib/CnnIMDBTrain.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Datasets import IMDBLoader 2 | 3 | from PuzzleLib.Containers import Sequential 4 | from PuzzleLib.Modules import Embedder, Dropout, SwapAxes, Conv1D, Activation, relu, MaxPool1D, Flatten, Linear 5 | 6 | from PuzzleLib.Handlers import Trainer, Validator 7 | from PuzzleLib.Optimizers import Adam 8 | from PuzzleLib.Cost import BCE 9 | 10 | 11 | def buildNet(numwords, maxlen, embsize): 12 | seq = Sequential() 13 | 14 | seq.append(Embedder(numwords, maxlen, embsize, initscheme="uniform", wscale=0.05, learnable=True)) 15 | 16 | seq.append(Dropout(p=0.2)) 17 | seq.append(SwapAxes(1, 2)) 18 | 19 | seq.append(Conv1D(embsize, embsize, 3)) 20 | seq.append(Activation(relu)) 21 | 22 | seq.append(MaxPool1D(maxlen - 2, 1)) 23 | seq.append(Flatten()) 24 | 25 | seq.append(Linear(embsize, 250)) 26 | seq.append(Dropout(p=0.2)) 27 | seq.append(Activation(relu)) 28 | 29 | seq.append(Linear(250, 1)) 30 | return seq 31 | 32 | 33 | def main(): 34 | numwords, maxlen, embsize = 5000, 250, 50 35 | 36 | imdb = IMDBLoader(numwords=numwords, maxlen=maxlen) 37 | data, labels, _ = imdb.load(path="../TestData/") 38 | data, labels = data[:], labels[:] 39 | print("Loaded IMDB") 40 | 41 | net = buildNet(numwords, maxlen, embsize) 42 | 43 | optimizer = Adam(alpha=1e-3) 44 | optimizer.setupOn(net, useGlobalState=True) 45 | 46 | cost = BCE() 47 | trainer = Trainer(net, cost, optimizer, batchsize=32) 48 | validator = Validator(net, cost, batchsize=32) 49 | 50 | for i in range(15): 51 | trainer.trainFromHost( 52 | data[:25000], labels[:25000], macroBatchSize=25000, 53 | onMacroBatchFinish=lambda train: print("Train error: %s" % train.cost.getMeanError()) 54 | ) 55 | print("Accuracy: %s" % (1.0 - validator.validateFromHost(data[25000:], labels[25000:], macroBatchSize=25000))) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /Modules/Pool1D.py: 
-------------------------------------------------------------------------------- 1 | from PuzzleLib.Backend import gpuarray 2 | from PuzzleLib.Modules.Module import ModuleError, Module 3 | 4 | 5 | class Pool1D(Module): 6 | def __init__(self, size=2, stride=2, pad=0, name=None): 7 | super().__init__(name) 8 | self.gradUsesOutData = True 9 | 10 | self.size = (1, size) 11 | self.stride = (1, stride) 12 | self.pad = (0, pad) 13 | 14 | self.workspace = None 15 | 16 | 17 | def dataShapeFrom(self, shape): 18 | batchsize, maps, insize = shape 19 | 20 | _, size = self.size 21 | _, pad = self.pad 22 | _, stride = self.stride 23 | 24 | outsize = (insize + 2 * pad - size) // stride + 1 25 | 26 | return batchsize, maps, outsize 27 | 28 | 29 | def checkDataShape(self, shape): 30 | if len(shape) != 3: 31 | raise ModuleError("Data must be 3d tensor") 32 | 33 | _, _, insize = shape 34 | if insize + 2 * self.pad[1] < self.size[1]: 35 | raise ModuleError("Data maps size is too small (got %d, expected at least %d)" % 36 | (insize + 2 * self.pad[1], self.size[1])) 37 | 38 | 39 | def gradShapeFrom(self, shape): 40 | batchsize, maps, outsize = shape 41 | 42 | _, size = self.size 43 | _, pad = self.pad 44 | _, stride = self.stride 45 | 46 | insize = (outsize - 1) * stride - 2 * pad + size 47 | 48 | return batchsize, maps, insize 49 | 50 | 51 | def checkGradShape(self, shape): 52 | if len(shape) != 3: 53 | raise ModuleError("Grad must be 3d tensor") 54 | 55 | 56 | def updateData(self, data): 57 | raise NotImplementedError() 58 | 59 | 60 | def updateGrad(self, grad): 61 | raise NotImplementedError() 62 | 63 | 64 | def reset(self): 65 | super().reset() 66 | self.workspace = None 67 | 68 | 69 | def calcMode(self, T): 70 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 71 | 72 | if T not in dtypes: 73 | raise ModuleError("Unsupported dtype %s" % T) 74 | 75 | self.calctype = T 76 | -------------------------------------------------------------------------------- /Transformers/Serial.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Transformers.Provider import Provider 4 | 5 | 6 | class Serial(Provider): 7 | def __init__(self, dataset, labels=None, numofthreads=4): 8 | super().__init__(numofthreads) 9 | 10 | self.datalen = dataset.shape[0] 11 | 12 | self.labels = labels 13 | self.dataset = dataset 14 | 15 | self.index = 0 16 | 17 | 18 | def getNextChunk(self, chunksize, **kwargs): 19 | if chunksize >= self.datalen: 20 | self.index = 0 21 | 22 | if self.labels is not None: 23 | return np.array(self.dataset), np.array(self.labels) 24 | else: 25 | return np.array(self.dataset) 26 | 27 | begin = self.index 28 | end = self.index + chunksize 29 | 30 | if end > self.datalen: 31 | chunk = np.empty((chunksize, ) + self.dataset.shape[1:], dtype=self.dataset.dtype) 32 | tup = chunk 33 | 34 | chunk[:self.datalen - begin] = self.dataset[begin:self.datalen] 35 | 36 | self.index = end - self.datalen 37 | chunk[self.datalen - begin:] = self.dataset[:self.index] 38 | 39 | if self.labels is not None: 40 | labels = np.empty((chunksize, ) + self.labels.shape[1:], dtype=self.labels.dtype) 41 | tup = (chunk, labels) 42 | 43 | labels[:self.datalen - begin] = self.labels[begin:self.datalen] 44 | labels[self.datalen - begin:] = self.labels[:self.index] 45 | 46 | else: 47 | self.index = end 48 | chunk = np.array(self.dataset[begin:end]) 49 | tup = chunk 50 | 51 | if self.labels is not None: 52 | labels = np.array(self.labels[begin:end]) 53 | tup =
(chunk, labels) 54 | 55 | return tup 56 | 57 | 58 | def unittest(): 59 | from PuzzleLib.Datasets.ZipLoader import ZipLoader 60 | 61 | zipfile = ZipLoader() 62 | data = zipfile.load("../TestData/test.zip") 63 | 64 | with Serial(data) as serial: 65 | for _ in range(10): 66 | serial.prepareData(chunksize=4) 67 | serial.getData() 68 | 69 | 70 | if __name__ == "__main__": 71 | unittest() 72 | -------------------------------------------------------------------------------- /Modules/Replicate.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray, Blas 4 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 5 | 6 | from PuzzleLib.Modules.Module import ModuleError, Module 7 | 8 | 9 | class Replicate(Module): 10 | def __init__(self, times, name=None): 11 | super().__init__(name) 12 | self.registerBlueprint(locals()) 13 | 14 | self.movesData = True 15 | self.times = times 16 | 17 | 18 | def updateData(self, data): 19 | self.data = [data] * self.times 20 | 21 | 22 | def updateGrad(self, grad): 23 | firstgrad = grad[0] 24 | 25 | self.grad = gpuarray.empty(firstgrad.shape, dtype=firstgrad.dtype, allocator=memPool) 26 | self.grad.fill(0) 27 | 28 | for gr in grad: 29 | Blas.toVectorAddVector(self.grad.ravel(), gr.ravel()) 30 | 31 | 32 | def dataShapeFrom(self, shape): 33 | return [shape] * self.times 34 | 35 | 36 | def gradShapeFrom(self, shape): 37 | return shape[0] 38 | 39 | 40 | def calcMode(self, T): 41 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 42 | 43 | if T not in dtypes: 44 | raise ModuleError("Unsupported dtype %s" % T) 45 | 46 | self.calctype = T 47 | 48 | 49 | def unittest(): 50 | for dtype, _ in gpuarray.dtypesSupported(): 51 | replicateTest(dtype) 52 | 53 | 54 | def replicateTest(dtype): 55 | hostData = np.random.randn(10, 10, 3, 3).astype(dtype) 56 | data = gpuarray.to_gpu(hostData) 57 | 58 | times = 3 59 | 60 | repl = Replicate(times) 61 | repl.calcMode(dtype) 62 | 63 | repl(data) 64 | 65 | assert len(repl.data) == times 66 | 67 | hostGrad = [np.random.randn(10, 10, 3, 3).astype(dtype) for _ in range(times)] 68 | grad = [gpuarray.to_gpu(gr) for gr in hostGrad] 69 | 70 | repl.backward(grad) 71 | 72 | hostInGrad = np.zeros(grad[0].shape, dtype=dtype) 73 | for i in range(times): 74 | hostInGrad += hostGrad[i] 75 | 76 | assert np.allclose(hostInGrad, repl.grad.get()) 77 | 78 | 79 | if __name__ == "__main__": 80 | unittest() 81 | -------------------------------------------------------------------------------- /Compiler/Codegen/PyDefines/PyDefines.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <stdbool.h> 3 | 4 | 5 | #if defined(__clang__) 6 | #pragma GCC diagnostic push 7 | #pragma GCC diagnostic ignored "-Wvisibility" 8 | 9 | #elif defined(_MSC_VER) 10 | #pragma warning(push) 11 | #pragma warning(disable: 4115) 12 | 13 | #endif 14 | 15 | #include <Python.h> 16 | #include <structmember.h> 17 | 18 | #if defined(__clang__) 19 | #pragma GCC diagnostic pop 20 | 21 | #elif defined(_MSC_VER) 22 | #pragma warning(pop) 23 | 24 | #endif 25 | 26 | 27 | inline static bool createPyClass(PyObject *module, const char *name, PyType_Spec *spec, PyTypeObject **pType) 28 | { 29 | PyTypeObject *type = (PyTypeObject *)PyType_FromSpec(spec); 30 | if (type == NULL) 31 | return false; 32 | 33 | if (PyModule_AddObject(module, name, (PyObject *)type) < 0) 34 | { 35 | Py_DECREF(type); 36 | return false; 37 | } 38 | 39 | Py_INCREF(type); 40 | *pType = type; 41 | 42 |
return true; 43 | } 44 | 45 | inline static bool createPyExc(PyObject *module, const char *name, const char *fullname, PyObject **pExc) 46 | { 47 | PyObject *exc = PyErr_NewException(fullname, NULL, NULL); 48 | if (exc == NULL) 49 | return false; 50 | 51 | if (PyModule_AddObject(module, name, exc) < 0) 52 | { 53 | Py_DECREF(exc); 54 | return false; 55 | } 56 | 57 | Py_INCREF(exc); 58 | *pExc = exc; 59 | 60 | return true; 61 | } 62 | 63 | inline static bool unpackPyOptional(PyObject **pObj, PyTypeObject *type, const char *key) 64 | { 65 | PyObject *obj = *pObj; 66 | 67 | if (obj != NULL && Py_TYPE(obj) != type && obj != Py_None) 68 | { 69 | PyErr_Format( 70 | PyExc_TypeError, "%s must be %s or %s, not %s", 71 | key, type->tp_name, Py_TYPE(Py_None)->tp_name, Py_TYPE(obj)->tp_name 72 | ); 73 | return false; 74 | } 75 | 76 | *pObj = (obj == Py_None) ? NULL : obj; 77 | return true; 78 | } 79 | 80 | #define REMOVE_PY_OBJECT(pObj) do { PyObject *obj = (PyObject *)*(pObj); Py_DECREF(obj); *(pObj) = NULL; } while (0) 81 | -------------------------------------------------------------------------------- /TestLib/ResumeTrain.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | 5 | from PuzzleLib.Datasets import MnistLoader 6 | from PuzzleLib.Handlers import Trainer, Validator 7 | from PuzzleLib.Optimizers import MomentumSGD 8 | from PuzzleLib.Cost import CrossEntropy 9 | 10 | from PuzzleLib.Models.Nets.LeNet import loadLeNet 11 | 12 | 13 | def train(net, optimizer, data, labels, epochs): 14 | cost = CrossEntropy(maxlabels=10) 15 | trainer = Trainer(net, cost, optimizer) 16 | validator = Validator(net, cost) 17 | 18 | for i in range(epochs): 19 | trainer.trainFromHost( 20 | data[:60000], labels[:60000], macroBatchSize=60000, 21 | onMacroBatchFinish=lambda tr: print("Train error: %s" % tr.cost.getMeanError()) 22 | ) 23 | print("Accuracy: %s" % (1.0 - validator.validateFromHost(data[60000:], labels[60000:], macroBatchSize=10000))) 24 | 25 | optimizer.learnRate *= 0.9 26 | print("Reduced optimizer learn rate to %s" % optimizer.learnRate) 27 | 28 | 29 | def main(): 30 | mnist = MnistLoader() 31 | data, labels = mnist.load(path="../TestData/") 32 | data, labels = data[:], labels[:] 33 | print("Loaded mnist") 34 | 35 | np.random.seed(1234) 36 | net = loadLeNet(None, initscheme=None) 37 | 38 | optimizer = MomentumSGD() 39 | optimizer.setupOn(net, useGlobalState=True) 40 | optimizer.learnRate = 0.1 41 | optimizer.momRate = 0.9 42 | 43 | epochs = 10 44 | print("Training for %s epochs ..." % epochs) 45 | train(net, optimizer, data, labels, epochs) 46 | 47 | print("Saving net and optimizer ...") 48 | net.save("../TestData/net.hdf") 49 | optimizer.save("../TestData/optimizer.hdf") 50 | 51 | print("Reloading net and optimizer ...") 52 | net.load("../TestData/net.hdf") 53 | optimizer.load("../TestData/optimizer.hdf") 54 | 55 | print("Continuing training for %s epochs ..." 
% epochs) 56 | train(net, optimizer, data, labels, epochs) 57 | 58 | os.remove("../TestData/net.hdf") 59 | os.remove("../TestData/optimizer.hdf") 60 | 61 | 62 | if __name__ == "__main__": 63 | main() 64 | -------------------------------------------------------------------------------- /Optimizers/AdaGrad.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | 5 | from PuzzleLib.Backend import gpuarray 6 | from PuzzleLib.Backend.Kernels.ElementWise import adagradKer 7 | 8 | from PuzzleLib.Optimizers.Optimizer import Optimizer, trainSimpleTest, trainHardTest 9 | 10 | 11 | class AdaGrad(Optimizer): 12 | def __init__(self, learnRate=1e-3, epsilon=1e-8, nodeinfo=None): 13 | super().__init__(nodeinfo) 14 | 15 | self.epsilon = None 16 | 17 | self.setAttr("learnRate", learnRate) 18 | self.setAttr("epsilon", epsilon) 19 | 20 | 21 | def setupState(self, var): 22 | return {"h": gpuarray.zeros(var.data.shape, dtype=var.data.dtype)} 23 | 24 | 25 | def updateVar(self, var, state, stream=None): 26 | adagradKer(var.data.dtype)( 27 | var.data, var.grad, state["h"], self.learnRate * var.learnRate, self.epsilon, stream=stream 28 | ) 29 | 30 | 31 | def unittest(): 32 | for dtype, atol in gpuarray.dtypesSupported(): 33 | calcTest(dtype, atol) 34 | trainSimpleTest(AdaGrad, dtype, learnRate=1e-2) 35 | 36 | if Config.backend == Config.Backend.cuda: 37 | trainHardTest(AdaGrad, dtype, learnRate=1e-2) 38 | 39 | 40 | def calcTest(dtype, atol): 41 | lr, epsilon = 0.01, 1e-8 42 | shape = (11, 13) 43 | 44 | hostW, hostDw = np.random.randn(*shape).astype(dtype), np.random.randn(*shape).astype(dtype) 45 | hostH = (1.0 + np.random.randn(*shape)**2).astype(dtype) 46 | 47 | w, dw, h = gpuarray.to_gpu(hostW), gpuarray.to_gpu(hostDw), gpuarray.to_gpu(hostH) 48 | adagradKer(w.dtype)(w, dw, h, lr, epsilon) 49 | 50 | hostW, hostDw, hostH = hostW.astype(np.float32), hostDw.astype(np.float32), hostH.astype(np.float32) 51 | 52 | hostH += hostDw**2 53 | hostW += lr * hostDw / (np.sqrt(hostH) + epsilon) 54 | 55 | hostW, hostDw, hostH = hostW.astype(dtype), hostDw.astype(dtype), hostH.astype(dtype) 56 | 57 | assert np.allclose(hostH, h.get(), atol=atol) 58 | assert np.allclose(hostW, w.get(), atol=atol) 59 | 60 | 61 | if __name__ == "__main__": 62 | unittest() 63 | -------------------------------------------------------------------------------- /Compiler/Codegen/Vector/Generate.py: -------------------------------------------------------------------------------- 1 | import os, random 2 | from string import Template 3 | 4 | from PuzzleLib.Compiler.Toolchain import createTemplateNames, writeTemplates, buildTemplateTest 5 | 6 | 7 | def generateVector(name, T, borrow="(void)", destruct="(void)", minCapacity=16, 8 | headerPreambule=None, bodyPreambule=None, malloc="malloc", free="free", filename=None): 9 | headerPreambule = "%s\n\n" % headerPreambule if headerPreambule is not None else "" 10 | bodyPreambule = "%s\n\n" % bodyPreambule if bodyPreambule is not None else "" 11 | 12 | filename = name if filename is None else filename 13 | headername, bodyname = createTemplateNames(filename) 14 | 15 | dirname = os.path.dirname(__file__) 16 | headerTmpl, bodyTmpl = os.path.join(dirname, "TVector.h"), os.path.join(dirname, "TVector.c") 17 | 18 | with open(headerTmpl, mode="r", encoding="utf-8") as f: 19 | header = Template(f.read()).substitute(HEADER_PREAMBULE=headerPreambule, NAME=name, T=T) 20 | 21 | with open(bodyTmpl, mode="r", encoding="utf-8") as f: 22 
| body = Template(f.read()).substitute( 23 | HEADER_NAME=os.path.basename(headername), BODY_PREAMBULE=bodyPreambule, NAME=name, T=T, 24 | MIN_CAPACITY=minCapacity, MALLOC=malloc, FREE=free, BORROW=borrow, DESTRUCT=destruct 25 | ) 26 | 27 | writeTemplates([ 28 | (header, headername), 29 | (body, bodyname) 30 | ]) 31 | 32 | return bodyname 33 | 34 | 35 | def unittest(): 36 | IntVector = buildTemplateTest( 37 | name="IntVector", bindingName="TVectorTest.c", path="../../TestData", generator=generateVector, T="int" 38 | ) 39 | 40 | size = 1 << 16 41 | 42 | pyvec = list(range(size)) 43 | random.shuffle(pyvec) 44 | 45 | vector = IntVector.IntVector() 46 | 47 | for i in pyvec: 48 | vector.append(i) 49 | 50 | assert len(vector) == size 51 | 52 | for i in range(size): 53 | assert vector[i] == pyvec[i] 54 | 55 | for i in reversed(pyvec): 56 | assert vector.pop() == i 57 | 58 | assert len(vector) == 0 59 | 60 | 61 | if __name__ == "__main__": 62 | unittest() 63 | -------------------------------------------------------------------------------- /TestLib/EncoderTrain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 5 | 6 | from PuzzleLib.Containers import Sequential 7 | from PuzzleLib.Modules import Linear, Activation, relu, Dropout 8 | 9 | from PuzzleLib.Datasets import MnistLoader 10 | from PuzzleLib.Visual import showFilters 11 | from PuzzleLib.Optimizers import MomentumSGD 12 | from PuzzleLib.Cost import MSE 13 | from PuzzleLib.Variable import Variable 14 | 15 | 16 | def buildEncoder(): 17 | seq = Sequential() 18 | 19 | seq.append(Linear(784, 256)) 20 | seq.append(Activation(relu, inplace=True)) 21 | seq.append(Dropout()) 22 | seq.append(Linear(256, 784, empty=True, transpose=True)) 23 | 24 | seq[-1].setVar("W", seq[0].vars["W"]) 25 | seq[-1].setVar("b", Variable(gpuarray.zeros((784,), dtype=np.float32, allocator=memPool))) 26 | 27 | return seq 28 | 29 | 30 | def main(): 31 | mnist = MnistLoader() 32 | data, _ = mnist.load(path="../TestData") 33 | data = data[:].reshape(data.shape[0], -1) 34 | print("Loaded mnist") 35 | 36 | np.random.seed(1234) 37 | net = buildEncoder() 38 | 39 | optimizer = MomentumSGD() 40 | optimizer.setupOn(net, useGlobalState=True) 41 | optimizer.learnRate = 10.0 42 | optimizer.momRate = 0.5 43 | 44 | data = gpuarray.to_gpu(data) 45 | batchsize = 100 46 | 47 | mse = MSE() 48 | 49 | for epoch in range(40): 50 | for i in range(data.shape[0] // batchsize): 51 | batch = data[i * batchsize:(i + 1) * batchsize] 52 | 53 | net(batch) 54 | _, grad = mse(net.data, batch) 55 | 56 | net.zeroGradParams() 57 | net.backward(grad) 58 | optimizer.update() 59 | 60 | optimizer.learnRate *= 0.8 61 | print("Finished epoch %d" % (epoch + 1)) 62 | 63 | print("Error: %s" % (mse.getMeanError())) 64 | mse.resetAccumulator() 65 | 66 | if (epoch + 1) % 5 == 0: 67 | filters = net[0].W.get().T 68 | showFilters(filters.reshape(16, 16, 28, 28), "../TestData/encoder.png") 69 | 70 | 71 | if __name__ == "__main__": 72 | main() 73 | -------------------------------------------------------------------------------- /Optimizers/MomentumSGD.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | 5 | from PuzzleLib.Backend import gpuarray 6 | from PuzzleLib.Backend.Kernels.ElementWise import classicMomSGDKer 7 | 8 | from PuzzleLib.Optimizers.Optimizer import 
trainSimpleTest, trainHardTest 9 | from PuzzleLib.Optimizers.SGD import SGD 10 | 11 | 12 | class MomentumSGD(SGD): 13 | def __init__(self, learnRate=1e-3, momRate=0.9, nodeinfo=None): 14 | super().__init__(learnRate, nodeinfo) 15 | 16 | self.momRate = None 17 | self.setAttr("momRate", momRate) 18 | 19 | 20 | def setupState(self, var): 21 | return {"mom": gpuarray.zeros(var.data.shape, dtype=var.data.dtype)} 22 | 23 | 24 | def updateVar(self, var, state, stream=None): 25 | classicMomSGDKer(var.data.dtype)( 26 | var.data, var.grad, state["mom"], self.learnRate * var.learnRate, self.momRate * var.momRate, stream=stream 27 | ) 28 | 29 | 30 | def unittest(): 31 | for dtype, atol in gpuarray.dtypesSupported(): 32 | calcTest(dtype, atol) 33 | trainSimpleTest(MomentumSGD, dtype, learnRate=1e-1, momRate=0.9) 34 | 35 | if Config.backend == Config.Backend.cuda: 36 | trainHardTest(MomentumSGD, dtype, learnRate=1e-1, momRate=0.9) 37 | 38 | 39 | def calcTest(dtype, atol): 40 | lr, mr = 0.01, 0.9 41 | shape = (11, 13) 42 | 43 | hostW, hostDw = np.random.randn(*shape).astype(dtype), np.random.randn(*shape).astype(dtype) 44 | hostMom = np.random.randn(*shape).astype(dtype) 45 | 46 | w, dw, mom = gpuarray.to_gpu(hostW), gpuarray.to_gpu(hostDw), gpuarray.to_gpu(hostMom) 47 | classicMomSGDKer(w.dtype)(w, dw, mom, lr, mr) 48 | 49 | hostW, hostDw, hostMom = hostW.astype(np.float32), hostDw.astype(np.float32), hostMom.astype(np.float32) 50 | 51 | hostMom = mr * hostMom + lr * hostDw 52 | hostW += hostMom 53 | 54 | hostW, hostDw, hostMom = hostW.astype(dtype), hostDw.astype(dtype), hostMom.astype(dtype) 55 | 56 | assert np.allclose(hostMom, mom.get(), atol=atol) 57 | assert np.allclose(hostW, w.get(), atol=atol) 58 | 59 | 60 | if __name__ == "__main__": 61 | unittest() 62 | -------------------------------------------------------------------------------- /Cost/Abs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray, Blas 4 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 5 | from PuzzleLib.Backend.Kernels.ElementWise import l1gradKer 6 | 7 | from PuzzleLib.Cost.Cost import Cost 8 | 9 | 10 | class Abs(Cost): 11 | def calcGrad(self, pred, target): 12 | grad = gpuarray.empty(pred.shape, dtype=np.float32, allocator=memPool) 13 | norm = 1.0 / np.prod(target.shape) 14 | 15 | l1gradKer(grad, pred, target, norm) 16 | 17 | return grad 18 | 19 | 20 | def calcError(self, pred, target): 21 | diff = Blas.addVectorToVector(pred.ravel(), target.ravel(), alpha=1.0, beta=-1.0) 22 | 23 | self.devErr.fill(Blas.vectorL1Norm(diff) / np.prod(pred.shape[1:])) 24 | self.accumErr += self.devErr 25 | 26 | 27 | def calcVal(self, pred, target): 28 | diff = Blas.addVectorToVector(pred.ravel(), target.ravel(), alpha=1.0, beta=-1.0) 29 | error = Blas.vectorL1Norm(diff) / np.prod(target.shape) 30 | 31 | return error 32 | 33 | 34 | def checkDataShape(self, pred, target): 35 | assert pred.shape[1:] == target.shape[1:] 36 | 37 | 38 | def checkValDataShape(self, pred, target): 39 | assert pred.shape[1:] == target.shape[1:] 40 | 41 | 42 | def unittest(): 43 | errorTest() 44 | valTest() 45 | 46 | 47 | def errorTest(): 48 | pred = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 49 | target = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 50 | 51 | abscost = Abs() 52 | abscost(pred, target) 53 | 54 | assert np.isclose(abscost.error, np.linalg.norm((target.get() - pred.get()).ravel(), ord=1) / 
np.prod(target.shape)) 55 | 56 | 57 | def valTest(): 58 | pred = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 59 | target = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 60 | 61 | abscost = Abs() 62 | error = abscost.validate(pred, target) 63 | 64 | assert np.isclose(error, np.linalg.norm((target.get() - pred.get()).ravel(), ord=1) / np.prod(target.shape)) 65 | 66 | 67 | if __name__ == "__main__": 68 | unittest() 69 | -------------------------------------------------------------------------------- /Compiler/Codegen/Tree/Generate.py: -------------------------------------------------------------------------------- 1 | import os, random 2 | from string import Template 3 | 4 | from PuzzleLib.Compiler.Toolchain import createTemplateNames, writeTemplates, buildTemplateTest 5 | 6 | 7 | def generateTree(name, K, V, headerPreambule=None, bodyPreambule=None, malloc="malloc", free="free", filename=None): 8 | headerPreambule = "%s\n\n" % headerPreambule if headerPreambule is not None else "" 9 | bodyPreambule = "%s\n\n" % bodyPreambule if bodyPreambule is not None else "" 10 | 11 | filename = name if filename is None else filename 12 | headername, bodyname = createTemplateNames(filename) 13 | 14 | dirname = os.path.dirname(__file__) 15 | headerTmpl, bodyTmpl = os.path.join(dirname, "TTree.h"), os.path.join(dirname, "TTree.c") 16 | 17 | with open(headerTmpl, mode="r", encoding="utf-8") as f: 18 | header = Template(f.read()).substitute(HEADER_PREAMBULE=headerPreambule, NAME=name, K=K, V=V) 19 | 20 | with open(bodyTmpl, mode="r", encoding="utf-8") as f: 21 | body = Template(f.read()).substitute( 22 | HEADER_NAME=os.path.basename(headername), BODY_PREAMBULE=bodyPreambule, NAME=name, K=K, V=V, 23 | MALLOC=malloc, FREE=free 24 | ) 25 | 26 | writeTemplates([ 27 | (header, headername), 28 | (body, bodyname) 29 | ]) 30 | 31 | return bodyname 32 | 33 | 34 | def unittest(): 35 | IntTree = buildTemplateTest( 36 | name="IntTree", bindingName="TTreeTest.c", path="../../TestData", generator=generateTree, K="int", V="int" 37 | ) 38 | 39 | size = 1 << 16 40 | 41 | keys, values = list(range(size)), list(range(size)) 42 | random.shuffle(keys) 43 | random.shuffle(values) 44 | 45 | pytree = {k: v for k, v in zip(keys, values)} 46 | 47 | inttree = IntTree.IntTree() 48 | 49 | for k, v in pytree.items(): 50 | inttree[k] = v 51 | 52 | assert len(inttree) == size 53 | assert inttree.validate() 54 | 55 | for k in pytree.keys(): 56 | assert inttree[k] == pytree[k] 57 | 58 | for k in pytree.keys(): 59 | del inttree[k] 60 | 61 | assert len(inttree) == 0 62 | assert inttree.validate() 63 | 64 | 65 | if __name__ == "__main__": 66 | unittest() 67 | -------------------------------------------------------------------------------- /Optimizers/NesterovSGD.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | 5 | from PuzzleLib.Backend import gpuarray 6 | from PuzzleLib.Backend.Kernels.ElementWise import nesterovMomSGDKer 7 | 8 | from PuzzleLib.Optimizers.Optimizer import trainSimpleTest, trainHardTest 9 | from PuzzleLib.Optimizers.SGD import SGD 10 | 11 | 12 | class NesterovSGD(SGD): 13 | def __init__(self, learnRate=1e-3, momRate=0.9, nodeinfo=None): 14 | super().__init__(learnRate, nodeinfo) 15 | 16 | self.momRate = None 17 | self.setAttr("momRate", momRate) 18 | 19 | 20 | def setupState(self, var): 21 | return {"mom": gpuarray.zeros(var.data.shape, dtype=var.data.dtype)} 22 | 23 | 24 | def updateVar(self, var, 
state, stream=None): 25 | nesterovMomSGDKer(var.data.dtype)( 26 | var.data, var.grad, state["mom"], self.learnRate * var.learnRate, self.momRate * var.momRate, stream=stream 27 | ) 28 | 29 | 30 | def unittest(): 31 | for dtype, atol in gpuarray.dtypesSupported(): 32 | calcTest(dtype, atol) 33 | trainSimpleTest(NesterovSGD, dtype, learnRate=1e-1, momRate=0.9) 34 | 35 | if Config.backend == Config.Backend.cuda: 36 | trainHardTest(NesterovSGD, dtype, learnRate=1e-1, momRate=0.9) 37 | 38 | 39 | def calcTest(dtype, atol): 40 | lr, mr = 0.01, 0.9 41 | shape = (11, 13) 42 | 43 | hostW, hostDw = np.random.randn(*shape).astype(dtype), np.random.randn(*shape).astype(dtype) 44 | hostMom = np.random.randn(*shape).astype(dtype) 45 | 46 | w, dw, mom = gpuarray.to_gpu(hostW), gpuarray.to_gpu(hostDw), gpuarray.to_gpu(hostMom) 47 | nesterovMomSGDKer(w.dtype)(w, dw, mom, lr, mr) 48 | 49 | hostW, hostDw, hostMom = hostW.astype(np.float32), hostDw.astype(np.float32), hostMom.astype(np.float32) 50 | 51 | hostW += mr**2 * hostMom + (1 + mr) * lr * hostDw 52 | hostMom = mr * hostMom + lr * hostDw 53 | 54 | hostW, hostDw, hostMom = hostW.astype(dtype), hostDw.astype(dtype), hostMom.astype(dtype) 55 | 56 | assert np.allclose(hostMom, mom.get(), atol=atol) 57 | assert np.allclose(hostW, w.get(), atol=atol) 58 | 59 | 60 | if __name__ == "__main__": 61 | unittest() 62 | -------------------------------------------------------------------------------- /Hip/Utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Cuda.Utils import SharedArray, shareMemTest, randomTest 4 | from PuzzleLib.Hip import Driver as HipDriver 5 | 6 | 7 | class HipSharedArray(SharedArray): 8 | GPUArray = HipDriver.GPUArray 9 | 10 | 11 | def unittest(): 12 | from PuzzleLib.Hip import Backend 13 | 14 | for deviceIdx in range(Backend.getDeviceCount()): 15 | bnd = Backend.getBackend(deviceIdx, initmode=2) 16 | 17 | for dtype, _ in bnd.dtypesSupported(): 18 | shareMemTest(bnd, dtype) 19 | memCopyTest(bnd, dtype) 20 | 21 | randomTest(bnd) 22 | 23 | 24 | def memCopyTest(bnd, dtype): 25 | hostSrc = np.random.randn(4, 4, 4, 4).astype(dtype) 26 | 27 | src = bnd.GPUArray.toGpu(hostSrc) 28 | assert np.allclose(hostSrc, src.copy().get()) 29 | 30 | hostA = np.random.randn(7, 4, 4, 4).astype(dtype) 31 | a = bnd.GPUArray.toGpu(hostA) 32 | 33 | out = bnd.concatenate((src, a), axis=0) 34 | assert np.allclose(np.concatenate((hostSrc, hostA), axis=0), out.get()) 35 | 36 | hostA = np.random.randn(4, 2, 4, 4).astype(dtype) 37 | hostB = np.random.randn(4, 1, 4, 4).astype(dtype) 38 | 39 | a, b = bnd.GPUArray.toGpu(hostA), bnd.GPUArray.toGpu(hostB) 40 | 41 | out = bnd.concatenate((src, a, b), axis=1) 42 | assert np.allclose(np.concatenate((hostSrc, hostA, hostB), axis=1), out.get()) 43 | 44 | hostA = np.random.randn(4, 4, 5, 4).astype(dtype) 45 | 46 | out = bnd.concatenate((bnd.GPUArray.toGpu(hostA), src), axis=2) 47 | assert np.allclose(np.concatenate((hostA, hostSrc), axis=2), out.get()) 48 | 49 | outs = bnd.split(src, (2, 2), axis=0) 50 | assert all(np.allclose(hostSrc[2 * i:2 * (i + 1)], out.get()) for i, out in enumerate(outs)) 51 | 52 | outs = bnd.split(src, (2, 2), axis=1) 53 | assert all(np.allclose(hostSrc[:, 2 * i:2 * (i + 1), :, :], out.get()) for i, out in enumerate(outs)) 54 | 55 | outs = bnd.split(src, (2, 2), axis=2) 56 | assert all(np.allclose(hostSrc[:, :, 2 * i:2 * (i + 1), :], out.get()) for i, out in enumerate(outs)) 57 | 58 | assert np.allclose(np.tile(hostB, (1, 3, 1, 
1)), bnd.tile(b, 3, axis=1).get()) 59 | 60 | 61 | if __name__ == "__main__": 62 | unittest() 63 | -------------------------------------------------------------------------------- /Compiler/Codegen/Malloc/TMallocTest.c: -------------------------------------------------------------------------------- 1 | #define Py_LIMITED_API 2 | #include <Python.h> 3 | 4 | #include "TraceMalloc.gen.h" 5 | 6 | 7 | static PyObject *PyTraceMalloc_malloc(PyObject *self, PyObject *args) 8 | { 9 | (void)self, (void)args; 10 | Py_ssize_t nbytes; 11 | 12 | if (!PyArg_ParseTuple(args, "n", &nbytes)) 13 | return NULL; 14 | 15 | void *ptr = TRACE_MALLOC(nbytes); 16 | return Py_BuildValue("n", (Py_ssize_t)ptr); 17 | } 18 | 19 | 20 | static PyObject *PyTraceMalloc_free(PyObject *self, PyObject *args) 21 | { 22 | (void)self, (void)args; 23 | Py_ssize_t ptr; 24 | 25 | if (!PyArg_ParseTuple(args, "n", &ptr)) 26 | return NULL; 27 | 28 | TRACE_FREE((void *)ptr); 29 | Py_RETURN_NONE; 30 | } 31 | 32 | 33 | static PyObject *PyTraceMalloc_traceLeaks(PyObject *self, PyObject *args) 34 | { 35 | (void)self, (void)args; 36 | size_t nleaks = TraceMalloc_traceLeaks(); 37 | 38 | PyObject *leaks = PyList_New(nleaks); 39 | if (leaks == NULL) 40 | return NULL; 41 | 42 | size_t index = 0; 43 | if (!TraceMalloc_Iterator_init()) 44 | return leaks; 45 | 46 | do 47 | { 48 | size_t size; 49 | const char *file; 50 | int line; 51 | 52 | TraceMalloc_Iterator_item(&size, &file, &line); 53 | 54 | PyObject *leak = Py_BuildValue("(nsi)", (Py_ssize_t)size, file, line); 55 | if (leak == NULL) 56 | goto error; 57 | 58 | PyList_SetItem(leaks, index, leak); 59 | index += 1; 60 | } 61 | while (TraceMalloc_Iterator_move()); 62 | 63 | TraceMalloc_Iterator_dealloc(); 64 | return leaks; 65 | 66 | error: 67 | TraceMalloc_Iterator_dealloc(); 68 | Py_DECREF(leaks); 69 | 70 | return NULL; 71 | } 72 | 73 | 74 | static PyModuleDef PyTraceMalloc_moduleDef = { 75 | PyModuleDef_HEAD_INIT, 76 | .m_name = "TraceMalloc", 77 | .m_methods = (PyMethodDef[]){ 78 | {"malloc", PyTraceMalloc_malloc, METH_VARARGS, NULL}, 79 | {"free", PyTraceMalloc_free, METH_VARARGS, NULL}, 80 | {"traceLeaks", PyTraceMalloc_traceLeaks, METH_NOARGS, NULL}, 81 | {NULL, NULL, 0, NULL} 82 | }, 83 | .m_slots = NULL 84 | }; 85 | 86 | 87 | PyMODINIT_FUNC PyInit_TraceMalloc(void) 88 | { 89 | return PyModule_Create(&PyTraceMalloc_moduleDef); 90 | } 91 | -------------------------------------------------------------------------------- /Optimizers/RMSProp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | 5 | from PuzzleLib.Backend import gpuarray 6 | from PuzzleLib.Backend.Kernels.ElementWise import rmspropKer 7 | 8 | from PuzzleLib.Optimizers.Optimizer import Optimizer, trainSimpleTest, trainHardTest 9 | 10 | 11 | class RMSProp(Optimizer): 12 | def __init__(self, learnRate=1e-3, factor=0.9, epsilon=1e-5, nodeinfo=None): 13 | super().__init__(nodeinfo) 14 | 15 | self.factor = None 16 | self.epsilon = None 17 | 18 | self.setAttr("learnRate", learnRate) 19 | self.setAttr("factor", factor) 20 | self.setAttr("epsilon", epsilon) 21 | 22 | 23 | def setupState(self, var): 24 | return {"ms": gpuarray.zeros(var.data.shape, dtype=var.data.dtype)} 25 | 26 | 27 | def updateVar(self, var, state, stream=None): 28 | rmspropKer(var.data.dtype)( 29 | var.data, var.grad, state["ms"], self.learnRate * var.learnRate, self.factor, self.epsilon, stream=stream 30 | ) 31 | 32 | 33 | def unittest(): 34 | for dtype, atol in
gpuarray.dtypesSupported(): 35 | calcTest(dtype, atol) 36 | trainSimpleTest(RMSProp, dtype, learnRate=1e-2) 37 | 38 | if Config.backend == Config.Backend.cuda: 39 | trainHardTest(RMSProp, dtype, learnRate=1e-2) 40 | 41 | 42 | def calcTest(dtype, atol): 43 | lr, factor, epsilon = 0.01, 0.9, 1e-5 44 | shape = (11, 13) 45 | 46 | hostW, hostDw = np.random.randn(*shape).astype(dtype), np.random.randn(*shape).astype(dtype) 47 | hostMs = (1.0 + np.random.randn(*shape)**2).astype(dtype) 48 | 49 | w, dw, ms = gpuarray.to_gpu(hostW), gpuarray.to_gpu(hostDw), gpuarray.to_gpu(hostMs) 50 | rmspropKer(w.dtype)(w, dw, ms, lr, factor, epsilon) 51 | 52 | hostW, hostDw, hostMs = hostW.astype(np.float32), hostDw.astype(np.float32), hostMs.astype(np.float32) 53 | 54 | hostMs = factor * hostMs + (1 - factor) * hostDw**2 55 | hostW += lr * hostDw / (np.sqrt(hostMs) + epsilon) 56 | 57 | hostW, hostDw, hostMs = hostW.astype(dtype), hostDw.astype(dtype), hostMs.astype(dtype) 58 | 59 | assert np.allclose(hostMs, ms.get(), atol=atol) 60 | assert np.allclose(hostW, w.get(), atol=atol) 61 | 62 | 63 | if __name__ == "__main__": 64 | unittest() 65 | -------------------------------------------------------------------------------- /Modules/Mul.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.gpuarray import copy, memoryPool as memPool 5 | from PuzzleLib.Backend.Kernels.ElementWise import mulKer 6 | 7 | from PuzzleLib.Modules.Module import ModuleError, Module 8 | 9 | 10 | class Mul(Module): 11 | def updateData(self, data): 12 | self.data = gpuarray.empty(data[0].shape, dtype=data[0].dtype, allocator=memPool) 13 | self.data.fill(1.0) 14 | 15 | for dat in data: 16 | mulKer(dat.dtype)(self.data, dat, self.data) 17 | 18 | 19 | def updateGrad(self, grad): 20 | self.grad = [] 21 | for i in range(len(self.inData)): 22 | ingrad = copy(None, grad) 23 | 24 | for k in range(len(self.inData)): 25 | if k != i: 26 | mulKer(ingrad.dtype)(ingrad, self.inData[k], ingrad) 27 | 28 | self.grad.append(ingrad) 29 | 30 | 31 | def checkDataShape(self, shapes): 32 | for shape in shapes: 33 | if shape != shapes[0]: 34 | raise ModuleError("Shape %s is not equal to initial shape %s" % (shape, shapes[0])) 35 | 36 | 37 | def dataShapeFrom(self, shape): 38 | return shape 39 | 40 | 41 | def gradShapeFrom(self, shape): 42 | return [shape] * len(self.inData) 43 | 44 | 45 | def calcMode(self, T): 46 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 47 | 48 | if T not in dtypes: 49 | raise ModuleError("Unsupported dtype %s" % T) 50 | 51 | self.calctype = T 52 | 53 | 54 | def unittest(): 55 | for dtype, _ in gpuarray.dtypesSupported(): 56 | mulTest(dtype) 57 | 58 | 59 | def mulTest(dtype): 60 | hostData1 = np.random.randn(2, 5, 5).astype(dtype) 61 | hostData2 = np.random.randn(*hostData1.shape).astype(dtype) 62 | 63 | data1, data2 = gpuarray.to_gpu(hostData1), gpuarray.to_gpu(hostData2) 64 | 65 | mul = Mul() 66 | mul.calcMode(dtype) 67 | 68 | mul([data1, data2]) 69 | assert np.allclose(mul.data.get(), hostData1 * hostData2) 70 | 71 | hostGrad = np.random.randn(*mul.data.shape).astype(dtype) 72 | 73 | grad = gpuarray.to_gpu(hostGrad) 74 | mul.backward(grad) 75 | 76 | assert np.allclose(mul.grad[0].get(), hostGrad * hostData2) 77 | assert np.allclose(mul.grad[1].get(), hostGrad * hostData1) 78 | 79 | 80 | if __name__ == "__main__": 81 | unittest() 82 | 
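The backward pass of Mul above is an application of the product rule: for an elementwise product y = x_0 * x_1 * ... * x_n, the gradient with respect to x_i is the incoming gradient times the product of all the other inputs, which is exactly what the nested loops in updateGrad compute. A standalone NumPy sketch (not part of the library) that mirrors updateData/updateGrad on the host and checks the closed form for three inputs:

import numpy as np

np.random.seed(0)

inputs = [np.random.randn(2, 5, 5).astype(np.float32) for _ in range(3)]
grad = np.random.randn(2, 5, 5).astype(np.float32)

# Forward: running elementwise product, as in Mul.updateData
out = np.ones_like(inputs[0])
for x in inputs:
	out *= x

# Backward: grad w.r.t. input i is grad times the product of all other inputs,
# as in Mul.updateGrad
ingrads = []
for i in range(len(inputs)):
	g = grad.copy()
	for k, x in enumerate(inputs):
		if k != i:
			g *= x
	ingrads.append(g)

# Closed form for three inputs: d(x0*x1*x2)/dx_i = product of the other two
assert np.allclose(ingrads[0], grad * inputs[1] * inputs[2], atol=1e-5)
assert np.allclose(ingrads[1], grad * inputs[0] * inputs[2], atol=1e-5)
assert np.allclose(ingrads[2], grad * inputs[0] * inputs[1], atol=1e-5)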
-------------------------------------------------------------------------------- /Modules/Pool2D.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Backend import gpuarray 2 | from PuzzleLib.Modules.Module import ModuleError, Module 3 | 4 | 5 | class Pool2D(Module): 6 | def __init__(self, size=2, stride=2, pad=0, name=None): 7 | super().__init__(name) 8 | self.gradUsesOutData = True 9 | 10 | self.size = self.repeat(size, 2) 11 | self.stride = self.repeat(stride, 2) 12 | self.pad = self.repeat(pad, 2) 13 | 14 | self.workspace = None 15 | 16 | 17 | def dataShapeFrom(self, shape): 18 | batchsize, maps, inh, inw = shape 19 | 20 | hsize, wsize = self.size 21 | hpad, wpad = self.pad 22 | hstride, wstride = self.stride 23 | 24 | outh = (inh + 2 * hpad - hsize) // hstride + 1 25 | outw = (inw + 2 * wpad - wsize) // wstride + 1 26 | 27 | return batchsize, maps, outh, outw 28 | 29 | 30 | def checkDataShape(self, shape): 31 | if len(shape) != 4: 32 | raise ModuleError("Data must be 4d tensor") 33 | 34 | _, _, inh, inw = shape 35 | if inh + 2 * self.pad[0] < self.size[0]: 36 | raise ModuleError("Data maps height is too small (got %d, expected at least %d)" % 37 | (inh + 2 * self.pad[0], self.size[0])) 38 | 39 | if inw + 2 * self.pad[1] < self.size[1]: 40 | raise ModuleError("Data maps width is too small (got %d, expected at least %d)" % 41 | (inw + 2 * self.pad[1], self.size[1])) 42 | 43 | 44 | def gradShapeFrom(self, shape): 45 | batchsize, maps, outh, outw = shape 46 | 47 | hsize, wsize = self.size 48 | hpad, wpad = self.pad 49 | hstride, wstride = self.stride 50 | 51 | inh = (outh - 1) * hstride - 2 * hpad + hsize 52 | inw = (outw - 1) * wstride - 2 * wpad + wsize 53 | 54 | return batchsize, maps, inh, inw 55 | 56 | 57 | def checkGradShape(self, shape): 58 | if len(shape) != 4: 59 | raise ModuleError("Grad must be 4d tensor") 60 | 61 | 62 | def updateData(self, data): 63 | raise NotImplementedError() 64 | 65 | 66 | def updateGrad(self, grad): 67 | raise NotImplementedError() 68 | 69 | 70 | def reset(self): 71 | super().reset() 72 | self.workspace = None 73 | 74 | 75 | def calcMode(self, T): 76 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 77 | 78 | if T not in dtypes: 79 | raise ModuleError("Unsupported dtype %s" % T) 80 | 81 | self.calctype = T 82 | -------------------------------------------------------------------------------- /TestLib/MultiGPUMnist.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Grid import runGrid 4 | 5 | 6 | def train(nodeinfo, verbose): 7 | from PuzzleLib.Datasets import MnistLoader 8 | mnist = MnistLoader(cachename="mnist-%s.hdf" % nodeinfo.index) 9 | data, labels = mnist.load(path="../TestData/") 10 | 11 | data, labels = data[:], labels[:] 12 | print("[%s]: Loaded mnist" % nodeinfo.index) 13 | 14 | np.random.seed(1234) 15 | 16 | from PuzzleLib.Models.Nets.LeNet import loadLeNet 17 | net = loadLeNet(None, initscheme=None) 18 | 19 | from PuzzleLib.Optimizers import MomentumSGD 20 | optimizer = MomentumSGD(learnRate=0.1, momRate=0.9, nodeinfo=nodeinfo) 21 | optimizer.setupOn(net, useGlobalState=True) 22 | 23 | from PuzzleLib.Cost import CrossEntropy 24 | cost = CrossEntropy(maxlabels=10) 25 | 26 | from PuzzleLib.Handlers import Trainer, Validator 27 | trainer = Trainer(net, cost, optimizer, batchsize=128 // nodeinfo.gridsize) 28 | validator = Validator(net, cost) 29 | 30 | valsize = 10000 31 | trainsize = data.shape[0] - 
valsize 32 | 33 | trainpart = trainsize // nodeinfo.gridsize 34 | valpart = valsize // nodeinfo.gridsize 35 | 36 | for i in range(15): 37 | start, end = nodeinfo.index * trainpart, (nodeinfo.index + 1) * trainpart 38 | 39 | trainer.trainFromHost(data[start:end], labels[start:end], macroBatchSize=trainpart) 40 | trerr = cost.getMeanError() 41 | 42 | if verbose: 43 | print("[%s]: Epoch %s local train error: %s" % (nodeinfo.index, i + 1, trerr)) 44 | 45 | trerr = nodeinfo.meanValue(trerr) 46 | 47 | if nodeinfo.index == 0: 48 | print("Epoch %s global train error: %s" % (i + 1, trerr)) 49 | 50 | start, end = trainsize + nodeinfo.index * valpart, trainsize + (nodeinfo.index + 1) * valpart 51 | valerr = validator.validateFromHost(data[start:end], labels[start:end], macroBatchSize=valpart) 52 | 53 | if verbose: 54 | print("[%s]: Epoch %s local accuracy: %s" % (nodeinfo.index, i + 1, 1.0 - valerr)) 55 | 56 | valerr = nodeinfo.meanValue(valerr) 57 | 58 | if nodeinfo.index == 0: 59 | print("Epoch %s global accuracy: %s" % (i + 1, 1.0 - valerr)) 60 | 61 | optimizer.learnRate *= 0.9 62 | 63 | 64 | def main(): 65 | runGrid(target=train, size=2, verbose=True) 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /Modules/CrossMapLRN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.Dnn import crossMapLRN, crossMapLRNBackward 5 | 6 | from PuzzleLib.Modules.LRN import LRN 7 | 8 | 9 | class CrossMapLRN(LRN): 10 | def __init__(self, N=5, alpha=1e-4, beta=0.75, K=2.0, name=None): 11 | super().__init__(N, alpha, beta, K, name) 12 | self.gradUsesOutData = True 13 | 14 | 15 | def updateData(self, data): 16 | self.data, self.workspace = crossMapLRN(data, N=self.N, alpha=self.alpha, beta=self.beta, K=self.K, 17 | test=not self.train) 18 | 19 | 20 | def updateGrad(self, grad): 21 | self.grad = crossMapLRNBackward(self.inData, self.data, grad, self.workspace, 22 | N=self.N, alpha=self.alpha, beta=self.beta, K=self.K) 23 | 24 | 25 | def unittest(): 26 | maps = 10 27 | data = gpuarray.to_gpu(np.random.randn(1, maps, 1, 1).astype(np.float32)) 28 | 29 | crossMapLrn = CrossMapLRN() 30 | crossMapLrn(data) 31 | 32 | lookBehind = int((crossMapLrn.N - 1) / 2) 33 | lookAhead = crossMapLrn.N - lookBehind 34 | 35 | hostData = data.get().reshape(maps, ).astype(np.float32) 36 | norms = np.empty((maps, ), dtype=np.float32) 37 | for i in range(maps): 38 | norm = 0.0 39 | for j in range(max(0, i - lookBehind), min(maps, i + lookAhead)): 40 | norm += hostData[j]**2 41 | norms[i] = crossMapLrn.K + norm * crossMapLrn.alpha / crossMapLrn.N 42 | 43 | hostOutData = hostData / norms**crossMapLrn.beta 44 | assert np.allclose(hostOutData, crossMapLrn.data.get().reshape(maps, ).astype(np.float32)) 45 | 46 | grad = gpuarray.to_gpu(np.random.randn(1, maps, 1, 1).astype(np.float32)) 47 | crossMapLrn.backward(grad) 48 | 49 | hostGrad = grad.get().reshape(maps, ).astype(np.float32) 50 | hostInGrad = np.zeros((maps, ), dtype=np.float32) 51 | 52 | k = 2.0 * crossMapLrn.alpha * crossMapLrn.beta / crossMapLrn.N 53 | for i in range(maps): 54 | hostInGrad[i] += hostGrad[i] / norms[i]**crossMapLrn.beta 55 | 56 | for j in range(max(0, i - lookBehind), min(maps, i + lookAhead)): 57 | hostInGrad[j] -= hostGrad[i] * k * hostData[i] * hostData[j] / norms[i]**(crossMapLrn.beta+1) 58 | assert np.allclose(hostInGrad, 
crossMapLrn.grad.get().reshape(maps, ).astype(np.float32)) 59 | 60 | 61 | if __name__ == "__main__": 62 | unittest() 63 | -------------------------------------------------------------------------------- /Modules/MulAddConst.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | 5 | from PuzzleLib.Backend import gpuarray 6 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 7 | from PuzzleLib.Backend.Kernels.ElementWise import linearKer 8 | 9 | from PuzzleLib.Modules.Module import ModuleError, Module 10 | 11 | 12 | class MulAddConst(Module): 13 | def __init__(self, a=1.0, b=0.0, inplace=False, name=None): 14 | super().__init__(name) 15 | self.registerBlueprint(locals()) 16 | 17 | self.a, self.b = a, b 18 | self.inplace = inplace 19 | 20 | if inplace and Config.showWarnings: 21 | Config.getLogger().info("Warning: %s is using inplace flag", self) 22 | 23 | 24 | def updateData(self, data): 25 | self.data = data if self.inplace else gpuarray.empty(data.shape, dtype=data.dtype, allocator=memPool) 26 | linearKer(data.dtype)(self.data, data, self.a, self.b) 27 | 28 | 29 | def updateGrad(self, grad): 30 | self.grad = grad if self.inplace else gpuarray.empty(grad.shape, dtype=grad.dtype, allocator=memPool) 31 | linearKer(grad.dtype)(self.grad, grad, self.a, 0.0) 32 | 33 | 34 | def dataShapeFrom(self, shape): 35 | return shape 36 | 37 | 38 | def gradShapeFrom(self, shape): 39 | return shape 40 | 41 | 42 | def calcMode(self, T): 43 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 44 | 45 | if T not in dtypes: 46 | raise ModuleError("Unsupported dtype %s" % T) 47 | 48 | self.calctype = T 49 | 50 | 51 | def unittest(): 52 | for dtype, atol in gpuarray.dtypesSupported(): 53 | mulAddConstTest(dtype, atol) 54 | 55 | 56 | def mulAddConstTest(dtype, atol): 57 | hostData = np.random.randn(10, 10, 4, 3).astype(dtype) 58 | data = gpuarray.to_gpu(hostData) 59 | 60 | mulAdd = MulAddConst(a=3.141592, b=42.0) 61 | mulAdd.calcMode(dtype) 62 | 63 | mulAdd(data) 64 | 65 | hostOutData = (hostData.astype(np.float32) * mulAdd.a + mulAdd.b).astype(dtype) 66 | assert np.allclose(hostOutData, mulAdd.data.get(), atol=atol) 67 | 68 | hostGrad = np.random.randn(*data.shape).astype(dtype) 69 | grad = gpuarray.to_gpu(hostGrad) 70 | 71 | mulAdd.backward(grad) 72 | 73 | hostInGrad = hostGrad * mulAdd.a 74 | assert np.allclose(hostInGrad, mulAdd.grad.get(), atol=atol) 75 | 76 | 77 | if __name__ == "__main__": 78 | unittest() 79 | -------------------------------------------------------------------------------- /Modules/Gelu.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | 5 | from PuzzleLib import Config 6 | 7 | from PuzzleLib.Backend import gpuarray 8 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 9 | from PuzzleLib.Backend.Kernels.ElementWise import geluKer, geluDerKer 10 | 11 | from PuzzleLib.Modules.Module import ModuleError, Module 12 | 13 | 14 | class Gelu(Module): 15 | def __init__(self, inplace=False, name=None): 16 | super().__init__(name) 17 | self.registerBlueprint(locals()) 18 | 19 | self.inplace = inplace 20 | 21 | if inplace and Config.showWarnings: 22 | Config.getLogger().info("Warning: %s is using inplace flag", self) 23 | 24 | 25 | def updateData(self, data): 26 | self.data = data if self.inplace else gpuarray.empty(data.shape, dtype=data.dtype, allocator=memPool) 27 | geluKer(data.dtype)(self.data, 
data) 28 | 29 | 30 | def updateGrad(self, grad): 31 | self.grad = grad if self.inplace else gpuarray.empty(grad.shape, dtype=grad.dtype, allocator=memPool) 32 | geluDerKer(grad.dtype)(self.grad, grad, self.inData) 33 | 34 | 35 | def dataShapeFrom(self, shape): 36 | return shape 37 | 38 | 39 | def gradShapeFrom(self, shape): 40 | return shape 41 | 42 | 43 | def calcMode(self, T): 44 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 45 | 46 | if T not in dtypes: 47 | raise ModuleError("Unsupported dtype %s" % T) 48 | 49 | self.calctype = T 50 | 51 | 52 | def unittest(): 53 | for dtype, atol in gpuarray.dtypesSupported(): 54 | geluTest(dtype, atol) 55 | 56 | 57 | def geluTest(dtype, atol): 58 | gelu = Gelu() 59 | gelu.calcMode(dtype) 60 | 61 | hostData = np.random.randn(11, 51).astype(dtype) 62 | 63 | data = gpuarray.to_gpu(hostData) 64 | gelu(data) 65 | 66 | erf = np.vectorize(math.erf) 67 | hostOutData = 0.5 * hostData * (1.0 + erf(hostData / math.sqrt(2))) 68 | 69 | assert np.allclose(hostOutData, gelu.data.get(), atol=atol) 70 | 71 | hostGrad = np.random.randn(*gelu.data.shape).astype(dtype) 72 | 73 | grad = gpuarray.to_gpu(hostGrad) 74 | gelu.backward(grad) 75 | 76 | hostInGrad = hostGrad * (0.5 * (1.0 + erf(hostData / math.sqrt(2))) + 77 | hostData / math.sqrt(2 * math.pi) * np.exp(-0.5 * hostData**2)) 78 | assert np.allclose(hostInGrad, gelu.grad.get(), atol=atol) 79 | 80 | 81 | if __name__ == "__main__": 82 | unittest() 83 | -------------------------------------------------------------------------------- /Converter/OpenVINO/VINOEngine.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 5 | 6 | from PuzzleLib.Modules.Module import Module 7 | from PuzzleLib.Converter.OpenVINO import Driver 8 | 9 | 10 | def genEngineName(name): 11 | return "%s.xml" % name, "%s.bin" % name 12 | 13 | 14 | class VINOEngine(Module): 15 | def __init__(self, enginepath, batchsize, name=None): 16 | super().__init__(name) 17 | self.registerBlueprint(locals()) 18 | 19 | xmlpath, binpath = enginepath 20 | self.engine = Driver.VINOEngine(batchsize, xmlpath, binpath, "CPU") 21 | 22 | inshape, outshape = self.engine.inshape, self.engine.outshape 23 | 24 | inshape = [tuple(inshape[key]) for key in sorted(inshape.keys(), key=lambda nm: nm.split(sep="_")[-1])] 25 | outshape = [tuple(outshape[key]) for key in sorted(outshape.keys(), key=lambda nm: nm.split(sep="_")[-1])] 26 | 27 | self.inshape = inshape[0] if len(inshape) == 1 else inshape 28 | self.outshape = outshape[0] if len(outshape) == 1 else outshape 29 | 30 | 31 | def updateData(self, data): 32 | data = data if isinstance(data, list) else [data] 33 | inputs = {"data_%s" % i: (d.ptr, d.nbytes) for i, d in enumerate(data)} 34 | 35 | outshape = [self.outshape] if not isinstance(self.outshape, list) else self.outshape 36 | 37 | outdata = [gpuarray.empty(shape, dtype=np.float32, allocator=memPool) for shape in outshape] 38 | outputs = {"outdata_%s" % i: (data.ptr, data.nbytes) for i, data in enumerate(outdata)} 39 | 40 | self.engine.infer(outputs, inputs) 41 | self.data = outdata if isinstance(self.outshape, list) else outdata[0] 42 | 43 | 44 | def updateGrad(self, grad): 45 | assert False 46 | 47 | 48 | def dataShapeFrom(self, shape): 49 | return self.outshape 50 | 51 | 52 | def checkDataShape(self, shape): 53 | if isinstance(shape, list): 54 | for i, sh in enumerate(shape): 55 | if sh !=
self.inshape[i]: 56 | raise ValueError("Shape %s is not equal to shape %s on index %s" % (sh, self.inshape[i], i)) 57 | 58 | elif shape != self.inshape: 59 | raise ValueError("Data shape must be equal %s (was given %s)" % (self.inshape, shape)) 60 | 61 | 62 | def gradShapeFrom(self, shape): 63 | assert False 64 | 65 | 66 | def checkGradShape(self, shape): 67 | assert False 68 | -------------------------------------------------------------------------------- /Compiler/Codegen/Vector/TVector.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "$HEADER_NAME" 3 | 4 | $BODY_PREAMBULE 5 | void ${NAME}_init($NAME *self) 6 | { 7 | self->ptr = NULL; 8 | self->size = self->capacity = 0; 9 | } 10 | 11 | 12 | void ${NAME}_dealloc($NAME *self) 13 | { 14 | ${NAME}_clear(self); 15 | $FREE(self->ptr); 16 | } 17 | 18 | 19 | void ${NAME}_reserve($NAME *self, size_t capacity) 20 | { 21 | if (self->size < capacity) 22 | { 23 | $T *ptr = ($T *)$MALLOC(sizeof(self->ptr[0]) * capacity); 24 | 25 | for (size_t i = 0; i < self->size; i += 1) 26 | ptr[i] = self->ptr[i]; 27 | 28 | $FREE(self->ptr); 29 | 30 | self->ptr = ptr; 31 | self->capacity = capacity; 32 | } 33 | else 34 | { 35 | for (size_t i = capacity; i < self->size; i += 1) 36 | $DESTRUCT(self->ptr[i]); 37 | 38 | self->size = self->capacity = capacity; 39 | } 40 | } 41 | 42 | 43 | inline static void ${NAME}_ensureIsAppendable($NAME *self) 44 | { 45 | if (self->size == self->capacity) 46 | { 47 | size_t size = (self->capacity < $MIN_CAPACITY) ? $MIN_CAPACITY : self->capacity * 2; 48 | ${NAME}_reserve(self, size); 49 | } 50 | } 51 | 52 | 53 | void ${NAME}_append($NAME *self, $T elem) 54 | { 55 | ${NAME}_ensureIsAppendable(self); 56 | 57 | $BORROW(elem); 58 | self->ptr[self->size] = elem; 59 | 60 | self->size += 1; 61 | } 62 | 63 | 64 | void ${NAME}_appendEmpty($NAME *self) 65 | { 66 | ${NAME}_ensureIsAppendable(self); 67 | self->size += 1; 68 | } 69 | 70 | 71 | bool ${NAME}_pop($NAME *self, $T *elem) 72 | { 73 | if (self->size == 0) 74 | return false; 75 | 76 | self->size -= 1; 77 | *elem = self->ptr[self->size]; 78 | 79 | return true; 80 | } 81 | 82 | 83 | void ${NAME}_clear($NAME *self) 84 | { 85 | for (size_t i = 0; i < self->size; i += 1) 86 | $DESTRUCT(self->ptr[i]); 87 | 88 | self->size = 0; 89 | } 90 | 91 | 92 | bool ${NAME}_get($NAME *self, size_t index, $T *elem) 93 | { 94 | if (index >= self->size) 95 | return false; 96 | 97 | *elem = self->ptr[index]; 98 | return true; 99 | } 100 | 101 | 102 | bool ${NAME}_set($NAME *self, size_t index, $T elem) 103 | { 104 | if (index >= self->size) 105 | return false; 106 | 107 | $BORROW(elem); 108 | $DESTRUCT(self->ptr[index]); 109 | 110 | self->ptr[index] = elem; 111 | return true; 112 | } 113 | -------------------------------------------------------------------------------- /Modules/Slice.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 5 | 6 | from PuzzleLib.Modules.Module import ModuleError, Module 7 | 8 | 9 | class Slice(Module): 10 | def __init__(self, slc=None, name=None): 11 | super().__init__(name) 12 | self.registerBlueprint(locals()) 13 | 14 | self.slc = slc 15 | self.inshape = None 16 | 17 | 18 | def __getitem__(self, slc): 19 | if not isinstance(slc, tuple): 20 | slc = (slc, ) 21 | 22 | self.slc = slc 23 | return self 24 | 25 | 26 | def updateData(self, data): 27 | 
self.inshape = data.shape 28 | self.data = data[self.slc].copy(allocator=memPool) 29 | 30 | 31 | def updateGrad(self, grad): 32 | self.grad = gpuarray.zeros(self.inshape, dtype=np.float32, allocator=memPool) 33 | self.grad[self.slc] = grad 34 | 35 | 36 | def dataShapeFrom(self, shape): 37 | if self.slc is None: 38 | raise ModuleError("Slice parameter is not initialized") 39 | 40 | outshape = [None] * len(shape) 41 | for i, dim in enumerate(shape): 42 | slc = self.slc[i] 43 | start, stop, step = slc.indices(dim) 44 | 45 | outshape[i] = (stop - start) // step 46 | 47 | return tuple(outshape) 48 | 49 | 50 | def checkDataShape(self, shape): 51 | if self.slc is None: 52 | raise ModuleError("Slice parameter is not initialized") 53 | 54 | if len(shape) < len(self.slc): 55 | raise ModuleError("Expected at least %d data dimensions, %d were given" % (len(self.slc), len(shape))) 56 | 57 | 58 | def gradShapeFrom(self, shape): 59 | return self.inshape 60 | 61 | 62 | def checkGradShape(self, shape): 63 | if shape != self.data.shape: 64 | raise ModuleError("Grad shape %s is inconsistent with output data shape %s" % (shape, self.data.shape)) 65 | 66 | 67 | def unittest(): 68 | data = gpuarray.to_gpu(np.random.randn(3, 4, 5, 6).astype(np.float32)) 69 | 70 | slc = Slice()[:, :, 1:-1, 1:-1] 71 | slc(data) 72 | 73 | assert slc.dataShapeFrom(data.shape) == slc.data.shape 74 | assert np.allclose(slc.data.get(), data.get()[slc.slc]) 75 | 76 | grad = gpuarray.to_gpu(np.random.randn(*slc.data.shape).astype(np.float32)) 77 | slc.backward(grad) 78 | 79 | assert slc.gradShapeFrom(grad.shape) == data.shape 80 | assert np.allclose(slc.grad.get()[slc.slc], grad.get()) 81 | 82 | 83 | if __name__ == "__main__": 84 | unittest() 85 | -------------------------------------------------------------------------------- /Modules/Glue.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Modules.Module import ModuleError, Module 5 | 6 | 7 | class Glue(Module): 8 | def __init__(self, modules=None, fwdGlue=None, bwdGlue=None, fwdShapeGlue=None, bwdShapeGlue=None, name=None): 9 | super().__init__(name) 10 | 11 | if modules is not None and not isinstance(modules, dict): 12 | raise ModuleError("Modules object must be non-empty dictionary") 13 | 14 | self.modules = modules 15 | 16 | self.fwdGlue = fwdGlue 17 | self.bwdGlue = bwdGlue 18 | 19 | self.fwdShapeGlue = fwdShapeGlue 20 | self.bwdShapeGlue = bwdShapeGlue 21 | 22 | 23 | def updateData(self, data): 24 | self.data = self.fwdGlue(data, self.modules) 25 | 26 | 27 | def updateGrad(self, grad): 28 | self.grad = self.bwdGlue(grad, self.modules) 29 | 30 | 31 | def dataShapeFrom(self, shape): 32 | if self.fwdShapeGlue is not None: 33 | return self.fwdShapeGlue(shape) 34 | else: 35 | raise ModuleError("Forward shape glue hook is not installed") 36 | 37 | 38 | def gradShapeFrom(self, shape): 39 | if self.bwdShapeGlue is not None: 40 | return self.bwdShapeGlue(shape) 41 | else: 42 | raise ModuleError("Backward shape glue hook is not installed") 43 | 44 | 45 | def unittest(): 46 | data1 = gpuarray.to_gpu(np.random.randn(10, 2, 3, 3).astype(np.float32)) 47 | data2 = gpuarray.to_gpu(np.random.randn(10, 2, 3, 3).astype(np.float32)) 48 | data3 = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 49 | 50 | def fwdGlue(data, modules): 51 | dat1, dat2, dat3 = data 52 | split = modules["split"] 53 | out1, out2 = split(data3) 54 | 55 | return [dat1 + dat2, out1, out2] 56 | 57 
| def bwdGlue(grad, modules): 58 | gr1, gr2, gr3 = grad 59 | split = modules["split"] 60 | split.backward([gr2, gr3]) 61 | 62 | return [gr1, gr1, split.grad] 63 | 64 | from PuzzleLib.Modules.Split import Split 65 | glue = Glue(fwdGlue=fwdGlue, bwdGlue=bwdGlue, modules={"split": Split(axis=1, sections=(5, 5))}) 66 | glue([data1, data2, data3]) 67 | 68 | grad1 = gpuarray.to_gpu(np.random.randn(*glue.data[0].shape).astype(np.float32)) 69 | grad2 = gpuarray.to_gpu(np.random.randn(*glue.data[1].shape).astype(np.float32)) 70 | grad3 = gpuarray.to_gpu(np.random.randn(*glue.data[2].shape).astype(np.float32)) 71 | 72 | glue.backward([grad1, grad2, grad3]) 73 | 74 | 75 | if __name__ == "__main__": 76 | unittest() 77 | -------------------------------------------------------------------------------- /Modules/AvgPool2D.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.Dnn import PoolMode, poolNd, poolNdBackward 5 | 6 | from PuzzleLib.Modules.Pool2D import Pool2D 7 | 8 | 9 | class AvgPool2D(Pool2D): 10 | def __init__(self, size=2, stride=2, pad=0, includePad=True, name=None): 11 | super().__init__(size, stride, pad, name) 12 | self.registerBlueprint(locals()) 13 | 14 | self.mode = PoolMode.avgWithPad if includePad else PoolMode.avgNoPad 15 | 16 | 17 | def updateData(self, data): 18 | self.data, self.workspace = poolNd(data, size=self.size, stride=self.stride, pad=self.pad, mode=self.mode, 19 | test=not self.train) 20 | 21 | 22 | def updateGrad(self, grad): 23 | self.grad = poolNdBackward(self.inData, self.data, grad, self.workspace, size=self.size, stride=self.stride, 24 | pad=self.pad, mode=self.mode) 25 | 26 | 27 | def unittest(): 28 | batchsize, maps, h, w = 2, 3, 6, 6 29 | data = gpuarray.to_gpu(np.random.randn(batchsize, maps, h, w).astype(np.float32)) 30 | 31 | size = 3 32 | stride, pad = 1, 1 33 | 34 | avgpool2d = AvgPool2D(size=size, stride=stride, pad=pad, includePad=True) 35 | avgpool2d(data) 36 | 37 | grad = gpuarray.to_gpu(np.random.randn(*avgpool2d.data.shape).astype(np.float32)) 38 | avgpool2d.backward(grad) 39 | 40 | hostData = np.zeros(shape=(batchsize, maps, h + 2 * pad, w + 2 * pad), dtype=np.float32) 41 | hostData[:, :, pad:-pad, pad:-pad] = data.get() 42 | 43 | hostOutData = np.empty(avgpool2d.data.shape, dtype=np.float32) 44 | 45 | for b in range(batchsize): 46 | for c in range(maps): 47 | for y in range(avgpool2d.data.shape[2]): 48 | for x in range(avgpool2d.data.shape[3]): 49 | hostOutData[b,c,y,x] = np.sum(hostData[b,c,y*stride:y*stride+size, x*stride:x*stride+size])/size**2 50 | 51 | assert np.allclose(hostOutData, avgpool2d.data.get()) 52 | 53 | hostGrad, hostInGrad = grad.get(), np.zeros(hostData.shape, dtype=np.float32) 54 | 55 | for b in range(batchsize): 56 | for c in range(maps): 57 | for y in range(hostGrad.shape[2]): 58 | for x in range(hostGrad.shape[3]): 59 | for dy in range(size): 60 | for dx in range(size): 61 | hostInGrad[b, c, y * stride + dy, x * stride + dx] += hostGrad[b, c, y, x] / size**2 62 | 63 | assert np.allclose(hostInGrad[:, :, pad:-pad, pad:-pad], avgpool2d.grad.get()) 64 | 65 | 66 | if __name__ == "__main__": 67 | unittest() 68 | -------------------------------------------------------------------------------- /Modules/Transpose.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray, Memory 4 | from PuzzleLib.Modules.Module 
import ModuleError, Module 5 | 6 | 7 | class Transpose(Module): 8 | def __init__(self, axes=None, name=None): 9 | super().__init__(name) 10 | self.registerBlueprint(locals()) 11 | 12 | self.axes = axes 13 | 14 | if axes is None: 15 | self.invaxes = None 16 | else: 17 | self.invaxes = [0] * len(axes) 18 | for i, axis in enumerate(axes): 19 | self.invaxes[axis] = i 20 | 21 | 22 | def updateData(self, data): 23 | self.data = Memory.transpose(data, self.axes) 24 | 25 | 26 | def updateGrad(self, grad): 27 | self.grad = Memory.transpose(grad, self.invaxes) 28 | 29 | 30 | def checkDataShape(self, shape): 31 | if self.axes is not None and len(shape) != len(self.axes): 32 | raise ModuleError("Data dimension needs to be %d, (data has %d)" % (len(self.axes), len(shape))) 33 | 34 | 35 | def checkGradShape(self, shape): 36 | if self.axes is not None and len(shape) != len(self.axes): 37 | raise ModuleError("Grad dimension needs to be %d, (grad has %d)" % (len(self.axes), len(shape))) 38 | 39 | 40 | def dataShapeFrom(self, shape): 41 | return tuple(shape[axis] for axis in self.axes) 42 | 43 | 44 | def gradShapeFrom(self, shape): 45 | return tuple(shape[axis] for axis in self.invaxes) 46 | 47 | 48 | def calcMode(self, T): 49 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 50 | 51 | if T not in dtypes: 52 | raise ModuleError("Unsupported dtype %s" % T) 53 | 54 | self.calctype = T 55 | 56 | 57 | def unittest(): 58 | for dtype, _ in gpuarray.dtypesSupported(): 59 | transposeTest(dtype) 60 | 61 | 62 | def transposeTest(dtype): 63 | shape = (10, 3, 5, 4, 2) 64 | axes = (2, 4, 1, 3, 0) 65 | 66 | hostData = np.random.randn(*shape).astype(dtype) 67 | data = gpuarray.to_gpu(hostData) 68 | 69 | transpose = Transpose(axes) 70 | transpose.calcMode(dtype) 71 | 72 | transpose(data) 73 | 74 | hostOutData = np.transpose(hostData, axes=axes) 75 | assert np.allclose(hostOutData, transpose.data.get()) 76 | 77 | hostGrad = np.random.randn(*transpose.data.shape).astype(dtype) 78 | grad = gpuarray.to_gpu(hostGrad) 79 | 80 | transpose.backward(grad) 81 | 82 | hostInGrad = np.transpose(hostGrad, axes=transpose.invaxes) 83 | assert np.allclose(hostInGrad, transpose.grad.get()) 84 | 85 | 86 | if __name__ == "__main__": 87 | unittest() 88 | -------------------------------------------------------------------------------- /Modules/MapLRN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | 5 | from PuzzleLib.Backend import gpuarray 6 | from PuzzleLib.Backend.Dnn import mapLRN, mapLRNBackward 7 | 8 | from PuzzleLib.Modules.LRN import LRN 9 | 10 | 11 | class MapLRN(LRN): 12 | def __init__(self, N=5, alpha=1e-4, beta=0.75, K=2.0, name=None): 13 | super().__init__(N, alpha, beta, K, name) 14 | 15 | if Config.backend != Config.Backend.cuda: 16 | self.gradUsesOutData = True 17 | 18 | 19 | def updateData(self, data): 20 | self.data, self.workspace = mapLRN(data, None, N=self.N, alpha=self.alpha, beta=self.beta, K=self.K, 21 | test=not self.train) 22 | 23 | 24 | def updateGrad(self, grad): 25 | self.grad = mapLRNBackward(self.inData, self.data, grad, None, self.workspace, 26 | N=self.N, alpha=self.alpha, beta=self.beta, K=self.K) 27 | 28 | 29 | def unittest(): 30 | h, w = 10, 10 31 | data = gpuarray.to_gpu(np.random.randn(1, 1, h, w).astype(np.float32)) 32 | 33 | mapLrn = MapLRN() 34 | mapLrn(data) 35 | 36 | lookBehind = int((mapLrn.N - 1) / 2) 37 | lookAhead = mapLrn.N - lookBehind 38 | 39 | hostData = data.get().reshape(h, 
w).astype(np.float32) 40 | norms = np.empty((h, w), dtype=np.float32) 41 | for i in range(h): 42 | for j in range(w): 43 | norm = 0.0 44 | for m in range(max(0, i - lookBehind), min(h, i + lookAhead)): 45 | for n in range(max(0, j - lookBehind), min(w, j + lookAhead)): 46 | norm += hostData[m, n]**2 47 | norms[i, j] = mapLrn.K + norm * mapLrn.alpha / mapLrn.N**2 48 | 49 | hostOutData = hostData / norms**mapLrn.beta 50 | assert np.allclose(hostOutData, mapLrn.data.get()[0, 0]) 51 | 52 | grad = gpuarray.to_gpu(np.random.randn(1, 1, h, w).astype(np.float32)) 53 | mapLrn.backward(grad) 54 | 55 | hostGrad = grad.get().reshape(h, w).astype(np.float32) 56 | hostInGrad = np.zeros((h, w), dtype=np.float32) 57 | 58 | k = 2.0 * mapLrn.alpha * mapLrn.beta / mapLrn.N**2 59 | for i in range(h): 60 | for j in range(w): 61 | hostInGrad[i, j] += hostGrad[i, j] / norms[i, j]**mapLrn.beta 62 | 63 | for m in range(max(0, i - lookBehind), min(h, i + lookAhead)): 64 | for n in range(max(0, j - lookBehind), min(w, j + lookAhead)): 65 | hostInGrad[i, j] -= k*hostGrad[m, n]*hostData[i, j]*hostData[m, n]/norms[m, n]**(mapLrn.beta+1) 66 | 67 | assert np.allclose(hostInGrad, mapLrn.grad.get()[0, 0]) 68 | 69 | 70 | if __name__ == "__main__": 71 | unittest() 72 | -------------------------------------------------------------------------------- /Optimizers/AdaDelta.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | 5 | from PuzzleLib.Backend import gpuarray 6 | from PuzzleLib.Backend.Kernels.ElementWise import adadeltaKer 7 | 8 | from PuzzleLib.Optimizers.Optimizer import Optimizer, trainSimpleTest, trainHardTest 9 | 10 | 11 | class AdaDelta(Optimizer): 12 | def __init__(self, rho=0.95, epsilon=1e-6, nodeinfo=None): 13 | super().__init__(nodeinfo) 14 | 15 | self.rho = None 16 | self.epsilon = None 17 | 18 | self.setAttr("rho", rho) 19 | self.setAttr("epsilon", epsilon) 20 | 21 | 22 | def setupState(self, var): 23 | return { 24 | "msg": gpuarray.zeros(var.data.shape, dtype=var.data.dtype), 25 | "msdx": gpuarray.zeros(var.data.shape, dtype=var.data.dtype) 26 | } 27 | 28 | 29 | def updateVar(self, var, state, stream=None): 30 | adadeltaKer(var.data.dtype)( 31 | var.data, var.grad, state["msg"], state["msdx"], self.rho, self.epsilon, stream=stream 32 | ) 33 | 34 | 35 | def unittest(): 36 | for dtype, atol in gpuarray.dtypesSupported(): 37 | calcTest(dtype, atol) 38 | trainSimpleTest(AdaDelta, dtype) 39 | 40 | if Config.backend == Config.Backend.cuda: 41 | trainHardTest(AdaDelta, dtype) 42 | 43 | 44 | def calcTest(dtype, atol): 45 | rho, epsilon = 0.95, 1e-6 46 | shape = (11, 13) 47 | 48 | hostW, hostDw = np.random.randn(*shape).astype(dtype), np.random.randn(*shape).astype(dtype) 49 | hostMsg = (1.0 + np.random.randn(*shape)**2).astype(dtype) 50 | hostMsdx = (1.0 + np.random.randn(*shape)**2).astype(dtype) 51 | 52 | w, dw = gpuarray.to_gpu(hostW), gpuarray.to_gpu(hostDw) 53 | msg, msdx = gpuarray.to_gpu(hostMsg), gpuarray.to_gpu(hostMsdx) 54 | 55 | adadeltaKer(w.dtype)(w, dw, msg, msdx, rho, epsilon) 56 | 57 | hostW, hostDw = hostW.astype(np.float32), hostDw.astype(np.float32) 58 | hostMsg, hostMsdx = hostMsg.astype(np.float32), hostMsdx.astype(np.float32) 59 | 60 | hostMsg += (1.0 - rho) * (hostDw * hostDw - hostMsg) 61 | hostDx = np.sqrt((hostMsdx + epsilon) / (hostMsg + epsilon)) * hostDw 62 | hostMsdx += (1.0 - rho) * (hostDx**2 - hostMsdx) 63 | hostW += hostDx 64 | 65 | hostW, hostDw = hostW.astype(dtype), 
hostDw.astype(dtype) 66 | hostMsg, hostMsdx = hostMsg.astype(dtype), hostMsdx.astype(dtype) 67 | 68 | assert np.allclose(hostMsg, msg.get(), atol=atol) 69 | assert np.allclose(hostMsdx, msdx.get(), atol=atol) 70 | assert np.allclose(hostW, w.get(), atol=atol) 71 | 72 | 73 | if __name__ == "__main__": 74 | unittest() 75 | -------------------------------------------------------------------------------- /TestLib/MultiGPUCifar10.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Grid import runGrid 4 | 5 | 6 | def train(nodeinfo, verbose): 7 | from PuzzleLib.Datasets import Cifar10Loader 8 | cifar10 = Cifar10Loader(cachename="cifar10-%s.hdf" % nodeinfo.index) 9 | data, labels = cifar10.load(path="../TestData/") 10 | 11 | data, labels = data[:], labels[:] 12 | print("[%s]: Loaded cifar10" % nodeinfo.index) 13 | 14 | np.random.seed(1234) 15 | 16 | from PuzzleLib.TestLib.CnnCifar10Simple import buildNet 17 | net = buildNet() 18 | 19 | from PuzzleLib.Optimizers import MomentumSGD 20 | optimizer = MomentumSGD(learnRate=0.01, momRate=0.9, nodeinfo=nodeinfo) 21 | optimizer.setupOn(net, useGlobalState=True) 22 | 23 | from PuzzleLib.Cost import CrossEntropy 24 | cost = CrossEntropy(maxlabels=10) 25 | 26 | from PuzzleLib.Handlers import Trainer, Validator 27 | trainer = Trainer(net, cost, optimizer, batchsize=128 // nodeinfo.gridsize) 28 | validator = Validator(net, cost) 29 | 30 | import math 31 | currerror = math.inf 32 | 33 | valsize = 10000 34 | trainsize = data.shape[0] - valsize 35 | 36 | trainpart = trainsize // nodeinfo.gridsize 37 | valpart = valsize // nodeinfo.gridsize 38 | 39 | for i in range(25): 40 | start, end = nodeinfo.index * trainpart, (nodeinfo.index + 1) * trainpart 41 | 42 | trainer.trainFromHost(data[start:end], labels[start:end], macroBatchSize=trainpart) 43 | trerr = cost.getMeanError() 44 | 45 | if verbose: 46 | print("[%s]: Epoch %s local train error: %s" % (nodeinfo.index, i + 1, trerr)) 47 | 48 | trerr = nodeinfo.meanValue(trerr) 49 | 50 | if nodeinfo.index == 0: 51 | print("Epoch %s global train error: %s" % (i + 1, trerr)) 52 | 53 | start, end = trainsize + nodeinfo.index * valpart, trainsize + (nodeinfo.index + 1) * valpart 54 | valerr = validator.validateFromHost(data[start:end], labels[start:end], macroBatchSize=valpart) 55 | 56 | if verbose: 57 | print("[%s]: Epoch %s local accuracy: %s" % (nodeinfo.index, i + 1, 1.0 - valerr)) 58 | 59 | valerr = nodeinfo.meanValue(valerr) 60 | 61 | if nodeinfo.index == 0: 62 | print("Epoch %s global accuracy: %s" % (i + 1, 1.0 - valerr)) 63 | 64 | if valerr >= currerror: 65 | optimizer.learnRate *= 0.5 66 | print("[%s]: Lowered learn rate: %s" % (nodeinfo.index, optimizer.learnRate)) 67 | 68 | currerror = valerr 69 | 70 | 71 | def main(): 72 | runGrid(target=train, size=2, verbose=True) 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /CPU/Wrappers/NumpyBlas.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.CPU.CPUArray import CPUArray 4 | from PuzzleLib.CPU.Kernels import ElementWise 5 | 6 | 7 | def sumOnMatrix(A, out=None, cols=True, alpha=1.0, beta=0.0): 8 | assert A.ndim == 2 9 | assert A.flags.c_contiguous 10 | assert A.dtype == np.float32 11 | 12 | if out is None: 13 | out = CPUArray.empty((A.shape[1], ) if cols else (A.shape[0], ), dtype=np.float32) 14 | 15 | if alpha == 1.0 and 
beta == 0.0: 16 | np.sum(A.data, axis=0 if cols else 1, out=out.data) 17 | 18 | else: 19 | s = np.sum(A.data, axis=0 if cols else 1) 20 | np.add(beta * out.data, alpha * s, out=out.data) 21 | 22 | return out 23 | 24 | 25 | def toVectorAddVector(y, x, alpha=1.0): 26 | assert x.ndim == 1 27 | assert x.shape == y.shape 28 | assert y.flags.forc and x.flags.forc 29 | 30 | assert x.dtype == y.dtype 31 | assert x.dtype == np.float32 32 | 33 | ElementWise.toVectorAddVectorKer(y.dtype)(y, x, alpha) 34 | 35 | 36 | def addVectorToVector(x, y, out=None, alpha=1.0, beta=1.0): 37 | assert x.ndim == 1 38 | assert x.flags.forc and y.flags.forc 39 | assert x.shape == y.shape 40 | assert x.dtype == y.dtype and x.dtype == np.float32 41 | 42 | if out is None: 43 | out = CPUArray.empty(x.shape, dtype=np.float32) 44 | 45 | ElementWise.addVectorToVectorKer(out, x, y, alpha, beta) 46 | return out 47 | 48 | 49 | def vectorL1Norm(x): 50 | assert x.ndim == 1 51 | assert x.flags.forc 52 | assert x.dtype == np.float32 53 | 54 | return np.linalg.norm(x.data, ord=1) 55 | 56 | 57 | def dot(x, y): 58 | assert x.ndim == 1 59 | assert x.shape == y.shape 60 | assert x.flags.forc and y.flags.forc 61 | assert x.dtype == y.dtype and y.dtype == np.float32 62 | 63 | return np.vdot(x.data, y.data) 64 | 65 | 66 | def mulMatrixOnMatrix(A, B, out=None, transpA=False, transpB=False, alpha=1.0, beta=0.0): 67 | assert not (transpA and transpB) 68 | assert A.ndim == 2 and B.ndim == 2 69 | 70 | assert alpha == 1.0 and beta == 0.0 71 | 72 | if transpA: 73 | assert A.shape[0] == B.shape[0] 74 | shape = (A.shape[1], B.shape[1]) 75 | 76 | elif transpB: 77 | assert A.shape[1] == B.shape[1] 78 | shape = (A.shape[0], B.shape[0]) 79 | 80 | else: 81 | assert A.shape[1] == B.shape[0] 82 | shape = (A.shape[0], B.shape[1]) 83 | 84 | A = A.data.T if transpA else A.data 85 | B = B.data.T if transpB else B.data 86 | 87 | if out is None: 88 | out = CPUArray.empty(shape, dtype=np.float32) 89 | 90 | np.dot(A, B, out=out.data) 91 | return out 92 | -------------------------------------------------------------------------------- /Backend/Kernels/MatVec.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | 3 | 4 | addVecToMat = None 5 | argmax = None 6 | 7 | addVecToMatBatch = None 8 | argmaxBatch = None 9 | 10 | 11 | def autoinit(): 12 | if not Config.shouldInit(): 13 | return 14 | 15 | if Config.backend == Config.Backend.cuda: 16 | initCuda() 17 | elif Config.backend == Config.Backend.hip: 18 | initHip() 19 | elif Config.isCPUBased(Config.backend): 20 | initCPU() 21 | else: 22 | raise Config.ConfigError(Config.backend) 23 | 24 | 25 | def initCuda(): 26 | from PuzzleLib.Cuda import Backend 27 | initGPU(Backend) 28 | 29 | 30 | def initHip(): 31 | from PuzzleLib.Hip import Backend 32 | initGPU(Backend) 33 | 34 | 35 | def initGPU(Backend): 36 | backend = Backend.getBackend(Config.deviceIdx, initmode=2, logger=Config.getLogger()) 37 | memoryPool, matmod = backend.memoryPool, backend.matmod 38 | 39 | def wrapAddVecToMat(vec, mat, axis, out): 40 | return matmod.addVecToMat(vec, mat, axis, out, memoryPool) 41 | 42 | def wrapArgmax(tensor, axis): 43 | return matmod.argmax(tensor, axis, memoryPool) 44 | 45 | global addVecToMat, argmax 46 | addVecToMat = wrapAddVecToMat 47 | argmax = wrapArgmax 48 | 49 | def wrapAddVecToMatBatch(vec, mat, axis, out): 50 | return matmod.addVecToMat(vec, mat, axis, out, memoryPool) 51 | 52 | def wrapArgmaxBatch(tensor, axis): 53 | return matmod.argmax(tensor, axis, 
memoryPool) 54 | 55 | global addVecToMatBatch, argmaxBatch 56 | addVecToMatBatch = wrapAddVecToMatBatch 57 | argmaxBatch = wrapArgmaxBatch 58 | 59 | 60 | def initCPU(): 61 | import numpy as np 62 | from PuzzleLib.CPU.CPUArray import CPUArray 63 | 64 | def wrapAddVecToMat(v, m, axis, out): 65 | if axis == 0: 66 | v = v[:, np.newaxis] 67 | elif axis == 1: 68 | v = v[np.newaxis, :] 69 | 70 | np.add(m.get(copy=False), v.get(copy=False), out=out.get(copy=False)) 71 | 72 | def wrapArgmax(mats, axis): 73 | out = np.empty(mats.shape[:axis] + mats.shape[axis + 1:], dtype=np.int32) 74 | np.argmax(mats.get(copy=False), axis, out=out) 75 | 76 | return CPUArray(out.shape, out.dtype, data=out, acquire=True) 77 | 78 | global addVecToMat, argmax 79 | addVecToMat = wrapAddVecToMat 80 | argmax = wrapArgmax 81 | 82 | def wrapArgmax(mats, axis): 83 | out = np.empty(mats.shape[:axis] + mats.shape[axis + 1:], dtype=np.int32) 84 | np.argmax(mats.get(copy=False), axis, out=out) 85 | 86 | return CPUArray(out.shape, out.dtype, data=out, acquire=True) 87 | 88 | global argmaxBatch 89 | argmaxBatch = wrapArgmax 90 | 91 | 92 | autoinit() 93 | -------------------------------------------------------------------------------- /CPU/Kernels/Upsample2D.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Compiler.Codegen.Types import void_t, int32_t, float_t 4 | 5 | from PuzzleLib.CPU.SourceModule import SourceModule 6 | from PuzzleLib.CPU.CPUArray import CPUArray 7 | 8 | 9 | upsample2dNearestTmpl = """ 10 | 11 | static void upsample2dNearest(float * __restrict outdata, const float * __restrict indata, 12 | int32_t batchsize, int32_t maps, int32_t inh, int32_t inw, int32_t hscale, int32_t wscale) 13 | { 14 | int32_t outh = inh * hscale, outw = inw * wscale; 15 | 16 | for (int32_t z = 0; z < batchsize * maps; z++) 17 | for (int32_t y = 0; y < inh; y++) 18 | for (int32_t x = 0; x < inw; x++) 19 | for (int32_t i = 0; i < hscale; i++) 20 | for (int32_t j = 0; j < wscale; j++) 21 | { 22 | int32_t outidx = z * outh * outw + (y * hscale + i) * outw + (x * wscale + j); 23 | outdata[outidx] = indata[z * inh * inw + y * inw + x]; 24 | } 25 | } 26 | 27 | """ 28 | 29 | 30 | nearestMod = SourceModule(upsample2dNearestTmpl, functions=[ 31 | ("upsample2dNearest", void_t, [ 32 | (float_t.ptr.restrict, "outdata"), (float_t.const.ptr.restrict, "indata"), (int32_t, "batchsize"), 33 | (int32_t, "maps"), (int32_t, "inh"), (int32_t, "inw"), (int32_t, "hscale"), (int32_t, "wscale") 34 | ], True) 35 | ]) 36 | 37 | 38 | def upsample2d(data, scale, mode="nearest"): 39 | batchsize, maps, inh, inw = data.shape 40 | hscale, wscale = (scale, scale) if isinstance(scale, int) else scale 41 | 42 | outh, outw = hscale * inh, wscale * inw 43 | outdata = CPUArray.empty((batchsize, maps, outh, outw), dtype=data.dtype) 44 | 45 | if mode == "nearest": 46 | nearestMod.upsample2dNearest(outdata.data, data.data, batchsize, maps, inh, inw, hscale, wscale) 47 | 48 | else: 49 | raise ValueError("Unsupported upsampling mode") 50 | 51 | return outdata 52 | 53 | 54 | def unittest(): 55 | batchsize, maps, inh, inw = 3, 2, 16, 15 56 | scale = 2 57 | 58 | data = CPUArray.toDevice(np.random.uniform(low=-1.0, high=1.0, size=(batchsize, maps, inh, inw)).astype(np.float32)) 59 | outdata = upsample2d(data, scale, mode="nearest") 60 | 61 | hostData = data.get() 62 | hostOutData = np.empty(outdata.shape, dtype=np.float32) 63 | 64 | for b in range(batchsize): 65 | for c in range(maps): 66 | for y in 
range(inh): 67 | for x in range(inw): 68 | hostOutData[b, c, y * scale:(y + 1) * scale, x * scale:(x + 1) * scale] = hostData[b, c, y, x] 69 | 70 | assert np.allclose(hostOutData, outdata.get()) 71 | 72 | 73 | if __name__ == "__main__": 74 | unittest() 75 | -------------------------------------------------------------------------------- /TestLib/CnnCifar10Simple.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | 5 | from PuzzleLib.Containers import Sequential 6 | from PuzzleLib.Modules import Conv2D, MaxPool2D, Activation, relu, Flatten, Linear 7 | 8 | from PuzzleLib.Datasets import Cifar10Loader 9 | from PuzzleLib.Visual import showImageBasedFilters, showFilters 10 | from PuzzleLib.Handlers import Trainer, Validator 11 | from PuzzleLib.Optimizers import MomentumSGD 12 | from PuzzleLib.Cost import CrossEntropy 13 | 14 | 15 | def buildNet(): 16 | seq = Sequential() 17 | 18 | seq.append(Conv2D(3, 32, 5, pad=2, wscale=0.0001, initscheme="gaussian")) 19 | seq.append(MaxPool2D(3, 2)) 20 | seq.append(Activation(relu)) 21 | 22 | seq.append(Conv2D(32, 32, 5, pad=2, wscale=0.01, initscheme="gaussian")) 23 | seq.append(MaxPool2D(3, 2)) 24 | seq.append(Activation(relu)) 25 | 26 | seq.append(Conv2D(32, 64, 5, pad=2, wscale=0.01, initscheme="gaussian")) 27 | seq.append(MaxPool2D(3, 2)) 28 | seq.append(Activation(relu)) 29 | 30 | seq.append(Flatten()) 31 | seq.append(Linear(seq.dataShapeFrom((1, 3, 32, 32))[1], 64, wscale=0.1, initscheme="gaussian")) 32 | seq.append(Activation(relu)) 33 | 34 | seq.append(Linear(64, 10, wscale=0.1, initscheme="gaussian")) 35 | return seq 36 | 37 | 38 | def main(): 39 | cifar10 = Cifar10Loader() 40 | data, labels = cifar10.load(path="../TestData/") 41 | data, labels = data[:], labels[:] 42 | print("Loaded cifar10") 43 | 44 | np.random.seed(1234) 45 | net = buildNet() 46 | 47 | optimizer = MomentumSGD() 48 | optimizer.setupOn(net, useGlobalState=True) 49 | optimizer.learnRate = 0.01 50 | optimizer.momRate = 0.9 51 | 52 | cost = CrossEntropy(maxlabels=10) 53 | trainer = Trainer(net, cost, optimizer) 54 | 55 | validator = Validator(net, cost) 56 | currerror = math.inf 57 | 58 | for i in range(25): 59 | trainer.trainFromHost( 60 | data[:50000], labels[:50000], macroBatchSize=50000, 61 | onMacroBatchFinish=lambda train: print("Train error: %s" % train.cost.getMeanError()) 62 | ) 63 | valerror = validator.validateFromHost(data[50000:], labels[50000:], macroBatchSize=10000) 64 | print("Accuracy: %s" % (1.0 - valerror)) 65 | 66 | if valerror >= currerror: 67 | optimizer.learnRate *= 0.5 68 | print("Lowered learn rate: %s" % optimizer.learnRate) 69 | 70 | currerror = valerror 71 | 72 | showImageBasedFilters(net[0].W.get(), "../TestData/conv1.png") 73 | showFilters(net[3].W.get(), "../TestData/conv2.png") 74 | showFilters(net[6].W.get(), "../TestData/conv3.png") 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /Compiler/Codegen/Map/Generate.py: -------------------------------------------------------------------------------- 1 | import os, random 2 | from string import Template 3 | 4 | from PuzzleLib.Compiler.Toolchain import createTemplateNames, writeTemplates, buildTemplateTest 5 | 6 | 7 | def generateMap(name, K, V, hasher, compareKeys, borrowKey, borrowValue, 8 | destructKey="(void)", destructValue="(void)", minLog2Capacity=4, 9 | headerPreambule=None, bodyPreambule=None, malloc="malloc", free="free", 
filename=None): 10 | headerPreambule = "%s\n\n" % headerPreambule if headerPreambule is not None else "" 11 | bodyPreambule = "%s\n\n" % bodyPreambule if bodyPreambule is not None else "" 12 | 13 | filename = name if filename is None else filename 14 | headername, bodyname = createTemplateNames(filename) 15 | 16 | dirname = os.path.dirname(__file__) 17 | headerTmpl, bodyTmpl = os.path.join(dirname, "TMap.h"), os.path.join(dirname, "TMap.c") 18 | 19 | with open(headerTmpl, mode="r", encoding="utf-8") as f: 20 | header = Template(f.read()).substitute(HEADER_PREAMBULE=headerPreambule, NAME=name, K=K, V=V) 21 | 22 | with open(bodyTmpl, mode="r", encoding="utf-8") as f: 23 | body = Template(f.read()).substitute( 24 | HEADER_NAME=os.path.basename(headername), BODY_PREAMBULE=bodyPreambule, NAME=name, K=K, V=V, 25 | MIN_LOG2_CAPACITY=minLog2Capacity, MALLOC=malloc, FREE=free, 26 | HASHER=hasher, COMPARE_KEYS=compareKeys, BORROW_KEY=borrowKey, BORROW_VALUE=borrowValue, 27 | DESTRUCT_KEY=destructKey, DESTRUCT_VALUE=destructValue 28 | ) 29 | 30 | writeTemplates([ 31 | (header, headername), 32 | (body, bodyname) 33 | ]) 34 | 35 | return bodyname 36 | 37 | 38 | def unittest(): 39 | IntMap = buildTemplateTest( 40 | name="IntMap", bindingName="TMapTest.c", path="../../TestData", generator=generateMap, K="int", V="int", 41 | hasher="hashKey", compareKeys="compareKeys", borrowKey="(int)", borrowValue="(int)", 42 | bodyPreambule=""" 43 | inline static size_t hashKey(int key) { return key; } 44 | inline static bool compareKeys(int key1, int key2) { return key1 == key2; } 45 | """) 46 | 47 | size = 1 << 16 48 | 49 | keys, values = list(range(size)), list(range(size)) 50 | random.shuffle(keys) 51 | random.shuffle(values) 52 | 53 | pymap = {k: v for k, v in zip(keys, values)} 54 | 55 | intmap = IntMap.IntMap() 56 | 57 | for k, v in pymap.items(): 58 | intmap[k] = v 59 | 60 | assert len(intmap) == size 61 | 62 | for k in pymap.keys(): 63 | assert intmap[k] == pymap[k] 64 | 65 | for k in pymap.keys(): 66 | del intmap[k] 67 | 68 | assert len(intmap) == 0 69 | 70 | 71 | if __name__ == "__main__": 72 | unittest() 73 | -------------------------------------------------------------------------------- /Modules/MaxPool2D.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.Kernels import Pool 5 | from PuzzleLib.Backend.Dnn import PoolMode, poolNd, poolNdBackward 6 | 7 | from PuzzleLib.Modules.Pool2D import Pool2D 8 | 9 | 10 | class MaxPool2D(Pool2D): 11 | def __init__(self, size=2, stride=2, pad=0, useMask=False, name=None): 12 | super().__init__(size, stride, pad, name) 13 | self.registerBlueprint(locals()) 14 | 15 | self.useMask = useMask 16 | self.mask = None 17 | 18 | self.mode = PoolMode.max 19 | 20 | 21 | @property 22 | def withMask(self): 23 | return self.useMask 24 | 25 | 26 | @withMask.setter 27 | def withMask(self, val): 28 | self.useMask = val 29 | self.gradUsesOutData = False if val else True 30 | 31 | 32 | def updateData(self, data): 33 | if self.useMask: 34 | self.data, self.mask = Pool.maxpool2d(data, size=self.size, stride=self.stride, pad=self.pad) 35 | else: 36 | test = not self.train 37 | self.data, self.workspace = poolNd(data, size=self.size, stride=self.stride, pad=self.pad, 38 | mode=self.mode, test=test) 39 | 40 | 41 | def updateGrad(self, grad): 42 | if self.useMask: 43 | self.grad = Pool.maxpool2dBackward(grad, self.inData.shape, self.mask, 44 | size=self.size, 
stride=self.stride, pad=self.pad) 45 | else: 46 | self.grad = poolNdBackward(self.inData, self.data, grad, self.workspace, 47 | size=self.size, stride=self.stride, pad=self.pad, mode=self.mode) 48 | 49 | 50 | def reset(self): 51 | super().reset() 52 | self.mask = None 53 | 54 | 55 | def unittest(): 56 | batchsize, maps, h, w = 1, 1, 6, 6 57 | data = gpuarray.to_gpu(np.random.randn(batchsize, maps, h, w).astype(np.float32)) 58 | 59 | maxpool2d = MaxPool2D() 60 | maxpool2d(data) 61 | 62 | grad = gpuarray.to_gpu(np.random.randn(*maxpool2d.data.shape).astype(np.float32)) 63 | maxpool2d.backward(grad) 64 | 65 | def maxDownSample2d(dat, factor): 66 | trimrows = dat.shape[0] // factor * factor 67 | trimcols = dat.shape[1] // factor * factor 68 | 69 | maxSoFar = None 70 | first = True 71 | 72 | for coff in range(factor): 73 | for roff in range(factor): 74 | hopped = dat[roff:trimrows:factor, coff:trimcols:factor] 75 | if first: 76 | maxSoFar = hopped 77 | first = False 78 | else: 79 | maxSoFar = np.maximum(maxSoFar, hopped) 80 | 81 | return maxSoFar 82 | 83 | hostOutData = maxDownSample2d(data.get()[0, 0], 2) 84 | assert np.allclose(hostOutData, maxpool2d.data.get()[0, 0]) 85 | 86 | 87 | if __name__ == "__main__": 88 | unittest() 89 | -------------------------------------------------------------------------------- /Modules/Pool3D.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Backend import gpuarray 2 | from PuzzleLib.Modules.Module import ModuleError, Module 3 | 4 | 5 | class Pool3D(Module): 6 | def __init__(self, size=2, stride=2, pad=0, name=None): 7 | super().__init__(name) 8 | self.gradUsesOutData = True 9 | 10 | self.size = self.repeat(size, 3) 11 | self.stride = self.repeat(stride, 3) 12 | self.pad = self.repeat(pad, 3) 13 | 14 | self.workspace = None 15 | 16 | 17 | def dataShapeFrom(self, shape): 18 | batchsize, maps, ind, inh, inw = shape 19 | 20 | dsize, hsize, wsize = self.size 21 | dpad, hpad, wpad = self.pad 22 | dstride, hstride, wstride = self.stride 23 | 24 | outd = (ind + 2 * dpad - dsize) // dstride + 1 25 | outh = (inh + 2 * hpad - hsize) // hstride + 1 26 | outw = (inw + 2 * wpad - wsize) // wstride + 1 27 | 28 | return batchsize, maps, outd, outh, outw 29 | 30 | 31 | def checkDataShape(self, shape): 32 | if len(shape) != 5: 33 | raise ModuleError("Data must be 5d tensor") 34 | 35 | _, _, ind, inh, inw = shape 36 | if ind + 2 * self.pad[0] < self.size[0]: 37 | raise ModuleError("Data cube time is too small (got %d, expected at least %d)" % 38 | (ind + 2 * self.pad[0], self.size[0])) 39 | 40 | if inh + 2 * self.pad[1] < self.size[1]: 41 | raise ModuleError("Data cube height is too small (got %d, expected at least %d)" % 42 | (inh + 2 * self.pad[1], self.size[1])) 43 | 44 | if inw + 2 * self.pad[2] < self.size[2]: 45 | raise ModuleError("Data cube width is too small (got %d, expected at least %d)" % 46 | (inw + 2 * self.pad[2], self.size[2])) 47 | 48 | 49 | def gradShapeFrom(self, shape): 50 | batchsize, maps, outd, outh, outw = shape 51 | 52 | dsize, hsize, wsize = self.size 53 | dpad, hpad, wpad = self.pad 54 | dstride, hstride, wstride = self.stride 55 | 56 | ind = (outd - 1) * dstride - 2 * dpad + dsize 57 | inh = (outh - 1) * hstride - 2 * hpad + hsize 58 | inw = (outw - 1) * wstride - 2 * wpad + wsize 59 | 60 | return batchsize, maps, ind, inh, inw 61 | 62 | 63 | def checkGradShape(self, shape): 64 | if len(shape) != 5: 65 | raise ModuleError("Grad must be 5d tensor") 66 | 67 | 68 | def updateData(self, data): 69 | 
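# Left abstract here: MaxPool3D and AvgPool3D supply updateData/updateGrad
# through poolNd/poolNdBackward. For reference, dataShapeFrom above applies
# out = (in + 2 * pad - size) // stride + 1 per spatial dimension, and
# gradShapeFrom inverts it as in = (out - 1) * stride - 2 * pad + size.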
raise NotImplementedError() 70 | 71 | 72 | def updateGrad(self, grad): 73 | raise NotImplementedError() 74 | 75 | 76 | def reset(self): 77 | super().reset() 78 | self.workspace = None 79 | 80 | 81 | def calcMode(self, T): 82 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 83 | 84 | if T not in dtypes: 85 | raise ModuleError("Unsupported dtype %s" % T) 86 | 87 | self.calctype = T 88 | -------------------------------------------------------------------------------- /Compiler/Compilers/GCC.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from PuzzleLib.Compiler.Compilers.Compiler import Compiler 4 | 5 | 6 | class GCCLike(Compiler): 7 | cflags = ["-Wall", "-Wextra"] 8 | ldflags = ["--shared"] 9 | 10 | 11 | def __init__(self, verbose, forPython=True): 12 | super().__init__(verbose, forPython=forPython) 13 | 14 | if sys.platform == "linux": 15 | self.cflags = self.cflags + ["-fPIC"] 16 | 17 | 18 | def objectLine(self, extfile, sourcefiles): 19 | return self.fullCFlags(asObject=True) + self.outFlags(extfile) + sourcefiles 20 | 21 | 22 | def linkLine(self, extfile, objfiles): 23 | return self.fullLDFlags() + self.outFlags(extfile) + objfiles + self.linkFlags() 24 | 25 | 26 | def buildLine(self, extfile, sourcefiles): 27 | return self.fullCFlags(asObject=False) + self.fullLDFlags() + self.outFlags(extfile) + \ 28 | sourcefiles + self.linkFlags() 29 | 30 | 31 | def depLine(self, sourcefiles): 32 | return ["-M"] + self.fullCFlags(asObject=False, debug=False, optimize=False) + sourcefiles 33 | 34 | 35 | def fullCFlags(self, asObject, debug=True, optimize=True): 36 | oflags = self.fullCppFlags() 37 | 38 | if debug and self.debuglevel > 0: 39 | oflags.append("-g3" if self.debuglevel >= 3 else "-g") 40 | 41 | if optimize and self.optlevel > 0: 42 | oflags.append("-O3" if self.optlevel >= 3 else "-O%s" % self.optlevel) 43 | 44 | if self.optlevel >= 3: 45 | oflags.extend(["-march=native", "-mtune=native", "-ffast-math"]) 46 | 47 | if debug and self.debuglevel >= 3: 48 | oflags.append("-fno-omit-frame-pointer") 49 | 50 | oflags.extend("-D%s" % define for define in self.defines) 51 | return self.cflags + oflags + ["-I%s" % idir for idir in self.includeDirs] + (["-c"] if asObject else []) 52 | 53 | 54 | def fullCppFlags(self): 55 | return ["-std=c++14" if self.cpp else "-std=gnu99"] 56 | 57 | 58 | def fullLDFlags(self): 59 | return self.ldflags + ["-L%s" % ldir for ldir in self.libraryDirs] 60 | 61 | 62 | def outFlags(self, extfile): 63 | outFlags = ["-o", extfile] 64 | 65 | if self.optlevel >= 4: 66 | outFlags.append("-flto") 67 | 68 | return outFlags 69 | 70 | 71 | def linkFlags(self): 72 | return ["-l%s" % lib for lib in self.libraries] 73 | 74 | 75 | class GCC(GCCLike): 76 | cc = "gcc" 77 | 78 | 79 | class Clang(GCCLike): 80 | cc = "clang" 81 | 82 | 83 | def fullCppFlags(self): 84 | return ["-std=c++14" if self.cpp else "-std=c99"] 85 | 86 | 87 | def outFlags(self, extfile): 88 | outflags = super().outFlags(extfile) 89 | 90 | if sys.platform == "win32": 91 | outflags.append("-fuse-ld=lld") 92 | 93 | return outflags 94 | -------------------------------------------------------------------------------- /Modules/SoftMax.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.Dnn import softmaxNd, softmaxNdBackward 5 | 6 | from PuzzleLib.Modules.Module import ModuleError, Module 7 | 8 | 9 | class SoftMax(Module): 
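# updateData and updateGrad below pad the input shape with trailing singleton
# dimensions (ndim = max(0, 4 - len(shape))) before calling softmaxNd, then
# reshape the result back, letting tensors of rank below 4 pass through a
# backend primitive that apparently operates on 4d tensors.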
10 | def __init__(self, name=None): 11 | super().__init__(name) 12 | self.gradUsesOutData = True 13 | 14 | 15 | def updateData(self, data): 16 | shape = data.shape 17 | ndim = max(0, 4 - len(shape)) 18 | 19 | data = data.reshape(shape + tuple(1 for _ in range(ndim))) 20 | self.data = softmaxNd(data).reshape(shape) 21 | 22 | 23 | def updateGrad(self, grad): 24 | shape = grad.shape 25 | ndim = max(0, 4 - len(shape)) 26 | 27 | grad = grad.reshape(shape + tuple(1 for _ in range(ndim))) 28 | data = self.data.reshape(shape + tuple(1 for _ in range(ndim))) 29 | 30 | self.grad = softmaxNdBackward(data, grad).reshape(shape) 31 | 32 | 33 | def dataShapeFrom(self, shape): 34 | return shape 35 | 36 | 37 | def gradShapeFrom(self, shape): 38 | return shape 39 | 40 | 41 | def calcMode(self, T): 42 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 43 | 44 | if T not in dtypes: 45 | raise ModuleError("Unsupported dtype %s" % T) 46 | 47 | self.calctype = T 48 | 49 | 50 | def unittest(): 51 | batchsize, maps = 2, 3 52 | 53 | hostData = np.random.randn(batchsize, maps, 1).astype(np.float32) 54 | data = gpuarray.to_gpu(hostData) 55 | 56 | softmax = SoftMax() 57 | softmax(data) 58 | 59 | def softMaxForward(w): 60 | e = np.exp(w - np.amax(w)) 61 | p = e / np.sum(e) 62 | return p 63 | 64 | hostData = hostData.reshape(batchsize, maps).astype(np.float32) 65 | 66 | hostOutData = np.vstack([softMaxForward(hostData[i]) for i in range(batchsize)]) 67 | assert np.allclose(hostOutData, softmax.data.get().reshape(batchsize, maps).astype(np.float32)) 68 | 69 | hostGrad = np.random.randn(batchsize, maps, 1, 1).astype(np.float32) 70 | grad = gpuarray.to_gpu(hostGrad) 71 | 72 | softmax.backward(grad) 73 | hostGrad = hostGrad.reshape(batchsize, maps).astype(np.float32) 74 | 75 | def softMaxBackward(outdata, gr): 76 | ingrad = np.zeros(outdata.shape, dtype=np.float32) 77 | for i in range(ingrad.shape[0]): 78 | ingrad[i] += outdata[i] * gr[i] 79 | 80 | for j in range(outdata.shape[0]): 81 | ingrad[i] -= outdata[i] * outdata[j] * gr[j] 82 | return ingrad 83 | 84 | hostInGrad = np.vstack([softMaxBackward(hostOutData[i], hostGrad[i]) for i in range(batchsize)]) 85 | assert np.allclose(hostInGrad, softmax.grad.get().reshape(batchsize, maps).astype(np.float32)) 86 | 87 | 88 | if __name__ == "__main__": 89 | unittest() 90 | -------------------------------------------------------------------------------- /Backend/Kernels/Costs.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | 3 | 4 | bceKer = None 5 | hingeKer = None 6 | smoothL1Ker = None 7 | l1HingeKer = None 8 | 9 | getAccuracyKernel = None 10 | crossEntropyKernel = None 11 | svmKernel = None 12 | 13 | ctcLoss = None 14 | ctcLossTest = None 15 | 16 | 17 | def autoinit(): 18 | if not Config.shouldInit(): 19 | return 20 | 21 | if Config.backend == Config.Backend.cuda: 22 | initCuda() 23 | elif Config.backend == Config.Backend.hip: 24 | initHip() 25 | elif Config.backend == Config.Backend.cpu: 26 | initCPU() 27 | elif Config.backend == Config.Backend.intel: 28 | initIntel() 29 | else: 30 | raise Config.ConfigError(Config.backend) 31 | 32 | 33 | def initCuda(): 34 | from PuzzleLib.Cuda import Backend 35 | from PuzzleLib.Cuda.Kernels import CTC 36 | 37 | initGPU(Backend, CTC) 38 | 39 | 40 | def initHip(): 41 | from PuzzleLib.Hip import Backend 42 | from PuzzleLib.Cuda.Kernels import CTC 43 | 44 | initGPU(Backend, CTC) 45 | 46 | 47 | def initGPU(Backend, CTC): 48 | backend = 
Backend.getBackend(Config.deviceIdx, initmode=2, logger=Config.getLogger()) 49 | memoryPool, costmod, ctcmod = backend.memoryPool, backend.costmod, backend.ctcmod 50 | 51 | global bceKer, hingeKer, smoothL1Ker, l1HingeKer, getAccuracyKernel 52 | bceKer = backend.bceKer 53 | hingeKer = backend.hingeKer 54 | smoothL1Ker = backend.smoothL1Ker 55 | l1HingeKer = backend.l1HingeKer 56 | getAccuracyKernel = backend.getAccuracyKernel 57 | 58 | def wrapCrossEntropy(scores, labels, weights, error): 59 | return costmod.crossEntropy(scores, labels, weights, error, memoryPool) 60 | 61 | def wrapSVM(scores, labels, mode, error): 62 | return costmod.svm(scores, labels, mode, error, memoryPool) 63 | 64 | global crossEntropyKernel, svmKernel 65 | crossEntropyKernel = wrapCrossEntropy 66 | svmKernel = wrapSVM 67 | 68 | def wrapCTC(data, datalen, labels, lengths, blank, error, normalized): 69 | return ctcmod.ctcLoss(data, datalen, labels, lengths, blank, error, normalized, allocator=memoryPool) 70 | 71 | global ctcLoss, ctcLossTest 72 | ctcLoss = wrapCTC 73 | ctcLossTest = CTC.hostCTCLoss 74 | 75 | 76 | def initCPU(): 77 | pass 78 | 79 | 80 | def initIntel(): 81 | from PuzzleLib.Intel.Kernels import Costs 82 | 83 | global bceKer, hingeKer, smoothL1Ker, l1HingeKer, getAccuracyKernel, crossEntropyKernel, svmKernel 84 | bceKer = Costs.bceKer 85 | hingeKer = Costs.hingeKer 86 | smoothL1Ker = Costs.smoothL1Ker 87 | l1HingeKer = Costs.l1HingeKer 88 | getAccuracyKernel = Costs.getAccuracyKernel 89 | crossEntropyKernel = Costs.crossEntropy 90 | svmKernel = Costs.svm 91 | 92 | 93 | autoinit() 94 | -------------------------------------------------------------------------------- /Modules/SwapAxes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray, Memory 4 | from PuzzleLib.Modules.Module import ModuleError, Module 5 | 6 | 7 | class SwapAxes(Module): 8 | def __init__(self, axis1, axis2, name=None): 9 | super().__init__(name) 10 | self.registerBlueprint(locals()) 11 | 12 | self.axis1, self.axis2 = (axis2, axis1) if axis1 > axis2 else (axis1, axis2) 13 | 14 | 15 | def updateData(self, data): 16 | self.data = Memory.swapaxes(data, self.axis1, self.axis2) 17 | 18 | 19 | def updateGrad(self, grad): 20 | self.grad = Memory.swapaxes(grad, self.axis1, self.axis2) 21 | 22 | 23 | def checkDataShape(self, shape): 24 | if len(shape) - 1 < self.axis2: 25 | raise ModuleError("Data dimension needs to be at least %d (data has %d)" % (self.axis2 + 1, len(shape))) 26 | 27 | 28 | def checkGradShape(self, shape): 29 | if len(shape) - 1 < self.axis2: 30 | raise ModuleError("Grad dimension needs to be at least %d (grad has %d)" % (self.axis2 + 1, len(shape))) 31 | 32 | 33 | def dataShapeFrom(self, shape): 34 | return shape[:self.axis1] + (shape[self.axis2], ) + shape[self.axis1 + 1:self.axis2] + \ 35 | (shape[self.axis1], ) + shape[self.axis2 + 1:] 36 | 37 | 38 | def gradShapeFrom(self, shape): 39 | return shape[:self.axis1] + (shape[self.axis2], ) + shape[self.axis1 + 1:self.axis2] + \ 40 | (shape[self.axis1], ) + shape[self.axis2 + 1:] 41 | 42 | 43 | def calcMode(self, T): 44 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 45 | 46 | if T not in dtypes: 47 | raise ModuleError("Unsupported dtype %s" % T) 48 | 49 | self.calctype = T 50 | 51 | 52 | def unittest(): 53 | for dtype, _ in gpuarray.dtypesSupported(): 54 | swapAxesTest(dtype) 55 | 56 | 57 | def swapAxesTest(dtype): 58 | shape = (10, 3, 5, 4, 2) 59 | 60 | for axis1 
in range(len(shape)): 61 | for axis2 in range(axis1 + 1, len(shape)): 62 | hostData = np.random.randn(*shape).astype(dtype) 63 | data = gpuarray.to_gpu(hostData) 64 | 65 | swapaxes = SwapAxes(axis1, axis2) 66 | swapaxes.calcMode(dtype) 67 | 68 | swapaxes(data) 69 | 70 | hostOutData = np.swapaxes(hostData, axis1=axis1, axis2=axis2) 71 | assert np.allclose(hostOutData, swapaxes.data.get()) 72 | 73 | hostGrad = np.random.randn(*swapaxes.data.shape).astype(dtype) 74 | grad = gpuarray.to_gpu(hostGrad) 75 | 76 | swapaxes.backward(grad) 77 | 78 | hostInGrad = np.swapaxes(hostGrad, axis1=axis2, axis2=axis1) 79 | 80 | assert swapaxes.grad.shape == data.shape 81 | assert np.allclose(hostInGrad, swapaxes.grad.get()) 82 | 83 | 84 | if __name__ == "__main__": 85 | unittest() 86 | -------------------------------------------------------------------------------- /Optimizers/SMORMS3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | 5 | from PuzzleLib.Backend import gpuarray 6 | from PuzzleLib.Backend.Kernels.ElementWise import smorms3Ker 7 | 8 | from PuzzleLib.Optimizers.Optimizer import Optimizer, trainSimpleTest, trainHardTest 9 | 10 | 11 | class SMORMS3(Optimizer): 12 | def __init__(self, learnRate=1e-3, epsilon=1e-16, nodeinfo=None): 13 | super().__init__(nodeinfo) 14 | 15 | self.epsilon = None 16 | 17 | self.setAttr("learnRate", learnRate) 18 | self.setAttr("epsilon", epsilon) 19 | 20 | 21 | def setupState(self, var): 22 | return { 23 | "mem": gpuarray.to_gpu(np.ones(var.data.shape, dtype=np.float32)), 24 | "mg": gpuarray.zeros(var.data.shape, dtype=np.float32), 25 | "ms": gpuarray.zeros(var.data.shape, dtype=np.float32) 26 | } 27 | 28 | 29 | def updateVar(self, var, state, stream=None): 30 | smorms3Ker(var.data.dtype)( 31 | var.data, var.grad, state["mem"], state["mg"], state["ms"], self.learnRate * var.learnRate, self.epsilon, 32 | stream=stream 33 | ) 34 | 35 | 36 | def unittest(): 37 | for dtype, atol in gpuarray.dtypesSupported(): 38 | calcTest(dtype, atol) 39 | trainSimpleTest(SMORMS3, dtype, learnRate=1e-2) 40 | 41 | if Config.backend == Config.Backend.cuda: 42 | trainHardTest(SMORMS3, dtype, learnRate=1e-2) 43 | 44 | 45 | def calcTest(dtype, atol): 46 | lr, epsilon = 1e-3, 1e-16 47 | shape = (11, 13) 48 | 49 | hostW, hostDw = np.random.randn(*shape).astype(dtype), np.random.randn(*shape).astype(dtype) 50 | hostMem = (1.0 + np.random.randn(*shape)**2).astype(np.float32) 51 | hostMg, hostMs = np.random.randn(*shape).astype(np.float32), np.random.randn(*shape).astype(np.float32)**2 52 | 53 | w, dw = gpuarray.to_gpu(hostW), gpuarray.to_gpu(hostDw) 54 | mem, mg, ms = gpuarray.to_gpu(hostMem), gpuarray.to_gpu(hostMg), gpuarray.to_gpu(hostMs) 55 | 56 | smorms3Ker(w.dtype)(w, dw, mem, mg, ms, lr, epsilon) 57 | 58 | hostW, hostDw = hostW.astype(np.float32), hostDw.astype(np.float32) 59 | 60 | r = 1.0 / (1.0 + hostMem) 61 | hostMg = (1.0 - r) * hostMg + r * hostDw 62 | hostMs = (1.0 - r) * hostMs + r * hostDw**2 63 | x = hostMg**2 / (hostMs + epsilon) 64 | 65 | hostMem = 1.0 + hostMem * (1.0 - x) 66 | hostW += hostDw * np.minimum(lr, x) / (np.sqrt(hostMs) + epsilon) 67 | 68 | hostW, hostDw = hostW.astype(dtype), hostDw.astype(dtype) 69 | 70 | assert np.allclose(hostMem, mem.get(), atol=atol) 71 | assert np.allclose(hostMg, mg.get(), atol=atol) 72 | assert np.allclose(hostMs, ms.get(), atol=atol) 73 | assert np.allclose(hostW, w.get(), atol=atol) 74 | 75 | 76 | if __name__ == "__main__": 77 | unittest() 78 
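# For reference, a minimal NumPy restatement of the SMORMS3 step that calcTest
# above verifies numerically; smorms3Step is an illustrative name rather than
# part of the library API, and the final update follows the ascent convention
# used throughout these tests.
import numpy as np

def smorms3Step(w, dw, mem, mg, ms, lr=1e-3, epsilon=1e-16):
	r = 1.0 / (1.0 + mem)                       # adaptive mixing rate
	mg = (1.0 - r) * mg + r * dw                # running mean of gradients
	ms = (1.0 - r) * ms + r * dw**2             # running mean of squared gradients
	x = mg**2 / (ms + epsilon)                  # signal-to-noise estimate
	mem = 1.0 + mem * (1.0 - x)                 # lengthen memory where updates are noisy
	w = w + dw * np.minimum(lr, x) / (np.sqrt(ms) + epsilon)
	return w, mem, mg, ms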
| -------------------------------------------------------------------------------- /Modules/AvgPool3D.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.Dnn import PoolMode, poolNd, poolNdBackward 5 | 6 | from PuzzleLib.Modules.Pool3D import Pool3D 7 | 8 | 9 | class AvgPool3D(Pool3D): 10 | def __init__(self, size=2, stride=2, pad=0, includePad=True, name=None): 11 | super().__init__(size, stride, pad, name) 12 | self.registerBlueprint(locals()) 13 | 14 | self.mode = PoolMode.avgWithPad if includePad else PoolMode.avgNoPad 15 | 16 | 17 | def updateData(self, data): 18 | self.data, self.workspace = poolNd(data, size=self.size, stride=self.stride, pad=self.pad, mode=self.mode, 19 | test=not self.train) 20 | 21 | 22 | def updateGrad(self, grad): 23 | self.grad = poolNdBackward(self.inData, self.data, grad, self.workspace, size=self.size, stride=self.stride, 24 | pad=self.pad, mode=self.mode) 25 | 26 | 27 | def unittest(): 28 | batchsize, maps, d, h, w = 2, 6, 5, 7, 5 29 | data = gpuarray.to_gpu(np.random.randn(batchsize, maps, d, h, w).astype(np.float32)) 30 | 31 | size = 3 32 | stride, pad = 2, 1 33 | 34 | avgpool3d = AvgPool3D(size=size, stride=stride, pad=pad, includePad=True) 35 | avgpool3d(data) 36 | 37 | hostData = np.zeros(shape=(batchsize, maps, d + 2 * pad, h + 2 * pad, w + 2 * pad), dtype=np.float32) 38 | hostData[:, :, pad:-pad, pad:-pad, pad:-pad] = data.get() 39 | hostOutData = np.empty(avgpool3d.data.shape, dtype=np.float32) 40 | 41 | for b in range(batchsize): 42 | for c in range(maps): 43 | for z in range(hostOutData.shape[2]): 44 | for y in range(hostOutData.shape[3]): 45 | for x in range(hostOutData.shape[4]): 46 | hostOutData[b, c, z, y, x] = np.mean(hostData[b, c, z * stride:z * stride + size, 47 | y * stride:y * stride + size,x * stride:x * stride + size]) 48 | 49 | assert np.allclose(hostOutData, avgpool3d.data.get()) 50 | 51 | grad = gpuarray.to_gpu(np.random.randn(*avgpool3d.data.shape).astype(np.float32)) 52 | avgpool3d.backward(grad) 53 | 54 | hostGrad = grad.get() 55 | hostInGrad = np.zeros(hostData.shape, dtype=np.float32) 56 | 57 | for b in range(batchsize): 58 | for c in range(maps): 59 | for z in range(hostOutData.shape[2]): 60 | for y in range(hostOutData.shape[3]): 61 | for x in range(hostOutData.shape[4]): 62 | for dz in range(size): 63 | for dy in range(size): 64 | for dx in range(size): 65 | hostInGrad[b,c,z*stride+dz,y*stride+dy,x*stride+dx] += hostGrad[b,c,z,y,x]/size**3 66 | 67 | assert np.allclose(hostInGrad[:, :, pad:-pad, pad:-pad, pad:-pad], avgpool3d.grad.get()) 68 | 69 | 70 | if __name__ == "__main__": 71 | unittest() 72 | -------------------------------------------------------------------------------- /Hip/Wrappers/MIOpenNorm.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | 4 | from PuzzleLib.Cuda.Wrappers.CuDnnNorm import batchNorm2dTest, batchNorm3dTest, instanceNorm2dTest 5 | 6 | 7 | def unittest(): 8 | from PuzzleLib.Hip import Backend 9 | backendTest(Backend) 10 | 11 | 12 | def backendTest(Backend): 13 | for deviceIdx in range(Backend.getDeviceCount()): 14 | bnd = Backend.getBackend(deviceIdx, initmode=2) 15 | 16 | float32 = bnd.dtypesSupported()[0] 17 | 18 | batchNorm2dTest(bnd, *float32, np.float32) 19 | batchNorm3dTest(bnd, *float32, np.float32) 20 | instanceNorm2dTest(bnd, *float32, np.float32) 21 | 22 | for dtype, atol in 
bnd.dtypesSupported(): 23 | mapLRN2dTest(bnd, dtype, atol) 24 | 25 | 26 | def mapLRN2dTest(bnd, dtype, atol): 27 | batchsize, maps, h, w = 2, 2, 9, 10 28 | N, alpha, beta, K = 5, 1.0, 0.5, 2.0 29 | 30 | lookBehind = int((N - 1) / 2) 31 | lookAhead = N - lookBehind 32 | 33 | hostData = np.random.randn(batchsize, maps, h, w).astype(dtype) 34 | 35 | data = bnd.GPUArray.toGpu(hostData) 36 | outdata, workspace = bnd.dnn.lrn(data, N=N, alpha=alpha, beta=beta, K=K, mode=bnd.LRNMode.map.value) 37 | 38 | norms = np.empty(hostData.shape, dtype=np.float32) 39 | 40 | for b, c, y, x in itertools.product(range(batchsize), range(maps), range(h), range(w)): 41 | slcy = slice(max(0, y - lookBehind), min(h, y + lookAhead)) 42 | slcx = slice(max(0, x - lookBehind), min(w, x + lookAhead)) 43 | 44 | slc = hostData[b, c, slcy, slcx].ravel() 45 | norms[b, c, y, x] = K + np.dot(slc, slc) * alpha / N**2 46 | 47 | hostOutData = (hostData / norms**beta).astype(dtype) 48 | assert np.allclose(hostOutData, outdata.get(), atol=atol) 49 | 50 | hostGrad = np.random.randn(*outdata.shape).astype(dtype) 51 | 52 | grad = bnd.GPUArray.toGpu(hostGrad) 53 | ingrad = bnd.dnn.lrnBackward( 54 | grad, data, outdata, workspace, N=N, alpha=alpha, beta=beta, K=K, mode=bnd.LRNMode.map.value 55 | ) 56 | 57 | hostInGrad = hostGrad / norms**beta 58 | k = 2.0 * alpha * beta / N**2 59 | 60 | for b, c, y, x in itertools.product(range(batchsize), range(maps), range(h), range(w)): 61 | slcy = slice(max(0, y - lookBehind), min(h, y + lookAhead)) 62 | slcx = slice(max(0, x - lookBehind), min(w, x + lookAhead)) 63 | 64 | slcdata, slcgrad = hostData[b, c, slcy, slcx].ravel(), hostGrad[b, c, slcy, slcx].ravel() 65 | slcnorms = norms[b, c, slcy, slcx].ravel() 66 | 67 | hostInGrad[b, c, y, x] -= k * hostData[b, c, y, x] * np.dot(slcgrad, slcdata / slcnorms**(beta + 1)) 68 | 69 | hostInGrad = hostInGrad.astype(dtype) 70 | assert np.allclose(hostInGrad, ingrad.get(), atol=atol) 71 | 72 | 73 | if __name__ == "__main__": 74 | unittest() 75 | -------------------------------------------------------------------------------- /Modules/AvgPool1D.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.Dnn import PoolMode, poolNd, poolNdBackward 5 | 6 | from PuzzleLib.Modules.Pool1D import Pool1D 7 | 8 | 9 | class AvgPool1D(Pool1D): 10 | def __init__(self, size=2, stride=2, pad=0, includePad=True, name=None): 11 | super().__init__(size, stride, pad, name) 12 | self.registerBlueprint(locals()) 13 | 14 | self.mode = PoolMode.avgWithPad if includePad else PoolMode.avgNoPad 15 | 16 | 17 | def updateData(self, data): 18 | data = data.reshape(*data.shape[:2], 1, *data.shape[2:]) 19 | self.data, self.workspace = poolNd(data, size=self.size, stride=self.stride, pad=self.pad, mode=self.mode, 20 | test=not self.train) 21 | self.data = self.data.reshape(*self.data.shape[:2], *self.data.shape[3:]) 22 | 23 | 24 | def updateGrad(self, grad): 25 | grad = grad.reshape(*grad.shape[:2], 1, *grad.shape[2:]) 26 | 27 | indata = self.inData.reshape(*self.inData.shape[:2], 1, *self.inData.shape[2:]) 28 | outdata = self.data.reshape(*self.data.shape[:2], 1, *self.data.shape[2:]) 29 | 30 | self.grad = poolNdBackward(indata, outdata, grad, self.workspace, size=self.size, stride=self.stride, 31 | pad=self.pad, mode=self.mode) 32 | self.grad = self.grad.reshape(*self.grad.shape[:2], *self.grad.shape[3:]) 33 | 34 | 35 | def unittest(): 36 | batchsize, maps, insize = 
2, 6, 5 37 | data = gpuarray.to_gpu(np.random.randn(batchsize, maps, insize).astype(np.float32)) 38 | 39 | size = 3 40 | stride, pad = 2, 1 41 | 42 | avgpool1d = AvgPool1D(size=size, stride=stride, pad=pad, includePad=True) 43 | avgpool1d(data) 44 | 45 | hostData = np.zeros(shape=(batchsize, maps, insize + 2 * pad), dtype=np.float32) 46 | hostData[:, :, pad:-pad] = data.get() 47 | hostOutData = np.empty(avgpool1d.data.shape, dtype=np.float32) 48 | 49 | for b in range(batchsize): 50 | for c in range(maps): 51 | for x in range(hostOutData.shape[2]): 52 | hostOutData[b, c, x] = np.mean(hostData[b, c, x * stride:x * stride + size]) 53 | 54 | assert np.allclose(hostOutData, avgpool1d.data.get()) 55 | 56 | grad = gpuarray.to_gpu(np.random.randn(*avgpool1d.data.shape).astype(np.float32)) 57 | avgpool1d.backward(grad) 58 | 59 | hostGrad = grad.get() 60 | hostInGrad = np.zeros(hostData.shape, dtype=np.float32) 61 | 62 | for b in range(batchsize): 63 | for c in range(maps): 64 | for x in range(hostOutData.shape[2]): 65 | for dx in range(size): 66 | hostInGrad[b, c, x * stride + dx] += hostGrad[b, c, x] / size 67 | 68 | assert np.allclose(hostInGrad[:, :, pad:-pad], avgpool1d.grad.get()) 69 | 70 | 71 | if __name__ == "__main__": 72 | unittest() 73 | -------------------------------------------------------------------------------- /Backend/Memory.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | 3 | 4 | depthConcat = None 5 | depthSplit = None 6 | 7 | moveaxis = None 8 | swapaxes = None 9 | transpose = None 10 | 11 | 12 | def autoinit(): 13 | if not Config.shouldInit(): 14 | return 15 | 16 | if Config.backend == Config.Backend.cuda: 17 | initCuda() 18 | elif Config.backend == Config.Backend.hip: 19 | initHip() 20 | elif Config.isCPUBased(Config.backend): 21 | initCPU() 22 | else: 23 | raise Config.ConfigError(Config.backend) 24 | 25 | 26 | def initCuda(): 27 | from PuzzleLib.Cuda.Backend import getBackend 28 | 29 | backend = getBackend(Config.deviceIdx, initmode=1, logger=Config.getLogger()) 30 | memoryPool, memmod = backend.memoryPool, backend.memmod 31 | 32 | initGPU(memoryPool, memmod) 33 | 34 | 35 | def initHip(): 36 | from PuzzleLib.Hip.Backend import getBackend 37 | 38 | backend = getBackend(Config.deviceIdx, initmode=2, logger=Config.getLogger()) 39 | memoryPool, memmod = backend.memoryPool, backend.memmod 40 | 41 | initGPU(memoryPool, memmod) 42 | 43 | 44 | def initGPU(memoryPool, module): 45 | def wrapDepthConcat(data): 46 | return module.depthConcat(data, allocator=memoryPool) 47 | 48 | def wrapDepthSplit(grad, indata): 49 | return module.depthSplit(grad, indata, allocator=memoryPool) 50 | 51 | global depthConcat, depthSplit 52 | depthConcat = wrapDepthConcat 53 | depthSplit = wrapDepthSplit 54 | 55 | def wrapMoveaxis(data, src, dst): 56 | return module.moveaxis(data, src, dst, allocator=memoryPool) 57 | 58 | def wrapSwapaxes(data, axis1, axis2): 59 | return module.swapaxes(data, axis1, axis2, allocator=memoryPool) 60 | 61 | def wrapTranspose(data, axes): 62 | return module.transpose(data, tuple(axes), allocator=memoryPool) 63 | 64 | global moveaxis, swapaxes, transpose 65 | moveaxis = wrapMoveaxis 66 | swapaxes = wrapSwapaxes 67 | transpose = wrapTranspose 68 | 69 | 70 | def initCPU(): 71 | import numpy as np 72 | from PuzzleLib.CPU.CPUArray import CPUArray 73 | 74 | def wrapMoveAxis(a, src, dst): 75 | out = np.copy(np.moveaxis(a.get(copy=False), src, dst), order="C") 76 | return CPUArray(out.shape, out.dtype, data=out, 
acquire=True) 77 | 78 | def wrapSwapAxes(a, axis1, axis2): 79 | out = np.copy(np.swapaxes(a.get(copy=False), axis1, axis2), order="C") 80 | return CPUArray(out.shape, out.dtype, data=out, acquire=True) 81 | 82 | def wrapTranspose(a, axes): 83 | out = np.copy(np.transpose(a.get(copy=False), axes), order="C") 84 | return CPUArray(out.shape, out.dtype, data=out, acquire=True) 85 | 86 | global moveaxis, swapaxes, transpose 87 | moveaxis = wrapMoveAxis 88 | swapaxes = wrapSwapAxes 89 | transpose = wrapTranspose 90 | 91 | 92 | autoinit() 93 | -------------------------------------------------------------------------------- /Modules/MaxPool3D.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.Dnn import PoolMode, poolNd, poolNdBackward 5 | 6 | from PuzzleLib.Modules.Pool3D import Pool3D 7 | 8 | 9 | class MaxPool3D(Pool3D): 10 | def __init__(self, size=2, stride=2, pad=0, name=None): 11 | super().__init__(size, stride, pad, name) 12 | self.registerBlueprint(locals()) 13 | 14 | self.mode = PoolMode.max 15 | 16 | 17 | def updateData(self, data): 18 | self.data, self.workspace = poolNd(data, size=self.size, stride=self.stride, pad=self.pad, mode=self.mode, 19 | test=not self.train) 20 | 21 | 22 | def updateGrad(self, grad): 23 | self.grad = poolNdBackward(self.inData, self.data, grad, self.workspace, 24 | size=self.size, stride=self.stride, pad=self.pad, mode=self.mode) 25 | 26 | 27 | def unittest(): 28 | batchsize, maps, t, h, w = 1, 1, 6, 6, 6 29 | size, stride, pad = 3, 2, 1 30 | data = gpuarray.to_gpu(np.random.randn(batchsize, maps, t, h, w).astype(np.float32)) 31 | 32 | maxpool3d = MaxPool3D(size=size, stride=stride, pad=pad) 33 | maxpool3d(data) 34 | 35 | hostData = np.full(shape=(batchsize, maps, t + 2 * pad, h + 2 * pad, w + 2 * pad), 36 | fill_value=np.finfo(np.float32).min, dtype=np.float32) 37 | hostData[:, :, pad:-pad, pad:-pad, pad:-pad] = data.get() 38 | hostOutData = np.empty(maxpool3d.data.shape) 39 | 40 | for b in range(batchsize): 41 | for c in range(maps): 42 | for z in range(hostOutData.shape[2]): 43 | for y in range(hostOutData.shape[3]): 44 | for x in range(hostOutData.shape[4]): 45 | hostOutData[b, c, z, y, x] = np.max(hostData[b, c, z * stride:z * stride + size, 46 | y * stride:y * stride + size, x * stride:x * stride + size]) 47 | 48 | assert np.allclose(hostOutData, maxpool3d.data.get()) 49 | 50 | grad = gpuarray.to_gpu(np.random.randn(*maxpool3d.data.shape).astype(np.float32)) 51 | maxpool3d.backward(grad) 52 | 53 | hostGrad = grad.get() 54 | hostInGrad = np.zeros(hostData.shape, dtype=np.float32) 55 | 56 | for b in range(batchsize): 57 | for c in range(maps): 58 | for z in range(hostOutData.shape[2]): 59 | for y in range(hostOutData.shape[3]): 60 | for x in range(hostOutData.shape[4]): 61 | for dz in range(size): 62 | for dy in range(size): 63 | for dx in range(size): 64 | if hostData[b,c,z*stride+dz,y*stride + dy,x*stride + dx] == hostOutData[b,c,z,y,x]: 65 | hostInGrad[b,c,z*stride + dz,y*stride + dy,x*stride + dx] += hostGrad[b,c,z,y,x] 66 | 67 | assert np.allclose(hostInGrad[:, :, pad:-pad, pad:-pad, pad:-pad], maxpool3d.grad.get()) 68 | 69 | 70 | if __name__ == "__main__": 71 | unittest() 72 | -------------------------------------------------------------------------------- /Modules/SpatialTf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import 
gpuarray 4 | from PuzzleLib.Backend.Dnn import spatialTf, spatialTfBackward 5 | 6 | from PuzzleLib.Modules.Module import ModuleError, Module 7 | 8 | 9 | class SpatialTf(Module): 10 | def __init__(self, shape=None, name=None): 11 | super().__init__(name) 12 | self.registerBlueprint(locals()) 13 | 14 | self.shape = shape 15 | self.grid = None 16 | 17 | 18 | def updateData(self, data): 19 | data, transform = data 20 | 21 | if self.train: 22 | self.data, self.grid = spatialTf(data, transform, outshape=self.shape, getGrid=True) 23 | else: 24 | self.data = spatialTf(data, transform, outshape=self.shape, getGrid=False) 25 | 26 | 27 | def updateGrad(self, grad): 28 | data, _ = self.inData 29 | self.grad = spatialTfBackward(grad, data, self.grid) 30 | 31 | 32 | def checkDataShape(self, shapes): 33 | dshape, tshape = shapes 34 | 35 | if len(tshape) != 3 or tshape[1:] != (2, 3): 36 | raise ModuleError("Bad transform shape (%s was given)" % tshape) 37 | 38 | if len(dshape) != 4: 39 | raise ModuleError("Data must be 4d tensor") 40 | 41 | if tshape[0] != dshape[0]: 42 | raise ModuleError("Inconsistency in transform and data batch size (%d in transform vs %d in data)" % 43 | (tshape[0], dshape[0])) 44 | 45 | 46 | def checkGradShape(self, shape): 47 | if len(shape) != 4: 48 | raise ModuleError("Grad must be 4d tensor") 49 | 50 | if self.shape is not None: 51 | if self.shape != shape[1:]: 52 | raise ModuleError("Bad grad shape (was given %s, expected %s)" % (shape[1:], self.shape)) 53 | else: 54 | if self.inData[0].shape != shape: 55 | raise ModuleError("Bad grad shape (was given %s, expected %s)" % (shape, self.inData[0].shape)) 56 | 57 | 58 | def dataShapeFrom(self, shapes): 59 | dshape, tshape = shapes 60 | return (dshape[0], ) + self.shape if self.shape is not None else dshape 61 | 62 | 63 | def gradShapeFrom(self, shape): 64 | return (shape[0], ) + self.inData[0].shape[1:], (shape[0], 2, 3) 65 | 66 | 67 | def reset(self): 68 | super().reset() 69 | self.grid = None 70 | 71 | 72 | def unittest(): 73 | batchsize, maps, inh, inw = 1, 1, 4, 4 74 | data = gpuarray.to_gpu(np.random.randn(batchsize, maps, inh, inw).astype(np.float32)) 75 | 76 | transform = gpuarray.to_gpu( 77 | np.tile(np.array([[1.0, 0.0, 0.001], [0, 1.0, 0.001]], dtype=np.float32), reps=(batchsize, 1, 1)) 78 | ) 79 | 80 | spatialtf = SpatialTf() 81 | spatialtf([data, transform]) 82 | 83 | grad = gpuarray.to_gpu(np.random.randn(*spatialtf.data.shape).astype(np.float32)) 84 | spatialtf.backward(grad) 85 | 86 | 87 | if __name__ == "__main__": 88 | unittest() 89 | -------------------------------------------------------------------------------- /Cost/SmoothL1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 5 | from PuzzleLib.Backend.Kernels.Costs import smoothL1Ker 6 | 7 | from PuzzleLib.Cost.Cost import Cost 8 | 9 | 10 | class SmoothL1(Cost): 11 | def calcGrad(self, pred, target): 12 | grad = gpuarray.empty(pred.shape, dtype=np.float32, allocator=memPool) 13 | 14 | fullnorm = 1.0 / np.prod(target.shape) 15 | norm = 1.0 / np.prod(target.shape[1:]) 16 | 17 | self.devErr.fill(0.0) 18 | 19 | smoothL1Ker(pred, target, self.devErr, grad, norm, fullnorm) 20 | return grad 21 | 22 | 23 | def calcError(self, pred, target): 24 | self.accumErr += self.devErr 25 | 26 | 27 | def calcVal(self, pred, target): 28 | diff = gpuarray.empty(pred.shape, dtype=np.float32, allocator=memPool) 29 | 30 
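# smoothL1Ker computes the Huber-style cost spelled out in errorTest below:
# per element the error is |p - t| - 0.5 where |p - t| >= 1 and
# (p - t)**2 / 2 otherwise, with gradient sign(p - t) where |p - t| >= 1
# and p - t otherwise, both averaged over all elements.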
| fullnorm = 1.0 / np.prod(target.shape) 31 | 32 | devErr = gpuarray.zeros((), dtype=np.float32, allocator=memPool) 33 | smoothL1Ker(pred, target, devErr, diff, fullnorm, fullnorm) 34 | 35 | return devErr.get() 36 | 37 | 38 | def checkDataShape(self, pred, target): 39 | assert pred.shape[1:] == target.shape[1:] 40 | 41 | 42 | def checkValDataShape(self, pred, target): 43 | assert pred.shape[1:] == target.shape[1:] 44 | 45 | 46 | def unittest(): 47 | errorTest() 48 | valTest() 49 | 50 | 51 | def errorTest(): 52 | pred = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 53 | target = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 54 | 55 | smoothL1 = SmoothL1() 56 | smoothL1(pred, target) 57 | 58 | hostPred, hostTarget = pred.get(), target.get() 59 | hostGrad = ((np.abs(hostPred - hostTarget) >= 1.0) * np.sign(hostPred - hostTarget) + 60 | (np.abs(hostPred - hostTarget) < 1.0) * (hostPred - hostTarget)) / np.prod(pred.shape) 61 | 62 | assert np.allclose(hostGrad, smoothL1.grad.get()) 63 | 64 | hostError = np.mean((np.abs(hostPred - hostTarget) >= 1.0) * (np.abs(hostPred - hostTarget) - 0.5) + 65 | (np.abs(hostPred - hostTarget) < 1.0) * (hostPred - hostTarget)**2 / 2.0) 66 | 67 | assert np.isclose(smoothL1.error, hostError) 68 | 69 | 70 | def valTest(): 71 | pred = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 72 | target = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 73 | 74 | smoothL1 = SmoothL1() 75 | error = smoothL1.validate(pred, target) 76 | 77 | hostPred, hostTarget = pred.get(), target.get() 78 | 79 | hostError = np.mean((np.abs(hostPred - hostTarget) >= 1.0) * (np.abs(hostPred - hostTarget) - 0.5) + 80 | (np.abs(hostPred - hostTarget) < 1.0) * (hostPred - hostTarget)**2 / 2.0) 81 | 82 | assert np.isclose(error, hostError) 83 | 84 | 85 | if __name__ == "__main__": 86 | unittest() 87 | -------------------------------------------------------------------------------- /Hip/CheckInstall.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from PuzzleLib.Cuda.CheckInstall import checkInstall, checkRuntime, checkPipPackages 3 | 4 | 5 | hipTestKernel = """ 6 | 7 | #include 8 | #include 9 | 10 | 11 | __global__ void iaxpy(int *y, const int *x, int a, int size) 12 | { 13 | int i = blockIdx.x * blockDim.x + threadIdx.x; 14 | if (i < size) y[i] += a * x[i]; 15 | } 16 | 17 | 18 | #define HIP_ASSERT(status) do { if (!hipAssertStatus((status), __LINE__)) exit(1); } while (0) 19 | inline bool hipAssertStatus(hipError_t code, int line) 20 | { 21 | if (code != hipSuccess) 22 | { 23 | fprintf(stderr, "%s (line:%d)\\n", hipGetErrorString(code), line); 24 | return false; 25 | } 26 | 27 | return true; 28 | } 29 | 30 | 31 | int main() 32 | { 33 | int exitcode = 0; 34 | 35 | const int SIZE = 1 << 20; 36 | const int NBYTES = SIZE * sizeof(int); 37 | 38 | int *hostx = (int *)malloc(NBYTES); 39 | int *hosty = (int *)malloc(NBYTES); 40 | 41 | int *devx = NULL, *devy = NULL; 42 | HIP_ASSERT(hipMalloc(&devx, NBYTES)); 43 | HIP_ASSERT(hipMalloc(&devy, NBYTES)); 44 | 45 | for (int i = 0; i < SIZE; i++) 46 | { 47 | hostx[i] = i; 48 | hosty[i] = -i * 2; 49 | } 50 | 51 | HIP_ASSERT(hipMemcpy(devx, hostx, NBYTES, hipMemcpyHostToDevice)); 52 | HIP_ASSERT(hipMemcpy(devy, hosty, NBYTES, hipMemcpyHostToDevice)); 53 | 54 | const int NT = 256; 55 | hipLaunchKernelGGL(iaxpy, dim3((SIZE + NT - 1) / NT), dim3(NT), 0, 0, devy, devx, 2, SIZE); 56 | 57 | HIP_ASSERT(hipMemcpy(hosty, devy, NBYTES, hipMemcpyDeviceToHost)); 58 | 59 
| HIP_ASSERT(hipFree(devx)); 60 | HIP_ASSERT(hipFree(devy)); 61 | 62 | for (int i = 0; i < SIZE; i++) 63 | if (hosty[i] != 0) 64 | { 65 | fprintf(stderr, "kernel invocation failed!"); 66 | 67 | exitcode = 1; 68 | goto exit; 69 | } 70 | 71 | printf("finished successfully!"); 72 | fflush(stdout); 73 | 74 | exit: 75 | free(hostx); 76 | free(hosty); 77 | 78 | return exitcode; 79 | } 80 | 81 | """ 82 | 83 | 84 | def checkHipInstall(withRuntime, withPip): 85 | checkInstall( 86 | name="HIP", compiler="hipcc", 87 | download="https://rocm.github.io/install.html#ubuntu-support---installing-from-a-debian-repository", 88 | envpath="HIP_PATH" 89 | ) 90 | 91 | if withRuntime: 92 | checkRuntime(name="HIP", compiler="hipcc", kernel=hipTestKernel, ext=".hip.cpp") 93 | 94 | if withPip: 95 | checkPipPackages() 96 | 97 | 98 | def main(): 99 | try: 100 | checkHipInstall(withRuntime=True, withPip=True) 101 | 102 | except RuntimeError as e: 103 | print(e) 104 | 105 | print("Exiting ...") 106 | sys.exit(1) 107 | 108 | 109 | if __name__ == "__main__": 110 | main() 111 | -------------------------------------------------------------------------------- /Cost/Cost.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | 5 | 6 | class CostError(Exception): 7 | pass 8 | 9 | 10 | class Cost: 11 | def __init__(self): 12 | self.accumErr = gpuarray.empty((), dtype=np.float32) 13 | self.devErr = gpuarray.empty((), dtype=np.float32) 14 | 15 | self.error = None 16 | self.valError = None 17 | self.grad = None 18 | 19 | self.batchsize = None 20 | self.numOfSamples = None 21 | 22 | self.dirty = True 23 | self.resetAccumulator() 24 | 25 | 26 | def resetAccumulator(self): 27 | self.resetDeviceAccumulator() 28 | 29 | self.batchsize = 0 30 | self.numOfSamples = 0 31 | 32 | 33 | def updateState(self, samples): 34 | self.batchsize = samples 35 | self.numOfSamples += samples 36 | 37 | 38 | def resetDeviceAccumulator(self): 39 | self.accumErr.fill(0.0) 40 | 41 | 42 | def getError(self): 43 | if self.dirty: 44 | self.error = self.devErr.get() / self.batchsize 45 | self.dirty = False 46 | 47 | return self.error 48 | 49 | 50 | def getMeanError(self): 51 | return self.accumErr.get() / self.numOfSamples 52 | 53 | 54 | def getValError(self): 55 | return self.valError 56 | 57 | 58 | def __call__(self, pred, target, queryError=True): 59 | if isinstance(target, gpuarray.GPUArray) and isinstance(pred, gpuarray.GPUArray): 60 | assert pred.shape[0] == target.shape[0] 61 | 62 | self.checkDataShape(pred, target) 63 | self.reset() 64 | 65 | self.grad = self.calcGrad(pred, target) 66 | self.calcError(pred, target) 67 | self.dirty = True 68 | 69 | self.updateState(self.getBatchsize(pred)) 70 | 71 | if queryError: 72 | self.error = self.getError() 73 | 74 | if queryError: 75 | return self.error, self.grad 76 | else: 77 | return self.grad 78 | 79 | 80 | def calcError(self, pred, target): 81 | raise NotImplementedError() 82 | 83 | 84 | def calcGrad(self, pred, target): 85 | raise NotImplementedError() 86 | 87 | 88 | def validate(self, pred, target): 89 | if isinstance(target, gpuarray.GPUArray) and isinstance(pred, gpuarray.GPUArray): 90 | assert pred.shape[0] == target.shape[0] 91 | 92 | self.checkValDataShape(pred, target) 93 | self.valError = self.calcVal(pred, target) 94 | 95 | return self.valError 96 | 97 | 98 | def calcVal(self, pred, target): 99 | raise NotImplementedError() 100 | 101 | 102 | def reset(self): 103 | self.error = None 104 | 
self.valError = None 105 | 106 | self.grad = None 107 | 108 | 109 | def checkDataShape(self, pred, target): 110 | pass 111 | 112 | 113 | def checkValDataShape(self, pred, target): 114 | pass 115 | 116 | 117 | def getBatchsize(self, pred): 118 | return pred.shape[0] 119 | -------------------------------------------------------------------------------- /Datasets/MnistLoader.py: -------------------------------------------------------------------------------- 1 | import os, struct, array 2 | 3 | import numpy as np 4 | import h5py 5 | 6 | from PuzzleLib.Datasets.DataLoader import DataLoader 7 | 8 | 9 | class MnistLoader(DataLoader): 10 | def __init__(self, onSample=None, cachename="mnist.hdf"): 11 | super().__init__(("data", "labels"), cachename) 12 | 13 | if onSample: 14 | self.onSample = onSample 15 | else: 16 | self.onSample = lambda smp: np.array(smp, dtype=np.float32).reshape((1, 28, 28)) / 255.0 17 | 18 | self.testdata = "t10k-images.idx3-ubyte" 19 | self.testlabels = "t10k-labels.idx1-ubyte" 20 | 21 | self.traindata = "train-images.idx3-ubyte" 22 | self.trainlabels = "train-labels.idx1-ubyte" 23 | 24 | 25 | def load(self, path, compress="gzip", log=True): 26 | self.cachename = os.path.join(path, self.cachename) 27 | 28 | if not os.path.exists(self.cachename): 29 | imgs, lbls = [], [] 30 | 31 | if log: 32 | print("[%s] Started unpacking ..." % self.__class__.__name__) 33 | 34 | for filename in [self.testlabels, self.trainlabels]: 35 | with open(os.path.join(path, filename), "rb") as file: 36 | magic, size = struct.unpack(">II", file.read(8)) 37 | 38 | trueMagic = 2049 39 | if magic != trueMagic: 40 | raise ValueError("Bad magic number (got %s, expected %s)" % (magic, trueMagic)) 41 | 42 | lbls += array.array("B", file.read()) 43 | 44 | for filename in [self.testdata, self.traindata]: 45 | with open(os.path.join(path, filename), "rb") as file: 46 | magic, size, rows, cols = struct.unpack(">IIII", file.read(16)) 47 | 48 | trueMagic = 2051 49 | if magic != trueMagic: 50 | raise ValueError("Bad magic number (got %s, expected %s)" % (magic, trueMagic)) 51 | 52 | data = array.array("B", file.read()) 53 | datsize = rows * cols 54 | 55 | for i in range(size): 56 | dat = data[i * datsize:(i+1) * datsize] 57 | imgs.append(dat) 58 | 59 | images = np.empty((len(imgs), 1, rows, cols), dtype=np.float32) 60 | labels = np.empty((len(imgs), ), dtype=np.int32) 61 | 62 | print("[%s] Building cache ..." 
% self.__class__.__name__) 63 | 64 | for i in range(len(lbls)): 65 | images[i] = self.onSample(imgs[i]) 66 | labels[i] = lbls[i] 67 | 68 | with h5py.File(self.cachename, "w") as hdf: 69 | dsetname, lblsetname = self.datanames 70 | hdf.create_dataset(dsetname, data=images, compression=compress) 71 | hdf.create_dataset(lblsetname, data=labels, compression=compress) 72 | 73 | hdf = h5py.File(self.cachename, "r") 74 | dsetname, lblsetname = self.datanames 75 | return hdf[dsetname], hdf[lblsetname] 76 | 77 | 78 | def unittest(): 79 | mnist = MnistLoader() 80 | mnist.load(path="../TestData/") 81 | mnist.clear() 82 | 83 | 84 | if __name__ == "__main__": 85 | unittest() 86 | -------------------------------------------------------------------------------- /Optimizers/Adam.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | 5 | from PuzzleLib import Config 6 | 7 | from PuzzleLib.Backend import gpuarray 8 | from PuzzleLib.Backend.Kernels.ElementWise import adamKer 9 | 10 | from PuzzleLib.Optimizers.Optimizer import Optimizer, trainSimpleTest, trainHardTest 11 | 12 | 13 | class Adam(Optimizer): 14 | def __init__(self, alpha=1e-3, beta1=0.9, beta2=0.999, epsilon=1e-8, nodeinfo=None): 15 | super().__init__(nodeinfo) 16 | 17 | self.alpha = None 18 | self.beta1 = None 19 | self.beta2 = None 20 | self.epsilon = None 21 | 22 | self.setAttr("alpha", alpha) 23 | self.setAttr("beta1", beta1) 24 | self.setAttr("beta2", beta2) 25 | self.setAttr("epsilon", epsilon) 26 | 27 | 28 | def setupState(self, var): 29 | return { 30 | "mg": gpuarray.zeros(var.data.shape, dtype=np.float32), 31 | "ms": gpuarray.zeros(var.data.shape, dtype=np.float32) 32 | } 33 | 34 | 35 | def updateVar(self, var, state, stream=None): 36 | fix1, fix2 = 1.0 - self.beta1**self.t, 1.0 - self.beta2**self.t 37 | self.learnRate = self.alpha * math.sqrt(fix2) / fix1 38 | 39 | fix1, fix2 = 1.0 - self.beta1, 1.0 - self.beta2 40 | adamKer(var.data.dtype)( 41 | var.data, var.grad, state["mg"], state["ms"], self.learnRate * var.learnRate, fix1, fix2, self.epsilon, 42 | stream=stream 43 | ) 44 | 45 | 46 | def unittest(): 47 | for dtype, atol in gpuarray.dtypesSupported(): 48 | calcTest(dtype, atol) 49 | trainSimpleTest(Adam, dtype, alpha=1e-2) 50 | 51 | if Config.backend == Config.Backend.cuda: 52 | trainHardTest(Adam, dtype, alpha=1e-2) 53 | 54 | 55 | def calcTest(dtype, atol): 56 | alpha, beta1, beta2, epsilon = 0.01, 0.9, 0.999, 1e-8 57 | shape = (11, 13) 58 | 59 | hostW, hostDw = np.random.randn(*shape).astype(dtype), np.random.randn(*shape).astype(dtype) 60 | hostMs, hostMg = (1.0 + np.random.randn(*shape)**2).astype(np.float32), np.random.randn(*shape).astype(np.float32) 61 | 62 | w, dw = gpuarray.to_gpu(hostW), gpuarray.to_gpu(hostDw) 63 | ms, mg = gpuarray.to_gpu(hostMs), gpuarray.to_gpu(hostMg) 64 | 65 | fix1, fix2 = 1.0 - beta1, 1.0 - beta2 66 | lr = alpha * math.sqrt(fix2) / fix1 67 | 68 | fix1, fix2 = 1.0 - beta1, 1.0 - beta2 69 | adamKer(w.dtype)(w, dw, mg, ms, lr, fix1, fix2, epsilon) 70 | 71 | hostW, hostDw = hostW.astype(np.float32), hostDw.astype(np.float32) 72 | 73 | hostMg = (1 - fix1) * hostMg + fix1 * hostDw 74 | hostMs = (1 - fix2) * hostMs + fix2 * hostDw**2 75 | hostW += lr * hostMg / (np.sqrt(hostMs) + epsilon) 76 | 77 | hostW, hostDw = hostW.astype(dtype), hostDw.astype(dtype) 78 | 79 | assert np.allclose(hostMg, mg.get(), atol=atol) 80 | assert np.allclose(hostMs, ms.get(), atol=atol) 81 | assert np.allclose(hostW, w.get(), atol=atol) 82 | 83 | 84 | 
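# For reference, a NumPy restatement of the bias-corrected step that updateVar
# above performs through adamKer and that calcTest checks numerically; adamStep
# is an illustrative name, not part of the library API.
import math
import numpy as np

def adamStep(w, dw, mg, ms, t, alpha=1e-3, beta1=0.9, beta2=0.999, epsilon=1e-8):
	lr = alpha * math.sqrt(1.0 - beta2**t) / (1.0 - beta1**t)  # bias-corrected rate
	mg = beta1 * mg + (1.0 - beta1) * dw        # first moment: mean of gradients
	ms = beta2 * ms + (1.0 - beta2) * dw**2     # second moment: mean of squared gradients
	w = w + lr * mg / (np.sqrt(ms) + epsilon)   # ascent convention, as in calcTest
	return w, mg, ms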
if __name__ == "__main__": 85 | unittest() 86 | -------------------------------------------------------------------------------- /Converter/TensorRT/Tests/MnistLenetTest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Datasets import MnistLoader 5 | 6 | from PuzzleLib.Containers import * 7 | from PuzzleLib.Modules import * 8 | from PuzzleLib.Handlers import * 9 | from PuzzleLib.Optimizers import MomentumSGD 10 | from PuzzleLib.Cost import CrossEntropy 11 | 12 | from PuzzleLib.Converter.TensorRT.Tests.Common import benchModels 13 | from PuzzleLib.Converter.TensorRT.BuildRTEngine import buildRTEngine, RTDataType 14 | from PuzzleLib.Converter.TensorRT.DataCalibrator import DataCalibrator 15 | 16 | 17 | def buildNet(): 18 | seq = Sequential(name="lenet-5-like") 19 | seq.append(Conv2D(1, 16, 3)) 20 | seq.append(MaxPool2D()) 21 | seq.append(Activation(relu)) 22 | 23 | seq.append(Conv2D(16, 32, 4)) 24 | seq.append(MaxPool2D()) 25 | seq.append(Activation(relu)) 26 | 27 | seq.append(Flatten()) 28 | seq.append(Linear(32 * 5 * 5, 1024)) 29 | seq.append(Activation(relu)) 30 | 31 | seq.append(Linear(1024, 10)) 32 | 33 | return seq 34 | 35 | 36 | def trainNet(net, data, labels, epochs): 37 | optimizer = MomentumSGD() 38 | optimizer.setupOn(net, useGlobalState=True) 39 | optimizer.learnRate = 0.1 40 | optimizer.momRate = 0.9 41 | 42 | cost = CrossEntropy(maxlabels=10) 43 | trainer = Trainer(net, cost, optimizer) 44 | validator = Validator(net, cost) 45 | 46 | for i in range(epochs): 47 | trainer.trainFromHost( 48 | data[:60000], labels[:60000], macroBatchSize=60000, 49 | onMacroBatchFinish=lambda train: print("Train error: %s" % train.cost.getMeanError()) 50 | ) 51 | print("Accuracy: %s" % (1.0 - validator.validateFromHost(data[60000:], labels[60000:], macroBatchSize=10000))) 52 | 53 | optimizer.learnRate *= 0.9 54 | 55 | 56 | def validate(net, data, labels, batchsize=1): 57 | cost = CrossEntropy(maxlabels=10) 58 | validator = Validator(net, cost, batchsize=batchsize) 59 | 60 | return 1.0 - validator.validateFromHost(data[60000:], labels[60000:], macroBatchSize=10000) 61 | 62 | 63 | def main(): 64 | mnist = MnistLoader() 65 | data, labels = mnist.load(path="../TestData/") 66 | data, labels = data[:], labels[:] 67 | print("Loaded mnist") 68 | 69 | np.random.seed(1234) 70 | 71 | net = buildNet() 72 | trainNet(net, data, labels, 15) 73 | 74 | calibrator = DataCalibrator(data[:60000], cachename="../TestData/mnist_calibration_cache.bin") 75 | net.evalMode() 76 | 77 | engine = buildRTEngine( 78 | net, inshape=data[:1].shape, savepath="../TestData", dtype=RTDataType.int8, calibrator=calibrator 79 | ) 80 | 81 | benchModels(net, engine, gpuarray.to_gpu(data[:1])) 82 | 83 | print("Net accuracy: %s" % validate(net, data, labels)) 84 | print("Engine accuracy: %s" % validate(engine, data, labels, batchsize=1)) 85 | 86 | 87 | if __name__ == "__main__": 88 | main() 89 | --------------------------------------------------------------------------------
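# For reference, the shape arithmetic behind Linear(32 * 5 * 5, 1024) in
# buildNet of MnistLenetTest.py above, assuming 28x28 MNIST inputs and the
# defaults visible in this section: MaxPool2D() pools 2x2 with stride 2, and
# the convolutions are taken as unpadded with stride 1 (consistent with the
# 32 * 5 * 5 flatten size); convOut and poolOut are illustrative helpers.
def convOut(size, filt):
	return size - filt + 1              # unpadded convolution, stride 1

def poolOut(size):
	return (size - 2) // 2 + 1          # MaxPool2D() defaults: size=2, stride=2

size = poolOut(convOut(28, 3))          # Conv2D(1, 16, 3) -> 26x26, pool -> 13x13
size = poolOut(convOut(size, 4))        # Conv2D(16, 32, 4) -> 10x10, pool -> 5x5
assert 32 * size * size == 32 * 5 * 5   # 800 inputs to the first Linear layer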