├── Hip ├── .gitignore ├── Kernels │ ├── Pad.py │ ├── Pool.py │ ├── Costs.py │ ├── MatVec.py │ ├── Memory.py │ ├── PRelu.py │ ├── Embedder.py │ ├── RadixSort.py │ ├── Upsample.py │ └── CTC.py ├── Wrappers │ ├── RocBlas.py │ └── MIOpenNorm.py ├── Benchmarks │ └── ConvSpeed.py ├── Source │ └── Build.py ├── GPUArray.py ├── Utils.py └── CheckInstall.py ├── Cuda ├── Source │ ├── .gitignore │ └── TraceMalloc │ │ └── .gitignore ├── .gitignore ├── Wrappers │ └── CuDnnMemory.py └── Benchmarks │ └── ConvSpeed.py ├── Intel ├── Libs │ └── .gitignore ├── ThirdParty │ └── finddnnl.py ├── Benchmarks │ └── ConvSpeed.py └── Wrappers │ └── DNNLBlas.py ├── .gitignore ├── Compiler ├── TestData │ └── .gitignore ├── Codegen │ ├── PyDefines │ │ ├── Generate.py │ │ └── PyDefines.h │ ├── Vector │ │ ├── TVector.h │ │ ├── Generate.py │ │ └── TVector.c │ ├── Map │ │ ├── TMap.h │ │ └── Generate.py │ ├── Malloc │ │ ├── TMalloc.h │ │ ├── TMalloc.c │ │ ├── Generate.py │ │ └── TMallocTest.c │ └── Tree │ │ ├── TTree.h │ │ └── Generate.py └── Compilers │ ├── NVCC.py │ └── GCC.py ├── Converter ├── TestData │ └── .gitignore ├── MXNet │ └── .gitignore ├── OpenVINO │ ├── TestData │ │ └── .gitignore │ ├── .gitignore │ ├── Tests │ │ ├── ResNet50Test.py │ │ ├── Common.py │ │ └── GraphTest.py │ ├── Source │ │ └── Build.py │ └── VINOEngine.py ├── TensorRT │ ├── TestData │ │ └── .gitignore │ ├── .gitignore │ ├── Tests │ │ ├── UNetTest.py │ │ ├── WaveToLetterTest.py │ │ ├── ResNet50Test.py │ │ ├── Common.py │ │ ├── GraphTest.py │ │ └── MnistLenetTest.py │ ├── Source │ │ └── Plugins.cpp │ └── DataCalibrator.py ├── Caffe │ ├── .gitignore │ └── ConvertBlob.py └── Examples │ ├── NiN.py │ ├── VGG.py │ ├── Inception.py │ ├── ResNet.py │ └── Common.py ├── TestData ├── .gitignore ├── test.tar └── test.zip ├── requirements.txt ├── MANIFEST.in ├── Handlers └── __init__.py ├── Containers └── __init__.py ├── Datasets ├── __init__.py ├── DataLoader.py ├── TarLoader.py ├── ZipLoader.py ├── PathLoader.py └── MnistLoader.py ├── Cost ├── __init__.py ├── MSE.py ├── Abs.py ├── SmoothL1.py └── Cost.py ├── Optimizers ├── __init__.py ├── Hooks.py ├── SGD.py ├── AdaGrad.py ├── MomentumSGD.py ├── NesterovSGD.py ├── RMSProp.py ├── AdaDelta.py ├── SMORMS3.py └── Adam.py ├── TestLib ├── NormFilters.py ├── OptimizeNet.py ├── CnnMnistLenet.py ├── GradientCheck.py ├── RnnIMDBTrain.py ├── BiRnnIMDBTrain.py ├── CnnIMDBTrain.py ├── ResumeTrain.py ├── EncoderTrain.py ├── MultiGPUMnist.py ├── MultiGPUCifar10.py └── CnnCifar10Simple.py ├── Models └── Nets │ ├── __init__.py │ └── LeNet.py ├── Transformers ├── Transformer.py ├── Generator.py └── Serial.py ├── Modules ├── Identity.py ├── LRN.py ├── Flatten.py ├── Add.py ├── Penalty.py ├── Pool1D.py ├── Replicate.py ├── Mul.py ├── Pool2D.py ├── CrossMapLRN.py ├── MulAddConst.py ├── Gelu.py ├── Slice.py ├── Glue.py ├── AvgPool2D.py ├── Transpose.py ├── MapLRN.py ├── MaxPool2D.py ├── Pool3D.py ├── SoftMax.py ├── SwapAxes.py ├── AvgPool3D.py ├── AvgPool1D.py ├── MaxPool3D.py └── SpatialTf.py ├── Backend ├── Kernels │ ├── Embedder.py │ ├── Pad.py │ ├── PRelu.py │ ├── Pool.py │ ├── Upsample.py │ ├── MatVec.py │ └── Costs.py └── Memory.py ├── Config.py ├── Variable.py ├── README.md └── CPU ├── Wrappers └── NumpyBlas.py └── Kernels └── Upsample2D.py /Hip/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | -------------------------------------------------------------------------------- /Cuda/Source/.gitignore: 
-------------------------------------------------------------------------------- 1 | *.gen.c 2 | *.gen.h 3 | -------------------------------------------------------------------------------- /Intel/Libs/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | 3 | .idea 4 | .DS_Store 5 | -------------------------------------------------------------------------------- /Compiler/TestData/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /Converter/TestData/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /Cuda/Source/TraceMalloc/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /Converter/MXNet/.gitignore: -------------------------------------------------------------------------------- 1 | *.params 2 | *.json 3 | 4 | *.hdf 5 | -------------------------------------------------------------------------------- /Converter/OpenVINO/TestData/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /Converter/TensorRT/TestData/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /TestData/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | 3 | !test.tar 4 | !test.zip 5 | 6 | !.gitignore 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | h5py 3 | Pillow 4 | graphviz 5 | colorama 6 | pybind11 7 | -------------------------------------------------------------------------------- /TestData/test.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/puzzlelib/PuzzleLib/HEAD/TestData/test.tar -------------------------------------------------------------------------------- /TestData/test.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/puzzlelib/PuzzleLib/HEAD/TestData/test.zip -------------------------------------------------------------------------------- /Cuda/.gitignore: -------------------------------------------------------------------------------- 1 | *.exp 2 | *.lib 3 | *.ilk 4 | 5 | *.idb 6 | *.pdb 7 | 8 | *.obj 9 | *.pyd 10 | 11 | *.o 12 | *.so 13 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | global-exclude .gitignore 2 | 3 | exclude MANIFEST.in 4 | 5 | include *.py 6 | include LICENSE 7 | include requirements.txt 
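The Codegen template headers that appear later in this dump (TVector.h, TMap.h, TMalloc.h, TTree.h) are not valid C on their own: their $T/$K/$V/${NAME} placeholders follow Python string.Template syntax, and the Generate.py scripts next to them presumably substitute concrete names and types to produce the *.gen.h/*.gen.c files that the Cuda/Source/.gitignore above excludes (PyDefines/Generate.py below, for instance, emits PyDefines.gen.h). A minimal, hypothetical sketch of one such instantiation, assuming plain string.Template semantics and made-up names:

from string import Template

# Hypothetical sketch, not a file from this repository: instantiate the
# TVector.h template as a vector of ints named "IntVector".
with open("Compiler/Codegen/Vector/TVector.h") as f:
	template = Template(f.read())

header = template.substitute(NAME="IntVector", T="int", HEADER_PREAMBULE="")

# The .gen.h suffix matches the pattern ignored by Cuda/Source/.gitignore.
with open("IntVector.gen.h", "w") as f:
	f.write(header)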
-------------------------------------------------------------------------------- /Converter/OpenVINO/.gitignore: -------------------------------------------------------------------------------- 1 | *.exp 2 | *.lib 3 | *.ilk 4 | 5 | *.idb 6 | *.pdb 7 | 8 | *.obj 9 | *.pyd 10 | 11 | *.o 12 | *.so 13 | -------------------------------------------------------------------------------- /Converter/Caffe/.gitignore: -------------------------------------------------------------------------------- 1 | *.whl 2 | *.proto 3 | *.exe 4 | 5 | caffe_pb2.py 6 | 7 | *.caffemodel 8 | *.binaryproto 9 | 10 | *.pkl 11 | *.hdf 12 | -------------------------------------------------------------------------------- /Converter/TensorRT/.gitignore: -------------------------------------------------------------------------------- 1 | *.json 2 | 3 | *.exp 4 | *.lib 5 | *.ilk 6 | 7 | *.idb 8 | *.pdb 9 | 10 | *.obj 11 | *.pyd 12 | 13 | *.o 14 | *.so 15 | -------------------------------------------------------------------------------- /Handlers/__init__.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Handlers.Calculator import Calculator 2 | from PuzzleLib.Handlers.Trainer import Trainer 3 | from PuzzleLib.Handlers.Validator import Validator 4 | -------------------------------------------------------------------------------- /Hip/Kernels/Pad.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.Pad import backendTest 2 | 3 | 4 | def unittest(): 5 | from PuzzleLib.Hip import Backend 6 | backendTest(Backend) 7 | 8 | 9 | if __name__ == "__main__": 10 | unittest() 11 | -------------------------------------------------------------------------------- /Hip/Kernels/Pool.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.Pool import backendTest 2 | 3 | 4 | def unittest(): 5 | from PuzzleLib.Hip import Backend 6 | backendTest(Backend) 7 | 8 | 9 | if __name__ == "__main__": 10 | unittest() 11 | -------------------------------------------------------------------------------- /Containers/__init__.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Containers.Container import Container 2 | from PuzzleLib.Containers.Graph import Graph 3 | from PuzzleLib.Containers.Parallel import Parallel 4 | from PuzzleLib.Containers.Sequential import Sequential 5 | -------------------------------------------------------------------------------- /Hip/Kernels/Costs.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.Costs import backendTest 2 | 3 | 4 | def unittest(): 5 | from PuzzleLib.Hip import Backend 6 | backendTest(Backend) 7 | 8 | 9 | if __name__ == "__main__": 10 | unittest() 11 | -------------------------------------------------------------------------------- /Hip/Kernels/MatVec.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.MatVec import backendTest 2 | 3 | 4 | def unittest(): 5 | from PuzzleLib.Hip import Backend 6 | backendTest(Backend) 7 | 8 | 9 | if __name__ == "__main__": 10 | unittest() 11 | -------------------------------------------------------------------------------- /Hip/Kernels/Memory.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.Memory import backendTest 2 | 3 | 4 | def unittest(): 5 | from 
PuzzleLib.Hip import Backend 6 | backendTest(Backend) 7 | 8 | 9 | if __name__ == "__main__": 10 | unittest() 11 | -------------------------------------------------------------------------------- /Hip/Kernels/PRelu.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.PRelu import backendTest 2 | 3 | 4 | def unittest(): 5 | from PuzzleLib.Hip import Backend 6 | backendTest(Backend) 7 | 8 | 9 | if __name__ == "__main__": 10 | unittest() 11 | -------------------------------------------------------------------------------- /Hip/Kernels/Embedder.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.Embedder import backendTest 2 | 3 | 4 | def unittest(): 5 | from PuzzleLib.Hip import Backend 6 | backendTest(Backend) 7 | 8 | 9 | if __name__ == "__main__": 10 | unittest() 11 | -------------------------------------------------------------------------------- /Hip/Kernels/RadixSort.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.RadixSort import backendTest 2 | 3 | 4 | def unittest(): 5 | from PuzzleLib.Hip import Backend 6 | backendTest(Backend) 7 | 8 | 9 | if __name__ == "__main__": 10 | unittest() 11 | -------------------------------------------------------------------------------- /Hip/Kernels/Upsample.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.Upsample import backendTest 2 | 3 | 4 | def unittest(): 5 | from PuzzleLib.Hip import Backend 6 | backendTest(Backend) 7 | 8 | 9 | if __name__ == "__main__": 10 | unittest() 11 | -------------------------------------------------------------------------------- /Hip/Wrappers/RocBlas.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Wrappers.CuBlas import backendTest 2 | 3 | 4 | def unittest(): 5 | from PuzzleLib.Hip import Backend 6 | backendTest(Backend) 7 | 8 | 9 | if __name__ == "__main__": 10 | unittest() 11 | -------------------------------------------------------------------------------- /Compiler/Codegen/PyDefines/Generate.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PuzzleLib.Compiler.Toolchain import copySource 3 | 4 | 5 | def generatePyDefines(path): 6 | dirname = os.path.dirname(__file__) 7 | copySource(os.path.join(dirname, "PyDefines.h"), os.path.join(path, "PyDefines.gen.h")) 8 | -------------------------------------------------------------------------------- /Datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Datasets.Cifar10Loader import Cifar10Loader 2 | from PuzzleLib.Datasets.IMDBLoader import IMDBLoader 3 | from PuzzleLib.Datasets.MnistLoader import MnistLoader 4 | from PuzzleLib.Datasets.PathLoader import PathLoader 5 | from PuzzleLib.Datasets.SmallNorbLoader import SmallNorbLoader 6 | from PuzzleLib.Datasets.TarLoader import TarLoader 7 | from PuzzleLib.Datasets.ZipLoader import ZipLoader 8 | -------------------------------------------------------------------------------- /Datasets/DataLoader.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | class DataLoader: 5 | def __init__(self, datanames=None, cachename=None): 6 | self.cachename = cachename 7 | 8 | if datanames is None: 9 | self.datanames = ["data"] 10 | else: 11 | if 
isinstance(datanames, list) or isinstance(datanames, tuple): 12 | self.datanames = datanames 13 | else: 14 | self.datanames = [datanames] 15 | 16 | 17 | def clear(self): 18 | if os.path.exists(self.cachename): 19 | os.remove(self.cachename) 20 | -------------------------------------------------------------------------------- /Cost/__init__.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cost.Abs import Abs 2 | from PuzzleLib.Cost.BCE import BCE 3 | from PuzzleLib.Cost.CrossEntropy import CrossEntropy 4 | from PuzzleLib.Cost.CTC import CTC 5 | from PuzzleLib.Cost.Hinge import Hinge 6 | from PuzzleLib.Cost.KLDivergence import KLDivergence 7 | from PuzzleLib.Cost.L1Hinge import L1Hinge 8 | from PuzzleLib.Cost.MSE import MSE 9 | from PuzzleLib.Cost.Multi import Multi 10 | from PuzzleLib.Cost.SmoothL1 import SmoothL1 11 | from PuzzleLib.Cost.SVM import SVM 12 | -------------------------------------------------------------------------------- /Optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Optimizers.AdaDelta import AdaDelta 2 | from PuzzleLib.Optimizers.AdaGrad import AdaGrad 3 | from PuzzleLib.Optimizers.Adam import Adam 4 | from PuzzleLib.Optimizers.MomentumSGD import MomentumSGD 5 | from PuzzleLib.Optimizers.NesterovSGD import NesterovSGD 6 | from PuzzleLib.Optimizers.RMSProp import RMSProp 7 | from PuzzleLib.Optimizers.RMSPropGraves import RMSPropGraves 8 | from PuzzleLib.Optimizers.SGD import SGD 9 | from PuzzleLib.Optimizers.SMORMS3 import SMORMS3 10 | -------------------------------------------------------------------------------- /Hip/Benchmarks/ConvSpeed.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Hip.Backend import getBackend 4 | from PuzzleLib.Cuda.Benchmarks.ConvSpeed import timeConv 5 | 6 | 7 | def main(): 8 | datashape = (128, 32, 64, 64) 9 | Wshape = (64, 32, 11, 11) 10 | 11 | stride, pad, dilation, groups = 1, 0, 1, datashape[1] // Wshape[1] 12 | 13 | backend = getBackend(initmode=1) 14 | timeConv(backend, datashape, Wshape, np.float32, stride, pad, dilation, groups) 15 | 16 | 17 | if __name__ == "__main__": 18 | main() 19 | -------------------------------------------------------------------------------- /Optimizers/Hooks.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend.Kernels.ElementWise import weightDecayKer 4 | 5 | 6 | class Hook: 7 | def __call__(self, var, state, stream=None): 8 | raise NotImplementedError() 9 | 10 | 11 | class WeightDecay(Hook): 12 | def __init__(self, rate): 13 | self.rate = rate 14 | 15 | 16 | def __call__(self, var, state, stream=None): 17 | assert var.grad.dtype == np.float32 18 | if var.wc > 0.0: 19 | weightDecayKer(var.grad, var.data, self.rate * var.wc, stream=stream) 20 | -------------------------------------------------------------------------------- /TestLib/NormFilters.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Backend import gpuarray 2 | 3 | from PuzzleLib.Modules import SubtractMean, LCN 4 | from PuzzleLib.Visual import loadImage, showImage 5 | 6 | 7 | def main(): 8 | subtractMean = SubtractMean(size=7) 9 | lcn = LCN(N=7) 10 | 11 | img = gpuarray.to_gpu(loadImage("../TestData/Bench.png")) 12 | 13 | subtractMean(img) 14 | showImage(subtractMean.data.get(), 
"../TestData/ResultSubtractNorm.png") 15 | 16 | lcn(img) 17 | showImage(lcn.data.get(), "../TestData/ResultLCN.png") 18 | 19 | 20 | if __name__ == "__main__": 21 | main() 22 | -------------------------------------------------------------------------------- /Models/Nets/__init__.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Models.Nets.Inception import loadInceptionBN, loadInceptionV3 2 | from PuzzleLib.Models.Nets.LeNet import loadLeNet 3 | from PuzzleLib.Models.Nets.MiniYolo import loadMiniYolo 4 | from PuzzleLib.Models.Nets.NiN import loadNiNImageNet 5 | from PuzzleLib.Models.Nets.OpenPoseCOCO import loadCOCO 6 | from PuzzleLib.Models.Nets.OpenPoseMPI import loadMPI 7 | from PuzzleLib.Models.Nets.ResNet import loadResNet 8 | from PuzzleLib.Models.Nets.UNet import loadUNet 9 | from PuzzleLib.Models.Nets.VGG import loadVGG 10 | from PuzzleLib.Models.Nets.WaveToLetter import loadW2L 11 | -------------------------------------------------------------------------------- /Transformers/Transformer.py: -------------------------------------------------------------------------------- 1 | class Transformer: 2 | def __call__(self, batch, threadidx): 3 | return batch 4 | 5 | 6 | def unittest(): 7 | from PuzzleLib.Transformers.Merger import Merger 8 | from PuzzleLib.Datasets.ZipLoader import ZipLoader 9 | 10 | zipfile = ZipLoader() 11 | data1 = zipfile.load("../TestData/test.zip") 12 | data2 = zipfile.load("../TestData/test.zip") 13 | 14 | with Merger([data1, data2]) as merger: 15 | merger.addTransformer(Transformer()) 16 | 17 | for _ in range(10): 18 | merger.prepareData(chunksize=4, permutate=False) 19 | merger.getData() 20 | 21 | 22 | if __name__ == "__main__": 23 | unittest() 24 | -------------------------------------------------------------------------------- /Cuda/Wrappers/CuDnnMemory.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.Memory import transposeTest, moveAxisTest, swapAxesTest, depthConcatTest 2 | 3 | 4 | def unittest(): 5 | from PuzzleLib.Cuda import Backend 6 | backendTest(Backend) 7 | 8 | 9 | def backendTest(Backend): 10 | for deviceIdx in range(Backend.getDeviceCount()): 11 | bnd = Backend.getBackend(deviceIdx, initmode=1) 12 | 13 | for dtype, _ in bnd.dtypesSupported(): 14 | transposeTest(bnd, bnd.dnn, dtype) 15 | moveAxisTest(bnd, bnd.dnn, dtype) 16 | swapAxesTest(bnd, bnd.dnn, dtype) 17 | depthConcatTest(bnd, bnd.dnn, dtype) 18 | 19 | 20 | if __name__ == "__main__": 21 | unittest() 22 | -------------------------------------------------------------------------------- /Compiler/Codegen/Vector/TVector.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | $HEADER_PREAMBULE 7 | typedef struct $NAME 8 | { 9 | $T *ptr; 10 | size_t size, capacity; 11 | } 12 | $NAME; 13 | 14 | 15 | void ${NAME}_init($NAME *self); 16 | void ${NAME}_dealloc($NAME *self); 17 | 18 | void ${NAME}_reserve($NAME *self, size_t capacity); 19 | void ${NAME}_append($NAME *self, $T elem); 20 | void ${NAME}_appendEmpty($NAME *self); 21 | bool ${NAME}_pop($NAME *self, $T *elem); 22 | void ${NAME}_clear($NAME *self); 23 | bool ${NAME}_get($NAME *self, size_t index, $T *elem); 24 | bool ${NAME}_set($NAME *self, size_t index, $T elem); 25 | -------------------------------------------------------------------------------- /Hip/Kernels/CTC.py: 
-------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Kernels.CTC import CTCModule, backendTest 2 | 3 | 4 | class HipCTCModule(CTCModule): 5 | @staticmethod 6 | def generateConfig(backend): 7 | return [ 8 | (backend.warpSize, 1), 9 | (backend.warpSize * 2, 1), 10 | (backend.warpSize, 3), 11 | (backend.warpSize * 2, 2), 12 | (backend.warpSize, 6), 13 | (backend.warpSize * 2, 4), 14 | (backend.warpSize, 9), 15 | (backend.warpSize * 2, 6), 16 | (backend.warpSize * 2, 9), 17 | (backend.warpSize * 2, 10) 18 | ] 19 | 20 | 21 | def unittest(): 22 | from PuzzleLib.Hip import Backend 23 | backendTest(Backend, HipCTCModule) 24 | 25 | 26 | if __name__ == "__main__": 27 | unittest() 28 | -------------------------------------------------------------------------------- /Compiler/Codegen/Map/TMap.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <stddef.h> 4 | #include <stdbool.h> 5 | 6 | $HEADER_PREAMBULE 7 | struct ${NAME}_Bucket; 8 | 9 | 10 | typedef struct ${NAME}_Bucket 11 | { 12 | $K key; 13 | $V value; 14 | 15 | struct ${NAME}_Bucket *next; 16 | } 17 | ${NAME}_Bucket; 18 | 19 | 20 | typedef struct $NAME 21 | { 22 | ${NAME}_Bucket **ptr; 23 | size_t size, log2capacity; 24 | } 25 | $NAME; 26 | 27 | 28 | void ${NAME}_init($NAME *self); 29 | void ${NAME}_dealloc($NAME *self); 30 | 31 | bool ${NAME}_insert($NAME *self, $K key, $V value); 32 | bool ${NAME}_delete($NAME *self, $K key); 33 | bool ${NAME}_get($NAME *self, $K key, $V *value); 34 | void ${NAME}_clear($NAME *self); 35 | -------------------------------------------------------------------------------- /Converter/Examples/NiN.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | Config.globalEvalMode = True 3 | 4 | from PuzzleLib.Backend import gpuarray 5 | from PuzzleLib.Models.Nets import loadNiNImageNet 6 | 7 | from PuzzleLib.Converter.Examples.Common import loadSample, loadLabels, showLabelResults 8 | 9 | 10 | def main(): 11 | net = loadNiNImageNet(modelpath="../TestData/nin_imagenet.hdf") 12 | 13 | sample = loadSample("../TestData/barometer.jpg") 14 | labels = loadLabels(synpath="../TestData/synsets.txt", wordpath="../TestData/synset_words.txt") 15 | 16 | res = net(gpuarray.to_gpu(sample)).get().reshape(-1) 17 | showLabelResults(res, labels, header="NiN") 18 | 19 | 20 | if __name__ == "__main__": 21 | main() 22 | -------------------------------------------------------------------------------- /Transformers/Generator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Transformers.Provider import Provider 4 | from PuzzleLib.Transformers.Transformer import Transformer 5 | 6 | 7 | class Generator(Provider): 8 | def getNextChunk(self, chunksize, **kwargs): 9 | return None 10 | 11 | 12 | class TestGenTransformer(Transformer): 13 | def __call__(self, batch, threadidx): 14 | return np.random.randn(10, 3, 4, 4).astype(np.float32) 15 | 16 | 17 | def unittest(): 18 | with Generator(numofthreads=4) as generator: 19 | generator.addTransformer(TestGenTransformer()) 20 | 21 | generator.prepareData() 22 | assert generator.getData().shape == (40, 3, 4, 4) 23 | 24 | 25 | if __name__ == "__main__": 26 | unittest() 27 | -------------------------------------------------------------------------------- /Compiler/Codegen/Malloc/TMalloc.h: -------------------------------------------------------------------------------- 1 |
#pragma once 2 | 3 | #include <stdlib.h> 4 | #include <stdbool.h> 5 | 6 | 7 | #if defined(ENABLE_TRACE_MALLOC) 8 | #define TRACE_MALLOC(size) ${NAME}_malloc(size, __FILE__, __LINE__) 9 | #define TRACE_FREE(ptr) ${NAME}_free(ptr) 10 | 11 | #else 12 | #define TRACE_MALLOC(size) malloc(size) 13 | #define TRACE_FREE(ptr) free(ptr) 14 | 15 | #endif 16 | 17 | 18 | void *${NAME}_malloc(size_t size, const char *file, int line); 19 | void ${NAME}_free(void *ptr); 20 | 21 | size_t ${NAME}_traceLeaks(void); 22 | 23 | bool ${NAME}_Iterator_init(void); 24 | void ${NAME}_Iterator_dealloc(void); 25 | 26 | bool ${NAME}_Iterator_move(void); 27 | void ${NAME}_Iterator_item(size_t *size, const char **file, int *line); 28 | -------------------------------------------------------------------------------- /Converter/TensorRT/Tests/UNetTest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Models.Nets.UNet import loadUNet 5 | 6 | from PuzzleLib.Converter.TensorRT.Tests.Common import benchModels 7 | from PuzzleLib.Converter.TensorRT.BuildRTEngine import buildRTEngine 8 | 9 | 10 | def main(): 11 | net = loadUNet(None) 12 | data = gpuarray.to_gpu(np.random.randn(1, 1, 256, 256).astype(np.float32)) 13 | 14 | engine = buildRTEngine(net, inshape=data.shape, savepath="../TestData") 15 | 16 | net.evalMode() 17 | outdata = net(data) 18 | 19 | enginedata = engine(data) 20 | 21 | assert np.allclose(outdata.get(), enginedata.get()) 22 | benchModels(net, engine, data) 23 | 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /Datasets/TarLoader.py: -------------------------------------------------------------------------------- 1 | import tarfile 2 | 3 | from PuzzleLib.Datasets.InputLoader import InputLoader 4 | 5 | 6 | class TarLoader(InputLoader): 7 | def checkInput(self, archivename): 8 | if not tarfile.is_tarfile(archivename): 9 | raise RuntimeError("'%s' is not tar file" % archivename) 10 | 11 | 12 | def openInput(self, archivename): 13 | return tarfile.open(archivename) 14 | 15 | 16 | def loadFilelist(self, archive): 17 | return [file for file in archive.getnames() if any([file.lower().endswith(ext) for ext in self.exts])] 18 | 19 | 20 | def openFile(self, archive, file): 21 | return archive.extractfile(file) 22 | 23 | 24 | def unittest(): 25 | loader = TarLoader() 26 | loader.load("../TestData/test.tar", maxsamples=5, filepacksize=3) 27 | loader.clear() 28 | 29 | 30 | if __name__ == "__main__": 31 | unittest() 32 | -------------------------------------------------------------------------------- /Datasets/ZipLoader.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | 3 | from PuzzleLib.Datasets.InputLoader import InputLoader 4 | 5 | 6 | class ZipLoader(InputLoader): 7 | def checkInput(self, archivename): 8 | if not zipfile.is_zipfile(archivename): 9 | raise RuntimeError("'%s' is not zip file" % archivename) 10 | 11 | 12 | def openInput(self, archivename): 13 | return zipfile.ZipFile(archivename) 14 | 15 | 16 | def loadFilelist(self, archive): 17 | return [file for file in archive.namelist() if any([file.lower().endswith(ext) for ext in self.exts])] 18 | 19 | 20 | def openFile(self, archive, file): 21 | return archive.open(file) 22 | 23 | 24 | def unittest(): 25 | loader = ZipLoader() 26 | loader.load("../TestData/test.zip", maxsamples=5, filepacksize=3) 27 | loader.clear() 28 | 29 | 30
| if __name__ == "__main__": 31 | unittest() 32 | -------------------------------------------------------------------------------- /Converter/TensorRT/Tests/WaveToLetterTest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | Config.globalEvalMode = True 5 | 6 | from PuzzleLib.Backend import gpuarray 7 | from PuzzleLib.Models.Nets.WaveToLetter import loadW2L 8 | 9 | from PuzzleLib.Converter.TensorRT.Tests.Common import benchModels 10 | from PuzzleLib.Converter.TensorRT.BuildRTEngine import buildRTEngine 11 | 12 | 13 | def main(): 14 | inmaps = 161 15 | net = loadW2L(None, inmaps, nlabels=29) 16 | 17 | data = gpuarray.to_gpu(np.random.randn(1, inmaps, 200).astype(np.float32)) 18 | engine = buildRTEngine(net, inshape=data.shape, savepath="../TestData") 19 | 20 | net.evalMode() 21 | outdata = net(data) 22 | 23 | enginedata = engine(data) 24 | 25 | assert np.allclose(outdata.get(), enginedata.get(), atol=1e-7) 26 | benchModels(net, engine, data) 27 | 28 | 29 | if __name__ == "__main__": 30 | main() 31 | -------------------------------------------------------------------------------- /Cuda/Benchmarks/ConvSpeed.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PuzzleLib.Cuda.Backend import getBackend 3 | 4 | 5 | def main(): 6 | datashape = (128, 32, 64, 64) 7 | Wshape = (64, 32, 11, 11) 8 | 9 | stride, pad, dilation, groups = 1, 0, 1, datashape[1] // Wshape[1] 10 | timeConv(getBackend(initmode=1), datashape, Wshape, np.float32, stride, pad, dilation, groups) 11 | 12 | 13 | def timeConv(backend, datashape, Wshape, dtype, stride, pad, dilation, groups): 14 | fwdResults, bwdDataResults, bwdFilterResults = backend.convNdbenchmark( 15 | datashape, Wshape, dtype, stride, pad, dilation, groups 16 | ) 17 | 18 | print("Forward results:") 19 | for res in fwdResults: 20 | print(res) 21 | 22 | print("\nBackward filter results:") 23 | for res in bwdFilterResults: 24 | print(res) 25 | 26 | print("\nBackward data results:") 27 | for res in bwdDataResults: 28 | print(res) 29 | 30 | 31 | if __name__ == "__main__": 32 | main() 33 | -------------------------------------------------------------------------------- /Modules/Identity.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Modules.Module import Module 5 | 6 | 7 | class Identity(Module): 8 | def __init__(self, name=None): 9 | super().__init__(name) 10 | 11 | self.movesData = True 12 | self.movesGrad = True 13 | 14 | 15 | def updateData(self, data): 16 | self.data = data 17 | 18 | 19 | def updateGrad(self, grad): 20 | self.grad = grad 21 | 22 | 23 | def dataShapeFrom(self, shape): 24 | return shape 25 | 26 | 27 | def gradShapeFrom(self, shape): 28 | return shape 29 | 30 | 31 | def calcMode(self, T): 32 | self.calctype = T 33 | 34 | 35 | def unittest(): 36 | data = gpuarray.to_gpu(np.random.normal(0.0, 0.01, (10, 3, 40, 40)).astype(np.float32)) 37 | 38 | identity = Identity() 39 | identity(data) 40 | 41 | assert np.allclose(data.get(), identity.data.get()) 42 | 43 | 44 | if __name__ == "__main__": 45 | unittest() 46 | -------------------------------------------------------------------------------- /Converter/TensorRT/Tests/ResNet50Test.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | Config.globalEvalMode = 
True 3 | 4 | from PuzzleLib.Backend import gpuarray 5 | from PuzzleLib.Models.Nets.ResNet import loadResNet 6 | 7 | from PuzzleLib.Converter.Examples.Common import loadResNetSample, loadLabels 8 | 9 | from PuzzleLib.Converter.TensorRT.Tests.Common import scoreModels, benchModels 10 | from PuzzleLib.Converter.TensorRT.BuildRTEngine import buildRTEngine 11 | 12 | 13 | def main(): 14 | net = loadResNet(modelpath="../../TestData/ResNet-50-model.hdf", layers="50") 15 | 16 | data = gpuarray.to_gpu(loadResNetSample(net, "../../TestData/tarantula.jpg")) 17 | labels = loadLabels(synpath="../../TestData/synsets.txt", wordpath="../../TestData/synset_words.txt") 18 | 19 | engine = buildRTEngine(net, inshape=data.shape, savepath="../TestData") 20 | 21 | scoreModels(net, engine, data, labels) 22 | benchModels(net, engine, data) 23 | 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /Modules/LRN.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Modules.Module import ModuleError, Module 2 | 3 | 4 | class LRN(Module): 5 | def __init__(self, N=5, alpha=1e-4, beta=0.75, K=2.0, name=None): 6 | super().__init__(name) 7 | self.registerBlueprint(locals()) 8 | 9 | self.N = N 10 | self.alpha = alpha 11 | self.beta = beta 12 | self.K = K 13 | 14 | self.workspace = None 15 | 16 | 17 | def dataShapeFrom(self, shape): 18 | return shape 19 | 20 | 21 | def checkDataShape(self, shape): 22 | if len(shape) != 4: 23 | raise ModuleError("Data must be 4d tensor") 24 | 25 | 26 | def gradShapeFrom(self, shape): 27 | return shape 28 | 29 | 30 | def checkGradShape(self, shape): 31 | if len(shape) != 4: 32 | raise ModuleError("Grad must be 4d tensor") 33 | 34 | 35 | def updateData(self, data): 36 | raise NotImplementedError() 37 | 38 | 39 | def updateGrad(self, grad): 40 | raise NotImplementedError() 41 | 42 | 43 | def reset(self): 44 | super().reset() 45 | self.workspace = None 46 | -------------------------------------------------------------------------------- /Converter/OpenVINO/Tests/ResNet50Test.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | 3 | Config.backend = Config.Backend.intel 4 | Config.globalEvalMode = True 5 | 6 | from PuzzleLib.Backend import gpuarray 7 | from PuzzleLib.Models.Nets.ResNet import loadResNet 8 | 9 | from PuzzleLib.Converter.Examples.Common import loadResNetSample, loadLabels 10 | 11 | from PuzzleLib.Converter.OpenVINO.Tests.Common import scoreModels, benchModels 12 | from PuzzleLib.Converter.OpenVINO.BuildVINOEngine import buildVINOEngine 13 | 14 | 15 | def main(): 16 | net = loadResNet(modelpath="../../TestData/ResNet-50-model.hdf", layers="50") 17 | 18 | data = gpuarray.to_gpu(loadResNetSample(net, "../../TestData/tarantula.jpg")) 19 | labels = loadLabels(synpath="../../TestData/synsets.txt", wordpath="../../TestData/synset_words.txt") 20 | 21 | engine = buildVINOEngine(net, inshape=data.shape, savepath="../TestData") 22 | 23 | scoreModels(net, engine, data, labels) 24 | benchModels(net, engine, data) 25 | 26 | 27 | if __name__ == "__main__": 28 | main() 29 | -------------------------------------------------------------------------------- /Converter/OpenVINO/Tests/Common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend.gpuarray import timeKernel 4 | 5 | 6 | def scoreModels(net, engine, data, labels): 
7 | hostNetData = net(data).get() 8 | hostEngineData = engine(data).get() 9 | 10 | assert np.allclose(hostNetData, hostEngineData, atol=1e-6) 11 | 12 | printResults(hostNetData, labels, "Net") 13 | printResults(hostEngineData, labels, "Engine") 14 | 15 | 16 | def printResults(probs, labels, name): 17 | probs = probs.flatten() 18 | 19 | idx = (-probs).argsort()[:5] 20 | print("%s top-5 predictions: " % name) 21 | 22 | for i in range(5): 23 | print("#%s %s (prob=%s)" % (i, labels[idx[i]], probs[idx[i]])) 24 | 25 | 26 | def benchModels(net, engine, data): 27 | net.optimizeForShape(data.shape) 28 | 29 | nettime = timeKernel(net, args=(data, ), looplength=100, log=False, normalize=True) 30 | enginetime = timeKernel(engine, args=(data, ), looplength=100, log=False, normalize=True) 31 | 32 | print("Net time: host=%.10f" % nettime) 33 | print("Engine time: host=%.10f" % enginetime) 34 | -------------------------------------------------------------------------------- /Backend/Kernels/Embedder.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | 3 | 4 | embed = None 5 | embedBackwardParams = None 6 | 7 | 8 | def autoinit(): 9 | if not Config.shouldInit(): 10 | return 11 | 12 | if Config.backend == Config.Backend.cuda: 13 | initCuda() 14 | elif Config.backend == Config.Backend.hip: 15 | initHip() 16 | elif Config.isCPUBased(Config.backend): 17 | initCPU() 18 | else: 19 | raise Config.ConfigError(Config.backend) 20 | 21 | 22 | def initCuda(): 23 | from PuzzleLib.Cuda import Backend 24 | initGPU(Backend) 25 | 26 | 27 | def initHip(): 28 | from PuzzleLib.Hip import Backend 29 | initGPU(Backend) 30 | 31 | 32 | def initGPU(Backend): 33 | backend = Backend.getBackend(Config.deviceIdx, initmode=2, logger=Config.getLogger()) 34 | memoryPool, embedmod = backend.memoryPool, backend.embedmod 35 | 36 | def wrapEmbed(data, W): 37 | return embedmod.embed(data, W, memoryPool) 38 | 39 | global embed, embedBackwardParams 40 | embed = wrapEmbed 41 | embedBackwardParams = embedmod.embedBackwardParams 42 | 43 | 44 | def initCPU(): 45 | pass 46 | 47 | 48 | autoinit() 49 | -------------------------------------------------------------------------------- /Intel/ThirdParty/finddnnl.py: -------------------------------------------------------------------------------- 1 | import sys, os, ctypes 2 | 3 | 4 | def findDNNL(): 5 | versions = ["1.91", "1.2", "1.1"] 6 | 7 | if sys.platform == "linux": 8 | libnames = ["libdnnl.so.%s" % v for v in versions] 9 | libnames += ["/usr/local/lib/%s" % libname for libname in libnames] 10 | 11 | elif sys.platform == "darwin": 12 | libnames = ["/usr/local/lib/libdnnl.%s.dylib" % v for v in versions] 13 | 14 | elif sys.platform == "win32": 15 | libpaths = [ 16 | os.environ.get("DNNL_PATH", ""), 17 | os.path.normpath(os.path.join(os.path.dirname(__file__), "../Libs/")) 18 | ] 19 | 20 | libnames = [os.path.join(libpath, "dnnl.dll") for libpath in libpaths] 21 | 22 | else: 23 | raise RuntimeError("Unsupported platform for dnnl") 24 | 25 | cloader = ctypes.windll if sys.platform == "win32" else ctypes.cdll 26 | 27 | for libname in libnames: 28 | try: 29 | clib = cloader.LoadLibrary(libname) 30 | 31 | except OSError: 32 | pass 33 | 34 | else: 35 | return libname, clib 36 | 37 | raise OSError("dnnl library not found (searched for following version(s): %s)" % versions) 38 | -------------------------------------------------------------------------------- /Converter/TensorRT/Tests/Common.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend.gpuarray import timeKernel 4 | 5 | 6 | def scoreModels(net, engine, data, labels): 7 | hostNetData = net(data).get() 8 | hostEngineData = engine(data).get() 9 | 10 | assert np.allclose(hostNetData, hostEngineData) 11 | 12 | printResults(hostNetData, labels, "Net") 13 | printResults(hostEngineData, labels, "Engine") 14 | 15 | 16 | def printResults(probs, labels, name): 17 | probs = probs.flatten() 18 | 19 | idx = (-probs).argsort()[:5] 20 | print("%s top-5 predictions: " % name) 21 | 22 | for i in range(5): 23 | print("#%s %s (prob=%s)" % (i, labels[idx[i]], probs[idx[i]])) 24 | 25 | 26 | def benchModels(net, engine, data): 27 | net.optimizeForShape(data.shape) 28 | 29 | nettime = timeKernel(net, args=(data, ), looplength=100, log=False, normalize=True) 30 | enginetime = timeKernel(engine, args=(data, ), looplength=100, log=False, normalize=True) 31 | 32 | print("Net time: device=%.10f host=%.10f" % (nettime[0], nettime[1])) 33 | print("Engine time: device=%.10f host=%.10f" % (enginetime[0], enginetime[1])) 34 | -------------------------------------------------------------------------------- /Converter/Caffe/ConvertBlob.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | import numpy as np 4 | import h5py 5 | 6 | 7 | def saveAttr(data, name, filename): 8 | hdf = h5py.File(filename, mode="a") 9 | 10 | modelname = next(iter(hdf["links"].keys())).split(sep=".")[0] 11 | 12 | attrGrpName = "attrs.%s" % modelname 13 | 14 | attrGrp = hdf.require_group(attrGrpName) 15 | attrGrp.create_dataset("%s.%s" % (modelname, name), data=data) 16 | 17 | 18 | def main(): 19 | binaryname = "ResNet_mean.binaryproto" 20 | modelname = "ResNet-50-model.hdf" 21 | attrName = "mean" 22 | 23 | subprocess.check_call(["protoc", "--proto_path", ".", "--python_out", ".", "caffe.proto"]) 24 | print("Compiled caffe.proto") 25 | 26 | from PuzzleLib.Converter.Caffe import caffe_pb2 27 | blob = caffe_pb2.BlobProto() 28 | 29 | msg = open(binaryname, "rb").read() 30 | 31 | print("Started parsing binaryproto %s ..." 
% binaryname) 32 | blob.ParseFromString(msg) 33 | 34 | data = np.array(blob.data, dtype=np.float32).reshape((1, blob.channels, blob.height, blob.width)) 35 | saveAttr(data, attrName, modelname) 36 | 37 | 38 | if __name__ == "__main__": 39 | main() 40 | -------------------------------------------------------------------------------- /Intel/Benchmarks/ConvSpeed.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Intel.Wrappers import DNNL 2 | 3 | 4 | def main(): 5 | datashape = (16, 32, 64, 64) 6 | Wshape = (64, 32, 3, 3) 7 | 8 | stride, pad = 1, 0 9 | timeConv(datashape, Wshape, stride, pad) 10 | 11 | 12 | def timeConv(datashape, Wshape, stride, pad): 13 | fwdResults, bwdFilterResults, bwdDataResults = DNNL.convNdbenchmark(datashape, Wshape, stride, pad) 14 | 15 | formatstr = "%-40s %-25s %-28s" 16 | 17 | print("Forward results:") 18 | for res in fwdResults: 19 | print(formatstr % ( 20 | "Algo %s" % res.algo, "time %.6f secs" % res.time, "memory %.6f mbytes" % (res.memory / 1024**2) 21 | )) 22 | 23 | print("\nBackward filter results:") 24 | for res in bwdFilterResults: 25 | print(formatstr % ( 26 | "Algo %s" % res.algo, "time %.6f secs" % res.time, "memory %.6f mbytes" % (res.memory / 1024**2) 27 | )) 28 | 29 | print("\nBackward data results:") 30 | for res in bwdDataResults: 31 | print(formatstr % ( 32 | "Algo %s" % DNNL.ConvAlgo(res.algo), "time %.6f secs" % res.time, 33 | "memory %.6f mbytes" % (res.memory / 1024**2) 34 | )) 35 | 36 | 37 | if __name__ == "__main__": 38 | main() 39 | -------------------------------------------------------------------------------- /Config.py: -------------------------------------------------------------------------------- 1 | import sys, multiprocessing, logging 2 | from enum import Enum 3 | 4 | 5 | class ConfigError(Exception): 6 | pass 7 | 8 | 9 | class Backend(Enum): 10 | cuda = 0 11 | hip = 1 12 | cpu = 2 13 | intel = 3 14 | 15 | 16 | backend = Backend.cuda 17 | deviceIdx = 0 18 | 19 | 20 | allowMultiContext = False 21 | systemLog = False 22 | logger = None 23 | 24 | 25 | libname = "PuzzleLib" 26 | 27 | 28 | globalEvalMode = False 29 | disableDtypeShapeChecks = False 30 | disableModuleCompatChecks = False 31 | verifyData = False 32 | showWarnings = True 33 | 34 | 35 | def isCPUBased(bnd): 36 | return bnd in {Backend.cpu, Backend.intel} 37 | 38 | 39 | def shouldInit(): 40 | return multiprocessing.current_process().name == "MainProcess" or allowMultiContext 41 | 42 | 43 | def getLogger(): 44 | global logger 45 | 46 | if logger is not None: 47 | return logger 48 | 49 | logger = logging.getLogger(libname) 50 | logger.setLevel(logging.DEBUG if systemLog else logging.INFO) 51 | 52 | handler = logging.StreamHandler(stream=sys.stdout) 53 | handler.setFormatter(logging.Formatter("[%(name)s] %(message)s")) 54 | 55 | logger.addHandler(handler) 56 | return logger 57 | -------------------------------------------------------------------------------- /Converter/Examples/VGG.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | Config.globalEvalMode = True 3 | 4 | from PuzzleLib.Backend import gpuarray 5 | from PuzzleLib.Models.Nets.VGG import loadVGG 6 | 7 | from PuzzleLib.Converter.Examples.Common import loadVGGSample, loadLabels, showLabelResults 8 | 9 | 10 | def main(): 11 | vgg16Test() 12 | vgg19Test() 13 | 14 | 15 | def vgg16Test(): 16 | net = loadVGG(modelpath="../TestData/VGG_ILSVRC_16_layers.hdf", layers="16") 17 | 18 | sample = 
loadVGGSample("../TestData/tarantula.jpg") 19 | labels = loadLabels(synpath="../TestData/synsets.txt", wordpath="../TestData/synset_words.txt") 20 | 21 | res = net(gpuarray.to_gpu(sample)).get().reshape(-1) 22 | showLabelResults(res, labels, header=net.name) 23 | 24 | 25 | def vgg19Test(): 26 | net = loadVGG(modelpath="../TestData/VGG_ILSVRC_19_layers.hdf", layers="19") 27 | 28 | sample = loadVGGSample("../TestData/tarantula.jpg") 29 | labels = loadLabels(synpath="../TestData/synsets.txt", wordpath="../TestData/synset_words.txt") 30 | 31 | res = net(gpuarray.to_gpu(sample)).get().reshape(-1) 32 | showLabelResults(res, labels, header=net.name) 33 | 34 | 35 | if __name__ == "__main__": 36 | main() 37 | -------------------------------------------------------------------------------- /Hip/Source/Build.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Cuda.Source.Build import prepareCompiler, generateTemplates, collectCoreSources, collectLibSources 2 | 3 | 4 | def buildDriver(debugmode, verbose): 5 | cc = prepareCompiler(debugmode, verbose) 6 | prepareHip(cc) 7 | 8 | generateTemplates(path="../../Cuda/Source") 9 | 10 | driver = "../Driver" + cc.pydext 11 | cc.build(driver, collectSources(path="../../Cuda/Source")).clearPath("..") 12 | 13 | return driver 14 | 15 | 16 | def prepareHip(cc): 17 | cc.cppMode(True).addDefine("__HIP_PLATFORM_HCC__") 18 | cc.cflags.extend(["-x", "c++"]) 19 | 20 | cc.addLibrary( 21 | "hip", 22 | [ 23 | ".", "/opt/rocm/hsa/include", "/opt/rocm/hip/include", 24 | "/opt/rocm/hiprand/include", "/opt/rocm/rocrand/include", 25 | "/opt/rocm/rocblas/include", "/opt/rocm/miopen/include" 26 | ], 27 | ["/opt/rocm/hip/lib", "/opt/rocm/hiprand/lib", "/opt/rocm/rocblas/lib", "/opt/rocm/miopen/lib"], 28 | ["hip_hcc", "hiprtc", "hiprand", "rocblas"] 29 | ) 30 | 31 | 32 | def collectSources(path): 33 | return collectCoreSources(path) + collectLibSources(path) 34 | 35 | 36 | def main(): 37 | return buildDriver(debugmode=0, verbose=2) 38 | 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /TestLib/OptimizeNet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Models.Nets.VGG import loadVGG 5 | 6 | from PuzzleLib.Optimizers import SGD 7 | from PuzzleLib.Cost import CrossEntropy 8 | from PuzzleLib.Handlers import Trainer 9 | 10 | 11 | def main(): 12 | net = loadVGG(None, "16") 13 | 14 | batchsize = 16 15 | size = (batchsize, 3, 224, 224) 16 | 17 | batch = np.random.normal(size=size).astype(dtype=np.float32) 18 | batch = gpuarray.to_gpu(batch) 19 | 20 | labels = np.random.randint(low=0, high=1000, size=(batchsize, ), dtype=np.int32) 21 | labels = gpuarray.to_gpu(labels) 22 | 23 | optimizer = SGD() 24 | optimizer.setupOn(net) 25 | 26 | cost = CrossEntropy(maxlabels=1000) 27 | trainer = Trainer(net, cost, optimizer) 28 | 29 | print("Started benchmarking %s ..." 
% net.name) 30 | gpuarray.timeKernel( 31 | trainer.train, args=(batch, labels), looplength=100, logname="Before optimizing %s" % net.name, normalize=True 32 | ) 33 | 34 | net.optimizeForShape(size) 35 | gpuarray.timeKernel( 36 | trainer.train, args=(batch, labels), looplength=100, logname="After optimizing %s" % net.name, normalize=True 37 | ) 38 | 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /Compiler/Codegen/Tree/TTree.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <stddef.h> 4 | #include <stdbool.h> 5 | 6 | $HEADER_PREAMBULE 7 | struct ${NAME}_Node; 8 | 9 | 10 | typedef struct ${NAME}_Node 11 | { 12 | bool red; 13 | struct ${NAME}_Node *links[2]; 14 | 15 | $K key; 16 | $V value; 17 | } 18 | ${NAME}_Node; 19 | 20 | 21 | typedef struct $NAME 22 | { 23 | ${NAME}_Node *root; 24 | size_t size; 25 | } 26 | $NAME; 27 | 28 | 29 | void ${NAME}_init($NAME *self); 30 | void ${NAME}_dealloc($NAME *self); 31 | bool ${NAME}_validate($NAME *self); 32 | 33 | bool ${NAME}_insert($NAME *self, $K key, $V value); 34 | bool ${NAME}_delete($NAME *self, $K key); 35 | bool ${NAME}_get($NAME *self, $K key, $V *value); 36 | void ${NAME}_clear($NAME *self); 37 | 38 | 39 | typedef struct ${NAME}_Iterator 40 | { 41 | $NAME *map; 42 | ${NAME}_Node *node; 43 | 44 | ${NAME}_Node *path[16 * sizeof(size_t)]; 45 | size_t top; 46 | } 47 | ${NAME}_Iterator; 48 | 49 | 50 | bool ${NAME}_Iterator_init(${NAME}_Iterator *self, $NAME *map, bool atLeft); 51 | void ${NAME}_Iterator_dealloc(${NAME}_Iterator *self); 52 | 53 | bool ${NAME}_Iterator_move(${NAME}_Iterator *self, bool toRight); 54 | void ${NAME}_Iterator_item(${NAME}_Iterator *self, $K *key, $V *value); 55 | -------------------------------------------------------------------------------- /Converter/Examples/Inception.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | Config.globalEvalMode = True 3 | 4 | from PuzzleLib.Backend import gpuarray 5 | from PuzzleLib.Models.Nets.Inception import loadInceptionBN, loadInceptionV3 6 | 7 | from PuzzleLib.Converter.Examples.Common import loadVGGSample, loadLabels, loadV3Labels, showLabelResults 8 | 9 | 10 | def main(): 11 | inceptionBNTest() 12 | inceptionV3Test() 13 | 14 | 15 | def inceptionBNTest(): 16 | net = loadInceptionBN(modelpath="../TestData/Inception-BN-0126.hdf") 17 | 18 | sample = loadVGGSample("../TestData/tarantula.jpg") 19 | labels = loadLabels(synpath="../TestData/synsets.txt", wordpath="../TestData/synset_words.txt") 20 | 21 | res = net(gpuarray.to_gpu(sample)).get().reshape(-1) 22 | showLabelResults(res, labels, header=net.name) 23 | 24 | 25 | def inceptionV3Test(): 26 | net = loadInceptionV3(modelpath="../TestData/Inception-7-0001.hdf") 27 | 28 | sample = loadVGGSample("../TestData/tarantula.jpg", shape=(299, 299), normalize=True) 29 | labels = loadV3Labels(filename="../TestData/synset_inception_v3.txt") 30 | 31 | res = net(gpuarray.to_gpu(sample)).get().reshape(-1) 32 | showLabelResults(res, labels, header=net.name) 33 | 34 | 35 | if __name__ == "__main__": 36 | main() 37 | -------------------------------------------------------------------------------- /Models/Nets/LeNet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Containers.Sequential import Sequential 5 | 6 | from
PuzzleLib.Modules.Conv2D import Conv2D 7 | from PuzzleLib.Modules.MaxPool2D import MaxPool2D 8 | from PuzzleLib.Modules.Activation import Activation, relu 9 | from PuzzleLib.Modules.Flatten import Flatten 10 | from PuzzleLib.Modules.Linear import Linear 11 | 12 | 13 | def loadLeNet(modelpath, initscheme="none", name="lenet-5-like"): 14 | net = Sequential(name=name) 15 | 16 | net.append(Conv2D(1, 16, 3, initscheme=initscheme)) 17 | net.append(MaxPool2D()) 18 | net.append(Activation(relu)) 19 | 20 | net.append(Conv2D(16, 32, 4, initscheme=initscheme)) 21 | net.append(MaxPool2D()) 22 | net.append(Activation(relu)) 23 | 24 | net.append(Flatten()) 25 | net.append(Linear(32 * 5 * 5, 1024, initscheme=initscheme)) 26 | net.append(Activation(relu)) 27 | 28 | net.append(Linear(1024, 10, initscheme=initscheme)) 29 | 30 | if modelpath is not None: 31 | net.load(modelpath) 32 | 33 | return net 34 | 35 | 36 | def unittest(): 37 | data = gpuarray.to_gpu(np.random.randn(1, 1, 28, 28).astype(np.float32)) 38 | 39 | net = loadLeNet(None) 40 | net(data) 41 | 42 | 43 | if __name__ == "__main__": 44 | unittest() 45 | -------------------------------------------------------------------------------- /Modules/Flatten.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Modules.Module import Module 5 | 6 | 7 | class Flatten(Module): 8 | def __init__(self, name=None): 9 | super().__init__(name) 10 | 11 | self.movesData = True 12 | self.movesGrad = True 13 | 14 | self.inshape = None 15 | 16 | 17 | def updateData(self, data): 18 | self.inshape = data.shape 19 | self.data = data.reshape(data.shape[0], int(np.prod(data.shape[1:]))) 20 | 21 | 22 | def updateGrad(self, grad): 23 | self.grad = grad.reshape(self.inshape) 24 | 25 | 26 | def dataShapeFrom(self, shape): 27 | return shape[0], int(np.prod(shape[1:])) 28 | 29 | 30 | def gradShapeFrom(self, shape): 31 | return (shape[0], ) + self.inshape[1:] 32 | 33 | 34 | def calcMode(self, T): 35 | self.calctype = T 36 | 37 | 38 | def unittest(): 39 | data = gpuarray.to_gpu(np.random.randn(10, 10, 10, 10).astype(np.float32)) 40 | 41 | flatten = Flatten() 42 | flatten(data) 43 | 44 | shape = (10, 1000) 45 | assert flatten.data.shape == shape 46 | 47 | grad = gpuarray.to_gpu(np.random.randn(*flatten.data.shape).astype(np.float32)) 48 | flatten.backward(grad) 49 | 50 | assert flatten.grad.shape == data.shape 51 | 52 | 53 | if __name__ == "__main__": 54 | unittest() 55 | -------------------------------------------------------------------------------- /Converter/TensorRT/Tests/GraphTest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | 5 | from PuzzleLib.Containers import Graph 6 | from PuzzleLib.Modules import Linear, Activation, relu, Add 7 | 8 | from PuzzleLib.Converter.TensorRT.Tests.Common import benchModels 9 | from PuzzleLib.Converter.TensorRT.BuildRTEngine import buildRTEngine 10 | 11 | 12 | def main(): 13 | batchsize, insize = 16, 1000 14 | 15 | inNode = Linear(insize, 1000, name="linear1").node() 16 | node = Activation(relu, name="relu1").node(inNode) 17 | 18 | node1 = Linear(1000, 800, name="linear2").node(node) 19 | node1 = Activation(relu, name="relu2").node(node1) 20 | 21 | node2 = Linear(1000, 800, name="linear3").node(node) 22 | node2 = Activation(relu, name="relu3").node(node2) 23 | 24 | outNode = Add(name="add").node(node1, node2) 25 | 26 | 
graph = Graph(inputs=inNode, outputs=outNode, name="graph") 27 | engine = buildRTEngine(graph, (batchsize, insize), savepath="../TestData") 28 | 29 | data = gpuarray.to_gpu(np.random.randn(batchsize, insize).astype(np.float32)) 30 | 31 | outdata = graph(data) 32 | enginedata = engine(data) 33 | 34 | assert np.allclose(outdata.get(), enginedata.get(), atol=1e-6) 35 | benchModels(graph, engine, data) 36 | 37 | 38 | if __name__ == "__main__": 39 | main() 40 | -------------------------------------------------------------------------------- /Variable.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | from PuzzleLib.Backend import gpuarray 3 | 4 | 5 | class Variable: 6 | index = 0 7 | 8 | 9 | def __init__(self, data, name=None, withgrad=True, grad=None, updater=None, postUpdater=None): 10 | if name is None: 11 | self.name = str(type(self).index) 12 | type(self).index += 1 13 | else: 14 | self.name = name 15 | 16 | self.data = data 17 | self.updater = updater 18 | 19 | if updater is not None: 20 | return 21 | 22 | self.postUpdater = postUpdater 23 | self.grad = None 24 | 25 | if grad is not None: 26 | self.grad = grad 27 | 28 | elif withgrad and not Config.globalEvalMode: 29 | self.grad = gpuarray.zeros(shape=self.data.shape, dtype=self.data.dtype) 30 | 31 | self.learnRate, self.momRate = 1.0, 1.0 32 | self.wc = 0.0 33 | 34 | 35 | @property 36 | def hasUpdater(self): 37 | return self.updater is not None 38 | 39 | 40 | @property 41 | def hasPostUpdater(self): 42 | return self.postUpdater is not None 43 | 44 | 45 | def update(self, learnRate): 46 | self.updater(self, learnRate) 47 | 48 | 49 | def postUpdate(self): 50 | self.postUpdater(self) 51 | 52 | 53 | def set(self, variable): 54 | self.data.set(variable.data) 55 | 56 | if self.grad is not None: 57 | self.grad.set(variable.grad) 58 | -------------------------------------------------------------------------------- /Compiler/Compilers/NVCC.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from PuzzleLib.Compiler.Compilers.GCC import GCCLike 4 | from PuzzleLib.Compiler.Compilers.MSVC import MSVC 5 | 6 | 7 | class NVCC(GCCLike): 8 | cc = "nvcc" 9 | 10 | 11 | def __init__(self, verbose=0, forPython=False): 12 | super().__init__(verbose) 13 | cflags = MSVC.cflags if sys.platform == "win32" else self.cflags 14 | 15 | self.cflags = [flag for cflag in cflags for flag in ["-Xcompiler", cflag]] 16 | self.cpp = True 17 | 18 | if not forPython: 19 | self.ldflags = [] 20 | 21 | 22 | def cppMode(self, enabled): 23 | assert False 24 | 25 | 26 | def fullCFlags(self, asObject, debug=True, optimize=True): 27 | oflags = self.fullCppFlags() 28 | 29 | if debug and self.debuglevel > 0: 30 | oflags.extend(["-g", "-G" if self.debuglevel >= 3 else "-lineinfo"]) 31 | 32 | if optimize and self.optlevel > 0: 33 | oflags.append("-O3" if self.optlevel >= 3 else "-O%s" % self.optlevel) 34 | 35 | if self.optlevel >= 3: 36 | oflags.append("-use_fast_math") 37 | 38 | return self.cflags + oflags + ["-I%s" % idir for idir in self.includeDirs] + (["-c"] if asObject else []) 39 | 40 | 41 | def fullCppFlags(self): 42 | return [] if sys.platform == "win32" else ["-std=c++14"] 43 | 44 | 45 | def outFlags(self, extfile): 46 | return ["-o", extfile] 47 | -------------------------------------------------------------------------------- /TestLib/CnnMnistLenet.py: -------------------------------------------------------------------------------- 1 | import numpy as 
np 2 | 3 | from PuzzleLib.Datasets import MnistLoader 4 | from PuzzleLib.Visual import showFilters 5 | from PuzzleLib.Handlers import Trainer, Validator 6 | from PuzzleLib.Optimizers import MomentumSGD 7 | from PuzzleLib.Cost import CrossEntropy 8 | 9 | from PuzzleLib.Models.Nets.LeNet import loadLeNet 10 | 11 | 12 | def main(): 13 | mnist = MnistLoader() 14 | data, labels = mnist.load(path="../TestData/") 15 | data, labels = data[:], labels[:] 16 | print("Loaded mnist") 17 | 18 | np.random.seed(1234) 19 | net = loadLeNet(None, initscheme=None) 20 | 21 | optimizer = MomentumSGD() 22 | optimizer.setupOn(net, useGlobalState=True) 23 | optimizer.learnRate = 0.1 24 | optimizer.momRate = 0.9 25 | 26 | cost = CrossEntropy(maxlabels=10) 27 | trainer = Trainer(net, cost, optimizer) 28 | validator = Validator(net, cost) 29 | 30 | for i in range(15): 31 | trainer.trainFromHost( 32 | data[:60000], labels[:60000], macroBatchSize=60000, 33 | onMacroBatchFinish=lambda train: print("Train error: %s" % train.cost.getMeanError()) 34 | ) 35 | print("Accuracy: %s" % (1.0 - validator.validateFromHost(data[60000:], labels[60000:], macroBatchSize=10000))) 36 | 37 | optimizer.learnRate *= 0.9 38 | 39 | showFilters(net[0].W.get(), "../TestData/conv1.png") 40 | showFilters(net[3].W.get(), "../TestData/conv2.png") 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /Converter/OpenVINO/Tests/GraphTest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | 5 | Config.backend = Config.Backend.intel 6 | Config.globalEvalMode = True 7 | 8 | from PuzzleLib.Backend import gpuarray 9 | 10 | from PuzzleLib.Containers import Graph 11 | from PuzzleLib.Modules import Linear, Activation, relu, Add 12 | 13 | from PuzzleLib.Converter.OpenVINO.Tests.Common import benchModels 14 | from PuzzleLib.Converter.OpenVINO.BuildVINOEngine import buildVINOEngine 15 | 16 | 17 | def main(): 18 | batchsize, insize = 16, 1000 19 | 20 | inNode = Linear(insize, 1000, name="linear1").node() 21 | node = Activation(relu, name="relu1").node(inNode) 22 | 23 | node1 = Linear(1000, 800, name="linear2").node(node) 24 | node1 = Activation(relu, name="relu2").node(node1) 25 | 26 | node2 = Linear(1000, 800, name="linear3").node(node) 27 | node2 = Activation(relu, name="relu3").node(node2) 28 | 29 | outNode = Add(name="add").node(node1, node2) 30 | 31 | graph = Graph(inputs=inNode, outputs=outNode, name="graph") 32 | 33 | data = gpuarray.to_gpu(np.random.randn(batchsize, insize).astype(np.float32)) 34 | 35 | engine = buildVINOEngine(graph, (batchsize, insize), savepath="../TestData") 36 | 37 | outdata = graph(data) 38 | enginedata = engine(data) 39 | 40 | assert np.allclose(outdata.get(), enginedata.get()) 41 | benchModels(graph, engine, data) 42 | 43 | 44 | if __name__ == "__main__": 45 | main() 46 | -------------------------------------------------------------------------------- /Optimizers/SGD.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | 5 | from PuzzleLib.Backend import gpuarray 6 | from PuzzleLib.Backend.Kernels.ElementWise import toVectorAddVectorKer 7 | 8 | from PuzzleLib.Optimizers.Optimizer import Optimizer, trainSimpleTest, trainHardTest 9 | 10 | 11 | class SGD(Optimizer): 12 | def __init__(self, learnRate=1e-3, nodeinfo=None): 13 | super().__init__(nodeinfo) 14 | 
self.setAttr("learnRate", learnRate) 15 | 16 | 17 | def updateVar(self, var, state, stream=None): 18 | toVectorAddVectorKer(var.data.dtype)(var.data, var.grad, self.learnRate * var.learnRate, stream=stream) 19 | 20 | 21 | def unittest(): 22 | for dtype, atol in gpuarray.dtypesSupported(): 23 | calcTest(dtype, atol) 24 | trainSimpleTest(SGD, dtype, learnRate=1e-1) 25 | 26 | if Config.backend == Config.Backend.cuda: 27 | trainHardTest(SGD, dtype, learnRate=1e-1) 28 | 29 | 30 | def calcTest(dtype, atol): 31 | lr = 0.01 32 | shape = (11, 13) 33 | 34 | hostW, hostDw = np.random.randn(*shape).astype(dtype), np.random.randn(*shape).astype(dtype) 35 | 36 | w, dw = gpuarray.to_gpu(hostW), gpuarray.to_gpu(hostDw) 37 | toVectorAddVectorKer(w.dtype)(w, dw, lr) 38 | 39 | hostW, hostDw = hostW.astype(np.float32), hostDw.astype(np.float32) 40 | 41 | hostW += lr * hostDw 42 | hostW, hostDw = hostW.astype(dtype), hostDw.astype(dtype) 43 | 44 | assert np.allclose(hostW, w.get(), atol=atol) 45 | 46 | 47 | if __name__ == "__main__": 48 | unittest() 49 | -------------------------------------------------------------------------------- /Converter/TensorRT/Source/Plugins.cpp: -------------------------------------------------------------------------------- 1 | #include "Plugins.h" 2 | 3 | 4 | PuzzlePlugin::PuzzlePlugin() {} 5 | 6 | PuzzlePlugin::PuzzlePlugin(const void *serialData, size_t serialLength) 7 | { 8 | (void)serialLength; 9 | const char *buffer = static_cast(serialData); 10 | 11 | readValue(buffer, m_inshape); 12 | readValue(buffer, m_outshape); 13 | readValue(buffer, m_datatype); 14 | } 15 | 16 | size_t PuzzlePlugin::getSerializationSize() const 17 | { 18 | return sizeof(m_inshape) + sizeof(m_outshape) + sizeof(m_datatype); 19 | } 20 | 21 | void PuzzlePlugin::serialize(void *serialData) const 22 | { 23 | char *buffer = static_cast(serialData); 24 | 25 | writeValue(buffer, m_inshape); 26 | writeValue(buffer, m_outshape); 27 | writeValue(buffer, m_datatype); 28 | } 29 | 30 | void PuzzlePlugin::setPluginNamespace(const char *pluginNamespace) { m_ns = std::string(pluginNamespace); } 31 | const char *PuzzlePlugin::getPluginNamespace() const { return m_ns.c_str(); } 32 | 33 | 34 | const char *PuzzlePluginCreator::getPluginVersion() const { return version; } 35 | void PuzzlePluginCreator::setPluginNamespace(const char *pluginNamespace) { m_ns = std::string(pluginNamespace); } 36 | const char *PuzzlePluginCreator::getPluginNamespace() const { return m_ns.c_str(); } 37 | 38 | 39 | const char *PuzzlePluginCreator::version = "1"; 40 | const char *PuzzlePluginCreator::reflectPad1DName = "reflectpad1d"; 41 | const char *PuzzlePluginCreator::instNorm2DName = "instnorm2d"; 42 | -------------------------------------------------------------------------------- /Hip/GPUArray.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Cuda.GPUArray import extendGPUArray, arithmTest 4 | 5 | from PuzzleLib.Hip import Driver as HipDriver 6 | from PuzzleLib.Hip.SourceModule import HipEltwiseKernel, HipEltHalf2Kernel, HipReductionKernel 7 | 8 | 9 | HipGPUArray = extendGPUArray(HipDriver, HipEltwiseKernel, HipEltHalf2Kernel, HipReductionKernel) 10 | 11 | 12 | def unittest(): 13 | from PuzzleLib.Hip import Backend 14 | 15 | for deviceIdx in range(Backend.getDeviceCount()): 16 | bnd = Backend.getBackend(deviceIdx) 17 | 18 | for dtype, _ in bnd.dtypesSupported(): 19 | arithmTest(bnd, dtype) 20 | memoryTest(bnd, dtype) 21 | 22 | 23 | def memoryTest(bnd, dtype): 
24 | hostA = np.random.randn(10, 10).astype(dtype) 25 | a = bnd.GPUArray.toGpu(hostA) 26 | 27 | b = a[:, :6] 28 | hostB = hostA[:, :6] 29 | 30 | assert np.allclose(hostB.reshape((2, 5, 6)), b.reshape(2, 5, 6).get()) 31 | assert np.allclose(hostB.reshape((5, 2, 3, 2)), b.reshape(5, 2, 3, 2).get()) 32 | assert np.allclose(hostB.reshape((10, 1, 6)), b.reshape(10, 1, 6).get()) 33 | 34 | hostA = np.random.randn(10, 10, 10).astype(dtype) 35 | a = bnd.GPUArray.toGpu(hostA) 36 | 37 | b = a[:, :, :6] 38 | assert np.allclose(hostA[:, :, :6], b.get()) 39 | 40 | hostB = np.random.randn(*b.shape).astype(dtype) 41 | b.set(hostB) 42 | assert np.allclose(hostB, b.get()) 43 | 44 | hostB = b.get() 45 | b = a[:, :6, :6] 46 | assert np.allclose(hostB[:, :6, :6], b.get()) 47 | 48 | 49 | if __name__ == "__main__": 50 | unittest() 51 | -------------------------------------------------------------------------------- /Backend/Kernels/Pad.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | 3 | 4 | reflectpad1d = None 5 | reflectpad1dBackward = None 6 | 7 | reflectpad2d = None 8 | reflectpad2dBackward = None 9 | 10 | 11 | def autoinit(): 12 | if not Config.shouldInit(): 13 | return 14 | 15 | if Config.backend == Config.Backend.cuda: 16 | initCuda() 17 | elif Config.backend == Config.Backend.hip: 18 | initHip() 19 | elif Config.isCPUBased(Config.backend): 20 | initCPU() 21 | else: 22 | raise Config.ConfigError(Config.backend) 23 | 24 | 25 | def initCuda(): 26 | from PuzzleLib.Cuda import Backend 27 | initGPU(Backend) 28 | 29 | 30 | def initHip(): 31 | from PuzzleLib.Hip import Backend 32 | initGPU(Backend) 33 | 34 | 35 | def initGPU(Backend): 36 | backend = Backend.getBackend(Config.deviceIdx, initmode=2, logger=Config.getLogger()) 37 | memoryPool, padmod = backend.memoryPool, backend.padmod 38 | 39 | def wrapReflectPad(data, pad): 40 | return padmod.reflectpad(data, pad, memoryPool) 41 | 42 | def wrapReflectPadBackward(grad, pad): 43 | return padmod.reflectpadBackward(grad, pad, memoryPool) 44 | 45 | global reflectpad1d, reflectpad1dBackward, reflectpad2d, reflectpad2dBackward 46 | reflectpad1d = reflectpad2d = wrapReflectPad 47 | reflectpad1dBackward = reflectpad2dBackward = wrapReflectPadBackward 48 | 49 | 50 | def initCPU(): 51 | from PuzzleLib.CPU.Kernels import Pad 52 | 53 | global reflectpad1d 54 | reflectpad1d = Pad.reflectpad1d 55 | 56 | global reflectpad2d 57 | reflectpad2d = Pad.reflectpad2d 58 | 59 | 60 | autoinit() 61 | -------------------------------------------------------------------------------- /Backend/Kernels/PRelu.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | 3 | 4 | prelu = None 5 | preluBackwardData = None 6 | preluBackwardParams = None 7 | 8 | 9 | def autoinit(): 10 | if not Config.shouldInit(): 11 | return 12 | 13 | if Config.backend == Config.Backend.cuda: 14 | initCuda() 15 | elif Config.backend == Config.Backend.hip: 16 | initHip() 17 | elif Config.isCPUBased(Config.backend): 18 | initCPU() 19 | else: 20 | raise Config.ConfigError(Config.backend) 21 | 22 | 23 | def initCuda(): 24 | from PuzzleLib.Cuda import Backend 25 | initGPU(Backend) 26 | 27 | 28 | def initHip(): 29 | from PuzzleLib.Hip import Backend 30 | initGPU(Backend) 31 | 32 | 33 | def initGPU(Backend): 34 | backend = Backend.getBackend(Config.deviceIdx, initmode=2, logger=Config.getLogger()) 35 | memoryPool, prelumod = backend.memoryPool, backend.prelumod 36 | 37 | def wrapPRelu(data, 
slopes, inplace, sharedMaps): 38 | return prelumod.prelu(data, slopes, inplace, sharedMaps, memoryPool) 39 | 40 | def wrapPReluBackwardData(grad, slopes, indata, sharedMaps): 41 | return prelumod.preluBackwardData(grad, slopes, indata, sharedMaps, memoryPool) 42 | 43 | def wrapPReluBackwardParams(indata, outgrad, sharedMaps): 44 | return prelumod.preluBackwardParams(indata, outgrad, sharedMaps, memoryPool) 45 | 46 | global prelu, preluBackwardData, preluBackwardParams 47 | prelu = wrapPRelu 48 | preluBackwardData = wrapPReluBackwardData 49 | preluBackwardParams = wrapPReluBackwardParams 50 | 51 | 52 | def initCPU(): 53 | pass 54 | 55 | 56 | autoinit() 57 | -------------------------------------------------------------------------------- /Datasets/PathLoader.py: -------------------------------------------------------------------------------- 1 | import os, shutil, zipfile 2 | 3 | from PuzzleLib.Datasets.InputLoader import InputLoader 4 | 5 | 6 | class PathLoader(InputLoader): 7 | def __init__(self, onFile=None, exts=None, dataname=None, cachename=None, onFileList=None, doOpen=True): 8 | super().__init__(onFile, exts, dataname, cachename, onFileList) 9 | self.doOpen = doOpen 10 | 11 | 12 | class Path: 13 | def __init__(self, path): 14 | self.path = path 15 | 16 | 17 | def __enter__(self): 18 | return self 19 | 20 | 21 | def __exit__(self, exc_type, exc_val, exc_tb): 22 | pass 23 | 24 | 25 | def checkInput(self, path): 26 | if not os.path.exists(path): 27 | raise RuntimeError("Path '%s' does not exist" % path) 28 | 29 | 30 | def openInput(self, path): 31 | return self.Path(path) 32 | 33 | 34 | def loadFilelist(self, path): 35 | lst = [] 36 | 37 | for dirpath, dirnames, filenames in os.walk(path.path): 38 | lst.extend([file for file in filenames if any([file.lower().endswith(ext) for ext in self.exts])]) 39 | 40 | return lst 41 | 42 | 43 | def openFile(self, path, file): 44 | fullname = os.path.join(path.path, file) 45 | return open(fullname, mode="rb") if self.doOpen else fullname 46 | 47 | 48 | def unittest(): 49 | zipname = "../TestData/test.zip" 50 | path = os.path.splitext(zipname)[0] 51 | 52 | zipfile.ZipFile(zipname).extractall(path) 53 | 54 | loader = PathLoader() 55 | loader.load(path) 56 | loader.clear() 57 | 58 | shutil.rmtree(path) 59 | 60 | 61 | if __name__ == "__main__": 62 | unittest() 63 | -------------------------------------------------------------------------------- /Compiler/Codegen/Malloc/TMalloc.c: -------------------------------------------------------------------------------- 1 | #undef NDEBUG 2 | #include <assert.h> 3 | 4 | #include "AllocTree.gen.h" 5 | #include "$HEADER_NAME" 6 | 7 | 8 | static AllocTree allocTree; 9 | static AllocTree_Iterator allocIterator; 10 | 11 | 12 | void *${NAME}_malloc(size_t size, const char *file, int line) 13 | { 14 | void *ptr = malloc(size); 15 | 16 | Allocation alloc; 17 | alloc.size = size; 18 | alloc.file = file; 19 | alloc.line = line; 20 | 21 | bool inserted = AllocTree_insert(&allocTree, ptr, alloc); 22 | assert(inserted); 23 | 24 | return ptr; 25 | } 26 | 27 | 28 | void ${NAME}_free(void *ptr) 29 | { 30 | if (ptr != NULL) 31 | { 32 | Allocation alloc; 33 | 34 | bool found = AllocTree_get(&allocTree, ptr, &alloc); 35 | assert(found); 36 | 37 | bool deleted = AllocTree_delete(&allocTree, ptr); 38 | assert(deleted); 39 | } 40 | 41 | free(ptr); 42 | } 43 | 44 | 45 | size_t ${NAME}_traceLeaks(void) 46 | { 47 | return allocTree.size; 48 | } 49 | 50 | 51 | bool ${NAME}_Iterator_init(void) 52 | { 53 | return AllocTree_Iterator_init(&allocIterator,
&allocTree, true); 54 | } 55 | 56 | 57 | void ${NAME}_Iterator_dealloc(void) 58 | { 59 | AllocTree_Iterator_dealloc(&allocIterator); 60 | } 61 | 62 | 63 | bool ${NAME}_Iterator_move(void) 64 | { 65 | return AllocTree_Iterator_move(&allocIterator, true); 66 | } 67 | 68 | 69 | void ${NAME}_Iterator_item(size_t *size, const char **file, int *line) 70 | { 71 | void *ptr; 72 | Allocation alloc; 73 | 74 | AllocTree_Iterator_item(&allocIterator, &ptr, &alloc); 75 | 76 | *size = alloc.size; 77 | *file = alloc.file; 78 | *line = alloc.line; 79 | } 80 | -------------------------------------------------------------------------------- /Converter/Examples/ResNet.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | Config.globalEvalMode = True 3 | 4 | from PuzzleLib.Backend import gpuarray 5 | from PuzzleLib.Models.Nets.ResNet import loadResNet 6 | 7 | from PuzzleLib.Converter.Examples.Common import loadResNetSample, loadLabels, showLabelResults 8 | 9 | 10 | def main(): 11 | resNet50Test() 12 | resNet101Test() 13 | resNet152Test() 14 | 15 | 16 | def resNet50Test(): 17 | net = loadResNet(modelpath="../TestData/ResNet-50-model.hdf", layers="50") 18 | 19 | sample = loadResNetSample(net, "../TestData/tarantula.jpg") 20 | labels = loadLabels(synpath="../TestData/synsets.txt", wordpath="../TestData/synset_words.txt") 21 | 22 | res = net(gpuarray.to_gpu(sample)).get().reshape(-1) 23 | showLabelResults(res, labels, header=net.name) 24 | 25 | 26 | def resNet101Test(): 27 | net = loadResNet(modelpath="../TestData/ResNet-101-model.hdf", layers="101") 28 | 29 | sample = loadResNetSample(net, "../TestData/tarantula.jpg") 30 | labels = loadLabels(synpath="../TestData/synsets.txt", wordpath="../TestData/synset_words.txt") 31 | 32 | res = net(gpuarray.to_gpu(sample)).get().reshape(-1) 33 | showLabelResults(res, labels, header=net.name) 34 | 35 | 36 | def resNet152Test(): 37 | net = loadResNet(modelpath="../TestData/ResNet-152-model.hdf", layers="152") 38 | 39 | sample = loadResNetSample(net, "../TestData/tarantula.jpg") 40 | labels = loadLabels(synpath="../TestData/synsets.txt", wordpath="../TestData/synset_words.txt") 41 | 42 | res = net(gpuarray.to_gpu(sample)).get().reshape(-1) 43 | showLabelResults(res, labels, header=net.name) 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PuzzleLib 2 | 3 | [PuzzleLib](https://puzzlelib.org) is a high level Deep Learning framework with CPU (Intel/AMD) and GPU (NVIDIA/AMD) support. The library is written in Python, it is modular and dynamic. 4 | 5 | ## Installation 6 | 7 | For detailed instructions on how to install the library and dependencies, see the documentation for [Windows](https://puzzlelib.org/ru/documentation/general/installation/windows/) and [Linux/macOS](https://puzzlelib.org/ru/documentation/general/installation/linux/) installation. 8 | 9 | PuzzleLib supports: 10 | 11 | * NVIDIA GPU (CUDA backend); 12 | * AMD GPU (ROCm backend); 13 | * Intel CPU (mkl-dnn backend); 14 | * AMD CPU (numpy backend). 15 | 16 | 17 | ## Documentation 18 | 19 | https://puzzlelib.org 20 | 21 | 22 | ## License 23 | 24 | [Apache License 2.0](LICENSE) 25 | 26 | ___ 27 | # PuzzleLib 28 | 29 | [PuzzleLib](https://puzzlelib.org) - это библиотека для построения нейронных сетей с поддержкой вычислений на CPU (Intel/AMD) и GPU (NVIDIA/AMD). 
-------------------------------------------------------------------------------- /TestLib/GradientCheck.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | 5 | from PuzzleLib.Containers import Sequential 6 | from PuzzleLib.Modules import Conv2D, AvgPool2D, BatchNorm2D, Activation, relu, Flatten 7 | from PuzzleLib.Cost import BCE 8 | 9 | 10 | def buildNet(): 11 | net = Sequential(name="test-net") 12 | 13 | net.append(Conv2D(1, 2, 3, wscale=1.0, initscheme="gaussian")) 14 | net.append(AvgPool2D(2, 2)) 15 | 16 | net.append(BatchNorm2D(2)) 17 | net.append(Activation(relu)) 18 | 19 | net.append(Conv2D(2, 1, 2, wscale=1.0, initscheme="gaussian")) 20 | net.append(Flatten()) 21 | 22 | return net 23 | 24 | 25 | def gradientCheck(mod, data, target, cost, h=1e-3): 26 | vartable = mod.getVarTable() 27 | 28 | mod(data) 29 | error, grad = cost(mod.data, target) 30 | mod.backward(grad, updGrad=False) 31 | 32 | for var in vartable.keys(): 33 | w = var.data.get() 34 | dw = -var.grad.get() 35 | 36 | for i in range(w.ravel().shape[0]): 37 | wph = np.copy(w) 38 | wmh = np.copy(w) 39 | 40 | wph.ravel()[i] = w.ravel()[i] + h 41 | var.data.set(wph) 42 | yph, _ = cost(mod(data), target) 43 | 44 | wmh.ravel()[i] = w.ravel()[i] - h 45 | var.data.set(wmh) 46 | ymh, _ = cost(mod(data), target) 47 | 48 | host = (yph - ymh) / (2.0 * h) 49 | dev = dw.ravel()[i] 50 | var.data.set(w) 51 | 52 | print(abs((host - dev) / (dev + h))) 53 | 54 | 55 | def main(): 56 | net = buildNet() 57 | cost = BCE() 58 | 59 | data = gpuarray.to_gpu(np.random.randn(1, 1, 6, 6).astype(np.float32)) 60 | target = gpuarray.to_gpu(np.random.randint(0, 2, size=(1, ))) 61 | 62 | gradientCheck(net, data, target, cost) 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /Backend/Kernels/Pool.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | 3 | 4 | maxpool2d = None 5 | maxpool2dBackward = None 6 | maxunpool2d = None 7 | maxunpool2dBackward = None 8 | 9 | 10 | def autoinit(): 11 | if not Config.shouldInit(): 12 | return 13 | 14 | if Config.backend == Config.Backend.cuda: 15 | initCuda() 16 | elif Config.backend == Config.Backend.hip: 17 | initHip() 18 | elif Config.isCPUBased(Config.backend): 19 | initCPU() 20 | else: 21 | raise Config.ConfigError(Config.backend) 22 | 23 | 24 | def initCuda(): 25 | from PuzzleLib.Cuda import Backend 26 | initGPU(Backend) 27 | 28 | 29 | def initHip(): 30 | from PuzzleLib.Hip import Backend 31 | initGPU(Backend) 32 | 33 | 34 | def initGPU(Backend): 35 | backend = Backend.getBackend(Config.deviceIdx, initmode=2, logger=Config.getLogger()) 36 | memoryPool, poolmod = backend.memoryPool, backend.poolmod
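# As in Pad.py and PRelu.py above, the wrappers below close over the chosen
# backend's memory pool, so callers see plain kernel signatures while all
# allocations still go through the per-device pool.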
37 | 38 | def wrapMaxPool2d(data, size, stride, pad): 39 | return poolmod.maxpool2d(data, size, stride, pad, memoryPool) 40 | 41 | def wrapMaxPool2dBackward(grad, origshape, mask, size, stride, pad): 42 | return poolmod.maxpool2dBackward(grad, origshape, mask, size, stride, pad, memoryPool) 43 | 44 | global maxpool2d, maxpool2dBackward 45 | maxpool2d = wrapMaxPool2d 46 | maxpool2dBackward = wrapMaxPool2dBackward 47 | 48 | def wrapMaxUnpool2d(data, origshape, mask): 49 | return poolmod.maxunpool2d(data, origshape, mask, memoryPool) 50 | 51 | def wrapMaxUnpool2dBackward(grad, poolshape, mask): 52 | return poolmod.maxunpool2dBackward(grad, poolshape, mask, memoryPool) 53 | 54 | global maxunpool2d, maxunpool2dBackward 55 | maxunpool2d = wrapMaxUnpool2d 56 | maxunpool2dBackward = wrapMaxUnpool2dBackward 57 | 58 | 59 | def initCPU(): 60 | pass 61 | 62 | 63 | autoinit() 64 | -------------------------------------------------------------------------------- /Converter/TensorRT/DataCalibrator.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | 3 | import numpy as np 4 | 5 | from PuzzleLib.Backend import gpuarray 6 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 7 | from PuzzleLib.Converter.TensorRT import Driver 8 | 9 | 10 | class CalibratorError(Exception): 11 | pass 12 | 13 | 14 | class DataCalibrator(Driver.ICalibrator): 15 | def __init__(self, data, batchsize=100, cachename=None): 16 | super().__init__("" if cachename is None else cachename) 17 | 18 | if data is None: 19 | if cachename is None: 20 | raise CalibratorError("Invalid calibration cache file") 21 | 22 | self.nbatches = 0 23 | 24 | else: 25 | if data.shape[0] % batchsize != 0: 26 | raise CalibratorError("TensorRT calibration engine requires data size to be divisible by batch size") 27 | 28 | if data.dtype != np.float32: 29 | raise CalibratorError("Invalid data type") 30 | 31 | self.nbatches = data.shape[0] // batchsize 32 | 33 | self.data = data 34 | self.idx = 0 35 | 36 | self.batchsize = batchsize 37 | self.batch = None 38 | 39 | 40 | def getDataShape(self): 41 | return self.data.shape[1:] 42 | 43 | 44 | def getBatchSize(self): 45 | return self.batchsize 46 | 47 | 48 | def getBatch(self, bindings, names): 49 | assert len(bindings) == 1 and len(names) == 1 50 | 51 | if self.idx >= self.nbatches: 52 | return False 53 | 54 | self.batch = gpuarray.to_gpu( 55 | self.data[self.idx * self.batchsize:(self.idx + 1) * self.batchsize], allocator=memPool 56 | ) 57 | 58 | ptr = ctypes.cast(bindings[0], ctypes.POINTER(ctypes.c_void_p)) 59 | ptr.contents.value = self.batch.ptr 60 | 61 | print("Sending batch #%s out of %s" % (self.idx + 1, self.nbatches)) 62 | self.idx += 1 63 | 64 | return True 65 | -------------------------------------------------------------------------------- /TestLib/RnnIMDBTrain.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Backend import Dnn 2 | 3 | from PuzzleLib.Containers import Sequential 4 | from PuzzleLib.Modules import Embedder, SwapAxes, RNN, Linear 5 | 6 | from PuzzleLib.Datasets import IMDBLoader 7 | from PuzzleLib.Handlers import Trainer, Validator 8 | from PuzzleLib.Optimizers import Adam 9 | from PuzzleLib.Cost import BCE 10 | 11 | 12 | def buildNet(numwords, maxlen, hintBatchsize): 13 | seq = Sequential() 14 | seq.append(Embedder(numwords, maxlen, 128, initscheme="uniform", wscale=0.05, learnable=True)) 15 | 16 | seq.append(SwapAxes(0, 1)) 17 | seq.append(RNN(128, 128, mode="lstm", 
dropout=0.2, hintBatchSize=hintBatchsize)) 18 | 19 | seq.append(Linear(128, 1)) 20 | return seq 21 | 22 | 23 | def main(): 24 | hintBatchsize, batchsize = (40, 40) if Dnn.deviceSupportsBatchHint() else (None, 32) 25 | numwords, maxlen = 20000, 80 26 | 27 | imdb = IMDBLoader(numwords=numwords, maxlen=maxlen) 28 | data, labels, _ = imdb.load(path="../TestData/") 29 | data, labels = data[:], labels[:] 30 | print("Loaded IMDB") 31 | 32 | net = buildNet(numwords, maxlen, hintBatchsize) 33 | 34 | optimizer = Adam(alpha=1e-3) 35 | optimizer.setupOn(net, useGlobalState=True) 36 | 37 | cost = BCE() 38 | trainer = Trainer(net, cost, optimizer, batchsize=batchsize) 39 | validator = Validator(net, cost, batchsize=batchsize) 40 | 41 | print("Started training ...") 42 | 43 | for i in range(15): 44 | trainer.trainFromHost( 45 | data[:25000], labels[:25000], macroBatchSize=25000, 46 | onMacroBatchFinish=lambda train: print("Train error: %s" % train.cost.getMeanError()) 47 | ) 48 | print("Accuracy: %s" % (1.0 - validator.validateFromHost(data[25000:], labels[25000:], macroBatchSize=25000))) 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /Converter/Examples/Common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Visual 4 | 5 | 6 | def loadV3Labels(filename): 7 | with open(filename) as f: 8 | synsets = f.readlines() 9 | synsets = [line.strip() for line in synsets] 10 | 11 | labels = {} 12 | for i, synset in enumerate(synsets): 13 | labels[i] = synset 14 | 15 | return labels 16 | 17 | 18 | def loadLabels(synpath, wordpath): 19 | with open(synpath) as f: 20 | synsets = f.readlines() 21 | synsets = [line.strip() for line in synsets] 22 | 23 | with open(wordpath) as f: 24 | lines = f.readlines() 25 | lines = [line.strip() for line in lines] 26 | 27 | words = {} 28 | for line in lines: 29 | tags = line.split(sep=" ", maxsplit=1) 30 | words[tags[0]] = tags[1] 31 | 32 | labels = {} 33 | for i, synset in enumerate(synsets): 34 | labels[i] = words[synset] 35 | 36 | return labels 37 | 38 | 39 | def showLabelResults(res, labels, limit=5, header=""): 40 | idx = (-res).argsort()[:limit] 41 | 42 | print("%sTop-%s predictions:" % ("%s " % header if len(header) > 0 else "", limit)) 43 | for i in range(limit): 44 | print("#%s %s (prob=%s)" % (i + 1, labels[idx[i]], res[idx[i]])) 45 | 46 | 47 | def loadVGGSample(filename, shape=None, normalize=False): 48 | meanPixel = np.array([103.939, 116.779, 123.68], dtype=np.float32).reshape((1, 3, 1, 1)) 49 | sample = loadSample(filename, shape) - meanPixel 50 | 51 | return sample * (2.0 / 255.0) - 1.0 if normalize else sample 52 | 53 | 54 | def loadResNetSample(net, filename, shape=None): 55 | mean = net.getAttr("mean") 56 | return loadSample(filename, shape) - mean 57 | 58 | 59 | def loadSample(filename, shape=None): 60 | return np.ascontiguousarray( 61 | Visual.loadImage(filename, shape, normalize=False, contiguous=False)[:, ::-1, :, :], 62 | dtype=np.float32 63 | ) 64 | -------------------------------------------------------------------------------- /Backend/Kernels/Upsample.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | 3 | 4 | upsample2d = None 5 | upsample2dBackward = None 6 | 7 | upsample3d = None 8 | upsample3dBackward = None 9 | 10 | 11 | def autoinit(): 12 | if not Config.shouldInit(): 13 | return 14 | 15 | if 
Config.backend == Config.Backend.cuda: 16 | initCuda() 17 | elif Config.backend == Config.Backend.hip: 18 | initHip() 19 | elif Config.isCPUBased(Config.backend): 20 | initCPU() 21 | else: 22 | raise Config.ConfigError(Config.backend) 23 | 24 | 25 | def initCuda(): 26 | from PuzzleLib.Cuda import Backend 27 | initGPU(Backend) 28 | 29 | 30 | def initHip(): 31 | from PuzzleLib.Hip import Backend 32 | initGPU(Backend) 33 | 34 | 35 | def initGPU(Backend): 36 | backend = Backend.getBackend(Config.deviceIdx, initmode=2, logger=Config.getLogger()) 37 | memoryPool, upsamplemod = backend.memoryPool, backend.upsamplemod 38 | 39 | def wrapUpsample2d(data, scale, mode): 40 | return upsamplemod.upsample2d(data, scale, mode, memoryPool) 41 | 42 | def wrapUpsample2dBackward(grad, scale, mode): 43 | return upsamplemod.upsample2dBackward(grad, scale, mode, memoryPool) 44 | 45 | global upsample2d, upsample2dBackward 46 | upsample2d = wrapUpsample2d 47 | upsample2dBackward = wrapUpsample2dBackward 48 | 49 | def wrapUpsample3d(data, scale, mode): 50 | return upsamplemod.upsample3d(data, scale, mode, memoryPool) 51 | 52 | def wrapUpsample3dBackward(grad, scale, mode): 53 | return upsamplemod.upsample3dBackward(grad, scale, mode, memoryPool) 54 | 55 | global upsample3d, upsample3dBackward 56 | upsample3d = wrapUpsample3d 57 | upsample3dBackward = wrapUpsample3dBackward 58 | 59 | 60 | def initCPU(): 61 | from PuzzleLib.CPU.Kernels import Upsample2D 62 | 63 | global upsample2d 64 | upsample2d = Upsample2D.upsample2d 65 | 66 | 67 | autoinit() 68 | -------------------------------------------------------------------------------- /Intel/Wrappers/DNNLBlas.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.CPU.CPUArray import CPUArray 4 | from PuzzleLib.Intel.ThirdParty import libdnnl 5 | 6 | 7 | def mulMatrixOnMatrix(A, B, out=None, transpA=False, transpB=False, alpha=1.0, beta=0.0): 8 | assert not (transpA and transpB) 9 | assert A.ndim == 2 and B.ndim == 2 10 | 11 | assert A.dtype == B.dtype and A.dtype == np.float32 12 | assert A.flags.c_contiguous and B.flags.c_contiguous 13 | 14 | if transpA: 15 | assert A.shape[0] == B.shape[0] 16 | shape = (A.shape[1], B.shape[1]) 17 | elif transpB: 18 | assert A.shape[1] == B.shape[1] 19 | shape = (A.shape[0], B.shape[0]) 20 | else: 21 | assert A.shape[1] == B.shape[0] 22 | shape = (A.shape[0], B.shape[1]) 23 | 24 | if out is None: 25 | out = CPUArray.empty(shape, dtype=np.float32) 26 | 27 | if transpA: 28 | k, m = A.shape 29 | n = B.shape[1] 30 | libdnnl.dnnl_sgemm('t', 'n', m, n, k, alpha, A.ptr, m, B.ptr, n, beta, out.ptr, n) 31 | elif transpB: 32 | m, k = A.shape 33 | n = B.shape[0] 34 | libdnnl.dnnl_sgemm('n', 't', m, n, k, alpha, A.ptr, k, B.ptr, k, beta, out.ptr, n) 35 | else: 36 | m, k = A.shape 37 | n = B.shape[1] 38 | libdnnl.dnnl_sgemm('n', 'n', m, n, k, alpha, A.ptr, k, B.ptr, n, beta, out.ptr, n) 39 | 40 | return out 41 | 42 | 43 | def unittest(): 44 | A = CPUArray.toDevice(np.random.randn(5, 3).astype(np.float32)) 45 | B = CPUArray.toDevice(np.random.randn(3, 4).astype(np.float32)) 46 | 47 | C = mulMatrixOnMatrix(A, B) 48 | assert np.allclose(np.dot(A.get(), B.get()), C.get()) 49 | 50 | F = mulMatrixOnMatrix(B, C, transpB=True) 51 | assert np.allclose(np.dot(B.get(), C.get().T), F.get()) 52 | 53 | G = mulMatrixOnMatrix(F, B, transpA=True) 54 | assert np.allclose(np.dot(F.get().T, B.get()), G.get()) 55 | 56 | 57 | if __name__ == "__main__": 58 | unittest() 59 | 
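# A shape sketch of the dispatch above (illustrative values): with A of shape
# (5, 3) and transpA=True, the 't' branch reads k, m = 5, 3 and passes lda = m,
# the row stride of the k x m row-major buffer, so dnnl_sgemm computes A^T * B.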
-------------------------------------------------------------------------------- /Modules/Add.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray, Blas 4 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 5 | 6 | from PuzzleLib.Modules.Module import ModuleError, Module 7 | 8 | 9 | class Add(Module): 10 | def __init__(self, name=None): 11 | super().__init__(name) 12 | self.movesGrad = True 13 | 14 | 15 | def updateData(self, data): 16 | firstdata = data[0] 17 | 18 | self.data = gpuarray.empty(firstdata.shape, dtype=firstdata.dtype, allocator=memPool) 19 | self.data.fill(0) 20 | 21 | for dat in data: 22 | Blas.toVectorAddVector(self.data.ravel(), dat.ravel()) 23 | 24 | 25 | def updateGrad(self, grad): 26 | self.grad = [grad] * len(self.inData) 27 | 28 | 29 | def checkDataShape(self, shapes): 30 | for shape in shapes: 31 | if shape != shapes[0]: 32 | raise ModuleError("Shape %s is not equal to initial shape %s" % (shape, shapes[0])) 33 | 34 | 35 | def dataShapeFrom(self, shape): 36 | return shape[0] 37 | 38 | 39 | def gradShapeFrom(self, shape): 40 | return [shape] * len(self.inData) 41 | 42 | 43 | def calcMode(self, T): 44 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 45 | 46 | if T not in dtypes: 47 | raise ModuleError("Unsupported dtype %s" % T) 48 | 49 | self.calctype = T 50 | 51 | 52 | def unittest(): 53 | for dtype, _ in gpuarray.dtypesSupported(): 54 | addTest(dtype) 55 | 56 | 57 | def addTest(dtype): 58 | hostData1 = np.random.randn(2, 5, 5).astype(dtype) 59 | hostData2 = np.random.randn(*hostData1.shape).astype(dtype) 60 | 61 | data1, data2 = gpuarray.to_gpu(hostData1), gpuarray.to_gpu(hostData2) 62 | 63 | add = Add() 64 | add.calcMode(dtype) 65 | 66 | add([data1, data2]) 67 | assert np.allclose(hostData1 + hostData2, add.data.get()) 68 | 69 | 70 | if __name__ == "__main__": 71 | unittest() 72 | -------------------------------------------------------------------------------- /Compiler/Codegen/Malloc/Generate.py: -------------------------------------------------------------------------------- 1 | import os 2 | from string import Template 3 | 4 | from PuzzleLib.Compiler.Codegen.Tree.Generate import generateTree 5 | from PuzzleLib.Compiler.Toolchain import createTemplateNames, writeTemplates, buildTemplateTest 6 | 7 | 8 | def generateMalloc(name=None, filename=None): 9 | treename = generateTree( 10 | name="AllocTree", K="VoidPtr", V="Allocation", 11 | headerPreambule= 12 | """ 13 | typedef void *VoidPtr; 14 | 15 | 16 | typedef struct Allocation 17 | { 18 | size_t size; 19 | const char *file; 20 | int line; 21 | } 22 | Allocation; 23 | """, 24 | filename=os.path.join(os.path.dirname(filename), "AllocTree") 25 | ) 26 | 27 | name = "TraceMalloc" if name is None else name 28 | 29 | filename = name if filename is None else filename 30 | headername, bodyname = createTemplateNames(filename) 31 | 32 | dirname = os.path.dirname(__file__) 33 | 34 | with open(os.path.join(dirname, "TMalloc.h"), mode="r", encoding="utf-8") as f: 35 | header = Template(f.read()).substitute(NAME=name) 36 | 37 | with open(os.path.join(dirname, "TMalloc.c"), mode="r", encoding="utf-8") as f: 38 | body = Template(f.read()).substitute(HEADER_NAME=os.path.basename(headername), NAME=name) 39 | 40 | writeTemplates([ 41 | (header, headername), 42 | (body, bodyname) 43 | ]) 44 | 45 | return [bodyname, treename] 46 | 47 | 48 | def unittest(): 49 | TraceMalloc = buildTemplateTest( 50 | 
name="TraceMalloc", bindingName="TMallocTest.c", path="../../TestData", generator=generateMalloc, 51 | defines=["ENABLE_TRACE_MALLOC"] 52 | ) 53 | 54 | ptr = TraceMalloc.malloc(16) 55 | 56 | leaks = TraceMalloc.traceLeaks() 57 | assert len(leaks) == 1 58 | 59 | TraceMalloc.free(ptr) 60 | 61 | leaks = TraceMalloc.traceLeaks() 62 | assert len(leaks) == 0 63 | 64 | 65 | if __name__ == "__main__": 66 | unittest() 67 | -------------------------------------------------------------------------------- /Modules/Penalty.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | import numpy as np 4 | 5 | from PuzzleLib.Backend import gpuarray, Blas 6 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 7 | from PuzzleLib.Backend.Kernels.ElementWise import l1penaltyKer 8 | 9 | from PuzzleLib.Modules.Module import Module 10 | 11 | 12 | class PenaltyMode(str, Enum): 13 | l1 = "l1" 14 | l2 = "l2" 15 | 16 | 17 | class Penalty(Module): 18 | def __init__(self, mode="l1", weight=1e-2, name=None): 19 | super().__init__(name) 20 | self.registerBlueprint(locals()) 21 | 22 | self.gradUsesOutData = True 23 | self.movesData = True 24 | 25 | self.mode = PenaltyMode(mode) 26 | self.weight = weight 27 | 28 | 29 | def updateData(self, data): 30 | self.data = data 31 | 32 | 33 | def updateGrad(self, grad): 34 | if self.mode == PenaltyMode.l1: 35 | self.grad = gpuarray.empty(grad.shape, dtype=grad.dtype, allocator=memPool) 36 | l1penaltyKer(self.grad, grad, self.data, self.weight / grad.shape[0]) 37 | 38 | elif self.mode == PenaltyMode.l2: 39 | self.grad = Blas.addVectorToVector( 40 | grad.ravel(), self.data.ravel(), alpha=1.0, beta=-self.weight / grad.shape[0] 41 | ).reshape(grad.shape) 42 | 43 | else: 44 | raise NotImplementedError(self.mode) 45 | 46 | 47 | def dataShapeFrom(self, shape): 48 | return shape 49 | 50 | 51 | def gradShapeFrom(self, shape): 52 | return shape 53 | 54 | 55 | def unittest(): 56 | data = gpuarray.to_gpu(np.random.randn(10, 50).astype(np.float32)) 57 | 58 | penalty = Penalty() 59 | penalty(data) 60 | 61 | grad = gpuarray.to_gpu(np.random.randn(10, 50).astype(np.float32)) 62 | penalty.backward(grad) 63 | 64 | hostGrad = grad.get() - penalty.weight * np.sign(data.get()) / data.shape[0] 65 | assert np.allclose(hostGrad, penalty.grad.get()) 66 | 67 | 68 | if __name__ == "__main__": 69 | unittest() 70 | -------------------------------------------------------------------------------- /Converter/OpenVINO/Source/Build.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | import pybind11 3 | 4 | from PuzzleLib.Compiler.Toolchain import guessToolchain 5 | 6 | 7 | def buildDriver(debugmode=0): 8 | cc = prepareCompiler(debugmode=debugmode) 9 | 10 | driver = "../Driver%s" % cc.pydext 11 | cc.build(driver, sourcefiles="./Driver.cpp").clearPath("..") 12 | 13 | return driver 14 | 15 | 16 | def findLibraryPath(): 17 | OPENVINO_PATH = os.environ.get("OPENVINO_PATH", None) 18 | 19 | if OPENVINO_PATH is None: 20 | if sys.platform == "linux": 21 | OPENVINO_PATH = "/opt/intel/openvino" 22 | 23 | elif sys.platform == "win32": 24 | raise OSError("OpenVINO path needs to be specified in the system variables as OPENVINO_PATH") 25 | 26 | else: 27 | raise NotImplementedError(sys.platform) 28 | 29 | return OPENVINO_PATH 30 | 31 | 32 | def prepareCompiler(debugmode=0): 33 | level, debuglevel = (0, 3) if debugmode > 0 else (4, 0) 34 | 35 | cc = 
guessToolchain(verbose=2).withOptimizationLevel(level=level, debuglevel=debuglevel).cppMode(True) 36 | OPENVINO_PATH = findLibraryPath() 37 | 38 | if sys.platform == "linux": 39 | cc.includeDirs.append(pybind11.get_include(user=True)) 40 | 41 | cc.addLibrary( 42 | "openvino", 43 | [os.path.join(OPENVINO_PATH, "inference_engine/include")], 44 | [os.path.join(OPENVINO_PATH, "inference_engine/lib/intel64")], 45 | ["inference_engine"] 46 | ) 47 | 48 | elif sys.platform == "win32": 49 | cc.addLibrary( 50 | "openvino", 51 | [os.path.join(OPENVINO_PATH, "inference_engine/include")], 52 | [os.path.join(OPENVINO_PATH, "inference_engine/lib/intel64/Release")], 53 | ["inference_engine"] 54 | ) 55 | 56 | else: 57 | raise NotImplementedError(sys.platform) 58 | 59 | return cc 60 | 61 | 62 | def main(): 63 | return buildDriver(debugmode=0) 64 | 65 | 66 | if __name__ == "__main__": 67 | main() 68 | -------------------------------------------------------------------------------- /Cost/MSE.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray, Blas 4 | from PuzzleLib.Cost.Cost import Cost 5 | 6 | 7 | class MSE(Cost): 8 | def calcGrad(self, pred, target): 9 | c = 1.0 / np.prod(target.shape) 10 | grad = Blas.addVectorToVector(target.ravel(), pred.ravel(), alpha=c, beta=-c) 11 | grad = grad.reshape(pred.shape) 12 | 13 | return grad 14 | 15 | 16 | def calcError(self, pred, target): 17 | self.devErr.fill( 18 | Blas.dot(self.grad.ravel(), self.grad.ravel()) * np.prod(self.grad.shape) * self.grad.shape[0] / 2.0 19 | ) 20 | self.accumErr += self.devErr 21 | 22 | 23 | def calcVal(self, pred, target): 24 | diff = Blas.addVectorToVector(target.ravel(), pred.ravel(), alpha=1.0, beta=-1.0) 25 | error = Blas.dot(diff, diff) / (2.0 * np.prod(target.shape)) 26 | 27 | return error 28 | 29 | 30 | def checkDataShape(self, pred, target): 31 | assert pred.shape[1:] == target.shape[1:] 32 | 33 | 34 | def checkValDataShape(self, pred, target): 35 | assert pred.shape[1:] == target.shape[1:] 36 | 37 | 38 | def unittest(): 39 | errorTest() 40 | valTest() 41 | 42 | 43 | def errorTest(): 44 | pred = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 45 | target = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 46 | 47 | mse = MSE() 48 | mse(pred, target) 49 | 50 | assert np.isclose(mse.error, np.linalg.norm(target.get() - pred.get())**2 / (2.0 * np.prod(target.shape))) 51 | 52 | 53 | def valTest(): 54 | pred = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 55 | target = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 56 | 57 | mse = MSE() 58 | error = mse.validate(pred, target) 59 | 60 | assert np.isclose(error, np.linalg.norm(target.get() - pred.get())**2 / (2.0 * np.prod(target.shape))) 61 | 62 | 63 | if __name__ == "__main__": 64 | unittest() 65 | -------------------------------------------------------------------------------- /TestLib/BiRnnIMDBTrain.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Backend import Dnn 2 | 3 | from PuzzleLib.Containers import Sequential 4 | from PuzzleLib.Modules import Embedder, SwapAxes, RNN, Concat, Dropout, Linear 5 | 6 | from PuzzleLib.Datasets import IMDBLoader 7 | from PuzzleLib.Handlers import Trainer, Validator 8 | from PuzzleLib.Optimizers import Adam 9 | from PuzzleLib.Cost import BCE 10 | 11 | 12 | def buildNet(numwords, maxlen, hintBatchsize): 13 | seq = Sequential() 14 | 
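	# Embedder yields (batch, maxlen, 128); SwapAxes makes the sequence
	# time-major for the RNN, the bidirectional LSTM emits 64 features per
	# direction, and Concat joins them into the 128 inputs of the Linear head.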
seq.append(Embedder(numwords, maxlen, 128, initscheme="uniform", wscale=0.05, learnable=True)) 15 | 16 | seq.append(SwapAxes(0, 1)) 17 | seq.append(RNN(128, 64, mode="lstm", direction="bi", hintBatchSize=hintBatchsize)) 18 | 19 | seq.append(Concat(axis=1)) 20 | seq.append(Dropout(p=0.5)) 21 | 22 | seq.append(Linear(128, 1)) 23 | return seq 24 | 25 | 26 | def main(): 27 | hintBatchsize, batchsize = (40, 40) if Dnn.deviceSupportsBatchHint() else (None, 32) 28 | numwords, maxlen = 20000, 100 29 | 30 | imdb = IMDBLoader(numwords=numwords, maxlen=maxlen) 31 | data, labels, _ = imdb.load(path="../TestData/") 32 | data, labels = data[:], labels[:] 33 | print("Loaded IMDB") 34 | 35 | net = buildNet(numwords, maxlen, hintBatchsize) 36 | 37 | optimizer = Adam(alpha=1e-3) 38 | optimizer.setupOn(net, useGlobalState=True) 39 | 40 | cost = BCE() 41 | trainer = Trainer(net, cost, optimizer, batchsize=batchsize) 42 | validator = Validator(net, cost, batchsize=batchsize) 43 | 44 | print("Started training ...") 45 | 46 | for i in range(15): 47 | trainer.trainFromHost( 48 | data[:25000], labels[:25000], macroBatchSize=25000, 49 | onMacroBatchFinish=lambda train: print("Train error: %s" % train.cost.getMeanError()) 50 | ) 51 | print("Accuracy: %s" % (1.0 - validator.validateFromHost(data[25000:], labels[25000:], macroBatchSize=25000))) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /TestLib/CnnIMDBTrain.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Datasets import IMDBLoader 2 | 3 | from PuzzleLib.Containers import Sequential 4 | from PuzzleLib.Modules import Embedder, Dropout, SwapAxes, Conv1D, Activation, relu, MaxPool1D, Flatten, Linear 5 | 6 | from PuzzleLib.Handlers import Trainer, Validator 7 | from PuzzleLib.Optimizers import Adam 8 | from PuzzleLib.Cost import BCE 9 | 10 | 11 | def buildNet(numwords, maxlen, embsize): 12 | seq = Sequential() 13 | 14 | seq.append(Embedder(numwords, maxlen, embsize, initscheme="uniform", wscale=0.05, learnable=True)) 15 | 16 | seq.append(Dropout(p=0.2)) 17 | seq.append(SwapAxes(1, 2)) 18 | 19 | seq.append(Conv1D(embsize, embsize, 3)) 20 | seq.append(Activation(relu)) 21 | 22 | seq.append(MaxPool1D(maxlen - 2, 1)) 23 | seq.append(Flatten()) 24 | 25 | seq.append(Linear(embsize, 250)) 26 | seq.append(Dropout(p=0.2)) 27 | seq.append(Activation(relu)) 28 | 29 | seq.append(Linear(250, 1)) 30 | return seq 31 | 32 | 33 | def main(): 34 | numwords, maxlen, embsize = 5000, 250, 50 35 | 36 | imdb = IMDBLoader(numwords=numwords, maxlen=maxlen) 37 | data, labels, _ = imdb.load(path="../TestData/") 38 | data, labels = data[:], labels[:] 39 | print("Loaded IMDB") 40 | 41 | net = buildNet(numwords, maxlen, embsize) 42 | 43 | optimizer = Adam(alpha=1e-3) 44 | optimizer.setupOn(net, useGlobalState=True) 45 | 46 | cost = BCE() 47 | trainer = Trainer(net, cost, optimizer, batchsize=32) 48 | validator = Validator(net, cost, batchsize=32) 49 | 50 | for i in range(15): 51 | trainer.trainFromHost( 52 | data[:25000], labels[:25000], macroBatchSize=25000, 53 | onMacroBatchFinish=lambda train: print("Train error: %s" % train.cost.getMeanError()) 54 | ) 55 | print("Accuracy: %s" % (1.0 - validator.validateFromHost(data[25000:], labels[25000:], macroBatchSize=25000))) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /Modules/Pool1D.py: 
-------------------------------------------------------------------------------- 1 | from PuzzleLib.Backend import gpuarray 2 | from PuzzleLib.Modules.Module import ModuleError, Module 3 | 4 | 5 | class Pool1D(Module): 6 | def __init__(self, size=2, stride=2, pad=0, name=None): 7 | super().__init__(name) 8 | self.gradUsesOutData = True 9 | 10 | self.size = (1, size) 11 | self.stride = (1, stride) 12 | self.pad = (0, pad) 13 | 14 | self.workspace = None 15 | 16 | 17 | def dataShapeFrom(self, shape): 18 | batchsize, maps, insize = shape 19 | 20 | _, size = self.size 21 | _, pad = self.pad 22 | _, stride = self.stride 23 | 24 | outsize = (insize + 2 * pad - size) // stride + 1 25 | 26 | return batchsize, maps, outsize 27 | 28 | 29 | def checkDataShape(self, shape): 30 | if len(shape) != 3: 31 | raise ModuleError("Data must be 3d tensor") 32 | 33 | _, _, insize = shape 34 | if insize + 2 * self.pad[1] < self.size[1]: 35 | raise ModuleError("Data maps size is too small (got %d, expected at least %d)" % 36 | (insize + 2 * self.pad[1], self.size[1])) 37 | 38 | 39 | def gradShapeFrom(self, shape): 40 | batchsize, maps, outsize = shape 41 | 42 | _, size = self.size 43 | _, pad = self.pad 44 | _, stride = self.stride 45 | 46 | insize = (outsize - 1) * stride - 2 * pad + size 47 | 48 | return batchsize, maps, insize 49 | 50 | 51 | def checkGradShape(self, shape): 52 | if len(shape) != 3: 53 | raise ModuleError("Grad must be 3d tensor") 54 | 55 | 56 | def updateData(self, data): 57 | raise NotImplementedError() 58 | 59 | 60 | def updateGrad(self, grad): 61 | raise NotImplementedError() 62 | 63 | 64 | def reset(self): 65 | super().reset() 66 | self.workspace = None 67 | 68 | 69 | def calcMode(self, T): 70 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 71 | 72 | if T not in dtypes: 73 | raise ModuleError("Unsupported dtype %s" % T) 74 | 75 | self.calctype = T 76 | -------------------------------------------------------------------------------- /Transformers/Serial.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Transformers.Provider import Provider 4 | 5 | 6 | class Serial(Provider): 7 | def __init__(self, dataset, labels=None, numofthreads=4): 8 | super().__init__(numofthreads) 9 | 10 | self.datalen = dataset.shape[0] 11 | 12 | self.labels = labels 13 | self.dataset = dataset 14 | 15 | self.index = 0 16 | 17 | 18 | def getNextChunk(self, chunksize, **kwargs): 19 | if chunksize >= self.datalen: 20 | self.index = 0 21 | 22 | if self.labels is not None: 23 | return np.array(self.dataset), np.array(self.labels) 24 | else: 25 | return np.array(self.dataset) 26 | 27 | begin = self.index 28 | end = self.index + chunksize 29 | 30 | if end > self.datalen: 31 | chunk = np.empty((chunksize, ) + self.dataset.shape[1:], dtype=self.dataset.dtype) 32 | tup = chunk 33 | 34 | chunk[:self.datalen - begin] = self.dataset[begin:self.datalen] 35 | 36 | self.index = end - self.datalen 37 | chunk[self.datalen - begin:] = self.dataset[:self.index] 38 | 39 | if self.labels is not None: 40 | labels = np.empty((chunksize, ) + self.labels.shape[1:], dtype=self.labels.dtype) 41 | tup = (chunk, labels) 42 | 43 | labels[:self.datalen - begin] = self.labels[begin:self.datalen] 44 | labels[self.datalen - begin:] = self.labels[:self.index] 45 | 46 | else: 47 | self.index = end 48 | chunk = np.array(self.dataset[begin:end]) 49 | tup = chunk 50 | 51 | if self.labels is not None: 52 | labels = np.array(self.labels[begin:end]) 53 | tup =
(chunk, labels) 54 | 55 | return tup 56 | 57 | 58 | def unittest(): 59 | from PuzzleLib.Datasets.ZipLoader import ZipLoader 60 | 61 | zipfile = ZipLoader() 62 | data = zipfile.load("../TestData/test.zip") 63 | 64 | with Serial(data) as serial: 65 | for _ in range(10): 66 | serial.prepareData(chunksize=4) 67 | serial.getData() 68 | 69 | 70 | if __name__ == "__main__": 71 | unittest() 72 | -------------------------------------------------------------------------------- /Modules/Replicate.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray, Blas 4 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 5 | 6 | from PuzzleLib.Modules.Module import ModuleError, Module 7 | 8 | 9 | class Replicate(Module): 10 | def __init__(self, times, name=None): 11 | super().__init__(name) 12 | self.registerBlueprint(locals()) 13 | 14 | self.movesData = True 15 | self.times = times 16 | 17 | 18 | def updateData(self, data): 19 | self.data = [data] * self.times 20 | 21 | 22 | def updateGrad(self, grad): 23 | firstgrad = grad[0] 24 | 25 | self.grad = gpuarray.empty(firstgrad.shape, dtype=firstgrad.dtype, allocator=memPool) 26 | self.grad.fill(0) 27 | 28 | for gr in grad: 29 | Blas.toVectorAddVector(self.grad.ravel(), gr.ravel()) 30 | 31 | 32 | def dataShapeFrom(self, shape): 33 | return [shape] * self.times 34 | 35 | 36 | def gradShapeFrom(self, shape): 37 | return shape[0] 38 | 39 | 40 | def calcMode(self, T): 41 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 42 | 43 | if T not in dtypes: 44 | raise ModuleError("Unsupported dtype %s" % T) 45 | 46 | self.calctype = T 47 | 48 | 49 | def unittest(): 50 | for dtype, _ in gpuarray.dtypesSupported(): 51 | replicateTest(dtype) 52 | 53 | 54 | def replicateTest(dtype): 55 | hostData = np.random.randn(10, 10, 3, 3).astype(dtype) 56 | data = gpuarray.to_gpu(hostData) 57 | 58 | times = 3 59 | 60 | repl = Replicate(times) 61 | repl.calcMode(dtype) 62 | 63 | repl(data) 64 | 65 | assert len(repl.data) == times 66 | 67 | hostGrad = [np.random.randn(10, 10, 3, 3).astype(dtype) for _ in range(times)] 68 | grad = [gpuarray.to_gpu(gr) for gr in hostGrad] 69 | 70 | repl.backward(grad) 71 | 72 | hostInGrad = np.zeros(grad[0].shape, dtype=dtype) 73 | for i in range(times): 74 | hostInGrad += hostGrad[i] 75 | 76 | assert np.allclose(hostInGrad, repl.grad.get()) 77 | 78 | 79 | if __name__ == "__main__": 80 | unittest() 81 | -------------------------------------------------------------------------------- /Compiler/Codegen/PyDefines/PyDefines.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <stdbool.h> 3 | 4 | 5 | #if defined(__clang__) 6 | #pragma GCC diagnostic push 7 | #pragma GCC diagnostic ignored "-Wvisibility" 8 | 9 | #elif defined(_MSC_VER) 10 | #pragma warning(push) 11 | #pragma warning(disable: 4115) 12 | 13 | #endif 14 | 15 | #include <Python.h> 16 | #include <structmember.h> 17 | 18 | #if defined(__clang__) 19 | #pragma GCC diagnostic pop 20 | 21 | #elif defined(_MSC_VER) 22 | #pragma warning(pop) 23 | 24 | #endif 25 | 26 | 27 | inline static bool createPyClass(PyObject *module, const char *name, PyType_Spec *spec, PyTypeObject **pType) 28 | { 29 | PyTypeObject *type = (PyTypeObject *)PyType_FromSpec(spec); 30 | if (type == NULL) 31 | return false; 32 | 33 | if (PyModule_AddObject(module, name, (PyObject *)type) < 0) 34 | { 35 | Py_DECREF(type); 36 | return false; 37 | } 38 | 39 | Py_INCREF(type); 40 | *pType = type; 41 | 42 |
return true; 43 | } 44 | 45 | inline static bool createPyExc(PyObject *module, const char *name, const char *fullname, PyObject **pExc) 46 | { 47 | PyObject *exc = PyErr_NewException(fullname, NULL, NULL); 48 | if (exc == NULL) 49 | return false; 50 | 51 | if (PyModule_AddObject(module, name, exc) < 0) 52 | { 53 | Py_DECREF(exc); 54 | return false; 55 | } 56 | 57 | Py_INCREF(exc); 58 | *pExc = exc; 59 | 60 | return true; 61 | } 62 | 63 | inline static bool unpackPyOptional(PyObject **pObj, PyTypeObject *type, const char *key) 64 | { 65 | PyObject *obj = *pObj; 66 | 67 | if (obj != NULL && Py_TYPE(obj) != type && obj != Py_None) 68 | { 69 | PyErr_Format( 70 | PyExc_TypeError, "%s must be %s or %s, not %s", 71 | key, type->tp_name, Py_TYPE(Py_None)->tp_name, Py_TYPE(obj)->tp_name 72 | ); 73 | return false; 74 | } 75 | 76 | *pObj = (obj == Py_None) ? NULL : obj; 77 | return true; 78 | } 79 | 80 | #define REMOVE_PY_OBJECT(pObj) do { PyObject *obj = (PyObject *)*(pObj); Py_DECREF(obj); *(pObj) = NULL; } while (0) 81 | -------------------------------------------------------------------------------- /TestLib/ResumeTrain.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | 5 | from PuzzleLib.Datasets import MnistLoader 6 | from PuzzleLib.Handlers import Trainer, Validator 7 | from PuzzleLib.Optimizers import MomentumSGD 8 | from PuzzleLib.Cost import CrossEntropy 9 | 10 | from PuzzleLib.Models.Nets.LeNet import loadLeNet 11 | 12 | 13 | def train(net, optimizer, data, labels, epochs): 14 | cost = CrossEntropy(maxlabels=10) 15 | trainer = Trainer(net, cost, optimizer) 16 | validator = Validator(net, cost) 17 | 18 | for i in range(epochs): 19 | trainer.trainFromHost( 20 | data[:60000], labels[:60000], macroBatchSize=60000, 21 | onMacroBatchFinish=lambda tr: print("Train error: %s" % tr.cost.getMeanError()) 22 | ) 23 | print("Accuracy: %s" % (1.0 - validator.validateFromHost(data[60000:], labels[60000:], macroBatchSize=10000))) 24 | 25 | optimizer.learnRate *= 0.9 26 | print("Reduced optimizer learn rate to %s" % optimizer.learnRate) 27 | 28 | 29 | def main(): 30 | mnist = MnistLoader() 31 | data, labels = mnist.load(path="../TestData/") 32 | data, labels = data[:], labels[:] 33 | print("Loaded mnist") 34 | 35 | np.random.seed(1234) 36 | net = loadLeNet(None, initscheme=None) 37 | 38 | optimizer = MomentumSGD() 39 | optimizer.setupOn(net, useGlobalState=True) 40 | optimizer.learnRate = 0.1 41 | optimizer.momRate = 0.9 42 | 43 | epochs = 10 44 | print("Training for %s epochs ..." % epochs) 45 | train(net, optimizer, data, labels, epochs) 46 | 47 | print("Saving net and optimizer ...") 48 | net.save("../TestData/net.hdf") 49 | optimizer.save("../TestData/optimizer.hdf") 50 | 51 | print("Reloading net and optimizer ...") 52 | net.load("../TestData/net.hdf") 53 | optimizer.load("../TestData/optimizer.hdf") 54 | 55 | print("Continuing training for %s epochs ..." 
% epochs) 56 | train(net, optimizer, data, labels, epochs) 57 | 58 | os.remove("../TestData/net.hdf") 59 | os.remove("../TestData/optimizer.hdf") 60 | 61 | 62 | if __name__ == "__main__": 63 | main() 64 | -------------------------------------------------------------------------------- /Optimizers/AdaGrad.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | 5 | from PuzzleLib.Backend import gpuarray 6 | from PuzzleLib.Backend.Kernels.ElementWise import adagradKer 7 | 8 | from PuzzleLib.Optimizers.Optimizer import Optimizer, trainSimpleTest, trainHardTest 9 | 10 | 11 | class AdaGrad(Optimizer): 12 | def __init__(self, learnRate=1e-3, epsilon=1e-8, nodeinfo=None): 13 | super().__init__(nodeinfo) 14 | 15 | self.epsilon = None 16 | 17 | self.setAttr("learnRate", learnRate) 18 | self.setAttr("epsilon", epsilon) 19 | 20 | 21 | def setupState(self, var): 22 | return {"h": gpuarray.zeros(var.data.shape, dtype=var.data.dtype)} 23 | 24 | 25 | def updateVar(self, var, state, stream=None): 26 | adagradKer(var.data.dtype)( 27 | var.data, var.grad, state["h"], self.learnRate * var.learnRate, self.epsilon, stream=stream 28 | ) 29 | 30 | 31 | def unittest(): 32 | for dtype, atol in gpuarray.dtypesSupported(): 33 | calcTest(dtype, atol) 34 | trainSimpleTest(AdaGrad, dtype, learnRate=1e-2) 35 | 36 | if Config.backend == Config.Backend.cuda: 37 | trainHardTest(AdaGrad, dtype, learnRate=1e-2) 38 | 39 | 40 | def calcTest(dtype, atol): 41 | lr, epsilon = 0.01, 1e-8 42 | shape = (11, 13) 43 | 44 | hostW, hostDw = np.random.randn(*shape).astype(dtype), np.random.randn(*shape).astype(dtype) 45 | hostH = (1.0 + np.random.randn(*shape)**2).astype(dtype) 46 | 47 | w, dw, h = gpuarray.to_gpu(hostW), gpuarray.to_gpu(hostDw), gpuarray.to_gpu(hostH) 48 | adagradKer(w.dtype)(w, dw, h, lr, epsilon) 49 | 50 | hostW, hostDw, hostH = hostW.astype(np.float32), hostDw.astype(np.float32), hostH.astype(np.float32) 51 | 52 | hostH += hostDw**2 53 | hostW += lr * hostDw / (np.sqrt(hostH) + epsilon) 54 | 55 | hostW, hostDw, hostH = hostW.astype(dtype), hostDw.astype(dtype), hostH.astype(dtype) 56 | 57 | assert np.allclose(hostH, h.get(), atol=atol) 58 | assert np.allclose(hostW, w.get(), atol=atol) 59 | 60 | 61 | if __name__ == "__main__": 62 | unittest() 63 | -------------------------------------------------------------------------------- /Compiler/Codegen/Vector/Generate.py: -------------------------------------------------------------------------------- 1 | import os, random 2 | from string import Template 3 | 4 | from PuzzleLib.Compiler.Toolchain import createTemplateNames, writeTemplates, buildTemplateTest 5 | 6 | 7 | def generateVector(name, T, borrow="(void)", destruct="(void)", minCapacity=16, 8 | headerPreambule=None, bodyPreambule=None, malloc="malloc", free="free", filename=None): 9 | headerPreambule = "%s\n\n" % headerPreambule if headerPreambule is not None else "" 10 | bodyPreambule = "%s\n\n" % bodyPreambule if bodyPreambule is not None else "" 11 | 12 | filename = name if filename is None else filename 13 | headername, bodyname = createTemplateNames(filename) 14 | 15 | dirname = os.path.dirname(__file__) 16 | headerTmpl, bodyTmpl = os.path.join(dirname, "TVector.h"), os.path.join(dirname, "TVector.c") 17 | 18 | with open(headerTmpl, mode="r", encoding="utf-8") as f: 19 | header = Template(f.read()).substitute(HEADER_PREAMBULE=headerPreambule, NAME=name, T=T) 20 | 21 | with open(bodyTmpl, mode="r", encoding="utf-8") as f: 22 
| body = Template(f.read()).substitute( 23 | HEADER_NAME=os.path.basename(headername), BODY_PREAMBULE=bodyPreambule, NAME=name, T=T, 24 | MIN_CAPACITY=minCapacity, MALLOC=malloc, FREE=free, BORROW=borrow, DESTRUCT=destruct 25 | ) 26 | 27 | writeTemplates([ 28 | (header, headername), 29 | (body, bodyname) 30 | ]) 31 | 32 | return bodyname 33 | 34 | 35 | def unittest(): 36 | IntVector = buildTemplateTest( 37 | name="IntVector", bindingName="TVectorTest.c", path="../../TestData", generator=generateVector, T="int" 38 | ) 39 | 40 | size = 1 << 16 41 | 42 | pyvec = list(range(size)) 43 | random.shuffle(pyvec) 44 | 45 | vector = IntVector.IntVector() 46 | 47 | for i in pyvec: 48 | vector.append(i) 49 | 50 | assert len(vector) == size 51 | 52 | for i in range(size): 53 | assert vector[i] == pyvec[i] 54 | 55 | for i in reversed(pyvec): 56 | assert vector.pop() == i 57 | 58 | assert len(vector) == 0 59 | 60 | 61 | if __name__ == "__main__": 62 | unittest() 63 | -------------------------------------------------------------------------------- /TestLib/EncoderTrain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 5 | 6 | from PuzzleLib.Containers import Sequential 7 | from PuzzleLib.Modules import Linear, Activation, relu, Dropout 8 | 9 | from PuzzleLib.Datasets import MnistLoader 10 | from PuzzleLib.Visual import showFilters 11 | from PuzzleLib.Optimizers import MomentumSGD 12 | from PuzzleLib.Cost import MSE 13 | from PuzzleLib.Variable import Variable 14 | 15 | 16 | def buildEncoder(): 17 | seq = Sequential() 18 | 19 | seq.append(Linear(784, 256)) 20 | seq.append(Activation(relu, inplace=True)) 21 | seq.append(Dropout()) 22 | seq.append(Linear(256, 784, empty=True, transpose=True)) 23 | 24 | seq[-1].setVar("W", seq[0].vars["W"]) 25 | seq[-1].setVar("b", Variable(gpuarray.zeros((784,), dtype=np.float32, allocator=memPool))) 26 | 27 | return seq 28 | 29 | 30 | def main(): 31 | mnist = MnistLoader() 32 | data, _ = mnist.load(path="../TestData") 33 | data = data[:].reshape(data.shape[0], -1) 34 | print("Loaded mnist") 35 | 36 | np.random.seed(1234) 37 | net = buildEncoder() 38 | 39 | optimizer = MomentumSGD() 40 | optimizer.setupOn(net, useGlobalState=True) 41 | optimizer.learnRate = 10.0 42 | optimizer.momRate = 0.5 43 | 44 | data = gpuarray.to_gpu(data) 45 | batchsize = 100 46 | 47 | mse = MSE() 48 | 49 | for epoch in range(40): 50 | for i in range(data.shape[0] // batchsize): 51 | batch = data[i * batchsize:(i + 1) * batchsize] 52 | 53 | net(batch) 54 | _, grad = mse(net.data, batch) 55 | 56 | net.zeroGradParams() 57 | net.backward(grad) 58 | optimizer.update() 59 | 60 | optimizer.learnRate *= 0.8 61 | print("Finished epoch %d" % (epoch + 1)) 62 | 63 | print("Error: %s" % (mse.getMeanError())) 64 | mse.resetAccumulator() 65 | 66 | if (epoch + 1) % 5 == 0: 67 | filters = net[0].W.get().T 68 | showFilters(filters.reshape(16, 16, 28, 28), "../TestData/encoder.png") 69 | 70 | 71 | if __name__ == "__main__": 72 | main() 73 | -------------------------------------------------------------------------------- /Optimizers/MomentumSGD.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | 5 | from PuzzleLib.Backend import gpuarray 6 | from PuzzleLib.Backend.Kernels.ElementWise import classicMomSGDKer 7 | 8 | from PuzzleLib.Optimizers.Optimizer import 
trainSimpleTest, trainHardTest 9 | from PuzzleLib.Optimizers.SGD import SGD 10 | 11 | 12 | class MomentumSGD(SGD): 13 | def __init__(self, learnRate=1e-3, momRate=0.9, nodeinfo=None): 14 | super().__init__(learnRate, nodeinfo) 15 | 16 | self.momRate = None 17 | self.setAttr("momRate", momRate) 18 | 19 | 20 | def setupState(self, var): 21 | return {"mom": gpuarray.zeros(var.data.shape, dtype=var.data.dtype)} 22 | 23 | 24 | def updateVar(self, var, state, stream=None): 25 | classicMomSGDKer(var.data.dtype)( 26 | var.data, var.grad, state["mom"], self.learnRate * var.learnRate, self.momRate * var.momRate, stream=stream 27 | ) 28 | 29 | 30 | def unittest(): 31 | for dtype, atol in gpuarray.dtypesSupported(): 32 | calcTest(dtype, atol) 33 | trainSimpleTest(MomentumSGD, dtype, learnRate=1e-1, momRate=0.9) 34 | 35 | if Config.backend == Config.Backend.cuda: 36 | trainHardTest(MomentumSGD, dtype, learnRate=1e-1, momRate=0.9) 37 | 38 | 39 | def calcTest(dtype, atol): 40 | lr, mr = 0.01, 0.9 41 | shape = (11, 13) 42 | 43 | hostW, hostDw = np.random.randn(*shape).astype(dtype), np.random.randn(*shape).astype(dtype) 44 | hostMom = np.random.randn(*shape).astype(dtype) 45 | 46 | w, dw, mom = gpuarray.to_gpu(hostW), gpuarray.to_gpu(hostDw), gpuarray.to_gpu(hostMom) 47 | classicMomSGDKer(w.dtype)(w, dw, mom, lr, mr) 48 | 49 | hostW, hostDw, hostMom = hostW.astype(np.float32), hostDw.astype(np.float32), hostMom.astype(np.float32) 50 | 51 | hostMom = mr * hostMom + lr * hostDw 52 | hostW += hostMom 53 | 54 | hostW, hostDw, hostMom = hostW.astype(dtype), hostDw.astype(dtype), hostMom.astype(dtype) 55 | 56 | assert np.allclose(hostMom, mom.get(), atol=atol) 57 | assert np.allclose(hostW, w.get(), atol=atol) 58 | 59 | 60 | if __name__ == "__main__": 61 | unittest() 62 | -------------------------------------------------------------------------------- /Cost/Abs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray, Blas 4 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 5 | from PuzzleLib.Backend.Kernels.ElementWise import l1gradKer 6 | 7 | from PuzzleLib.Cost.Cost import Cost 8 | 9 | 10 | class Abs(Cost): 11 | def calcGrad(self, pred, target): 12 | grad = gpuarray.empty(pred.shape, dtype=np.float32, allocator=memPool) 13 | norm = 1.0 / np.prod(target.shape) 14 | 15 | l1gradKer(grad, pred, target, norm) 16 | 17 | return grad 18 | 19 | 20 | def calcError(self, pred, target): 21 | diff = Blas.addVectorToVector(pred.ravel(), target.ravel(), alpha=1.0, beta=-1.0) 22 | 23 | self.devErr.fill(Blas.vectorL1Norm(diff) / np.prod(pred.shape[1:])) 24 | self.accumErr += self.devErr 25 | 26 | 27 | def calcVal(self, pred, target): 28 | diff = Blas.addVectorToVector(pred.ravel(), target.ravel(), alpha=1.0, beta=-1.0) 29 | error = Blas.vectorL1Norm(diff) / np.prod(target.shape) 30 | 31 | return error 32 | 33 | 34 | def checkDataShape(self, pred, target): 35 | assert pred.shape[1:] == target.shape[1:] 36 | 37 | 38 | def checkValDataShape(self, pred, target): 39 | assert pred.shape[1:] == target.shape[1:] 40 | 41 | 42 | def unittest(): 43 | errorTest() 44 | valTest() 45 | 46 | 47 | def errorTest(): 48 | pred = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 49 | target = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 50 | 51 | abscost = Abs() 52 | abscost(pred, target) 53 | 54 | assert np.isclose(abscost.error, np.linalg.norm((target.get() - pred.get()).ravel(), ord=1) / 
np.prod(target.shape)) 55 | 56 | 57 | def valTest(): 58 | pred = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 59 | target = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 60 | 61 | abscost = Abs() 62 | error = abscost.validate(pred, target) 63 | 64 | assert np.isclose(error, np.linalg.norm((target.get() - pred.get()).ravel(), ord=1) / np.prod(target.shape)) 65 | 66 | 67 | if __name__ == "__main__": 68 | unittest() 69 | -------------------------------------------------------------------------------- /Compiler/Codegen/Tree/Generate.py: -------------------------------------------------------------------------------- 1 | import os, random 2 | from string import Template 3 | 4 | from PuzzleLib.Compiler.Toolchain import createTemplateNames, writeTemplates, buildTemplateTest 5 | 6 | 7 | def generateTree(name, K, V, headerPreambule=None, bodyPreambule=None, malloc="malloc", free="free", filename=None): 8 | headerPreambule = "%s\n\n" % headerPreambule if headerPreambule is not None else "" 9 | bodyPreambule = "%s\n\n" % bodyPreambule if bodyPreambule is not None else "" 10 | 11 | filename = name if filename is None else filename 12 | headername, bodyname = createTemplateNames(filename) 13 | 14 | dirname = os.path.dirname(__file__) 15 | headerTmpl, bodyTmpl = os.path.join(dirname, "TTree.h"), os.path.join(dirname, "TTree.c") 16 | 17 | with open(headerTmpl, mode="r", encoding="utf-8") as f: 18 | header = Template(f.read()).substitute(HEADER_PREAMBULE=headerPreambule, NAME=name, K=K, V=V) 19 | 20 | with open(bodyTmpl, mode="r", encoding="utf-8") as f: 21 | body = Template(f.read()).substitute( 22 | HEADER_NAME=os.path.basename(headername), BODY_PREAMBULE=bodyPreambule, NAME=name, K=K, V=V, 23 | MALLOC=malloc, FREE=free 24 | ) 25 | 26 | writeTemplates([ 27 | (header, headername), 28 | (body, bodyname) 29 | ]) 30 | 31 | return bodyname 32 | 33 | 34 | def unittest(): 35 | IntTree = buildTemplateTest( 36 | name="IntTree", bindingName="TTreeTest.c", path="../../TestData", generator=generateTree, K="int", V="int" 37 | ) 38 | 39 | size = 1 << 16 40 | 41 | keys, values = list(range(size)), list(range(size)) 42 | random.shuffle(keys) 43 | random.shuffle(values) 44 | 45 | pytree = {k: v for k, v in zip(keys, values)} 46 | 47 | inttree = IntTree.IntTree() 48 | 49 | for k, v in pytree.items(): 50 | inttree[k] = v 51 | 52 | assert len(inttree) == size 53 | assert inttree.validate() 54 | 55 | for k in pytree.keys(): 56 | assert inttree[k] == pytree[k] 57 | 58 | for k in pytree.keys(): 59 | del inttree[k] 60 | 61 | assert len(inttree) == 0 62 | assert inttree.validate() 63 | 64 | 65 | if __name__ == "__main__": 66 | unittest() 67 | -------------------------------------------------------------------------------- /Optimizers/NesterovSGD.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | 5 | from PuzzleLib.Backend import gpuarray 6 | from PuzzleLib.Backend.Kernels.ElementWise import nesterovMomSGDKer 7 | 8 | from PuzzleLib.Optimizers.Optimizer import trainSimpleTest, trainHardTest 9 | from PuzzleLib.Optimizers.SGD import SGD 10 | 11 | 12 | class NesterovSGD(SGD): 13 | def __init__(self, learnRate=1e-3, momRate=0.9, nodeinfo=None): 14 | super().__init__(learnRate, nodeinfo) 15 | 16 | self.momRate = None 17 | self.setAttr("momRate", momRate) 18 | 19 | 20 | def setupState(self, var): 21 | return {"mom": gpuarray.zeros(var.data.shape, dtype=var.data.dtype)} 22 | 23 | 24 | def updateVar(self, var, 
state, stream=None): 25 | nesterovMomSGDKer(var.data.dtype)( 26 | var.data, var.grad, state["mom"], self.learnRate * var.learnRate, self.momRate * var.momRate, stream=stream 27 | ) 28 | 29 | 30 | def unittest(): 31 | for dtype, atol in gpuarray.dtypesSupported(): 32 | calcTest(dtype, atol) 33 | trainSimpleTest(NesterovSGD, dtype, learnRate=1e-1, momRate=0.9) 34 | 35 | if Config.backend == Config.Backend.cuda: 36 | trainHardTest(NesterovSGD, dtype, learnRate=1e-1, momRate=0.9) 37 | 38 | 39 | def calcTest(dtype, atol): 40 | lr, mr = 0.01, 0.9 41 | shape = (11, 13) 42 | 43 | hostW, hostDw = np.random.randn(*shape).astype(dtype), np.random.randn(*shape).astype(dtype) 44 | hostMom = np.random.randn(*shape).astype(dtype) 45 | 46 | w, dw, mom = gpuarray.to_gpu(hostW), gpuarray.to_gpu(hostDw), gpuarray.to_gpu(hostMom) 47 | nesterovMomSGDKer(w.dtype)(w, dw, mom, lr, mr) 48 | 49 | hostW, hostDw, hostMom = hostW.astype(np.float32), hostDw.astype(np.float32), hostMom.astype(np.float32) 50 | 51 | hostW += mr**2 * hostMom + (1 + mr) * lr * hostDw 52 | hostMom = mr * hostMom + lr * hostDw 53 | 54 | hostW, hostDw, hostMom = hostW.astype(dtype), hostDw.astype(dtype), hostMom.astype(dtype) 55 | 56 | assert np.allclose(hostMom, mom.get(), atol=atol) 57 | assert np.allclose(hostW, w.get(), atol=atol) 58 | 59 | 60 | if __name__ == "__main__": 61 | unittest() 62 | -------------------------------------------------------------------------------- /Hip/Utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Cuda.Utils import SharedArray, shareMemTest, randomTest 4 | from PuzzleLib.Hip import Driver as HipDriver 5 | 6 | 7 | class HipSharedArray(SharedArray): 8 | GPUArray = HipDriver.GPUArray 9 | 10 | 11 | def unittest(): 12 | from PuzzleLib.Hip import Backend 13 | 14 | for deviceIdx in range(Backend.getDeviceCount()): 15 | bnd = Backend.getBackend(deviceIdx, initmode=2) 16 | 17 | for dtype, _ in bnd.dtypesSupported(): 18 | shareMemTest(bnd, dtype) 19 | memCopyTest(bnd, dtype) 20 | 21 | randomTest(bnd) 22 | 23 | 24 | def memCopyTest(bnd, dtype): 25 | hostSrc = np.random.randn(4, 4, 4, 4).astype(dtype) 26 | 27 | src = bnd.GPUArray.toGpu(hostSrc) 28 | assert np.allclose(hostSrc, src.copy().get()) 29 | 30 | hostA = np.random.randn(7, 4, 4, 4).astype(dtype) 31 | a = bnd.GPUArray.toGpu(hostA) 32 | 33 | out = bnd.concatenate((src, a), axis=0) 34 | assert np.allclose(np.concatenate((hostSrc, hostA), axis=0), out.get()) 35 | 36 | hostA = np.random.randn(4, 2, 4, 4).astype(dtype) 37 | hostB = np.random.randn(4, 1, 4, 4).astype(dtype) 38 | 39 | a, b = bnd.GPUArray.toGpu(hostA), bnd.GPUArray.toGpu(hostB) 40 | 41 | out = bnd.concatenate((src, a, b), axis=1) 42 | assert np.allclose(np.concatenate((hostSrc, hostA, hostB), axis=1), out.get()) 43 | 44 | hostA = np.random.randn(4, 4, 5, 4).astype(dtype) 45 | 46 | out = bnd.concatenate((bnd.GPUArray.toGpu(hostA), src), axis=2) 47 | assert np.allclose(np.concatenate((hostA, hostSrc), axis=2), out.get()) 48 | 49 | outs = bnd.split(src, (2, 2), axis=0) 50 | assert all(np.allclose(hostSrc[2 * i:2 * (i + 1)], out.get()) for i, out in enumerate(outs)) 51 | 52 | outs = bnd.split(src, (2, 2), axis=1) 53 | assert all(np.allclose(hostSrc[:, 2 * i:2 * (i + 1), :, :], out.get()) for i, out in enumerate(outs)) 54 | 55 | outs = bnd.split(src, (2, 2), axis=2) 56 | assert all(np.allclose(hostSrc[:, :, 2 * i:2 * (i + 1), :], out.get()) for i, out in enumerate(outs)) 57 | 58 | assert np.allclose(np.tile(hostB, (1, 3, 1, 
1)), bnd.tile(b, 3, axis=1).get()) 59 | 60 | 61 | if __name__ == "__main__": 62 | unittest() 63 | -------------------------------------------------------------------------------- /Compiler/Codegen/Malloc/TMallocTest.c: -------------------------------------------------------------------------------- 1 | #define Py_LIMITED_API 2 | #include <Python.h> 3 | 4 | #include "TraceMalloc.gen.h" 5 | 6 | 7 | static PyObject *PyTraceMalloc_malloc(PyObject *self, PyObject *args) 8 | { 9 | (void)self, (void)args; 10 | Py_ssize_t nbytes; 11 | 12 | if (!PyArg_ParseTuple(args, "n", &nbytes)) 13 | return NULL; 14 | 15 | void *ptr = TRACE_MALLOC(nbytes); 16 | return Py_BuildValue("n", (Py_ssize_t)ptr); 17 | } 18 | 19 | 20 | static PyObject *PyTraceMalloc_free(PyObject *self, PyObject *args) 21 | { 22 | (void)self, (void)args; 23 | Py_ssize_t ptr; 24 | 25 | if (!PyArg_ParseTuple(args, "n", &ptr)) 26 | return NULL; 27 | 28 | TRACE_FREE((void *)ptr); 29 | Py_RETURN_NONE; 30 | } 31 | 32 | 33 | static PyObject *PyTraceMalloc_traceLeaks(PyObject *self, PyObject *args) 34 | { 35 | (void)self, (void)args; 36 | size_t nleaks = TraceMalloc_traceLeaks(); 37 | 38 | PyObject *leaks = PyList_New(nleaks); 39 | if (leaks == NULL) 40 | return NULL; 41 | 42 | size_t index = 0; 43 | if (!TraceMalloc_Iterator_init()) 44 | return leaks; 45 | 46 | do 47 | { 48 | size_t size; 49 | const char *file; 50 | int line; 51 | 52 | TraceMalloc_Iterator_item(&size, &file, &line); 53 | 54 | PyObject *leak = Py_BuildValue("(nsi)", (Py_ssize_t)size, file, line); 55 | if (leak == NULL) 56 | goto error; 57 | 58 | PyList_SetItem(leaks, index, leak); 59 | index += 1; 60 | } 61 | while (TraceMalloc_Iterator_move()); 62 | 63 | TraceMalloc_Iterator_dealloc(); 64 | return leaks; 65 | 66 | error: 67 | TraceMalloc_Iterator_dealloc(); 68 | Py_DECREF(leaks); 69 | 70 | return NULL; 71 | } 72 | 73 | 74 | static PyModuleDef PyTraceMalloc_moduleDef = { 75 | PyModuleDef_HEAD_INIT, 76 | .m_name = "TraceMalloc", 77 | .m_methods = (PyMethodDef[]){ 78 | {"malloc", PyTraceMalloc_malloc, METH_VARARGS, NULL}, 79 | {"free", PyTraceMalloc_free, METH_VARARGS, NULL}, 80 | {"traceLeaks", PyTraceMalloc_traceLeaks, METH_NOARGS, NULL}, 81 | {NULL, NULL, 0, NULL} 82 | }, 83 | .m_slots = NULL 84 | }; 85 | 86 | 87 | PyMODINIT_FUNC PyInit_TraceMalloc(void) 88 | { 89 | return PyModule_Create(&PyTraceMalloc_moduleDef); 90 | } 91 | -------------------------------------------------------------------------------- /Optimizers/RMSProp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | 5 | from PuzzleLib.Backend import gpuarray 6 | from PuzzleLib.Backend.Kernels.ElementWise import rmspropKer 7 | 8 | from PuzzleLib.Optimizers.Optimizer import Optimizer, trainSimpleTest, trainHardTest 9 | 10 | 11 | class RMSProp(Optimizer): 12 | def __init__(self, learnRate=1e-3, factor=0.9, epsilon=1e-5, nodeinfo=None): 13 | super().__init__(nodeinfo) 14 | 15 | self.factor = None 16 | self.epsilon = None 17 | 18 | self.setAttr("learnRate", learnRate) 19 | self.setAttr("factor", factor) 20 | self.setAttr("epsilon", epsilon) 21 | 22 | 23 | def setupState(self, var): 24 | return {"ms": gpuarray.zeros(var.data.shape, dtype=var.data.dtype)} 25 | 26 | 27 | def updateVar(self, var, state, stream=None): 28 | rmspropKer(var.data.dtype)( 29 | var.data, var.grad, state["ms"], self.learnRate * var.learnRate, self.factor, self.epsilon, stream=stream 30 | ) 31 | 32 | 33 | def unittest(): 34 | for dtype, atol in
gpuarray.dtypesSupported(): 35 | calcTest(dtype, atol) 36 | trainSimpleTest(RMSProp, dtype, learnRate=1e-2) 37 | 38 | if Config.backend == Config.Backend.cuda: 39 | trainHardTest(RMSProp, dtype, learnRate=1e-2) 40 | 41 | 42 | def calcTest(dtype, atol): 43 | lr, factor, epsilon = 0.01, 0.9, 1e-5 44 | shape = (11, 13) 45 | 46 | hostW, hostDw = np.random.randn(*shape).astype(dtype), np.random.randn(*shape).astype(dtype) 47 | hostMs = (1.0 + np.random.randn(*shape)**2).astype(dtype) 48 | 49 | w, dw, ms = gpuarray.to_gpu(hostW), gpuarray.to_gpu(hostDw), gpuarray.to_gpu(hostMs) 50 | rmspropKer(w.dtype)(w, dw, ms, lr, factor, epsilon) 51 | 52 | hostW, hostDw, hostMs = hostW.astype(np.float32), hostDw.astype(np.float32), hostMs.astype(np.float32) 53 | 54 | hostMs = factor * hostMs + (1 - factor) * hostDw**2 55 | hostW += lr * hostDw / (np.sqrt(hostMs) + epsilon) 56 | 57 | hostW, hostDw, hostMs = hostW.astype(dtype), hostDw.astype(dtype), hostMs.astype(dtype) 58 | 59 | assert np.allclose(hostMs, ms.get(), atol=atol) 60 | assert np.allclose(hostW, w.get(), atol=atol) 61 | 62 | 63 | if __name__ == "__main__": 64 | unittest() 65 | -------------------------------------------------------------------------------- /Modules/Mul.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.gpuarray import copy, memoryPool as memPool 5 | from PuzzleLib.Backend.Kernels.ElementWise import mulKer 6 | 7 | from PuzzleLib.Modules.Module import ModuleError, Module 8 | 9 | 10 | class Mul(Module): 11 | def updateData(self, data): 12 | self.data = gpuarray.empty(data[0].shape, dtype=data[0].dtype, allocator=memPool) 13 | self.data.fill(1.0) 14 | 15 | for dat in data: 16 | mulKer(dat.dtype)(self.data, dat, self.data) 17 | 18 | 19 | def updateGrad(self, grad): 20 | self.grad = [] 21 | for i in range(len(self.inData)): 22 | ingrad = copy(None, grad) 23 | 24 | for k in range(len(self.inData)): 25 | if k != i: 26 | mulKer(ingrad.dtype)(ingrad, self.inData[k], ingrad) 27 | 28 | self.grad.append(ingrad) 29 | 30 | 31 | def checkDataShape(self, shapes): 32 | for shape in shapes: 33 | if shape != shapes[0]: 34 | raise ModuleError("Shape %s is not equal to initial shape %s" % (shape, shapes[0])) 35 | 36 | 37 | def dataShapeFrom(self, shape): 38 | return shape 39 | 40 | 41 | def gradShapeFrom(self, shape): 42 | return [shape] * len(self.inData) 43 | 44 | 45 | def calcMode(self, T): 46 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 47 | 48 | if T not in dtypes: 49 | raise ModuleError("Unsupported dtype %s" % T) 50 | 51 | self.calctype = T 52 | 53 | 54 | def unittest(): 55 | for dtype, _ in gpuarray.dtypesSupported(): 56 | mulTest(dtype) 57 | 58 | 59 | def mulTest(dtype): 60 | hostData1 = np.random.randn(2, 5, 5).astype(dtype) 61 | hostData2 = np.random.randn(*hostData1.shape).astype(dtype) 62 | 63 | data1, data2 = gpuarray.to_gpu(hostData1), gpuarray.to_gpu(hostData2) 64 | 65 | mul = Mul() 66 | mul.calcMode(dtype) 67 | 68 | mul([data1, data2]) 69 | assert np.allclose(mul.data.get(), hostData1 * hostData2) 70 | 71 | hostGrad = np.random.randn(*mul.data.shape).astype(dtype) 72 | 73 | grad = gpuarray.to_gpu(hostGrad) 74 | mul.backward(grad) 75 | 76 | assert np.allclose(mul.grad[0].get(), hostGrad * hostData2) 77 | assert np.allclose(mul.grad[1].get(), hostGrad * hostData1) 78 | 79 | 80 | if __name__ == "__main__": 81 | unittest() 82 | 
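The backward pass of Mul above is an application of the product rule: for an elementwise product y = x_0 * x_1 * ... * x_n, the gradient with respect to x_i is the incoming gradient times the product of all the other inputs, which is exactly what the nested loops in updateGrad compute. A standalone NumPy sketch (not part of the library) that mirrors updateData/updateGrad on the host and checks the closed form for three inputs:

import numpy as np

np.random.seed(0)

inputs = [np.random.randn(2, 5, 5).astype(np.float32) for _ in range(3)]
grad = np.random.randn(2, 5, 5).astype(np.float32)

# Forward: running elementwise product, as in Mul.updateData
out = np.ones_like(inputs[0])
for x in inputs:
	out *= x

# Backward: grad w.r.t. input i is grad times the product of all other inputs,
# as in Mul.updateGrad
ingrads = []
for i in range(len(inputs)):
	g = grad.copy()
	for k, x in enumerate(inputs):
		if k != i:
			g *= x
	ingrads.append(g)

# Closed form for three inputs: d(x0*x1*x2)/dx_i = product of the other two
assert np.allclose(ingrads[0], grad * inputs[1] * inputs[2], atol=1e-5)
assert np.allclose(ingrads[1], grad * inputs[0] * inputs[2], atol=1e-5)
assert np.allclose(ingrads[2], grad * inputs[0] * inputs[1], atol=1e-5)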
-------------------------------------------------------------------------------- /Modules/Pool2D.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Backend import gpuarray 2 | from PuzzleLib.Modules.Module import ModuleError, Module 3 | 4 | 5 | class Pool2D(Module): 6 | def __init__(self, size=2, stride=2, pad=0, name=None): 7 | super().__init__(name) 8 | self.gradUsesOutData = True 9 | 10 | self.size = self.repeat(size, 2) 11 | self.stride = self.repeat(stride, 2) 12 | self.pad = self.repeat(pad, 2) 13 | 14 | self.workspace = None 15 | 16 | 17 | def dataShapeFrom(self, shape): 18 | batchsize, maps, inh, inw = shape 19 | 20 | hsize, wsize = self.size 21 | hpad, wpad = self.pad 22 | hstride, wstride = self.stride 23 | 24 | outh = (inh + 2 * hpad - hsize) // hstride + 1 25 | outw = (inw + 2 * wpad - wsize) // wstride + 1 26 | 27 | return batchsize, maps, outh, outw 28 | 29 | 30 | def checkDataShape(self, shape): 31 | if len(shape) != 4: 32 | raise ModuleError("Data must be 4d tensor") 33 | 34 | _, _, inh, inw = shape 35 | if inh + 2 * self.pad[0] < self.size[0]: 36 | raise ModuleError("Data maps height is too small (got %d, expected at least %d)" % 37 | (inh + 2 * self.pad[0], self.size[0])) 38 | 39 | if inw + 2 * self.pad[1] < self.size[1]: 40 | raise ModuleError("Data maps width is too small (got %d, expected at least %d)" % 41 | (inw + 2 * self.pad[1], self.size[1])) 42 | 43 | 44 | def gradShapeFrom(self, shape): 45 | batchsize, maps, outh, outw = shape 46 | 47 | hsize, wsize = self.size 48 | hpad, wpad = self.pad 49 | hstride, wstride = self.stride 50 | 51 | inh = (outh - 1) * hstride - 2 * hpad + hsize 52 | inw = (outw - 1) * wstride - 2 * wpad + wsize 53 | 54 | return batchsize, maps, inh, inw 55 | 56 | 57 | def checkGradShape(self, shape): 58 | if len(shape) != 4: 59 | raise ModuleError("Grad must be 4d tensor") 60 | 61 | 62 | def updateData(self, data): 63 | raise NotImplementedError() 64 | 65 | 66 | def updateGrad(self, grad): 67 | raise NotImplementedError() 68 | 69 | 70 | def reset(self): 71 | super().reset() 72 | self.workspace = None 73 | 74 | 75 | def calcMode(self, T): 76 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 77 | 78 | if T not in dtypes: 79 | raise ModuleError("Unsupported dtype %s" % T) 80 | 81 | self.calctype = T 82 | -------------------------------------------------------------------------------- /TestLib/MultiGPUMnist.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Grid import runGrid 4 | 5 | 6 | def train(nodeinfo, verbose): 7 | from PuzzleLib.Datasets import MnistLoader 8 | mnist = MnistLoader(cachename="mnist-%s.hdf" % nodeinfo.index) 9 | data, labels = mnist.load(path="../TestData/") 10 | 11 | data, labels = data[:], labels[:] 12 | print("[%s]: Loaded mnist" % nodeinfo.index) 13 | 14 | np.random.seed(1234) 15 | 16 | from PuzzleLib.Models.Nets.LeNet import loadLeNet 17 | net = loadLeNet(None, initscheme=None) 18 | 19 | from PuzzleLib.Optimizers import MomentumSGD 20 | optimizer = MomentumSGD(learnRate=0.1, momRate=0.9, nodeinfo=nodeinfo) 21 | optimizer.setupOn(net, useGlobalState=True) 22 | 23 | from PuzzleLib.Cost import CrossEntropy 24 | cost = CrossEntropy(maxlabels=10) 25 | 26 | from PuzzleLib.Handlers import Trainer, Validator 27 | trainer = Trainer(net, cost, optimizer, batchsize=128 // nodeinfo.gridsize) 28 | validator = Validator(net, cost) 29 | 30 | valsize = 10000 31 | trainsize = data.shape[0] - 
valsize 32 | 33 | trainpart = trainsize // nodeinfo.gridsize 34 | valpart = valsize // nodeinfo.gridsize 35 | 36 | for i in range(15): 37 | start, end = nodeinfo.index * trainpart, (nodeinfo.index + 1) * trainpart 38 | 39 | trainer.trainFromHost(data[start:end], labels[start:end], macroBatchSize=trainpart) 40 | trerr = cost.getMeanError() 41 | 42 | if verbose: 43 | print("[%s]: Epoch %s local train error: %s" % (nodeinfo.index, i + 1, trerr)) 44 | 45 | trerr = nodeinfo.meanValue(trerr) 46 | 47 | if nodeinfo.index == 0: 48 | print("Epoch %s global train error: %s" % (i + 1, trerr)) 49 | 50 | start, end = trainsize + nodeinfo.index * valpart, trainsize + (nodeinfo.index + 1) * valpart 51 | valerr = validator.validateFromHost(data[start:end], labels[start:end], macroBatchSize=valpart) 52 | 53 | if verbose: 54 | print("[%s]: Epoch %s local accuracy: %s" % (nodeinfo.index, i + 1, 1.0 - valerr)) 55 | 56 | valerr = nodeinfo.meanValue(valerr) 57 | 58 | if nodeinfo.index == 0: 59 | print("Epoch %s global accuracy: %s" % (i + 1, 1.0 - valerr)) 60 | 61 | optimizer.learnRate *= 0.9 62 | 63 | 64 | def main(): 65 | runGrid(target=train, size=2, verbose=True) 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /Modules/CrossMapLRN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.Dnn import crossMapLRN, crossMapLRNBackward 5 | 6 | from PuzzleLib.Modules.LRN import LRN 7 | 8 | 9 | class CrossMapLRN(LRN): 10 | def __init__(self, N=5, alpha=1e-4, beta=0.75, K=2.0, name=None): 11 | super().__init__(N, alpha, beta, K, name) 12 | self.gradUsesOutData = True 13 | 14 | 15 | def updateData(self, data): 16 | self.data, self.workspace = crossMapLRN(data, N=self.N, alpha=self.alpha, beta=self.beta, K=self.K, 17 | test=not self.train) 18 | 19 | 20 | def updateGrad(self, grad): 21 | self.grad = crossMapLRNBackward(self.inData, self.data, grad, self.workspace, 22 | N=self.N, alpha=self.alpha, beta=self.beta, K=self.K) 23 | 24 | 25 | def unittest(): 26 | maps = 10 27 | data = gpuarray.to_gpu(np.random.randn(1, maps, 1, 1).astype(np.float32)) 28 | 29 | crossMapLrn = CrossMapLRN() 30 | crossMapLrn(data) 31 | 32 | lookBehind = int((crossMapLrn.N - 1) / 2) 33 | lookAhead = crossMapLrn.N - lookBehind 34 | 35 | hostData = data.get().reshape(maps, ).astype(np.float32) 36 | norms = np.empty((maps, ), dtype=np.float32) 37 | for i in range(maps): 38 | norm = 0.0 39 | for j in range(max(0, i - lookBehind), min(maps, i + lookAhead)): 40 | norm += hostData[j]**2 41 | norms[i] = crossMapLrn.K + norm * crossMapLrn.alpha / crossMapLrn.N 42 | 43 | hostOutData = hostData / norms**crossMapLrn.beta 44 | assert np.allclose(hostOutData, crossMapLrn.data.get().reshape(maps, ).astype(np.float32)) 45 | 46 | grad = gpuarray.to_gpu(np.random.randn(1, maps, 1, 1).astype(np.float32)) 47 | crossMapLrn.backward(grad) 48 | 49 | hostGrad = grad.get().reshape(maps, ).astype(np.float32) 50 | hostInGrad = np.zeros((maps, ), dtype=np.float32) 51 | 52 | k = 2.0 * crossMapLrn.alpha * crossMapLrn.beta / crossMapLrn.N 53 | for i in range(maps): 54 | hostInGrad[i] += hostGrad[i] / norms[i]**crossMapLrn.beta 55 | 56 | for j in range(max(0, i - lookBehind), min(maps, i + lookAhead)): 57 | hostInGrad[j] -= hostGrad[i] * k * hostData[i] * hostData[j] / norms[i]**(crossMapLrn.beta+1) 58 | assert np.allclose(hostInGrad, 
crossMapLrn.grad.get().reshape(maps, ).astype(np.float32)) 59 | 60 | 61 | if __name__ == "__main__": 62 | unittest() 63 | -------------------------------------------------------------------------------- /Modules/MulAddConst.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | 5 | from PuzzleLib.Backend import gpuarray 6 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 7 | from PuzzleLib.Backend.Kernels.ElementWise import linearKer 8 | 9 | from PuzzleLib.Modules.Module import ModuleError, Module 10 | 11 | 12 | class MulAddConst(Module): 13 | def __init__(self, a=1.0, b=0.0, inplace=False, name=None): 14 | super().__init__(name) 15 | self.registerBlueprint(locals()) 16 | 17 | self.a, self.b = a, b 18 | self.inplace = inplace 19 | 20 | if inplace and Config.showWarnings: 21 | Config.getLogger().info("Warning: %s is using inplace flag", self) 22 | 23 | 24 | def updateData(self, data): 25 | self.data = data if self.inplace else gpuarray.empty(data.shape, dtype=data.dtype, allocator=memPool) 26 | linearKer(data.dtype)(self.data, data, self.a, self.b) 27 | 28 | 29 | def updateGrad(self, grad): 30 | self.grad = grad if self.inplace else gpuarray.empty(grad.shape, dtype=grad.dtype, allocator=memPool) 31 | linearKer(grad.dtype)(self.grad, grad, self.a, 0.0) 32 | 33 | 34 | def dataShapeFrom(self, shape): 35 | return shape 36 | 37 | 38 | def gradShapeFrom(self, shape): 39 | return shape 40 | 41 | 42 | def calcMode(self, T): 43 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 44 | 45 | if T not in dtypes: 46 | raise ModuleError("Unsupported dtype %s" % T) 47 | 48 | self.calctype = T 49 | 50 | 51 | def unittest(): 52 | for dtype, atol in gpuarray.dtypesSupported(): 53 | mulAddConstTest(dtype, atol) 54 | 55 | 56 | def mulAddConstTest(dtype, atol): 57 | hostData = np.random.randn(10, 10, 4, 3).astype(dtype) 58 | data = gpuarray.to_gpu(hostData) 59 | 60 | mulAdd = MulAddConst(a=3.141592, b=42.0) 61 | mulAdd.calcMode(dtype) 62 | 63 | mulAdd(data) 64 | 65 | hostOutData = (hostData.astype(np.float32) * mulAdd.a + mulAdd.b).astype(dtype) 66 | assert np.allclose(hostOutData, mulAdd.data.get(), atol=atol) 67 | 68 | hostGrad = np.random.randn(*data.shape).astype(dtype) 69 | grad = gpuarray.to_gpu(hostGrad) 70 | 71 | mulAdd.backward(grad) 72 | 73 | hostInGrad = hostGrad * mulAdd.a 74 | assert np.allclose(hostInGrad, mulAdd.grad.get(), atol=atol) 75 | 76 | 77 | if __name__ == "__main__": 78 | unittest() 79 | -------------------------------------------------------------------------------- /Modules/Gelu.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | 5 | from PuzzleLib import Config 6 | 7 | from PuzzleLib.Backend import gpuarray 8 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 9 | from PuzzleLib.Backend.Kernels.ElementWise import geluKer, geluDerKer 10 | 11 | from PuzzleLib.Modules.Module import ModuleError, Module 12 | 13 | 14 | class Gelu(Module): 15 | def __init__(self, inplace=False, name=None): 16 | super().__init__(name) 17 | self.registerBlueprint(locals()) 18 | 19 | self.inplace = inplace 20 | 21 | if inplace and Config.showWarnings: 22 | Config.getLogger().info("Warning: %s is using inplace flag", self) 23 | 24 | 25 | def updateData(self, data): 26 | self.data = data if self.inplace else gpuarray.empty(data.shape, dtype=data.dtype, allocator=memPool) 27 | geluKer(data.dtype)(self.data, 
data) 28 | 29 | 30 | def updateGrad(self, grad): 31 | self.grad = grad if self.inplace else gpuarray.empty(grad.shape, dtype=grad.dtype, allocator=memPool) 32 | geluDerKer(grad.dtype)(self.grad, grad, self.inData) 33 | 34 | 35 | def dataShapeFrom(self, shape): 36 | return shape 37 | 38 | 39 | def gradShapeFrom(self, shape): 40 | return shape 41 | 42 | 43 | def calcMode(self, T): 44 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 45 | 46 | if T not in dtypes: 47 | raise ModuleError("Unsupported dtype %s" % T) 48 | 49 | self.calctype = T 50 | 51 | 52 | def unittest(): 53 | for dtype, atol in gpuarray.dtypesSupported(): 54 | geluTest(dtype, atol) 55 | 56 | 57 | def geluTest(dtype, atol): 58 | gelu = Gelu() 59 | gelu.calcMode(dtype) 60 | 61 | hostData = np.random.randn(11, 51).astype(dtype) 62 | 63 | data = gpuarray.to_gpu(hostData) 64 | gelu(data) 65 | 66 | erf = np.vectorize(math.erf) 67 | hostOutData = 0.5 * hostData * (1.0 + erf(hostData / math.sqrt(2))) 68 | 69 | assert np.allclose(hostOutData, gelu.data.get(), atol=atol) 70 | 71 | hostGrad = np.random.randn(*gelu.data.shape).astype(dtype) 72 | 73 | grad = gpuarray.to_gpu(hostGrad) 74 | gelu.backward(grad) 75 | 76 | hostInGrad = hostGrad * (0.5 * (1.0 + erf(hostData / math.sqrt(2))) + 77 | hostData / math.sqrt(2 * math.pi) * np.exp(-0.5 * hostData**2)) 78 | assert np.allclose(hostInGrad, gelu.grad.get(), atol=atol) 79 | 80 | 81 | if __name__ == "__main__": 82 | unittest() 83 | -------------------------------------------------------------------------------- /Converter/OpenVINO/VINOEngine.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 5 | 6 | from PuzzleLib.Modules.Module import Module 7 | from PuzzleLib.Converter.OpenVINO import Driver 8 | 9 | 10 | def genEngineName(name): 11 | return "%s.xml" % name, "%s.bin" % name 12 | 13 | 14 | class VINOEngine(Module): 15 | def __init__(self, enginepath, batchsize, name=None): 16 | super().__init__(name) 17 | self.registerBlueprint(locals()) 18 | 19 | xmlpath, binpath = enginepath 20 | self.engine = Driver.VINOEngine(batchsize, xmlpath, binpath, "CPU") 21 | 22 | inshape, outshape = self.engine.inshape, self.engine.outshape 23 | 24 | inshape = [tuple(inshape[key]) for key in sorted(inshape.keys(), key=lambda nm: nm.split(sep="_")[-1])] 25 | outshape = [tuple(outshape[key]) for key in sorted(outshape.keys(), key=lambda nm: nm.split(sep="_")[-1])] 26 | 27 | self.inshape = inshape[0] if len(inshape) == 1 else inshape 28 | self.outshape = outshape[0] if len(outshape) == 1 else outshape 29 | 30 | 31 | def updateData(self, data): 32 | data = data if isinstance(data, list) else [data] 33 | inputs = {"data_%s" % i: (d.ptr, d.nbytes) for i, d in enumerate(data)} 34 | 35 | outshape = [self.outshape] if not isinstance(self.outshape, list) else self.outshape 36 | 37 | outdata = [gpuarray.empty(shape, dtype=np.float32, allocator=memPool) for shape in outshape] 38 | outputs = {"outdata_%s" % i: (data.ptr, data.nbytes) for i, data in enumerate(outdata)} 39 | 40 | self.engine.infer(outputs, inputs) 41 | self.data = outdata if isinstance(self.outshape, list) else outdata[0] 42 | 43 | 44 | def updateGrad(self, grad): 45 | assert False 46 | 47 | 48 | def dataShapeFrom(self, shape): 49 | return self.outshape 50 | 51 | 52 | def checkDataShape(self, shape): 53 | if isinstance(shape, list): 54 | for i, sh in enumerate(shape): 55 | if sh !=
self.inshape[i]: 56 | raise ValueError("Shape %s is not equal to shape %s on index %s" % (sh, self.inshape[i], i)) 57 | 58 | elif shape != self.inshape: 59 | raise ValueError("Data shape must be equal %s (was given %s)" % (self.inshape, shape)) 60 | 61 | 62 | def gradShapeFrom(self, shape): 63 | assert False 64 | 65 | 66 | def checkGradShape(self, shape): 67 | assert False 68 | -------------------------------------------------------------------------------- /Compiler/Codegen/Vector/TVector.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "$HEADER_NAME" 3 | 4 | $BODY_PREAMBULE 5 | void ${NAME}_init($NAME *self) 6 | { 7 | self->ptr = NULL; 8 | self->size = self->capacity = 0; 9 | } 10 | 11 | 12 | void ${NAME}_dealloc($NAME *self) 13 | { 14 | ${NAME}_clear(self); 15 | $FREE(self->ptr); 16 | } 17 | 18 | 19 | void ${NAME}_reserve($NAME *self, size_t capacity) 20 | { 21 | if (self->size < capacity) 22 | { 23 | $T *ptr = ($T *)$MALLOC(sizeof(self->ptr[0]) * capacity); 24 | 25 | for (size_t i = 0; i < self->size; i += 1) 26 | ptr[i] = self->ptr[i]; 27 | 28 | $FREE(self->ptr); 29 | 30 | self->ptr = ptr; 31 | self->capacity = capacity; 32 | } 33 | else 34 | { 35 | for (size_t i = capacity; i < self->size; i += 1) 36 | $DESTRUCT(self->ptr[i]); 37 | 38 | self->size = self->capacity = capacity; 39 | } 40 | } 41 | 42 | 43 | inline static void ${NAME}_ensureIsAppendable($NAME *self) 44 | { 45 | if (self->size == self->capacity) 46 | { 47 | size_t size = (self->capacity < $MIN_CAPACITY) ? $MIN_CAPACITY : self->capacity * 2; 48 | ${NAME}_reserve(self, size); 49 | } 50 | } 51 | 52 | 53 | void ${NAME}_append($NAME *self, $T elem) 54 | { 55 | ${NAME}_ensureIsAppendable(self); 56 | 57 | $BORROW(elem); 58 | self->ptr[self->size] = elem; 59 | 60 | self->size += 1; 61 | } 62 | 63 | 64 | void ${NAME}_appendEmpty($NAME *self) 65 | { 66 | ${NAME}_ensureIsAppendable(self); 67 | self->size += 1; 68 | } 69 | 70 | 71 | bool ${NAME}_pop($NAME *self, $T *elem) 72 | { 73 | if (self->size == 0) 74 | return false; 75 | 76 | self->size -= 1; 77 | *elem = self->ptr[self->size]; 78 | 79 | return true; 80 | } 81 | 82 | 83 | void ${NAME}_clear($NAME *self) 84 | { 85 | for (size_t i = 0; i < self->size; i += 1) 86 | $DESTRUCT(self->ptr[i]); 87 | 88 | self->size = 0; 89 | } 90 | 91 | 92 | bool ${NAME}_get($NAME *self, size_t index, $T *elem) 93 | { 94 | if (index >= self->size) 95 | return false; 96 | 97 | *elem = self->ptr[index]; 98 | return true; 99 | } 100 | 101 | 102 | bool ${NAME}_set($NAME *self, size_t index, $T elem) 103 | { 104 | if (index >= self->size) 105 | return false; 106 | 107 | $BORROW(elem); 108 | $DESTRUCT(self->ptr[index]); 109 | 110 | self->ptr[index] = elem; 111 | return true; 112 | } 113 | -------------------------------------------------------------------------------- /Modules/Slice.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 5 | 6 | from PuzzleLib.Modules.Module import ModuleError, Module 7 | 8 | 9 | class Slice(Module): 10 | def __init__(self, slc=None, name=None): 11 | super().__init__(name) 12 | self.registerBlueprint(locals()) 13 | 14 | self.slc = slc 15 | self.inshape = None 16 | 17 | 18 | def __getitem__(self, slc): 19 | if not isinstance(slc, tuple): 20 | slc = (slc, ) 21 | 22 | self.slc = slc 23 | return self 24 | 25 | 26 | def updateData(self, data): 27 | 
self.inshape = data.shape 28 | self.data = data[self.slc].copy(allocator=memPool) 29 | 30 | 31 | def updateGrad(self, grad): 32 | self.grad = gpuarray.zeros(self.inshape, dtype=np.float32, allocator=memPool) 33 | self.grad[self.slc] = grad 34 | 35 | 36 | def dataShapeFrom(self, shape): 37 | if self.slc is None: 38 | raise ModuleError("Slice parameter is not initialized") 39 | 40 | outshape = [None] * len(shape) 41 | for i, dim in enumerate(shape): 42 | slc = self.slc[i] 43 | start, stop, step = slc.indices(dim) 44 | 45 | outshape[i] = (stop - start) // step 46 | 47 | return tuple(outshape) 48 | 49 | 50 | def checkDataShape(self, shape): 51 | if self.slc is None: 52 | raise ModuleError("Slice parameter is not initialized") 53 | 54 | if len(shape) < len(self.slc): 55 | raise ModuleError("Expected at least %d data dimensions, %d were given" % (len(self.slc), len(shape))) 56 | 57 | 58 | def gradShapeFrom(self, shape): 59 | return self.inshape 60 | 61 | 62 | def checkGradShape(self, shape): 63 | if shape != self.data.shape: 64 | raise ModuleError("Grad shape %s is inconsistent with output data shape %s" % (shape, self.data.shape)) 65 | 66 | 67 | def unittest(): 68 | data = gpuarray.to_gpu(np.random.randn(3, 4, 5, 6).astype(np.float32)) 69 | 70 | slc = Slice()[:, :, 1:-1, 1:-1] 71 | slc(data) 72 | 73 | assert slc.dataShapeFrom(data.shape) == slc.data.shape 74 | assert np.allclose(slc.data.get(), data.get()[slc.slc]) 75 | 76 | grad = gpuarray.to_gpu(np.random.randn(*slc.data.shape).astype(np.float32)) 77 | slc.backward(grad) 78 | 79 | assert slc.gradShapeFrom(grad.shape) == data.shape 80 | assert np.allclose(slc.grad.get()[slc.slc], grad.get()) 81 | 82 | 83 | if __name__ == "__main__": 84 | unittest() 85 | -------------------------------------------------------------------------------- /Modules/Glue.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Modules.Module import ModuleError, Module 5 | 6 | 7 | class Glue(Module): 8 | def __init__(self, modules=None, fwdGlue=None, bwdGlue=None, fwdShapeGlue=None, bwdShapeGlue=None, name=None): 9 | super().__init__(name) 10 | 11 | if modules is not None and not isinstance(modules, dict): 12 | raise ModuleError("Modules object must be non-empty dictionary") 13 | 14 | self.modules = modules 15 | 16 | self.fwdGlue = fwdGlue 17 | self.bwdGlue = bwdGlue 18 | 19 | self.fwdShapeGlue = fwdShapeGlue 20 | self.bwdShapeGlue = bwdShapeGlue 21 | 22 | 23 | def updateData(self, data): 24 | self.data = self.fwdGlue(data, self.modules) 25 | 26 | 27 | def updateGrad(self, grad): 28 | self.grad = self.bwdGlue(grad, self.modules) 29 | 30 | 31 | def dataShapeFrom(self, shape): 32 | if self.fwdShapeGlue is not None: 33 | return self.fwdShapeGlue(shape) 34 | else: 35 | raise ModuleError("Forward shape glue hook is not installed") 36 | 37 | 38 | def gradShapeFrom(self, shape): 39 | if self.bwdShapeGlue is not None: 40 | return self.bwdShapeGlue(shape) 41 | else: 42 | raise ModuleError("Backward shape glue hook is not installed") 43 | 44 | 45 | def unittest(): 46 | data1 = gpuarray.to_gpu(np.random.randn(10, 2, 3, 3).astype(np.float32)) 47 | data2 = gpuarray.to_gpu(np.random.randn(10, 2, 3, 3).astype(np.float32)) 48 | data3 = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 49 | 50 | def fwdGlue(data, modules): 51 | dat1, dat2, dat3 = data 52 | split = modules["split"] 53 | out1, out2 = split(data3) 54 | 55 | return [dat1 + dat2, out1, out2] 56 | 57 
| def bwdGlue(grad, modules): 58 | gr1, gr2, gr3 = grad 59 | split = modules["split"] 60 | split.backward([gr2, gr3]) 61 | 62 | return [gr1, gr1, split.grad] 63 | 64 | from PuzzleLib.Modules.Split import Split 65 | glue = Glue(fwdGlue=fwdGlue, bwdGlue=bwdGlue, modules={"split": Split(axis=1, sections=(5, 5))}) 66 | glue([data1, data2, data3]) 67 | 68 | grad1 = gpuarray.to_gpu(np.random.randn(*glue.data[0].shape).astype(np.float32)) 69 | grad2 = gpuarray.to_gpu(np.random.randn(*glue.data[1].shape).astype(np.float32)) 70 | grad3 = gpuarray.to_gpu(np.random.randn(*glue.data[2].shape).astype(np.float32)) 71 | 72 | glue.backward([grad1, grad2, grad3]) 73 | 74 | 75 | if __name__ == "__main__": 76 | unittest() 77 | -------------------------------------------------------------------------------- /Modules/AvgPool2D.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.Dnn import PoolMode, poolNd, poolNdBackward 5 | 6 | from PuzzleLib.Modules.Pool2D import Pool2D 7 | 8 | 9 | class AvgPool2D(Pool2D): 10 | def __init__(self, size=2, stride=2, pad=0, includePad=True, name=None): 11 | super().__init__(size, stride, pad, name) 12 | self.registerBlueprint(locals()) 13 | 14 | self.mode = PoolMode.avgWithPad if includePad else PoolMode.avgNoPad 15 | 16 | 17 | def updateData(self, data): 18 | self.data, self.workspace = poolNd(data, size=self.size, stride=self.stride, pad=self.pad, mode=self.mode, 19 | test=not self.train) 20 | 21 | 22 | def updateGrad(self, grad): 23 | self.grad = poolNdBackward(self.inData, self.data, grad, self.workspace, size=self.size, stride=self.stride, 24 | pad=self.pad, mode=self.mode) 25 | 26 | 27 | def unittest(): 28 | batchsize, maps, h, w = 2, 3, 6, 6 29 | data = gpuarray.to_gpu(np.random.randn(batchsize, maps, h, w).astype(np.float32)) 30 | 31 | size = 3 32 | stride, pad = 1, 1 33 | 34 | avgpool2d = AvgPool2D(size=size, stride=stride, pad=pad, includePad=True) 35 | avgpool2d(data) 36 | 37 | grad = gpuarray.to_gpu(np.random.randn(*avgpool2d.data.shape).astype(np.float32)) 38 | avgpool2d.backward(grad) 39 | 40 | hostData = np.zeros(shape=(batchsize, maps, h + 2 * pad, w + 2 * pad), dtype=np.float32) 41 | hostData[:, :, pad:-pad, pad:-pad] = data.get() 42 | 43 | hostOutData = np.empty(avgpool2d.data.shape, dtype=np.float32) 44 | 45 | for b in range(batchsize): 46 | for c in range(maps): 47 | for y in range(avgpool2d.data.shape[2]): 48 | for x in range(avgpool2d.data.shape[3]): 49 | hostOutData[b,c,y,x] = np.sum(hostData[b,c,y*stride:y*stride+size, x*stride:x*stride+size])/size**2 50 | 51 | assert np.allclose(hostOutData, avgpool2d.data.get()) 52 | 53 | hostGrad, hostInGrad = grad.get(), np.zeros(hostData.shape, dtype=np.float32) 54 | 55 | for b in range(batchsize): 56 | for c in range(maps): 57 | for y in range(hostGrad.shape[2]): 58 | for x in range(hostGrad.shape[3]): 59 | for dy in range(size): 60 | for dx in range(size): 61 | hostInGrad[b, c, y * stride + dy, x * stride + dx] += hostGrad[b, c, y, x] / size**2 62 | 63 | assert np.allclose(hostInGrad[:, :, pad:-pad, pad:-pad], avgpool2d.grad.get()) 64 | 65 | 66 | if __name__ == "__main__": 67 | unittest() 68 | -------------------------------------------------------------------------------- /Modules/Transpose.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray, Memory 4 | from PuzzleLib.Modules.Module 
import ModuleError, Module 5 | 6 | 7 | class Transpose(Module): 8 | def __init__(self, axes=None, name=None): 9 | super().__init__(name) 10 | self.registerBlueprint(locals()) 11 | 12 | self.axes = axes 13 | 14 | if axes is None: 15 | self.invaxes = None 16 | else: 17 | self.invaxes = [0] * len(axes) 18 | for i, axis in enumerate(axes): 19 | self.invaxes[axis] = i 20 | 21 | 22 | def updateData(self, data): 23 | self.data = Memory.transpose(data, self.axes) 24 | 25 | 26 | def updateGrad(self, grad): 27 | self.grad = Memory.transpose(grad, self.invaxes) 28 | 29 | 30 | def checkDataShape(self, shape): 31 | if self.axes is not None and len(shape) != len(self.axes): 32 | raise ModuleError("Data dimension needs to be %d, (data has %d)" % (len(self.axes), len(shape))) 33 | 34 | 35 | def checkGradShape(self, shape): 36 | if self.axes is not None and len(shape) != len(self.axes): 37 | raise ModuleError("Grad dimension needs to be %d, (grad has %d)" % (len(self.axes), len(shape))) 38 | 39 | 40 | def dataShapeFrom(self, shape): 41 | return tuple(shape[axis] for axis in self.axes) 42 | 43 | 44 | def gradShapeFrom(self, shape): 45 | return tuple(shape[axis] for axis in self.invaxes) 46 | 47 | 48 | def calcMode(self, T): 49 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 50 | 51 | if T not in dtypes: 52 | raise ModuleError("Unsupported dtype %s" % T) 53 | 54 | self.calctype = T 55 | 56 | 57 | def unittest(): 58 | for dtype, _ in gpuarray.dtypesSupported(): 59 | transposeTest(dtype) 60 | 61 | 62 | def transposeTest(dtype): 63 | shape = (10, 3, 5, 4, 2) 64 | axes = (2, 4, 1, 3, 0) 65 | 66 | hostData = np.random.randn(*shape).astype(dtype) 67 | data = gpuarray.to_gpu(hostData) 68 | 69 | transpose = Transpose(axes) 70 | transpose.calcMode(dtype) 71 | 72 | transpose(data) 73 | 74 | hostOutData = np.transpose(hostData, axes=axes) 75 | assert np.allclose(hostOutData, transpose.data.get()) 76 | 77 | hostGrad = np.random.randn(*transpose.data.shape).astype(dtype) 78 | grad = gpuarray.to_gpu(hostGrad) 79 | 80 | transpose.backward(grad) 81 | 82 | hostInGrad = np.transpose(hostGrad, axes=transpose.invaxes) 83 | assert np.allclose(hostInGrad, transpose.grad.get()) 84 | 85 | 86 | if __name__ == "__main__": 87 | unittest() 88 | -------------------------------------------------------------------------------- /Modules/MapLRN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | 5 | from PuzzleLib.Backend import gpuarray 6 | from PuzzleLib.Backend.Dnn import mapLRN, mapLRNBackward 7 | 8 | from PuzzleLib.Modules.LRN import LRN 9 | 10 | 11 | class MapLRN(LRN): 12 | def __init__(self, N=5, alpha=1e-4, beta=0.75, K=2.0, name=None): 13 | super().__init__(N, alpha, beta, K, name) 14 | 15 | if Config.backend != Config.Backend.cuda: 16 | self.gradUsesOutData = True 17 | 18 | 19 | def updateData(self, data): 20 | self.data, self.workspace = mapLRN(data, None, N=self.N, alpha=self.alpha, beta=self.beta, K=self.K, 21 | test=not self.train) 22 | 23 | 24 | def updateGrad(self, grad): 25 | self.grad = mapLRNBackward(self.inData, self.data, grad, None, self.workspace, 26 | N=self.N, alpha=self.alpha, beta=self.beta, K=self.K) 27 | 28 | 29 | def unittest(): 30 | h, w = 10, 10 31 | data = gpuarray.to_gpu(np.random.randn(1, 1, h, w).astype(np.float32)) 32 | 33 | mapLrn = MapLRN() 34 | mapLrn(data) 35 | 36 | lookBehind = int((mapLrn.N - 1) / 2) 37 | lookAhead = mapLrn.N - lookBehind 38 | 39 | hostData = data.get().reshape(h, 
w).astype(np.float32) 40 | norms = np.empty((h, w), dtype=np.float32) 41 | for i in range(h): 42 | for j in range(w): 43 | norm = 0.0 44 | for m in range(max(0, i - lookBehind), min(h, i + lookAhead)): 45 | for n in range(max(0, j - lookBehind), min(w, j + lookAhead)): 46 | norm += hostData[m, n]**2 47 | norms[i, j] = mapLrn.K + norm * mapLrn.alpha / mapLrn.N**2 48 | 49 | hostOutData = hostData / norms**mapLrn.beta 50 | assert np.allclose(hostOutData, mapLrn.data.get()[0, 0]) 51 | 52 | grad = gpuarray.to_gpu(np.random.randn(1, 1, h, w).astype(np.float32)) 53 | mapLrn.backward(grad) 54 | 55 | hostGrad = grad.get().reshape(h, w).astype(np.float32) 56 | hostInGrad = np.zeros((h, w), dtype=np.float32) 57 | 58 | k = 2.0 * mapLrn.alpha * mapLrn.beta / mapLrn.N**2 59 | for i in range(h): 60 | for j in range(w): 61 | hostInGrad[i, j] += hostGrad[i, j] / norms[i, j]**mapLrn.beta 62 | 63 | for m in range(max(0, i - lookBehind), min(h, i + lookAhead)): 64 | for n in range(max(0, j - lookBehind), min(w, j + lookAhead)): 65 | hostInGrad[i, j] -= k*hostGrad[m, n]*hostData[i, j]*hostData[m, n]/norms[m, n]**(mapLrn.beta+1) 66 | 67 | assert np.allclose(hostInGrad, mapLrn.grad.get()[0, 0]) 68 | 69 | 70 | if __name__ == "__main__": 71 | unittest() 72 | -------------------------------------------------------------------------------- /Optimizers/AdaDelta.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | 5 | from PuzzleLib.Backend import gpuarray 6 | from PuzzleLib.Backend.Kernels.ElementWise import adadeltaKer 7 | 8 | from PuzzleLib.Optimizers.Optimizer import Optimizer, trainSimpleTest, trainHardTest 9 | 10 | 11 | class AdaDelta(Optimizer): 12 | def __init__(self, rho=0.95, epsilon=1e-6, nodeinfo=None): 13 | super().__init__(nodeinfo) 14 | 15 | self.rho = None 16 | self.epsilon = None 17 | 18 | self.setAttr("rho", rho) 19 | self.setAttr("epsilon", epsilon) 20 | 21 | 22 | def setupState(self, var): 23 | return { 24 | "msg": gpuarray.zeros(var.data.shape, dtype=var.data.dtype), 25 | "msdx": gpuarray.zeros(var.data.shape, dtype=var.data.dtype) 26 | } 27 | 28 | 29 | def updateVar(self, var, state, stream=None): 30 | adadeltaKer(var.data.dtype)( 31 | var.data, var.grad, state["msg"], state["msdx"], self.rho, self.epsilon, stream=stream 32 | ) 33 | 34 | 35 | def unittest(): 36 | for dtype, atol in gpuarray.dtypesSupported(): 37 | calcTest(dtype, atol) 38 | trainSimpleTest(AdaDelta, dtype) 39 | 40 | if Config.backend == Config.Backend.cuda: 41 | trainHardTest(AdaDelta, dtype) 42 | 43 | 44 | def calcTest(dtype, atol): 45 | rho, epsilon = 0.95, 1e-6 46 | shape = (11, 13) 47 | 48 | hostW, hostDw = np.random.randn(*shape).astype(dtype), np.random.randn(*shape).astype(dtype) 49 | hostMsg = (1.0 + np.random.randn(*shape)**2).astype(dtype) 50 | hostMsdx = (1.0 + np.random.randn(*shape)**2).astype(dtype) 51 | 52 | w, dw = gpuarray.to_gpu(hostW), gpuarray.to_gpu(hostDw) 53 | msg, msdx = gpuarray.to_gpu(hostMsg), gpuarray.to_gpu(hostMsdx) 54 | 55 | adadeltaKer(w.dtype)(w, dw, msg, msdx, rho, epsilon) 56 | 57 | hostW, hostDw = hostW.astype(np.float32), hostDw.astype(np.float32) 58 | hostMsg, hostMsdx = hostMsg.astype(np.float32), hostMsdx.astype(np.float32) 59 | 60 | hostMsg += (1.0 - rho) * (hostDw * hostDw - hostMsg) 61 | hostDx = np.sqrt((hostMsdx + epsilon) / (hostMsg + epsilon)) * hostDw 62 | hostMsdx += (1.0 - rho) * (hostDx**2 - hostMsdx) 63 | hostW += hostDx 64 | 65 | hostW, hostDw = hostW.astype(dtype), 
hostDw.astype(dtype) 66 | hostMsg, hostMsdx = hostMsg.astype(dtype), hostMsdx.astype(dtype) 67 | 68 | assert np.allclose(hostMsg, msg.get(), atol=atol) 69 | assert np.allclose(hostMsdx, msdx.get(), atol=atol) 70 | assert np.allclose(hostW, w.get(), atol=atol) 71 | 72 | 73 | if __name__ == "__main__": 74 | unittest() 75 | -------------------------------------------------------------------------------- /TestLib/MultiGPUCifar10.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Grid import runGrid 4 | 5 | 6 | def train(nodeinfo, verbose): 7 | from PuzzleLib.Datasets import Cifar10Loader 8 | cifar10 = Cifar10Loader(cachename="cifar10-%s.hdf" % nodeinfo.index) 9 | data, labels = cifar10.load(path="../TestData/") 10 | 11 | data, labels = data[:], labels[:] 12 | print("[%s]: Loaded cifar10" % nodeinfo.index) 13 | 14 | np.random.seed(1234) 15 | 16 | from PuzzleLib.TestLib.CnnCifar10Simple import buildNet 17 | net = buildNet() 18 | 19 | from PuzzleLib.Optimizers import MomentumSGD 20 | optimizer = MomentumSGD(learnRate=0.01, momRate=0.9, nodeinfo=nodeinfo) 21 | optimizer.setupOn(net, useGlobalState=True) 22 | 23 | from PuzzleLib.Cost import CrossEntropy 24 | cost = CrossEntropy(maxlabels=10) 25 | 26 | from PuzzleLib.Handlers import Trainer, Validator 27 | trainer = Trainer(net, cost, optimizer, batchsize=128 // nodeinfo.gridsize) 28 | validator = Validator(net, cost) 29 | 30 | import math 31 | currerror = math.inf 32 | 33 | valsize = 10000 34 | trainsize = data.shape[0] - valsize 35 | 36 | trainpart = trainsize // nodeinfo.gridsize 37 | valpart = valsize // nodeinfo.gridsize 38 | 39 | for i in range(25): 40 | start, end = nodeinfo.index * trainpart, (nodeinfo.index + 1) * trainpart 41 | 42 | trainer.trainFromHost(data[start:end], labels[start:end], macroBatchSize=trainpart) 43 | trerr = cost.getMeanError() 44 | 45 | if verbose: 46 | print("[%s]: Epoch %s local train error: %s" % (nodeinfo.index, i + 1, trerr)) 47 | 48 | trerr = nodeinfo.meanValue(trerr) 49 | 50 | if nodeinfo.index == 0: 51 | print("Epoch %s global train error: %s" % (i + 1, trerr)) 52 | 53 | start, end = trainsize + nodeinfo.index * valpart, trainsize + (nodeinfo.index + 1) * valpart 54 | valerr = validator.validateFromHost(data[start:end], labels[start:end], macroBatchSize=valpart) 55 | 56 | if verbose: 57 | print("[%s]: Epoch %s local accuracy: %s" % (nodeinfo.index, i + 1, 1.0 - valerr)) 58 | 59 | valerr = nodeinfo.meanValue(valerr) 60 | 61 | if nodeinfo.index == 0: 62 | print("Epoch %s global accuracy: %s" % (i + 1, 1.0 - valerr)) 63 | 64 | if valerr >= currerror: 65 | optimizer.learnRate *= 0.5 66 | print("[%s]: Lowered learn rate: %s" % (nodeinfo.index, optimizer.learnRate)) 67 | 68 | currerror = valerr 69 | 70 | 71 | def main(): 72 | runGrid(target=train, size=2, verbose=True) 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /CPU/Wrappers/NumpyBlas.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.CPU.CPUArray import CPUArray 4 | from PuzzleLib.CPU.Kernels import ElementWise 5 | 6 | 7 | def sumOnMatrix(A, out=None, cols=True, alpha=1.0, beta=0.0): 8 | assert A.ndim == 2 9 | assert A.flags.c_contiguous 10 | assert A.dtype == np.float32 11 | 12 | if out is None: 13 | out = CPUArray.empty((A.shape[1], ) if cols else (A.shape[0], ), dtype=np.float32) 14 | 15 | if alpha == 1.0 and 
beta == 0.0: 16 | np.sum(A.data, axis=0 if cols else 1, out=out.data) 17 | 18 | else: 19 | s = np.sum(A.data, axis=0 if cols else 1) 20 | np.add(beta * out.data, alpha * s, out=out.data) 21 | 22 | return out 23 | 24 | 25 | def toVectorAddVector(y, x, alpha=1.0): 26 | assert x.ndim == 1 27 | assert x.shape == y.shape 28 | assert y.flags.forc and x.flags.forc 29 | 30 | assert x.dtype == y.dtype 31 | assert x.dtype == np.float32 32 | 33 | ElementWise.toVectorAddVectorKer(y.dtype)(y, x, alpha) 34 | 35 | 36 | def addVectorToVector(x, y, out=None, alpha=1.0, beta=1.0): 37 | assert x.ndim == 1 38 | assert x.flags.forc and y.flags.forc 39 | assert x.shape == y.shape 40 | assert x.dtype == y.dtype and x.dtype == np.float32 41 | 42 | if out is None: 43 | out = CPUArray.empty(x.shape, dtype=np.float32) 44 | 45 | ElementWise.addVectorToVectorKer(out, x, y, alpha, beta) 46 | return out 47 | 48 | 49 | def vectorL1Norm(x): 50 | assert x.ndim == 1 51 | assert x.flags.forc 52 | assert x.dtype == np.float32 53 | 54 | return np.linalg.norm(x.data, ord=1) 55 | 56 | 57 | def dot(x, y): 58 | assert x.ndim == 1 59 | assert x.shape == y.shape 60 | assert x.flags.forc and y.flags.forc 61 | assert x.dtype == y.dtype and y.dtype == np.float32 62 | 63 | return np.vdot(x.data, y.data) 64 | 65 | 66 | def mulMatrixOnMatrix(A, B, out=None, transpA=False, transpB=False, alpha=1.0, beta=0.0): 67 | assert not (transpA and transpB) 68 | assert A.ndim == 2 and B.ndim == 2 69 | 70 | assert alpha == 1.0 and beta == 0.0 71 | 72 | if transpA: 73 | assert A.shape[0] == B.shape[0] 74 | shape = (A.shape[1], B.shape[1]) 75 | 76 | elif transpB: 77 | assert A.shape[1] == B.shape[1] 78 | shape = (A.shape[0], B.shape[0]) 79 | 80 | else: 81 | assert A.shape[1] == B.shape[0] 82 | shape = (A.shape[0], B.shape[1]) 83 | 84 | A = A.data.T if transpA else A.data 85 | B = B.data.T if transpB else B.data 86 | 87 | if out is None: 88 | out = CPUArray.empty(shape, dtype=np.float32) 89 | 90 | np.dot(A, B, out=out.data) 91 | return out 92 | -------------------------------------------------------------------------------- /Backend/Kernels/MatVec.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | 3 | 4 | addVecToMat = None 5 | argmax = None 6 | 7 | addVecToMatBatch = None 8 | argmaxBatch = None 9 | 10 | 11 | def autoinit(): 12 | if not Config.shouldInit(): 13 | return 14 | 15 | if Config.backend == Config.Backend.cuda: 16 | initCuda() 17 | elif Config.backend == Config.Backend.hip: 18 | initHip() 19 | elif Config.isCPUBased(Config.backend): 20 | initCPU() 21 | else: 22 | raise Config.ConfigError(Config.backend) 23 | 24 | 25 | def initCuda(): 26 | from PuzzleLib.Cuda import Backend 27 | initGPU(Backend) 28 | 29 | 30 | def initHip(): 31 | from PuzzleLib.Hip import Backend 32 | initGPU(Backend) 33 | 34 | 35 | def initGPU(Backend): 36 | backend = Backend.getBackend(Config.deviceIdx, initmode=2, logger=Config.getLogger()) 37 | memoryPool, matmod = backend.memoryPool, backend.matmod 38 | 39 | def wrapAddVecToMat(vec, mat, axis, out): 40 | return matmod.addVecToMat(vec, mat, axis, out, memoryPool) 41 | 42 | def wrapArgmax(tensor, axis): 43 | return matmod.argmax(tensor, axis, memoryPool) 44 | 45 | global addVecToMat, argmax 46 | addVecToMat = wrapAddVecToMat 47 | argmax = wrapArgmax 48 | 49 | def wrapAddVecToMatBatch(vec, mat, axis, out): 50 | return matmod.addVecToMat(vec, mat, axis, out, memoryPool) 51 | 52 | def wrapArgmaxBatch(tensor, axis): 53 | return matmod.argmax(tensor, axis, 
memoryPool) 54 | 55 | global addVecToMatBatch, argmaxBatch 56 | addVecToMatBatch = wrapAddVecToMatBatch 57 | argmaxBatch = wrapArgmaxBatch 58 | 59 | 60 | def initCPU(): 61 | import numpy as np 62 | from PuzzleLib.CPU.CPUArray import CPUArray 63 | 64 | def wrapAddVecToMat(v, m, axis, out): 65 | if axis == 0: 66 | v = v[:, np.newaxis] 67 | elif axis == 1: 68 | v = v[np.newaxis, :] 69 | 70 | np.add(m.get(copy=False), v.get(copy=False), out=out.get(copy=False)) 71 | 72 | def wrapArgmax(mats, axis): 73 | out = np.empty(mats.shape[:axis] + mats.shape[axis + 1:], dtype=np.int32) 74 | np.argmax(mats.get(copy=False), axis, out=out) 75 | 76 | return CPUArray(out.shape, out.dtype, data=out, acquire=True) 77 | 78 | global addVecToMat, argmax 79 | addVecToMat = wrapAddVecToMat 80 | argmax = wrapArgmax 81 | 82 | def wrapArgmax(mats, axis): 83 | out = np.empty(mats.shape[:axis] + mats.shape[axis + 1:], dtype=np.int32) 84 | np.argmax(mats.get(copy=False), axis, out=out) 85 | 86 | return CPUArray(out.shape, out.dtype, data=out, acquire=True) 87 | 88 | global argmaxBatch 89 | argmaxBatch = wrapArgmax 90 | 91 | 92 | autoinit() 93 | -------------------------------------------------------------------------------- /CPU/Kernels/Upsample2D.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Compiler.Codegen.Types import void_t, int32_t, float_t 4 | 5 | from PuzzleLib.CPU.SourceModule import SourceModule 6 | from PuzzleLib.CPU.CPUArray import CPUArray 7 | 8 | 9 | upsample2dNearestTmpl = """ 10 | 11 | static void upsample2dNearest(float * __restrict outdata, const float * __restrict indata, 12 | int32_t batchsize, int32_t maps, int32_t inh, int32_t inw, int32_t hscale, int32_t wscale) 13 | { 14 | int32_t outh = inh * hscale, outw = inw * wscale; 15 | 16 | for (int32_t z = 0; z < batchsize * maps; z++) 17 | for (int32_t y = 0; y < inh; y++) 18 | for (int32_t x = 0; x < inw; x++) 19 | for (int32_t i = 0; i < hscale; i++) 20 | for (int32_t j = 0; j < wscale; j++) 21 | { 22 | int32_t outidx = z * outh * outw + (y * hscale + i) * outw + (x * wscale + j); 23 | outdata[outidx] = indata[z * inh * inw + y * inw + x]; 24 | } 25 | } 26 | 27 | """ 28 | 29 | 30 | nearestMod = SourceModule(upsample2dNearestTmpl, functions=[ 31 | ("upsample2dNearest", void_t, [ 32 | (float_t.ptr.restrict, "outdata"), (float_t.const.ptr.restrict, "indata"), (int32_t, "batchsize"), 33 | (int32_t, "maps"), (int32_t, "inh"), (int32_t, "inw"), (int32_t, "hscale"), (int32_t, "wscale") 34 | ], True) 35 | ]) 36 | 37 | 38 | def upsample2d(data, scale, mode="nearest"): 39 | batchsize, maps, inh, inw = data.shape 40 | hscale, wscale = (scale, scale) if isinstance(scale, int) else scale 41 | 42 | outh, outw = hscale * inh, wscale * inw 43 | outdata = CPUArray.empty((batchsize, maps, outh, outw), dtype=data.dtype) 44 | 45 | if mode == "nearest": 46 | nearestMod.upsample2dNearest(outdata.data, data.data, batchsize, maps, inh, inw, hscale, wscale) 47 | 48 | else: 49 | raise ValueError("Unsupported upsampling mode") 50 | 51 | return outdata 52 | 53 | 54 | def unittest(): 55 | batchsize, maps, inh, inw = 3, 2, 16, 15 56 | scale = 2 57 | 58 | data = CPUArray.toDevice(np.random.uniform(low=-1.0, high=1.0, size=(batchsize, maps, inh, inw)).astype(np.float32)) 59 | outdata = upsample2d(data, scale, mode="nearest") 60 | 61 | hostData = data.get() 62 | hostOutData = np.empty(outdata.shape, dtype=np.float32) 63 | 64 | for b in range(batchsize): 65 | for c in range(maps): 66 | for y in 
range(inh): 67 | for x in range(inw): 68 | hostOutData[b, c, y * scale:(y + 1) * scale, x * scale:(x + 1) * scale] = hostData[b, c, y, x] 69 | 70 | assert np.allclose(hostOutData, outdata.get()) 71 | 72 | 73 | if __name__ == "__main__": 74 | unittest() 75 | -------------------------------------------------------------------------------- /TestLib/CnnCifar10Simple.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | 5 | from PuzzleLib.Containers import Sequential 6 | from PuzzleLib.Modules import Conv2D, MaxPool2D, Activation, relu, Flatten, Linear 7 | 8 | from PuzzleLib.Datasets import Cifar10Loader 9 | from PuzzleLib.Visual import showImageBasedFilters, showFilters 10 | from PuzzleLib.Handlers import Trainer, Validator 11 | from PuzzleLib.Optimizers import MomentumSGD 12 | from PuzzleLib.Cost import CrossEntropy 13 | 14 | 15 | def buildNet(): 16 | seq = Sequential() 17 | 18 | seq.append(Conv2D(3, 32, 5, pad=2, wscale=0.0001, initscheme="gaussian")) 19 | seq.append(MaxPool2D(3, 2)) 20 | seq.append(Activation(relu)) 21 | 22 | seq.append(Conv2D(32, 32, 5, pad=2, wscale=0.01, initscheme="gaussian")) 23 | seq.append(MaxPool2D(3, 2)) 24 | seq.append(Activation(relu)) 25 | 26 | seq.append(Conv2D(32, 64, 5, pad=2, wscale=0.01, initscheme="gaussian")) 27 | seq.append(MaxPool2D(3, 2)) 28 | seq.append(Activation(relu)) 29 | 30 | seq.append(Flatten()) 31 | seq.append(Linear(seq.dataShapeFrom((1, 3, 32, 32))[1], 64, wscale=0.1, initscheme="gaussian")) 32 | seq.append(Activation(relu)) 33 | 34 | seq.append(Linear(64, 10, wscale=0.1, initscheme="gaussian")) 35 | return seq 36 | 37 | 38 | def main(): 39 | cifar10 = Cifar10Loader() 40 | data, labels = cifar10.load(path="../TestData/") 41 | data, labels = data[:], labels[:] 42 | print("Loaded cifar10") 43 | 44 | np.random.seed(1234) 45 | net = buildNet() 46 | 47 | optimizer = MomentumSGD() 48 | optimizer.setupOn(net, useGlobalState=True) 49 | optimizer.learnRate = 0.01 50 | optimizer.momRate = 0.9 51 | 52 | cost = CrossEntropy(maxlabels=10) 53 | trainer = Trainer(net, cost, optimizer) 54 | 55 | validator = Validator(net, cost) 56 | currerror = math.inf 57 | 58 | for i in range(25): 59 | trainer.trainFromHost( 60 | data[:50000], labels[:50000], macroBatchSize=50000, 61 | onMacroBatchFinish=lambda train: print("Train error: %s" % train.cost.getMeanError()) 62 | ) 63 | valerror = validator.validateFromHost(data[50000:], labels[50000:], macroBatchSize=10000) 64 | print("Accuracy: %s" % (1.0 - valerror)) 65 | 66 | if valerror >= currerror: 67 | optimizer.learnRate *= 0.5 68 | print("Lowered learn rate: %s" % optimizer.learnRate) 69 | 70 | currerror = valerror 71 | 72 | showImageBasedFilters(net[0].W.get(), "../TestData/conv1.png") 73 | showFilters(net[3].W.get(), "../TestData/conv2.png") 74 | showFilters(net[6].W.get(), "../TestData/conv3.png") 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /Compiler/Codegen/Map/Generate.py: -------------------------------------------------------------------------------- 1 | import os, random 2 | from string import Template 3 | 4 | from PuzzleLib.Compiler.Toolchain import createTemplateNames, writeTemplates, buildTemplateTest 5 | 6 | 7 | def generateMap(name, K, V, hasher, compareKeys, borrowKey, borrowValue, 8 | destructKey="(void)", destructValue="(void)", minLog2Capacity=4, 9 | headerPreambule=None, bodyPreambule=None, malloc="malloc", free="free", 
filename=None): 10 | headerPreambule = "%s\n\n" % headerPreambule if headerPreambule is not None else "" 11 | bodyPreambule = "%s\n\n" % bodyPreambule if bodyPreambule is not None else "" 12 | 13 | filename = name if filename is None else filename 14 | headername, bodyname = createTemplateNames(filename) 15 | 16 | dirname = os.path.dirname(__file__) 17 | headerTmpl, bodyTmpl = os.path.join(dirname, "TMap.h"), os.path.join(dirname, "TMap.c") 18 | 19 | with open(headerTmpl, mode="r", encoding="utf-8") as f: 20 | header = Template(f.read()).substitute(HEADER_PREAMBULE=headerPreambule, NAME=name, K=K, V=V) 21 | 22 | with open(bodyTmpl, mode="r", encoding="utf-8") as f: 23 | body = Template(f.read()).substitute( 24 | HEADER_NAME=os.path.basename(headername), BODY_PREAMBULE=bodyPreambule, NAME=name, K=K, V=V, 25 | MIN_LOG2_CAPACITY=minLog2Capacity, MALLOC=malloc, FREE=free, 26 | HASHER=hasher, COMPARE_KEYS=compareKeys, BORROW_KEY=borrowKey, BORROW_VALUE=borrowValue, 27 | DESTRUCT_KEY=destructKey, DESTRUCT_VALUE=destructValue 28 | ) 29 | 30 | writeTemplates([ 31 | (header, headername), 32 | (body, bodyname) 33 | ]) 34 | 35 | return bodyname 36 | 37 | 38 | def unittest(): 39 | IntMap = buildTemplateTest( 40 | name="IntMap", bindingName="TMapTest.c", path="../../TestData", generator=generateMap, K="int", V="int", 41 | hasher="hashKey", compareKeys="compareKeys", borrowKey="(int)", borrowValue="(int)", 42 | bodyPreambule=""" 43 | inline static size_t hashKey(int key) { return key; } 44 | inline static bool compareKeys(int key1, int key2) { return key1 == key2; } 45 | """) 46 | 47 | size = 1 << 16 48 | 49 | keys, values = list(range(size)), list(range(size)) 50 | random.shuffle(keys) 51 | random.shuffle(values) 52 | 53 | pymap = {k: v for k, v in zip(keys, values)} 54 | 55 | intmap = IntMap.IntMap() 56 | 57 | for k, v in pymap.items(): 58 | intmap[k] = v 59 | 60 | assert len(intmap) == size 61 | 62 | for k in pymap.keys(): 63 | assert intmap[k] == pymap[k] 64 | 65 | for k in pymap.keys(): 66 | del intmap[k] 67 | 68 | assert len(intmap) == 0 69 | 70 | 71 | if __name__ == "__main__": 72 | unittest() 73 | -------------------------------------------------------------------------------- /Modules/MaxPool2D.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.Kernels import Pool 5 | from PuzzleLib.Backend.Dnn import PoolMode, poolNd, poolNdBackward 6 | 7 | from PuzzleLib.Modules.Pool2D import Pool2D 8 | 9 | 10 | class MaxPool2D(Pool2D): 11 | def __init__(self, size=2, stride=2, pad=0, useMask=False, name=None): 12 | super().__init__(size, stride, pad, name) 13 | self.registerBlueprint(locals()) 14 | 15 | self.useMask = useMask 16 | self.mask = None 17 | 18 | self.mode = PoolMode.max 19 | 20 | 21 | @property 22 | def withMask(self): 23 | return self.useMask 24 | 25 | 26 | @withMask.setter 27 | def withMask(self, val): 28 | self.useMask = val 29 | self.gradUsesOutData = False if val else True 30 | 31 | 32 | def updateData(self, data): 33 | if self.useMask: 34 | self.data, self.mask = Pool.maxpool2d(data, size=self.size, stride=self.stride, pad=self.pad) 35 | else: 36 | test = not self.train 37 | self.data, self.workspace = poolNd(data, size=self.size, stride=self.stride, pad=self.pad, 38 | mode=self.mode, test=test) 39 | 40 | 41 | def updateGrad(self, grad): 42 | if self.useMask: 43 | self.grad = Pool.maxpool2dBackward(grad, self.inData.shape, self.mask, 44 | size=self.size, 
stride=self.stride, pad=self.pad) 45 | else: 46 | self.grad = poolNdBackward(self.inData, self.data, grad, self.workspace, 47 | size=self.size, stride=self.stride, pad=self.pad, mode=self.mode) 48 | 49 | 50 | def reset(self): 51 | super().reset() 52 | self.mask = None 53 | 54 | 55 | def unittest(): 56 | batchsize, maps, h, w = 1, 1, 6, 6 57 | data = gpuarray.to_gpu(np.random.randn(batchsize, maps, h, w).astype(np.float32)) 58 | 59 | maxpool2d = MaxPool2D() 60 | maxpool2d(data) 61 | 62 | grad = gpuarray.to_gpu(np.random.randn(*maxpool2d.data.shape).astype(np.float32)) 63 | maxpool2d.backward(grad) 64 | 65 | def maxDownSample2d(dat, factor): 66 | trimrows = dat.shape[0] // factor * factor 67 | trimcols = dat.shape[1] // factor * factor 68 | 69 | maxSoFar = None 70 | first = True 71 | 72 | for coff in range(factor): 73 | for roff in range(factor): 74 | hopped = dat[roff:trimrows:factor, coff:trimcols:factor] 75 | if first: 76 | maxSoFar = hopped 77 | first = False 78 | else: 79 | maxSoFar = np.maximum(maxSoFar, hopped) 80 | 81 | return maxSoFar 82 | 83 | hostOutData = maxDownSample2d(data.get()[0, 0], 2) 84 | assert np.allclose(hostOutData, maxpool2d.data.get()[0, 0]) 85 | 86 | 87 | if __name__ == "__main__": 88 | unittest() 89 | -------------------------------------------------------------------------------- /Modules/Pool3D.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib.Backend import gpuarray 2 | from PuzzleLib.Modules.Module import ModuleError, Module 3 | 4 | 5 | class Pool3D(Module): 6 | def __init__(self, size=2, stride=2, pad=0, name=None): 7 | super().__init__(name) 8 | self.gradUsesOutData = True 9 | 10 | self.size = self.repeat(size, 3) 11 | self.stride = self.repeat(stride, 3) 12 | self.pad = self.repeat(pad, 3) 13 | 14 | self.workspace = None 15 | 16 | 17 | def dataShapeFrom(self, shape): 18 | batchsize, maps, ind, inh, inw = shape 19 | 20 | dsize, hsize, wsize = self.size 21 | dpad, hpad, wpad = self.pad 22 | dstride, hstride, wstride = self.stride 23 | 24 | outd = (ind + 2 * dpad - dsize) // dstride + 1 25 | outh = (inh + 2 * hpad - hsize) // hstride + 1 26 | outw = (inw + 2 * wpad - wsize) // wstride + 1 27 | 28 | return batchsize, maps, outd, outh, outw 29 | 30 | 31 | def checkDataShape(self, shape): 32 | if len(shape) != 5: 33 | raise ModuleError("Data must be 5d tensor") 34 | 35 | _, _, ind, inh, inw = shape 36 | if ind + 2 * self.pad[0] < self.size[0]: 37 | raise ModuleError("Data cube time is too small (got %d, expected at least %d)" % 38 | (ind + 2 * self.pad[0], self.size[0])) 39 | 40 | if inh + 2 * self.pad[1] < self.size[1]: 41 | raise ModuleError("Data cube height is too small (got %d, expected at least %d)" % 42 | (inh + 2 * self.pad[1], self.size[1])) 43 | 44 | if inw + 2 * self.pad[2] < self.size[2]: 45 | raise ModuleError("Data cube width is too small (got %d, expected at least %d)" % 46 | (inw + 2 * self.pad[2], self.size[2])) 47 | 48 | 49 | def gradShapeFrom(self, shape): 50 | batchsize, maps, outd, outh, outw = shape 51 | 52 | dsize, hsize, wsize = self.size 53 | dpad, hpad, wpad = self.pad 54 | dstride, hstride, wstride = self.stride 55 | 56 | ind = (outd - 1) * dstride - 2 * dpad + dsize 57 | inh = (outh - 1) * hstride - 2 * hpad + hsize 58 | inw = (outw - 1) * wstride - 2 * wpad + wsize 59 | 60 | return batchsize, maps, ind, inh, inw 61 | 62 | 63 | def checkGradShape(self, shape): 64 | if len(shape) != 5: 65 | raise ModuleError("Grad must be 5d tensor") 66 | 67 | 68 | def updateData(self, data): 69 | 
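# Left abstract here: MaxPool3D and AvgPool3D supply updateData/updateGrad
# through poolNd/poolNdBackward. For reference, dataShapeFrom above applies
# out = (in + 2 * pad - size) // stride + 1 per spatial dimension, and
# gradShapeFrom inverts it as in = (out - 1) * stride - 2 * pad + size.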
raise NotImplementedError() 70 | 71 | 72 | def updateGrad(self, grad): 73 | raise NotImplementedError() 74 | 75 | 76 | def reset(self): 77 | super().reset() 78 | self.workspace = None 79 | 80 | 81 | def calcMode(self, T): 82 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 83 | 84 | if T not in dtypes: 85 | raise ModuleError("Unsupported dtype %s" % T) 86 | 87 | self.calctype = T 88 | -------------------------------------------------------------------------------- /Compiler/Compilers/GCC.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from PuzzleLib.Compiler.Compilers.Compiler import Compiler 4 | 5 | 6 | class GCCLike(Compiler): 7 | cflags = ["-Wall", "-Wextra"] 8 | ldflags = ["--shared"] 9 | 10 | 11 | def __init__(self, verbose, forPython=True): 12 | super().__init__(verbose, forPython=forPython) 13 | 14 | if sys.platform == "linux": 15 | self.cflags = self.cflags + ["-fPIC"] 16 | 17 | 18 | def objectLine(self, extfile, sourcefiles): 19 | return self.fullCFlags(asObject=True) + self.outFlags(extfile) + sourcefiles 20 | 21 | 22 | def linkLine(self, extfile, objfiles): 23 | return self.fullLDFlags() + self.outFlags(extfile) + objfiles + self.linkFlags() 24 | 25 | 26 | def buildLine(self, extfile, sourcefiles): 27 | return self.fullCFlags(asObject=False) + self.fullLDFlags() + self.outFlags(extfile) + \ 28 | sourcefiles + self.linkFlags() 29 | 30 | 31 | def depLine(self, sourcefiles): 32 | return ["-M"] + self.fullCFlags(asObject=False, debug=False, optimize=False) + sourcefiles 33 | 34 | 35 | def fullCFlags(self, asObject, debug=True, optimize=True): 36 | oflags = self.fullCppFlags() 37 | 38 | if debug and self.debuglevel > 0: 39 | oflags.append("-g3" if self.debuglevel >= 3 else "-g") 40 | 41 | if optimize and self.optlevel > 0: 42 | oflags.append("-O3" if self.optlevel >= 3 else "-O%s" % self.optlevel) 43 | 44 | if self.optlevel >= 3: 45 | oflags.extend(["-march=native", "-mtune=native", "-ffast-math"]) 46 | 47 | if debug and self.debuglevel >= 3: 48 | oflags.append("-fno-omit-frame-pointer") 49 | 50 | oflags.extend("-D%s" % define for define in self.defines) 51 | return self.cflags + oflags + ["-I%s" % idir for idir in self.includeDirs] + (["-c"] if asObject else []) 52 | 53 | 54 | def fullCppFlags(self): 55 | return ["-std=c++14" if self.cpp else "-std=gnu99"] 56 | 57 | 58 | def fullLDFlags(self): 59 | return self.ldflags + ["-L%s" % ldir for ldir in self.libraryDirs] 60 | 61 | 62 | def outFlags(self, extfile): 63 | outFlags = ["-o", extfile] 64 | 65 | if self.optlevel >= 4: 66 | outFlags.append("-flto") 67 | 68 | return outFlags 69 | 70 | 71 | def linkFlags(self): 72 | return ["-l%s" % lib for lib in self.libraries] 73 | 74 | 75 | class GCC(GCCLike): 76 | cc = "gcc" 77 | 78 | 79 | class Clang(GCCLike): 80 | cc = "clang" 81 | 82 | 83 | def fullCppFlags(self): 84 | return ["-std=c++14" if self.cpp else "-std=c99"] 85 | 86 | 87 | def outFlags(self, extfile): 88 | outflags = super().outFlags(extfile) 89 | 90 | if sys.platform == "win32": 91 | outflags.append("-fuse-ld=lld") 92 | 93 | return outflags 94 | -------------------------------------------------------------------------------- /Modules/SoftMax.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.Dnn import softmaxNd, softmaxNdBackward 5 | 6 | from PuzzleLib.Modules.Module import ModuleError, Module 7 | 8 | 9 | class SoftMax(Module): 
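# updateData and updateGrad below pad the input shape with trailing singleton
# dimensions (ndim = max(0, 4 - len(shape))) before calling softmaxNd, then
# reshape the result back, letting tensors of rank below 4 pass through a
# backend primitive that apparently operates on 4d tensors.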
10 | def __init__(self, name=None): 11 | super().__init__(name) 12 | self.gradUsesOutData = True 13 | 14 | 15 | def updateData(self, data): 16 | shape = data.shape 17 | ndim = max(0, 4 - len(shape)) 18 | 19 | data = data.reshape(shape + tuple(1 for _ in range(ndim))) 20 | self.data = softmaxNd(data).reshape(shape) 21 | 22 | 23 | def updateGrad(self, grad): 24 | shape = grad.shape 25 | ndim = max(0, 4 - len(shape)) 26 | 27 | grad = grad.reshape(shape + tuple(1 for _ in range(ndim))) 28 | data = self.data.reshape(shape + tuple(1 for _ in range(ndim))) 29 | 30 | self.grad = softmaxNdBackward(data, grad).reshape(shape) 31 | 32 | 33 | def dataShapeFrom(self, shape): 34 | return shape 35 | 36 | 37 | def gradShapeFrom(self, shape): 38 | return shape 39 | 40 | 41 | def calcMode(self, T): 42 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 43 | 44 | if T not in dtypes: 45 | raise ModuleError("Unsupported dtype %s" % T) 46 | 47 | self.calctype = T 48 | 49 | 50 | def unittest(): 51 | batchsize, maps = 2, 3 52 | 53 | hostData = np.random.randn(batchsize, maps, 1).astype(np.float32) 54 | data = gpuarray.to_gpu(hostData) 55 | 56 | softmax = SoftMax() 57 | softmax(data) 58 | 59 | def softMaxForward(w): 60 | e = np.exp(w - np.amax(w)) 61 | p = e / np.sum(e) 62 | return p 63 | 64 | hostData = hostData.reshape(batchsize, maps).astype(np.float32) 65 | 66 | hostOutData = np.vstack([softMaxForward(hostData[i]) for i in range(batchsize)]) 67 | assert np.allclose(hostOutData, softmax.data.get().reshape(batchsize, maps).astype(np.float32)) 68 | 69 | hostGrad = np.random.randn(batchsize, maps, 1, 1).astype(np.float32) 70 | grad = gpuarray.to_gpu(hostGrad) 71 | 72 | softmax.backward(grad) 73 | hostGrad = hostGrad.reshape(batchsize, maps).astype(np.float32) 74 | 75 | def softMaxBackward(outdata, gr): 76 | ingrad = np.zeros(outdata.shape, dtype=np.float32) 77 | for i in range(ingrad.shape[0]): 78 | ingrad[i] += outdata[i] * gr[i] 79 | 80 | for j in range(outdata.shape[0]): 81 | ingrad[i] -= outdata[i] * outdata[j] * gr[j] 82 | return ingrad 83 | 84 | hostInGrad = np.vstack([softMaxBackward(hostOutData[i], hostGrad[i]) for i in range(batchsize)]) 85 | assert np.allclose(hostInGrad, softmax.grad.get().reshape(batchsize, maps).astype(np.float32)) 86 | 87 | 88 | if __name__ == "__main__": 89 | unittest() 90 | -------------------------------------------------------------------------------- /Backend/Kernels/Costs.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | 3 | 4 | bceKer = None 5 | hingeKer = None 6 | smoothL1Ker = None 7 | l1HingeKer = None 8 | 9 | getAccuracyKernel = None 10 | crossEntropyKernel = None 11 | svmKernel = None 12 | 13 | ctcLoss = None 14 | ctcLossTest = None 15 | 16 | 17 | def autoinit(): 18 | if not Config.shouldInit(): 19 | return 20 | 21 | if Config.backend == Config.Backend.cuda: 22 | initCuda() 23 | elif Config.backend == Config.Backend.hip: 24 | initHip() 25 | elif Config.backend == Config.Backend.cpu: 26 | initCPU() 27 | elif Config.backend == Config.Backend.intel: 28 | initIntel() 29 | else: 30 | raise Config.ConfigError(Config.backend) 31 | 32 | 33 | def initCuda(): 34 | from PuzzleLib.Cuda import Backend 35 | from PuzzleLib.Cuda.Kernels import CTC 36 | 37 | initGPU(Backend, CTC) 38 | 39 | 40 | def initHip(): 41 | from PuzzleLib.Hip import Backend 42 | from PuzzleLib.Cuda.Kernels import CTC 43 | 44 | initGPU(Backend, CTC) 45 | 46 | 47 | def initGPU(Backend, CTC): 48 | backend = 
Backend.getBackend(Config.deviceIdx, initmode=2, logger=Config.getLogger()) 49 | memoryPool, costmod, ctcmod = backend.memoryPool, backend.costmod, backend.ctcmod 50 | 51 | global bceKer, hingeKer, smoothL1Ker, l1HingeKer, getAccuracyKernel 52 | bceKer = backend.bceKer 53 | hingeKer = backend.hingeKer 54 | smoothL1Ker = backend.smoothL1Ker 55 | l1HingeKer = backend.l1HingeKer 56 | getAccuracyKernel = backend.getAccuracyKernel 57 | 58 | def wrapCrossEntropy(scores, labels, weights, error): 59 | return costmod.crossEntropy(scores, labels, weights, error, memoryPool) 60 | 61 | def wrapSVM(scores, labels, mode, error): 62 | return costmod.svm(scores, labels, mode, error, memoryPool) 63 | 64 | global crossEntropyKernel, svmKernel 65 | crossEntropyKernel = wrapCrossEntropy 66 | svmKernel = wrapSVM 67 | 68 | def wrapCTC(data, datalen, labels, lengths, blank, error, normalized): 69 | return ctcmod.ctcLoss(data, datalen, labels, lengths, blank, error, normalized, allocator=memoryPool) 70 | 71 | global ctcLoss, ctcLossTest 72 | ctcLoss = wrapCTC 73 | ctcLossTest = CTC.hostCTCLoss 74 | 75 | 76 | def initCPU(): 77 | pass 78 | 79 | 80 | def initIntel(): 81 | from PuzzleLib.Intel.Kernels import Costs 82 | 83 | global bceKer, hingeKer, smoothL1Ker, l1HingeKer, getAccuracyKernel, crossEntropyKernel, svmKernel 84 | bceKer = Costs.bceKer 85 | hingeKer = Costs.hingeKer 86 | smoothL1Ker = Costs.smoothL1Ker 87 | l1HingeKer = Costs.l1HingeKer 88 | getAccuracyKernel = Costs.getAccuracyKernel 89 | crossEntropyKernel = Costs.crossEntropy 90 | svmKernel = Costs.svm 91 | 92 | 93 | autoinit() 94 | -------------------------------------------------------------------------------- /Modules/SwapAxes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray, Memory 4 | from PuzzleLib.Modules.Module import ModuleError, Module 5 | 6 | 7 | class SwapAxes(Module): 8 | def __init__(self, axis1, axis2, name=None): 9 | super().__init__(name) 10 | self.registerBlueprint(locals()) 11 | 12 | self.axis1, self.axis2 = (axis2, axis1) if axis1 > axis2 else (axis1, axis2) 13 | 14 | 15 | def updateData(self, data): 16 | self.data = Memory.swapaxes(data, self.axis1, self.axis2) 17 | 18 | 19 | def updateGrad(self, grad): 20 | self.grad = Memory.swapaxes(grad, self.axis1, self.axis2) 21 | 22 | 23 | def checkDataShape(self, shape): 24 | if len(shape) - 1 < self.axis2: 25 | raise ModuleError("Data dimension needs to be at least %d (data has %d)" % (self.axis2 + 1, len(shape))) 26 | 27 | 28 | def checkGradShape(self, shape): 29 | if len(shape) - 1 < self.axis2: 30 | raise ModuleError("Grad dimension needs to be at least %d (grad has %d)" % (self.axis2 + 1, len(shape))) 31 | 32 | 33 | def dataShapeFrom(self, shape): 34 | return shape[:self.axis1] + (shape[self.axis2], ) + shape[self.axis1 + 1:self.axis2] + \ 35 | (shape[self.axis1], ) + shape[self.axis2 + 1:] 36 | 37 | 38 | def gradShapeFrom(self, shape): 39 | return shape[:self.axis1] + (shape[self.axis2], ) + shape[self.axis1 + 1:self.axis2] + \ 40 | (shape[self.axis1], ) + shape[self.axis2 + 1:] 41 | 42 | 43 | def calcMode(self, T): 44 | dtypes = {dtype for dtype, _ in gpuarray.dtypesSupported()} 45 | 46 | if T not in dtypes: 47 | raise ModuleError("Unsupported dtype %s" % T) 48 | 49 | self.calctype = T 50 | 51 | 52 | def unittest(): 53 | for dtype, _ in gpuarray.dtypesSupported(): 54 | swapAxesTest(dtype) 55 | 56 | 57 | def swapAxesTest(dtype): 58 | shape = (10, 3, 5, 4, 2) 59 | 60 | for axis1 
in range(len(shape)): 61 | for axis2 in range(axis1 + 1, len(shape)): 62 | hostData = np.random.randn(*shape).astype(dtype) 63 | data = gpuarray.to_gpu(hostData) 64 | 65 | swapaxes = SwapAxes(axis1, axis2) 66 | swapaxes.calcMode(dtype) 67 | 68 | swapaxes(data) 69 | 70 | hostOutData = np.swapaxes(hostData, axis1=axis1, axis2=axis2) 71 | assert np.allclose(hostOutData, swapaxes.data.get()) 72 | 73 | hostGrad = np.random.randn(*swapaxes.data.shape).astype(dtype) 74 | grad = gpuarray.to_gpu(hostGrad) 75 | 76 | swapaxes.backward(grad) 77 | 78 | hostInGrad = np.swapaxes(hostGrad, axis1=axis2, axis2=axis1) 79 | 80 | assert swapaxes.grad.shape == data.shape 81 | assert np.allclose(hostInGrad, swapaxes.grad.get()) 82 | 83 | 84 | if __name__ == "__main__": 85 | unittest() 86 | -------------------------------------------------------------------------------- /Optimizers/SMORMS3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib import Config 4 | 5 | from PuzzleLib.Backend import gpuarray 6 | from PuzzleLib.Backend.Kernels.ElementWise import smorms3Ker 7 | 8 | from PuzzleLib.Optimizers.Optimizer import Optimizer, trainSimpleTest, trainHardTest 9 | 10 | 11 | class SMORMS3(Optimizer): 12 | def __init__(self, learnRate=1e-3, epsilon=1e-16, nodeinfo=None): 13 | super().__init__(nodeinfo) 14 | 15 | self.epsilon = None 16 | 17 | self.setAttr("learnRate", learnRate) 18 | self.setAttr("epsilon", epsilon) 19 | 20 | 21 | def setupState(self, var): 22 | return { 23 | "mem": gpuarray.to_gpu(np.ones(var.data.shape, dtype=np.float32)), 24 | "mg": gpuarray.zeros(var.data.shape, dtype=np.float32), 25 | "ms": gpuarray.zeros(var.data.shape, dtype=np.float32) 26 | } 27 | 28 | 29 | def updateVar(self, var, state, stream=None): 30 | smorms3Ker(var.data.dtype)( 31 | var.data, var.grad, state["mem"], state["mg"], state["ms"], self.learnRate * var.learnRate, self.epsilon, 32 | stream=stream 33 | ) 34 | 35 | 36 | def unittest(): 37 | for dtype, atol in gpuarray.dtypesSupported(): 38 | calcTest(dtype, atol) 39 | trainSimpleTest(SMORMS3, dtype, learnRate=1e-2) 40 | 41 | if Config.backend == Config.Backend.cuda: 42 | trainHardTest(SMORMS3, dtype, learnRate=1e-2) 43 | 44 | 45 | def calcTest(dtype, atol): 46 | lr, epsilon = 1e-3, 1e-16 47 | shape = (11, 13) 48 | 49 | hostW, hostDw = np.random.randn(*shape).astype(dtype), np.random.randn(*shape).astype(dtype) 50 | hostMem = (1.0 + np.random.randn(*shape)**2).astype(np.float32) 51 | hostMg, hostMs = np.random.randn(*shape).astype(np.float32), np.random.randn(*shape).astype(np.float32)**2 52 | 53 | w, dw = gpuarray.to_gpu(hostW), gpuarray.to_gpu(hostDw) 54 | mem, mg, ms = gpuarray.to_gpu(hostMem), gpuarray.to_gpu(hostMg), gpuarray.to_gpu(hostMs) 55 | 56 | smorms3Ker(w.dtype)(w, dw, mem, mg, ms, lr, epsilon) 57 | 58 | hostW, hostDw = hostW.astype(np.float32), hostDw.astype(np.float32) 59 | 60 | r = 1.0 / (1.0 + hostMem) 61 | hostMg = (1.0 - r) * hostMg + r * hostDw 62 | hostMs = (1.0 - r) * hostMs + r * hostDw**2 63 | x = hostMg**2 / (hostMs + epsilon) 64 | 65 | hostMem = 1.0 + hostMem * (1.0 - x) 66 | hostW += hostDw * np.minimum(lr, x) / (np.sqrt(hostMs) + epsilon) 67 | 68 | hostW, hostDw = hostW.astype(dtype), hostDw.astype(dtype) 69 | 70 | assert np.allclose(hostMem, mem.get(), atol=atol) 71 | assert np.allclose(hostMg, mg.get(), atol=atol) 72 | assert np.allclose(hostMs, ms.get(), atol=atol) 73 | assert np.allclose(hostW, w.get(), atol=atol) 74 | 75 | 76 | if __name__ == "__main__": 77 | unittest() 78 
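# For reference, a minimal NumPy restatement of the SMORMS3 step that calcTest
# above verifies numerically; smorms3Step is an illustrative name rather than
# part of the library API, and the final update follows the ascent convention
# used throughout these tests.
import numpy as np

def smorms3Step(w, dw, mem, mg, ms, lr=1e-3, epsilon=1e-16):
	r = 1.0 / (1.0 + mem)                       # adaptive mixing rate
	mg = (1.0 - r) * mg + r * dw                # running mean of gradients
	ms = (1.0 - r) * ms + r * dw**2             # running mean of squared gradients
	x = mg**2 / (ms + epsilon)                  # signal-to-noise estimate
	mem = 1.0 + mem * (1.0 - x)                 # lengthen memory where updates are noisy
	w = w + dw * np.minimum(lr, x) / (np.sqrt(ms) + epsilon)
	return w, mem, mg, ms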
| -------------------------------------------------------------------------------- /Modules/AvgPool3D.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.Dnn import PoolMode, poolNd, poolNdBackward 5 | 6 | from PuzzleLib.Modules.Pool3D import Pool3D 7 | 8 | 9 | class AvgPool3D(Pool3D): 10 | def __init__(self, size=2, stride=2, pad=0, includePad=True, name=None): 11 | super().__init__(size, stride, pad, name) 12 | self.registerBlueprint(locals()) 13 | 14 | self.mode = PoolMode.avgWithPad if includePad else PoolMode.avgNoPad 15 | 16 | 17 | def updateData(self, data): 18 | self.data, self.workspace = poolNd(data, size=self.size, stride=self.stride, pad=self.pad, mode=self.mode, 19 | test=not self.train) 20 | 21 | 22 | def updateGrad(self, grad): 23 | self.grad = poolNdBackward(self.inData, self.data, grad, self.workspace, size=self.size, stride=self.stride, 24 | pad=self.pad, mode=self.mode) 25 | 26 | 27 | def unittest(): 28 | batchsize, maps, d, h, w = 2, 6, 5, 7, 5 29 | data = gpuarray.to_gpu(np.random.randn(batchsize, maps, d, h, w).astype(np.float32)) 30 | 31 | size = 3 32 | stride, pad = 2, 1 33 | 34 | avgpool3d = AvgPool3D(size=size, stride=stride, pad=pad, includePad=True) 35 | avgpool3d(data) 36 | 37 | hostData = np.zeros(shape=(batchsize, maps, d + 2 * pad, h + 2 * pad, w + 2 * pad), dtype=np.float32) 38 | hostData[:, :, pad:-pad, pad:-pad, pad:-pad] = data.get() 39 | hostOutData = np.empty(avgpool3d.data.shape, dtype=np.float32) 40 | 41 | for b in range(batchsize): 42 | for c in range(maps): 43 | for z in range(hostOutData.shape[2]): 44 | for y in range(hostOutData.shape[3]): 45 | for x in range(hostOutData.shape[4]): 46 | hostOutData[b, c, z, y, x] = np.mean(hostData[b, c, z * stride:z * stride + size, 47 | y * stride:y * stride + size,x * stride:x * stride + size]) 48 | 49 | assert np.allclose(hostOutData, avgpool3d.data.get()) 50 | 51 | grad = gpuarray.to_gpu(np.random.randn(*avgpool3d.data.shape).astype(np.float32)) 52 | avgpool3d.backward(grad) 53 | 54 | hostGrad = grad.get() 55 | hostInGrad = np.zeros(hostData.shape, dtype=np.float32) 56 | 57 | for b in range(batchsize): 58 | for c in range(maps): 59 | for z in range(hostOutData.shape[2]): 60 | for y in range(hostOutData.shape[3]): 61 | for x in range(hostOutData.shape[4]): 62 | for dz in range(size): 63 | for dy in range(size): 64 | for dx in range(size): 65 | hostInGrad[b,c,z*stride+dz,y*stride+dy,x*stride+dx] += hostGrad[b,c,z,y,x]/size**3 66 | 67 | assert np.allclose(hostInGrad[:, :, pad:-pad, pad:-pad, pad:-pad], avgpool3d.grad.get()) 68 | 69 | 70 | if __name__ == "__main__": 71 | unittest() 72 | -------------------------------------------------------------------------------- /Hip/Wrappers/MIOpenNorm.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | 4 | from PuzzleLib.Cuda.Wrappers.CuDnnNorm import batchNorm2dTest, batchNorm3dTest, instanceNorm2dTest 5 | 6 | 7 | def unittest(): 8 | from PuzzleLib.Hip import Backend 9 | backendTest(Backend) 10 | 11 | 12 | def backendTest(Backend): 13 | for deviceIdx in range(Backend.getDeviceCount()): 14 | bnd = Backend.getBackend(deviceIdx, initmode=2) 15 | 16 | float32 = bnd.dtypesSupported()[0] 17 | 18 | batchNorm2dTest(bnd, *float32, np.float32) 19 | batchNorm3dTest(bnd, *float32, np.float32) 20 | instanceNorm2dTest(bnd, *float32, np.float32) 21 | 22 | for dtype, atol in 
bnd.dtypesSupported(): 23 | mapLRN2dTest(bnd, dtype, atol) 24 | 25 | 26 | def mapLRN2dTest(bnd, dtype, atol): 27 | batchsize, maps, h, w = 2, 2, 9, 10 28 | N, alpha, beta, K = 5, 1.0, 0.5, 2.0 29 | 30 | lookBehind = int((N - 1) / 2) 31 | lookAhead = N - lookBehind 32 | 33 | hostData = np.random.randn(batchsize, maps, h, w).astype(dtype) 34 | 35 | data = bnd.GPUArray.toGpu(hostData) 36 | outdata, workspace = bnd.dnn.lrn(data, N=N, alpha=alpha, beta=beta, K=K, mode=bnd.LRNMode.map.value) 37 | 38 | norms = np.empty(hostData.shape, dtype=np.float32) 39 | 40 | for b, c, y, x in itertools.product(range(batchsize), range(maps), range(h), range(w)): 41 | slcy = slice(max(0, y - lookBehind), min(h, y + lookAhead)) 42 | slcx = slice(max(0, x - lookBehind), min(w, x + lookAhead)) 43 | 44 | slc = hostData[b, c, slcy, slcx].ravel() 45 | norms[b, c, y, x] = K + np.dot(slc, slc) * alpha / N**2 46 | 47 | hostOutData = (hostData / norms**beta).astype(dtype) 48 | assert np.allclose(hostOutData, outdata.get(), atol=atol) 49 | 50 | hostGrad = np.random.randn(*outdata.shape).astype(dtype) 51 | 52 | grad = bnd.GPUArray.toGpu(hostGrad) 53 | ingrad = bnd.dnn.lrnBackward( 54 | grad, data, outdata, workspace, N=N, alpha=alpha, beta=beta, K=K, mode=bnd.LRNMode.map.value 55 | ) 56 | 57 | hostInGrad = hostGrad / norms**beta 58 | k = 2.0 * alpha * beta / N**2 59 | 60 | for b, c, y, x in itertools.product(range(batchsize), range(maps), range(h), range(w)): 61 | slcy = slice(max(0, y - lookBehind), min(h, y + lookAhead)) 62 | slcx = slice(max(0, x - lookBehind), min(w, x + lookAhead)) 63 | 64 | slcdata, slcgrad = hostData[b, c, slcy, slcx].ravel(), hostGrad[b, c, slcy, slcx].ravel() 65 | slcnorms = norms[b, c, slcy, slcx].ravel() 66 | 67 | hostInGrad[b, c, y, x] -= k * hostData[b, c, y, x] * np.dot(slcgrad, slcdata / slcnorms**(beta + 1)) 68 | 69 | hostInGrad = hostInGrad.astype(dtype) 70 | assert np.allclose(hostInGrad, ingrad.get(), atol=atol) 71 | 72 | 73 | if __name__ == "__main__": 74 | unittest() 75 | -------------------------------------------------------------------------------- /Modules/AvgPool1D.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.Dnn import PoolMode, poolNd, poolNdBackward 5 | 6 | from PuzzleLib.Modules.Pool1D import Pool1D 7 | 8 | 9 | class AvgPool1D(Pool1D): 10 | def __init__(self, size=2, stride=2, pad=0, includePad=True, name=None): 11 | super().__init__(size, stride, pad, name) 12 | self.registerBlueprint(locals()) 13 | 14 | self.mode = PoolMode.avgWithPad if includePad else PoolMode.avgNoPad 15 | 16 | 17 | def updateData(self, data): 18 | data = data.reshape(*data.shape[:2], 1, *data.shape[2:]) 19 | self.data, self.workspace = poolNd(data, size=self.size, stride=self.stride, pad=self.pad, mode=self.mode, 20 | test=not self.train) 21 | self.data = self.data.reshape(*self.data.shape[:2], *self.data.shape[3:]) 22 | 23 | 24 | def updateGrad(self, grad): 25 | grad = grad.reshape(*grad.shape[:2], 1, *grad.shape[2:]) 26 | 27 | indata = self.inData.reshape(*self.inData.shape[:2], 1, *self.inData.shape[2:]) 28 | outdata = self.data.reshape(*self.data.shape[:2], 1, *self.data.shape[2:]) 29 | 30 | self.grad = poolNdBackward(indata, outdata, grad, self.workspace, size=self.size, stride=self.stride, 31 | pad=self.pad, mode=self.mode) 32 | self.grad = self.grad.reshape(*self.grad.shape[:2], *self.grad.shape[3:]) 33 | 34 | 35 | def unittest(): 36 | batchsize, maps, insize = 
2, 6, 5 37 | data = gpuarray.to_gpu(np.random.randn(batchsize, maps, insize).astype(np.float32)) 38 | 39 | size = 3 40 | stride, pad = 2, 1 41 | 42 | avgpool1d = AvgPool1D(size=size, stride=stride, pad=pad, includePad=True) 43 | avgpool1d(data) 44 | 45 | hostData = np.zeros(shape=(batchsize, maps, insize + 2 * pad), dtype=np.float32) 46 | hostData[:, :, pad:-pad] = data.get() 47 | hostOutData = np.empty(avgpool1d.data.shape, dtype=np.float32) 48 | 49 | for b in range(batchsize): 50 | for c in range(maps): 51 | for x in range(hostOutData.shape[2]): 52 | hostOutData[b, c, x] = np.mean(hostData[b, c, x * stride:x * stride + size]) 53 | 54 | assert np.allclose(hostOutData, avgpool1d.data.get()) 55 | 56 | grad = gpuarray.to_gpu(np.random.randn(*avgpool1d.data.shape).astype(np.float32)) 57 | avgpool1d.backward(grad) 58 | 59 | hostGrad = grad.get() 60 | hostInGrad = np.zeros(hostData.shape, dtype=np.float32) 61 | 62 | for b in range(batchsize): 63 | for c in range(maps): 64 | for x in range(hostOutData.shape[2]): 65 | for dx in range(size): 66 | hostInGrad[b, c, x * stride + dx] += hostGrad[b, c, x] / size 67 | 68 | assert np.allclose(hostInGrad[:, :, pad:-pad], avgpool1d.grad.get()) 69 | 70 | 71 | if __name__ == "__main__": 72 | unittest() 73 | -------------------------------------------------------------------------------- /Backend/Memory.py: -------------------------------------------------------------------------------- 1 | from PuzzleLib import Config 2 | 3 | 4 | depthConcat = None 5 | depthSplit = None 6 | 7 | moveaxis = None 8 | swapaxes = None 9 | transpose = None 10 | 11 | 12 | def autoinit(): 13 | if not Config.shouldInit(): 14 | return 15 | 16 | if Config.backend == Config.Backend.cuda: 17 | initCuda() 18 | elif Config.backend == Config.Backend.hip: 19 | initHip() 20 | elif Config.isCPUBased(Config.backend): 21 | initCPU() 22 | else: 23 | raise Config.ConfigError(Config.backend) 24 | 25 | 26 | def initCuda(): 27 | from PuzzleLib.Cuda.Backend import getBackend 28 | 29 | backend = getBackend(Config.deviceIdx, initmode=1, logger=Config.getLogger()) 30 | memoryPool, memmod = backend.memoryPool, backend.memmod 31 | 32 | initGPU(memoryPool, memmod) 33 | 34 | 35 | def initHip(): 36 | from PuzzleLib.Hip.Backend import getBackend 37 | 38 | backend = getBackend(Config.deviceIdx, initmode=2, logger=Config.getLogger()) 39 | memoryPool, memmod = backend.memoryPool, backend.memmod 40 | 41 | initGPU(memoryPool, memmod) 42 | 43 | 44 | def initGPU(memoryPool, module): 45 | def wrapDepthConcat(data): 46 | return module.depthConcat(data, allocator=memoryPool) 47 | 48 | def wrapDepthSplit(grad, indata): 49 | return module.depthSplit(grad, indata, allocator=memoryPool) 50 | 51 | global depthConcat, depthSplit 52 | depthConcat = wrapDepthConcat 53 | depthSplit = wrapDepthSplit 54 | 55 | def wrapMoveaxis(data, src, dst): 56 | return module.moveaxis(data, src, dst, allocator=memoryPool) 57 | 58 | def wrapSwapaxes(data, axis1, axis2): 59 | return module.swapaxes(data, axis1, axis2, allocator=memoryPool) 60 | 61 | def wrapTranspose(data, axes): 62 | return module.transpose(data, tuple(axes), allocator=memoryPool) 63 | 64 | global moveaxis, swapaxes, transpose 65 | moveaxis = wrapMoveaxis 66 | swapaxes = wrapSwapaxes 67 | transpose = wrapTranspose 68 | 69 | 70 | def initCPU(): 71 | import numpy as np 72 | from PuzzleLib.CPU.CPUArray import CPUArray 73 | 74 | def wrapMoveAxis(a, src, dst): 75 | out = np.copy(np.moveaxis(a.get(copy=False), src, dst), order="C") 76 | return CPUArray(out.shape, out.dtype, data=out, 
acquire=True) 77 | 78 | def wrapSwapAxes(a, axis1, axis2): 79 | out = np.copy(np.swapaxes(a.get(copy=False), axis1, axis2), order="C") 80 | return CPUArray(out.shape, out.dtype, data=out, acquire=True) 81 | 82 | def wrapTranspose(a, axes): 83 | out = np.copy(np.transpose(a.get(copy=False), axes), order="C") 84 | return CPUArray(out.shape, out.dtype, data=out, acquire=True) 85 | 86 | global moveaxis, swapaxes, transpose 87 | moveaxis = wrapMoveAxis 88 | swapaxes = wrapSwapAxes 89 | transpose = wrapTranspose 90 | 91 | 92 | autoinit() 93 | -------------------------------------------------------------------------------- /Modules/MaxPool3D.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.Dnn import PoolMode, poolNd, poolNdBackward 5 | 6 | from PuzzleLib.Modules.Pool3D import Pool3D 7 | 8 | 9 | class MaxPool3D(Pool3D): 10 | def __init__(self, size=2, stride=2, pad=0, name=None): 11 | super().__init__(size, stride, pad, name) 12 | self.registerBlueprint(locals()) 13 | 14 | self.mode = PoolMode.max 15 | 16 | 17 | def updateData(self, data): 18 | self.data, self.workspace = poolNd(data, size=self.size, stride=self.stride, pad=self.pad, mode=self.mode, 19 | test=not self.train) 20 | 21 | 22 | def updateGrad(self, grad): 23 | self.grad = poolNdBackward(self.inData, self.data, grad, self.workspace, 24 | size=self.size, stride=self.stride, pad=self.pad, mode=self.mode) 25 | 26 | 27 | def unittest(): 28 | batchsize, maps, t, h, w = 1, 1, 6, 6, 6 29 | size, stride, pad = 3, 2, 1 30 | data = gpuarray.to_gpu(np.random.randn(batchsize, maps, t, h, w).astype(np.float32)) 31 | 32 | maxpool3d = MaxPool3D(size=size, stride=stride, pad=pad) 33 | maxpool3d(data) 34 | 35 | hostData = np.full(shape=(batchsize, maps, t + 2 * pad, h + 2 * pad, w + 2 * pad), 36 | fill_value=np.finfo(np.float32).min, dtype=np.float32) 37 | hostData[:, :, pad:-pad, pad:-pad, pad:-pad] = data.get() 38 | hostOutData = np.empty(maxpool3d.data.shape) 39 | 40 | for b in range(batchsize): 41 | for c in range(maps): 42 | for z in range(hostOutData.shape[2]): 43 | for y in range(hostOutData.shape[3]): 44 | for x in range(hostOutData.shape[4]): 45 | hostOutData[b, c, z, y, x] = np.max(hostData[b, c, z * stride:z * stride + size, 46 | y * stride:y * stride + size, x * stride:x * stride + size]) 47 | 48 | assert np.allclose(hostOutData, maxpool3d.data.get()) 49 | 50 | grad = gpuarray.to_gpu(np.random.randn(*maxpool3d.data.shape).astype(np.float32)) 51 | maxpool3d.backward(grad) 52 | 53 | hostGrad = grad.get() 54 | hostInGrad = np.zeros(hostData.shape, dtype=np.float32) 55 | 56 | for b in range(batchsize): 57 | for c in range(maps): 58 | for z in range(hostOutData.shape[2]): 59 | for y in range(hostOutData.shape[3]): 60 | for x in range(hostOutData.shape[4]): 61 | for dz in range(size): 62 | for dy in range(size): 63 | for dx in range(size): 64 | if hostData[b,c,z*stride+dz,y*stride + dy,x*stride + dx] == hostOutData[b,c,z,y,x]: 65 | hostInGrad[b,c,z*stride + dz,y*stride + dy,x*stride + dx] += hostGrad[b,c,z,y,x] 66 | 67 | assert np.allclose(hostInGrad[:, :, pad:-pad, pad:-pad, pad:-pad], maxpool3d.grad.get()) 68 | 69 | 70 | if __name__ == "__main__": 71 | unittest() 72 | -------------------------------------------------------------------------------- /Modules/SpatialTf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import 
gpuarray 4 | from PuzzleLib.Backend.Dnn import spatialTf, spatialTfBackward 5 | 6 | from PuzzleLib.Modules.Module import ModuleError, Module 7 | 8 | 9 | class SpatialTf(Module): 10 | def __init__(self, shape=None, name=None): 11 | super().__init__(name) 12 | self.registerBlueprint(locals()) 13 | 14 | self.shape = shape 15 | self.grid = None 16 | 17 | 18 | def updateData(self, data): 19 | data, transform = data 20 | 21 | if self.train: 22 | self.data, self.grid = spatialTf(data, transform, outshape=self.shape, getGrid=True) 23 | else: 24 | self.data = spatialTf(data, transform, outshape=self.shape, getGrid=False) 25 | 26 | 27 | def updateGrad(self, grad): 28 | data, _ = self.inData 29 | self.grad = spatialTfBackward(grad, data, self.grid) 30 | 31 | 32 | def checkDataShape(self, shapes): 33 | dshape, tshape = shapes 34 | 35 | if len(tshape) != 3 or tshape[1:] != (2, 3): 36 | raise ModuleError("Bad transform shape (%s was given)" % tshape) 37 | 38 | if len(dshape) != 4: 39 | raise ModuleError("Data must be 4d tensor") 40 | 41 | if tshape[0] != dshape[0]: 42 | raise ModuleError("Inconsistency in transform and data batch size (%d in transform vs %d in data)" % 43 | (tshape[0], dshape[0])) 44 | 45 | 46 | def checkGradShape(self, shape): 47 | if len(shape) != 4: 48 | raise ModuleError("Grad must be 4d tensor") 49 | 50 | if self.shape is not None: 51 | if self.shape != shape[1:]: 52 | raise ModuleError("Bad grad shape (was given %s, expected %s)" % (shape[1:], self.shape)) 53 | else: 54 | if self.inData[0].shape != shape: 55 | raise ModuleError("Bad grad shape (was given %s, expected %s)" % (shape, self.inData[0].shape)) 56 | 57 | 58 | def dataShapeFrom(self, shapes): 59 | dshape, tshape = shapes 60 | return (dshape[0], ) + self.shape if self.shape is not None else dshape 61 | 62 | 63 | def gradShapeFrom(self, shape): 64 | return (shape[0], ) + self.inData[0].shape[1:], (shape[0], 2, 3) 65 | 66 | 67 | def reset(self): 68 | super().reset() 69 | self.grid = None 70 | 71 | 72 | def unittest(): 73 | batchsize, maps, inh, inw = 1, 1, 4, 4 74 | data = gpuarray.to_gpu(np.random.randn(batchsize, maps, inh, inw).astype(np.float32)) 75 | 76 | transform = gpuarray.to_gpu( 77 | np.tile(np.array([[1.0, 0.0, 0.001], [0, 1.0, 0.001]], dtype=np.float32), reps=(batchsize, 1, 1)) 78 | ) 79 | 80 | spatialtf = SpatialTf() 81 | spatialtf([data, transform]) 82 | 83 | grad = gpuarray.to_gpu(np.random.randn(*spatialtf.data.shape).astype(np.float32)) 84 | spatialtf.backward(grad) 85 | 86 | 87 | if __name__ == "__main__": 88 | unittest() 89 | -------------------------------------------------------------------------------- /Cost/SmoothL1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Backend.gpuarray import memoryPool as memPool 5 | from PuzzleLib.Backend.Kernels.Costs import smoothL1Ker 6 | 7 | from PuzzleLib.Cost.Cost import Cost 8 | 9 | 10 | class SmoothL1(Cost): 11 | def calcGrad(self, pred, target): 12 | grad = gpuarray.empty(pred.shape, dtype=np.float32, allocator=memPool) 13 | 14 | fullnorm = 1.0 / np.prod(target.shape) 15 | norm = 1.0 / np.prod(target.shape[1:]) 16 | 17 | self.devErr.fill(0.0) 18 | 19 | smoothL1Ker(pred, target, self.devErr, grad, norm, fullnorm) 20 | return grad 21 | 22 | 23 | def calcError(self, pred, target): 24 | self.accumErr += self.devErr 25 | 26 | 27 | def calcVal(self, pred, target): 28 | diff = gpuarray.empty(pred.shape, dtype=np.float32, allocator=memPool) 29 | 30 
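# smoothL1Ker computes the Huber-style cost spelled out in errorTest below:
# per element the error is |p - t| - 0.5 where |p - t| >= 1 and
# (p - t)**2 / 2 otherwise, with gradient sign(p - t) where |p - t| >= 1
# and p - t otherwise, both averaged over all elements.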
| fullnorm = 1.0 / np.prod(target.shape) 31 | 32 | devErr = gpuarray.zeros((), dtype=np.float32, allocator=memPool) 33 | smoothL1Ker(pred, target, devErr, diff, fullnorm, fullnorm) 34 | 35 | return devErr.get() 36 | 37 | 38 | def checkDataShape(self, pred, target): 39 | assert pred.shape[1:] == target.shape[1:] 40 | 41 | 42 | def checkValDataShape(self, pred, target): 43 | assert pred.shape[1:] == target.shape[1:] 44 | 45 | 46 | def unittest(): 47 | errorTest() 48 | valTest() 49 | 50 | 51 | def errorTest(): 52 | pred = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 53 | target = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 54 | 55 | smoothL1 = SmoothL1() 56 | smoothL1(pred, target) 57 | 58 | hostPred, hostTarget = pred.get(), target.get() 59 | hostGrad = ((np.abs(hostPred - hostTarget) >= 1.0) * np.sign(hostPred - hostTarget) + 60 | (np.abs(hostPred - hostTarget) < 1.0) * (hostPred - hostTarget)) / np.prod(pred.shape) 61 | 62 | assert np.allclose(hostGrad, smoothL1.grad.get()) 63 | 64 | hostError = np.mean((np.abs(hostPred - hostTarget) >= 1.0) * (np.abs(hostPred - hostTarget) - 0.5) + 65 | (np.abs(hostPred - hostTarget) < 1.0) * (hostPred - hostTarget)**2 / 2.0) 66 | 67 | assert np.isclose(smoothL1.error, hostError) 68 | 69 | 70 | def valTest(): 71 | pred = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 72 | target = gpuarray.to_gpu(np.random.randn(10, 10).astype(np.float32)) 73 | 74 | smoothL1 = SmoothL1() 75 | error = smoothL1.validate(pred, target) 76 | 77 | hostPred, hostTarget = pred.get(), target.get() 78 | 79 | hostError = np.mean((np.abs(hostPred - hostTarget) >= 1.0) * (np.abs(hostPred - hostTarget) - 0.5) + 80 | (np.abs(hostPred - hostTarget) < 1.0) * (hostPred - hostTarget)**2 / 2.0) 81 | 82 | assert np.isclose(error, hostError) 83 | 84 | 85 | if __name__ == "__main__": 86 | unittest() 87 | -------------------------------------------------------------------------------- /Hip/CheckInstall.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from PuzzleLib.Cuda.CheckInstall import checkInstall, checkRuntime, checkPipPackages 3 | 4 | 5 | hipTestKernel = """ 6 | 7 | #include 8 | #include 9 | 10 | 11 | __global__ void iaxpy(int *y, const int *x, int a, int size) 12 | { 13 | int i = blockIdx.x * blockDim.x + threadIdx.x; 14 | if (i < size) y[i] += a * x[i]; 15 | } 16 | 17 | 18 | #define HIP_ASSERT(status) do { if (!hipAssertStatus((status), __LINE__)) exit(1); } while (0) 19 | inline bool hipAssertStatus(hipError_t code, int line) 20 | { 21 | if (code != hipSuccess) 22 | { 23 | fprintf(stderr, "%s (line:%d)\\n", hipGetErrorString(code), line); 24 | return false; 25 | } 26 | 27 | return true; 28 | } 29 | 30 | 31 | int main() 32 | { 33 | int exitcode = 0; 34 | 35 | const int SIZE = 1 << 20; 36 | const int NBYTES = SIZE * sizeof(int); 37 | 38 | int *hostx = (int *)malloc(NBYTES); 39 | int *hosty = (int *)malloc(NBYTES); 40 | 41 | int *devx = NULL, *devy = NULL; 42 | HIP_ASSERT(hipMalloc(&devx, NBYTES)); 43 | HIP_ASSERT(hipMalloc(&devy, NBYTES)); 44 | 45 | for (int i = 0; i < SIZE; i++) 46 | { 47 | hostx[i] = i; 48 | hosty[i] = -i * 2; 49 | } 50 | 51 | HIP_ASSERT(hipMemcpy(devx, hostx, NBYTES, hipMemcpyHostToDevice)); 52 | HIP_ASSERT(hipMemcpy(devy, hosty, NBYTES, hipMemcpyHostToDevice)); 53 | 54 | const int NT = 256; 55 | hipLaunchKernelGGL(iaxpy, dim3((SIZE + NT - 1) / NT), dim3(NT), 0, 0, devy, devx, 2, SIZE); 56 | 57 | HIP_ASSERT(hipMemcpy(hosty, devy, NBYTES, hipMemcpyDeviceToHost)); 58 | 59 
| HIP_ASSERT(hipFree(devx)); 60 | HIP_ASSERT(hipFree(devy)); 61 | 62 | for (int i = 0; i < SIZE; i++) 63 | if (hosty[i] != 0) 64 | { 65 | fprintf(stderr, "kernel invocation failed!"); 66 | 67 | exitcode = 1; 68 | goto exit; 69 | } 70 | 71 | printf("finished successfully!"); 72 | fflush(stdout); 73 | 74 | exit: 75 | free(hostx); 76 | free(hosty); 77 | 78 | return exitcode; 79 | } 80 | 81 | """ 82 | 83 | 84 | def checkHipInstall(withRuntime, withPip): 85 | checkInstall( 86 | name="HIP", compiler="hipcc", 87 | download="https://rocm.github.io/install.html#ubuntu-support---installing-from-a-debian-repository", 88 | envpath="HIP_PATH" 89 | ) 90 | 91 | if withRuntime: 92 | checkRuntime(name="HIP", compiler="hipcc", kernel=hipTestKernel, ext=".hip.cpp") 93 | 94 | if withPip: 95 | checkPipPackages() 96 | 97 | 98 | def main(): 99 | try: 100 | checkHipInstall(withRuntime=True, withPip=True) 101 | 102 | except RuntimeError as e: 103 | print(e) 104 | 105 | print("Exiting ...") 106 | sys.exit(1) 107 | 108 | 109 | if __name__ == "__main__": 110 | main() 111 | -------------------------------------------------------------------------------- /Cost/Cost.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | 5 | 6 | class CostError(Exception): 7 | pass 8 | 9 | 10 | class Cost: 11 | def __init__(self): 12 | self.accumErr = gpuarray.empty((), dtype=np.float32) 13 | self.devErr = gpuarray.empty((), dtype=np.float32) 14 | 15 | self.error = None 16 | self.valError = None 17 | self.grad = None 18 | 19 | self.batchsize = None 20 | self.numOfSamples = None 21 | 22 | self.dirty = True 23 | self.resetAccumulator() 24 | 25 | 26 | def resetAccumulator(self): 27 | self.resetDeviceAccumulator() 28 | 29 | self.batchsize = 0 30 | self.numOfSamples = 0 31 | 32 | 33 | def updateState(self, samples): 34 | self.batchsize = samples 35 | self.numOfSamples += samples 36 | 37 | 38 | def resetDeviceAccumulator(self): 39 | self.accumErr.fill(0.0) 40 | 41 | 42 | def getError(self): 43 | if self.dirty: 44 | self.error = self.devErr.get() / self.batchsize 45 | self.dirty = False 46 | 47 | return self.error 48 | 49 | 50 | def getMeanError(self): 51 | return self.accumErr.get() / self.numOfSamples 52 | 53 | 54 | def getValError(self): 55 | return self.valError 56 | 57 | 58 | def __call__(self, pred, target, queryError=True): 59 | if isinstance(target, gpuarray.GPUArray) and isinstance(pred, gpuarray.GPUArray): 60 | assert pred.shape[0] == target.shape[0] 61 | 62 | self.checkDataShape(pred, target) 63 | self.reset() 64 | 65 | self.grad = self.calcGrad(pred, target) 66 | self.calcError(pred, target) 67 | self.dirty = True 68 | 69 | self.updateState(self.getBatchsize(pred)) 70 | 71 | if queryError: 72 | self.error = self.getError() 73 | 74 | if queryError: 75 | return self.error, self.grad 76 | else: 77 | return self.grad 78 | 79 | 80 | def calcError(self, pred, target): 81 | raise NotImplementedError() 82 | 83 | 84 | def calcGrad(self, pred, target): 85 | raise NotImplementedError() 86 | 87 | 88 | def validate(self, pred, target): 89 | if isinstance(target, gpuarray.GPUArray) and isinstance(pred, gpuarray.GPUArray): 90 | assert pred.shape[0] == target.shape[0] 91 | 92 | self.checkValDataShape(pred, target) 93 | self.valError = self.calcVal(pred, target) 94 | 95 | return self.valError 96 | 97 | 98 | def calcVal(self, pred, target): 99 | raise NotImplementedError() 100 | 101 | 102 | def reset(self): 103 | self.error = None 104 | 
self.valError = None 105 | 106 | self.grad = None 107 | 108 | 109 | def checkDataShape(self, pred, target): 110 | pass 111 | 112 | 113 | def checkValDataShape(self, pred, target): 114 | pass 115 | 116 | 117 | def getBatchsize(self, pred): 118 | return pred.shape[0] 119 | -------------------------------------------------------------------------------- /Datasets/MnistLoader.py: -------------------------------------------------------------------------------- 1 | import os, struct, array 2 | 3 | import numpy as np 4 | import h5py 5 | 6 | from PuzzleLib.Datasets.DataLoader import DataLoader 7 | 8 | 9 | class MnistLoader(DataLoader): 10 | def __init__(self, onSample=None, cachename="mnist.hdf"): 11 | super().__init__(("data", "labels"), cachename) 12 | 13 | if onSample: 14 | self.onSample = onSample 15 | else: 16 | self.onSample = lambda smp: np.array(smp, dtype=np.float32).reshape((1, 28, 28)) / 255.0 17 | 18 | self.testdata = "t10k-images.idx3-ubyte" 19 | self.testlabels = "t10k-labels.idx1-ubyte" 20 | 21 | self.traindata = "train-images.idx3-ubyte" 22 | self.trainlabels = "train-labels.idx1-ubyte" 23 | 24 | 25 | def load(self, path, compress="gzip", log=True): 26 | self.cachename = os.path.join(path, self.cachename) 27 | 28 | if not os.path.exists(self.cachename): 29 | imgs, lbls = [], [] 30 | 31 | if log: 32 | print("[%s] Started unpacking ..." % self.__class__.__name__) 33 | 34 | for filename in [self.testlabels, self.trainlabels]: 35 | with open(os.path.join(path, filename), "rb") as file: 36 | magic, size = struct.unpack(">II", file.read(8)) 37 | 38 | trueMagic = 2049 39 | if magic != trueMagic: 40 | raise ValueError("Bad magic number (got %s, expected %s)" % (magic, trueMagic)) 41 | 42 | lbls += array.array("B", file.read()) 43 | 44 | for filename in [self.testdata, self.traindata]: 45 | with open(os.path.join(path, filename), "rb") as file: 46 | magic, size, rows, cols = struct.unpack(">IIII", file.read(16)) 47 | 48 | trueMagic = 2051 49 | if magic != trueMagic: 50 | raise ValueError("Bad magic number (got %s, expected %s)" % (magic, trueMagic)) 51 | 52 | data = array.array("B", file.read()) 53 | datsize = rows * cols 54 | 55 | for i in range(size): 56 | dat = data[i * datsize:(i+1) * datsize] 57 | imgs.append(dat) 58 | 59 | images = np.empty((len(imgs), 1, rows, cols), dtype=np.float32) 60 | labels = np.empty((len(imgs), ), dtype=np.int32) 61 | 62 | print("[%s] Building cache ..." 
% self.__class__.__name__) 63 | 64 | for i in range(len(lbls)): 65 | images[i] = self.onSample(imgs[i]) 66 | labels[i] = lbls[i] 67 | 68 | with h5py.File(self.cachename, "w") as hdf: 69 | dsetname, lblsetname = self.datanames 70 | hdf.create_dataset(dsetname, data=images, compression=compress) 71 | hdf.create_dataset(lblsetname, data=labels, compression=compress) 72 | 73 | hdf = h5py.File(self.cachename, "r") 74 | dsetname, lblsetname = self.datanames 75 | return hdf[dsetname], hdf[lblsetname] 76 | 77 | 78 | def unittest(): 79 | mnist = MnistLoader() 80 | mnist.load(path="../TestData/") 81 | mnist.clear() 82 | 83 | 84 | if __name__ == "__main__": 85 | unittest() 86 | -------------------------------------------------------------------------------- /Optimizers/Adam.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | 5 | from PuzzleLib import Config 6 | 7 | from PuzzleLib.Backend import gpuarray 8 | from PuzzleLib.Backend.Kernels.ElementWise import adamKer 9 | 10 | from PuzzleLib.Optimizers.Optimizer import Optimizer, trainSimpleTest, trainHardTest 11 | 12 | 13 | class Adam(Optimizer): 14 | def __init__(self, alpha=1e-3, beta1=0.9, beta2=0.999, epsilon=1e-8, nodeinfo=None): 15 | super().__init__(nodeinfo) 16 | 17 | self.alpha = None 18 | self.beta1 = None 19 | self.beta2 = None 20 | self.epsilon = None 21 | 22 | self.setAttr("alpha", alpha) 23 | self.setAttr("beta1", beta1) 24 | self.setAttr("beta2", beta2) 25 | self.setAttr("epsilon", epsilon) 26 | 27 | 28 | def setupState(self, var): 29 | return { 30 | "mg": gpuarray.zeros(var.data.shape, dtype=np.float32), 31 | "ms": gpuarray.zeros(var.data.shape, dtype=np.float32) 32 | } 33 | 34 | 35 | def updateVar(self, var, state, stream=None): 36 | fix1, fix2 = 1.0 - self.beta1**self.t, 1.0 - self.beta2**self.t 37 | self.learnRate = self.alpha * math.sqrt(fix2) / fix1 38 | 39 | fix1, fix2 = 1.0 - self.beta1, 1.0 - self.beta2 40 | adamKer(var.data.dtype)( 41 | var.data, var.grad, state["mg"], state["ms"], self.learnRate * var.learnRate, fix1, fix2, self.epsilon, 42 | stream=stream 43 | ) 44 | 45 | 46 | def unittest(): 47 | for dtype, atol in gpuarray.dtypesSupported(): 48 | calcTest(dtype, atol) 49 | trainSimpleTest(Adam, dtype, alpha=1e-2) 50 | 51 | if Config.backend == Config.Backend.cuda: 52 | trainHardTest(Adam, dtype, alpha=1e-2) 53 | 54 | 55 | def calcTest(dtype, atol): 56 | alpha, beta1, beta2, epsilon = 0.01, 0.9, 0.999, 1e-8 57 | shape = (11, 13) 58 | 59 | hostW, hostDw = np.random.randn(*shape).astype(dtype), np.random.randn(*shape).astype(dtype) 60 | hostMs, hostMg = (1.0 + np.random.randn(*shape)**2).astype(np.float32), np.random.randn(*shape).astype(np.float32) 61 | 62 | w, dw = gpuarray.to_gpu(hostW), gpuarray.to_gpu(hostDw) 63 | ms, mg = gpuarray.to_gpu(hostMs), gpuarray.to_gpu(hostMg) 64 | 65 | fix1, fix2 = 1.0 - beta1, 1.0 - beta2 66 | lr = alpha * math.sqrt(fix2) / fix1 67 | 68 | fix1, fix2 = 1.0 - beta1, 1.0 - beta2 69 | adamKer(w.dtype)(w, dw, mg, ms, lr, fix1, fix2, epsilon) 70 | 71 | hostW, hostDw = hostW.astype(np.float32), hostDw.astype(np.float32) 72 | 73 | hostMg = (1 - fix1) * hostMg + fix1 * hostDw 74 | hostMs = (1 - fix2) * hostMs + fix2 * hostDw**2 75 | hostW += lr * hostMg / (np.sqrt(hostMs) + epsilon) 76 | 77 | hostW, hostDw = hostW.astype(dtype), hostDw.astype(dtype) 78 | 79 | assert np.allclose(hostMg, mg.get(), atol=atol) 80 | assert np.allclose(hostMs, ms.get(), atol=atol) 81 | assert np.allclose(hostW, w.get(), atol=atol) 82 | 83 | 84 | 
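# For reference, a NumPy restatement of the bias-corrected step that updateVar
# above performs through adamKer and that calcTest checks numerically; adamStep
# is an illustrative name, not part of the library API.
import math
import numpy as np

def adamStep(w, dw, mg, ms, t, alpha=1e-3, beta1=0.9, beta2=0.999, epsilon=1e-8):
	lr = alpha * math.sqrt(1.0 - beta2**t) / (1.0 - beta1**t)  # bias-corrected rate
	mg = beta1 * mg + (1.0 - beta1) * dw        # first moment: mean of gradients
	ms = beta2 * ms + (1.0 - beta2) * dw**2     # second moment: mean of squared gradients
	w = w + lr * mg / (np.sqrt(ms) + epsilon)   # ascent convention, as in calcTest
	return w, mg, ms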
if __name__ == "__main__": 85 | unittest() 86 | -------------------------------------------------------------------------------- /Converter/TensorRT/Tests/MnistLenetTest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PuzzleLib.Backend import gpuarray 4 | from PuzzleLib.Datasets import MnistLoader 5 | 6 | from PuzzleLib.Containers import * 7 | from PuzzleLib.Modules import * 8 | from PuzzleLib.Handlers import * 9 | from PuzzleLib.Optimizers import MomentumSGD 10 | from PuzzleLib.Cost import CrossEntropy 11 | 12 | from PuzzleLib.Converter.TensorRT.Tests.Common import benchModels 13 | from PuzzleLib.Converter.TensorRT.BuildRTEngine import buildRTEngine, RTDataType 14 | from PuzzleLib.Converter.TensorRT.DataCalibrator import DataCalibrator 15 | 16 | 17 | def buildNet(): 18 | seq = Sequential(name="lenet-5-like") 19 | seq.append(Conv2D(1, 16, 3)) 20 | seq.append(MaxPool2D()) 21 | seq.append(Activation(relu)) 22 | 23 | seq.append(Conv2D(16, 32, 4)) 24 | seq.append(MaxPool2D()) 25 | seq.append(Activation(relu)) 26 | 27 | seq.append(Flatten()) 28 | seq.append(Linear(32 * 5 * 5, 1024)) 29 | seq.append(Activation(relu)) 30 | 31 | seq.append(Linear(1024, 10)) 32 | 33 | return seq 34 | 35 | 36 | def trainNet(net, data, labels, epochs): 37 | optimizer = MomentumSGD() 38 | optimizer.setupOn(net, useGlobalState=True) 39 | optimizer.learnRate = 0.1 40 | optimizer.momRate = 0.9 41 | 42 | cost = CrossEntropy(maxlabels=10) 43 | trainer = Trainer(net, cost, optimizer) 44 | validator = Validator(net, cost) 45 | 46 | for i in range(epochs): 47 | trainer.trainFromHost( 48 | data[:60000], labels[:60000], macroBatchSize=60000, 49 | onMacroBatchFinish=lambda train: print("Train error: %s" % train.cost.getMeanError()) 50 | ) 51 | print("Accuracy: %s" % (1.0 - validator.validateFromHost(data[60000:], labels[60000:], macroBatchSize=10000))) 52 | 53 | optimizer.learnRate *= 0.9 54 | 55 | 56 | def validate(net, data, labels, batchsize=1): 57 | cost = CrossEntropy(maxlabels=10) 58 | validator = Validator(net, cost, batchsize=batchsize) 59 | 60 | return 1.0 - validator.validateFromHost(data[60000:], labels[60000:], macroBatchSize=10000) 61 | 62 | 63 | def main(): 64 | mnist = MnistLoader() 65 | data, labels = mnist.load(path="../TestData/") 66 | data, labels = data[:], labels[:] 67 | print("Loaded mnist") 68 | 69 | np.random.seed(1234) 70 | 71 | net = buildNet() 72 | trainNet(net, data, labels, 15) 73 | 74 | calibrator = DataCalibrator(data[:60000], cachename="../TestData/mnist_calibration_cache.bin") 75 | net.evalMode() 76 | 77 | engine = buildRTEngine( 78 | net, inshape=data[:1].shape, savepath="../TestData", dtype=RTDataType.int8, calibrator=calibrator 79 | ) 80 | 81 | benchModels(net, engine, gpuarray.to_gpu(data[:1])) 82 | 83 | print("Net accuracy: %s" % validate(net, data, labels)) 84 | print("Engine accuracy: %s" % validate(engine, data, labels, batchsize=1)) 85 | 86 | 87 | if __name__ == "__main__": 88 | main() 89 | --------------------------------------------------------------------------------
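# For reference, the shape arithmetic behind Linear(32 * 5 * 5, 1024) in
# buildNet of MnistLenetTest.py above, assuming 28x28 MNIST inputs and the
# defaults visible in this section: MaxPool2D() pools 2x2 with stride 2, and
# the convolutions are taken as unpadded with stride 1 (consistent with the
# 32 * 5 * 5 flatten size); convOut and poolOut are illustrative helpers.
def convOut(size, filt):
	return size - filt + 1              # unpadded convolution, stride 1

def poolOut(size):
	return (size - 2) // 2 + 1          # MaxPool2D() defaults: size=2, stride=2

size = poolOut(convOut(28, 3))          # Conv2D(1, 16, 3) -> 26x26, pool -> 13x13
size = poolOut(convOut(size, 4))        # Conv2D(16, 32, 4) -> 10x10, pool -> 5x5
assert 32 * size * size == 32 * 5 * 5   # 800 inputs to the first Linear layer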