├── .coveragerc ├── .gitattributes ├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── MANIFEST.in ├── README.md ├── pyproject.toml ├── requirements.txt ├── setup.py ├── src ├── CPUComplexCopy.h ├── CPUComplexType.h ├── CPUComplexTypeImpl.h ├── ComplexTensorApply.h ├── ComplexTypeInfo.h ├── General.h ├── SIMD │ ├── AVX.h │ ├── AVX2.h │ ├── Default.h │ ├── DefaultImpl.h │ └── SIMD.h ├── THTensorApply.h ├── Utils.h └── module.cpp ├── test.py ├── tests ├── __init__.py └── test_tensor.py ├── torch_complex └── __init__.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = 3 | torch_complex 4 | tests 5 | branch = True 6 | omit = 7 | torch_complex/cli.py 8 | 9 | [report] 10 | exclude_lines = 11 | no cov 12 | no qa 13 | noqa 14 | pragma: no cover 15 | if __name__ == .__main__.: 16 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig 2 | 3 | # Created by https://www.gitignore.io/api/macos,visualstudiocode,c,c++,python 4 | 5 | ### C ### 6 | # Prerequisites 7 | *.d 8 | 9 | # Object files 10 | *.o 11 | *.ko 12 | *.obj 13 | *.elf 14 | 15 | # Linker output 16 | *.ilk 17 | *.map 18 | *.exp 19 | 20 | # Precompiled Headers 21 | *.gch 22 | *.pch 23 | 24 | # Libraries 25 | *.lib 26 | *.a 27 | *.la 28 | *.lo 29 | 30 | # Shared objects (inc. 
Windows DLLs) 31 | *.dll 32 | *.so 33 | *.so.* 34 | *.dylib 35 | 36 | torch_complex/*.dll 37 | torch_complex/*.so 38 | torch_complex/*.so.* 39 | torch_complex/*.dylib 40 | 41 | # Executables 42 | *.exe 43 | *.out 44 | *.app 45 | *.i*86 46 | *.x86_64 47 | *.hex 48 | 49 | # Debug files 50 | *.dSYM/ 51 | *.su 52 | *.idb 53 | *.pdb 54 | 55 | # Kernel Module Compile Results 56 | *.mod* 57 | *.cmd 58 | .tmp_versions/ 59 | modules.order 60 | Module.symvers 61 | Mkfile.old 62 | dkms.conf 63 | 64 | ### C++ ### 65 | # Prerequisites 66 | 67 | # Compiled Object files 68 | *.slo 69 | 70 | # Precompiled Headers 71 | 72 | # Compiled Dynamic libraries 73 | 74 | # Fortran module files 75 | *.mod 76 | *.smod 77 | 78 | # Compiled Static libraries 79 | *.lai 80 | 81 | # Executables 82 | 83 | ### macOS ### 84 | # General 85 | .DS_Store 86 | .AppleDouble 87 | .LSOverride 88 | 89 | # Icon must end with two \r 90 | Icon 91 | 92 | # Thumbnails 93 | ._* 94 | 95 | # Files that might appear in the root of a volume 96 | .DocumentRevisions-V100 97 | .fseventsd 98 | .Spotlight-V100 99 | .TemporaryItems 100 | .Trashes 101 | .VolumeIcon.icns 102 | .com.apple.timemachine.donotpresent 103 | 104 | # Directories potentially created on remote AFP share 105 | .AppleDB 106 | .AppleDesktop 107 | Network Trash Folder 108 | Temporary Items 109 | .apdisk 110 | 111 | ### Python ### 112 | # Byte-compiled / optimized / DLL files 113 | __pycache__/ 114 | *.py[cod] 115 | *$py.class 116 | 117 | # C extensions 118 | 119 | # Distribution / packaging 120 | .Python 121 | build/ 122 | develop-eggs/ 123 | dist/ 124 | downloads/ 125 | eggs/ 126 | .eggs/ 127 | lib/ 128 | lib64/ 129 | parts/ 130 | sdist/ 131 | var/ 132 | wheels/ 133 | *.egg-info/ 134 | .installed.cfg 135 | *.egg 136 | MANIFEST 137 | 138 | # PyInstaller 139 | # Usually these files are written by a python script from a template 140 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
141 | *.manifest 142 | *.spec 143 | 144 | # Installer logs 145 | pip-log.txt 146 | pip-delete-this-directory.txt 147 | 148 | # Unit test / coverage reports 149 | htmlcov/ 150 | .tox/ 151 | .coverage 152 | .coverage.* 153 | .cache 154 | nosetests.xml 155 | coverage.xml 156 | *.cover 157 | .hypothesis/ 158 | .pytest_cache/ 159 | 160 | # Translations 161 | *.mo 162 | *.pot 163 | 164 | # Django stuff: 165 | *.log 166 | local_settings.py 167 | db.sqlite3 168 | 169 | # Flask stuff: 170 | instance/ 171 | .webassets-cache 172 | 173 | # Scrapy stuff: 174 | .scrapy 175 | 176 | # Sphinx documentation 177 | docs/_build/ 178 | 179 | # PyBuilder 180 | target/ 181 | 182 | # Jupyter Notebook 183 | .ipynb_checkpoints 184 | 185 | # IPython 186 | profile_default/ 187 | ipython_config.py 188 | 189 | # pyenv 190 | .python-version 191 | 192 | # celery beat schedule file 193 | celerybeat-schedule 194 | 195 | # SageMath parsed files 196 | *.sage.py 197 | 198 | # Environments 199 | .env 200 | .venv 201 | env/ 202 | venv/ 203 | ENV/ 204 | env.bak/ 205 | venv.bak/ 206 | 207 | # Spyder project settings 208 | .spyderproject 209 | .spyproject 210 | 211 | # Rope project settings 212 | .ropeproject 213 | 214 | # mkdocs documentation 215 | /site 216 | 217 | # mypy 218 | .mypy_cache/ 219 | .dmypy.json 220 | dmypy.json 221 | 222 | ### Python Patch ### 223 | .venv/ 224 | 225 | ### Python.VirtualEnv Stack ### 226 | # Virtualenv 227 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 228 | [Bb]in 229 | [Ii]nclude 230 | [Ll]ib 231 | [Ll]ib64 232 | [Ll]ocal 233 | [Ss]cripts 234 | pyvenv.cfg 235 | pip-selfcheck.json 236 | 237 | ### VisualStudioCode ### 238 | .vscode/* 239 | !.vscode/settings.json 240 | !.vscode/tasks.json 241 | !.vscode/launch.json 242 | !.vscode/extensions.json 243 | 244 | 245 | # End of https://www.gitignore.io/api/macos,visualstudiocode,c,c++,python 246 | 247 | # Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option) 248 | 249 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.venvPath": "/Users/roger/.virtualenvs", 3 | "python.pythonPath": "/Users/roger/.virtualenvs/torch/bin/python", 4 | "files.associations": { 5 | "forward_list": "cpp", 6 | "list": "cpp", 7 | "string": "cpp", 8 | "valarray": "cpp", 9 | "vector": "cpp", 10 | "__bit_reference": "cpp", 11 | "__string": "cpp", 12 | "algorithm": "cpp", 13 | "string_view": "cpp", 14 | "__config": "cpp", 15 | "__nullptr": "cpp", 16 | "cstddef": "cpp", 17 | "exception": "cpp", 18 | "initializer_list": "cpp", 19 | "new": "cpp", 20 | "stdexcept": "cpp", 21 | "type_traits": "cpp", 22 | "typeinfo": "cpp", 23 | "variant": "cpp" 24 | } 25 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 U.N. 
Owen
2 |
3 | MIT License
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.md
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyTorch-Complex
2 |
3 | Complex-valued tensor support for [PyTorch](https://github.com/pytorch/pytorch). (Work in progress)
4 |
5 | **Warning**: This package is at a very early stage; do not use it.
6 |
7 | ## Usage
8 |
9 | **Warning**: This package requires a fresh build of PyTorch
10 | revision 6cb593b88cb0c411690b4957850058329526d87b. Other
11 | revisions may work, but you will void the warranty.
12 |
13 | To use that revision, `git clone` PyTorch, check out the commit above, and build PyTorch from source.
14 | Once PyTorch is built successfully, you can build this plugin just like a normal Python package:
15 |
16 | ```sh
17 | python setup.py install
18 | python setup.py build
19 | python setup.py test
20 | ```
21 |
22 | ```python
23 | from torch_complex import torch
24 | ```
25 |
26 | or
27 |
28 | ```python
29 | import torch_complex.torch as torch
30 | ```
31 |
32 | Complex tensor support is then available in the `torch` module. Use it just like the other tensor types.
33 |
34 | ## Contribution
35 |
36 | Please read [Pytorch/#755](https://github.com/pytorch/pytorch/issues/755) first.
37 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = 'pytorch-complex'
3 | version = '0.0.1'
4 | description = 'Complex tensor support for PyTorch'
5 | author = 'Roger Luo'
6 | author_email = 'rogerluo.rl18@gmail.com'
7 | license = 'MIT'
8 | url = 'https://github.com/Roger-luo/pytorch-complex'
9 |
10 | [requires]
11 | python_version = ['2.7', '3.5', '3.6', 'pypy', 'pypy3']
12 |
13 | [build-system]
14 | requires = ['setuptools', 'wheel']
15 |
16 | [tool.hatch.commands]
17 | prerelease = 'hatch build'
18 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | -e .
2 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from io import open 2 | 3 | import platform 4 | import os, shutil, torch 5 | from setuptools import setup, find_packages 6 | import distutils.command.clean 7 | from torch.utils.cpp_extension import CppExtension 8 | 9 | class clean(distutils.command.clean.clean): 10 | 11 | def run(self): 12 | import glob 13 | import re 14 | with open('.gitignore', 'r') as f: 15 | ignores = f.read() 16 | pat = re.compile(r'^#( BEGIN NOT-CLEAN-FILES )?') 17 | for wildcard in filter(None, ignores.split('\n')): 18 | 19 | match = pat.match(wildcard) 20 | if match: 21 | if match.group(1): 22 | # Marker is found and stop reading .gitignore. 23 | break 24 | # Ignore lines which begin with '#'. 25 | else: 26 | for filename in glob.glob(wildcard): 27 | # skip vscode 28 | vscode_pat = re.compile(r'.vscode/.*') 29 | if re.match(vscode_pat, filename): 30 | continue 31 | 32 | try: 33 | os.remove(filename) 34 | except OSError: 35 | shutil.rmtree(filename, ignore_errors=True) 36 | 37 | # It's an old-style class in Python 2.7... 38 | distutils.command.clean.clean.run(self) 39 | 40 | 41 | with open('torch_complex/__init__.py', 'r') as f: 42 | for line in f: 43 | if line.startswith('__version__'): 44 | version = line.strip().split('=')[1].strip(' \'"') 45 | break 46 | else: 47 | version = '0.0.1' 48 | 49 | with open('README.md', 'r', encoding='utf-8') as f: 50 | readme = f.read() 51 | 52 | REQUIRES = [] 53 | 54 | cmdclass = { 55 | "build_ext": torch.utils.cpp_extension.BuildExtension, 56 | 'clean': clean, 57 | } 58 | 59 | if platform.system() == 'Darwin': 60 | extra_compile_args = ["-g", "-stdlib=libc++", "-std=c++11"] 61 | else: 62 | extra_compile_args = ["-g"] 63 | 64 | ext_modules = [ 65 | CppExtension( 66 | "torch_complex.cpp", 67 | ["src/module.cpp"], 68 | extra_compile_args=extra_compile_args, 69 | ) 70 | ] 71 | 72 | setup( 73 | name='torch-complex', 74 | version=version, 75 | description='', 76 | long_description=readme, 77 | author='Roger Luo', 78 | author_email='rogerluo.rl18@gmail.com', 79 | maintainer='Roger Luo', 80 | maintainer_email='rogerluo.rl18@gmail.com', 81 | url='https://github.com/_/torch-complex', 82 | license='MIT', 83 | 84 | keywords=[ 85 | '', 86 | ], 87 | 88 | classifiers=[ 89 | 'Development Status :: 4 - Beta', 90 | 'Intended Audience :: Developers', 91 | 'License :: OSI Approved :: MIT License', 92 | 'Natural Language :: English', 93 | 'Operating System :: OS Independent', 94 | 'Programming Language :: Python :: 2.7', 95 | 'Programming Language :: Python :: 3.5', 96 | 'Programming Language :: Python :: 3.6', 97 | 'Programming Language :: Python :: Implementation :: CPython', 98 | ], 99 | 100 | install_requires=REQUIRES, 101 | tests_require=['coverage', 'pytest'], 102 | 103 | packages=find_packages(), 104 | ext_modules=ext_modules, 105 | cmdclass=cmdclass, 106 | ) 107 | -------------------------------------------------------------------------------- /src/CPUComplexCopy.h: -------------------------------------------------------------------------------- 1 | #ifndef CPU_COMPLEX_COPY_H 2 | #define CPU_COMPLEX_COPY_H 3 | 4 | #include "General.h" 5 | #include "ComplexTypeInfo.h" 6 | #include "CPUComplexType.h" 7 | 8 | namespace at { 9 | 10 | template 11 | struct CPUCopy; 12 | 13 | // template arguments is mixed with C macros, e.g 14 | // TH_TENSOR_APPLY2(CPUTypeInfo, ...) 15 | // will not be correct... 
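// Illustrative sketch of the macro/template clash described above (hypothetical
// names; the exact template arguments were lost in this copy, so they are
// assumptions). The preprocessor splits macro arguments at every top-level comma
// and knows nothing about angle brackets, so something like
//
//     TH_TENSOR_APPLY2(TypeInfo<std::complex<float>, Backend::CPU>::scalar_t, dst, ...)
//
// hands the macro an extra argument, cut at the comma inside TypeInfo<...>.
// The usual workaround is to name the type first and pass the alias instead:
//
//     using dst_scalar_t = TypeInfo<std::complex<float>, Backend::CPU>::scalar_t;
//     TH_TENSOR_APPLY2(dst_scalar_t, dst, dst_scalar_t, src, *dst_data = *src_data;)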
16 |
17 | // template
18 | // struct CPUCopy {
19 |
20 | // inline static void eval(TensorImpl *dst, TensorImpl *src) {
21 | // CPUTypeInfo::scalar_t *dst_data = NULL;
22 |
23 | // TH_TENSOR_APPLY2(
24 | // CPUTypeInfo::scalar_t, dst,
25 | // CPUTypeInfo::scalar_t, src,
26 | // *dst_data = static_cast::scalar_t>(static_cast::scalar_t>>(*src_data));
27 | // )
28 | // }
29 | // };
30 |
31 | // Copy from THTensorCopy
32 | //
33 | // C and C++ have a lovely set of implicit conversion rules, where casting
34 | // signed integral values to unsigned integral values is always valid
35 | // (it basically treats the value as if using modulo arithmetic), however
36 | // converting negative floating point values to unsigned integral types
37 | // is UB! This means that: (double)-1 -> (int64_t)-1 -> (uint8_t)255 is
38 | // guaranteed to look like this, but a direct (double)-1 -> (uint8_t)
39 | // conversion is UB. This also makes UBSan really angry.
40 | //
41 | // I think those rules are stupid and we really shouldn't conform to them.
42 | // The structs below ensure that for all unsigned types we use (currently
43 | // only uint8_t), we will do an intermediate conversion via int64_t,
44 | // to ensure that any negative values are wrapped around correctly.
45 | //
46 | // Note that conversions from doubles to signed integral types that can't
47 | // represent a particular value after truncating the fractional part are UB as well,
48 | // but fixing them is not as simple as adding an int64_t intermediate, because the
49 | // int64_t -> smaller-signed-type conversion is UB for those large values anyway.
50 | // I guess in that case we just have to live with that, but it's definitely less
51 | // surprising than the thing above.
52 | //
53 | // For the curious:
54 | // https://en.cppreference.com/w/cpp/language/implicit_conversion
55 | // The relevant paragraph is "Floating–integral conversions".
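// A concrete illustration of the intermediate cast described above (hypothetical
// snippet, for exposition only; the real copies go through CPUCopy further down):
//
//     double d = -1.0;
//     // uint8_t u = static_cast<uint8_t>(d);                     // UB: -1 not representable in uint8_t
//     uint8_t u = static_cast<uint8_t>(static_cast<int64_t>(d));  // well-defined: wraps to 255
//
// inter_copy_type_t<uint8_t> below resolves to int64_t, so the generic copy loop
// performs exactly this two-step conversion for the unsigned case.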
56 | 57 | template 58 | struct inter_copy_type { 59 | using type = T; 60 | }; 61 | 62 | template<> 63 | struct inter_copy_type { 64 | using type = int64_t; 65 | }; 66 | 67 | template 68 | using inter_copy_type_t = typename inter_copy_type::type; 69 | 70 | 71 | template 72 | struct CPUCopy { 73 | inline static void eval(TensorImpl *dst, TensorImpl *src) { 74 | TH_TENSOR_APPLY2( 75 | DST, dst, 76 | SRC, src, 77 | *dst_data = static_cast(static_cast>(*src_data)); 78 | ) 79 | } 80 | }; 81 | 82 | // copy from complex to real 83 | template 84 | struct CPUCopy> { 85 | inline static void eval(TensorImpl *dst, TensorImpl *src) { 86 | TH_TENSOR_APPLY2( 87 | DST, dst, 88 | std::complex, src, 89 | *dst_data = static_cast(static_cast>((*src_data).real())); 90 | ) 91 | } 92 | }; 93 | 94 | 95 | template 96 | Tensor & CPUComplexType::s_copy_(Tensor & dst, const Tensor & src, bool non_blocking) const { 97 | checked_tensor_unwrap(dst, "dst", 0, false, Backend::CPU, CPUComplexTypeInfo::scalar_type); 98 | 99 | switch (src.type().ID()) { 100 | case TypeID::CPUByte: 101 | CPUCopy, int8_t>::eval(dst.unsafeGetTensorImpl(), src.unsafeGetTensorImpl()); 102 | break; 103 | case TypeID::CPUChar: 104 | CPUCopy, int8_t>::eval(dst.unsafeGetTensorImpl(), src.unsafeGetTensorImpl()); 105 | break; 106 | case TypeID::CPUDouble: 107 | std::cout << "double is copied to complex" << std::endl; 108 | CPUCopy, double>::eval(dst.unsafeGetTensorImpl(), src.unsafeGetTensorImpl()); 109 | break; 110 | case TypeID::CPUFloat: 111 | CPUCopy, float>::eval(dst.unsafeGetTensorImpl(), src.unsafeGetTensorImpl()); 112 | break; 113 | case TypeID::CPUComplexFloat: 114 | CPUCopy, std::complex>::eval(dst.unsafeGetTensorImpl(), src.unsafeGetTensorImpl()); 115 | break; 116 | case TypeID::CPUComplexDouble: 117 | CPUCopy, std::complex>::eval(dst.unsafeGetTensorImpl(), src.unsafeGetTensorImpl()); 118 | break; 119 | case TypeID::CPUInt: 120 | CPUCopy, int32_t>::eval(dst.unsafeGetTensorImpl(), src.unsafeGetTensorImpl()); 121 | break; 122 | case TypeID::CPULong: 123 | CPUCopy, int64_t>::eval(dst.unsafeGetTensorImpl(), src.unsafeGetTensorImpl()); 124 | break; 125 | case TypeID::CPUShort: 126 | CPUCopy, int16_t>::eval(dst.unsafeGetTensorImpl(), src.unsafeGetTensorImpl()); 127 | break; 128 | case TypeID::CPUHalf: 129 | std::cout << "copy half" << std::endl; 130 | break; 131 | default: 132 | at::_s_copy_from(src, dst, non_blocking); 133 | return dst; 134 | } 135 | 136 | dst.unsafeGetTensorImpl()->maybe_zero_dim(src.dim() == 0); 137 | return dst; 138 | } 139 | 140 | template 141 | Tensor CPUComplexType::_s_copy_from(const Tensor & src, const Tensor & dst, bool non_blocking) const { 142 | // This handles the copy from other types 143 | 144 | switch (dst.type().ID()) { 145 | case TypeID::CPUByte: 146 | CPUCopy>::eval(dst.unsafeGetTensorImpl(), src.unsafeGetTensorImpl()); 147 | break; 148 | case TypeID::CPUChar: 149 | CPUCopy>::eval(dst.unsafeGetTensorImpl(), src.unsafeGetTensorImpl()); 150 | break; 151 | case TypeID::CPUDouble: 152 | CPUCopy>::eval(dst.unsafeGetTensorImpl(), src.unsafeGetTensorImpl()); 153 | break; 154 | case TypeID::CPUFloat: 155 | CPUCopy>::eval(dst.unsafeGetTensorImpl(), src.unsafeGetTensorImpl()); 156 | break; 157 | case TypeID::CPUInt: 158 | CPUCopy>::eval(dst.unsafeGetTensorImpl(), src.unsafeGetTensorImpl()); 159 | break; 160 | case TypeID::CPULong: 161 | CPUCopy>::eval(dst.unsafeGetTensorImpl(), src.unsafeGetTensorImpl()); 162 | break; 163 | case TypeID::CPUShort: 164 | CPUCopy>::eval(dst.unsafeGetTensorImpl(), src.unsafeGetTensorImpl()); 
165 | break; 166 | default: 167 | AT_ERROR("copy does not support ", src.type().toString(), " to ", dst.type().toString(), " copy (copy_from case)."); 168 | } 169 | dst.unsafeGetTensorImpl()->maybe_zero_dim(src.dim() == 0); 170 | return dst; 171 | } 172 | 173 | } // at 174 | 175 | #endif // CPU_COMPLEX_COPY_H 176 | -------------------------------------------------------------------------------- /src/CPUComplexType.h: -------------------------------------------------------------------------------- 1 | #ifndef CPUComplexType_H 2 | #define CPUComplexType_H 3 | 4 | #include "General.h" 5 | #include "ComplexTypeInfo.h" 6 | 7 | namespace at { 8 | 9 | template 10 | struct CPUComplexType: public at::CPUTypeDefault { 11 | 12 | CPUComplexType() 13 | : CPUTypeDefault(CPUTensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} 14 | 15 | virtual ScalarType scalarType() const override; 16 | virtual caffe2::TypeMeta typeMeta() const override; 17 | Backend backend() const override; 18 | const char * toString() const override; 19 | size_t elementSizeInBytes() const override; 20 | TypeID ID() const override; 21 | Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const override; 22 | Tensor _s_copy_from(const Tensor & self, const Tensor & dst, bool non_blocking) const override; 23 | 24 | // Tensor & resize_(Tensor & self, IntList size) const override; 25 | 26 | /* 27 | Tensor _th_tensor(Storage storage, int64_t storageOffset, IntList sizes, IntList strides) const override; 28 | Tensor _th_tensor(IntList sizes, IntList strides) const override; 29 | */ 30 | Tensor empty(IntList size, const TensorOptions & options) const override; 31 | /* 32 | Tensor tensor() const override; 33 | */ 34 | 35 | Tensor & set_(Tensor & self, Storage source, int64_t storage_offset, IntList size, IntList stride) const override; 36 | // Tensor & set_(Tensor & self, Storage source) const override; 37 | // Tensor & set_(Tensor & self, const Tensor & source) const override; 38 | // Tensor & set_(Tensor & self) const override; 39 | 40 | Tensor & cat_out(Tensor & self, TensorList tensors, int64_t dim) const override; 41 | Tensor cat(TensorList tensors, int64_t dim) const override; 42 | 43 | Tensor & fill_(Tensor & self, Scalar value) const override; 44 | Tensor & fill_(Tensor & self, const Tensor & value) const override; 45 | 46 | Tensor & zero_(Tensor & self) const override; 47 | Tensor & native_zero_(Tensor & self) const override; 48 | void* data_ptr(const Tensor & self) const override; 49 | Scalar _local_scalar_dense(const Tensor & self) const override; 50 | 51 | // LinearAlgebra 52 | Tensor & mv_out(Tensor & result, const Tensor & self, const Tensor & vec) const override; 53 | Tensor mv(const Tensor & self, const Tensor & vec) const override; 54 | Tensor & mm_out(Tensor & result, const Tensor & self, const Tensor & mat2) const override; 55 | Tensor mm(const Tensor & self, const Tensor & mat2) const override; 56 | }; 57 | 58 | } // namespace at 59 | 60 | #include "CPUComplexTypeImpl.h" 61 | #include "CPUComplexCopy.h" 62 | 63 | #endif // CPUComplexType_H 64 | -------------------------------------------------------------------------------- /src/CPUComplexTypeImpl.h: -------------------------------------------------------------------------------- 1 | #include "CPUComplexType.h" 2 | #include "Utils.h" 3 | #include "ComplexTensorApply.h" 4 | #include "SIMD/SIMD.h" 5 | 6 | namespace at { 7 | 8 | template 9 | ScalarType CPUComplexType::scalarType() const { 10 | return CPUComplexTypeInfo::scalar_type; 11 | } 12 | 13 | 
template 14 | caffe2::TypeMeta CPUComplexType::typeMeta() const { 15 | return scalarTypeToTypeMeta(CPUComplexTypeInfo::scalar_type); 16 | } 17 | 18 | template 19 | Backend CPUComplexType::backend() const { 20 | return Backend::CPU; 21 | } 22 | 23 | template 24 | TypeID CPUComplexType::ID() const { 25 | return CPUComplexTypeInfo::type_id; 26 | } 27 | 28 | template 29 | size_t CPUComplexType::elementSizeInBytes() const { 30 | return 2 * sizeof(PT); 31 | } 32 | 33 | #if 0 34 | template 35 | Tensor CPUComplexType::_th_tensor(Storage storage, int64_t storageOffset, IntList sizes, IntList strides) const { 36 | // DeviceGuard omitted 37 | 38 | // checks 39 | if (strides.data()) {AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match");} 40 | auto storage_ = checked_storage(storage, "storage", 1, DeviceType::CPU, at::scalarTypeToDataType(CPUComplexTypeInfo::scalar_type)); 41 | 42 | // make tensor 43 | auto self = c10::make_intrusive( 44 | /* storage */ std::move(storage_), 45 | /* tensor type id */ at::CPUTensorId(), 46 | /* is_variable */ false); 47 | 48 | /* storageOffset */ 49 | if(storageOffset < 0) 50 | THError("Tensor: invalid storage offset"); 51 | self->set_storage_offset(storageOffset); 52 | 53 | // set size 54 | self->set_sizes_and_strides(sizes, strides); 55 | return Tensor(self); 56 | } 57 | 58 | template 59 | Tensor CPUComplexType::_th_tensor(IntList sizes, IntList strides) const { 60 | // DeviceGuard omitted 61 | int64_t numel = 1; 62 | for (auto s : sizes) { 63 | numel *= s; 64 | } 65 | 66 | Storage s{c10::make_intrusive( 67 | scalarTypeToTypeMeta(CPUComplexTypeInfo::scalar_type), 68 | numel, 69 | getCPUAllocator(), 70 | /* resizable */ true)}; 71 | 72 | return tensor(s, 0, sizes, strides); 73 | } 74 | #endif 75 | 76 | template 77 | Tensor CPUComplexType::empty(IntList size, const TensorOptions & options) const { 78 | const DeviceGuard device_guard(options.device()); 79 | return at::native::empty_cpu(/* actuals */ size, options); 80 | } 81 | 82 | #if 0 83 | template 84 | Tensor CPUComplexType::tensor() const { 85 | Storage s{c10::make_intrusive( 86 | scalarTypeToTypeMeta(CPUComplexTypeInfo::scalar_type), 87 | 0, 88 | getCPUAllocator(), 89 | /* resizable */ true)}; 90 | 91 | // make tensor 92 | Tensor t{c10::make_intrusive( 93 | /* storage */ std::move(s), 94 | /* tensor type id */ at::CPUTensorId(), 95 | /* is_variable */ false)}; 96 | 97 | return t; 98 | } 99 | #endif 100 | 101 | template 102 | Tensor & CPUComplexType::set_(Tensor & self, Storage source, int64_t storage_offset, IntList sizes, IntList strides) const { 103 | // DeviceGuard omitted 104 | auto self_ = checked_tensor_unwrap(self,"self",1, false, Backend::CPU, CPUComplexTypeInfo::scalar_type); 105 | auto source_ = checked_storage(source,"source",2, DeviceType::CPU, at::scalarTypeToDataType(CPUComplexTypeInfo::scalar_type)); 106 | 107 | StorageImpl *storage_ptr = source.unsafeGetStorageImpl(); 108 | StorageImpl *self_storage_ptr = self_->storage().unsafeGetStorageImpl(); 109 | 110 | if (self_storage_ptr != storage_ptr) 111 | { 112 | if (!self_storage_ptr) { 113 | AT_ERROR("Tensor: invalid null storage"); 114 | } 115 | 116 | // steal storage 117 | self_->set_storage(at::Storage(c10::intrusive_ptr::reclaim(storage_ptr))); 118 | } 119 | 120 | if (storage_offset < 0) 121 | AT_ERROR("Tensor: invalid storage offset"); 122 | 123 | self_->set_storage_offset(storage_offset); 124 | // set size 125 | self_->set_sizes_and_strides(sizes, strides); 126 | self_->maybe_zero_dim(false); 127 | return self; 128 | } 
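// Descriptive note on the set_ overload above: it retargets `self` at `source`'s
// StorageImpl (taking over the reference, per the "steal storage" comment), then
// applies the requested storage offset, sizes and strides and marks the result as
// non-scalar. A hypothetical call, viewing the first four complex values of a
// storage as a 2x2 matrix, would look like:
//
//     t.set_(storage, /*storage_offset=*/0, /*size=*/{2, 2}, /*stride=*/{2, 1});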
129 | 130 | template 131 | Tensor & CPUComplexType::cat_out(Tensor & self, TensorList tensors, int64_t dim) const { 132 | const OptionalDeviceGuard device_guard(device_of(self)); 133 | // auto self_ = checked_tensor_unwrap(self, "self", 1, false, Backend::CPU, CPUComplexTypeInfo::scalar_type); 134 | // auto tensors_ = checked_tensor_unwrap(tensors, "tensors", 1, Backend::CPU, CPUComplexTypeInfo::scalar_type); 135 | 136 | AT_ERROR("catArray is not implemented, it's in THTensorMoreMath.cpp"); 137 | }; 138 | 139 | template 140 | Tensor CPUComplexType::cat(TensorList tensors, int64_t dim) const { 141 | AT_ERROR("cat not implemented"); 142 | }; 143 | 144 | /* NOTE: This C macro here mainly because ISO C++03 14.2/4 145 | * 146 | * When the name of a member template specialization appears after . or -> in a postfix-expression, 147 | * or after nested-name-specifier in a qualified-id, and the postfix-expression or qualified-id 148 | * explicitly depends on a template-parameter (14.6.2), the member template name must be prefixed 149 | * by the keyword template. Otherwise the name is assumed to name a non-template. 150 | * 151 | * We have TENSOR->data inside the TH_TENSOR_APPLY macro without template, but our implementation via 152 | * C++ templates for generic complex number requires a template keyword for data. 153 | * 154 | * This is just a workaround, when everything moves to ATen/native, we can use the new protocals. 155 | */ 156 | #define IMPLEMENT_FILL(PrecisionType) \ 157 | template <> \ 158 | Tensor & CPUComplexType::fill_(Tensor & self, Scalar value) const { \ 159 | const OptionalDeviceGuard device_guard(device_of(self)); \ 160 | auto self_ = checked_tensor_unwrap(self,"self",1, false, Backend::CPU, CPUComplexTypeInfo::scalar_type); \ 161 | auto value_ = value.to>(); \ 162 | \ 163 | if(self_->is_contiguous() || is_transposed(self_)) { \ 164 | TH_TENSOR_APPLY_CONTIG(std::complex, self_, simd::Default>::fill(self__data, value_, self__len); ); \ 165 | } else { \ 166 | TH_TENSOR_APPLY(std::complex, self_, \ 167 | if (self__stride == 1) { \ 168 | simd::Default>::fill(self__data, value_, self__size); \ 169 | self__i = self__size; \ 170 | self__data += self__stride * self__size; \ 171 | break; \ 172 | } else { \ 173 | *self__data = value_; \ 174 | } \ 175 | ); \ 176 | } \ 177 | \ 178 | return self; \ 179 | } 180 | 181 | IMPLEMENT_FILL(double) 182 | IMPLEMENT_FILL(float) 183 | 184 | template 185 | Tensor &CPUComplexType::fill_(Tensor &self, const Tensor & value) const { 186 | const OptionalDeviceGuard device_guard(device_of(self)); 187 | if (value.dim() == 0) { 188 | return static_cast(this)->fill_(self, value.item()); 189 | } 190 | AT_ERROR("fill_ only supports a 0-dimensional value tensor, but got tensor " 191 | "with ", value.dim(), " dimension(s)."); 192 | } 193 | 194 | template 195 | Tensor & CPUComplexType::zero_(Tensor & self) const { 196 | return fill_(self, Scalar(0.0)); 197 | } 198 | 199 | template 200 | Tensor &CPUComplexType::native_zero_(Tensor & self) const { 201 | return fill_(self, Scalar(0.0)); 202 | } 203 | 204 | template 205 | void *CPUComplexType::data_ptr(const Tensor & self) const { 206 | auto self_ = checked_tensor_unwrap(self,"self",1, false, Backend::CPU, CPUComplexTypeInfo::scalar_type); 207 | return self_->template data>(); 208 | } 209 | 210 | template 211 | Scalar CPUComplexType::_local_scalar_dense(const Tensor & self) const { 212 | const OptionalDeviceGuard device_guard(device_of(self)); 213 | const auto& self_ty = *this; 214 | (void)self_ty; 215 | return 
at::native::_local_scalar_dense_cpu(/* actuals */ self); 216 | } 217 | 218 | template <> 219 | inline const char * CPUComplexType::toString() const { 220 | return "CPUComplexTensor"; 221 | } 222 | 223 | template <> 224 | inline const char * CPUComplexType::toString() const { 225 | return "CPUComplexType"; 226 | } 227 | 228 | // Linear Algebra 229 | template 230 | Tensor & CPUComplexType::mv_out(Tensor & result, const Tensor & self, const Tensor & vec) const { 231 | AT_ERROR("mv_out not implemented"); 232 | } 233 | 234 | template 235 | Tensor CPUComplexType::mv(const Tensor & self, const Tensor & vec) const { 236 | AT_ERROR("mv not implemented"); 237 | } 238 | 239 | template 240 | Tensor CPUComplexType::mm(const Tensor &self, const Tensor &mat2) const { 241 | AT_ERROR("mm not implemented"); 242 | } 243 | 244 | template 245 | Tensor & CPUComplexType::mm_out(Tensor & result, const Tensor & self, const Tensor & mat2) const { 246 | AT_ERROR("mm_out not implemented"); 247 | } 248 | 249 | } // at 250 | -------------------------------------------------------------------------------- /src/ComplexTensorApply.h: -------------------------------------------------------------------------------- 1 | #include "ComplexTypeInfo.h" 2 | 3 | #ifndef NAN 4 | #define NAN (nan(NULL)) 5 | #endif 6 | 7 | #ifdef _OPENMP 8 | #include 9 | #endif 10 | 11 | #define HYPER_TH_OMP_OVERHEAD_THRESHOLD 2000 12 | #define ORDIN_TH_OMP_OVERHEAD_THRESHOLD 20000 13 | #define UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD 50000 14 | #define TH_OMP_OVERHEAD_THRESHOLD 100000 15 | 16 | #ifdef _OPENMP 17 | 18 | #ifndef _WIN32 19 | #define PRAGMA(P) _Pragma(#P) 20 | #else 21 | #define PRAGMA(P) __pragma(P) 22 | #endif 23 | 24 | #define TH_TENSOR_APPLY_CONTIG(TYPE, TENSOR, CODE) \ 25 | { \ 26 | int inOmp = omp_in_parallel(); \ 27 | ptrdiff_t TH_TENSOR_size = TENSOR->numel(); \ 28 | PRAGMA(omp parallel if ((TH_TENSOR_size > TH_OMP_OVERHEAD_THRESHOLD) && (!inOmp))) \ 29 | { \ 30 | size_t num_threads = omp_get_num_threads(); \ 31 | size_t tid = omp_get_thread_num(); \ 32 | ptrdiff_t TH_TENSOR_offset = tid * (TH_TENSOR_size / num_threads); \ 33 | ptrdiff_t TH_TENSOR_end = tid == num_threads - 1 ? TH_TENSOR_size : \ 34 | TH_TENSOR_offset + TH_TENSOR_size / num_threads; \ 35 | ptrdiff_t TENSOR##_len = TH_TENSOR_end - TH_TENSOR_offset; \ 36 | TYPE *TENSOR##_data = TENSOR->template data() + TH_TENSOR_offset; \ 37 | CODE \ 38 | } \ 39 | } 40 | #else 41 | #define TH_TENSOR_APPLY_CONTIG(TYPE, TENSOR, CODE) \ 42 | { \ 43 | TYPE *TENSOR##_data = TENSOR->template data(); \ 44 | ptrdiff_t TENSOR##_len = TENSOR->numel(); \ 45 | CODE \ 46 | } 47 | #endif 48 | 49 | #ifdef _OPENMP 50 | #define TH_TENSOR_APPLY2_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \ 51 | { \ 52 | int inOmp = omp_in_parallel(); \ 53 | ptrdiff_t TH_TENSOR_size = TENSOR->numel(); \ 54 | PRAGMA(omp parallel if ((TH_TENSOR_size > TH_OMP_OVERHEAD_THRESHOLD) && (!inOmp))) \ 55 | { \ 56 | size_t num_threads = omp_get_num_threads(); \ 57 | size_t tid = omp_get_thread_num(); \ 58 | ptrdiff_t TH_TENSOR_offset = tid * (TH_TENSOR_size / num_threads); \ 59 | ptrdiff_t TH_TENSOR_end = tid == num_threads - 1 ? 
TH_TENSOR_size : \ 60 | TH_TENSOR_offset + TH_TENSOR_size / num_threads; \ 61 | ptrdiff_t TENSOR1##_len = TH_TENSOR_end - TH_TENSOR_offset; \ 62 | TYPE1 *TENSOR1##_data = TENSOR1->template data() + TH_TENSOR_offset; \ 63 | TYPE2 *TENSOR2##_data = TENSOR2->template data() + TH_TENSOR_offset; \ 64 | CODE \ 65 | } \ 66 | } 67 | #else 68 | #define TH_TENSOR_APPLY2_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \ 69 | { \ 70 | TYPE1 *TENSOR1##_data = TENSOR1->template data(); \ 71 | TYPE2 *TENSOR2##_data = TENSOR2->template data(); \ 72 | ptrdiff_t TENSOR1##_len = TENSOR1->numel(); \ 73 | CODE \ 74 | } 75 | #endif 76 | 77 | #ifdef _OPENMP 78 | #define TH_TENSOR_APPLY3_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \ 79 | { \ 80 | int inOmp = omp_in_parallel(); \ 81 | ptrdiff_t TH_TENSOR_size = TENSOR1->numel(); \ 82 | PRAGMA(omp parallel if ((TH_TENSOR_size > TH_OMP_OVERHEAD_THRESHOLD) && (!inOmp))) \ 83 | { \ 84 | size_t num_threads = omp_get_num_threads(); \ 85 | size_t tid = omp_get_thread_num(); \ 86 | ptrdiff_t TH_TENSOR_offset = tid * (TH_TENSOR_size / num_threads); \ 87 | ptrdiff_t TH_TENSOR_end = tid == num_threads - 1 ? TH_TENSOR_size : \ 88 | TH_TENSOR_offset + TH_TENSOR_size / num_threads; \ 89 | ptrdiff_t TENSOR1##_len = TH_TENSOR_end - TH_TENSOR_offset; \ 90 | TYPE1 *TENSOR1##_data = TENSOR1->template data() + TH_TENSOR_offset; \ 91 | TYPE2 *TENSOR2##_data = TENSOR2->template data() + TH_TENSOR_offset; \ 92 | TYPE3 *TENSOR3##_data = TENSOR3->template data() + TH_TENSOR_offset; \ 93 | CODE \ 94 | } \ 95 | } 96 | #else 97 | #define TH_TENSOR_APPLY3_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \ 98 | { \ 99 | TYPE1 *TENSOR1##_data = TENSOR1->template data(); \ 100 | TYPE2 *TENSOR2##_data = TENSOR2->template data(); \ 101 | TYPE3 *TENSOR3##_data = TENSOR3->template data(); \ 102 | ptrdiff_t TENSOR1##_len = TENSOR1->numel(); \ 103 | CODE \ 104 | } 105 | #endif 106 | -------------------------------------------------------------------------------- /src/ComplexTypeInfo.h: -------------------------------------------------------------------------------- 1 | #ifndef COMPLEX_TYPE_INFO_H 2 | #define COMPLEX_TYPE_INFO_H 3 | 4 | #include "General.h" 5 | 6 | namespace at { 7 | 8 | template 9 | struct TypeInfo; 10 | 11 | template <> 12 | struct TypeInfo { 13 | using scalar_t = float; 14 | using precision_t = float; 15 | 16 | static const auto scalar_type = ScalarType::Float; 17 | static const auto type_id = TypeID::CPUFloat; 18 | }; 19 | 20 | template <> 21 | struct TypeInfo { 22 | using scalar_t = double; 23 | using precision_t = double; 24 | 25 | static const auto scalar_type = ScalarType::Double; 26 | static const auto type_id = TypeID::CPUDouble; 27 | }; 28 | 29 | template <> 30 | struct TypeInfo, Backend::CPU> { 31 | using scalar_t = std::complex; 32 | using precision_t = float; 33 | 34 | static const auto scalar_type = ScalarType::ComplexFloat; 35 | static const auto type_id = TypeID::CPUComplexFloat; 36 | }; 37 | 38 | template <> 39 | struct TypeInfo, Backend::CPU> { 40 | using scalar_t = std::complex; 41 | using precision_t = double; 42 | 43 | static const auto scalar_type = ScalarType::ComplexDouble; 44 | static const auto type_id = TypeID::CPUComplexDouble; 45 | }; 46 | 47 | 48 | template 49 | using CPUTypeInfo = TypeInfo; 50 | 51 | template 52 | using ComplexTypeInfo = TypeInfo, device>; 53 | 54 | template 55 | using CPUComplexTypeInfo = ComplexTypeInfo; 56 | 57 | } // at 58 | 59 | #endif // COMPLEX_TYPE_INFO_H 
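The `TypeInfo` specializations above are what let a single `CPUComplexType` template serve both precisions: each scalar type is mapped to its ATen `ScalarType` and `TypeID` at compile time. A minimal sketch of what the traits encode, written as hypothetical compile-time checks that assume this header and ATen are included (they are not part of the original file):

```cpp
#include <complex>
#include <type_traits>

// Hypothetical static checks spelling out the trait mapping defined above.
static_assert(std::is_same<at::CPUComplexTypeInfo<float>::scalar_t,
                           std::complex<float>>::value,
              "CPUComplexTypeInfo<float> describes std::complex<float> tensors");
static_assert(at::CPUComplexTypeInfo<double>::scalar_type
                  == at::ScalarType::ComplexDouble,
              "each precision maps to its own ATen ScalarType");
```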
-------------------------------------------------------------------------------- /src/General.h: -------------------------------------------------------------------------------- 1 | #ifndef GENERAL_H 2 | #define GENERAL_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | // #include "THTensorApply.h" 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include "ATen/Config.h" 30 | 31 | #endif // GENERAL_H 32 | -------------------------------------------------------------------------------- /src/SIMD/AVX.h: -------------------------------------------------------------------------------- 1 | #ifndef AVX_H 2 | #define AVX_H 3 | 4 | #if defined(__AVX__) 5 | #ifndef _MSC_VER 6 | #include 7 | #else 8 | #include 9 | #endif 10 | 11 | #include 12 | 13 | namespace simd { 14 | 15 | template 16 | struct AVX { 17 | static inline void copy(T *y, const T *x, const ptrdiff_t n); 18 | static inline void fill(T *z, const T c, const ptrdiff_t n); 19 | static inline void cdiv(T *z, const T *x, const T *y, const ptrdiff_t n); 20 | static inline void divs(T *z, const T *x, const T c, const ptrdiff_t n); 21 | static inline void cmul(T *z, const T *x, const T *y, const ptrdiff_t n); 22 | static inline void muls(T *z, const T *x, const T c, const ptrdiff_t n); 23 | static inline void cadd(T *z, const T *x, const T c, const ptrdiff_t n); 24 | }; 25 | 26 | } // simd 27 | 28 | #endif // defined(__AVX__) 29 | #endif // AVX_H -------------------------------------------------------------------------------- /src/SIMD/AVX2.h: -------------------------------------------------------------------------------- 1 | #ifndef AVX2_H 2 | #define AVX2_H 3 | 4 | #if defined(__AVX2__) 5 | #ifndef _MSC_VER 6 | #include 7 | #else 8 | #include 9 | #endif 10 | 11 | #include 12 | 13 | namespace simd { 14 | 15 | template 16 | struct AVX2 { 17 | static inline void copy(T *y, const T *x, const ptrdiff_t n); 18 | static inline void fill(T *z, const T c, const ptrdiff_t n); 19 | static inline void cdiv(T *z, const T *x, const T *y, const ptrdiff_t n); 20 | static inline void divs(T *z, const T *x, const T c, const ptrdiff_t n); 21 | static inline void cmul(T *z, const T *x, const T *y, const ptrdiff_t n); 22 | static inline void muls(T *z, const T *x, const T c, const ptrdiff_t n); 23 | static inline void cadd(T *z, const T *x, const T c, const ptrdiff_t n); 24 | }; 25 | 26 | } // simd 27 | 28 | #endif // defined(__AVX__) 29 | #endif // AVX2_H -------------------------------------------------------------------------------- /src/SIMD/Default.h: -------------------------------------------------------------------------------- 1 | #ifndef DEFAULT_H 2 | #define DEFAULT_H 3 | 4 | #include 5 | 6 | namespace simd { 7 | 8 | template 9 | struct Default { 10 | static inline void copy(T *y, const T *x, const ptrdiff_t n); 11 | static inline void fill(T *z, const T c, const ptrdiff_t n); 12 | static inline void cdiv(T *z, const T *x, const T *y, const ptrdiff_t n); 13 | static inline void divs(T *z, const T *x, const T c, const ptrdiff_t n); 14 | static inline void cmul(T *z, const T *x, const T *y, const ptrdiff_t n); 15 | static inline void muls(T *y, const T *x, const T c, const ptrdiff_t n); 16 | static inline void cadd(T *z, const T *x, const T *y, const T c, const ptrdiff_t n); 17 | static inline void adds(T *y, const T *x, const T c, const 
ptrdiff_t n); 18 | }; 19 | 20 | } // simd 21 | 22 | #include "DefaultImpl.h" 23 | 24 | #endif // DEFAULT_H 25 | -------------------------------------------------------------------------------- /src/SIMD/DefaultImpl.h: -------------------------------------------------------------------------------- 1 | #include "Default.h" 2 | 3 | namespace simd { 4 | 5 | template 6 | inline void Default::copy(T *y, const T *x, const ptrdiff_t n) { 7 | ptrdiff_t i = 0; 8 | 9 | for(; i 22 | inline void Default::fill(T *x, const T c, const ptrdiff_t n) { 23 | ptrdiff_t i = 0; 24 | 25 | for(; i 38 | inline void Default::cadd(T *z, const T *x, const T *y, const T c, const ptrdiff_t n) { 39 | ptrdiff_t i = 0; 40 | 41 | for(; i 54 | inline void Default::adds(T *y, const T *x, const T c, const ptrdiff_t n) { 55 | ptrdiff_t i = 0; 56 | 57 | for(; i 70 | inline void Default::cmul(T *z, const T *x, const T*y, const ptrdiff_t n) { 71 | ptrdiff_t i = 0; 72 | 73 | for(; i 86 | inline void Default::muls(T *y, const T *x, const T c, const ptrdiff_t n) 87 | { 88 | ptrdiff_t i = 0; 89 | 90 | for(; i 103 | inline void Default::cdiv(T *z, const T *x, const T *y, const ptrdiff_t n) 104 | { 105 | ptrdiff_t i = 0; 106 | 107 | for(; i 120 | inline void Default::divs(T *y, const T *x, const T c, const ptrdiff_t n) 121 | { 122 | ptrdiff_t i = 0; 123 | 124 | for(; idim(); TENSOR##_i++) \ 40 | TENSOR##_n *= TENSOR->size(TENSOR##_i); \ 41 | \ 42 | if(TENSOR->is_empty()) \ 43 | TH_TENSOR_APPLY_hasFinished = 1; \ 44 | else \ 45 | { \ 46 | TENSOR##_data = THTensor_getStoragePtr(TENSOR)->template data()+TENSOR->storage_offset(); \ 47 | TENSOR##_size = 1; \ 48 | TENSOR##_stride = 1; \ 49 | for(TENSOR##_i = THTensor_nDimensionLegacyAll(TENSOR)-1; TENSOR##_i >= 0; TENSOR##_i--) { \ 50 | if(THTensor_sizeLegacyNoScalars(TENSOR, TENSOR##_i) != 1) { \ 51 | if(THTensor_strideLegacyNoScalars(TENSOR, TENSOR##_i) == TENSOR##_size && TENSOR##_i != DIM) \ 52 | TENSOR##_size *= THTensor_sizeLegacyNoScalars(TENSOR, TENSOR##_i); \ 53 | else{ \ 54 | TENSOR##_contiguous = 0; \ 55 | break; \ 56 | } \ 57 | } \ 58 | } \ 59 | if (!TENSOR##_contiguous) { \ 60 | /* Find the dimension of contiguous sections */ \ 61 | TENSOR##_dim = 1; \ 62 | for(TENSOR##_i = THTensor_nDimensionLegacyAll(TENSOR)-2; TENSOR##_i >= 0; TENSOR##_i--) \ 63 | { \ 64 | if(TENSOR->stride(TENSOR##_i) != TENSOR->stride(TENSOR##_i+1) * TENSOR->size(TENSOR##_i+1) || TENSOR##_i == DIM || TENSOR##_i+1 == DIM) \ 65 | TENSOR##_dim++; \ 66 | } \ 67 | /* Allocate an array of 3*dim elements, where dim is the number of contiguous sections */ \ 68 | TENSOR##_counter = (int64_t*)THAlloc(sizeof(int64_t)*(3*TENSOR##_dim)); \ 69 | TENSOR##_sizes = TENSOR##_counter + TENSOR##_dim; \ 70 | TENSOR##_strides = TENSOR##_counter + 2*TENSOR##_dim; \ 71 | TH_TENSOR_dim_index = TENSOR##_dim-1; \ 72 | TENSOR##_dimOffset = (DIM == THTensor_nDimensionLegacyAll(TENSOR)-1) ? &TENSOR##_i : &TENSOR##_counter[DIM]; \ 73 | TENSOR##_sizes[TH_TENSOR_dim_index] = THTensor_sizeLegacyNoScalars(TENSOR, THTensor_nDimensionLegacyAll(TENSOR)-1); \ 74 | TENSOR##_strides[TH_TENSOR_dim_index] = THTensor_strideLegacyNoScalars(TENSOR, THTensor_nDimensionLegacyAll(TENSOR)-1); \ 75 | /* TENSOR##_counter tracks where we are in the storage. The offset into the */ \ 76 | /* storage is given by storage_offset + (i * j), where i is the stride */ \ 77 | /* vector and j is tensor_counter vector. This sets the starting position for the loop. 
*/ \ 78 | for(TENSOR##_i = TENSOR##_dim-1; TENSOR##_i >= 0; --TENSOR##_i) { \ 79 | TENSOR##_counter[TENSOR##_i] = 0; \ 80 | } \ 81 | for(TENSOR##_i = THTensor_nDimensionLegacyAll(TENSOR)-2; TENSOR##_i >= 0; --TENSOR##_i) { \ 82 | if (TENSOR->stride(TENSOR##_i) == TENSOR->stride(TENSOR##_i+1) * TENSOR->size(TENSOR##_i+1) && TENSOR##_i != DIM && TENSOR##_i+1 != DIM) { \ 83 | TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size(TENSOR##_i) * TENSOR##_sizes[TH_TENSOR_dim_index]; \ 84 | if (DIM != THTensor_nDimensionLegacyAll(TENSOR)-1 && TENSOR##_i < DIM) \ 85 | TENSOR##_dimOffset--; \ 86 | } else { \ 87 | --TH_TENSOR_dim_index; \ 88 | TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size(TENSOR##_i); \ 89 | TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride(TENSOR##_i); \ 90 | } \ 91 | } \ 92 | /* Size of the inner most section */ \ 93 | TENSOR##_size = TENSOR##_sizes[TENSOR##_dim-1]; \ 94 | /* Stride of the inner most section */ \ 95 | TENSOR##_stride = TENSOR##_strides[TENSOR##_dim-1]; \ 96 | } \ 97 | else{\ 98 | TENSOR##_dim = 1;\ 99 | TENSOR##_counter = (int64_t*)THAlloc(sizeof(int64_t)*3);\ 100 | TENSOR##_sizes = TENSOR##_counter + 1;\ 101 | TENSOR##_strides = TENSOR##_counter + 2;\ 102 | TENSOR##_sizes[0] = TENSOR##_n;\ 103 | TENSOR##_strides[0] = 1;\ 104 | TENSOR##_size = TENSOR##_sizes[0];\ 105 | TENSOR##_stride = TENSOR##_strides[0];\ 106 | }\ 107 | } \ 108 | TENSOR##_i = 0; 109 | 110 | #define __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR, ALWAYS_UPDATE) \ 111 | if(TENSOR##_i == TENSOR##_size || ALWAYS_UPDATE) \ 112 | { \ 113 | if(TENSOR##_contiguous) \ 114 | break; \ 115 | \ 116 | if(TENSOR##_dim == 1) \ 117 | break; \ 118 | \ 119 | /* Reset pointer to beginning of loop */ \ 120 | TENSOR##_data -= TENSOR##_size*TENSOR##_stride; \ 121 | for(TENSOR##_i = TENSOR##_dim-2; TENSOR##_i >= 0; TENSOR##_i--) \ 122 | { \ 123 | TENSOR##_counter[TENSOR##_i]++; \ 124 | /* Jump ahread by the stride of this dimension */ \ 125 | TENSOR##_data += TENSOR##_strides[TENSOR##_i]; \ 126 | \ 127 | if(TENSOR##_counter[TENSOR##_i] == TENSOR##_sizes[TENSOR##_i]) \ 128 | { \ 129 | if(TENSOR##_i == 0) \ 130 | { \ 131 | TH_TENSOR_APPLY_hasFinished = 1; \ 132 | break; \ 133 | } \ 134 | else \ 135 | { \ 136 | /* Reset the pointer to the beginning of the chunk defined by this dimension */ \ 137 | TENSOR##_data -= TENSOR##_counter[TENSOR##_i]*TENSOR##_strides[TENSOR##_i]; \ 138 | TENSOR##_counter[TENSOR##_i] = 0; \ 139 | } \ 140 | } \ 141 | else \ 142 | break; \ 143 | } \ 144 | TENSOR##_i = 0; \ 145 | } \ 146 | 147 | #define TH_TENSOR_APPLY3_D(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, DIM, CODE) \ 148 | { \ 149 | int TH_TENSOR_APPLY_hasFinished = 0; \ 150 | int64_t TH_TENSOR_dim_index = 0; \ 151 | __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, DIM, 1) \ 152 | __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \ 153 | __TH_TENSOR_APPLYX_PREAMBLE(TYPE3, TENSOR3, DIM, 1) \ 154 | \ 155 | int elements_equal = 1; \ 156 | if(TENSOR1##_n != TENSOR2##_n) { \ 157 | elements_equal = 0; \ 158 | } \ 159 | else if(TENSOR1##_n != TENSOR3##_n) { \ 160 | elements_equal = 0; \ 161 | } \ 162 | if (elements_equal == 0) { \ 163 | AT_ERROR("inconsistent tensor size, expected ", \ 164 | #TENSOR1, " ", TENSOR1->sizes(), ", ", \ 165 | #TENSOR2, " ", TENSOR2->sizes(), " and ", \ 166 | #TENSOR3, " ", TENSOR3->sizes(), " to have the same " \ 167 | "number of elements, but got ", TENSOR1##_n, ", ", \ 168 | TENSOR2##_n, " and ", TENSOR3##_n, " elements respectively"); \ 169 | } \ 170 | \ 171 | while(!TH_TENSOR_APPLY_hasFinished) \ 172 | { \ 173 
| /* Loop through the inner most region of the Tensor */ \ 174 | for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size && TENSOR3##_i < TENSOR3##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR3##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride, TENSOR3##_data += TENSOR3##_stride) /* 0 et pas TENSOR##_dim! */ \ 175 | { \ 176 | CODE \ 177 | } \ 178 | __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR1, 0) \ 179 | __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR2, 0) \ 180 | __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR3, 0) \ 181 | } \ 182 | if(TENSOR1##_counter != NULL) \ 183 | THFree(TENSOR1##_counter); \ 184 | if(TENSOR2##_counter != NULL) \ 185 | THFree(TENSOR2##_counter); \ 186 | if(TENSOR3##_counter != NULL) \ 187 | THFree(TENSOR3##_counter); \ 188 | } 189 | 190 | #define TH_TENSOR_APPLY3(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \ 191 | TH_TENSOR_APPLY3_D(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, -1, CODE) 192 | 193 | #define TH_TENSOR_APPLY2_D(TYPE1, TENSOR1, TYPE2, TENSOR2, DIM, CODE) \ 194 | { \ 195 | int TH_TENSOR_APPLY_hasFinished = 0; \ 196 | int64_t TH_TENSOR_dim_index = 0; \ 197 | __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, DIM, 1) \ 198 | __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \ 199 | \ 200 | if(TENSOR1##_n != TENSOR2##_n) { \ 201 | AT_ERROR("inconsistent tensor size, expected ", \ 202 | #TENSOR1, " ", TENSOR1->sizes(), " and ", \ 203 | #TENSOR2, " ", TENSOR2->sizes(), \ 204 | " to have the same number of elements, but got ", \ 205 | TENSOR1##_n, " and ", TENSOR2##_n, " elements respectively"); \ 206 | } \ 207 | while(!TH_TENSOR_APPLY_hasFinished) \ 208 | { \ 209 | /* Loop through the inner most region of the Tensor */ \ 210 | for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride) /* 0 et pas TENSOR##_dim! */ \ 211 | { \ 212 | CODE \ 213 | } \ 214 | __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR1, 0) \ 215 | __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR2, 0) \ 216 | } \ 217 | if(TENSOR1##_counter != NULL) \ 218 | THFree(TENSOR1##_counter); \ 219 | if(TENSOR2##_counter != NULL) \ 220 | THFree(TENSOR2##_counter); \ 221 | } 222 | 223 | #define TH_TENSOR_APPLY2(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \ 224 | TH_TENSOR_APPLY2_D(TYPE1, TENSOR1, TYPE2, TENSOR2, -1, CODE) 225 | 226 | #define TH_TENSOR_APPLY_D(TYPE, TENSOR, DIM, CODE) \ 227 | { \ 228 | int TH_TENSOR_APPLY_hasFinished = 0; \ 229 | int64_t TH_TENSOR_dim_index = 0; \ 230 | __TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, DIM, 0) \ 231 | \ 232 | while(!TH_TENSOR_APPLY_hasFinished) \ 233 | { \ 234 | /* Loop through the inner most region of the Tensor */ \ 235 | for(; TENSOR##_i < TENSOR##_size; TENSOR##_i++, TENSOR##_data += TENSOR##_stride) /* 0 et pas TENSOR##_dim! */ \ 236 | { \ 237 | CODE \ 238 | } \ 239 | __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR, 1) \ 240 | } \ 241 | THFree(TENSOR##_counter); \ 242 | } 243 | 244 | #define TH_TENSOR_APPLY(TYPE, TENSOR, CODE) \ 245 | TH_TENSOR_APPLY_D(TYPE, TENSOR, -1, CODE) 246 | 247 | 248 | #ifdef _OPENMP 249 | 250 | #ifndef _WIN32 251 | #define PRAGMA(P) _Pragma(#P) 252 | #else 253 | #define PRAGMA(P) __pragma(P) 254 | #endif 255 | 256 | #include 257 | 258 | /* 259 | * Calcuate the memory offset of an element in a tensor. The strategy is below: 260 | * 261 | * 1. convert the line index(the index of the element) to the indexs(coordinates) in the tensor. 
262 | * It can hinted by a classical problem: Getting each individual digit from a whole integer(Decimal base). 263 | * A N-digit decimal base number could be view as a N-dimension tensor and the sizes of the tensor are 10. 264 | * So the value the whole interger is the line index. And the digits could be viewed as the indexes in 265 | * different dimentions. 266 | * 267 | * 2. convert the indexs(coordinates) in the tensor to the memory offset. 268 | * 269 | * You can get the detailes in the for-statement iterations. 270 | * 271 | * The macro is only used in the first element in each thread. For the rest, the memory offset could update 272 | * according to info of the tensor in order to get better performance. So we should also record the each 273 | * indexs in coresponding dimension of first element. 274 | * The recorded info is stored in the TENSOR##_counter_tmp. 275 | * 276 | */ 277 | #define __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR) \ 278 | int64_t *TENSOR##_counter_tmp = (int64_t*)THAlloc(sizeof(int64_t) * TENSOR##_dim); \ 279 | ptrdiff_t TENSOR##_memory_offset = 0; \ 280 | ptrdiff_t TENSOR##_quot = line_index_start; \ 281 | for (TENSOR##_i = TENSOR##_dim-1; TENSOR##_i>=0; --TENSOR##_i) { \ 282 | TENSOR##_counter_tmp[TENSOR##_i] = TENSOR##_quot%TENSOR##_sizes[TENSOR##_i]; \ 283 | TENSOR##_quot /= TENSOR##_sizes[TENSOR##_i]; \ 284 | TENSOR##_memory_offset += TENSOR##_counter_tmp[TENSOR##_i] * TENSOR##_strides[TENSOR##_i]; \ 285 | } 286 | 287 | /* 288 | * The macro update the indexes in each dimension of the elements except for the first one allocated in 289 | * each thread. 290 | * For a tensor, if the index of some dimension reaches the size of the corresponding dimension. It will carry and clear. 291 | * If the index of next high dimension does do, the index of next high dimension should carry and clear, too. 292 | * 293 | * The momery offset calculatation is a little confusing. If current index carries, the current index is set to 0. So 294 | * the offset should decrease by size*stride of the last dimension. Then the index next high dimension increases by 1. So 295 | * the offset should increase by stride of next high dimension. 
296 | */ 297 | #define __TH_TENSOR_APPLYX_UPDATE_COUNTERS_OMP(TENSOR) \ 298 | if(TENSOR##_i == TENSOR##_size && TENSOR##_dim > 1){ /*reaches the edge*/ \ 299 | int TENSOR##_carry_coord = 1; /*set carry flag to true*/ \ 300 | TENSOR##_start = 0; /*the current index be cleared to 0*/\ 301 | TENSOR##_data -= TENSOR##_size * TENSOR##_stride; /*the momery offset reset to the first one in current dimension */\ 302 | for(TENSOR##_i = TENSOR##_dim - 2; (TENSOR##_i >= 0) && (TENSOR##_carry_coord); TENSOR##_i--){ \ 303 | TENSOR##_counter_tmp[TENSOR##_i]++; /*the index of next high dimension update*/ \ 304 | TENSOR##_data += TENSOR##_strides[TENSOR##_i]; /*memory offset increase by stride of next high dimension*/\ 305 | if(TENSOR##_counter_tmp[TENSOR##_i] == TENSOR##_sizes[TENSOR##_i]){ /*The next high dimension also carry, continue 306 | to clear and carry*/\ 307 | TENSOR##_data -= TENSOR##_sizes[TENSOR##_i] * TENSOR##_strides[TENSOR##_i]; \ 308 | TENSOR##_counter_tmp[TENSOR##_i] = 0; \ 309 | } else { \ 310 | TENSOR##_carry_coord = 0; \ 311 | } \ 312 | } \ 313 | } else { \ 314 | TENSOR##_start = TENSOR##_i; \ 315 | } 316 | 317 | 318 | #define TH_TENSOR_APPLY_REDUCTION_OMP(TYPE, TENSOR, OPERATION, CODE, OMP_THRESHOLD) \ 319 | {\ 320 | int TENSOR##Contg = THTensor_(isContiguous)(TENSOR); \ 321 | ptrdiff_t TENSOR##Size = THTensor_(nElement)(TENSOR); \ 322 | if(TENSOR##Contg){ \ 323 | ptrdiff_t iter = 0; \ 324 | TYPE *rp = THTensor_getStoragePtr(TENSOR)->template data()+TENSOR->storage_offset(); \ 325 | PRAGMA( omp parallel for if (TENSOR##Size > OMP_THRESHOLD * 10) firstprivate(rp) reduction(OPERATION) ) \ 326 | for (iter = 0; iter < TENSOR##Size; iter++) { \ 327 | TYPE *TENSOR##_data = rp+iter; \ 328 | CODE \ 329 | } \ 330 | } else { \ 331 | int TH_TENSOR_APPLY_hasFinished = 0; \ 332 | int64_t TH_TENSOR_dim_index = 0; \ 333 | __TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, -1, 1);\ 334 | if (0 == TH_TENSOR_APPLY_hasFinished) { \ 335 | PRAGMA(omp parallel if (TENSOR##Size > OMP_THRESHOLD) firstprivate(TENSOR##_data, TENSOR##_sizes, TENSOR##_strides, TENSOR##_dim, TENSOR##_stride, TENSOR##_size, TENSOR##_i) reduction(OPERATION))\ 336 | {\ 337 | size_t num_threads = omp_get_num_threads();\ 338 | size_t tid = omp_get_thread_num();\ 339 | size_t line_seg_length_avg = TENSOR##Size/num_threads; \ 340 | ptrdiff_t line_index_start = tid * line_seg_length_avg; \ 341 | ptrdiff_t line_seg_length = (tid == num_threads - 1)? 
(TENSOR##Size - line_index_start):line_seg_length_avg; \ 342 | __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR);\ 343 | TENSOR##_data += TENSOR##_memory_offset;\ 344 | ptrdiff_t count = 0;\ 345 | ptrdiff_t TENSOR##_start = TENSOR##_counter_tmp[TENSOR##_dim - 1];\ 346 | while(count < line_seg_length){\ 347 | for(TENSOR##_i=TENSOR##_start; (count < line_seg_length)&&(TENSOR##_i < TENSOR##_size); ++TENSOR##_i, ++count){\ 348 | CODE\ 349 | TENSOR##_data += TENSOR##_stride;\ 350 | }\ 351 | if(count < line_seg_length){\ 352 | __TH_TENSOR_APPLYX_UPDATE_COUNTERS_OMP(TENSOR);\ 353 | }\ 354 | }\ 355 | if(TENSOR##_counter_tmp != NULL) \ 356 | THFree(TENSOR##_counter_tmp); \ 357 | }\ 358 | }\ 359 | if(TENSOR##_counter != NULL)\ 360 | THFree(TENSOR##_counter);\ 361 | }\ 362 | } 363 | 364 | #define TH_TENSOR_APPLY2_OMP(SIZE, CONTIG1, CONTIG2, TYPE1, TENSOR1, TYPE2, TENSOR2, CODE, OMP_THRESHOLD) \ 365 | { \ 366 | /* for advanced searching index*/ \ 367 | if( CONTIG1 && CONTIG2 ){ \ 368 | TYPE1 *rp = THTensor_getStoragePtr(TENSOR1)->template data()+TENSOR1->storage_offset(); \ 369 | TYPE2 *tp = THTensor_getStoragePtr(TENSOR2)->template data()+TENSOR2->storage_offset(); \ 370 | ptrdiff_t iter = 0; \ 371 | if(tp != (TYPE2*)rp) { \ 372 | PRAGMA(ivdep) \ 373 | PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) firstprivate(rp, tp)) \ 374 | for (iter = 0; iter < SIZE; iter++) { \ 375 | TYPE2 *TENSOR2##_data = tp+iter; \ 376 | TYPE1 *TENSOR1##_data = rp+iter; \ 377 | CODE \ 378 | }\ 379 | } else {\ 380 | PRAGMA(simd) \ 381 | PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) firstprivate(rp, tp) ) \ 382 | for (iter = 0; iter < SIZE; iter++) {\ 383 | TYPE2* TENSOR2##_data = tp+iter;\ 384 | TYPE1* TENSOR1##_data = rp+iter;\ 385 | CODE \ 386 | }\ 387 | }\ 388 | } else { \ 389 | /* The following strategy is not easy to understand. 390 | * 1. Collapse the dimension of the tensors in order to decrease the number of nested loops. 391 | * 2. Calculate the numbers of elements allocated in each thread and the line index of the first one. 392 | * 3. Calculate the memory offset of the first element and the indexes in each dimension of the 393 | * first one. 394 | * 4. iterate all elements in each thread. update the indexes in each dimension of the rest. 395 | */ \ 396 | int TH_TENSOR_APPLY_hasFinished = 0; \ 397 | int64_t TH_TENSOR_dim_index = 0; \ 398 | /*step 1*/ \ 399 | __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, -1, 1) \ 400 | __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, -1, 1) \ 401 | if (0 == TH_TENSOR_APPLY_hasFinished) { \ 402 | PRAGMA(omp parallel if (SIZE > OMP_THRESHOLD) firstprivate(TENSOR2##_data, TENSOR2##_sizes, TENSOR2##_strides, TENSOR2##_dim, TENSOR2##_stride, TENSOR2##_size, TENSOR2##_i, TENSOR1##_data, TENSOR1##_sizes, TENSOR1##_strides, TENSOR1##_dim, TENSOR1##_stride, TENSOR1##_size, TENSOR1##_i)) \ 403 | { \ 404 | /*step 2*/ \ 405 | size_t num_threads = omp_get_num_threads(); \ 406 | size_t tid = omp_get_thread_num(); \ 407 | size_t line_seg_length_avg = SIZE/num_threads; \ 408 | ptrdiff_t line_index_start = tid * line_seg_length_avg; \ 409 | ptrdiff_t line_seg_length = (tid == num_threads - 1)? 
(SIZE - line_index_start):line_seg_length_avg; \ 410 | /* step 3*/ \ 411 | __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR2); \ 412 | __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR1); \ 413 | TENSOR2##_data += TENSOR2##_memory_offset; \ 414 | TENSOR1##_data += TENSOR1##_memory_offset; \ 415 | ptrdiff_t count = 0; \ 416 | ptrdiff_t TENSOR2##_start = TENSOR2##_counter_tmp[TENSOR2##_dim-1]; \ 417 | ptrdiff_t TENSOR1##_start = TENSOR1##_counter_tmp[TENSOR1##_dim-1]; \ 418 | /* step 4*/ \ 419 | while (count < line_seg_length) { \ 420 | for(TENSOR2##_i=TENSOR2##_start, TENSOR1##_i = TENSOR1##_start; ((count < line_seg_length) && (TENSOR2##_i < TENSOR2##_size) && (TENSOR1##_i < TENSOR1##_size)); ++TENSOR2##_i, ++TENSOR1##_i, ++count){ \ 421 | CODE \ 422 | TENSOR2##_data += TENSOR2##_stride; \ 423 | TENSOR1##_data += TENSOR1##_stride; \ 424 | } \ 425 | if (count < line_seg_length){ \ 426 | __TH_TENSOR_APPLYX_UPDATE_COUNTERS_OMP(TENSOR2); \ 427 | __TH_TENSOR_APPLYX_UPDATE_COUNTERS_OMP(TENSOR1); \ 428 | } \ 429 | } \ 430 | if(TENSOR1##_counter_tmp != NULL) \ 431 | THFree(TENSOR1##_counter_tmp); \ 432 | if(TENSOR2##_counter_tmp != NULL) \ 433 | THFree(TENSOR2##_counter_tmp); \ 434 | } \ 435 | } \ 436 | if(TENSOR2##_counter != NULL) \ 437 | THFree(TENSOR2##_counter); \ 438 | if(TENSOR1##_counter != NULL) \ 439 | THFree(TENSOR1##_counter);\ 440 | }\ 441 | } 442 | 443 | #define TH_TENSOR_APPLY3_OMP(SIZE, CONTIG1, CONTIG2, CONTIG3, TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE, OMP_THRESHOLD) \ 444 | { \ 445 | /* for advanced searching index*/ \ 446 | if(CONTIG1 && CONTIG2 && CONTIG3){ \ 447 | TYPE1 *rp = THTensor_getStoragePtr(TENSOR1)->template data<TYPE1>()+TENSOR1->storage_offset(); \ 448 | TYPE2 *tp = THTensor_getStoragePtr(TENSOR2)->template data<TYPE2>()+TENSOR2->storage_offset(); \ 449 | TYPE3 *srcp = THTensor_getStoragePtr(TENSOR3)->template data<TYPE3>()+TENSOR3->storage_offset(); \ 450 | ptrdiff_t iter = 0;\ 451 | if(tp != (TYPE2*)rp) { \ 452 | PRAGMA(ivdep) \ 453 | PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) ) \ 454 | for (iter = 0; iter < SIZE; iter++) {\ 455 | TYPE1 *TENSOR1##_data = rp+iter;\ 456 | TYPE2 *TENSOR2##_data = tp+iter; \ 457 | TYPE3 *TENSOR3##_data = srcp+iter;\ 458 | CODE \ 459 | } \ 460 | } else {\ 461 | PRAGMA(simd) \ 462 | PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) ) \ 463 | for (iter = 0; iter < SIZE; iter++) {\ 464 | TYPE1 *TENSOR1##_data = rp+iter;\ 465 | TYPE2 *TENSOR2##_data = tp+iter; \ 466 | TYPE3 *TENSOR3##_data = srcp+iter;\ 467 | CODE \ 468 | } \ 469 | }\ 470 | } else{ \ 471 | int TH_TENSOR_APPLY_hasFinished = 0;\ 472 | int64_t TH_TENSOR_dim_index = 0;\ 473 | __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, -1, 1) \ 474 | __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, -1, 1) \ 475 | __TH_TENSOR_APPLYX_PREAMBLE(TYPE3, TENSOR3, -1, 1) \ 476 | if (0 == TH_TENSOR_APPLY_hasFinished) { \ 477 | PRAGMA(omp parallel if (SIZE > OMP_THRESHOLD) firstprivate(TENSOR1##_data, TENSOR1##_sizes, TENSOR1##_strides, TENSOR1##_dim, TENSOR1##_stride, TENSOR1##_size, TENSOR1##_i, TENSOR2##_data, TENSOR2##_sizes, TENSOR2##_strides, TENSOR2##_dim, TENSOR2##_stride, TENSOR2##_size, TENSOR2##_i, TENSOR3##_data, TENSOR3##_sizes, TENSOR3##_strides, TENSOR3##_dim, TENSOR3##_stride, TENSOR3##_size, TENSOR3##_i))\ 478 | {\ 479 | size_t num_threads = omp_get_num_threads();\ 480 | size_t tid = omp_get_thread_num();\ 481 | size_t line_seg_length_avg = SIZE/num_threads; \ 482 | ptrdiff_t line_index_start = tid * line_seg_length_avg; \ 483 | ptrdiff_t line_seg_length = (tid == num_threads - 1)?
(SIZE - line_index_start):line_seg_length_avg; \ 484 | __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR1);\ 485 | __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR2);\ 486 | __TH_TENSOR_APPLYX_CAL_MEMORY_OFFSET(TENSOR3);\ 487 | TENSOR1##_data += TENSOR1##_memory_offset;\ 488 | TENSOR2##_data += TENSOR2##_memory_offset;\ 489 | TENSOR3##_data += TENSOR3##_memory_offset;\ 490 | ptrdiff_t count = 0;\ 491 | ptrdiff_t TENSOR1##_start = TENSOR1##_counter_tmp[TENSOR1##_dim - 1];\ 492 | ptrdiff_t TENSOR2##_start = TENSOR2##_counter_tmp[TENSOR2##_dim - 1];\ 493 | ptrdiff_t TENSOR3##_start = TENSOR3##_counter_tmp[TENSOR3##_dim - 1];\ 494 | while(count < line_seg_length){\ 495 | for(TENSOR1##_i=TENSOR1##_start, TENSOR2##_i=TENSOR2##_start,TENSOR3##_i=TENSOR3##_start; (count < line_seg_length)&&(TENSOR1##_i < TENSOR1##_size)&&(TENSOR2##_i < TENSOR2##_size)&&(TENSOR3##_i < TENSOR3##_size); ++TENSOR1##_i, ++TENSOR2##_i, ++TENSOR3##_i, ++count){\ 496 | CODE\ 497 | TENSOR1##_data += TENSOR1##_stride;\ 498 | TENSOR2##_data += TENSOR2##_stride;\ 499 | TENSOR3##_data += TENSOR3##_stride;\ 500 | }\ 501 | if(count < line_seg_length){\ 502 | __TH_TENSOR_APPLYX_UPDATE_COUNTERS_OMP(TENSOR1);\ 503 | __TH_TENSOR_APPLYX_UPDATE_COUNTERS_OMP(TENSOR2);\ 504 | __TH_TENSOR_APPLYX_UPDATE_COUNTERS_OMP(TENSOR3);\ 505 | }\ 506 | }\ 507 | if(TENSOR1##_counter_tmp != NULL) \ 508 | THFree(TENSOR1##_counter_tmp); \ 509 | if(TENSOR2##_counter_tmp != NULL) \ 510 | THFree(TENSOR2##_counter_tmp); \ 511 | if(TENSOR3##_counter_tmp != NULL) \ 512 | THFree(TENSOR3##_counter_tmp); \ 513 | }\ 514 | }\ 515 | if(TENSOR1##_counter != NULL)\ 516 | THFree(TENSOR1##_counter);\ 517 | if(TENSOR2##_counter != NULL)\ 518 | THFree(TENSOR2##_counter);\ 519 | if(TENSOR3##_counter != NULL)\ 520 | THFree(TENSOR3##_counter);\ 521 | }\ 522 | } -------------------------------------------------------------------------------- /src/Utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H 2 | #define UTILS_H 3 | 4 | namespace at { 5 | 6 | 7 | 8 | 9 | std::vector<int64_t> calculate_contiguous_stride(IntList sizes) { 10 | std::vector<int64_t> strides(sizes.size()); 11 | int ndim = sizes.size(); 12 | 13 | for (int d = ndim - 1; d >= 0; d--) 14 | { 15 | if (d == ndim - 1) { 16 | strides[d] = 1; 17 | } 18 | else { 19 | strides[d] = std::max<int64_t>(sizes[d+1], 1) * strides[d+1]; 20 | } 21 | } 22 | return strides; 23 | } 24 | 25 | // Maybe someone wants to move this into Tensor/TensorImpl? 26 | bool is_transposed(const TensorImpl *self) { 27 | int64_t max_stride = 1; 28 | int64_t size_max_stride = 1; 29 | int64_t z = 1; 30 | int d; 31 | for (d = 0; d < self->dim(); ++d) { 32 | if (self->stride(d) == 0 && self->size(d) != 1) 33 | return false; 34 | if (self->stride(d) > max_stride) { 35 | max_stride = self->stride(d); 36 | size_max_stride = self->size(d); 37 | } 38 | z *= self->size(d); 39 | } 40 | if (z == max_stride * size_max_stride) { 41 | return true; 42 | } 43 | return false; 44 | } 45 | 46 | } 47 | 48 | #endif // UTILS_H -------------------------------------------------------------------------------- /src/module.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/extension.h> 2 | #include "CPUComplexType.h" 3 | 4 | namespace at { 5 | 6 | struct ComplexHooks : public at::ComplexHooksInterface { 7 | ComplexHooks(ComplexHooksArgs) {}; 8 | void registerComplexTypes(Context* context) const override { 9 | context->registerType(Backend::CPU, CPUComplexTypeInfo<float>::scalar_type, new CPUComplexType<float>()); 10 | context->registerType(Backend::CPU, CPUComplexTypeInfo<double>::scalar_type, new CPUComplexType<double>()); 11 | } 12 | }; 13 | 14 | 15 | REGISTER_COMPLEX_HOOKS(ComplexHooks); 16 | 17 | } 18 | 19 | // create the extension module 20 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 21 | // m.def("cpptest", &cpptest, "cpp test"); 22 | } 23 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from torch_complex import torch 2 | 3 | a = torch.ones(2, 2, dtype=torch.complex128) 4 | # b = torch.ones(2, 2, dtype=torch.complex128) 5 | 6 | # torch.matmul(a, b) 7 | 2j * a 8 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Roger-luo/pytorch-complex/9c52991228cbacb1519e22d54ede164d9ef63f3a/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_tensor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from torch_complex import torch 3 | 4 | class TestComplexTensor(unittest.TestCase): 5 | 6 | def test_empty(self): 7 | torch.empty(2, 2,
dtype=torch.complex64) 8 | torch.empty(2, 2, dtype=torch.complex128) 9 | 10 | def test_indexing(self): 11 | t = torch.empty(2, 2, dtype=torch.complex128) 12 | t[1] 13 | t[1, 1] 14 | 15 | def test_fill(self): 16 | t = torch.empty(2, 2, dtype=torch.complex128) 17 | t.fill_(1.0) 18 | t.fill_(1.0 + 2.0j) 19 | 20 | def test_scalar_binary_op(self): 21 | a = torch.ones(2, 2, dtype=torch.complex128) 22 | 2 * a 23 | 2 / a 24 | 2 - a 25 | 2 + a 26 | 27 | def test_blas(self): 28 | pass 29 | 30 | def test_rand(self): 31 | pass 32 | 33 | if __name__ == '__main__': 34 | unittest.main() 35 | -------------------------------------------------------------------------------- /torch_complex/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.0.1' 2 | 3 | import torch 4 | import torch_complex.cpp 5 | # import importlib 6 | 7 | # # pretend to be pytorch 8 | # __globals__ = globals() 9 | # for each in dir(torch): 10 | # __globals__[each] = getattr(torch, each) 11 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | py27, 4 | py35, 5 | py36, 6 | pypy, 7 | pypy3, 8 | 9 | [testenv] 10 | passenv = * 11 | deps = 12 | coverage 13 | pytest 14 | commands = 15 | python setup.py --quiet clean develop 16 | coverage run --parallel-mode -m pytest 17 | coverage combine --append 18 | coverage report -m 19 | --------------------------------------------------------------------------------
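
The apply macros in src/THTensorApply.h are easiest to follow through their contiguous fast path, so here is a minimal, self-contained C++/OpenMP sketch of that path only. It is not part of the repository: the macro name SIMPLE_APPLY2_OMP, the pointer names x_data/y_data, and the threshold value are invented for illustration, and PRAGMA is assumed here to be the usual _Pragma-stringizing helper; only the overall shape (a per-iteration *_data pointer for each operand and an OpenMP `if` clause that keeps small workloads serial) mirrors TH_TENSOR_APPLY2_OMP.

    // Illustrative sketch only (hypothetical names); compile with: g++ -fopenmp sketch.cpp
    #include <complex>
    #include <cstddef>
    #include <vector>

    // Assumed definition of the PRAGMA helper used by the macros above:
    // stringize the already-substituted directive so macro parameters appear
    // inside the pragma text.
    #define PRAGMA(P) _Pragma(#P)

    // Hypothetical, contiguous-only analogue of TH_TENSOR_APPLY2_OMP: expose
    // one element of each buffer per iteration and let OpenMP split the flat
    // index range across threads.
    #define SIMPLE_APPLY2_OMP(SIZE, TYPE1, PTR1, TYPE2, PTR2, CODE, OMP_THRESHOLD) \
      { \
        std::ptrdiff_t iter = 0; \
        PRAGMA(omp parallel for if (SIZE > OMP_THRESHOLD)) \
        for (iter = 0; iter < SIZE; iter++) { \
          TYPE1 *x_data = PTR1 + iter; \
          TYPE2 *y_data = PTR2 + iter; \
          CODE \
        } \
      }

    int main() {
      const std::ptrdiff_t n = 1 << 20;
      std::vector<std::complex<double>> x(n), y(n, std::complex<double>(1.0, 2.0));
      // Element-wise copy: CODE sees x_data / y_data the same way the CODE
      // blocks in the real macros see TENSOR##_data.
      SIMPLE_APPLY2_OMP(n, std::complex<double>, x.data(), std::complex<double>,
                        y.data(), *x_data = *y_data;, 1000)
      return 0;
    }

The `if (SIZE > OMP_THRESHOLD)` clause plays the same role as the `OMP_THRESHOLD * 10` guards in the real macros: below the threshold the loop runs serially, so small tensors do not pay the cost of spinning up a thread team.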