├── meson.build
├── README.md
├── FFTSpectrum.vcxproj.filters
├── FFTSpectrum.sln
├── .github
└── workflows
│ └── build.yaml
├── .gitattributes
├── .gitignore
├── FFTSpectrum.vcxproj
├── FFTSpectrum.c
├── COPYING
└── sse_mathfun.h
/meson.build:
--------------------------------------------------------------------------------
# Build definition for the FFTSpectrum VapourSynth plugin.
project('FFTSpectrum', 'c',
  default_options : ['buildtype=release', 'b_ndebug=if-release', 'c_std=c99'],
  meson_version : '>=0.49.0',
  version : '2'
)

sources = 'FFTSpectrum.c'

compiler = meson.get_compiler('c')

# MSVC builds (the CI workflow) resolve FFTW via the 'fftwf' pkg-config name
# and do not install anywhere meaningful; other toolchains use 'fftw3f' and
# install next to the user's other VapourSynth plugins.
if compiler.get_argument_syntax() == 'msvc'
  deps = [ dependency('fftwf') ]
  install_dir = 'installed' # dummy
else
  # Only compile args/includes are taken from VapourSynth: the plugin must
  # not link against libvapoursynth itself.
  vapoursynth_dep = dependency('vapoursynth').partial_dependency(compile_args : true, includes : true)
  deps = [ dependency('fftw3f'), vapoursynth_dep ]
  install_dir = join_paths(vapoursynth_dep.get_pkgconfig_variable('libdir'), 'vapoursynth')
endif

shared_module('fftspectrum', sources,
  dependencies : deps,
  install : true,
  install_dir : install_dir,
  gnu_symbol_visibility : 'hidden'
)
26 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | FFTSpectrum
2 | ===========
3 |
4 | A VapourSynth filter that displays the FFT frequency spectrum of a given clip.
5 | Supposedly useful for determining the original resolution of upscaled anime content.
6 |
7 | Usage
8 | -----
9 |
10 | fftspectrum.FFTSpectrum(clip clip, bint grid=False)
11 |
12 | * **clip** - Clip to process. It must have constant format and dimensions, and a luma plane with 8-bit integer samples.
13 | * **grid** - Specifies whether a grid with origin at the center of the image and spacing of 100 pixels should be drawn over the resulting spectrum.
14 |
15 | Examples
16 | --------
17 |
18 | Without grid:
19 | 
20 |
21 | With grid:
22 | 
23 |
24 | Credits
25 | -------
26 |
27 | FFTSpectrum is based on the AviUtl filter with the same name, written by Hiroaki Gotou in 2008.
28 |
--------------------------------------------------------------------------------
/FFTSpectrum.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx
7 |
8 |
9 | {93995380-89BD-4b04-88EB-625FBE52EBFB}
10 | h;hh;hpp;hxx;hm;inl;inc;xsd
11 |
12 |
13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav
15 |
16 |
17 |
18 |
19 | Source Files
20 |
21 |
22 |
23 |
24 | Resource Files
25 |
26 |
27 |
--------------------------------------------------------------------------------
/FFTSpectrum.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 15
4 | VisualStudioVersion = 15.0.28307.271
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "FFTSpectrum", "FFTSpectrum.vcxproj", "{FE7D4925-2B0B-4B0F-B63E-E0EBE9DDFE97}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|x64 = Debug|x64
11 | Debug|x86 = Debug|x86
12 | Release|x64 = Release|x64
13 | Release|x86 = Release|x86
14 | EndGlobalSection
15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
16 | {FE7D4925-2B0B-4B0F-B63E-E0EBE9DDFE97}.Debug|x64.ActiveCfg = Debug|x64
17 | {FE7D4925-2B0B-4B0F-B63E-E0EBE9DDFE97}.Debug|x64.Build.0 = Debug|x64
18 | {FE7D4925-2B0B-4B0F-B63E-E0EBE9DDFE97}.Debug|x86.ActiveCfg = Debug|Win32
19 | {FE7D4925-2B0B-4B0F-B63E-E0EBE9DDFE97}.Debug|x86.Build.0 = Debug|Win32
20 | {FE7D4925-2B0B-4B0F-B63E-E0EBE9DDFE97}.Release|x64.ActiveCfg = Release|x64
21 | {FE7D4925-2B0B-4B0F-B63E-E0EBE9DDFE97}.Release|x64.Build.0 = Release|x64
22 | {FE7D4925-2B0B-4B0F-B63E-E0EBE9DDFE97}.Release|x86.ActiveCfg = Release|Win32
23 | {FE7D4925-2B0B-4B0F-B63E-E0EBE9DDFE97}.Release|x86.Build.0 = Release|Win32
24 | EndGlobalSection
25 | GlobalSection(SolutionProperties) = preSolution
26 | HideSolutionNode = FALSE
27 | EndGlobalSection
28 | GlobalSection(ExtensibilityGlobals) = postSolution
29 | SolutionGuid = {7698D0B8-4303-4929-9A07-FA78A512652F}
30 | EndGlobalSection
31 | EndGlobal
32 |
--------------------------------------------------------------------------------
/.github/workflows/build.yaml:
--------------------------------------------------------------------------------
# CI: build the plugin for 64-bit Windows with MSVC + Meson, linking a
# static FFTW (AVX2-enabled) supplied by vcpkg.
# NOTE(review): leading indentation was lost in this dump; restored to the
# conventional GitHub Actions layout — confirm against the original file.
name: Build for Windows

on: [push, pull_request]

jobs:
  build:
    strategy:
      matrix:
        platform: [ windows-latest ]
        arch: [ x64 ]
    runs-on: ${{ matrix.platform }}
    steps:
    - uses: actions/checkout@v2
      with:
        submodules: 'recursive'

    - name: Run vcpkg
      uses: lukka/run-vcpkg@v4
      with:
        vcpkgArguments: 'fftw3[avx2]:x64-windows-static'
        vcpkgDirectory: '${{ github.workspace }}/vcpkg'
        vcpkgGitCommitId: 5568f110b509a9fd90711978a7cb76bae75bb092 # 2021.05.12 release

    - name: Setup Python
      uses: actions/setup-python@v1
      with:
        python-version: '3.x'
    - name: install meson and ninja
      run: pip install meson ninja

    # FFTSpectrum.c includes "vapoursynth/VapourSynth.h", so the headers are
    # copied up one level to satisfy that include path.
    - name: download VS headers and patch header location
      shell: bash
      run: |
        git clone https://github.com/vapoursynth/vapoursynth --depth=1 --branch R54
        cp vapoursynth/include/*.h vapoursynth/

    - name: setup MS dev commands
      uses: ilammy/msvc-dev-cmd@v1
      with:
        arch: ${{ matrix.arch }}
    - name: Install pkg-config lite
      run: choco install pkgconfiglite
    - name: Meson setup
      run: meson setup builddir/ -Db_vscrt=mt -Dpkg_config_path=${{ github.workspace }}/vcpkg/installed/x64-windows-static/lib/pkgconfig
    - name: Meson compile
      run: meson compile -C builddir/ -v
    - name: Upload artifact
      uses: actions/upload-artifact@v2
      with:
        name: release-${{matrix.arch}}
        path: |
          builddir/*.dll
53 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # Set default behavior to automatically normalize line endings.
3 | ###############################################################################
4 | * text=auto
5 |
6 | ###############################################################################
7 | # Set default behavior for command prompt diff.
8 | #
9 | # This is need for earlier builds of msysgit that does not have it on by
10 | # default for csharp files.
11 | # Note: This is only used by command line
12 | ###############################################################################
13 | #*.cs diff=csharp
14 |
15 | ###############################################################################
16 | # Set the merge driver for project and solution files
17 | #
18 | # Merging from the command prompt will add diff markers to the files if there
19 | # are conflicts (Merging from VS is not affected by the settings below, in VS
20 | # the diff markers are never inserted). Diff markers may cause the following
21 | # file extensions to fail to load in VS. An alternative would be to treat
22 | # these files as binary and thus will always conflict and require user
23 | # intervention with every merge. To do so, just uncomment the entries below
24 | ###############################################################################
25 | #*.sln merge=binary
26 | #*.csproj merge=binary
27 | #*.vbproj merge=binary
28 | #*.vcxproj merge=binary
29 | #*.vcproj merge=binary
30 | #*.dbproj merge=binary
31 | #*.fsproj merge=binary
32 | #*.lsproj merge=binary
33 | #*.wixproj merge=binary
34 | #*.modelproj merge=binary
35 | #*.sqlproj merge=binary
36 | #*.wwaproj merge=binary
37 |
38 | ###############################################################################
39 | # behavior for image files
40 | #
41 | # image files are treated as binary by default.
42 | ###############################################################################
43 | #*.jpg binary
44 | #*.png binary
45 | #*.gif binary
46 |
47 | ###############################################################################
48 | # diff behavior for common document formats
49 | #
50 | # Convert binary document formats to text before diffing them. This feature
51 | # is only available from the command line. Turn it on by uncommenting the
52 | # entries below.
53 | ###############################################################################
54 | #*.doc diff=astextplain
55 | #*.DOC diff=astextplain
56 | #*.docx diff=astextplain
57 | #*.DOCX diff=astextplain
58 | #*.dot diff=astextplain
59 | #*.DOT diff=astextplain
60 | #*.pdf diff=astextplain
61 | #*.PDF diff=astextplain
62 | #*.rtf diff=astextplain
63 | #*.RTF diff=astextplain
64 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 |
4 | # User-specific files
5 | *.suo
6 | *.user
7 | *.userosscache
8 | *.sln.docstates
9 |
10 | # User-specific files (MonoDevelop/Xamarin Studio)
11 | *.userprefs
12 |
13 | # Build results
14 | [Dd]ebug/
15 | [Dd]ebugPublic/
16 | [Rr]elease/
17 | [Rr]eleases/
18 | x64/
19 | x86/
20 | bld/
21 | [Bb]in/
22 | [Oo]bj/
23 | [Ll]og/
24 |
25 | # Visual Studio 2015 cache/options directory
26 | .vs/
27 | # Uncomment if you have tasks that create the project's static files in wwwroot
28 | #wwwroot/
29 |
30 | # MSTest test Results
31 | [Tt]est[Rr]esult*/
32 | [Bb]uild[Ll]og.*
33 |
34 | # NUNIT
35 | *.VisualState.xml
36 | TestResult.xml
37 |
38 | # Build Results of an ATL Project
39 | [Dd]ebugPS/
40 | [Rr]eleasePS/
41 | dlldata.c
42 |
43 | # DNX
44 | project.lock.json
45 | project.fragment.lock.json
46 | artifacts/
47 |
48 | *_i.c
49 | *_p.c
50 | *_i.h
51 | *.ilk
52 | *.meta
53 | *.obj
54 | *.pch
55 | *.pdb
56 | *.pgc
57 | *.pgd
58 | *.rsp
59 | *.sbr
60 | *.tlb
61 | *.tli
62 | *.tlh
63 | *.tmp
64 | *.tmp_proj
65 | *.log
66 | *.vspscc
67 | *.vssscc
68 | .builds
69 | *.pidb
70 | *.svclog
71 | *.scc
72 |
73 | # Chutzpah Test files
74 | _Chutzpah*
75 |
76 | # Visual C++ cache files
77 | ipch/
78 | *.aps
79 | *.ncb
80 | *.opendb
81 | *.opensdf
82 | *.sdf
83 | *.cachefile
84 | *.VC.db
85 | *.VC.VC.opendb
86 |
87 | # Visual Studio profiler
88 | *.psess
89 | *.vsp
90 | *.vspx
91 | *.sap
92 |
93 | # TFS 2012 Local Workspace
94 | $tf/
95 |
96 | # Guidance Automation Toolkit
97 | *.gpState
98 |
99 | # ReSharper is a .NET coding add-in
100 | _ReSharper*/
101 | *.[Rr]e[Ss]harper
102 | *.DotSettings.user
103 |
104 | # JustCode is a .NET coding add-in
105 | .JustCode
106 |
107 | # TeamCity is a build add-in
108 | _TeamCity*
109 |
110 | # DotCover is a Code Coverage Tool
111 | *.dotCover
112 |
113 | # NCrunch
114 | _NCrunch_*
115 | .*crunch*.local.xml
116 | nCrunchTemp_*
117 |
118 | # MightyMoose
119 | *.mm.*
120 | AutoTest.Net/
121 |
122 | # Web workbench (sass)
123 | .sass-cache/
124 |
125 | # Installshield output folder
126 | [Ee]xpress/
127 |
128 | # DocProject is a documentation generator add-in
129 | DocProject/buildhelp/
130 | DocProject/Help/*.HxT
131 | DocProject/Help/*.HxC
132 | DocProject/Help/*.hhc
133 | DocProject/Help/*.hhk
134 | DocProject/Help/*.hhp
135 | DocProject/Help/Html2
136 | DocProject/Help/html
137 |
138 | # Click-Once directory
139 | publish/
140 |
141 | # Publish Web Output
142 | *.[Pp]ublish.xml
143 | *.azurePubxml
144 | # TODO: Comment the next line if you want to checkin your web deploy settings
145 | # but database connection strings (with potential passwords) will be unencrypted
146 | #*.pubxml
147 | *.publishproj
148 |
149 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
150 | # checkin your Azure Web App publish settings, but sensitive information contained
151 | # in these scripts will be unencrypted
152 | PublishScripts/
153 |
154 | # NuGet Packages
155 | *.nupkg
156 | # The packages folder can be ignored because of Package Restore
157 | **/packages/*
158 | # except build/, which is used as an MSBuild target.
159 | !**/packages/build/
160 | # Uncomment if necessary however generally it will be regenerated when needed
161 | #!**/packages/repositories.config
162 | # NuGet v3's project.json files produces more ignoreable files
163 | *.nuget.props
164 | *.nuget.targets
165 |
166 | # Microsoft Azure Build Output
167 | csx/
168 | *.build.csdef
169 |
170 | # Microsoft Azure Emulator
171 | ecf/
172 | rcf/
173 |
174 | # Windows Store app package directories and files
175 | AppPackages/
176 | BundleArtifacts/
177 | Package.StoreAssociation.xml
178 | _pkginfo.txt
179 |
180 | # Visual Studio cache files
181 | # files ending in .cache can be ignored
182 | *.[Cc]ache
183 | # but keep track of directories ending in .cache
184 | !*.[Cc]ache/
185 |
186 | # Others
187 | ClientBin/
188 | ~$*
189 | *~
190 | *.dbmdl
191 | *.dbproj.schemaview
192 | *.jfm
193 | *.pfx
194 | *.publishsettings
195 | node_modules/
196 | orleans.codegen.cs
197 |
198 | # Since there are multiple workflows, uncomment next line to ignore bower_components
199 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
200 | #bower_components/
201 |
202 | # RIA/Silverlight projects
203 | Generated_Code/
204 |
205 | # Backup & report files from converting an old project file
206 | # to a newer Visual Studio version. Backup files are not needed,
207 | # because we have git ;-)
208 | _UpgradeReport_Files/
209 | Backup*/
210 | UpgradeLog*.XML
211 | UpgradeLog*.htm
212 |
213 | # SQL Server files
214 | *.mdf
215 | *.ldf
216 |
217 | # Business Intelligence projects
218 | *.rdl.data
219 | *.bim.layout
220 | *.bim_*.settings
221 |
222 | # Microsoft Fakes
223 | FakesAssemblies/
224 |
225 | # GhostDoc plugin setting file
226 | *.GhostDoc.xml
227 |
228 | # Node.js Tools for Visual Studio
229 | .ntvs_analysis.dat
230 |
231 | # Visual Studio 6 build log
232 | *.plg
233 |
234 | # Visual Studio 6 workspace options file
235 | *.opt
236 |
237 | # Visual Studio LightSwitch build output
238 | **/*.HTMLClient/GeneratedArtifacts
239 | **/*.DesktopClient/GeneratedArtifacts
240 | **/*.DesktopClient/ModelManifest.xml
241 | **/*.Server/GeneratedArtifacts
242 | **/*.Server/ModelManifest.xml
243 | _Pvt_Extensions
244 |
245 | # Paket dependency manager
246 | .paket/paket.exe
247 | paket-files/
248 |
249 | # FAKE - F# Make
250 | .fake/
251 |
252 | # JetBrains Rider
253 | .idea/
254 | *.sln.iml
255 |
256 | # CodeRush
257 | .cr/
258 |
259 | # Python Tools for Visual Studio (PTVS)
260 | __pycache__/
261 | *.pyc
--------------------------------------------------------------------------------
/FFTSpectrum.vcxproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | Win32
7 |
8 |
9 | Release
10 | Win32
11 |
12 |
13 | Debug
14 | x64
15 |
16 |
17 | Release
18 | x64
19 |
20 |
21 |
22 | 15.0
23 | {FE7D4925-2B0B-4B0F-B63E-E0EBE9DDFE97}
24 | Win32Proj
25 | 8.1
26 |
27 |
28 |
29 | DynamicLibrary
30 | true
31 | v141
32 |
33 |
34 | DynamicLibrary
35 | false
36 | v141
37 |
38 |
39 | DynamicLibrary
40 | true
41 | v141
42 |
43 |
44 | DynamicLibrary
45 | false
46 | v141
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 | true
68 |
69 |
70 | true
71 |
72 |
73 |
74 | WIN32;_DEBUG;_WINDOWS;_USRDLL;FFTSPECTRUM_EXPORTS;%(PreprocessorDefinitions)
75 | $(ProjectDir)..\fftw-build;$(ProjectDir)..\vapoursynth-sdk\include\vapoursynth;%(AdditionalIncludeDirectories)
76 | MultiThreadedDebugDLL
77 | Level3
78 | ProgramDatabase
79 | Disabled
80 |
81 |
82 | MachineX86
83 | true
84 | Windows
85 | $(ProjectDir)..\fftw-build\fftw3f.lib;%(AdditionalDependencies)
86 |
87 |
88 |
89 |
90 | WIN32;NDEBUG;_WINDOWS;_USRDLL;FFTSPECTRUM_EXPORTS;%(PreprocessorDefinitions)
91 | ..\fftw-build;..\vapoursynth-sdk\include;%(AdditionalIncludeDirectories)
92 | MultiThreadedDLL
93 | Level3
94 | ProgramDatabase
95 |
96 |
97 | MachineX86
98 | true
99 | Windows
100 | true
101 | true
102 |
103 |
104 |
105 |
106 | $(ProjectDir)..\fftw-build;$(ProjectDir)..\vapoursynth-sdk\include;%(AdditionalIncludeDirectories)
107 |
108 |
109 | fftw3f.lib;ucrtd.lib;vcruntimed.lib;%(AdditionalDependencies)
110 | true
111 | $(ProjectDir)..\fftw-build;%(AdditionalLibraryDirectories)
112 |
113 |
114 |
115 |
116 | $(ProjectDir)..\fftw-build;$(ProjectDir)..\vapoursynth-sdk\include;$(ProjectDir);%(AdditionalIncludeDirectories)
117 |
118 |
119 | $(ProjectDir)..\fftw-build;%(AdditionalLibraryDirectories)
120 | fftw3f.lib;ucrt.lib;vcruntime.lib;%(AdditionalDependencies)
121 | true
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
--------------------------------------------------------------------------------
/FFTSpectrum.c:
--------------------------------------------------------------------------------
1 |
2 | /*
3 | * Copyright (c) 2008 Hiroaki Gotou.
4 | * Copyright (c) 2019 Evgeny Marchenkov.
5 | *
6 | * This program is free software : you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 2 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
18 | */
19 |
20 | #include
21 | #include
22 |
23 | #if defined(_MSC_VER)
24 | #include
25 |
26 | #define USE_SSE_AUTO
27 | #define __SSE4_2__
28 | #define __x86_64__
29 | #define SSE_MATHFUN_WITH_CODE
30 | #include "sse_mathfun.h"
31 | #undef SSE_MATHFUN_WITH_CODE
32 | #undef __x86_64__
33 | #undef __SSE4_2__
34 | #undef USE_SSE_AUTO
35 | #undef inline
36 |
37 | #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
38 | #include
39 |
40 | #define USE_SSE4
41 | #define SSE_MATHFUN_WITH_CODE
42 | #include "sse_mathfun.h"
43 |
44 | #endif
45 |
46 | #include "fftw3.h"
47 |
48 | #include "vapoursynth/VapourSynth.h"
49 | #include "vapoursynth/VSHelper.h"
50 |
51 |
52 | typedef struct {
53 | VSNodeRef *node;
54 | const VSVideoInfo *in_vi;
55 | VSVideoInfo out_vi;
56 |
57 | bool show_grid;
58 |
59 | fftwf_complex *fft_in;
60 | fftwf_complex *fft_out;
61 | fftwf_plan p;
62 | float *abs_array;
63 | } FFTSpectrumData;
64 |
65 | static void fill_fft_input_array(fftwf_complex *dst, const uint8_t *src, int width, int height, int stride) {
66 | fftwf_complex *dstp = dst;
67 | const uint8_t *srcp = src;
68 | const int mod16_width = width - (width % 16);
69 |
70 | for (int y = 0; y < height; y++) {
71 | for (int x = 0; x < mod16_width; x += 16) {
72 | __m128i in_buffer, epu32_buffer;
73 | __m128 cvt_buffer, out_buffer[2];
74 | const __m128 sse_zero = _mm_setzero_ps();
75 |
76 | in_buffer = _mm_load_si128((const __m128i *)srcp);
77 |
78 | for (int j = 0; j < 4; j++) {
79 | epu32_buffer = _mm_cvtepu8_epi32(in_buffer);
80 | cvt_buffer = _mm_cvtepi32_ps(epu32_buffer);
81 |
82 | out_buffer[0] = _mm_unpacklo_ps(cvt_buffer, sse_zero);
83 | out_buffer[1] = _mm_unpackhi_ps(cvt_buffer, sse_zero);
84 |
85 | _mm_store_ps((float *)(dstp), out_buffer[0]);
86 | _mm_store_ps((float *)(dstp + 2), out_buffer[1]);
87 |
88 | in_buffer = _mm_shuffle_epi32(in_buffer, _MM_SHUFFLE(0, 3, 2, 1));
89 |
90 | dstp += 4;
91 | }
92 |
93 | srcp += 16;
94 | }
95 | for (int x = mod16_width; x < width; x++) {
96 | *dstp[0] = (float)*srcp;
97 | *dstp[1] = 0.0;
98 | srcp++;
99 | dstp++;
100 | }
101 | srcp += stride - width;
102 | }
103 | }
104 |
105 | static void calculate_absolute_values(float *dst, fftwf_complex *src, int length) {
106 | fftwf_complex *srcp = src;
107 | float *dstp = dst;
108 | const int mod4_length = length - (length % 4);
109 |
110 | for (int i = 0; i < mod4_length; i += 4) {
111 | __m128 in_buffer[2], mul_buffer[2], add_buffer, out_buffer;
112 | const __m128 sse_one = _mm_set_ps1(1.0f);
113 |
114 | in_buffer[0] = _mm_load_ps((float *)(srcp));
115 | in_buffer[1] = _mm_load_ps((float *)(srcp + 2));
116 |
117 | mul_buffer[0] = _mm_mul_ps(in_buffer[0], in_buffer[0]);
118 | mul_buffer[1] = _mm_mul_ps(in_buffer[1], in_buffer[1]);
119 |
120 | add_buffer = _mm_hadd_ps(mul_buffer[0], mul_buffer[1]);
121 | add_buffer = _mm_sqrt_ps(add_buffer);
122 | add_buffer = _mm_add_ps(add_buffer, sse_one);
123 |
124 | out_buffer = log_ps(add_buffer);
125 |
126 | _mm_store_ps(dstp, out_buffer);
127 |
128 | srcp += 4;
129 | dstp += 4;
130 | }
131 | for (int i = mod4_length; i < length; i++) {
132 | dstp[i] = logf(sqrtf(src[i][0] * src[i][0] + src[i][1] * src[i][1]) + 1.0);
133 | }
134 | }
135 |
/* Renders the magnitude array as an 8-bit image with the DC term moved to
 * the image center (quadrant swap, like fftshift). Values below half of the
 * maximum are blanked; the rest are scaled to 0..255.
 *
 * dst    - output plane, at least stride * height bytes
 * src    - width * height log-magnitudes from calculate_absolute_values()
 * stride - output stride in bytes (>= width)
 */
static void draw_fft_spectrum(uint8_t *dst, float *src, int width, int height, int stride) {
    float max = 0;

    /* Clear everything, including the stride padding. */
    memset(dst, 0, (size_t)stride * height);

    /* Find the peak, skipping src[0]: the DC component would otherwise
     * dominate the scale. */
    for (int i = 1; i < height * width; i++) {
        if (src[i] > max) {
            max = src[i];
        }
    }

    /* BUG FIX: a uniform input plane gives max == 0; the original then
     * divided by zero, producing NaN and an undefined cast to uint8_t.
     * The cleared (all-black) frame is the correct result here. */
    if (max <= 0)
        return;

    for (int y = 0; y < height; y++) {
        /* Swap quadrants so the origin lands at the image center. */
        const int dst_y = (y < height / 2) ? y + height / 2 : y - height / 2;
        for (int x = 0; x < width; x++) {
            const int dst_x = (x < width / 2) ? x + width / 2 : x - width / 2;

            /* Threshold at half the peak, then scale to 0..255 and clamp. */
            float buf = src[x + y * width] > max / 2 ? src[x + y * width] : 0;
            buf = 255 * buf / max;
            if (buf < 0) buf = 0;
            if (buf > 255) buf = 255;

            dst[dst_x + stride * dst_y] = (uint8_t)buf;
        }
    }
}
176 |
/* Overlays white grid lines, spaced 100 pixels apart, positioned so that a
 * line passes through the image center in each direction. Only sets pixels
 * to 255; everything else in buf is left untouched. */
static void draw_grid(uint8_t *buf, int width, int height, int stride) {
    const int first_col = (width / 2) % 100;
    const int first_row = (height / 2) % 100;

    /* Vertical lines. */
    for (int row = 0; row < height; row++) {
        uint8_t *line = buf + row * stride;
        for (int col = first_col; col < width; col += 100)
            line[col] = 255;
    }

    /* Horizontal lines. */
    for (int row = first_row; row < height; row += 100) {
        uint8_t *line = buf + row * stride;
        for (int col = 0; col < width; col++)
            line[col] = 255;
    }
}
190 |
191 | static void VS_CC fftSpectrumInit(VSMap *in, VSMap *out, void **instanceData, VSNode *node, VSCore *core, const VSAPI *vsapi) {
192 | FFTSpectrumData *d = (FFTSpectrumData *) * instanceData;
193 | d->out_vi = *d->in_vi;
194 | d->out_vi.format = vsapi->getFormatPreset(pfGray8, core);
195 | vsapi->setVideoInfo(&d->out_vi, 1, node);
196 | }
197 |
198 | static const VSFrameRef *VS_CC fftSpectrumGetFrame(int n, int activationReason, void **instanceData, void **frameData, VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi) {
199 | FFTSpectrumData *d = (FFTSpectrumData *) * instanceData;
200 |
201 | if (activationReason == arInitial) {
202 | vsapi->requestFrameFilter(n, d->node, frameCtx);
203 | } else if (activationReason == arAllFramesReady) {
204 |
205 | const VSFrameRef *src = vsapi->getFrameFilter(n, d->node, frameCtx);
206 | VSFrameRef *dst = vsapi->newVideoFrame(d->out_vi.format, d->out_vi.width, d->out_vi.height, src, core);
207 |
208 | fill_fft_input_array(d->fft_in, vsapi->getReadPtr(src, 0), d->in_vi->width, d->in_vi->height, vsapi->getStride(src, 0));
209 |
210 | fftwf_execute_dft(d->p, d->fft_in, d->fft_out);
211 |
212 | calculate_absolute_values(d->abs_array, d->fft_out, (d->in_vi->width * d->in_vi->height));
213 |
214 | draw_fft_spectrum(vsapi->getWritePtr(dst, 0), d->abs_array, d->out_vi.width, d->out_vi.height, vsapi->getStride(dst, 0));
215 |
216 | if (d->show_grid) {
217 | draw_grid(vsapi->getWritePtr(dst, 0), d->out_vi.width, d->out_vi.height, vsapi->getStride(dst, 0));
218 | }
219 |
220 | vsapi->freeFrame(src);
221 |
222 | return dst;
223 | }
224 |
225 | return 0;
226 | }
227 |
228 | static void VS_CC fftSpectrumFree(void *instanceData, VSCore *core, const VSAPI *vsapi) {
229 | FFTSpectrumData *d = (FFTSpectrumData *)instanceData;
230 | vsapi->freeNode(d->node);
231 | VS_ALIGNED_FREE(d->fft_in);
232 | VS_ALIGNED_FREE(d->fft_out);
233 | VS_ALIGNED_FREE(d->abs_array);
234 | fftwf_destroy_plan(d->p);
235 | free(d);
236 | }
237 |
238 | static void VS_CC fftSpectrumCreate(const VSMap *in, VSMap *out, void *userData, VSCore *core, const VSAPI *vsapi) {
239 | FFTSpectrumData *d;
240 | d = malloc(sizeof(FFTSpectrumData));
241 |
242 | int err;
243 |
244 | d->node = vsapi->propGetNode(in, "clip", 0, 0);
245 | d->in_vi = vsapi->getVideoInfo(d->node);
246 |
247 | if (!isConstantFormat(d->in_vi) || d->in_vi->format->sampleType != stInteger || d->in_vi->format->bitsPerSample != 8 ||
248 | d->in_vi->format->colorFamily == cmRGB || d->in_vi->format->colorFamily == cmCompat) {
249 | vsapi->setError(out, "FFTSpectrum: only constant format 8bit integer luma-containing input supported");
250 | vsapi->freeNode(d->node);
251 | free(d);
252 | return;
253 | }
254 |
255 | d->show_grid = (bool)vsapi->propGetInt(in, "grid", 0, &err);
256 | if (err) {
257 | d->show_grid = false;
258 | }
259 |
260 | VS_ALIGNED_MALLOC(&d->fft_in, (d->in_vi->width * d->in_vi->height * sizeof(fftw_complex)), 32);
261 | VS_ALIGNED_MALLOC(&d->fft_out, (d->in_vi->width * d->in_vi->height * sizeof(fftw_complex)), 32);
262 | VS_ALIGNED_MALLOC(&d->abs_array, (d->in_vi->width * d->in_vi->height * sizeof(float)), 32);
263 |
264 | memset(d->fft_in, 0, (d->in_vi->width * d->in_vi->height * sizeof(fftw_complex)));
265 | memset(d->fft_out, 0, (d->in_vi->width * d->in_vi->height * sizeof(fftw_complex)));
266 | memset(d->abs_array, 0, (d->in_vi->width * d->in_vi->height * sizeof(float)));
267 |
268 | d->p = fftwf_plan_dft_2d(d->in_vi->height, d->in_vi->width, d->fft_in, d->fft_out, FFTW_FORWARD, FFTW_MEASURE | FFTW_DESTROY_INPUT);
269 |
270 | vsapi->createFilter(in, out, "FFTSpectrum", fftSpectrumInit, fftSpectrumGetFrame, fftSpectrumFree, fmParallelRequests, 0, d, core);
271 | }
272 |
273 |
/* Plugin entry point: registers the plugin namespace and the single
 * FFTSpectrum function (clip: input clip; grid: optional int flag for the
 * 100-pixel grid overlay). */
VS_EXTERNAL_API(void) VapourSynthPluginInit(VSConfigPlugin configFunc, VSRegisterFunction registerFunc, VSPlugin *plugin) {
    configFunc("org.beatrice-raws.fftspectrum", "fftspectrum", "FFT Spectrum plugin", VAPOURSYNTH_API_VERSION, 1, plugin);
    registerFunc("FFTSpectrum", "clip:clip;grid:int:opt;", fftSpectrumCreate, 0, plugin);
}
278 |
--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 2, June 1991
3 |
4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6 | Everyone is permitted to copy and distribute verbatim copies
7 | of this license document, but changing it is not allowed.
8 |
9 | Preamble
10 |
11 | The licenses for most software are designed to take away your
12 | freedom to share and change it. By contrast, the GNU General Public
13 | License is intended to guarantee your freedom to share and change free
14 | software--to make sure the software is free for all its users. This
15 | General Public License applies to most of the Free Software
16 | Foundation's software and to any other program whose authors commit to
17 | using it. (Some other Free Software Foundation software is covered by
18 | the GNU Lesser General Public License instead.) You can apply it to
19 | your programs, too.
20 |
21 | When we speak of free software, we are referring to freedom, not
22 | price. Our General Public Licenses are designed to make sure that you
23 | have the freedom to distribute copies of free software (and charge for
24 | this service if you wish), that you receive source code or can get it
25 | if you want it, that you can change the software or use pieces of it
26 | in new free programs; and that you know you can do these things.
27 |
28 | To protect your rights, we need to make restrictions that forbid
29 | anyone to deny you these rights or to ask you to surrender the rights.
30 | These restrictions translate to certain responsibilities for you if you
31 | distribute copies of the software, or if you modify it.
32 |
33 | For example, if you distribute copies of such a program, whether
34 | gratis or for a fee, you must give the recipients all the rights that
35 | you have. You must make sure that they, too, receive or can get the
36 | source code. And you must show them these terms so they know their
37 | rights.
38 |
39 | We protect your rights with two steps: (1) copyright the software, and
40 | (2) offer you this license which gives you legal permission to copy,
41 | distribute and/or modify the software.
42 |
43 | Also, for each author's protection and ours, we want to make certain
44 | that everyone understands that there is no warranty for this free
45 | software. If the software is modified by someone else and passed on, we
46 | want its recipients to know that what they have is not the original, so
47 | that any problems introduced by others will not reflect on the original
48 | authors' reputations.
49 |
50 | Finally, any free program is threatened constantly by software
51 | patents. We wish to avoid the danger that redistributors of a free
52 | program will individually obtain patent licenses, in effect making the
53 | program proprietary. To prevent this, we have made it clear that any
54 | patent must be licensed for everyone's free use or not licensed at all.
55 |
56 | The precise terms and conditions for copying, distribution and
57 | modification follow.
58 |
59 | GNU GENERAL PUBLIC LICENSE
60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
61 |
62 | 0. This License applies to any program or other work which contains
63 | a notice placed by the copyright holder saying it may be distributed
64 | under the terms of this General Public License. The "Program", below,
65 | refers to any such program or work, and a "work based on the Program"
66 | means either the Program or any derivative work under copyright law:
67 | that is to say, a work containing the Program or a portion of it,
68 | either verbatim or with modifications and/or translated into another
69 | language. (Hereinafter, translation is included without limitation in
70 | the term "modification".) Each licensee is addressed as "you".
71 |
72 | Activities other than copying, distribution and modification are not
73 | covered by this License; they are outside its scope. The act of
74 | running the Program is not restricted, and the output from the Program
75 | is covered only if its contents constitute a work based on the
76 | Program (independent of having been made by running the Program).
77 | Whether that is true depends on what the Program does.
78 |
79 | 1. You may copy and distribute verbatim copies of the Program's
80 | source code as you receive it, in any medium, provided that you
81 | conspicuously and appropriately publish on each copy an appropriate
82 | copyright notice and disclaimer of warranty; keep intact all the
83 | notices that refer to this License and to the absence of any warranty;
84 | and give any other recipients of the Program a copy of this License
85 | along with the Program.
86 |
87 | You may charge a fee for the physical act of transferring a copy, and
88 | you may at your option offer warranty protection in exchange for a fee.
89 |
90 | 2. You may modify your copy or copies of the Program or any portion
91 | of it, thus forming a work based on the Program, and copy and
92 | distribute such modifications or work under the terms of Section 1
93 | above, provided that you also meet all of these conditions:
94 |
95 | a) You must cause the modified files to carry prominent notices
96 | stating that you changed the files and the date of any change.
97 |
98 | b) You must cause any work that you distribute or publish, that in
99 | whole or in part contains or is derived from the Program or any
100 | part thereof, to be licensed as a whole at no charge to all third
101 | parties under the terms of this License.
102 |
103 | c) If the modified program normally reads commands interactively
104 | when run, you must cause it, when started running for such
105 | interactive use in the most ordinary way, to print or display an
106 | announcement including an appropriate copyright notice and a
107 | notice that there is no warranty (or else, saying that you provide
108 | a warranty) and that users may redistribute the program under
109 | these conditions, and telling the user how to view a copy of this
110 | License. (Exception: if the Program itself is interactive but
111 | does not normally print such an announcement, your work based on
112 | the Program is not required to print an announcement.)
113 |
114 | These requirements apply to the modified work as a whole. If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works. But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 |
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 |
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 |
134 | 3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 |
138 | a) Accompany it with the complete corresponding machine-readable
139 | source code, which must be distributed under the terms of Sections
140 | 1 and 2 above on a medium customarily used for software interchange; or,
141 |
142 | b) Accompany it with a written offer, valid for at least three
143 | years, to give any third party, for a charge no more than your
144 | cost of physically performing source distribution, a complete
145 | machine-readable copy of the corresponding source code, to be
146 | distributed under the terms of Sections 1 and 2 above on a medium
147 | customarily used for software interchange; or,
148 |
149 | c) Accompany it with the information you received as to the offer
150 | to distribute corresponding source code. (This alternative is
151 | allowed only for noncommercial distribution and only if you
152 | received the program in object code or executable form with such
153 | an offer, in accord with Subsection b above.)
154 |
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it. For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable. However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 |
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 |
172 | 4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License. Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 |
180 | 5. You are not required to accept this License, since you have not
181 | signed it. However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works. These actions are
183 | prohibited by law if you do not accept this License. Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 |
189 | 6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions. You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 |
197 | 7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License. If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all. For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 |
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 |
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices. Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 |
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 |
229 | 8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded. In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 |
237 | 9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time. Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 |
242 | Each version is given a distinguishing version number. If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation. If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 |
250 | 10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission. For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this. Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 |
258 | NO WARRANTY
259 |
260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 |
270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 |
280 | END OF TERMS AND CONDITIONS
281 |
282 | How to Apply These Terms to Your New Programs
283 |
284 | If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 |
288 | To do so, attach the following notices to the program. It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 |
293 |     <one line to give the program's name and a brief idea of what it does.>
294 |     Copyright (C) <year>  <name of author>
295 |
296 | This program is free software; you can redistribute it and/or modify
297 | it under the terms of the GNU General Public License as published by
298 | the Free Software Foundation; either version 2 of the License, or
299 | (at your option) any later version.
300 |
301 | This program is distributed in the hope that it will be useful,
302 | but WITHOUT ANY WARRANTY; without even the implied warranty of
303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304 | GNU General Public License for more details.
305 |
306 | You should have received a copy of the GNU General Public License along
307 | with this program; if not, write to the Free Software Foundation, Inc.,
308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 |
310 | Also add information on how to contact you by electronic and paper mail.
311 |
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 |
315 | Gnomovision version 69, Copyright (C) year name of author
316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 | This is free software, and you are welcome to redistribute it
318 | under certain conditions; type `show c' for details.
319 |
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License. Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 |
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary. Here is a sample; alter the names:
328 |
329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 |
332 |   <signature of Ty Coon>, 1 April 1989
333 | Ty Coon, President of Vice
334 |
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs. If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library. If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 |
--------------------------------------------------------------------------------
/sse_mathfun.h:
--------------------------------------------------------------------------------
1 | /*!
2 | @file sse_mathfun.h
3 |
4 | SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
5 |
6 | Inspired by Intel Approximate Math library, and based on the
7 | corresponding algorithms of the cephes math library
8 |
9 | The default is to use the SSE1 version. If you define USE_SSE2 the
10 | the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
11 | not expect any significant performance improvement with SSE2.
12 | */
13 |
14 | /* Copyright (C) 2010,2011 RJVB - extensions */
15 | /* Copyright (C) 2007 Julien Pommier
16 |
17 | This software is provided 'as-is', without any express or implied
18 | warranty. In no event will the authors be held liable for any damages
19 | arising from the use of this software.
20 |
21 | Permission is granted to anyone to use this software for any purpose,
22 | including commercial applications, and to alter it and redistribute it
23 | freely, subject to the following restrictions:
24 |
25 | 1. The origin of this software must not be misrepresented; you must not
26 | claim that you wrote the original software. If you use this software
27 | in a product, an acknowledgment in the product documentation would be
28 | appreciated but is not required.
29 | 2. Altered source versions must be plainly marked as such, and must not be
30 | misrepresented as being the original software.
31 | 3. This notice may not be removed or altered from any source distribution.
32 |
33 | (this is the zlib license)
34 | */
35 |
36 | #ifndef _SSE_MATHFUN_H
37 |
38 | #ifdef USE_SSE_AUTO
39 | # ifdef __SSE2__
40 | # if defined(__GNUC__)
41 | # warning "USE_SSE2"
42 | # endif
43 | # define USE_SSE2
44 | # endif
45 | # if defined(__SSE3__) || defined(__SSSE3__)
46 | # if defined(__GNUC__)
47 | # warning "USE_SSE3"
48 | # endif
49 | # define USE_SSE2
50 | # define USE_SSE3
51 | # endif
52 | # if defined(__SSE4__) || defined(__SSE4_1__) || defined(__SSE4_2__) || ((_M_IX86_FP > 1) && !defined(_M_AMD64))
53 | # if defined(__GNUC__)
54 | # warning "USE_SSE4"
55 | # endif
56 | # define USE_SSE2
57 | # define USE_SSE3
58 | # define USE_SSE4
59 | # endif
60 | #endif
61 |
62 | #include <math.h>
63 | #include <xmmintrin.h>
64 | #include <emmintrin.h> /* NOTE(review): header names were stripped by extraction ('<...>' lost); reconstructed from usage (__m128, __m128i, __m128d below) — verify against upstream sse_mathfun.h */
65 |
66 | /* yes I know, the top of this file is quite ugly */
67 |
68 | /*!
69 | macros to obtain the required 16bit alignment
70 | */
71 | #ifdef _MSC_VER /* visual c++ */
72 | # define ALIGN16_BEG __declspec(align(16))
73 | # define ALIGN16_END
74 | # define inline __forceinline
75 | #else /* gcc or icc */
76 | # define ALIGN16_BEG
77 | # define ALIGN16_END __attribute__((aligned(16)))
78 | #endif
79 |
80 | /* __m128 is ugly to write */
81 | /*!
82 | an SSE vector of 4 floats
83 | */
84 | typedef __m128 v4sf; // vector of 4 float (sse1)
85 |
86 | #if defined(USE_SSE3) || defined(USE_SSE4)
87 | # define USE_SSE2
88 | #endif
89 |
90 | /*!
91 | an SSE/MMX vector of 4 32bit integers
92 | */
93 | #ifdef __APPLE_CC__
94 | typedef int v4si __attribute__ ((__vector_size__ (16), __may_alias__));
95 | #else
96 | typedef __m128i v4si; // vector of 4 int (sse2)
97 | #endif
98 | // RJVB 20111028: some support for double precision semantics
99 | /*!
100 | an SSE2+ vector of 2 doubles
101 | */
102 | typedef __m128d v2df; // vector of 2 double (sse2)
103 | /*!
104 | an MMX vector of 2 32bit ints
105 | */
106 | typedef __m64 v2si; // vector of 2 int (mmx)
107 |
108 | #if defined(USE_SSE3) || defined(USE_SSE4)
109 | # define USE_SSE3
110 | # include <pmmintrin.h>
111 | # if defined(__SSSE3__) || (_M_IX86_FP > 1)
112 | # include <tmmintrin.h>
113 | # endif
114 | #endif
115 |
116 | #if defined(USE_SSE4)
117 | # define USE_SSE4
118 | # include <smmintrin.h>
119 | #endif
120 |
121 | #ifdef __GNUC__0
122 | # define _MM_SET_PD(b,a) (v2df){(a),(b)}
123 | # define _MM_SET1_PD(a) (v2df){(a),(a)}
124 | // static inline v2df _MM_SET1_PD(double a)
125 | // {
126 | // return (v2df){a,a};
127 | // }
128 | # define _MM_SETR_PD(a,b) (v2df){(a),(b)}
129 | # define _MM_SETZERO_PD() (v2df){0.0,0.0}
130 | # define _MM_SET_PS(d,c,b,a) (v4sf){(a),(b),(c),(d)}
131 | # define _MM_SET1_PS(a) (v4sf){(a),(a),(a),(a)}
132 | // static inline v4sf _MM_SET1_PS(float a)
133 | // {
134 | // return (v4sf){a,a,a,a};
135 | // }
136 | # define _MM_SETR_PS(a,b,c,d) (v4sf){(a),(b),(c),(d)}
137 | # define _MM_SETZERO_PS() (v4sf){0.0f,0.0f,0.0f,0.0f}
138 | # define _MM_SETZERO_SI128() (__m128i)(__v4si){0,0,0,0}
139 | # define _MM_SETZERO_SI64() ALIGN16_BEG (__m64 ALIGN16_END)0LL
140 | #else
141 | # define _MM_SET_PD(b,a) _mm_setr_pd((a),(b))
142 | # define _MM_SET1_PD(a) _mm_set1_pd((a))
143 | # define _MM_SETR_PD(a,b) _mm_setr_pd((a),(b))
144 | # define _MM_SETZERO_PD() _mm_setzero_pd()
145 | # define _MM_SET_PS(d,c,b,a) _mm_setr_ps((a),(b),(c),(d))
146 | # define _MM_SET1_PS(a) _mm_set1_ps((a))
147 | # define _MM_SETR_PS(a,b,c,d) _mm_setr_ps((a),(b),(c),(d))
148 | # define _MM_SETZERO_PS() _mm_setzero_ps()
149 | # define _MM_SETZERO_SI128() _mm_setzero_si128()
150 | # define _MM_SETZERO_SI64() _mm_setzero_si64()
151 | #endif
152 | #define VELEM(type,a,n) (((type*)&a)[n])
153 |
154 | /* declare some SSE constants -- why can't I figure a better way to do that? */
155 | #define _PS_CONST(Name, Val) \
156 | static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { (const float)(Val), (const float)(Val), (const float)(Val), (const float)(Val) }
157 | #define _PI32_CONST(Name, Val) \
158 | static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
159 | #define _PS_CONST_TYPE(Name, Type, Val) \
160 | static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
161 |
162 | #define _PD_CONST(Name, Val) \
163 | static const ALIGN16_BEG double _pd_##Name[2] ALIGN16_END = { (const double)(Val), (const double)(Val) }
164 | #define _PD_CONST_TYPE(Name, Type, Val) \
165 | static const ALIGN16_BEG Type _pd_##Name[2] ALIGN16_END = { Val, Val }
166 |
167 | #pragma mark code section
168 | #ifdef SSE_MATHFUN_WITH_CODE
169 |
170 | _PS_CONST(1 , 1.0f);
171 | _PS_CONST(0p5, 0.5f);
172 | /* the smallest non denormalized float number */
173 | _PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
174 | _PS_CONST_TYPE(mant_mask, int, 0x7f800000);
175 | _PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
176 |
177 | _PS_CONST_TYPE(sign_mask, int, 0x80000000);
178 | _PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
179 |
180 | _PI32_CONST(1, 1);
181 | _PI32_CONST(inv1, ~1);
182 | _PI32_CONST(2, 2);
183 | _PI32_CONST(4, 4);
184 | _PI32_CONST(0x7f, 0x7f);
185 |
186 | _PS_CONST(cephes_SQRTHF, 0.707106781186547524);
187 | _PS_CONST(cephes_log_p0, 7.0376836292E-2);
188 | _PS_CONST(cephes_log_p1, - 1.1514610310E-1);
189 | _PS_CONST(cephes_log_p2, 1.1676998740E-1);
190 | _PS_CONST(cephes_log_p3, - 1.2420140846E-1);
191 | _PS_CONST(cephes_log_p4, + 1.4249322787E-1);
192 | _PS_CONST(cephes_log_p5, - 1.6668057665E-1);
193 | _PS_CONST(cephes_log_p6, + 2.0000714765E-1);
194 | _PS_CONST(cephes_log_p7, - 2.4999993993E-1);
195 | _PS_CONST(cephes_log_p8, + 3.3333331174E-1);
196 | _PS_CONST(cephes_log_q1, -2.12194440e-4);
197 | _PS_CONST(cephes_log_q2, 0.693359375);
198 |
199 | #ifdef USE_SSE2
200 | _PD_CONST(1, 1.0);
201 | _PD_CONST(_1, -1.0);
202 | _PD_CONST(0p5, 0.5);
203 | /* the smallest non denormalised float number */
204 | // _PD_CONST_TYPE(min_norm_pos, int, 0x00800000);
205 | // _PD_CONST_TYPE(mant_mask, int, 0x7f800000);
206 | // _PD_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
207 |
208 | _PD_CONST_TYPE(sign_mask, long long, 0x8000000000000000LL);
209 | _PD_CONST_TYPE(inv_sign_mask, long long, ~0x8000000000000000LL);
210 |
211 | #endif
212 |
213 | #if defined (__MINGW32__)
214 |
215 | /* the ugly part below: many versions of gcc used to be completely buggy with respect to some intrinsics
216 | The movehl_ps is fixed in mingw 3.4.5, but I found out that all the _mm_cmp* intrinsics were completely
217 | broken on my mingw gcc 3.4.5 ...
218 |
219 | Note that the bug on _mm_cmp* does occur only at -O0 optimization level
220 | */
221 |
/* Replacement for _mm_movehl_ps: some mingw gcc versions miscompile the
   intrinsic (gcc bug 21179, see the comment above), so emit the movhlps
   instruction directly via inline asm.  'a' is both input and output
   ("0" ties the operand to the result register). */
inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
  asm (
       "movhlps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a; }
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps
231 |
/* Replacement for _mm_cmplt_ps (broken at -O0 on some mingw gcc, see the
   comment above): per-lane a < b via the cmpltps instruction; each lane of
   the result is all-ones on true, all-zeros on false. */
inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
  asm (
       "cmpltps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
/* Replacement for _mm_cmpgt_ps, implemented with cmpnleps
   (not-less-or-equal).
   NOTE(review): cmpnleps presumably also yields true when a lane holds
   NaN, unlike a strict greater-than — confirm callers never compare NaNs
   here. */
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
  asm (
       "cmpnleps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
/* Replacement for _mm_cmpeq_ps: per-lane a == b via the cmpeqps
   instruction; all-ones lane on equality, all-zeros otherwise. */
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
  asm (
       "cmpeqps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
260 | #endif
261 |
262 | #ifndef USE_SSE2
263 | typedef union xmm_mm_union {
264 | __m128 xmm;
265 | __m64 mm[2];
266 | } xmm_mm_union;
267 |
268 | #define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
269 | xmm_mm_union u; u.xmm = xmm_; \
270 | mm0_ = u.mm[0]; \
271 | mm1_ = u.mm[1]; \
272 | }
273 |
274 | #define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
275 | xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
276 | }
277 |
278 | #endif // USE_SSE2
279 |
/*!
  natural logarithm computed for 4 simultaneous float
  @n
  return NaN for x <= 0
  (lanes that are <= 0 are remembered in a mask up front and forced to
  NaN at the very end by OR-ing the all-ones compare result into them)
*/
static inline v4sf log_ps(v4sf x)
{
  v4sf e;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;
  /* all-ones in every lane where x <= 0; used to poison those lanes below */
  v4sf invalid_mask = _mm_cmple_ps(x, _MM_SETZERO_PS());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos); /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e);  done on the two MMX halves of the vector */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  /* shift the IEEE-754 exponent field (bits 23..30) down into integer lanes */
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part: clear the exponent bits, then OR in the
     exponent of 0.5 so the mantissa lands in [0.5, 1) */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contain the really base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  /* remove the exponent bias (0x7f == 127) and convert to float */
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
     (branch-free: 'mask' selects the lanes where x < sqrt(1/2))
  */
  {
    v4sf z, y;
    v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
    v4sf tmp = _mm_and_ps(x, mask);
    x = _mm_sub_ps(x, one);
    e = _mm_sub_ps(e, _mm_and_ps(one, mask));
    x = _mm_add_ps(x, tmp);

    z = _mm_mul_ps(x,x);

    /* cephes degree-8 polynomial approximation, Horner's scheme */
    y = *(v4sf*)_ps_cephes_log_p0;
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
    y = _mm_mul_ps(y, x);

    y = _mm_mul_ps(y, z);

    /* exponent contribution e*log(2), split into two constants (q1 then
       q2, added separately below) for extra precision */
    tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
    y = _mm_add_ps(y, tmp);

    tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    y = _mm_sub_ps(y, tmp);

    tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
    x = _mm_add_ps(x, y);
    x = _mm_add_ps(x, tmp);
    x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
  }
  return x;
}
375 |
376 | _PS_CONST(exp_hi, 88.3762626647949f);
377 | _PS_CONST(exp_lo, -88.3762626647949f);
378 |
379 | _PS_CONST(cephes_LOG2EF, 1.44269504088896341);
380 | _PS_CONST(cephes_exp_C1, 0.693359375);
381 | _PS_CONST(cephes_exp_C2, -2.12194440e-4);
382 |
383 | _PS_CONST(cephes_exp_p0, 1.9875691500E-4);
384 | _PS_CONST(cephes_exp_p1, 1.3981999507E-3);
385 | _PS_CONST(cephes_exp_p2, 8.3334519073E-3);
386 | _PS_CONST(cephes_exp_p3, 4.1665795894E-2);
387 | _PS_CONST(cephes_exp_p4, 1.6666665459E-1);
388 | _PS_CONST(cephes_exp_p5, 5.0000001201E-1);
389 |
/*!
  computes e**x of the 4 floats in x
  @n
  the input is clamped to [exp_lo, exp_hi] (about +/-88.38, the float
  exp() range) so the result never overflows or underflows
*/
static inline v4sf exp_ps(v4sf x)
{
  v4sf tmp = _MM_SETZERO_PS(), fx, mask, y, z;
  v4sf pow2n;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)): fx = x*log2(e) + 0.5, floored
     below to get the integer n */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, substract 1 (cvttps truncates toward zero, not down) */
  mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  /* range reduction: x -= fx*log(2), with log(2) split into C1 + C2 for
     extra precision */
  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  /* cephes degree-5 polynomial for exp on the reduced argument, Horner */
  y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n by placing n+127 directly into the float exponent field */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}
470 |
471 | _PS_CONST(minus_cephes_DP1, -0.78515625);
472 | _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
473 | _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
474 | _PS_CONST(sincof_p0, -1.9515295891E-4);
475 | _PS_CONST(sincof_p1, 8.3321608736E-3);
476 | _PS_CONST(sincof_p2, -1.6666654611E-1);
477 | _PS_CONST(coscof_p0, 2.443315711809948E-005);
478 | _PS_CONST(coscof_p1, -1.388731625493765E-003);
479 | _PS_CONST(coscof_p2, 4.166664568298827E-002);
480 | _PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
481 |
482 | #ifdef USE_SSE2
483 | _PD_CONST(minus_cephes_DP1, -0.78515625);
484 | _PD_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
485 | _PD_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
486 | _PD_CONST(sincof_p0, -1.9515295891E-4);
487 | _PD_CONST(sincof_p1, 8.3321608736E-3);
488 | _PD_CONST(sincof_p2, -1.6666654611E-1);
489 | _PD_CONST(coscof_p0, 2.443315711809948E-005);
490 | _PD_CONST(coscof_p1, -1.388731625493765E-003);
491 | _PD_CONST(coscof_p2, 4.166664568298827E-002);
492 | _PD_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
493 | #endif
494 |
495 |
496 | /*!
497 | evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so
498 | it runs also on old athlons XPs and the pentium III of your grand
499 | mother.
500 | @n
501 | The code is the exact rewriting of the cephes sinf function.
502 | Precision is excellent as long as x < 8192 (I did not bother to
503 | take into account the special handling they have for greater values
504 | -- it does not return garbage for arguments over 8192, though, but
505 | the extra precision is missing).
506 | @n
507 | Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
508 | surprising but correct result.
509 | @n
510 | Performance is also surprisingly good, 1.33 times faster than the
511 | macos vsinf SSE2 function, and 1.5 times faster than the
512 | __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
513 | too bad for an SSE1 function (with no special tuning) !
514 | However the latter libraries probably have a much better handling of NaN,
515 | Inf, denormalized and other special arguments..
516 | @n
517 | On my core 1 duo, the execution of this function takes approximately 95 cycles.
518 | @n
519 | From what I have observed on the experiments with Intel AMath lib, switching to an
520 | SSE2 version would improve the perf by only 10%.
521 | @n
522 | Since it is based on SSE intrinsics, it has to be compiled at -O2 to
523 | deliver full speed.
524 | */
525 | static inline v4sf sin_ps(v4sf x)
526 | { // any x
527 | v4sf xmm1, xmm2 = _MM_SETZERO_PS(), xmm3, sign_bit, y, y2, z, tmp;
528 |
529 | v4sf swap_sign_bit, poly_mask;
530 | #ifdef USE_SSE2
531 | v4si emm0, emm2;
532 | #else
533 | v2si mm0, mm1, mm2, mm3;
534 | #endif
535 | sign_bit = x;
536 | /* take the absolute value */
537 | x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
538 | /* extract the sign bit (upper one) */
539 | sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
540 |
541 | /* scale by 4/Pi */
542 | y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
543 |
544 | //printf("plop:"); print4(y);
545 | #ifdef USE_SSE2
546 | /* store the integer part of y in mm0 */
547 | emm2 = _mm_cvttps_epi32(y);
548 | /* j=(j+1) & (~1) (see the cephes sources) */
549 | emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
550 | emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
551 | y = _mm_cvtepi32_ps(emm2);
552 | /* get the swap sign flag */
553 | emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
554 | emm0 = _mm_slli_epi32(emm0, 29);
555 | /* get the polynom selection mask
556 | there is one polynom for 0 <= x <= Pi/4
557 | and another one for Pi/4 0 ){
1112 | v2df *va = (v2df*) xa, vsum = _MM_SETZERO_PD();
1113 | int i, N_4 = N-4+1;
1114 | for( i = 0 ; i < N_4 ; va+=2 ){
1115 | vsum = _mm_add_pd( vsum, _mm_add_pd( va[0], va[1] ) );
1116 | i += 4;
1117 | }
1118 | sum = VELEM(double,vsum,0) + VELEM(double,vsum,1);
1119 | for( ; i < N; i++ ){
1120 | sum += xa[i];
1121 | }
1122 | }
1123 | else{
1124 | sum = 0.0;
1125 | }
1126 | return sum;
1127 | }
1128 |
1129 |
1130 | /*!
1131 | computes the cumulative sum of the squares of the values in double array xa[n] using SSE2 intrinsics
1132 | */
1133 | static inline double CumSumSq( double *xa, int n )
1134 | { __m128d vsumsq;
1135 | register int i, N_4 = n-4+1;
1136 | register double sumsq = 0;
1137 | for( i = 0 ; i < N_4 ; i+=4, xa+=4 ){
1138 | #ifdef __GNUC__
1139 | vsumsq = *((__m128d*)&xa[2]) * *((__m128d*)&xa[2]) + *((__m128d*)xa) * *((__m128d*)xa);
1140 | #else
1141 | vsumsq = _mm_add_pd( _mm_mul_pd( *((__m128d*)&xa[2]), *((__m128d*)&xa[2]) ),
1142 | _mm_mul_pd( *((__m128d*)xa), *((__m128d*)xa) ) );
1143 | #endif
1144 | sumsq += *((double*)&vsumsq) + ((double*)&vsumsq)[1];
1145 | }
1146 | for( ; i < n ; i++, xa++ ){
1147 | sumsq += *xa * *xa;
1148 | }
1149 | return sumsq;
1150 | }
1151 |
1152 | /*!
1153 | computes the cumulative sum of the values and their squares in double array xa[n] using SSE2 intrinsics
1154 | */
1155 | static inline double CumSumSumSq( double *xa, int n, double *sumSQ )
1156 | { __m128d vsum, vsumsq;
1157 | register int i, N_4 = n-4+1;
1158 | register double sum = 0.0, sumsq = 0;
1159 | for( i = 0 ; i < N_4 ; i+=4, xa+=4 ){
1160 | #ifdef __GNUC__
1161 | vsum = *((__m128d*)&xa[2]) + *((__m128d*)xa);
1162 | vsumsq = *((__m128d*)&xa[2]) * *((__m128d*)&xa[2]) + *((__m128d*)xa) * *((__m128d*)xa);
1163 | #else
1164 | vsum = _mm_add_pd( *((__m128d*)&xa[2]), *((__m128d*)xa) );
1165 | vsumsq = _mm_add_pd( _mm_mul_pd( *((__m128d*)&xa[2]), *((__m128d*)&xa[2]) ),
1166 | _mm_mul_pd( *((__m128d*)xa), *((__m128d*)xa) ) );
1167 | #endif
1168 | sum += *((double*)&vsum) + ((double*)&vsum)[1];
1169 | sumsq += *((double*)&vsumsq) + ((double*)&vsumsq)[1];
1170 | }
1171 | for( ; i < n ; i++, xa++ ){
1172 | sum += *xa;
1173 | sumsq += *xa * *xa;
1174 | }
1175 | *sumSQ = sumsq;
1176 | return sum;
1177 | }
1178 |
1179 | /*!
1180 | scalar version of CumSum without explicit SSE2 intrinsics
1181 | */
1182 | static inline double scalCumSum( double *xa, int n )
1183 | { register int i;
1184 | register double sum = 0.0;
1185 | for( i = 0 ; i < n ; i++ ){
1186 | sum += *xa++;
1187 | }
1188 | return sum;
1189 | }
1190 |
1191 | /*!
1192 | scalar version of CumSumSq without explicit SSE2 intrinsics
1193 | */
1194 | static inline double scalCumSumSq( double *xa, int n )
1195 | { register int i;
1196 | register double sumsq = 0.0;
1197 | for( i = 0 ; i < n ; i++, xa++ ){
1198 | sumsq += *xa * *xa;
1199 | }
1200 | return sumsq;
1201 | }
1202 |
1203 | /*!
1204 | scalar version of CumSumSumSq without explicit SSE2 intrinsics
1205 | */
1206 | static inline double scalCumSumSumSq( double *xa, int n, double *sumSQ )
1207 | { register int i;
1208 | register double sum = 0.0, sumsq = 0.0;
1209 | for( i = 0 ; i < n ; i++, xa++ ){
1210 | sum += *xa;
1211 | sumsq += *xa * *xa;
1212 | }
1213 | *sumSQ = sumsq;
1214 | return sum;
1215 | }
1216 |
1217 | /*!
1218 | computes the cumulative product of the double array xa[n] using SSE2 intrinsics
1219 | */
1220 | static inline double CumMul(double *xa, int N)
1221 | { double cum;
1222 | if( xa && N > 0 ){
1223 | v2df *va = (v2df*) xa, vcum = _MM_SET1_PD(1.0);
1224 | int i, N_4 = N-4+1;
1225 | for( i = 0 ; i < N_4 ; va+=2 ){
1226 | vcum = _mm_mul_pd( vcum, _mm_mul_pd( va[0], va[1] ) );
1227 | i += 4;
1228 | }
1229 | cum = VELEM(double,vcum,0) * VELEM(double,vcum,1);
1230 | for( ; i < N; i++ ){
1231 | cum *= xa[i];
1232 | }
1233 | }
1234 | else{
1235 | cum = 0.0;
1236 | }
1237 | return cum;
1238 | }
1239 |
1240 | #else
1241 |
1242 | /*!
1243 | computes the cumulative sum of the double array xa[n] using traditional scalar code
1244 | */
1245 | static inline double CumSum( double *xa, int n )
1246 | { register int i;
1247 | register double sum = 0.0;
1248 | for( i = 0 ; i < n ; i++ ){
1249 | sum += *xa++;
1250 | }
1251 | return sum;
1252 | }
1253 |
1254 | /*!
1255 | alternative for CumSum
1256 | */
1257 | static inline double scalCumSum( double *xa, int n )
1258 | {
1259 | return CumSum(xa,n);
1260 | }
1261 |
1262 | /*!
1263 | computes the cumulative sum of the squares of the values in double array xa[n] using traditional scalar code
1264 | */
1265 | static inline double CumSumSq( double *xa, int n )
1266 | { register int i;
1267 | register double sumsq = 0.0;
1268 | for( i = 0 ; i < n ; i++, xa++ ){
1269 | sumsq += *xa * *xa;
1270 | }
1271 | return sumsq;
1272 | }
1273 |
1274 | /*!
1275 | alternative for CumSumSq
1276 | */
1277 | static inline double scalCumSumSq( double *xa, int n )
1278 | {
1279 | return CumSumSq(xa,n);
1280 | }
1281 |
1282 | /*!
1283 | computes the cumulative sum of the values and their squares in double array xa[n] using traditional scalar code
1284 | */
1285 | static inline double CumSumSumSq( double *xa, int n, double *sumSQ )
1286 | { register int i;
1287 | register double sum = 0.0, sumsq = 0.0;
1288 | for( i = 0 ; i < n ; i++, xa++ ){
1289 | sum += *xa;
1290 | sumsq += *xa * *xa;
1291 | }
1292 | *sumSQ = sumsq;
1293 | return sum;
1294 | }
1295 |
1296 | /*!
1297 | alternative for CumSumSumSq
1298 | */
1299 | static inline double scalCumSumSumSq( double *xa, int n, double *sumSQ )
1300 | {
1301 | return CumSumSumSq(xa,n,sumSQ);
1302 | }
1303 |
1304 | #endif //USE_SSE2
1305 |
1306 | #endif // SSE_MATHFUN_WITH_CODE
1307 |
1308 | //// Some SSE "extensions", and equivalents not using SSE explicitly:
1309 | #pragma mark SSE extensions
1310 |
1311 | #ifdef USE_SSE2
1312 |
1313 | # if defined(__x86_64__) || defined(x86_64) || defined(_LP64)
1314 | // static inline v2df _mm_abs_pd( v2df a )
1315 | // { _PD_CONST_TYPE(abs_mask, long long, ~0x8000000000000000LL);
1316 | // return _mm_and_pd(a, *(v2df*)_pd_abs_mask);
1317 | // }
1318 | /*!
1319 | SSE2 'intrinsic' to take the absolute value of a
1320 | */
1321 | /*static inline v2df _mm_abs_pd( register v2df a )
1322 | { const static long long am1[2] = {~0x8000000000000000LL,~0x8000000000000000LL};
1323 | return _mm_and_pd(a, *((v2df*)am1) );
1324 | }
1325 | static inline double _mm_abs_sd( double a )
1326 | { const static long long am2 = {~0x8000000000000000LL};
1327 | v2si r = _mm_and_si64( *((v2si*)&a), *((v2si*)&am2) );
1328 | return *((double*) &r);
1329 | }*/
1330 | # else
1331 | // no native support for 64bit ints: don't lose time on that!
1332 | /*!
1333 | SSE2 'intrinsic' to take the absolute value of a
1334 | */
1335 | static inline v2df _mm_abs_pd( register v2df a )
1336 | { const v4si am1 = _mm_set_epi32(0x7fffffff,0xffffffff,0x7fffffff,0xffffffff);
1337 | return _mm_and_pd(a, *((v2df*)&am1) );
1338 | }
/*!
	absolute value of a scalar double via MMX bitmasking (32-bit build variant):
	the low 8 bytes of am1 (dwords ffffffff,7fffffff) clear the double's sign bit.
	NOTE(review): local am2 is declared but never used — the mask applied is am1.
*/
static inline double _mm_abs_sd( double a )
{ const static unsigned long long am2 = 0x7fffffffffffffffLL;
	const v4si am1 = _mm_set_epi32(0x7fffffff,0xffffffff,0x7fffffff,0xffffffff);
	v2si r = _mm_and_si64( *((v2si*)&a), *((v2si*)&am1) );
	/* _mm_and_si64 is an MMX operation: clear the MMX state before returning to FP code */
	_mm_empty();
	return *((double*)&r);
// union { double d; v2si r; } ret;
// ret.r = _mm_and_si64( *((v2si*)&a), *((v2si*)&am1) );
// a = ret.d;
// return a;
}
1350 | # endif // i386 or x86_64
1351 | static inline v4sf _mm_abs_ps( register v4sf a )
1352 | { const v4si am1 = _mm_set_epi32(0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff);
1353 | return _mm_and_ps(a, *((v4sf*)&am1) );
1354 | }
1355 |
1356 | /*!
1357 | clip a value to a min/max range
1358 | */
1359 | static inline v2df _mm_clip_pd( v2df val, v2df valMin, v2df valMax )
1360 | {
1361 | return _mm_max_pd( _mm_min_pd( val, valMax ), valMin );
1362 | }
1363 |
1364 | /*!
1365 | return an SSE2 vector of 2 doubles initialised with val0 and val1, clipped to
1366 | the specified range
1367 | */
1368 | static inline v2df _mm_setr_clipped_pd( double val0, double val1, v2df valMin, v2df valMax )
1369 | {
1370 | return _mm_clip_pd( _MM_SETR_PD(val0,val1), valMin, valMax );
1371 | }
1372 | #endif // USE_SSE2
1373 | #ifdef USE_SSE4
/*!
	ceil(a) computed with the SSE4.1 _mm_ceil_pd intrinsic applied to (a,0);
	element 0 of the result is returned.
*/
static inline double ssceil(double a)
{ v2df va = _mm_ceil_pd( _MM_SETR_PD(a,0) );
# if !defined(__x86_64__) && !defined(x86_64) && !defined(_LP64)
	/* 32-bit builds: clear the MMX state — presumably because other 32-bit helpers
	   in this header use MMX registers; TODO confirm it is needed here */
	_mm_empty();
# endif
	return *((double*)&va);
}
1381 |
/*!
	floor(a) computed with the SSE4.1 _mm_floor_pd intrinsic applied to (a,0);
	element 0 of the result is returned.
*/
static inline double ssfloor(double a)
{ v2df va = _mm_floor_pd( _MM_SETR_PD(a,0) );
# if !defined(__x86_64__) && !defined(x86_64) && !defined(_LP64)
	/* 32-bit builds: clear the MMX state — presumably because other 32-bit helpers
	   in this header use MMX registers; TODO confirm it is needed here */
	_mm_empty();
# endif
	return *((double*)&va);
}
/*!
	rounds a to the nearest integer value using the SSE4.1 _mm_round_pd intrinsic.
	NOTE(review): _MM_FROUND_TO_NEAREST_INT implements IEEE round-half-to-even,
	whereas the non-SSE4 fallback below rounds halves away from zero; the two
	branches therefore disagree for arguments exactly halfway between integers
	(e.g. 2.5 -> 2 here, 3 in the fallback) — confirm which behaviour is intended.
*/
static inline double ssround( double a )
{ v2df va = _mm_round_pd( _MM_SETR_PD(a,0), _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
# if !defined(__x86_64__) && !defined(x86_64) && !defined(_LP64)
	/* 32-bit builds: clear the MMX state — presumably because other 32-bit helpers
	   in this header use MMX registers; TODO confirm it is needed here */
	_mm_empty();
# endif
	return *((double*)&va);
}
1396 | #else
/*!
	ceiling of a; plain libm fallback used when SSE4.1 is not available.
*/
static inline double ssceil(double a)
{
	return ceil(a);
}
/*!
	floor of a; plain libm fallback used when SSE4.1 is not available.
*/
static inline double ssfloor(double a)
{
	return floor(a);
}
1405 | static inline double ssround( double a )
1406 | {
1407 | return (a >= 0)? floor( a + 0.5 ) : -ceil( -a - 0.5 );
1408 | }
1409 | #endif //USE_SSE4
1410 |
1411 |
1412 | // SSE-like convenience functions (note the absence of a leading _!)
1413 |
1414 | /*!
1415 | return an SSE2 vector of 2 doubles initialised with val0 and val1, clipped to
1416 | the specified range. Does not use SSE2 intrinsics.
1417 | */
1418 | static inline v2df *mm_setr_clipped_pd( v2df *val, double val0, double val1, v2df *valMin, v2df *valMax )
1419 | {
1420 | if( val0 > ((double*)valMax)[0] ){
1421 | ((double*)val)[0] = ((double*)valMax)[0];
1422 | }
1423 | else if( val0 < ((double*)valMin)[0] ){
1424 | ((double*)val)[0] = ((double*)valMin)[0];
1425 | }
1426 | else{
1427 | ((double*)val)[0] = val0;
1428 | }
1429 | if( val1 > ((double*)valMax)[1] ){
1430 | ((double*)val)[1] = ((double*)valMax)[1];
1431 | }
1432 | else if( val1 < ((double*)valMin)[1] ){
1433 | ((double*)val)[1] = ((double*)valMin)[1];
1434 | }
1435 | else{
1436 | ((double*)val)[1] = val1;
1437 | }
1438 | return val;
1439 | }
1440 |
1441 | /*!
1442 | SSE2 'intrinsic' to take the absolute value of a. Doesn't use SSE2 intrinsics
1443 | */
1444 | static inline v2df *mm_clip_pd( v2df *val, v2df *valMin, v2df *valMax )
1445 | {
1446 | if( ((double*)val)[0] > ((double*)valMax)[0] ){
1447 | ((double*)val)[0] = ((double*)valMax)[0];
1448 | }
1449 | else if( ((double*)val)[0] < ((double*)valMin)[0] ){
1450 | ((double*)val)[0] = ((double*)valMin)[0];
1451 | }
1452 | if( ((double*)val)[1] > ((double*)valMax)[1] ){
1453 | ((double*)val)[1] = ((double*)valMax)[1];
1454 | }
1455 | else if( ((double*)val)[1] < ((double*)valMin)[1] ){
1456 | ((double*)val)[1] = ((double*)valMin)[1];
1457 | }
1458 | return val;
1459 | }
1460 |
1461 | /*!
1462 | emulation of the _mm_add_pd SSE2 intrinsic
1463 | */
1464 | static inline v2df *mm_add_pd( v2df *c, v2df *a, v2df *b )
1465 | {
1466 | ((double*)c)[0] = ((double*)a)[0] + ((double*)b)[0];
1467 | ((double*)c)[1] = ((double*)a)[1] + ((double*)b)[1];
1468 | return c;
1469 | }
1470 |
1471 | /*!
1472 | emulation of the _mm_add_pd SSE2 intrinsic
1473 | */
1474 | static inline v2df *mm_sub_pd( v2df *c, v2df *a, v2df *b )
1475 | {
1476 | ((double*)c)[0] = ((double*)a)[0] - ((double*)b)[0];
1477 | ((double*)c)[1] = ((double*)a)[1] - ((double*)b)[1];
1478 | return c;
1479 | }
1480 |
1481 | /*!
1482 | emulation of the _mm_sub_pd SSE2 intrinsic
1483 | */
1484 | static inline v2df *mm_div_pd( v2df *c, v2df *a, v2df *b )
1485 | {
1486 | ((double*)c)[0] = ((double*)a)[0] / ((double*)b)[0];
1487 | ((double*)c)[1] = ((double*)a)[1] / ((double*)b)[1];
1488 | return c;
1489 | }
1490 |
1491 | /*!
1492 | emulation of the _mm_mul_pd SSE2 intrinsic
1493 | */
1494 | static inline v2df *mm_mul_pd( v2df *c, v2df *a, v2df *b )
1495 | {
1496 | ((double*)c)[0] = ((double*)a)[0] * ((double*)b)[0];
1497 | ((double*)c)[1] = ((double*)a)[1] * ((double*)b)[1];
1498 | return c;
1499 | }
1500 |
1501 | /*!
1502 | non SSE emulation of the _mm_abs_pd 'intrinsic' defined elsewhere in this file
1503 | */
1504 | static inline v2df *mm_abs_pd( v2df *val, v2df *a )
1505 | {
1506 | ((double*)val)[0] = (((double*)a)[0] >= 0)? ((double*)a)[0] : -((double*)a)[0];
1507 | ((double*)val)[1] = (((double*)a)[1] >= 1)? ((double*)a)[1] : -((double*)a)[1];
1508 | return val;
1509 | }
1510 |
1511 | /*!
1512 | emulation of the _mm_round_pd SSE4 intrinsic.
1513 | @n
1514 | NB: the SSE4 intrinsic is at least twice as fast as the non-SSE calculation, PER value
1515 | so it pays to replace round(x) with _mm_round_pd(_mm_setr_pd(x)) - idem for floor and ceil
1516 | */
1517 | static inline v2df *mm_round_pd( v2df *val, v2df *a )
1518 | {
1519 | ((double*)val)[0] = (((double*)a)[0] >= 0)? floor( ((double*)a)[0] + 0.5 ) : -ceil( -((double*)a)[0] - 0.5 );
1520 | ((double*)val)[1] = (((double*)a)[1] >= 0)? floor( ((double*)a)[1] + 0.5 ) : -ceil( -((double*)a)[1] - 0.5 );
1521 | return val;
1522 | }
1523 |
1524 | #define _SSE_MATHFUN_H
1525 | #endif
1526 |
--------------------------------------------------------------------------------