├── meson.build
├── README.md
├── FFTSpectrum.vcxproj.filters
├── FFTSpectrum.sln
├── .github
└── workflows
│ └── build.yaml
├── .gitattributes
├── .gitignore
├── FFTSpectrum.vcxproj
├── FFTSpectrum.c
├── COPYING
└── sse_mathfun.h
/meson.build:
--------------------------------------------------------------------------------
# Build definition for the FFTSpectrum VapourSynth plugin.
project('FFTSpectrum', 'c',
  default_options : ['buildtype=release', 'b_ndebug=if-release', 'c_std=c99'],
  meson_version : '>=0.49.0',
  version : '2'
)

sources = 'FFTSpectrum.c'

compiler = meson.get_compiler('c')

# MSVC builds (the CI workflow) resolve FFTW via the 'fftwf' pkg-config name
# and do not install anywhere meaningful; other toolchains use 'fftw3f' and
# install next to the user's other VapourSynth plugins.
if compiler.get_argument_syntax() == 'msvc'
  deps = [ dependency('fftwf') ]
  install_dir = 'installed' # dummy
else
  # Only compile args/includes are taken from VapourSynth: the plugin must
  # not link against libvapoursynth itself.
  vapoursynth_dep = dependency('vapoursynth').partial_dependency(compile_args : true, includes : true)
  deps = [ dependency('fftw3f'), vapoursynth_dep ]
  install_dir = join_paths(vapoursynth_dep.get_pkgconfig_variable('libdir'), 'vapoursynth')
endif

shared_module('fftspectrum', sources,
  dependencies : deps,
  install : true,
  install_dir : install_dir,
  gnu_symbol_visibility : 'hidden'
)
26 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | FFTSpectrum
2 | ===========
3 |
4 | A VapourSynth filter that displays the FFT frequency spectrum of a given clip.
5 | Supposedly useful for determining the original resolution of upscaled anime content.
6 |
7 | Usage
8 | -----
9 |
10 | fftspectrum.FFTSpectrum(clip clip, bint grid=False)
11 |
12 | * **clip** - Clip to process. It must have constant format and dimensions, and a luma plane with 8-bit integer samples.
13 | * **grid** - Specifies whether a grid with origin at the center of the image and spacing of 100 pixels should be drawn over the resulting spectrum.
14 |
15 | Examples
16 | --------
17 |
18 | Without grid:
19 | 
20 |
21 | With grid:
22 | 
23 |
24 | Credits
25 | -------
26 |
27 | FFTSpectrum is based on the AviUtl filter with the same name, written by Hiroaki Gotou in 2008.
28 |
--------------------------------------------------------------------------------
/FFTSpectrum.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx
7 |
8 |
9 | {93995380-89BD-4b04-88EB-625FBE52EBFB}
10 | h;hh;hpp;hxx;hm;inl;inc;xsd
11 |
12 |
13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav
15 |
16 |
17 |
18 |
19 | Source Files
20 |
21 |
22 |
23 |
24 | Resource Files
25 |
26 |
27 |
--------------------------------------------------------------------------------
/FFTSpectrum.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 15
4 | VisualStudioVersion = 15.0.28307.271
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "FFTSpectrum", "FFTSpectrum.vcxproj", "{FE7D4925-2B0B-4B0F-B63E-E0EBE9DDFE97}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|x64 = Debug|x64
11 | Debug|x86 = Debug|x86
12 | Release|x64 = Release|x64
13 | Release|x86 = Release|x86
14 | EndGlobalSection
15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
16 | {FE7D4925-2B0B-4B0F-B63E-E0EBE9DDFE97}.Debug|x64.ActiveCfg = Debug|x64
17 | {FE7D4925-2B0B-4B0F-B63E-E0EBE9DDFE97}.Debug|x64.Build.0 = Debug|x64
18 | {FE7D4925-2B0B-4B0F-B63E-E0EBE9DDFE97}.Debug|x86.ActiveCfg = Debug|Win32
19 | {FE7D4925-2B0B-4B0F-B63E-E0EBE9DDFE97}.Debug|x86.Build.0 = Debug|Win32
20 | {FE7D4925-2B0B-4B0F-B63E-E0EBE9DDFE97}.Release|x64.ActiveCfg = Release|x64
21 | {FE7D4925-2B0B-4B0F-B63E-E0EBE9DDFE97}.Release|x64.Build.0 = Release|x64
22 | {FE7D4925-2B0B-4B0F-B63E-E0EBE9DDFE97}.Release|x86.ActiveCfg = Release|Win32
23 | {FE7D4925-2B0B-4B0F-B63E-E0EBE9DDFE97}.Release|x86.Build.0 = Release|Win32
24 | EndGlobalSection
25 | GlobalSection(SolutionProperties) = preSolution
26 | HideSolutionNode = FALSE
27 | EndGlobalSection
28 | GlobalSection(ExtensibilityGlobals) = postSolution
29 | SolutionGuid = {7698D0B8-4303-4929-9A07-FA78A512652F}
30 | EndGlobalSection
31 | EndGlobal
32 |
--------------------------------------------------------------------------------
/.github/workflows/build.yaml:
--------------------------------------------------------------------------------
# CI: build the plugin for 64-bit Windows with MSVC + Meson, linking a
# static FFTW (AVX2-enabled) supplied by vcpkg.
# NOTE(review): leading indentation was lost in this dump; restored to the
# conventional GitHub Actions layout — confirm against the original file.
name: Build for Windows

on: [push, pull_request]

jobs:
  build:
    strategy:
      matrix:
        platform: [ windows-latest ]
        arch: [ x64 ]
    runs-on: ${{ matrix.platform }}
    steps:
    - uses: actions/checkout@v2
      with:
        submodules: 'recursive'

    - name: Run vcpkg
      uses: lukka/run-vcpkg@v4
      with:
        vcpkgArguments: 'fftw3[avx2]:x64-windows-static'
        vcpkgDirectory: '${{ github.workspace }}/vcpkg'
        vcpkgGitCommitId: 5568f110b509a9fd90711978a7cb76bae75bb092 # 2021.05.12 release

    - name: Setup Python
      uses: actions/setup-python@v1
      with:
        python-version: '3.x'
    - name: install meson and ninja
      run: pip install meson ninja

    # FFTSpectrum.c includes "vapoursynth/VapourSynth.h", so the headers are
    # copied up one level to satisfy that include path.
    - name: download VS headers and patch header location
      shell: bash
      run: |
        git clone https://github.com/vapoursynth/vapoursynth --depth=1 --branch R54
        cp vapoursynth/include/*.h vapoursynth/

    - name: setup MS dev commands
      uses: ilammy/msvc-dev-cmd@v1
      with:
        arch: ${{ matrix.arch }}
    - name: Install pkg-config lite
      run: choco install pkgconfiglite
    - name: Meson setup
      run: meson setup builddir/ -Db_vscrt=mt -Dpkg_config_path=${{ github.workspace }}/vcpkg/installed/x64-windows-static/lib/pkgconfig
    - name: Meson compile
      run: meson compile -C builddir/ -v
    - name: Upload artifact
      uses: actions/upload-artifact@v2
      with:
        name: release-${{matrix.arch}}
        path: |
          builddir/*.dll
53 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # Set default behavior to automatically normalize line endings.
3 | ###############################################################################
4 | * text=auto
5 |
6 | ###############################################################################
7 | # Set default behavior for command prompt diff.
8 | #
9 | # This is need for earlier builds of msysgit that does not have it on by
10 | # default for csharp files.
11 | # Note: This is only used by command line
12 | ###############################################################################
13 | #*.cs diff=csharp
14 |
15 | ###############################################################################
16 | # Set the merge driver for project and solution files
17 | #
18 | # Merging from the command prompt will add diff markers to the files if there
19 | # are conflicts (Merging from VS is not affected by the settings below, in VS
20 | # the diff markers are never inserted). Diff markers may cause the following
21 | # file extensions to fail to load in VS. An alternative would be to treat
22 | # these files as binary and thus will always conflict and require user
23 | # intervention with every merge. To do so, just uncomment the entries below
24 | ###############################################################################
25 | #*.sln merge=binary
26 | #*.csproj merge=binary
27 | #*.vbproj merge=binary
28 | #*.vcxproj merge=binary
29 | #*.vcproj merge=binary
30 | #*.dbproj merge=binary
31 | #*.fsproj merge=binary
32 | #*.lsproj merge=binary
33 | #*.wixproj merge=binary
34 | #*.modelproj merge=binary
35 | #*.sqlproj merge=binary
36 | #*.wwaproj merge=binary
37 |
38 | ###############################################################################
39 | # behavior for image files
40 | #
41 | # image files are treated as binary by default.
42 | ###############################################################################
43 | #*.jpg binary
44 | #*.png binary
45 | #*.gif binary
46 |
47 | ###############################################################################
48 | # diff behavior for common document formats
49 | #
50 | # Convert binary document formats to text before diffing them. This feature
51 | # is only available from the command line. Turn it on by uncommenting the
52 | # entries below.
53 | ###############################################################################
54 | #*.doc diff=astextplain
55 | #*.DOC diff=astextplain
56 | #*.docx diff=astextplain
57 | #*.DOCX diff=astextplain
58 | #*.dot diff=astextplain
59 | #*.DOT diff=astextplain
60 | #*.pdf diff=astextplain
61 | #*.PDF diff=astextplain
62 | #*.rtf diff=astextplain
63 | #*.RTF diff=astextplain
64 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 |
4 | # User-specific files
5 | *.suo
6 | *.user
7 | *.userosscache
8 | *.sln.docstates
9 |
10 | # User-specific files (MonoDevelop/Xamarin Studio)
11 | *.userprefs
12 |
13 | # Build results
14 | [Dd]ebug/
15 | [Dd]ebugPublic/
16 | [Rr]elease/
17 | [Rr]eleases/
18 | x64/
19 | x86/
20 | bld/
21 | [Bb]in/
22 | [Oo]bj/
23 | [Ll]og/
24 |
25 | # Visual Studio 2015 cache/options directory
26 | .vs/
27 | # Uncomment if you have tasks that create the project's static files in wwwroot
28 | #wwwroot/
29 |
30 | # MSTest test Results
31 | [Tt]est[Rr]esult*/
32 | [Bb]uild[Ll]og.*
33 |
34 | # NUNIT
35 | *.VisualState.xml
36 | TestResult.xml
37 |
38 | # Build Results of an ATL Project
39 | [Dd]ebugPS/
40 | [Rr]eleasePS/
41 | dlldata.c
42 |
43 | # DNX
44 | project.lock.json
45 | project.fragment.lock.json
46 | artifacts/
47 |
48 | *_i.c
49 | *_p.c
50 | *_i.h
51 | *.ilk
52 | *.meta
53 | *.obj
54 | *.pch
55 | *.pdb
56 | *.pgc
57 | *.pgd
58 | *.rsp
59 | *.sbr
60 | *.tlb
61 | *.tli
62 | *.tlh
63 | *.tmp
64 | *.tmp_proj
65 | *.log
66 | *.vspscc
67 | *.vssscc
68 | .builds
69 | *.pidb
70 | *.svclog
71 | *.scc
72 |
73 | # Chutzpah Test files
74 | _Chutzpah*
75 |
76 | # Visual C++ cache files
77 | ipch/
78 | *.aps
79 | *.ncb
80 | *.opendb
81 | *.opensdf
82 | *.sdf
83 | *.cachefile
84 | *.VC.db
85 | *.VC.VC.opendb
86 |
87 | # Visual Studio profiler
88 | *.psess
89 | *.vsp
90 | *.vspx
91 | *.sap
92 |
93 | # TFS 2012 Local Workspace
94 | $tf/
95 |
96 | # Guidance Automation Toolkit
97 | *.gpState
98 |
99 | # ReSharper is a .NET coding add-in
100 | _ReSharper*/
101 | *.[Rr]e[Ss]harper
102 | *.DotSettings.user
103 |
104 | # JustCode is a .NET coding add-in
105 | .JustCode
106 |
107 | # TeamCity is a build add-in
108 | _TeamCity*
109 |
110 | # DotCover is a Code Coverage Tool
111 | *.dotCover
112 |
113 | # NCrunch
114 | _NCrunch_*
115 | .*crunch*.local.xml
116 | nCrunchTemp_*
117 |
118 | # MightyMoose
119 | *.mm.*
120 | AutoTest.Net/
121 |
122 | # Web workbench (sass)
123 | .sass-cache/
124 |
125 | # Installshield output folder
126 | [Ee]xpress/
127 |
128 | # DocProject is a documentation generator add-in
129 | DocProject/buildhelp/
130 | DocProject/Help/*.HxT
131 | DocProject/Help/*.HxC
132 | DocProject/Help/*.hhc
133 | DocProject/Help/*.hhk
134 | DocProject/Help/*.hhp
135 | DocProject/Help/Html2
136 | DocProject/Help/html
137 |
138 | # Click-Once directory
139 | publish/
140 |
141 | # Publish Web Output
142 | *.[Pp]ublish.xml
143 | *.azurePubxml
144 | # TODO: Comment the next line if you want to checkin your web deploy settings
145 | # but database connection strings (with potential passwords) will be unencrypted
146 | #*.pubxml
147 | *.publishproj
148 |
149 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
150 | # checkin your Azure Web App publish settings, but sensitive information contained
151 | # in these scripts will be unencrypted
152 | PublishScripts/
153 |
154 | # NuGet Packages
155 | *.nupkg
156 | # The packages folder can be ignored because of Package Restore
157 | **/packages/*
158 | # except build/, which is used as an MSBuild target.
159 | !**/packages/build/
160 | # Uncomment if necessary however generally it will be regenerated when needed
161 | #!**/packages/repositories.config
162 | # NuGet v3's project.json files produces more ignoreable files
163 | *.nuget.props
164 | *.nuget.targets
165 |
166 | # Microsoft Azure Build Output
167 | csx/
168 | *.build.csdef
169 |
170 | # Microsoft Azure Emulator
171 | ecf/
172 | rcf/
173 |
174 | # Windows Store app package directories and files
175 | AppPackages/
176 | BundleArtifacts/
177 | Package.StoreAssociation.xml
178 | _pkginfo.txt
179 |
180 | # Visual Studio cache files
181 | # files ending in .cache can be ignored
182 | *.[Cc]ache
183 | # but keep track of directories ending in .cache
184 | !*.[Cc]ache/
185 |
186 | # Others
187 | ClientBin/
188 | ~$*
189 | *~
190 | *.dbmdl
191 | *.dbproj.schemaview
192 | *.jfm
193 | *.pfx
194 | *.publishsettings
195 | node_modules/
196 | orleans.codegen.cs
197 |
198 | # Since there are multiple workflows, uncomment next line to ignore bower_components
199 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
200 | #bower_components/
201 |
202 | # RIA/Silverlight projects
203 | Generated_Code/
204 |
205 | # Backup & report files from converting an old project file
206 | # to a newer Visual Studio version. Backup files are not needed,
207 | # because we have git ;-)
208 | _UpgradeReport_Files/
209 | Backup*/
210 | UpgradeLog*.XML
211 | UpgradeLog*.htm
212 |
213 | # SQL Server files
214 | *.mdf
215 | *.ldf
216 |
217 | # Business Intelligence projects
218 | *.rdl.data
219 | *.bim.layout
220 | *.bim_*.settings
221 |
222 | # Microsoft Fakes
223 | FakesAssemblies/
224 |
225 | # GhostDoc plugin setting file
226 | *.GhostDoc.xml
227 |
228 | # Node.js Tools for Visual Studio
229 | .ntvs_analysis.dat
230 |
231 | # Visual Studio 6 build log
232 | *.plg
233 |
234 | # Visual Studio 6 workspace options file
235 | *.opt
236 |
237 | # Visual Studio LightSwitch build output
238 | **/*.HTMLClient/GeneratedArtifacts
239 | **/*.DesktopClient/GeneratedArtifacts
240 | **/*.DesktopClient/ModelManifest.xml
241 | **/*.Server/GeneratedArtifacts
242 | **/*.Server/ModelManifest.xml
243 | _Pvt_Extensions
244 |
245 | # Paket dependency manager
246 | .paket/paket.exe
247 | paket-files/
248 |
249 | # FAKE - F# Make
250 | .fake/
251 |
252 | # JetBrains Rider
253 | .idea/
254 | *.sln.iml
255 |
256 | # CodeRush
257 | .cr/
258 |
259 | # Python Tools for Visual Studio (PTVS)
260 | __pycache__/
261 | *.pyc
--------------------------------------------------------------------------------
/FFTSpectrum.vcxproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | Win32
7 |
8 |
9 | Release
10 | Win32
11 |
12 |
13 | Debug
14 | x64
15 |
16 |
17 | Release
18 | x64
19 |
20 |
21 |
22 | 15.0
23 | {FE7D4925-2B0B-4B0F-B63E-E0EBE9DDFE97}
24 | Win32Proj
25 | 8.1
26 |
27 |
28 |
29 | DynamicLibrary
30 | true
31 | v141
32 |
33 |
34 | DynamicLibrary
35 | false
36 | v141
37 |
38 |
39 | DynamicLibrary
40 | true
41 | v141
42 |
43 |
44 | DynamicLibrary
45 | false
46 | v141
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 | true
68 |
69 |
70 | true
71 |
72 |
73 |
74 | WIN32;_DEBUG;_WINDOWS;_USRDLL;FFTSPECTRUM_EXPORTS;%(PreprocessorDefinitions)
75 | $(ProjectDir)..\fftw-build;$(ProjectDir)..\vapoursynth-sdk\include\vapoursynth;%(AdditionalIncludeDirectories)
76 | MultiThreadedDebugDLL
77 | Level3
78 | ProgramDatabase
79 | Disabled
80 |
81 |
82 | MachineX86
83 | true
84 | Windows
85 | $(ProjectDir)..\fftw-build\fftw3f.lib;%(AdditionalDependencies)
86 |
87 |
88 |
89 |
90 | WIN32;NDEBUG;_WINDOWS;_USRDLL;FFTSPECTRUM_EXPORTS;%(PreprocessorDefinitions)
91 | ..\fftw-build;..\vapoursynth-sdk\include;%(AdditionalIncludeDirectories)
92 | MultiThreadedDLL
93 | Level3
94 | ProgramDatabase
95 |
96 |
97 | MachineX86
98 | true
99 | Windows
100 | true
101 | true
102 |
103 |
104 |
105 |
106 | $(ProjectDir)..\fftw-build;$(ProjectDir)..\vapoursynth-sdk\include;%(AdditionalIncludeDirectories)
107 |
108 |
109 | fftw3f.lib;ucrtd.lib;vcruntimed.lib;%(AdditionalDependencies)
110 | true
111 | $(ProjectDir)..\fftw-build;%(AdditionalLibraryDirectories)
112 |
113 |
114 |
115 |
116 | $(ProjectDir)..\fftw-build;$(ProjectDir)..\vapoursynth-sdk\include;$(ProjectDir);%(AdditionalIncludeDirectories)
117 |
118 |
119 | $(ProjectDir)..\fftw-build;%(AdditionalLibraryDirectories)
120 | fftw3f.lib;ucrt.lib;vcruntime.lib;%(AdditionalDependencies)
121 | true
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
--------------------------------------------------------------------------------
/FFTSpectrum.c:
--------------------------------------------------------------------------------
1 |
2 | /*
3 | * Copyright (c) 2008 Hiroaki Gotou.
4 | * Copyright (c) 2019 Evgeny Marchenkov.
5 | *
6 | * This program is free software : you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 2 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
18 | */
19 |
20 | #include
21 | #include
22 |
23 | #if defined(_MSC_VER)
24 | #include
25 |
26 | #define USE_SSE_AUTO
27 | #define __SSE4_2__
28 | #define __x86_64__
29 | #define SSE_MATHFUN_WITH_CODE
30 | #include "sse_mathfun.h"
31 | #undef SSE_MATHFUN_WITH_CODE
32 | #undef __x86_64__
33 | #undef __SSE4_2__
34 | #undef USE_SSE_AUTO
35 | #undef inline
36 |
37 | #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
38 | #include
39 |
40 | #define USE_SSE4
41 | #define SSE_MATHFUN_WITH_CODE
42 | #include "sse_mathfun.h"
43 |
44 | #endif
45 |
46 | #include "fftw3.h"
47 |
48 | #include "vapoursynth/VapourSynth.h"
49 | #include "vapoursynth/VSHelper.h"
50 |
51 |
52 | typedef struct {
53 | VSNodeRef *node;
54 | const VSVideoInfo *in_vi;
55 | VSVideoInfo out_vi;
56 |
57 | bool show_grid;
58 |
59 | fftwf_complex *fft_in;
60 | fftwf_complex *fft_out;
61 | fftwf_plan p;
62 | float *abs_array;
63 | } FFTSpectrumData;
64 |
65 | static void fill_fft_input_array(fftwf_complex *dst, const uint8_t *src, int width, int height, int stride) {
66 | fftwf_complex *dstp = dst;
67 | const uint8_t *srcp = src;
68 | const int mod16_width = width - (width % 16);
69 |
70 | for (int y = 0; y < height; y++) {
71 | for (int x = 0; x < mod16_width; x += 16) {
72 | __m128i in_buffer, epu32_buffer;
73 | __m128 cvt_buffer, out_buffer[2];
74 | const __m128 sse_zero = _mm_setzero_ps();
75 |
76 | in_buffer = _mm_load_si128((const __m128i *)srcp);
77 |
78 | for (int j = 0; j < 4; j++) {
79 | epu32_buffer = _mm_cvtepu8_epi32(in_buffer);
80 | cvt_buffer = _mm_cvtepi32_ps(epu32_buffer);
81 |
82 | out_buffer[0] = _mm_unpacklo_ps(cvt_buffer, sse_zero);
83 | out_buffer[1] = _mm_unpackhi_ps(cvt_buffer, sse_zero);
84 |
85 | _mm_store_ps((float *)(dstp), out_buffer[0]);
86 | _mm_store_ps((float *)(dstp + 2), out_buffer[1]);
87 |
88 | in_buffer = _mm_shuffle_epi32(in_buffer, _MM_SHUFFLE(0, 3, 2, 1));
89 |
90 | dstp += 4;
91 | }
92 |
93 | srcp += 16;
94 | }
95 | for (int x = mod16_width; x < width; x++) {
96 | *dstp[0] = (float)*srcp;
97 | *dstp[1] = 0.0;
98 | srcp++;
99 | dstp++;
100 | }
101 | srcp += stride - width;
102 | }
103 | }
104 |
105 | static void calculate_absolute_values(float *dst, fftwf_complex *src, int length) {
106 | fftwf_complex *srcp = src;
107 | float *dstp = dst;
108 | const int mod4_length = length - (length % 4);
109 |
110 | for (int i = 0; i < mod4_length; i += 4) {
111 | __m128 in_buffer[2], mul_buffer[2], add_buffer, out_buffer;
112 | const __m128 sse_one = _mm_set_ps1(1.0f);
113 |
114 | in_buffer[0] = _mm_load_ps((float *)(srcp));
115 | in_buffer[1] = _mm_load_ps((float *)(srcp + 2));
116 |
117 | mul_buffer[0] = _mm_mul_ps(in_buffer[0], in_buffer[0]);
118 | mul_buffer[1] = _mm_mul_ps(in_buffer[1], in_buffer[1]);
119 |
120 | add_buffer = _mm_hadd_ps(mul_buffer[0], mul_buffer[1]);
121 | add_buffer = _mm_sqrt_ps(add_buffer);
122 | add_buffer = _mm_add_ps(add_buffer, sse_one);
123 |
124 | out_buffer = log_ps(add_buffer);
125 |
126 | _mm_store_ps(dstp, out_buffer);
127 |
128 | srcp += 4;
129 | dstp += 4;
130 | }
131 | for (int i = mod4_length; i < length; i++) {
132 | dstp[i] = logf(sqrtf(src[i][0] * src[i][0] + src[i][1] * src[i][1]) + 1.0);
133 | }
134 | }
135 |
/* Renders the magnitude array as an 8-bit image with the DC term moved to
 * the image center (quadrant swap, like fftshift). Values below half of the
 * maximum are blanked; the rest are scaled to 0..255.
 *
 * dst    - output plane, at least stride * height bytes
 * src    - width * height log-magnitudes from calculate_absolute_values()
 * stride - output stride in bytes (>= width)
 */
static void draw_fft_spectrum(uint8_t *dst, float *src, int width, int height, int stride) {
    float max = 0;

    /* Clear everything, including the stride padding. */
    memset(dst, 0, (size_t)stride * height);

    /* Find the peak, skipping src[0]: the DC component would otherwise
     * dominate the scale. */
    for (int i = 1; i < height * width; i++) {
        if (src[i] > max) {
            max = src[i];
        }
    }

    /* BUG FIX: a uniform input plane gives max == 0; the original then
     * divided by zero, producing NaN and an undefined cast to uint8_t.
     * The cleared (all-black) frame is the correct result here. */
    if (max <= 0)
        return;

    for (int y = 0; y < height; y++) {
        /* Swap quadrants so the origin lands at the image center. */
        const int dst_y = (y < height / 2) ? y + height / 2 : y - height / 2;
        for (int x = 0; x < width; x++) {
            const int dst_x = (x < width / 2) ? x + width / 2 : x - width / 2;

            /* Threshold at half the peak, then scale to 0..255 and clamp. */
            float buf = src[x + y * width] > max / 2 ? src[x + y * width] : 0;
            buf = 255 * buf / max;
            if (buf < 0) buf = 0;
            if (buf > 255) buf = 255;

            dst[dst_x + stride * dst_y] = (uint8_t)buf;
        }
    }
}
176 |
/* Overlays white grid lines, spaced 100 pixels apart, positioned so that a
 * line passes through the image center in each direction. Only sets pixels
 * to 255; everything else in buf is left untouched. */
static void draw_grid(uint8_t *buf, int width, int height, int stride) {
    const int first_col = (width / 2) % 100;
    const int first_row = (height / 2) % 100;

    /* Vertical lines. */
    for (int row = 0; row < height; row++) {
        uint8_t *line = buf + row * stride;
        for (int col = first_col; col < width; col += 100)
            line[col] = 255;
    }

    /* Horizontal lines. */
    for (int row = first_row; row < height; row += 100) {
        uint8_t *line = buf + row * stride;
        for (int col = 0; col < width; col++)
            line[col] = 255;
    }
}
190 |
191 | static void VS_CC fftSpectrumInit(VSMap *in, VSMap *out, void **instanceData, VSNode *node, VSCore *core, const VSAPI *vsapi) {
192 | FFTSpectrumData *d = (FFTSpectrumData *) * instanceData;
193 | d->out_vi = *d->in_vi;
194 | d->out_vi.format = vsapi->getFormatPreset(pfGray8, core);
195 | vsapi->setVideoInfo(&d->out_vi, 1, node);
196 | }
197 |
198 | static const VSFrameRef *VS_CC fftSpectrumGetFrame(int n, int activationReason, void **instanceData, void **frameData, VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi) {
199 | FFTSpectrumData *d = (FFTSpectrumData *) * instanceData;
200 |
201 | if (activationReason == arInitial) {
202 | vsapi->requestFrameFilter(n, d->node, frameCtx);
203 | } else if (activationReason == arAllFramesReady) {
204 |
205 | const VSFrameRef *src = vsapi->getFrameFilter(n, d->node, frameCtx);
206 | VSFrameRef *dst = vsapi->newVideoFrame(d->out_vi.format, d->out_vi.width, d->out_vi.height, src, core);
207 |
208 | fill_fft_input_array(d->fft_in, vsapi->getReadPtr(src, 0), d->in_vi->width, d->in_vi->height, vsapi->getStride(src, 0));
209 |
210 | fftwf_execute_dft(d->p, d->fft_in, d->fft_out);
211 |
212 | calculate_absolute_values(d->abs_array, d->fft_out, (d->in_vi->width * d->in_vi->height));
213 |
214 | draw_fft_spectrum(vsapi->getWritePtr(dst, 0), d->abs_array, d->out_vi.width, d->out_vi.height, vsapi->getStride(dst, 0));
215 |
216 | if (d->show_grid) {
217 | draw_grid(vsapi->getWritePtr(dst, 0), d->out_vi.width, d->out_vi.height, vsapi->getStride(dst, 0));
218 | }
219 |
220 | vsapi->freeFrame(src);
221 |
222 | return dst;
223 | }
224 |
225 | return 0;
226 | }
227 |
228 | static void VS_CC fftSpectrumFree(void *instanceData, VSCore *core, const VSAPI *vsapi) {
229 | FFTSpectrumData *d = (FFTSpectrumData *)instanceData;
230 | vsapi->freeNode(d->node);
231 | VS_ALIGNED_FREE(d->fft_in);
232 | VS_ALIGNED_FREE(d->fft_out);
233 | VS_ALIGNED_FREE(d->abs_array);
234 | fftwf_destroy_plan(d->p);
235 | free(d);
236 | }
237 |
238 | static void VS_CC fftSpectrumCreate(const VSMap *in, VSMap *out, void *userData, VSCore *core, const VSAPI *vsapi) {
239 | FFTSpectrumData *d;
240 | d = malloc(sizeof(FFTSpectrumData));
241 |
242 | int err;
243 |
244 | d->node = vsapi->propGetNode(in, "clip", 0, 0);
245 | d->in_vi = vsapi->getVideoInfo(d->node);
246 |
247 | if (!isConstantFormat(d->in_vi) || d->in_vi->format->sampleType != stInteger || d->in_vi->format->bitsPerSample != 8 ||
248 | d->in_vi->format->colorFamily == cmRGB || d->in_vi->format->colorFamily == cmCompat) {
249 | vsapi->setError(out, "FFTSpectrum: only constant format 8bit integer luma-containing input supported");
250 | vsapi->freeNode(d->node);
251 | free(d);
252 | return;
253 | }
254 |
255 | d->show_grid = (bool)vsapi->propGetInt(in, "grid", 0, &err);
256 | if (err) {
257 | d->show_grid = false;
258 | }
259 |
260 | VS_ALIGNED_MALLOC(&d->fft_in, (d->in_vi->width * d->in_vi->height * sizeof(fftw_complex)), 32);
261 | VS_ALIGNED_MALLOC(&d->fft_out, (d->in_vi->width * d->in_vi->height * sizeof(fftw_complex)), 32);
262 | VS_ALIGNED_MALLOC(&d->abs_array, (d->in_vi->width * d->in_vi->height * sizeof(float)), 32);
263 |
264 | memset(d->fft_in, 0, (d->in_vi->width * d->in_vi->height * sizeof(fftw_complex)));
265 | memset(d->fft_out, 0, (d->in_vi->width * d->in_vi->height * sizeof(fftw_complex)));
266 | memset(d->abs_array, 0, (d->in_vi->width * d->in_vi->height * sizeof(float)));
267 |
268 | d->p = fftwf_plan_dft_2d(d->in_vi->height, d->in_vi->width, d->fft_in, d->fft_out, FFTW_FORWARD, FFTW_MEASURE | FFTW_DESTROY_INPUT);
269 |
270 | vsapi->createFilter(in, out, "FFTSpectrum", fftSpectrumInit, fftSpectrumGetFrame, fftSpectrumFree, fmParallelRequests, 0, d, core);
271 | }
272 |
273 |
/* Plugin entry point: registers the plugin namespace and the single
 * FFTSpectrum function (clip: input clip; grid: optional int flag for the
 * 100-pixel grid overlay). */
VS_EXTERNAL_API(void) VapourSynthPluginInit(VSConfigPlugin configFunc, VSRegisterFunction registerFunc, VSPlugin *plugin) {
    configFunc("org.beatrice-raws.fftspectrum", "fftspectrum", "FFT Spectrum plugin", VAPOURSYNTH_API_VERSION, 1, plugin);
    registerFunc("FFTSpectrum", "clip:clip;grid:int:opt;", fftSpectrumCreate, 0, plugin);
}
278 |
--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 2, June 1991
3 |
4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6 | Everyone is permitted to copy and distribute verbatim copies
7 | of this license document, but changing it is not allowed.
8 |
9 | Preamble
10 |
11 | The licenses for most software are designed to take away your
12 | freedom to share and change it. By contrast, the GNU General Public
13 | License is intended to guarantee your freedom to share and change free
14 | software--to make sure the software is free for all its users. This
15 | General Public License applies to most of the Free Software
16 | Foundation's software and to any other program whose authors commit to
17 | using it. (Some other Free Software Foundation software is covered by
18 | the GNU Lesser General Public License instead.) You can apply it to
19 | your programs, too.
20 |
21 | When we speak of free software, we are referring to freedom, not
22 | price. Our General Public Licenses are designed to make sure that you
23 | have the freedom to distribute copies of free software (and charge for
24 | this service if you wish), that you receive source code or can get it
25 | if you want it, that you can change the software or use pieces of it
26 | in new free programs; and that you know you can do these things.
27 |
28 | To protect your rights, we need to make restrictions that forbid
29 | anyone to deny you these rights or to ask you to surrender the rights.
30 | These restrictions translate to certain responsibilities for you if you
31 | distribute copies of the software, or if you modify it.
32 |
33 | For example, if you distribute copies of such a program, whether
34 | gratis or for a fee, you must give the recipients all the rights that
35 | you have. You must make sure that they, too, receive or can get the
36 | source code. And you must show them these terms so they know their
37 | rights.
38 |
39 | We protect your rights with two steps: (1) copyright the software, and
40 | (2) offer you this license which gives you legal permission to copy,
41 | distribute and/or modify the software.
42 |
43 | Also, for each author's protection and ours, we want to make certain
44 | that everyone understands that there is no warranty for this free
45 | software. If the software is modified by someone else and passed on, we
46 | want its recipients to know that what they have is not the original, so
47 | that any problems introduced by others will not reflect on the original
48 | authors' reputations.
49 |
50 | Finally, any free program is threatened constantly by software
51 | patents. We wish to avoid the danger that redistributors of a free
52 | program will individually obtain patent licenses, in effect making the
53 | program proprietary. To prevent this, we have made it clear that any
54 | patent must be licensed for everyone's free use or not licensed at all.
55 |
56 | The precise terms and conditions for copying, distribution and
57 | modification follow.
58 |
59 | GNU GENERAL PUBLIC LICENSE
60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
61 |
62 | 0. This License applies to any program or other work which contains
63 | a notice placed by the copyright holder saying it may be distributed
64 | under the terms of this General Public License. The "Program", below,
65 | refers to any such program or work, and a "work based on the Program"
66 | means either the Program or any derivative work under copyright law:
67 | that is to say, a work containing the Program or a portion of it,
68 | either verbatim or with modifications and/or translated into another
69 | language. (Hereinafter, translation is included without limitation in
70 | the term "modification".) Each licensee is addressed as "you".
71 |
72 | Activities other than copying, distribution and modification are not
73 | covered by this License; they are outside its scope. The act of
74 | running the Program is not restricted, and the output from the Program
75 | is covered only if its contents constitute a work based on the
76 | Program (independent of having been made by running the Program).
77 | Whether that is true depends on what the Program does.
78 |
79 | 1. You may copy and distribute verbatim copies of the Program's
80 | source code as you receive it, in any medium, provided that you
81 | conspicuously and appropriately publish on each copy an appropriate
82 | copyright notice and disclaimer of warranty; keep intact all the
83 | notices that refer to this License and to the absence of any warranty;
84 | and give any other recipients of the Program a copy of this License
85 | along with the Program.
86 |
87 | You may charge a fee for the physical act of transferring a copy, and
88 | you may at your option offer warranty protection in exchange for a fee.
89 |
90 | 2. You may modify your copy or copies of the Program or any portion
91 | of it, thus forming a work based on the Program, and copy and
92 | distribute such modifications or work under the terms of Section 1
93 | above, provided that you also meet all of these conditions:
94 |
95 | a) You must cause the modified files to carry prominent notices
96 | stating that you changed the files and the date of any change.
97 |
98 | b) You must cause any work that you distribute or publish, that in
99 | whole or in part contains or is derived from the Program or any
100 | part thereof, to be licensed as a whole at no charge to all third
101 | parties under the terms of this License.
102 |
103 | c) If the modified program normally reads commands interactively
104 | when run, you must cause it, when started running for such
105 | interactive use in the most ordinary way, to print or display an
106 | announcement including an appropriate copyright notice and a
107 | notice that there is no warranty (or else, saying that you provide
108 | a warranty) and that users may redistribute the program under
109 | these conditions, and telling the user how to view a copy of this
110 | License. (Exception: if the Program itself is interactive but
111 | does not normally print such an announcement, your work based on
112 | the Program is not required to print an announcement.)
113 |
114 | These requirements apply to the modified work as a whole. If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works. But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 |
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 |
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 |
134 | 3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 |
138 | a) Accompany it with the complete corresponding machine-readable
139 | source code, which must be distributed under the terms of Sections
140 | 1 and 2 above on a medium customarily used for software interchange; or,
141 |
142 | b) Accompany it with a written offer, valid for at least three
143 | years, to give any third party, for a charge no more than your
144 | cost of physically performing source distribution, a complete
145 | machine-readable copy of the corresponding source code, to be
146 | distributed under the terms of Sections 1 and 2 above on a medium
147 | customarily used for software interchange; or,
148 |
149 | c) Accompany it with the information you received as to the offer
150 | to distribute corresponding source code. (This alternative is
151 | allowed only for noncommercial distribution and only if you
152 | received the program in object code or executable form with such
153 | an offer, in accord with Subsection b above.)
154 |
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it. For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable. However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 |
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 |
172 | 4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License. Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 |
180 | 5. You are not required to accept this License, since you have not
181 | signed it. However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works. These actions are
183 | prohibited by law if you do not accept this License. Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 |
189 | 6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions. You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 |
197 | 7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License. If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all. For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 |
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 |
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices. Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 |
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 |
229 | 8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded. In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 |
237 | 9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time. Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 |
242 | Each version is given a distinguishing version number. If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation. If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 |
250 | 10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission. For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this. Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 |
258 | NO WARRANTY
259 |
260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 |
270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 |
280 | END OF TERMS AND CONDITIONS
281 |
282 | How to Apply These Terms to Your New Programs
283 |
284 | If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 |
288 | To do so, attach the following notices to the program. It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 |
293 |     <one line to give the program's name and a brief idea of what it does.>
294 |     Copyright (C) <year>  <name of author>
295 |
296 | This program is free software; you can redistribute it and/or modify
297 | it under the terms of the GNU General Public License as published by
298 | the Free Software Foundation; either version 2 of the License, or
299 | (at your option) any later version.
300 |
301 | This program is distributed in the hope that it will be useful,
302 | but WITHOUT ANY WARRANTY; without even the implied warranty of
303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304 | GNU General Public License for more details.
305 |
306 | You should have received a copy of the GNU General Public License along
307 | with this program; if not, write to the Free Software Foundation, Inc.,
308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 |
310 | Also add information on how to contact you by electronic and paper mail.
311 |
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 |
315 | Gnomovision version 69, Copyright (C) year name of author
316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 | This is free software, and you are welcome to redistribute it
318 | under certain conditions; type `show c' for details.
319 |
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License. Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 |
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary. Here is a sample; alter the names:
328 |
329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 |
332 |   <signature of Ty Coon>, 1 April 1989
333 | Ty Coon, President of Vice
334 |
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs. If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library. If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 |
--------------------------------------------------------------------------------
/sse_mathfun.h:
--------------------------------------------------------------------------------
1 | /*!
2 | @file sse_mathfun.h
3 |
4 | SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
5 |
6 | Inspired by Intel Approximate Math library, and based on the
7 | corresponding algorithms of the cephes math library
8 |
9 | The default is to use the SSE1 version. If you define USE_SSE2 the
10 | the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
11 | not expect any significant performance improvement with SSE2.
12 | */
13 |
14 | /* Copyright (C) 2010,2011 RJVB - extensions */
15 | /* Copyright (C) 2007 Julien Pommier
16 |
17 | This software is provided 'as-is', without any express or implied
18 | warranty. In no event will the authors be held liable for any damages
19 | arising from the use of this software.
20 |
21 | Permission is granted to anyone to use this software for any purpose,
22 | including commercial applications, and to alter it and redistribute it
23 | freely, subject to the following restrictions:
24 |
25 | 1. The origin of this software must not be misrepresented; you must not
26 | claim that you wrote the original software. If you use this software
27 | in a product, an acknowledgment in the product documentation would be
28 | appreciated but is not required.
29 | 2. Altered source versions must be plainly marked as such, and must not be
30 | misrepresented as being the original software.
31 | 3. This notice may not be removed or altered from any source distribution.
32 |
33 | (this is the zlib license)
34 | */
35 |
36 | #ifndef _SSE_MATHFUN_H
37 |
38 | #ifdef USE_SSE_AUTO
39 | # ifdef __SSE2__
40 | # if defined(__GNUC__)
41 | # warning "USE_SSE2"
42 | # endif
43 | # define USE_SSE2
44 | # endif
45 | # if defined(__SSE3__) || defined(__SSSE3__)
46 | # if defined(__GNUC__)
47 | # warning "USE_SSE3"
48 | # endif
49 | # define USE_SSE2
50 | # define USE_SSE3
51 | # endif
52 | # if defined(__SSE4__) || defined(__SSE4_1__) || defined(__SSE4_2__) || ((_M_IX86_FP > 1) && !defined(_M_AMD64))
53 | # if defined(__GNUC__)
54 | # warning "USE_SSE4"
55 | # endif
56 | # define USE_SSE2
57 | # define USE_SSE3
58 | # define USE_SSE4
59 | # endif
60 | #endif
61 |
62 | #include <math.h>
63 | #include <xmmintrin.h>
64 | #include <emmintrin.h> /* NOTE(review): header names were stripped by extraction ('<...>' lost); reconstructed from usage (__m128, __m128i, __m128d below) — verify against upstream sse_mathfun.h */
65 |
66 | /* yes I know, the top of this file is quite ugly */
67 |
68 | /*!
69 | macros to obtain the required 16bit alignment
70 | */
71 | #ifdef _MSC_VER /* visual c++ */
72 | # define ALIGN16_BEG __declspec(align(16))
73 | # define ALIGN16_END
74 | # define inline __forceinline
75 | #else /* gcc or icc */
76 | # define ALIGN16_BEG
77 | # define ALIGN16_END __attribute__((aligned(16)))
78 | #endif
79 |
80 | /* __m128 is ugly to write */
81 | /*!
82 | an SSE vector of 4 floats
83 | */
84 | typedef __m128 v4sf; // vector of 4 float (sse1)
85 |
86 | #if defined(USE_SSE3) || defined(USE_SSE4)
87 | # define USE_SSE2
88 | #endif
89 |
90 | /*!
91 | an SSE/MMX vector of 4 32bit integers
92 | */
93 | #ifdef __APPLE_CC__
94 | typedef int v4si __attribute__ ((__vector_size__ (16), __may_alias__));
95 | #else
96 | typedef __m128i v4si; // vector of 4 int (sse2)
97 | #endif
98 | // RJVB 20111028: some support for double precision semantics
99 | /*!
100 | an SSE2+ vector of 2 doubles
101 | */
102 | typedef __m128d v2df; // vector of 2 double (sse2)
103 | /*!
104 | an MMX vector of 2 32bit ints
105 | */
106 | typedef __m64 v2si; // vector of 2 int (mmx)
107 |
108 | #if defined(USE_SSE3) || defined(USE_SSE4)
109 | # define USE_SSE3
110 | # include <pmmintrin.h>
111 | # if defined(__SSSE3__) || (_M_IX86_FP > 1)
112 | # include <tmmintrin.h>
113 | # endif
114 | #endif
115 |
116 | #if defined(USE_SSE4)
117 | # define USE_SSE4
118 | # include <smmintrin.h>
119 | #endif
120 |
121 | #ifdef __GNUC__0
122 | # define _MM_SET_PD(b,a) (v2df){(a),(b)}
123 | # define _MM_SET1_PD(a) (v2df){(a),(a)}
124 | // static inline v2df _MM_SET1_PD(double a)
125 | // {
126 | // return (v2df){a,a};
127 | // }
128 | # define _MM_SETR_PD(a,b) (v2df){(a),(b)}
129 | # define _MM_SETZERO_PD() (v2df){0.0,0.0}
130 | # define _MM_SET_PS(d,c,b,a) (v4sf){(a),(b),(c),(d)}
131 | # define _MM_SET1_PS(a) (v4sf){(a),(a),(a),(a)}
132 | // static inline v4sf _MM_SET1_PS(float a)
133 | // {
134 | // return (v4sf){a,a,a,a};
135 | // }
136 | # define _MM_SETR_PS(a,b,c,d) (v4sf){(a),(b),(c),(d)}
137 | # define _MM_SETZERO_PS() (v4sf){0.0f,0.0f,0.0f,0.0f}
138 | # define _MM_SETZERO_SI128() (__m128i)(__v4si){0,0,0,0}
139 | # define _MM_SETZERO_SI64() ALIGN16_BEG (__m64 ALIGN16_END)0LL
140 | #else
141 | # define _MM_SET_PD(b,a) _mm_setr_pd((a),(b))
142 | # define _MM_SET1_PD(a) _mm_set1_pd((a))
143 | # define _MM_SETR_PD(a,b) _mm_setr_pd((a),(b))
144 | # define _MM_SETZERO_PD() _mm_setzero_pd()
145 | # define _MM_SET_PS(d,c,b,a) _mm_setr_ps((a),(b),(c),(d))
146 | # define _MM_SET1_PS(a) _mm_set1_ps((a))
147 | # define _MM_SETR_PS(a,b,c,d) _mm_setr_ps((a),(b),(c),(d))
148 | # define _MM_SETZERO_PS() _mm_setzero_ps()
149 | # define _MM_SETZERO_SI128() _mm_setzero_si128()
150 | # define _MM_SETZERO_SI64() _mm_setzero_si64()
151 | #endif
152 | #define VELEM(type,a,n) (((type*)&a)[n])
153 |
154 | /* declare some SSE constants -- why can't I figure a better way to do that? */
155 | #define _PS_CONST(Name, Val) \
156 | static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { (const float)(Val), (const float)(Val), (const float)(Val), (const float)(Val) }
157 | #define _PI32_CONST(Name, Val) \
158 | static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
159 | #define _PS_CONST_TYPE(Name, Type, Val) \
160 | static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
161 |
162 | #define _PD_CONST(Name, Val) \
163 | static const ALIGN16_BEG double _pd_##Name[2] ALIGN16_END = { (const double)(Val), (const double)(Val) }
164 | #define _PD_CONST_TYPE(Name, Type, Val) \
165 | static const ALIGN16_BEG Type _pd_##Name[2] ALIGN16_END = { Val, Val }
166 |
167 | #pragma mark code section
168 | #ifdef SSE_MATHFUN_WITH_CODE
169 |
170 | _PS_CONST(1 , 1.0f);
171 | _PS_CONST(0p5, 0.5f);
172 | /* the smallest non denormalized float number */
173 | _PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
174 | _PS_CONST_TYPE(mant_mask, int, 0x7f800000);
175 | _PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
176 |
177 | _PS_CONST_TYPE(sign_mask, int, 0x80000000);
178 | _PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
179 |
180 | _PI32_CONST(1, 1);
181 | _PI32_CONST(inv1, ~1);
182 | _PI32_CONST(2, 2);
183 | _PI32_CONST(4, 4);
184 | _PI32_CONST(0x7f, 0x7f);
185 |
186 | _PS_CONST(cephes_SQRTHF, 0.707106781186547524);
187 | _PS_CONST(cephes_log_p0, 7.0376836292E-2);
188 | _PS_CONST(cephes_log_p1, - 1.1514610310E-1);
189 | _PS_CONST(cephes_log_p2, 1.1676998740E-1);
190 | _PS_CONST(cephes_log_p3, - 1.2420140846E-1);
191 | _PS_CONST(cephes_log_p4, + 1.4249322787E-1);
192 | _PS_CONST(cephes_log_p5, - 1.6668057665E-1);
193 | _PS_CONST(cephes_log_p6, + 2.0000714765E-1);
194 | _PS_CONST(cephes_log_p7, - 2.4999993993E-1);
195 | _PS_CONST(cephes_log_p8, + 3.3333331174E-1);
196 | _PS_CONST(cephes_log_q1, -2.12194440e-4);
197 | _PS_CONST(cephes_log_q2, 0.693359375);
198 |
199 | #ifdef USE_SSE2
200 | _PD_CONST(1, 1.0);
201 | _PD_CONST(_1, -1.0);
202 | _PD_CONST(0p5, 0.5);
203 | /* the smallest non denormalised float number */
204 | // _PD_CONST_TYPE(min_norm_pos, int, 0x00800000);
205 | // _PD_CONST_TYPE(mant_mask, int, 0x7f800000);
206 | // _PD_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
207 |
208 | _PD_CONST_TYPE(sign_mask, long long, 0x8000000000000000LL);
209 | _PD_CONST_TYPE(inv_sign_mask, long long, ~0x8000000000000000LL);
210 |
211 | #endif
212 |
213 | #if defined (__MINGW32__)
214 |
215 | /* the ugly part below: many versions of gcc used to be completely buggy with respect to some intrinsics
216 | The movehl_ps is fixed in mingw 3.4.5, but I found out that all the _mm_cmp* intrinsics were completely
217 | broken on my mingw gcc 3.4.5 ...
218 |
219 | Note that the bug on _mm_cmp* does occur only at -O0 optimization level
220 | */
221 |
/* Replacement for _mm_movehl_ps: some mingw gcc versions miscompile the
   intrinsic (gcc bug 21179, see the comment above), so emit the movhlps
   instruction directly via inline asm.  'a' is both input and output
   ("0" ties the operand to the result register). */
inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
  asm (
       "movhlps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a; }
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps
231 |
/* Replacement for _mm_cmplt_ps (broken at -O0 on some mingw gcc, see the
   comment above): per-lane a < b via the cmpltps instruction; each lane of
   the result is all-ones on true, all-zeros on false. */
inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
  asm (
       "cmpltps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
/* Replacement for _mm_cmpgt_ps, implemented with cmpnleps
   (not-less-or-equal).
   NOTE(review): cmpnleps presumably also yields true when a lane holds
   NaN, unlike a strict greater-than — confirm callers never compare NaNs
   here. */
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
  asm (
       "cmpnleps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
/* Replacement for _mm_cmpeq_ps: per-lane a == b via the cmpeqps
   instruction; all-ones lane on equality, all-zeros otherwise. */
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
  asm (
       "cmpeqps %2,%0\n\t"
       : "=x" (a)
       : "0" (a), "x"(b)
       );
  return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
260 | #endif
261 |
262 | #ifndef USE_SSE2
263 | typedef union xmm_mm_union {
264 | __m128 xmm;
265 | __m64 mm[2];
266 | } xmm_mm_union;
267 |
268 | #define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
269 | xmm_mm_union u; u.xmm = xmm_; \
270 | mm0_ = u.mm[0]; \
271 | mm1_ = u.mm[1]; \
272 | }
273 |
274 | #define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
275 | xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
276 | }
277 |
278 | #endif // USE_SSE2
279 |
/*!
  natural logarithm computed for 4 simultaneous float
  @n
  return NaN for x <= 0
  (lanes that are <= 0 are remembered in a mask up front and forced to
  NaN at the very end by OR-ing the all-ones compare result into them)
*/
static inline v4sf log_ps(v4sf x)
{
  v4sf e;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;
  /* all-ones in every lane where x <= 0; used to poison those lanes below */
  v4sf invalid_mask = _mm_cmple_ps(x, _MM_SETZERO_PS());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos); /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e);  done on the two MMX halves of the vector */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  /* shift the IEEE-754 exponent field (bits 23..30) down into integer lanes */
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part: clear the exponent bits, then OR in the
     exponent of 0.5 so the mantissa lands in [0.5, 1) */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contain the really base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  /* remove the exponent bias (0x7f == 127) and convert to float */
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
     (branch-free: 'mask' selects the lanes where x < sqrt(1/2))
  */
  {
    v4sf z, y;
    v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
    v4sf tmp = _mm_and_ps(x, mask);
    x = _mm_sub_ps(x, one);
    e = _mm_sub_ps(e, _mm_and_ps(one, mask));
    x = _mm_add_ps(x, tmp);

    z = _mm_mul_ps(x,x);

    /* cephes degree-8 polynomial approximation, Horner's scheme */
    y = *(v4sf*)_ps_cephes_log_p0;
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
    y = _mm_mul_ps(y, x);

    y = _mm_mul_ps(y, z);

    /* exponent contribution e*log(2), split into two constants (q1 then
       q2, added separately below) for extra precision */
    tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
    y = _mm_add_ps(y, tmp);

    tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    y = _mm_sub_ps(y, tmp);

    tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
    x = _mm_add_ps(x, y);
    x = _mm_add_ps(x, tmp);
    x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
  }
  return x;
}
375 |
376 | _PS_CONST(exp_hi, 88.3762626647949f);
377 | _PS_CONST(exp_lo, -88.3762626647949f);
378 |
379 | _PS_CONST(cephes_LOG2EF, 1.44269504088896341);
380 | _PS_CONST(cephes_exp_C1, 0.693359375);
381 | _PS_CONST(cephes_exp_C2, -2.12194440e-4);
382 |
383 | _PS_CONST(cephes_exp_p0, 1.9875691500E-4);
384 | _PS_CONST(cephes_exp_p1, 1.3981999507E-3);
385 | _PS_CONST(cephes_exp_p2, 8.3334519073E-3);
386 | _PS_CONST(cephes_exp_p3, 4.1665795894E-2);
387 | _PS_CONST(cephes_exp_p4, 1.6666665459E-1);
388 | _PS_CONST(cephes_exp_p5, 5.0000001201E-1);
389 |
/*!
  computes e**x of the 4 floats in x
  @n
  the input is clamped to [exp_lo, exp_hi] (about +/-88.38, the float
  exp() range) so the result never overflows or underflows
*/
static inline v4sf exp_ps(v4sf x)
{
  v4sf tmp = _MM_SETZERO_PS(), fx, mask, y, z;
  v4sf pow2n;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)): fx = x*log2(e) + 0.5, floored
     below to get the integer n */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, substract 1 (cvttps truncates toward zero, not down) */
  mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  /* range reduction: x -= fx*log(2), with log(2) split into C1 + C2 for
     extra precision */
  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  /* cephes degree-5 polynomial for exp on the reduced argument, Horner */
  y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n by placing n+127 directly into the float exponent field */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}
470 |
471 | _PS_CONST(minus_cephes_DP1, -0.78515625);
472 | _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
473 | _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
474 | _PS_CONST(sincof_p0, -1.9515295891E-4);
475 | _PS_CONST(sincof_p1, 8.3321608736E-3);
476 | _PS_CONST(sincof_p2, -1.6666654611E-1);
477 | _PS_CONST(coscof_p0, 2.443315711809948E-005);
478 | _PS_CONST(coscof_p1, -1.388731625493765E-003);
479 | _PS_CONST(coscof_p2, 4.166664568298827E-002);
480 | _PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
481 |
482 | #ifdef USE_SSE2
483 | _PD_CONST(minus_cephes_DP1, -0.78515625);
484 | _PD_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
485 | _PD_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
486 | _PD_CONST(sincof_p0, -1.9515295891E-4);
487 | _PD_CONST(sincof_p1, 8.3321608736E-3);
488 | _PD_CONST(sincof_p2, -1.6666654611E-1);
489 | _PD_CONST(coscof_p0, 2.443315711809948E-005);
490 | _PD_CONST(coscof_p1, -1.388731625493765E-003);
491 | _PD_CONST(coscof_p2, 4.166664568298827E-002);
492 | _PD_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
493 | #endif
494 |
495 |
496 | /*!
497 | evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so
498 | it runs also on old athlons XPs and the pentium III of your grand
499 | mother.
500 | @n
501 | The code is the exact rewriting of the cephes sinf function.
502 | Precision is excellent as long as x < 8192 (I did not bother to
503 | take into account the special handling they have for greater values
504 | -- it does not return garbage for arguments over 8192, though, but
505 | the extra precision is missing).
506 | @n
507 | Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
508 | surprising but correct result.
509 | @n
510 | Performance is also surprisingly good, 1.33 times faster than the
511 | macos vsinf SSE2 function, and 1.5 times faster than the
512 | __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
513 | too bad for an SSE1 function (with no special tuning) !
514 | However the latter libraries probably have a much better handling of NaN,
515 | Inf, denormalized and other special arguments..
516 | @n
517 | On my core 1 duo, the execution of this function takes approximately 95 cycles.
518 | @n
519 | From what I have observed on the experiments with Intel AMath lib, switching to an
520 | SSE2 version would improve the perf by only 10%.
521 | @n
522 | Since it is based on SSE intrinsics, it has to be compiled at -O2 to
523 | deliver full speed.
524 | */
525 | static inline v4sf sin_ps(v4sf x)
526 | { // any x
527 | v4sf xmm1, xmm2 = _MM_SETZERO_PS(), xmm3, sign_bit, y, y2, z, tmp;
528 |
529 | v4sf swap_sign_bit, poly_mask;
530 | #ifdef USE_SSE2
531 | v4si emm0, emm2;
532 | #else
533 | v2si mm0, mm1, mm2, mm3;
534 | #endif
535 | sign_bit = x;
536 | /* take the absolute value */
537 | x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
538 | /* extract the sign bit (upper one) */
539 | sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
540 |
541 | /* scale by 4/Pi */
542 | y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
543 |
544 | //printf("plop:"); print4(y);
545 | #ifdef USE_SSE2
546 | /* store the integer part of y in mm0 */
547 | emm2 = _mm_cvttps_epi32(y);
548 | /* j=(j+1) & (~1) (see the cephes sources) */
549 | emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
550 | emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
551 | y = _mm_cvtepi32_ps(emm2);
552 | /* get the swap sign flag */
553 | emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
554 | emm0 = _mm_slli_epi32(emm0, 29);
555 | /* get the polynom selection mask
556 | there is one polynom for 0 <= x <= Pi/4
557 | and another one for Pi/4 0 ){
1112 | v2df *va = (v2df*) xa, vsum = _MM_SETZERO_PD();
1113 | int i, N_4 = N-4+1;
1114 | for( i = 0 ; i < N_4 ; va+=2 ){
1115 | vsum = _mm_add_pd( vsum, _mm_add_pd( va[0], va[1] ) );
1116 | i += 4;
1117 | }
1118 | sum = VELEM(double,vsum,0) + VELEM(double,vsum,1);
1119 | for( ; i < N; i++ ){
1120 | sum += xa[i];
1121 | }
1122 | }
1123 | else{
1124 | sum = 0.0;
1125 | }
1126 | return sum;
1127 | }
1128 |
1129 |
1130 | /*!
1131 | computes the cumulative sum of the squares of the values in double array xa[n] using SSE2 intrinsics
1132 | */
1133 | static inline double CumSumSq( double *xa, int n )
1134 | { __m128d vsumsq;
1135 | register int i, N_4 = n-4+1;
1136 | register double sumsq = 0;
1137 | for( i = 0 ; i < N_4 ; i+=4, xa+=4 ){
1138 | #ifdef __GNUC__
1139 | vsumsq = *((__m128d*)&xa[2]) * *((__m128d*)&xa[2]) + *((__m128d*)xa) * *((__m128d*)xa);
1140 | #else
1141 | vsumsq = _mm_add_pd( _mm_mul_pd( *((__m128d*)&xa[2]), *((__m128d*)&xa[2]) ),
1142 | _mm_mul_pd( *((__m128d*)xa), *((__m128d*)xa) ) );
1143 | #endif
1144 | sumsq += *((double*)&vsumsq) + ((double*)&vsumsq)[1];
1145 | }
1146 | for( ; i < n ; i++, xa++ ){
1147 | sumsq += *xa * *xa;
1148 | }
1149 | return sumsq;
1150 | }
1151 |
1152 | /*!
1153 | computes the cumulative sum of the values and their squares in double array xa[n] using SSE2 intrinsics
1154 | */
1155 | static inline double CumSumSumSq( double *xa, int n, double *sumSQ )
1156 | { __m128d vsum, vsumsq;
1157 | register int i, N_4 = n-4+1;
1158 | register double sum = 0.0, sumsq = 0;
1159 | for( i = 0 ; i < N_4 ; i+=4, xa+=4 ){
1160 | #ifdef __GNUC__
1161 | vsum = *((__m128d*)&xa[2]) + *((__m128d*)xa);
1162 | vsumsq = *((__m128d*)&xa[2]) * *((__m128d*)&xa[2]) + *((__m128d*)xa) * *((__m128d*)xa);
1163 | #else
1164 | vsum = _mm_add_pd( *((__m128d*)&xa[2]), *((__m128d*)xa) );
1165 | vsumsq = _mm_add_pd( _mm_mul_pd( *((__m128d*)&xa[2]), *((__m128d*)&xa[2]) ),
1166 | _mm_mul_pd( *((__m128d*)xa), *((__m128d*)xa) ) );
1167 | #endif
1168 | sum += *((double*)&vsum) + ((double*)&vsum)[1];
1169 | sumsq += *((double*)&vsumsq) + ((double*)&vsumsq)[1];
1170 | }
1171 | for( ; i < n ; i++, xa++ ){
1172 | sum += *xa;
1173 | sumsq += *xa * *xa;
1174 | }
1175 | *sumSQ = sumsq;
1176 | return sum;
1177 | }
1178 |
1179 | /*!
1180 | scalar version of CumSum without explicit SSE2 intrinsics
1181 | */
1182 | static inline double scalCumSum( double *xa, int n )
1183 | { register int i;
1184 | register double sum = 0.0;
1185 | for( i = 0 ; i < n ; i++ ){
1186 | sum += *xa++;
1187 | }
1188 | return sum;
1189 | }
1190 |
1191 | /*!
1192 | scalar version of CumSumSq without explicit SSE2 intrinsics
1193 | */
1194 | static inline double scalCumSumSq( double *xa, int n )
1195 | { register int i;
1196 | register double sumsq = 0.0;
1197 | for( i = 0 ; i < n ; i++, xa++ ){
1198 | sumsq += *xa * *xa;
1199 | }
1200 | return sumsq;
1201 | }
1202 |
1203 | /*!
1204 | scalar version of CumSumSumSq without explicit SSE2 intrinsics
1205 | */
1206 | static inline double scalCumSumSumSq( double *xa, int n, double *sumSQ )
1207 | { register int i;
1208 | register double sum = 0.0, sumsq = 0.0;
1209 | for( i = 0 ; i < n ; i++, xa++ ){
1210 | sum += *xa;
1211 | sumsq += *xa * *xa;
1212 | }
1213 | *sumSQ = sumsq;
1214 | return sum;
1215 | }
1216 |
1217 | /*!
1218 | computes the cumulative product of the double array xa[n] using SSE2 intrinsics
1219 | */
1220 | static inline double CumMul(double *xa, int N)
1221 | { double cum;
1222 | if( xa && N > 0 ){
1223 | v2df *va = (v2df*) xa, vcum = _MM_SET1_PD(1.0);
1224 | int i, N_4 = N-4+1;
1225 | for( i = 0 ; i < N_4 ; va+=2 ){
1226 | vcum = _mm_mul_pd( vcum, _mm_mul_pd( va[0], va[1] ) );
1227 | i += 4;
1228 | }
1229 | cum = VELEM(double,vcum,0) * VELEM(double,vcum,1);
1230 | for( ; i < N; i++ ){
1231 | cum *= xa[i];
1232 | }
1233 | }
1234 | else{
1235 | cum = 0.0;
1236 | }
1237 | return cum;
1238 | }
1239 |
1240 | #else
1241 |
1242 | /*!
1243 | computes the cumulative sum of the double array xa[n] using traditional scalar code
1244 | */
1245 | static inline double CumSum( double *xa, int n )
1246 | { register int i;
1247 | register double sum = 0.0;
1248 | for( i = 0 ; i < n ; i++ ){
1249 | sum += *xa++;
1250 | }
1251 | return sum;
1252 | }
1253 |
1254 | /*!
1255 | alternative for CumSum
1256 | */
1257 | static inline double scalCumSum( double *xa, int n )
1258 | {
1259 | return CumSum(xa,n);
1260 | }
1261 |
1262 | /*!
1263 | computes the cumulative sum of the squares of the values in double array xa[n] using traditional scalar code
1264 | */
1265 | static inline double CumSumSq( double *xa, int n )
1266 | { register int i;
1267 | register double sumsq = 0.0;
1268 | for( i = 0 ; i < n ; i++, xa++ ){
1269 | sumsq += *xa * *xa;
1270 | }
1271 | return sumsq;
1272 | }
1273 |
1274 | /*!
1275 | alternative for CumSumSq
1276 | */
1277 | static inline double scalCumSumSq( double *xa, int n )
1278 | {
1279 | return CumSumSq(xa,n);
1280 | }
1281 |
1282 | /*!
1283 | computes the cumulative sum of the values and their squares in double array xa[n] using traditional scalar code
1284 | */
1285 | static inline double CumSumSumSq( double *xa, int n, double *sumSQ )
1286 | { register int i;
1287 | register double sum = 0.0, sumsq = 0.0;
1288 | for( i = 0 ; i < n ; i++, xa++ ){
1289 | sum += *xa;
1290 | sumsq += *xa * *xa;
1291 | }
1292 | *sumSQ = sumsq;
1293 | return sum;
1294 | }
1295 |
1296 | /*!
1297 | alternative for CumSumSumSq
1298 | */
1299 | static inline double scalCumSumSumSq( double *xa, int n, double *sumSQ )
1300 | {
1301 | return CumSumSumSq(xa,n,sumSQ);
1302 | }
1303 |
1304 | #endif //USE_SSE2
1305 |
1306 | #endif // SSE_MATHFUN_WITH_CODE
1307 |
1308 | //// Some SSE "extensions", and equivalents not using SSE explicitly:
1309 | #pragma mark SSE extensions
1310 |
1311 | #ifdef USE_SSE2
1312 |
1313 | # if defined(__x86_64__) || defined(x86_64) || defined(_LP64)
1314 | // static inline v2df _mm_abs_pd( v2df a )
1315 | // { _PD_CONST_TYPE(abs_mask, long long, ~0x8000000000000000LL);
1316 | // return _mm_and_pd(a, *(v2df*)_pd_abs_mask);
1317 | // }
1318 | /*!
1319 | SSE2 'intrinsic' to take the absolute value of a
1320 | */
1321 | /*static inline v2df _mm_abs_pd( register v2df a )
1322 | { const static long long am1[2] = {~0x8000000000000000LL,~0x8000000000000000LL};
1323 | return _mm_and_pd(a, *((v2df*)am1) );
1324 | }
1325 | static inline double _mm_abs_sd( double a )
1326 | { const static long long am2 = {~0x8000000000000000LL};
1327 | v2si r = _mm_and_si64( *((v2si*)&a), *((v2si*)&am2) );
1328 | return *((double*) &r);
1329 | }*/
1330 | # else
1331 | // no native support for 64bit ints: don't lose time on that!
1332 | /*!
1333 | SSE2 'intrinsic' to take the absolute value of a
1334 | */
1335 | static inline v2df _mm_abs_pd( register v2df a )
1336 | { const v4si am1 = _mm_set_epi32(0x7fffffff,0xffffffff,0x7fffffff,0xffffffff);
1337 | return _mm_and_pd(a, *((v2df*)&am1) );
1338 | }
/*!
	absolute value of a scalar double via MMX bitmasking (32-bit build variant):
	the low 8 bytes of am1 (dwords ffffffff,7fffffff) clear the double's sign bit.
	NOTE(review): local am2 is declared but never used — the mask applied is am1.
*/
static inline double _mm_abs_sd( double a )
{ const static unsigned long long am2 = 0x7fffffffffffffffLL;
	const v4si am1 = _mm_set_epi32(0x7fffffff,0xffffffff,0x7fffffff,0xffffffff);
	v2si r = _mm_and_si64( *((v2si*)&a), *((v2si*)&am1) );
	/* _mm_and_si64 is an MMX operation: clear the MMX state before returning to FP code */
	_mm_empty();
	return *((double*)&r);
// union { double d; v2si r; } ret;
// ret.r = _mm_and_si64( *((v2si*)&a), *((v2si*)&am1) );
// a = ret.d;
// return a;
}
1350 | # endif // i386 or x86_64
1351 | static inline v4sf _mm_abs_ps( register v4sf a )
1352 | { const v4si am1 = _mm_set_epi32(0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff);
1353 | return _mm_and_ps(a, *((v4sf*)&am1) );
1354 | }
1355 |
1356 | /*!
1357 | clip a value to a min/max range
1358 | */
1359 | static inline v2df _mm_clip_pd( v2df val, v2df valMin, v2df valMax )
1360 | {
1361 | return _mm_max_pd( _mm_min_pd( val, valMax ), valMin );
1362 | }
1363 |
1364 | /*!
1365 | return an SSE2 vector of 2 doubles initialised with val0 and val1, clipped to
1366 | the specified range
1367 | */
1368 | static inline v2df _mm_setr_clipped_pd( double val0, double val1, v2df valMin, v2df valMax )
1369 | {
1370 | return _mm_clip_pd( _MM_SETR_PD(val0,val1), valMin, valMax );
1371 | }
1372 | #endif // USE_SSE2
1373 | #ifdef USE_SSE4
/*!
	ceil(a) computed with the SSE4.1 _mm_ceil_pd intrinsic applied to (a,0);
	element 0 of the result is returned.
*/
static inline double ssceil(double a)
{ v2df va = _mm_ceil_pd( _MM_SETR_PD(a,0) );
# if !defined(__x86_64__) && !defined(x86_64) && !defined(_LP64)
	/* 32-bit builds: clear the MMX state — presumably because other 32-bit helpers
	   in this header use MMX registers; TODO confirm it is needed here */
	_mm_empty();
# endif
	return *((double*)&va);
}
1381 |
/*!
	floor(a) computed with the SSE4.1 _mm_floor_pd intrinsic applied to (a,0);
	element 0 of the result is returned.
*/
static inline double ssfloor(double a)
{ v2df va = _mm_floor_pd( _MM_SETR_PD(a,0) );
# if !defined(__x86_64__) && !defined(x86_64) && !defined(_LP64)
	/* 32-bit builds: clear the MMX state — presumably because other 32-bit helpers
	   in this header use MMX registers; TODO confirm it is needed here */
	_mm_empty();
# endif
	return *((double*)&va);
}
/*!
	rounds a to the nearest integer value using the SSE4.1 _mm_round_pd intrinsic.
	NOTE(review): _MM_FROUND_TO_NEAREST_INT implements IEEE round-half-to-even,
	whereas the non-SSE4 fallback below rounds halves away from zero; the two
	branches therefore disagree for arguments exactly halfway between integers
	(e.g. 2.5 -> 2 here, 3 in the fallback) — confirm which behaviour is intended.
*/
static inline double ssround( double a )
{ v2df va = _mm_round_pd( _MM_SETR_PD(a,0), _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
# if !defined(__x86_64__) && !defined(x86_64) && !defined(_LP64)
	/* 32-bit builds: clear the MMX state — presumably because other 32-bit helpers
	   in this header use MMX registers; TODO confirm it is needed here */
	_mm_empty();
# endif
	return *((double*)&va);
}
1396 | #else
/*!
	ceiling of a; plain libm fallback used when SSE4.1 is not available.
*/
static inline double ssceil(double a)
{
	return ceil(a);
}
/*!
	floor of a; plain libm fallback used when SSE4.1 is not available.
*/
static inline double ssfloor(double a)
{
	return floor(a);
}
1405 | static inline double ssround( double a )
1406 | {
1407 | return (a >= 0)? floor( a + 0.5 ) : -ceil( -a - 0.5 );
1408 | }
1409 | #endif //USE_SSE4
1410 |
1411 |
1412 | // SSE-like convenience functions (note the absence of a leading _!)
1413 |
1414 | /*!
1415 | return an SSE2 vector of 2 doubles initialised with val0 and val1, clipped to
1416 | the specified range. Does not use SSE2 intrinsics.
1417 | */
1418 | static inline v2df *mm_setr_clipped_pd( v2df *val, double val0, double val1, v2df *valMin, v2df *valMax )
1419 | {
1420 | if( val0 > ((double*)valMax)[0] ){
1421 | ((double*)val)[0] = ((double*)valMax)[0];
1422 | }
1423 | else if( val0 < ((double*)valMin)[0] ){
1424 | ((double*)val)[0] = ((double*)valMin)[0];
1425 | }
1426 | else{
1427 | ((double*)val)[0] = val0;
1428 | }
1429 | if( val1 > ((double*)valMax)[1] ){
1430 | ((double*)val)[1] = ((double*)valMax)[1];
1431 | }
1432 | else if( val1 < ((double*)valMin)[1] ){
1433 | ((double*)val)[1] = ((double*)valMin)[1];
1434 | }
1435 | else{
1436 | ((double*)val)[1] = val1;
1437 | }
1438 | return val;
1439 | }
1440 |
1441 | /*!
1442 | SSE2 'intrinsic' to take the absolute value of a. Doesn't use SSE2 intrinsics
1443 | */
1444 | static inline v2df *mm_clip_pd( v2df *val, v2df *valMin, v2df *valMax )
1445 | {
1446 | if( ((double*)val)[0] > ((double*)valMax)[0] ){
1447 | ((double*)val)[0] = ((double*)valMax)[0];
1448 | }
1449 | else if( ((double*)val)[0] < ((double*)valMin)[0] ){
1450 | ((double*)val)[0] = ((double*)valMin)[0];
1451 | }
1452 | if( ((double*)val)[1] > ((double*)valMax)[1] ){
1453 | ((double*)val)[1] = ((double*)valMax)[1];
1454 | }
1455 | else if( ((double*)val)[1] < ((double*)valMin)[1] ){
1456 | ((double*)val)[1] = ((double*)valMin)[1];
1457 | }
1458 | return val;
1459 | }
1460 |
1461 | /*!
1462 | emulation of the _mm_add_pd SSE2 intrinsic
1463 | */
1464 | static inline v2df *mm_add_pd( v2df *c, v2df *a, v2df *b )
1465 | {
1466 | ((double*)c)[0] = ((double*)a)[0] + ((double*)b)[0];
1467 | ((double*)c)[1] = ((double*)a)[1] + ((double*)b)[1];
1468 | return c;
1469 | }
1470 |
1471 | /*!
1472 | emulation of the _mm_add_pd SSE2 intrinsic
1473 | */
1474 | static inline v2df *mm_sub_pd( v2df *c, v2df *a, v2df *b )
1475 | {
1476 | ((double*)c)[0] = ((double*)a)[0] - ((double*)b)[0];
1477 | ((double*)c)[1] = ((double*)a)[1] - ((double*)b)[1];
1478 | return c;
1479 | }
1480 |
1481 | /*!
1482 | emulation of the _mm_sub_pd SSE2 intrinsic
1483 | */
1484 | static inline v2df *mm_div_pd( v2df *c, v2df *a, v2df *b )
1485 | {
1486 | ((double*)c)[0] = ((double*)a)[0] / ((double*)b)[0];
1487 | ((double*)c)[1] = ((double*)a)[1] / ((double*)b)[1];
1488 | return c;
1489 | }
1490 |
1491 | /*!
1492 | emulation of the _mm_mul_pd SSE2 intrinsic
1493 | */
1494 | static inline v2df *mm_mul_pd( v2df *c, v2df *a, v2df *b )
1495 | {
1496 | ((double*)c)[0] = ((double*)a)[0] * ((double*)b)[0];
1497 | ((double*)c)[1] = ((double*)a)[1] * ((double*)b)[1];
1498 | return c;
1499 | }
1500 |
1501 | /*!
1502 | non SSE emulation of the _mm_abs_pd 'intrinsic' defined elsewhere in this file
1503 | */
1504 | static inline v2df *mm_abs_pd( v2df *val, v2df *a )
1505 | {
1506 | ((double*)val)[0] = (((double*)a)[0] >= 0)? ((double*)a)[0] : -((double*)a)[0];
1507 | ((double*)val)[1] = (((double*)a)[1] >= 1)? ((double*)a)[1] : -((double*)a)[1];
1508 | return val;
1509 | }
1510 |
1511 | /*!
1512 | emulation of the _mm_round_pd SSE4 intrinsic.
1513 | @n
1514 | NB: the SSE4 intrinsic is at least twice as fast as the non-SSE calculation, PER value
1515 | so it pays to replace round(x) with _mm_round_pd(_mm_setr_pd(x)) - idem for floor and ceil
1516 | */
1517 | static inline v2df *mm_round_pd( v2df *val, v2df *a )
1518 | {
1519 | ((double*)val)[0] = (((double*)a)[0] >= 0)? floor( ((double*)a)[0] + 0.5 ) : -ceil( -((double*)a)[0] - 0.5 );
1520 | ((double*)val)[1] = (((double*)a)[1] >= 0)? floor( ((double*)a)[1] + 0.5 ) : -ceil( -((double*)a)[1] - 0.5 );
1521 | return val;
1522 | }
1523 |
1524 | #define _SSE_MATHFUN_H
1525 | #endif
1526 |
--------------------------------------------------------------------------------