├── .github └── workflows │ └── build-windows.yml ├── .gitignore ├── meson.build ├── meson_options.txt ├── readme.rst └── src ├── Bullshit.h ├── CPU.c ├── CPU.h ├── CommonFunctions.h ├── CommonMacros.h ├── CopyCode.cpp ├── CopyCode.h ├── DCTFFTW.cpp ├── DCTFFTW.h ├── EntryPoint.c ├── Fakery.c ├── Fakery.h ├── GroupOfPlanes.c ├── GroupOfPlanes.h ├── Luma.cpp ├── Luma.h ├── MVAnalyse.c ├── MVAnalysisData.c ├── MVAnalysisData.h ├── MVBlockFPS.c ├── MVCompensate.c ├── MVDegrains.cpp ├── MVDegrains.h ├── MVDegrains_AVX2.cpp ├── MVDepan.cpp ├── MVFinest.c ├── MVFlow.cpp ├── MVFlowBlur.c ├── MVFlowFPS.c ├── MVFlowFPSHelper.c ├── MVFlowFPSHelper.h ├── MVFlowInter.c ├── MVFrame.cpp ├── MVFrame.h ├── MVFrame_AVX2.cpp ├── MVMask.c ├── MVRecalculate.c ├── MVSCDetection.c ├── MVSuper.c ├── MaskFun.cpp ├── MaskFun.h ├── MaskFun_AVX2.cpp ├── Overlap.cpp ├── Overlap.h ├── Overlap_AVX2.cpp ├── PlaneOfBlocks.cpp ├── PlaneOfBlocks.h ├── SADFunctions.cpp ├── SADFunctions.h ├── SADFunctions_AVX2.cpp ├── SimpleResize.cpp ├── SimpleResize.h ├── SimpleResize_AVX2.cpp ├── asm ├── aarch64-asm.S ├── aarch64-pixel-a-common.S ├── aarch64-pixel-a.S ├── const-a.asm ├── cpu-a.asm ├── include │ ├── x86inc.asm │ └── x86util.asm ├── pixel-32.asm ├── pixel-a.asm └── sad-a.asm └── sse2neon.h /.github/workflows/build-windows.yml: -------------------------------------------------------------------------------- 1 | name: Build-Windows 2 | 3 | on: workflow_dispatch 4 | 5 | permissions: 6 | attestations: write 7 | contents: read 8 | id-token: write 9 | 10 | jobs: 11 | build-windows-x64: 12 | runs-on: windows-latest 13 | defaults: 14 | run: 15 | shell: msys2 {0} 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@v4 19 | with: 20 | submodules: recursive 21 | - name: Setup MSYS2 22 | uses: msys2/setup-msys2@v2 23 | with: 24 | msystem: MINGW64 25 | update: true 26 | install: >- 27 | base-devel 28 | mingw-w64-x86_64-jq 29 | mingw-w64-x86_64-gcc 30 | mingw-w64-x86_64-pkg-config 31 | 
mingw-w64-x86_64-vapoursynth 32 | mingw-w64-x86_64-meson 33 | mingw-w64-x86_64-ninja 34 | mingw-w64-x86_64-nasm 35 | mingw-w64-x86_64-fftw 36 | - name: Build vs-mvtools 37 | run: | 38 | meson setup build --buildtype release --prefer-static --default-library=static -Dcpp_link_args='-static' 39 | meson compile -vC build 40 | - name: Export version 41 | run: | 42 | echo "ARTIFACT_VERSION=$(meson introspect --projectinfo build | jq -r '.version')" >> $GITHUB_ENV 43 | - name: Upload 44 | uses: actions/upload-artifact@v4.3.3 45 | with: 46 | name: mvtools-windows-x64-${{ env.ARTIFACT_VERSION }} 47 | path: build/libmvtools.dll 48 | build-windows-x86: 49 | runs-on: windows-latest 50 | defaults: 51 | run: 52 | shell: msys2 {0} 53 | steps: 54 | - name: Checkout code 55 | uses: actions/checkout@v4 56 | with: 57 | submodules: recursive 58 | - name: Setup MSYS2 59 | uses: msys2/setup-msys2@v2 60 | with: 61 | msystem: MINGW32 62 | update: true 63 | install: >- 64 | base-devel 65 | mingw-w64-i686-jq 66 | mingw-w64-i686-gcc 67 | mingw-w64-i686-pkg-config 68 | mingw-w64-i686-vapoursynth 69 | mingw-w64-i686-meson 70 | mingw-w64-i686-ninja 71 | mingw-w64-i686-nasm 72 | mingw-w64-i686-fftw 73 | - name: Build vs-mvtools 74 | run: | 75 | meson setup build --buildtype release --prefer-static --default-library=static -Dcpp_link_args='-static' 76 | meson compile -vC build 77 | - name: Export version 78 | run: | 79 | echo "ARTIFACT_VERSION=$(meson introspect --projectinfo build | jq -r '.version')" >> $GITHUB_ENV 80 | - name: Upload 81 | uses: actions/upload-artifact@v4.3.3 82 | with: 83 | name: mvtools-windows-x86-${{ env.ARTIFACT_VERSION }} 84 | path: build/libmvtools.dll 85 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | orig/ 2 | *.zip 3 | releases/ 4 | test stuff/ 5 | *.o 6 | *.la 7 | *.lo 8 | *.so 9 | *.dll 10 | .libs 11 | Makefile 12 | Makefile.in 13 | 
aclocal.m4 14 | autom4te.cache 15 | compile 16 | config.guess 17 | config.log 18 | config.status 19 | config.sub 20 | configure 21 | depcomp 22 | install-sh 23 | libtool 24 | ltmain.sh 25 | missing 26 | .deps/ 27 | .dirstamp 28 | .cache/ 29 | .venv/ 30 | -------------------------------------------------------------------------------- /meson.build: -------------------------------------------------------------------------------- 1 | project('MVTools', 'c', 'cpp', 2 | version: '24', 3 | default_options: ['c_std=c99', 'cpp_std=c++11', 'buildtype=release', 'b_lto=true'], 4 | meson_version: '>=0.46') 5 | 6 | 7 | warnings = [ 8 | '-Wall', 9 | '-Wextra', 10 | '-Wshadow', 11 | ] 12 | 13 | cflags = [ 14 | warnings, 15 | '-fvisibility=hidden', 16 | '-DPACKAGE_VERSION="@0@"'.format(meson.project_version()), 17 | ] 18 | 19 | ldflags = [ 20 | ] 21 | 22 | nasm_flags = [ 23 | '-I@0@'.format(join_paths(meson.current_source_dir(), 'src/asm/include/')), 24 | '-w', 25 | '-Worphan-labels', 26 | '-Wunrecognized-char', 27 | '-Dprivate_prefix=mvtools', 28 | '-DHIGH_BIT_DEPTH=0', 29 | '-DBIT_DEPTH=8', 30 | ] 31 | 32 | 33 | vapoursynth_dep = dependency('vapoursynth').partial_dependency(includes: true, compile_args: true) 34 | 35 | 36 | sources = [ 37 | 'src/CopyCode.cpp', 38 | 'src/CPU.c', 39 | 'src/DCTFFTW.cpp', 40 | 'src/EntryPoint.c', 41 | 'src/Fakery.c', 42 | 'src/GroupOfPlanes.c', 43 | 'src/Luma.cpp', 44 | 'src/MaskFun.cpp', 45 | 'src/MVAnalyse.c', 46 | 'src/MVAnalysisData.c', 47 | 'src/MVBlockFPS.c', 48 | 'src/MVCompensate.c', 49 | 'src/MVDegrains.cpp', 50 | 'src/MVDepan.cpp', 51 | 'src/MVFinest.c', 52 | 'src/MVFlow.cpp', 53 | 'src/MVFlowBlur.c', 54 | 'src/MVFlowFPS.c', 55 | 'src/MVFlowFPSHelper.c', 56 | 'src/MVFlowInter.c', 57 | 'src/MVFrame.cpp', 58 | 'src/MVMask.c', 59 | 'src/MVRecalculate.c', 60 | 'src/MVSCDetection.c', 61 | 'src/MVSuper.c', 62 | 'src/Overlap.cpp', 63 | 'src/PlaneOfBlocks.cpp', 64 | 'src/SADFunctions.cpp', 65 | 'src/SimpleResize.cpp', 66 | ] 67 | 68 | 69 | 
debug_build = get_option('buildtype').startswith('debug') 70 | 71 | 72 | host_cpu_family = host_machine.cpu_family() 73 | 74 | if host_cpu_family == 'x86' 75 | host_bits = 32 76 | elif host_cpu_family == 'x86_64' 77 | host_bits = 64 78 | elif host_cpu_family == 'arm' 79 | host_bits = 32 80 | elif host_cpu_family == 'aarch64' 81 | host_bits = 64 82 | endif 83 | 84 | 85 | host_system = host_machine.system() 86 | 87 | if host_system == 'windows' or host_system == 'cygwin' 88 | if host_cpu_family == 'x86' 89 | cflags += '-mstackrealign' 90 | ldflags += '-Wl,--kill-at' 91 | nasm_flags += '-DPREFIX' 92 | endif 93 | 94 | nasm_flags += ['-f', 'win@0@'.format(host_bits)] 95 | elif host_system == 'linux' or host_system == 'bsd' # The BSDs are close enough, right? 96 | if debug_build 97 | nasm_flags += '-gdwarf' 98 | endif 99 | 100 | nasm_flags += ['-f', 'elf@0@'.format(host_bits)] 101 | elif host_system == 'darwin' 102 | if debug_build 103 | nasm_flags += '-gdwarf' 104 | endif 105 | 106 | nasm_flags += ['-DPREFIX', '-f', 'macho@0@'.format(host_bits)] 107 | cflags += ['-DPREFIX'] 108 | else 109 | error('Unknown host system "@0@".'.format(host_system)) 110 | endif 111 | 112 | 113 | helper_libs = [] 114 | 115 | 116 | if host_cpu_family.startswith('x86') 117 | cflags += ['-mfpmath=sse', '-msse2', '-DMVTOOLS_X86=1'] 118 | 119 | 120 | nasm_sources = [ 121 | 'src/asm/const-a.asm', 122 | 'src/asm/cpu-a.asm', 123 | 'src/asm/pixel-a.asm', 124 | 'src/asm/sad-a.asm', 125 | ] 126 | 127 | 128 | if host_cpu_family == 'x86' 129 | nasm_flags += '-DARCH_X86_64=0' 130 | 131 | nasm_sources += [ 132 | 'src/asm/pixel-32.asm', 133 | ] 134 | else 135 | nasm_flags += ['-DARCH_X86_64=1', '-DPIC'] 136 | endif 137 | 138 | nasm = find_program(get_option('with_nasm')) 139 | 140 | outputname = '@BASENAME@.o' 141 | if host_system == 'windows' 142 | outputname = '@BASENAME@.obj' 143 | endif 144 | 145 | nasm_gen = generator(nasm, 146 | output: outputname, 147 | arguments: nasm_flags + ['@INPUT@', '-o', 
'@OUTPUT@']) 148 | 149 | sources += nasm_gen.process(nasm_sources) 150 | 151 | 152 | libavx2_sources = [ 153 | 'src/MaskFun_AVX2.cpp', 154 | 'src/MVDegrains_AVX2.cpp', 155 | 'src/MVFrame_AVX2.cpp', 156 | 'src/Overlap_AVX2.cpp', 157 | 'src/SADFunctions_AVX2.cpp', 158 | 'src/SimpleResize_AVX2.cpp', 159 | ] 160 | 161 | helper_libs += static_library('avx2', 162 | libavx2_sources, 163 | dependencies: vapoursynth_dep, 164 | cpp_args: [cflags, '-mavx2', '-mtune=haswell'], 165 | install: false) 166 | endif 167 | 168 | if host_cpu_family.startswith('arm') or host_cpu_family.startswith('aarch64') 169 | cflags += ['-DMVTOOLS_ARM=1'] 170 | 171 | if host_cpu_family.startswith('aarch64') 172 | asm_sources = [ 173 | 'src/asm/aarch64-pixel-a.S', 174 | ] 175 | 176 | sources += asm_sources 177 | endif 178 | endif 179 | 180 | 181 | cxx = meson.get_compiler('cpp') 182 | 183 | 184 | deps = [ 185 | vapoursynth_dep, 186 | dependency('fftw3f'), 187 | cxx.find_library('m', required: false), 188 | ] 189 | 190 | shared_module('mvtools', 191 | sources, 192 | dependencies: deps, 193 | link_args: ldflags, 194 | c_args: cflags, 195 | cpp_args: cflags, 196 | link_with: helper_libs, 197 | install: true) 198 | -------------------------------------------------------------------------------- /meson_options.txt: -------------------------------------------------------------------------------- 1 | option('with_nasm', 2 | type: 'string', 3 | value: 'nasm', 4 | description: 'Location of the NASM executable. Only relevant on x86 hosts.') 5 | -------------------------------------------------------------------------------- /readme.rst: -------------------------------------------------------------------------------- 1 | Description 2 | =========== 3 | 4 | MVTools is a set of filters for motion estimation and compensation. 5 | 6 | This is a port of version 2.5.11.20 of the Avisynth plugin. 
7 | 8 | Some changes from version 2.5.11.9 of the SVP fork have been incorporated as well (http://www.svp-team.com/wiki/Download). 9 | 10 | The filter DepanEstimate was ported from the Avisynth plugin DepanEstimate, version 1.10. 11 | 12 | The filters DepanCompensate and DepanStabilise were ported from the Avisynth plugin Depan, version 1.13.1. 13 | 14 | 15 | Differences 16 | =========== 17 | 18 | * All: 19 | * Free multithreading, courtesy of VapourSynth. 20 | 21 | * Parameters are all lowercase now. 22 | 23 | * YUY2 is not supported. 24 | 25 | * Grayscale, 4:2:0, 4:2:2, 4:4:0, and 4:4:4 are supported, except for DepanCompensate and DepanStabilise, which don't support 4:4:0. 26 | 27 | * Up to 16 bits per sample are supported. 28 | 29 | * The audio is definitely not killed. 30 | 31 | * No "planar" parameter. 32 | 33 | * "isse" parameter renamed to "opt". 34 | 35 | * Analyse: 36 | * No "temporal" parameter, as it's sort of incompatible with multithreading. 37 | 38 | * No "outfile" parameter. 39 | 40 | * No "sadx264" parameter. If opt is True, the best functions imported from x264 will be selected automatically. Otherwise, only C functions will be used. 41 | 42 | * New parameters "fields" and "tff". 43 | 44 | * The optimised SAD, SATD, and SSD functions from x264 have been updated to the latest versions (as of September 2014). 45 | 46 | * Block sizes of 64x32, 64x64, 128x64, and 128x128 are supported. 47 | 48 | * The "dct" parameter can be 5..10 even with blocks larger than 16x16. 49 | 50 | * Recalculate: 51 | * Same as Analyse. 52 | 53 | * Compensate: 54 | * No "recursion" parameter. It was dodgy. 55 | 56 | * New parameter "tff". 57 | 58 | * Flow 59 | * New parameter "tff". 60 | 61 | * SCDetection: 62 | * No "ysc" parameter. The input frames are returned unchanged, with the ``_SceneChangePrev`` or ``_SceneChangeNext`` property attached. 63 | 64 | * No "isse" parameter. It wasn't used. 65 | 66 | * DepanAnalyse: 67 | * Formerly "MDepan". 
68 | 69 | * New parameters "fields" and "tff". 70 | 71 | * No "log", "range", "isse" parameters. 72 | 73 | * DepanEstimate: 74 | * New parameters "fields" and "tff". 75 | 76 | * No "range", "log", "debug", "extlog" parameters. 77 | 78 | * DepanCompensate: 79 | * Formerly "DePan". 80 | 81 | * No "inputlog" parameter. 82 | 83 | * DepanStabilise: 84 | * Formerly "DePanStabilize". 85 | 86 | * No "inputlog" parameter. 87 | 88 | * Methods -1 and 2 unavailable. 89 | 90 | 91 | Usage 92 | ===== 93 | :: 94 | 95 | mv.Super(clip clip[, int hpad=16, int vpad=16, int pel=2, int levels=0, bint chroma=True, int sharp=2, int rfilter=2, clip pelclip=None, bint opt=True]) 96 | 97 | mv.Analyse(clip super[, int blksize=8, int blksizev=blksize, int levels=0, int search=4, int searchparam=2, int pelsearch=0, bint isb=False, int lambda, bint chroma=True, int delta=1, bint truemotion=True, int lsad, int plevel, int global, int pnew, int pzero=pnew, int pglobal=0, int overlap=0, int overlapv=overlap, bint divide=False, int badsad=10000, int badrange=24, bint opt=True, bint meander=True, bint trymany=False, bint fields=False, bint tff, int search_coarse=3, int dct=0]) 98 | 99 | mv.Recalculate(clip super, clip vectors[, int blksize=8, int blksizev=blksize, int search=4, int searchparam=2, int lambda, bint chroma=True, bint truemotion=True, int pnew, int overlap=0, int overlapv=overlap, bint divide=False, bint opt=True, bint meander=True, bint fields=False, bint tff, int dct=0]) 100 | 101 | mv.Compensate(clip clip, clip super, clip vectors[, int scbehavior=1, int thsad=10000, bint fields=False, float time=100.0, int thscd1=400, int thscd2=130, bint opt=True, bint tff]) 102 | 103 | mv.Degrain1(clip clip, clip super, clip mvbw, clip mvfw[, int thsad=400, int thsadc=thsad, int plane=4, int limit=255, int limitc=limit, int thscd1=400, int thscd2=130, bint opt=True]) 104 | 105 | mv.Degrain2(clip clip, clip super, clip mvbw, clip mvfw, clip mvbw2, clip mvfw2[, int thsad=400, int thsadc=thsad, int 
plane=4, int limit=255, int limitc=limit, int thscd1=400, int thscd2=130, bint opt=True]) 106 | 107 | mv.Degrain3(clip clip, clip super, clip mvbw, clip mvfw, clip mvbw2, clip mvfw2, clip mvbw3, clip mvfw3[, int thsad=400, int thsadc=thsad, int plane=4, int limit=255, int limitc=limit, int thscd1=400, int thscd2=130, bint opt=True]) 108 | 109 | mv.Mask(clip clip, clip vectors[, float ml=100.0, float gamma=1.0, int kind=0, float time=100.0, int ysc=0, int thscd1=400, int thscd2=130, bint opt=True]) 110 | 111 | mv.Finest(clip super[, bint opt=True]) 112 | 113 | mv.Flow(clip clip, clip super, clip vectors[, float time=100.0, int mode=0, bint fields=False, int thscd1=400, int thscd2=130, bint opt=True, bint tff]) 114 | 115 | mv.FlowBlur(clip clip, clip super, clip mvbw, clip mvfw[, float blur=50.0, int prec=1, int thscd1=400, int thscd2=130, bint opt=True]) 116 | 117 | mv.FlowInter(clip clip, clip super, clip mvbw, clip mvfw[, float time=50.0, float ml=100.0, bint blend=True, int thscd1=400, int thscd2=130, bint opt=True]) 118 | 119 | mv.FlowFPS(clip clip, clip super, clip mvbw, clip mvfw[, int num=25, int den=1, int mask=2, float ml=100.0, bint blend=True, int thscd1=400, int thscd2=130, bint opt=True]) 120 | 121 | mv.BlockFPS(clip clip, clip super, clip mvbw, clip mvfw[, int num=25, int den=1, int mode=3, float ml=100.0, bint blend=True, int thscd1=400, int thscd2=130, bint opt=True]) 122 | 123 | mv.SCDetection(clip clip, clip vectors[, int thscd1=400, int thscd2=130]) 124 | 125 | mv.DepanAnalyse(clip clip, clip vectors[, clip mask, bint zoom=True, bint rot=True, float pixaspect=1.0, float error=15.0, bint info=False, float wrong=10.0, float zerow=0.05, int thscd1=400, int thscd2=130, bint fields=False, bint tff]) 126 | 127 | mv.DepanEstimate(clip clip[, float trust=4.0, int winx=0, int winy=0, int wleft=-1, int wtop=-1, int dxmax=-1, int dymax=-1, float zoommax=1.0, float stab=1.0, float pixaspect=1.0, bint info=False, bint show=False, bint fields=False, bint tff]) 
128 | 129 | mv.DepanCompensate(clip clip, clip data[, float offset=0.0, int subpixel=2, float pixaspect=1.0, bint matchfields=True, int mirror=0, int blur=0, bint info=False, bint fields=False, bint tff]) 130 | 131 | mv.DepanStabilise(clip clip, clip data[, float cutoff=1.0, float damping=0.9, float initzoom=1.0, bint addzoom=False, int prev=0, int next=0, int mirror=0, int blur=0, float dxmax=60.0, float dymax=30.0, float zoommax=1.05, float rotmax=1.0, int subpixel=2, float pixaspect=1.0, int fitlast=0, float tzoom=3.0, bint info=False, int method=0, bint fields=False]) 132 | 133 | 134 | If *fields* is True, it is assumed that the clip named *clip* first went through std.SeparateFields. 135 | 136 | For information about the other parameters, consult the Avisynth plugins' documentation at http://avisynth.org.ru/mvtools/mvtools2.html or http://www.avisynth.nl/users/fizick/depan/depan.html. This will not be necessary in the future. 137 | 138 | 139 | Compilation 140 | =========== 141 | 142 | FFTW3 configured for 32 bit floats is required ("fftw3f"). 143 | 144 | :: 145 | 146 | meson setup build 147 | ninja -C build 148 | 149 | 150 | License 151 | ======= 152 | 153 | GPL 2, like the Avisynth plugins. 154 | -------------------------------------------------------------------------------- /src/Bullshit.h: -------------------------------------------------------------------------------- 1 | #ifndef BULLSHIT_H 2 | #define BULLSHIT_H 3 | 4 | #if defined(_MSC_VER) && _MSC_VER < 1900 5 | // Don't forget to zero the last byte. _snprintf doesn't do it if the string doesn't fit. 
6 | #define snprintf _snprintf 7 | #endif 8 | 9 | #endif // BULLSHIT_H 10 | -------------------------------------------------------------------------------- /src/CPU.c: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * cpu.c: h264 encoder library 3 | ***************************************************************************** 4 | * Copyright (C) 2003 Laurent Aimar 5 | * $Id: cpu.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $ 6 | * 7 | * Authors: Laurent Aimar 8 | * 9 | * This program is free software; you can redistribute it and/or modify 10 | * it under the terms of the GNU General Public License as published by 11 | * the Free Software Foundation; either version 2 of the License, or 12 | * (at your option) any later version. 13 | * 14 | * This program is distributed in the hope that it will be useful, 15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | * GNU General Public License for more details. 18 | * 19 | * You should have received a copy of the GNU General Public License 20 | * along with this program; if not, write to the Free Software 21 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
22 | *****************************************************************************/ 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | #include "CPU.h" 29 | 30 | 31 | #if defined(MVTOOLS_X86) 32 | 33 | uint32_t cpu_detect(void) { 34 | uint32_t cpu = 0; 35 | uint32_t eax, ebx, ecx, edx; 36 | uint32_t vendor[4] = { 0 }; 37 | uint32_t max_extended_cap, max_basic_cap; 38 | int cache; 39 | 40 | 41 | mvtools_cpu_cpuid(0, &eax, vendor + 0, vendor + 2, vendor + 1); 42 | max_basic_cap = eax; 43 | if (max_basic_cap == 0) 44 | return 0; 45 | 46 | mvtools_cpu_cpuid(1, &eax, &ebx, &ecx, &edx); 47 | if (edx & 0x00800000) 48 | cpu |= X264_CPU_MMX; 49 | else 50 | return cpu; 51 | if (edx & 0x02000000) 52 | cpu |= X264_CPU_MMX2 | X264_CPU_SSE; 53 | if (edx & 0x00008000) 54 | cpu |= X264_CPU_CMOV; 55 | else 56 | return cpu; 57 | if (edx & 0x04000000) 58 | cpu |= X264_CPU_SSE2; 59 | if (ecx & 0x00000001) 60 | cpu |= X264_CPU_SSE3; 61 | if (ecx & 0x00000200) 62 | cpu |= X264_CPU_SSSE3; 63 | if (ecx & 0x00080000) 64 | cpu |= X264_CPU_SSE4; 65 | if (ecx & 0x00100000) 66 | cpu |= X264_CPU_SSE42; 67 | 68 | if (ecx & 0x08000000) { /* XGETBV supported and XSAVE enabled by OS */ 69 | uint64_t xcr0 = mvtools_cpu_xgetbv(0); 70 | if ((xcr0 & 0x6) == 0x6) { /* XMM/YMM state */ 71 | if (ecx & 0x10000000) 72 | cpu |= X264_CPU_AVX; 73 | if (ecx & 0x00001000) 74 | cpu |= X264_CPU_FMA3; 75 | 76 | if (max_basic_cap >= 7) { 77 | mvtools_cpu_cpuid(7, &eax, &ebx, &ecx, &edx); 78 | if (ebx & 0x00000020) 79 | cpu |= X264_CPU_AVX2; 80 | } 81 | } 82 | } 83 | 84 | if (cpu & X264_CPU_SSSE3) 85 | cpu |= X264_CPU_SSE2_IS_FAST; 86 | 87 | mvtools_cpu_cpuid(0x80000000, &eax, &ebx, &ecx, &edx); 88 | max_extended_cap = eax; 89 | 90 | if (max_extended_cap >= 0x80000001) { 91 | mvtools_cpu_cpuid(0x80000001, &eax, &ebx, &ecx, &edx); 92 | 93 | if (ecx & 0x00000020) 94 | cpu |= X264_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */ 95 | if (ecx & 0x00000040) /* SSE4a, AMD only */ 96 | { 97 | int family 
= ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); 98 | cpu |= X264_CPU_SSE2_IS_FAST; /* Phenom and later CPUs have fast SSE units */ 99 | if (family == 0x14) { 100 | cpu &= ~X264_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */ 101 | cpu |= X264_CPU_SSE2_IS_SLOW; /* Bobcat has 64-bit SIMD units */ 102 | cpu |= X264_CPU_SLOW_PALIGNR; /* palignr is insanely slow on Bobcat */ 103 | } 104 | if (family == 0x16) { 105 | cpu |= X264_CPU_SLOW_PSHUFB; /* Jaguar's pshufb isn't that slow, but it's slow enough 106 | * compared to alternate instruction sequences that this 107 | * is equal or faster on almost all such functions. */ 108 | } 109 | } 110 | 111 | if (cpu & X264_CPU_AVX) { 112 | if (ecx & 0x00000800) /* XOP */ 113 | cpu |= X264_CPU_XOP; 114 | if (ecx & 0x00010000) /* FMA4 */ 115 | cpu |= X264_CPU_FMA4; 116 | } 117 | 118 | if (!strcmp((char *)vendor, "AuthenticAMD")) { 119 | if (edx & 0x00400000) 120 | cpu |= X264_CPU_MMX2; 121 | if (!(cpu & X264_CPU_LZCNT)) 122 | cpu |= X264_CPU_SLOW_CTZ; 123 | if ((cpu & X264_CPU_SSE2) && !(cpu & X264_CPU_SSE2_IS_FAST)) 124 | cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */ 125 | } 126 | } 127 | 128 | if (!strcmp((char *)vendor, "GenuineIntel")) { 129 | mvtools_cpu_cpuid(1, &eax, &ebx, &ecx, &edx); 130 | int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); 131 | int model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0); 132 | if (family == 6) { 133 | /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah") 134 | * theoretically support sse2, but it's significantly slower than mmx for 135 | * almost all of x264's functions, so let's just pretend they don't. 
*/ 136 | if (model == 9 || model == 13 || model == 14) { 137 | cpu &= ~(X264_CPU_SSE2 | X264_CPU_SSE3); 138 | assert(!(cpu & (X264_CPU_SSSE3 | X264_CPU_SSE4))); 139 | } 140 | /* Detect Atom CPU */ 141 | else if (model == 28) { 142 | cpu |= X264_CPU_SLOW_ATOM; 143 | cpu |= X264_CPU_SLOW_CTZ; 144 | cpu |= X264_CPU_SLOW_PSHUFB; 145 | } 146 | /* Conroe has a slow shuffle unit. Check the model number to make sure not 147 | * to include crippled low-end Penryns and Nehalems that don't have SSE4. */ 148 | else if ((cpu & X264_CPU_SSSE3) && !(cpu & X264_CPU_SSE4) && model < 23) 149 | cpu |= X264_CPU_SLOW_SHUFFLE; 150 | } 151 | } 152 | 153 | if ((!strcmp((char *)vendor, "GenuineIntel") || !strcmp((char *)vendor, "CyrixInstead")) && !(cpu & X264_CPU_SSE42)) { 154 | /* cacheline size is specified in 3 places, any of which may be missing */ 155 | mvtools_cpu_cpuid(1, &eax, &ebx, &ecx, &edx); 156 | cache = (ebx & 0xff00) >> 5; // cflush size 157 | if (!cache && max_extended_cap >= 0x80000006) { 158 | mvtools_cpu_cpuid(0x80000006, &eax, &ebx, &ecx, &edx); 159 | cache = ecx & 0xff; // cacheline size 160 | } 161 | if (!cache && max_basic_cap >= 2) { 162 | // Cache and TLB Information 163 | static const unsigned char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 }; 164 | static const unsigned char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67, 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 }; 165 | uint32_t buf[4]; 166 | int max, i = 0; 167 | do { 168 | mvtools_cpu_cpuid(2, buf + 0, buf + 1, buf + 2, buf + 3); 169 | max = buf[0] & 0xff; 170 | buf[0] &= ~0xff; 171 | for (int j = 0; j < 4; j++) 172 | if (!(buf[j] >> 31)) 173 | while (buf[j]) { 174 | if (strchr((const char *)cache32_ids, buf[j] & 0xff)) 175 | cache = 32; 176 | if (strchr((const char *)cache64_ids, buf[j] & 0xff)) 177 | cache = 64; 178 | buf[j] >>= 8; 179 | } 180 | } while (++i < max); 181 | } 182 | 183 | if (cache == 32) 184 | 
cpu |= X264_CPU_CACHELINE_32; 185 | else if (cache == 64) 186 | cpu |= X264_CPU_CACHELINE_64; 187 | //else 188 | // x264_log( NULL, X264_LOG_WARNING, "unable to determine cacheline size\n" ); 189 | } 190 | 191 | return cpu; 192 | } 193 | 194 | #elif defined(MVTOOLS_ARM) 195 | 196 | uint32_t cpu_detect(void) { 197 | return ~0; // we just assume NEON is available, as there is no instruction to check 198 | } 199 | 200 | #else // not MVTOOLS_X86 or MVTOOLS_ARM 201 | 202 | uint32_t cpu_detect(void) { 203 | return 0; 204 | } 205 | 206 | #endif 207 | -------------------------------------------------------------------------------- /src/CPU.h: -------------------------------------------------------------------------------- 1 | #ifndef MVT_CPU_H 2 | #define MVT_CPU_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include 9 | 10 | 11 | #if defined(MVTOOLS_X86) 12 | 13 | #define X264_CPU_CMOV 0x0000001 14 | #define X264_CPU_MMX 0x0000002 15 | #define X264_CPU_MMX2 0x0000004 /* MMX2 aka MMXEXT aka ISSE */ 16 | #define X264_CPU_MMXEXT X264_CPU_MMX2 17 | #define X264_CPU_SSE 0x0000008 18 | #define X264_CPU_SSE2 0x0000010 19 | #define X264_CPU_SSE3 0x0000020 20 | #define X264_CPU_SSSE3 0x0000040 21 | #define X264_CPU_SSE4 0x0000080 /* SSE4.1 */ 22 | #define X264_CPU_SSE42 0x0000100 /* SSE4.2 */ 23 | #define X264_CPU_LZCNT 0x0000200 /* Phenom support for "leading zero count" instruction. */ 24 | #define X264_CPU_AVX 0x0000400 /* AVX support: requires OS support even if YMM registers aren't used. 
*/ 25 | #define X264_CPU_XOP 0x0000800 /* AMD XOP */ 26 | #define X264_CPU_FMA4 0x0001000 /* AMD FMA4 */ 27 | #define X264_CPU_FMA3 0x0002000 /* FMA3 */ 28 | #define X264_CPU_AVX2 0x0004000 /* AVX2 */ 29 | #define X264_CPU_BMI1 0x0008000 /* BMI1 */ 30 | #define X264_CPU_BMI2 0x0010000 /* BMI2 */ 31 | /* x86 modifiers */ 32 | #define X264_CPU_CACHELINE_32 0x0020000 /* avoid memory loads that span the border between two cachelines */ 33 | #define X264_CPU_CACHELINE_64 0x0040000 /* 32/64 is the size of a cacheline in bytes */ 34 | #define X264_CPU_SSE2_IS_SLOW 0x0080000 /* avoid most SSE2 functions on Athlon64 */ 35 | #define X264_CPU_SSE2_IS_FAST 0x0100000 /* a few functions are only faster on Core2 and Phenom */ 36 | #define X264_CPU_SLOW_SHUFFLE 0x0200000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */ 37 | #define X264_CPU_STACK_MOD4 0x0400000 /* if stack is only mod4 and not mod16 */ 38 | #define X264_CPU_SLOW_CTZ 0x0800000 /* BSR/BSF x86 instructions are really slow on some CPUs */ 39 | #define X264_CPU_SLOW_ATOM 0x1000000 /* The Atom is terrible: slow SSE unaligned loads, slow 40 | * SIMD multiplies, slow SIMD variable shifts, slow pshufb, 41 | * cacheline split penalties -- gather everything here that 42 | * isn't shared by other CPUs to avoid making half a dozen 43 | * new SLOW flags. 
*/ 44 | #define X264_CPU_SLOW_PSHUFB 0x2000000 /* such as on the Intel Atom */ 45 | #define X264_CPU_SLOW_PALIGNR 0x4000000 /* such as on the AMD Bobcat */ 46 | 47 | void mvtools_cpu_emms(); 48 | void mvtools_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx); 49 | uint64_t mvtools_cpu_xgetbv(int xcr); 50 | 51 | #endif // MVTOOLS_X86 52 | 53 | uint32_t cpu_detect(void); 54 | 55 | enum { 56 | MVOPT_SCALAR = 0, 57 | #ifdef MVTOOLS_X86 58 | MVOPT_SSE2 = 1, 59 | MVOPT_AVX2 = 2, 60 | #elif MVTOOLS_ARM 61 | MVOPT_NEON = 1, 62 | MVOPT_SSE2 = 1, // SSE2 is converted to Neon 63 | #endif // MVTOOLS_X86 64 | }; 65 | 66 | extern uint32_t g_cpuinfo; 67 | 68 | #ifdef __cplusplus 69 | } // extern "C" 70 | #endif 71 | 72 | #endif // MVT_CPU_H 73 | -------------------------------------------------------------------------------- /src/CommonFunctions.h: -------------------------------------------------------------------------------- 1 | #ifndef __COMMON_F__ 2 | #define __COMMON_F__ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include 9 | 10 | 11 | // returns a > 0 ? 
a : 0 12 | inline static int satz(int a) { 13 | return ~(a >> (sizeof(int) * 8 - 1)) & a; 14 | } 15 | 16 | // returns maximum(a, b) 17 | inline static int imax(int a, int b) { 18 | return a + satz(b - a); 19 | } 20 | 21 | // returns minimum(a, b) 22 | inline static int imin(int a, int b) { 23 | return a - satz(a - b); 24 | } 25 | 26 | /* returns the biggest integer x such as 2^x <= i */ 27 | inline static int ilog2(int i) { 28 | int result = 0; 29 | while (i > 1) { 30 | i /= 2; 31 | result++; 32 | } 33 | return result; 34 | } 35 | 36 | /* computes 2^i */ 37 | inline static int iexp2(int i) { 38 | return 1 << satz(i); 39 | // int result = 1; 40 | // while ( i > 0 ) { result *= 2; i--; } 41 | // return result; 42 | } 43 | 44 | // general common divisor (from wikipedia) 45 | inline static int64_t gcd(int64_t u, int64_t v) { 46 | int shift; 47 | 48 | /* GCD(0,x) := x */ 49 | if (u == 0 || v == 0) 50 | return u | v; 51 | 52 | /* Let shift := lg K, where K is the greatest power of 2 53 | dividing both u and v. */ 54 | for (shift = 0; ((u | v) & 1) == 0; ++shift) { 55 | u >>= 1; 56 | v >>= 1; 57 | } 58 | 59 | while ((u & 1) == 0) 60 | u >>= 1; 61 | 62 | /* From here on, u is always odd. */ 63 | do { 64 | while ((v & 1) == 0) /* Loop X */ 65 | v >>= 1; 66 | 67 | /* Now u and v are both odd, so diff(u, v) is even. 68 | Let u = min(u, v), v = diff(u, v)/2. 
*/ 69 | if (u < v) { 70 | v -= u; 71 | } else { 72 | int64_t diff = u - v; 73 | u = v; 74 | v = diff; 75 | } 76 | v >>= 1; 77 | } while (v != 0); 78 | 79 | return u << shift; 80 | } 81 | 82 | #ifdef __cplusplus 83 | } // extern "C" 84 | #endif 85 | 86 | #endif 87 | -------------------------------------------------------------------------------- /src/CommonMacros.h: -------------------------------------------------------------------------------- 1 | #ifndef __COMMON_M__ 2 | #define __COMMON_M__ 3 | 4 | #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(*arr)) 5 | 6 | #endif // __COMMON_M__ 7 | -------------------------------------------------------------------------------- /src/CopyCode.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "CopyCode.h" 6 | #include "CPU.h" 7 | 8 | template 9 | void copyBlock(uint8_t * __restrict pDst, intptr_t nDstPitch, const uint8_t * __restrict pSrc, intptr_t nSrcPitch) { 10 | int unroll = (height >= 8 ? 8 : (height >= 4 ? 4 : (height >= 2 ? 2 : 1))) / ((width + 15) / 16); 11 | unroll = unroll < 1 ? 
1 : unroll; 12 | 13 | for (unsigned j = 0; j < height; j += unroll) { 14 | memcpy(pDst + 0 * nDstPitch, pSrc + 0 * nSrcPitch, width); 15 | if (unroll > 1) { 16 | memcpy(pDst + 1 * nDstPitch, pSrc + 1 * nSrcPitch, width); 17 | } 18 | if (unroll > 2) { 19 | memcpy(pDst + 2 * nDstPitch, pSrc + 2 * nSrcPitch, width); 20 | memcpy(pDst + 3 * nDstPitch, pSrc + 3 * nSrcPitch, width); 21 | } 22 | if (unroll > 4) { 23 | memcpy(pDst + 4 * nDstPitch, pSrc + 4 * nSrcPitch, width); 24 | memcpy(pDst + 5 * nDstPitch, pSrc + 5 * nSrcPitch, width); 25 | memcpy(pDst + 6 * nDstPitch, pSrc + 6 * nSrcPitch, width); 26 | memcpy(pDst + 7 * nDstPitch, pSrc + 7 * nSrcPitch, width); 27 | } 28 | pDst += nDstPitch * unroll; 29 | pSrc += nSrcPitch * unroll; 30 | } 31 | } 32 | 33 | 34 | #define KEY(width, height, bits) (width) << 16 | (height) << 8 | (bits) 35 | #define COPY(width, height) \ 36 | { KEY(width, height, 8), copyBlock }, \ 37 | { KEY(width, height, 16), copyBlock }, 38 | 39 | static const std::unordered_map copy_functions = { 40 | COPY(2, 2) 41 | COPY(2, 4) 42 | COPY(4, 2) 43 | COPY(4, 4) 44 | COPY(4, 8) 45 | COPY(8, 1) 46 | COPY(8, 2) 47 | COPY(8, 4) 48 | COPY(8, 8) 49 | COPY(8, 16) 50 | COPY(16, 1) 51 | COPY(16, 2) 52 | COPY(16, 4) 53 | COPY(16, 8) 54 | COPY(16, 16) 55 | COPY(16, 32) 56 | COPY(32, 8) 57 | COPY(32, 16) 58 | COPY(32, 32) 59 | COPY(32, 64) 60 | COPY(64, 16) 61 | COPY(64, 32) 62 | COPY(64, 64) 63 | COPY(64, 128) 64 | COPY(128, 32) 65 | COPY(128, 64) 66 | COPY(128, 128) 67 | }; 68 | 69 | COPYFunction selectCopyFunction(unsigned width, unsigned height, unsigned bits) { 70 | return copy_functions.at(KEY(width, height, bits)); 71 | } 72 | 73 | #undef COPY 74 | #undef KEY 75 | 76 | -------------------------------------------------------------------------------- /src/CopyCode.h: -------------------------------------------------------------------------------- 1 | #ifndef COPYCODE_H 2 | #define COPYCODE_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include 
9 | 10 | 11 | typedef void (*COPYFunction)(uint8_t *pDst, intptr_t nDstPitch, 12 | const uint8_t *pSrc, intptr_t nSrcPitch); 13 | 14 | 15 | COPYFunction selectCopyFunction(unsigned width, unsigned height, unsigned bits); 16 | 17 | #ifdef __cplusplus 18 | } // extern "C" 19 | #endif 20 | 21 | #endif // COPYCODE_H 22 | -------------------------------------------------------------------------------- /src/DCTFFTW.cpp: -------------------------------------------------------------------------------- 1 | // DCT calculation with fftw (real) 2 | // Copyright(c)2006 A.G.Balakhnin aka Fizick 3 | // See legal notice in Copying.txt for more information 4 | 5 | // This program is free software; you can redistribute it and/or modify 6 | // it under the terms of the GNU General Public License as published by 7 | // the Free Software Foundation; either version 2 of the License, or 8 | // (at your option) any later version. 9 | // 10 | // This program is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | // GNU General Public License for more details. 14 | // 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 18 | // http://www.gnu.org/copyleft/gpl.html . 
19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #include "DCTFFTW.h" 25 | 26 | 27 | static const float sqrt_2_div_2 = 0.70710678118654752440084436210485f; 28 | 29 | 30 | template 31 | static void Float2Pixels_C(const DCTFFTW *dct, uint8_t *dstp8, int dst_pitch, float *realdata) { 32 | PixelType *dstp = (PixelType *)dstp8; 33 | dst_pitch /= sizeof(PixelType); 34 | 35 | PixelType *dstp_orig = dstp; 36 | float *realdata_orig = realdata; 37 | 38 | int pixelMax = (1 << dct->bitsPerSample) - 1; 39 | int pixelHalf = 1 << (dct->bitsPerSample - 1); 40 | 41 | for (int j = 0; j < dct->sizey; j++) { 42 | for (int i = 0; i < dct->sizex; i++) { 43 | float f = realdata[i] * sqrt_2_div_2; // to be compatible with integer DCTINT8 44 | int integ = (int)(nearbyintf(f)); 45 | dstp[i] = std::min(pixelMax, std::max(0, (integ >> dct->dctshift) + pixelHalf)); 46 | } 47 | dstp += dst_pitch; 48 | realdata += dct->sizex; 49 | } 50 | 51 | float f = realdata_orig[0] * 0.5f; // to be compatible with integer DCTINT8 52 | int integ = (int)(nearbyintf(f)); 53 | dstp_orig[0] = std::min(pixelMax, std::max(0, (integ >> dct->dctshift0) + pixelHalf)); // DC 54 | } 55 | 56 | 57 | #if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM) 58 | 59 | #if defined(MVTOOLS_ARM) 60 | #include "sse2neon.h" 61 | #else 62 | #include 63 | #endif 64 | 65 | template 66 | static void Float2Pixels_SSE2(const DCTFFTW *dct, uint8_t *dstp8, int dst_pitch, float *realdata) { 67 | PixelType *dstp = (PixelType *)dstp8; 68 | dst_pitch /= sizeof(PixelType); 69 | 70 | unsigned width = dct->sizex; 71 | unsigned height = dct->sizey; 72 | 73 | PixelType *dstp_orig = dstp; 74 | float *realdata_orig = realdata; 75 | 76 | int pixel_max, pixel_half, pixel_min; 77 | __m128i words_pixel_max, words_pixel_half, words_pixel_min; 78 | 79 | if (sizeof(PixelType) == 1) { 80 | pixel_max = 255; 81 | pixel_half = 128; 82 | pixel_min = 0; 83 | 84 | words_pixel_max = _mm_set1_epi16(pixel_max); 85 | words_pixel_half = _mm_set1_epi16(pixel_half); 86 
| words_pixel_min = _mm_set1_epi16(pixel_min); 87 | } else { 88 | pixel_max = (1 << dct->bitsPerSample) - 1; 89 | pixel_half = 1 << (dct->bitsPerSample - 1); 90 | pixel_min = 0; 91 | 92 | // Shitty because of pminsw/pmaxsw. 93 | words_pixel_max = _mm_set1_epi16(pixel_max - pixel_half); 94 | words_pixel_half = _mm_set1_epi16(pixel_half); 95 | words_pixel_min = _mm_set1_epi16(pixel_min - pixel_half); 96 | } 97 | 98 | __m128i dwords_dctshift = _mm_cvtsi32_si128(dct->dctshift); 99 | 100 | for (unsigned y = 0; y < height; y++) { 101 | for (unsigned x = 0; x < width; x += 4) { 102 | __m128 f = _mm_load_ps(&realdata[x]); 103 | f = _mm_mul_ps(f, _mm_set1_ps(sqrt_2_div_2)); 104 | 105 | __m128i i = _mm_cvtps_epi32(f); 106 | i = _mm_sra_epi32(i, dwords_dctshift); 107 | i = _mm_packs_epi32(i, i); 108 | 109 | if (sizeof(PixelType) == 1) { 110 | i = _mm_add_epi16(i, words_pixel_half); 111 | i = _mm_packus_epi16(i, i); 112 | *(int *)(dstp + x) = _mm_cvtsi128_si32(i); 113 | } else { 114 | i = _mm_min_epi16(i, words_pixel_max); 115 | i = _mm_max_epi16(i, words_pixel_min); 116 | i = _mm_add_epi16(i, words_pixel_half); 117 | _mm_storel_epi64((__m128i *)&dstp[x], i); 118 | } 119 | } 120 | 121 | dstp += dst_pitch; 122 | realdata += width; 123 | } 124 | 125 | int i = _mm_cvtss_si32(_mm_set_ss(realdata_orig[0] * 0.5f)); 126 | dstp_orig[0] = std::max(0, std::min((i >> dct->dctshift0) + pixel_half, pixel_max)); 127 | } 128 | 129 | #endif // MVTOOLS_X86 130 | 131 | 132 | std::mutex g_fftw_plans_mutex; 133 | 134 | 135 | void dctInit(DCTFFTW *dct, int sizex, int sizey, int bitsPerSample, int opt) { 136 | dct->sizex = sizex; 137 | dct->sizey = sizey; 138 | dct->bitsPerSample = bitsPerSample; 139 | 140 | int size2d = sizey * sizex; 141 | 142 | int cursize = 1; 143 | dct->dctshift = 0; 144 | while (cursize < size2d) { 145 | dct->dctshift++; 146 | cursize = (cursize << 1); 147 | } 148 | 149 | dct->dctshift0 = dct->dctshift + 2; 150 | 151 | dct->fSrc = (float *)fftwf_malloc(sizeof(float) * 
size2d); 152 | dct->fSrcDCT = (float *)fftwf_malloc(sizeof(float) * size2d); 153 | 154 | if (bitsPerSample == 8) 155 | dct->Float2Pixels = Float2Pixels_C; 156 | else 157 | dct->Float2Pixels = Float2Pixels_C; 158 | 159 | if (opt) { 160 | #if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM) 161 | if (bitsPerSample == 8) 162 | dct->Float2Pixels = Float2Pixels_SSE2; 163 | else 164 | dct->Float2Pixels = Float2Pixels_SSE2; 165 | #endif 166 | } 167 | 168 | { 169 | std::lock_guard guard(g_fftw_plans_mutex); 170 | dct->dctplan = fftwf_plan_r2r_2d(sizey, sizex, dct->fSrc, dct->fSrcDCT, 171 | FFTW_REDFT10, FFTW_REDFT10, FFTW_ESTIMATE); // direct fft 172 | } 173 | } 174 | 175 | 176 | void dctDeinit(DCTFFTW *dct) { 177 | { 178 | std::lock_guard guard(g_fftw_plans_mutex); 179 | fftwf_destroy_plan(dct->dctplan); 180 | } 181 | fftwf_free(dct->fSrc); 182 | fftwf_free(dct->fSrcDCT); 183 | } 184 | 185 | 186 | // put source data to real array for FFT 187 | template 188 | static void Pixels2Float(const DCTFFTW *dct, const uint8_t *srcp8, int src_pitch, float *realdata) { 189 | for (int j = 0; j < dct->sizey; j++) { 190 | for (int i = 0; i < dct->sizex; i++) { 191 | PixelType *srcp = (PixelType *)srcp8; 192 | realdata[i] = srcp[i]; 193 | } 194 | srcp8 += src_pitch; 195 | realdata += dct->sizex; 196 | } 197 | } 198 | 199 | 200 | void dctBytes2D(DCTFFTW *dct, const uint8_t *srcp, int src_pitch, uint8_t *dctp, int dct_pitch) { 201 | if (dct->bitsPerSample == 8) { 202 | Pixels2Float(dct, srcp, src_pitch, dct->fSrc); 203 | } else { 204 | Pixels2Float(dct, srcp, src_pitch, dct->fSrc); 205 | } 206 | fftwf_execute_r2r(dct->dctplan, dct->fSrc, dct->fSrcDCT); 207 | dct->Float2Pixels(dct, dctp, dct_pitch, dct->fSrcDCT); 208 | } 209 | -------------------------------------------------------------------------------- /src/DCTFFTW.h: -------------------------------------------------------------------------------- 1 | // DCT calculation with fftw (real) 2 | // Copyright(c)2006 A.G.Balakhnin aka Fizick 3 | 
// See legal notice in Copying.txt for more information 4 | 5 | // This program is free software; you can redistribute it and/or modify 6 | // it under the terms of the GNU General Public License as published by 7 | // the Free Software Foundation; either version 2 of the License, or 8 | // (at your option) any later version. 9 | // 10 | // This program is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | // GNU General Public License for more details. 14 | // 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 18 | // http://www.gnu.org/copyleft/gpl.html . 19 | 20 | #ifndef DCTFFTW_H 21 | #define DCTFFTW_H 22 | 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #endif 26 | 27 | #include 28 | 29 | #include 30 | 31 | 32 | typedef struct DCTFFTW DCTFFTW; 33 | 34 | typedef void (*Float2PixelsFunction)(const DCTFFTW *dct, uint8_t *dstp, int dst_pitch, float *realdata); 35 | 36 | 37 | typedef struct DCTFFTW { 38 | int sizex; 39 | int sizey; 40 | int bitsPerSample; 41 | 42 | float *fSrc; 43 | fftwf_plan dctplan; 44 | float *fSrcDCT; 45 | 46 | int dctshift; 47 | int dctshift0; 48 | 49 | Float2PixelsFunction Float2Pixels; 50 | } DCTFFTW; 51 | 52 | 53 | void dctInit(DCTFFTW *dct, int sizex, int sizey, int bitsPerSample, int opt); 54 | 55 | void dctDeinit(DCTFFTW *dct); 56 | 57 | void dctBytes2D(DCTFFTW *dct, const uint8_t *srcp, int src_pitch, uint8_t *dctp, int dct_pitch); 58 | 59 | #ifdef __cplusplus 60 | } // extern "C" 61 | #endif 62 | 63 | #endif // DCTFFTW_H 64 | -------------------------------------------------------------------------------- /src/EntryPoint.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 
4 | #include "CPU.h" 5 | 6 | 7 | // Extra indirection to keep the parameter lists with the respective filters. 8 | 9 | 10 | void mvsuperRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 11 | void mvanalyseRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 12 | void mvdegrainsRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 13 | void mvcompensateRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 14 | void mvrecalculateRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 15 | void mvmaskRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 16 | void mvfinestRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 17 | void mvflowRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 18 | void mvflowblurRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 19 | void mvflowinterRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 20 | void mvflowfpsRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 21 | void mvblockfpsRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 22 | void mvscdetectionRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 23 | void mvdepanRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 24 | 25 | 26 | uint32_t g_cpuinfo = 0; 27 | 28 | VS_EXTERNAL_API(void) 29 | VapourSynthPluginInit2(VSPlugin *plugin, const VSPLUGINAPI *vspapi) { 30 | const int packageVersion = atoi(PACKAGE_VERSION); 31 | 32 | vspapi->configPlugin("com.nodame.mvtools", "mv", "MVTools v" PACKAGE_VERSION, VS_MAKE_VERSION(packageVersion, 0), VS_MAKE_VERSION(VAPOURSYNTH_API_MAJOR, VAPOURSYNTH_API_MINOR), 0, plugin); 33 | 34 | mvsuperRegister(plugin, vspapi); 35 | mvanalyseRegister(plugin, vspapi); 36 | mvdegrainsRegister(plugin, vspapi); 37 | mvcompensateRegister(plugin, vspapi); 38 | mvrecalculateRegister(plugin, vspapi); 39 | mvmaskRegister(plugin, vspapi); 40 | mvfinestRegister(plugin, vspapi); 41 | mvflowRegister(plugin, vspapi); 42 | mvflowblurRegister(plugin, vspapi); 43 | mvflowinterRegister(plugin, vspapi); 44 | mvflowfpsRegister(plugin, 
vspapi); 45 | mvblockfpsRegister(plugin, vspapi); 46 | mvscdetectionRegister(plugin, vspapi); 47 | mvdepanRegister(plugin, vspapi); 48 | 49 | g_cpuinfo = cpu_detect(); 50 | } 51 | -------------------------------------------------------------------------------- /src/Fakery.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | 5 | #include "CommonFunctions.h" 6 | #include "Fakery.h" 7 | 8 | 9 | // FakeBlockData 10 | 11 | void fbdUpdate(FakeBlockData *fbd, const VECTOR *array) { 12 | fbd->vector = *array; 13 | } 14 | 15 | 16 | // FakePlaneOfBlocks 17 | 18 | void fpobInit(FakePlaneOfBlocks *fpob, int sizeX, int sizeY, int pel, int nOverlapX, int nOverlapY, int nBlkX, int nBlkY) { 19 | fpob->nBlkSizeX = sizeX; 20 | fpob->nBlkSizeY = sizeY; 21 | fpob->nOverlapX = nOverlapX; 22 | fpob->nOverlapY = nOverlapY; 23 | fpob->nBlkX = nBlkX; 24 | fpob->nBlkY = nBlkY; 25 | fpob->nBlkCount = fpob->nBlkX * fpob->nBlkY; 26 | fpob->nPel = pel; 27 | 28 | fpob->blocks = (FakeBlockData *)malloc(fpob->nBlkCount * sizeof(FakeBlockData)); 29 | 30 | for (int j = 0, blkIdx = 0; j < fpob->nBlkY; j++) { 31 | for (int i = 0; i < fpob->nBlkX; i++, blkIdx++) { 32 | fpob->blocks[blkIdx].x = i * (fpob->nBlkSizeX - fpob->nOverlapX); 33 | fpob->blocks[blkIdx].y = j * (fpob->nBlkSizeY - fpob->nOverlapY); 34 | } 35 | } 36 | } 37 | 38 | 39 | void fpobDeinit(FakePlaneOfBlocks *fpob) { 40 | free(fpob->blocks); 41 | } 42 | 43 | 44 | void fpobUpdate(FakePlaneOfBlocks *fpob, const uint8_t *array) { 45 | const VECTOR *blocks = (const VECTOR *)array; 46 | 47 | for (int i = 0; i < fpob->nBlkCount; i++) 48 | fbdUpdate(&fpob->blocks[i], &blocks[i]); 49 | } 50 | 51 | 52 | int fpobIsSceneChange(const FakePlaneOfBlocks *fpob, int64_t nTh1, int nTh2) { 53 | int sum = 0; 54 | for (int i = 0; i < fpob->nBlkCount; i++) 55 | sum += (fpob->blocks[i].vector.sad > nTh1) ? 
1 : 0; 56 | 57 | return (sum > nTh2); 58 | } 59 | 60 | 61 | const FakeBlockData *fpobGetBlock(const FakePlaneOfBlocks *fpob, int i) { 62 | return &fpob->blocks[i]; 63 | } 64 | 65 | 66 | // FakeGroupOfPlanes 67 | 68 | void fgopInit(FakeGroupOfPlanes *fgop, const MVAnalysisData *ad) { 69 | fgop->nLvCount = ad->nLvCount; 70 | int nBlkX1 = ad->nBlkX; 71 | int nBlkY1 = ad->nBlkY; 72 | int nWidth_B = (ad->nBlkSizeX - ad->nOverlapX) * nBlkX1 + ad->nOverlapX; 73 | int nHeight_B = (ad->nBlkSizeY - ad->nOverlapY) * nBlkY1 + ad->nOverlapY; 74 | 75 | fgop->planes = (FakePlaneOfBlocks **)malloc(ad->nLvCount * sizeof(FakePlaneOfBlocks *)); 76 | 77 | fgop->planes[0] = (FakePlaneOfBlocks *)malloc(sizeof(FakePlaneOfBlocks)); 78 | fpobInit(fgop->planes[0], ad->nBlkSizeX, ad->nBlkSizeY, ad->nPel, ad->nOverlapX, ad->nOverlapY, nBlkX1, nBlkY1); 79 | 80 | for (int i = 1; i < ad->nLvCount; i++) { 81 | nBlkX1 = ((nWidth_B >> i) - ad->nOverlapX) / (ad->nBlkSizeX - ad->nOverlapX); 82 | nBlkY1 = ((nHeight_B >> i) - ad->nOverlapY) / (ad->nBlkSizeY - ad->nOverlapY); 83 | 84 | fgop->planes[i] = (FakePlaneOfBlocks *)malloc(sizeof(FakePlaneOfBlocks)); 85 | fpobInit(fgop->planes[i], ad->nBlkSizeX, ad->nBlkSizeY, 1, ad->nOverlapX, ad->nOverlapY, nBlkX1, nBlkY1); // fixed bug with nOverlapX in v1.10.2 86 | } 87 | } 88 | 89 | 90 | void fgopDeinit(FakeGroupOfPlanes *fgop) { 91 | if (fgop->planes) { 92 | for (int i = 0; i < fgop->nLvCount; i++) { 93 | fpobDeinit(fgop->planes[i]); 94 | free(fgop->planes[i]); 95 | } 96 | 97 | free(fgop->planes); 98 | fgop->planes = 0; //v1.2.1 99 | } 100 | } 101 | 102 | 103 | static inline int fgopGetValidity(const uint8_t *array) { 104 | MVArraySizeType validity; 105 | memcpy(&validity, array + sizeof(MVArraySizeType), sizeof(validity)); 106 | return (validity == 1); 107 | } 108 | 109 | 110 | void fgopUpdate(FakeGroupOfPlanes *fgop, const uint8_t *array) { 111 | fgop->validity = fgopGetValidity(array); 112 | 113 | const uint8_t *pA = array + 2 * sizeof(MVArraySizeType); 
114 | for (int i = fgop->nLvCount - 1; i >= 0; i--) { 115 | fpobUpdate(fgop->planes[i], pA + sizeof(MVArraySizeType)); 116 | 117 | MVArraySizeType size; 118 | memcpy(&size, pA, sizeof(size)); 119 | pA += size; 120 | } 121 | } 122 | 123 | 124 | int fgopIsSceneChange(const FakeGroupOfPlanes *fgop, int64_t nThSCD1, int nThSCD2) { 125 | return fpobIsSceneChange(fgop->planes[0], nThSCD1, nThSCD2); 126 | } 127 | 128 | 129 | int fgopIsValid(const FakeGroupOfPlanes *fgop) { 130 | return fgop->validity; 131 | } 132 | 133 | 134 | const FakePlaneOfBlocks *fgopGetPlane(const FakeGroupOfPlanes *fgop, int i) { 135 | return fgop->planes[i]; 136 | } 137 | 138 | 139 | const FakeBlockData *fgopGetBlock(const FakeGroupOfPlanes *fgop, int nLevel, int nBlk) { 140 | return fpobGetBlock(fgopGetPlane(fgop, nLevel), nBlk); 141 | } 142 | 143 | 144 | int fgopIsUsable(const FakeGroupOfPlanes *fgop, int64_t thscd1, int thscd2) { 145 | return !fgopIsSceneChange(fgop, thscd1, thscd2) && fgopIsValid(fgop); 146 | } 147 | -------------------------------------------------------------------------------- /src/Fakery.h: -------------------------------------------------------------------------------- 1 | #ifndef MVTOOLS_FAKERY_H 2 | #define MVTOOLS_FAKERY_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | 9 | #include "MVAnalysisData.h" 10 | 11 | 12 | typedef struct FakeBlockData { 13 | int x; 14 | int y; 15 | VECTOR vector; 16 | } FakeBlockData; 17 | 18 | 19 | typedef struct FakePlaneOfBlocks { 20 | int nBlkX; 21 | int nBlkY; 22 | int nBlkSizeX; 23 | int nBlkSizeY; 24 | int nBlkCount; 25 | int nPel; 26 | int nOverlapX; 27 | int nOverlapY; 28 | 29 | FakeBlockData *blocks; 30 | } FakePlaneOfBlocks; 31 | 32 | 33 | typedef struct FakeGroupOfPlanes { 34 | int nLvCount; 35 | int validity; 36 | 37 | FakePlaneOfBlocks **planes; 38 | } FakeGroupOfPlanes; 39 | 40 | 41 | // FakeBlockData 42 | 43 | void fbdUpdate(FakeBlockData *fbd, const VECTOR *array); 44 | 45 | 46 | // FakePlaneOfBlocks 47 | 48 | 
// Initialise the block grid for one pyramid level.
void fpobInit(FakePlaneOfBlocks *fpob, int sizeX, int sizeY, int pel, int nOverlapX, int nOverlapY, int nBlkX, int nBlkY);

// Free the block grid.
void fpobDeinit(FakePlaneOfBlocks *fpob);

// Refresh every block's vector from a packed VECTOR array.
void fpobUpdate(FakePlaneOfBlocks *fpob, const uint8_t *array);

// Nonzero when more than nTh2 blocks have SAD above nTh1.
int fpobIsSceneChange(const FakePlaneOfBlocks *fpob, int64_t nTh1, int nTh2);

const FakeBlockData *fpobGetBlock(const FakePlaneOfBlocks *fpob, int i);


// FakeGroupOfPlanes

// Build the plane pyramid matching the analysis parameters.
void fgopInit(FakeGroupOfPlanes *fgop, const MVAnalysisData *ad);

// Free the pyramid (safe to call when planes is already NULL).
void fgopDeinit(FakeGroupOfPlanes *fgop);

// Unpack validity and all levels from the analysis array.
void fgopUpdate(FakeGroupOfPlanes *fgop, const uint8_t *array);

// Scene change test on the finest plane (planes[0]).
int fgopIsSceneChange(const FakeGroupOfPlanes *fgop, int64_t nThSCD1, int nThSCD2);

int fgopIsValid(const FakeGroupOfPlanes *fgop);

const FakePlaneOfBlocks *fgopGetPlane(const FakeGroupOfPlanes *fgop, int i);

const FakeBlockData *fgopGetBlock(const FakeGroupOfPlanes *fgop, int nLevel, int nBlk);

// Usable = valid and not a scene change.
int fgopIsUsable(const FakeGroupOfPlanes *fgop, int64_t thscd1, int thscd2);


#ifdef __cplusplus
} // extern "C"
#endif

#endif // MVTOOLS_FAKERY_H
--------------------------------------------------------------------------------
/src/GroupOfPlanes.c:
--------------------------------------------------------------------------------
// Author: Manao
// Copyright(c)2006 A.G.Balakhnin aka Fizick - overlap, global MV, divide
// See legal notice in Copying.txt for more information
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
9 | // 10 | // This program is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | // GNU General Public License for more details. 14 | // 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 18 | // http://www.gnu.org/copyleft/gpl.html . 19 | 20 | #include 21 | 22 | #include "GroupOfPlanes.h" 23 | 24 | 25 | void gopInit(GroupOfPlanes *gop, int nBlkSizeX, int nBlkSizeY, int nLevelCount, int nPel, int nMotionFlags, int nCPUFlags, int nOverlapX, int nOverlapY, int nBlkX, int nBlkY, int xRatioUV, int yRatioUV, int divideExtra, int bitsPerSample) { 26 | gop->nBlkSizeX = nBlkSizeX; 27 | gop->nBlkSizeY = nBlkSizeY; 28 | gop->nLevelCount = nLevelCount; 29 | gop->nOverlapX = nOverlapX; 30 | gop->nOverlapY = nOverlapY; 31 | gop->xRatioUV = xRatioUV; 32 | gop->yRatioUV = yRatioUV; 33 | gop->divideExtra = divideExtra; 34 | 35 | gop->planes = (PlaneOfBlocks **)malloc(gop->nLevelCount * sizeof(PlaneOfBlocks *)); 36 | 37 | int nBlkXCurrent = nBlkX; 38 | int nBlkYCurrent = nBlkY; 39 | 40 | int nPelCurrent = nPel; 41 | int nMotionFlagsCurrent = nMotionFlags; 42 | 43 | int nWidth_B = (gop->nBlkSizeX - gop->nOverlapX) * nBlkX + gop->nOverlapX; 44 | int nHeight_B = (gop->nBlkSizeY - gop->nOverlapY) * nBlkY + gop->nOverlapY; 45 | 46 | for (int i = 0; i < gop->nLevelCount; i++) { 47 | if (i == gop->nLevelCount - 1) 48 | nMotionFlagsCurrent |= MOTION_SMALLEST_PLANE; 49 | nBlkXCurrent = ((nWidth_B >> i) - gop->nOverlapX) / (gop->nBlkSizeX - gop->nOverlapX); 50 | nBlkYCurrent = ((nHeight_B >> i) - gop->nOverlapY) / (gop->nBlkSizeY - gop->nOverlapY); 51 | 52 | gop->planes[i] = (PlaneOfBlocks *)malloc(sizeof(PlaneOfBlocks)); 53 | pobInit(gop->planes[i], nBlkXCurrent, nBlkYCurrent, 
gop->nBlkSizeX, gop->nBlkSizeY, nPelCurrent, i, nMotionFlagsCurrent, nCPUFlags, gop->nOverlapX, gop->nOverlapY, gop->xRatioUV, gop->yRatioUV, bitsPerSample); 54 | nPelCurrent = 1; 55 | } 56 | } 57 | 58 | 59 | void gopDeinit(GroupOfPlanes *gop) { 60 | for (int i = 0; i < gop->nLevelCount; i++) { 61 | pobDeinit(gop->planes[i]); 62 | free(gop->planes[i]); 63 | } 64 | 65 | free(gop->planes); 66 | } 67 | 68 | 69 | void gopSearchMVs(GroupOfPlanes *gop, MVGroupOfFrames *pSrcGOF, MVGroupOfFrames *pRefGOF, 70 | SearchType searchType, int nSearchParam, int nPelSearch, int nLambda, 71 | int lsad, int pnew, int plevel, int global, 72 | uint8_t *out, int fieldShift, DCTFFTW *DCT, int dctmode, 73 | int pzero, int pglobal, int64_t badSAD, int badrange, int meander, int tryMany, 74 | SearchType coarseSearchType) { 75 | int i; 76 | 77 | // write group's size 78 | MVArraySizeType size = gopGetArraySize(gop); 79 | memcpy(out, &size, sizeof(size)); 80 | 81 | // write validity : 1 in that case 82 | MVArraySizeType validity = 1; 83 | memcpy(out + sizeof(size), &validity, sizeof(validity)); 84 | 85 | out += sizeof(size) + sizeof(validity); 86 | 87 | int fieldShiftCur = (gop->nLevelCount - 1 == 0) ? fieldShift : 0; // may be non zero for finest level only 88 | 89 | VECTOR globalMV = zeroMV; // create and init global motion vector as zero 90 | 91 | if (!global) 92 | pglobal = pzero; 93 | 94 | int meanLumaChange = 0; 95 | 96 | // Search the motion vectors, for the low details interpolations first 97 | SearchType searchTypeSmallest = (gop->nLevelCount == 1 || searchType == SearchHorizontal || searchType == SearchVertical) ? searchType : coarseSearchType; // full search for smallest coarse plane 98 | int nSearchParamSmallest = (gop->nLevelCount == 1) ? 
nPelSearch : nSearchParam; 99 | int tryManyLevel = tryMany && gop->nLevelCount > 1; 100 | pobSearchMVs(gop->planes[gop->nLevelCount - 1], 101 | pSrcGOF->frames[gop->nLevelCount - 1], 102 | pRefGOF->frames[gop->nLevelCount - 1], 103 | searchTypeSmallest, nSearchParamSmallest, nLambda, lsad, pnew, plevel, 104 | out, &globalMV, fieldShiftCur, DCT, dctmode, &meanLumaChange, 105 | pzero, pglobal, badSAD, badrange, meander, tryManyLevel); 106 | // Refining the search until we reach the highest detail interpolation. 107 | 108 | out += pobGetArraySize(gop->planes[gop->nLevelCount - 1], gop->divideExtra); 109 | 110 | for (i = gop->nLevelCount - 2; i >= 0; i--) { 111 | SearchType searchTypeLevel = (i == 0 || searchType == SearchHorizontal || searchType == SearchVertical) ? searchType : coarseSearchType; // full search for coarse planes 112 | int nSearchParamLevel = (i == 0) ? nPelSearch : nSearchParam; // special case for finest level 113 | if (global) { 114 | pobEstimateGlobalMVDoubled(gop->planes[i + 1], &globalMV); // get updated global MV (doubled) 115 | } 116 | pobInterpolatePrediction(gop->planes[i], gop->planes[i + 1]); 117 | fieldShiftCur = (i == 0) ? 
fieldShift : 0; // may be non zero for finest level only 118 | tryManyLevel = tryMany && i > 0; // not for finest level to not decrease speed 119 | pobSearchMVs(gop->planes[i], pSrcGOF->frames[i], pRefGOF->frames[i], 120 | searchTypeLevel, nSearchParamLevel, nLambda, lsad, pnew, plevel, 121 | out, &globalMV, fieldShiftCur, DCT, dctmode, &meanLumaChange, 122 | pzero, pglobal, badSAD, badrange, meander, tryManyLevel); 123 | out += pobGetArraySize(gop->planes[i], gop->divideExtra); 124 | } 125 | } 126 | 127 | 128 | void gopRecalculateMVs(GroupOfPlanes *gop, FakeGroupOfPlanes *fgop, MVGroupOfFrames *pSrcGOF, MVGroupOfFrames *pRefGOF, 129 | SearchType searchType, int nSearchParam, int nLambda, 130 | int pnew, 131 | uint8_t *out, int fieldShift, int64_t thSAD, DCTFFTW *DCT, int dctmode, int smooth, int meander) { 132 | // write group's size 133 | MVArraySizeType size = gopGetArraySize(gop); 134 | memcpy(out, &size, sizeof(size)); 135 | 136 | // write validity : 1 in that case 137 | MVArraySizeType validity = 1; 138 | memcpy(out + sizeof(size), &validity, sizeof(validity)); 139 | 140 | out += sizeof(size) + sizeof(validity); 141 | 142 | // Search the motion vectors, for the low details interpolations first 143 | // Refining the search until we reach the highest detail interpolation. 
144 | pobRecalculateMVs(gop->planes[0], fgop, pSrcGOF->frames[0], pRefGOF->frames[0], 145 | searchType, nSearchParam, nLambda, pnew, 146 | out, fieldShift, thSAD, DCT, dctmode, smooth, meander); 147 | } 148 | 149 | 150 | void gopWriteDefaultToArray(GroupOfPlanes *gop, uint8_t *array) { 151 | // write group's size 152 | MVArraySizeType size = gopGetArraySize(gop); 153 | memcpy(array, &size, sizeof(size)); 154 | 155 | // write validity : unvalid in that case 156 | MVArraySizeType validity = 0; 157 | memcpy(array + sizeof(size), &validity, sizeof(validity)); 158 | 159 | array += sizeof(size) + sizeof(validity); 160 | 161 | // write planes 162 | for (int i = gop->nLevelCount - 1; i >= 0; i--) 163 | array += pobWriteDefaultToArray(gop->planes[i], array, gop->divideExtra); 164 | } 165 | 166 | 167 | MVArraySizeType gopGetArraySize(GroupOfPlanes *gop) { 168 | MVArraySizeType size = 2 * sizeof(MVArraySizeType); // size, validity 169 | for (int i = gop->nLevelCount - 1; i >= 0; i--) 170 | size += pobGetArraySize(gop->planes[i], gop->divideExtra); 171 | 172 | 173 | return size; 174 | } 175 | 176 | 177 | // FIND MEDIAN OF 3 ELEMENTS 178 | // 179 | static inline int Median3(int a, int b, int c) { 180 | // b a c || c a b 181 | if (((b <= a) && (a <= c)) || ((c <= a) && (a <= b))) 182 | return a; 183 | 184 | // a b c || c b a 185 | else if (((a <= b) && (b <= c)) || ((c <= b) && (b <= a))) 186 | return b; 187 | 188 | // b c a || a c b 189 | else 190 | return c; 191 | } 192 | 193 | 194 | static void GetMedian(int *vx, int *vy, int vx1, int vy1, int vx2, int vy2, int vx3, int vy3) { // existant median vector (not mixed) 195 | *vx = Median3(vx1, vx2, vx3); 196 | *vy = Median3(vy1, vy2, vy3); 197 | if ((*vx == vx1 && *vy == vy1) || (*vx == vx2 && *vy == vy2) || (*vx == vx3 && *vy == vy3)) 198 | return; 199 | else { 200 | *vx = vx1; 201 | *vy = vy1; 202 | } 203 | } 204 | 205 | 206 | void gopExtraDivide(GroupOfPlanes *gop, uint8_t *out) { 207 | out += 2 * sizeof(MVArraySizeType); // 
skip full size and validity 208 | for (int i = gop->nLevelCount - 1; i >= 1; i--) // skip all levels up to finest estimated 209 | out += pobGetArraySize(gop->planes[i], 0); 210 | 211 | MVArraySizeType size; 212 | memcpy(&size, out, sizeof(size)); 213 | 214 | const VECTOR *blocks_in = (const VECTOR *)(out + sizeof(size)); // finest estimated plane 215 | VECTOR *blocks_out = (VECTOR *)(out + size + sizeof(MVArraySizeType)); // position for divided subblocks data 216 | 217 | int nBlkY = gop->planes[0]->nBlkY; 218 | int nBlkX = gop->planes[0]->nBlkX; 219 | 220 | // top blocks 221 | for (int bx = 0; bx < nBlkX; bx++) { 222 | VECTOR block = blocks_in[bx]; 223 | block.sad >>= 2; 224 | 225 | blocks_out[bx * 2] = block; // top left subblock 226 | blocks_out[bx * 2 + 1] = block; // top right subblock 227 | blocks_out[bx * 2 + nBlkX * 2] = block; // bottom left subblock 228 | blocks_out[bx * 2 + nBlkX * 2 + 1] = block; // bottom right subblock 229 | } 230 | 231 | blocks_out += nBlkX * 4; 232 | blocks_in += nBlkX; 233 | 234 | // middle blocks 235 | for (int by = 1; by < nBlkY - 1; by++) { 236 | int bx = 0; 237 | 238 | VECTOR block = blocks_in[bx]; 239 | block.sad >>= 2; 240 | 241 | blocks_out[bx * 2] = block; // top left subblock 242 | blocks_out[bx * 2 + 1] = block; // top right subblock 243 | blocks_out[bx * 2 + nBlkX * 2] = block; // bottom left subblock 244 | blocks_out[bx * 2 + nBlkX * 2 + 1] = block; // bottom right subblock 245 | 246 | for (bx = 1; bx < nBlkX - 1; bx++) { 247 | block = blocks_in[bx]; 248 | block.sad >>= 2; 249 | 250 | blocks_out[bx * 2] = block; // top left subblock 251 | blocks_out[bx * 2 + 1] = block; // top right subblock 252 | blocks_out[bx * 2 + nBlkX * 2] = block; // bottom left subblock 253 | blocks_out[bx * 2 + nBlkX * 2 + 1] = block; // bottom right subblock 254 | 255 | if (gop->divideExtra > 1) { 256 | GetMedian(&blocks_out[bx * 2].x, &blocks_out[bx * 2].y, 257 | blocks_in[bx].x, blocks_in[bx].y, 258 | blocks_in[bx - 1].x, blocks_in[bx - 1].y, 
259 | blocks_in[bx - nBlkX].x, blocks_in[bx - nBlkX].y); 260 | 261 | GetMedian(&blocks_out[bx * 2 + 1].x, &blocks_out[bx * 2 + 1].y, 262 | blocks_in[bx].x, blocks_in[bx].y, 263 | blocks_in[bx + 1].x, blocks_in[bx + 1].y, 264 | blocks_in[bx - nBlkX].x, blocks_in[bx - nBlkX].y); 265 | 266 | GetMedian(&blocks_out[bx * 2 + nBlkX * 2].x, &blocks_out[bx * 2 + nBlkX * 2].y, 267 | blocks_in[bx].x, blocks_in[bx].y, 268 | blocks_in[bx - 1].x, blocks_in[bx - 1].y, 269 | blocks_in[bx + nBlkX].x, blocks_in[bx + nBlkX].y); 270 | 271 | GetMedian(&blocks_out[bx * 2 + nBlkX * 2 + 1].x, &blocks_out[bx * 2 + nBlkX * 2 + 1].y, 272 | blocks_in[bx].x, blocks_in[bx].y, 273 | blocks_in[bx + 1].x, blocks_in[bx + 1].y, 274 | blocks_in[bx + nBlkX].x, blocks_in[bx + nBlkX].y); 275 | } 276 | } 277 | 278 | bx = nBlkX - 1; 279 | 280 | block = blocks_in[bx]; 281 | block.sad >>= 2; 282 | 283 | blocks_out[bx * 2] = block; // top left subblock 284 | blocks_out[bx * 2 + 1] = block; // top right subblock 285 | blocks_out[bx * 2 + nBlkX * 2] = block; // bottom left subblock 286 | blocks_out[bx * 2 + nBlkX * 2 + 1] = block; // bottom right subblock 287 | 288 | blocks_out += nBlkX * 4; 289 | blocks_in += nBlkX; 290 | } 291 | 292 | // bottom blocks 293 | for (int bx = 0; bx < nBlkX; bx++) { 294 | VECTOR block = blocks_in[bx]; 295 | block.sad >>= 2; 296 | 297 | blocks_out[bx * 2] = block; // top left subblock 298 | blocks_out[bx * 2 + 1] = block; // top right subblock 299 | blocks_out[bx * 2 + nBlkX * 2] = block; // bottom left subblock 300 | blocks_out[bx * 2 + nBlkX * 2 + 1] = block; // bottom right subblock 301 | } 302 | } 303 | -------------------------------------------------------------------------------- /src/GroupOfPlanes.h: -------------------------------------------------------------------------------- 1 | // See legal notice in Copying.txt for more information 2 | 3 | // This program is free software; you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public 
License as published by 5 | // the Free Software Foundation; either version 2 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // This program is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with this program; if not, write to the Free Software 15 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 16 | // http://www.gnu.org/copyleft/gpl.html . 17 | 18 | #ifndef GROUPOFPLANES_H 19 | #define GROUPOFPLANES_H 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | #include "DCTFFTW.h" 26 | #include "Fakery.h" 27 | #include "MVFrame.h" 28 | #include "PlaneOfBlocks.h" 29 | 30 | 31 | typedef struct GroupOfPlanes { 32 | int nBlkSizeX; 33 | int nBlkSizeY; 34 | int nLevelCount; 35 | int nOverlapX; 36 | int nOverlapY; 37 | int xRatioUV; 38 | int yRatioUV; 39 | int divideExtra; 40 | 41 | PlaneOfBlocks **planes; 42 | } GroupOfPlanes; 43 | 44 | 45 | void gopInit(GroupOfPlanes *gop, int nBlkSizeX, int nBlkSizeY, int nLevelCount, int nPel, int nMotionFlags, int nCPUFlags, int nOverlapX, int nOverlapY, int nBlkX, int nBlkY, int xRatioUV, int yRatioUV, int divideExtra, int bitsPerSample); 46 | 47 | void gopDeinit(GroupOfPlanes *gop); 48 | 49 | void gopSearchMVs(GroupOfPlanes *gop, MVGroupOfFrames *pSrcGOF, MVGroupOfFrames *pRefGOF, SearchType searchType, int nSearchParam, int nPelSearch, int nLambda, int lsad, int pnew, int plevel, int global, uint8_t *out, int fieldShift, DCTFFTW *DCT, int dctmode, int pzero, int pglobal, int64_t badSAD, int badrange, int meander, int tryMany, SearchType coarseSearchType); 50 | 51 | void gopRecalculateMVs(GroupOfPlanes *gop, FakeGroupOfPlanes *fgop, MVGroupOfFrames *pSrcGOF, MVGroupOfFrames *pRefGOF, 
SearchType searchType, int nSearchParam, int nLambda, int pnew, uint8_t *out, int fieldShift, int64_t thSAD, DCTFFTW *DCT, int dctmode, int smooth, int meander); 52 | 53 | void gopWriteDefaultToArray(GroupOfPlanes *gop, uint8_t *array); 54 | 55 | MVArraySizeType gopGetArraySize(GroupOfPlanes *gop); 56 | 57 | void gopExtraDivide(GroupOfPlanes *gop, uint8_t *out); 58 | 59 | #ifdef __cplusplus 60 | } // extern "C" 61 | #endif 62 | 63 | #endif 64 | -------------------------------------------------------------------------------- /src/Luma.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "Luma.h" 6 | 7 | 8 | enum InstructionSets { 9 | Scalar, 10 | SSE2, 11 | }; 12 | 13 | 14 | template 15 | unsigned int luma_c(const uint8_t *pSrc8, intptr_t nSrcPitch) { 16 | unsigned int meanLuma = 0; 17 | for (unsigned j = 0; j < height; j++) { 18 | for (unsigned i = 0; i < width; i++) { 19 | const PixelType *pSrc = (const PixelType *)pSrc8; 20 | meanLuma += pSrc[i]; 21 | } 22 | pSrc8 += nSrcPitch; 23 | } 24 | return meanLuma; 25 | } 26 | 27 | 28 | #if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM) 29 | 30 | #if defined(MVTOOLS_ARM) 31 | #include "sse2neon.h" 32 | #else 33 | #include 34 | #endif 35 | 36 | 37 | #define zeroes _mm_setzero_si128() 38 | 39 | 40 | template 41 | unsigned int luma_sse2(const uint8_t *pSrc, intptr_t nSrcPitch) { 42 | __m128i sum = zeroes; 43 | 44 | for (unsigned y = 0; y < height; y++) { 45 | for (unsigned x = 0; x < width; x += 16) { 46 | __m128i src; 47 | if (width == 4) 48 | src = _mm_cvtsi32_si128(*(const int *)pSrc); 49 | else if (width == 8) 50 | src = _mm_loadl_epi64((const __m128i *)pSrc); 51 | else 52 | src = _mm_loadu_si128((const __m128i *)&pSrc[x]); 53 | 54 | sum = _mm_add_epi64(sum, _mm_sad_epu8(src, zeroes)); 55 | } 56 | 57 | pSrc += nSrcPitch; 58 | } 59 | 60 | if (width >= 16) 61 | sum = _mm_add_epi64(sum, _mm_srli_si128(sum, 8)); 62 | 63 | return 
(unsigned)_mm_cvtsi128_si32(sum); 64 | } 65 | 66 | 67 | #undef zeroes 68 | 69 | 70 | #endif // MVTOOLS_X86 71 | 72 | 73 | // opt can fit in four bits, if the width and height need more than eight bits each. 74 | #define KEY(width, height, bits, opt) (unsigned)(width) << 24 | (height) << 16 | (bits) << 8 | (opt) 75 | 76 | #if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM) 77 | #define LUMA_SSE2(width, height) \ 78 | { KEY(width, height, 8, SSE2), luma_sse2 }, 79 | #else 80 | #define LUMA_SSE2(width, height) 81 | #endif 82 | 83 | #define LUMA(width, height) \ 84 | { KEY(width, height, 8, Scalar), luma_c }, \ 85 | { KEY(width, height, 16, Scalar), luma_c }, 86 | 87 | static const std::unordered_map luma_functions = { 88 | LUMA(4, 4) 89 | LUMA(8, 4) 90 | LUMA(8, 8) 91 | LUMA(16, 2) 92 | LUMA(16, 8) 93 | LUMA(16, 16) 94 | LUMA(32, 16) 95 | LUMA(32, 32) 96 | LUMA(64, 32) 97 | LUMA(64, 64) 98 | LUMA(128, 64) 99 | LUMA(128, 128) 100 | LUMA_SSE2(4, 4) 101 | LUMA_SSE2(8, 4) 102 | LUMA_SSE2(8, 8) 103 | LUMA_SSE2(16, 2) 104 | LUMA_SSE2(16, 8) 105 | LUMA_SSE2(16, 16) 106 | LUMA_SSE2(32, 16) 107 | LUMA_SSE2(32, 32) 108 | LUMA_SSE2(64, 32) 109 | LUMA_SSE2(64, 64) 110 | LUMA_SSE2(128, 64) 111 | LUMA_SSE2(128, 128) 112 | }; 113 | 114 | LUMAFunction selectLumaFunction(unsigned width, unsigned height, unsigned bits, int opt) { 115 | LUMAFunction luma = luma_functions.at(KEY(width, height, bits, Scalar)); 116 | 117 | #if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM) 118 | if (opt) { 119 | try { 120 | luma = luma_functions.at(KEY(width, height, bits, SSE2)); 121 | } catch (std::out_of_range &) { } 122 | } 123 | #endif 124 | 125 | return luma; 126 | } 127 | 128 | #undef LUMA 129 | #undef LUMA_SSE2 130 | #undef KEY 131 | -------------------------------------------------------------------------------- /src/Luma.h: -------------------------------------------------------------------------------- 1 | #ifndef LUMA_H 2 | #define LUMA_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 
| #include 9 | 10 | 11 | typedef unsigned int (*LUMAFunction)(const uint8_t *pSrc, intptr_t nSrcPitch); 12 | 13 | 14 | LUMAFunction selectLumaFunction(unsigned width, unsigned height, unsigned bits, int opt); 15 | 16 | #ifdef __cplusplus 17 | } // extern "C" 18 | #endif 19 | 20 | #endif // LUMA_H 21 | -------------------------------------------------------------------------------- /src/MVAnalysisData.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "Bullshit.h" 4 | #include "MVAnalysisData.h" 5 | 6 | 7 | void scaleThSCD(int64_t *thscd1, int *thscd2, const MVAnalysisData *ad, const char *filter_name, char *error, size_t error_size) { 8 | if (error_size) { 9 | if (error[0]) 10 | return; 11 | error[0] = '\0'; 12 | } 13 | 14 | int maxSAD = 8 * 8 * 255; 15 | 16 | if (*thscd1 > maxSAD) { 17 | snprintf(error, error_size, "%s: thscd1 can be at most %d.", filter_name, maxSAD); 18 | return; 19 | } 20 | 21 | // SCD thresholds 22 | int referenceBlockSize = 8 * 8; 23 | *thscd1 = *thscd1 * (ad->nBlkSizeX * ad->nBlkSizeY) / referenceBlockSize; 24 | if (ad->nMotionFlags & MOTION_USE_CHROMA_MOTION) 25 | *thscd1 += *thscd1 / (ad->xRatioUV * ad->yRatioUV) * 2; 26 | 27 | int pixelMax = (1 << ad->bitsPerSample) - 1; 28 | *thscd1 = (int64_t)((double)*thscd1 * pixelMax / 255.0 + 0.5); 29 | 30 | *thscd2 = *thscd2 * ad->nBlkX * ad->nBlkY / 256; 31 | } 32 | 33 | 34 | void adataFromVectorClip(struct MVAnalysisData *ad, VSNode *clip, const char *filter_name, const char *vector_name, const VSAPI *vsapi, char *error, size_t error_size) { 35 | if (error_size) { 36 | if (error[0]) 37 | return; 38 | error[0] = '\0'; 39 | } 40 | 41 | char errorMsg[1024]; 42 | const VSFrame *evil = vsapi->getFrame(0, clip, errorMsg, 1024); 43 | if (!evil) { 44 | snprintf(error, error_size, "%s: Failed to retrieve first frame from %s. 
Error message: %s", filter_name, vector_name, errorMsg); 45 | return; 46 | } 47 | 48 | const VSMap *props = vsapi->getFramePropertiesRO(evil); 49 | int err; 50 | const char *data = vsapi->mapGetData(props, prop_MVTools_MVAnalysisData, 0, &err); 51 | if (err) { 52 | snprintf(error, error_size, "%s: Property '%s' not found in first frame of %s.", filter_name, prop_MVTools_MVAnalysisData, vector_name); 53 | return; 54 | } 55 | 56 | int data_size = vsapi->mapGetDataSize(props, prop_MVTools_MVAnalysisData, 0, NULL); 57 | if (data_size != sizeof(MVAnalysisData)) { 58 | snprintf(error, error_size, "%s: Property '%s' in first frame of %s has wrong size (%d instead of %d).", filter_name, prop_MVTools_MVAnalysisData, vector_name, data_size, (int)sizeof(MVAnalysisData)); 59 | return; 60 | } 61 | 62 | memcpy(ad, data, sizeof(MVAnalysisData)); 63 | 64 | vsapi->freeFrame(evil); 65 | } 66 | 67 | 68 | void adataCheckSimilarity(const MVAnalysisData *ad1, const MVAnalysisData *ad2, const char *filter_name1, const char *filter_name2, const char *vector_name, char *error, size_t error_size) { 69 | if (error_size) { 70 | if (error[0]) 71 | return; 72 | error[0] = '\0'; 73 | } 74 | 75 | if (ad1->nWidth != ad2->nWidth) 76 | snprintf(error, error_size, "%s: %s and %s have different widths.", filter_name1, filter_name2, vector_name); 77 | 78 | if (ad1->nHeight != ad2->nHeight) 79 | snprintf(error, error_size, "%s: %s and %s have different heights.", filter_name1, filter_name2, vector_name); 80 | 81 | if (ad1->nBlkSizeX != ad2->nBlkSizeX || ad1->nBlkSizeY != ad2->nBlkSizeY) 82 | snprintf(error, error_size, "%s: %s and %s have different block sizes.", filter_name1, filter_name2, vector_name); 83 | 84 | if (ad1->nPel != ad2->nPel) 85 | snprintf(error, error_size, "%s: %s and %s have different pel precision.", filter_name1, filter_name2, vector_name); 86 | 87 | if (ad1->nOverlapX != ad2->nOverlapX || ad1->nOverlapY != ad2->nOverlapY) 88 | snprintf(error, error_size, "%s: %s and %s have 
different overlap.", filter_name1, filter_name2, vector_name); 89 | 90 | if (ad1->xRatioUV != ad2->xRatioUV) 91 | snprintf(error, error_size, "%s: %s and %s have different horizontal subsampling.", filter_name1, filter_name2, vector_name); 92 | 93 | if (ad1->yRatioUV != ad2->yRatioUV) 94 | snprintf(error, error_size, "%s: %s and %s have different vertical subsampling.", filter_name1, filter_name2, vector_name); 95 | 96 | if (ad1->bitsPerSample != ad2->bitsPerSample) 97 | snprintf(error, error_size, "%s: %s and %s have different bit depths.", filter_name1, filter_name2, vector_name); 98 | } 99 | -------------------------------------------------------------------------------- /src/MVAnalysisData.h: -------------------------------------------------------------------------------- 1 | // Define the BlockData class 2 | 3 | // I borrowed a lot of code from XviD's sources here, so I thank all the developpers 4 | // of this wonderful codec 5 | 6 | // See legal notice in Copying.txt for more information 7 | 8 | // This program is free software; you can redistribute it and/or modify 9 | // it under the terms of the GNU General Public License as published by 10 | // the Free Software Foundation; either version 2 of the License, or 11 | // (at your option) any later version. 12 | // 13 | // This program is distributed in the hope that it will be useful, 14 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | // GNU General Public License for more details. 17 | // 18 | // You should have received a copy of the GNU General Public License 19 | // along with this program; if not, write to the Free Software 20 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 21 | // http://www.gnu.org/copyleft/gpl.html . 

#ifndef MVANALYSISDATA_H
#define MVANALYSISDATA_H

#ifdef __cplusplus
extern "C" {
#endif

#include <stdint.h>
#include <stddef.h>

#include <VapourSynth4.h>


// Frame-property keys used to pass analysis metadata and vectors between filters.
static const char prop_MVTools_MVAnalysisData[] = "MVTools_MVAnalysisData";
static const char prop_MVTools_vectors[] = "MVTools_vectors";


// One motion vector together with the SAD of the block it belongs to.
typedef struct VECTOR {
    int x;
    int y;
    int64_t sad;
} VECTOR;


// Type of the size fields in the arrays of VECTOR.
typedef int MVArraySizeType;


/*! \brief Search type : defines the algorithm used for minimizing the SAD */
typedef enum SearchType {
    SearchOnetime,
    SearchNstep,
    SearchLogarithmic,
    SearchExhaustive,
    SearchHex2,
    SearchUnevenMultiHexagon,
    SearchHorizontal,
    SearchVertical
} SearchType;


#define MOTION_USE_SIMD 0x00000001
#define MOTION_IS_BACKWARD 0x00000002
#define MOTION_SMALLEST_PLANE 0x00000004
#define MOTION_USE_CHROMA_MOTION 0x00000008
//force MVAnalyse to use a different function for SAD / SADCHROMA (debug)
#define MOTION_USE_SSD 0x00000010
#define MOTION_USE_SATD 0x00000020


#define MV_DEFAULT_SCD1 400 // increased in v1.4.1
#define MV_DEFAULT_SCD2 130

//#define MV_BUFFER_FRAMES 10

// Zero motion with sad == -1, i.e. "no valid SAD computed yet".
static const VECTOR zeroMV = { 0, 0, -1 };


#define MVANALYSIS_DATA_VERSION 5

// Analysis configuration attached to every vector clip; filters consuming
// vectors read this back via adataFromVectorClip().
typedef struct MVAnalysisData {
    /*! \brief Unique identifier, not very useful */
    int nMagicKey; // placed to head in v.1.2.6

    int nVersion; // MVAnalysisData and outfile format version - added in v1.2.6

    /*! \brief size of a block, in pixel */
    int nBlkSizeX; // horizontal block size

    int nBlkSizeY; // vertical block size - v1.7

    /*! \brief pixel refinement of the motion estimation */
    int nPel;

    /*! \brief number of level for the hierarchal search */
    int nLvCount;

    /*! \brief difference between the index of the reference and the index of the current frame */
    int nDeltaFrame;

    /*! \brief direction of the search ( forward / backward ) */
    int isBackward;

    int nCPUFlags;

    /*! \brief diverse flags to set up the search */
    int nMotionFlags;

    /*! \brief Width of the frame */
    int nWidth;

    /*! \brief Height of the frame */
    int nHeight;

    int nOverlapX; // overlap block size - v1.1

    int nOverlapY; // vertical overlap - v1.7

    int nBlkX; // number of blocks along X

    int nBlkY; // number of blocks along Y

    int bitsPerSample;

    int yRatioUV; // ratio of luma plane height to chroma plane height

    int xRatioUV; // ratio of luma plane width to chroma plane width

    int nHPadding; // Horizontal padding - v1.8.1

    int nVPadding; // Vertical padding - v1.8.1
} MVAnalysisData;


void scaleThSCD(int64_t *thscd1, int *thscd2, const MVAnalysisData *ad, const char *filter_name, char *error, size_t error_size);

void adataFromVectorClip(struct MVAnalysisData *ad, VSNode *clip, const char *filter_name, const char *vector_name, const VSAPI *vsapi, char *error, size_t error_size);

void adataCheckSimilarity(const MVAnalysisData *ad1, const MVAnalysisData *ad2, const char *filter_name1, const char *filter_name2, const char *vector_name, char *error, size_t error_size);


//#define MOTION_DELTA_FRAME_BUFFER 5


#ifdef __cplusplus
} // extern "C"
#endif

#endif // MVANALYSISDATA_H
--------------------------------------------------------------------------------
/src/MVDegrains.h:
--------------------------------------------------------------------------------
#ifndef MVDEGRAINS_H
#define MVDEGRAINS_H

#include <cstdint>
#include <unordered_map>

#include "Fakery.h"
#include
"MVFrame.h"

// Index of each reference frame inside the pRefs/WRefs arrays: backward and
// forward references alternate, ordered by temporal distance.
enum VectorOrder {
    Backward1 = 0,
    Forward1,
    Backward2,
    Forward2,
    Backward3,
    Forward3,
    Backward4,
    Forward4,
    Backward5,
    Forward5,
    Backward6,
    Forward6
};


// Weighted blend of the source block and 2*radius reference blocks into pDst.
typedef void (*DenoiseFunction)(uint8_t *pDst, int nDstPitch, const uint8_t *pSrc, int nSrcPitch, const uint8_t **_pRefs, const int *nRefPitches, int WSrc, const int *WRefs);


// XXX Moves the pointers passed in pRefs. This is okay because they are not
// used after this function is done with them.
// Weights are fixed-point with a 256 total (see normaliseWeights); the +128
// before the >>8 rounds to nearest.
template <unsigned radius, unsigned blockWidth, unsigned blockHeight, typename PixelType>
static void Degrain_C(uint8_t * __restrict pDst8, int nDstPitch, const uint8_t * __restrict pSrc8, int nSrcPitch, const uint8_t ** __restrict pRefs8, const int * __restrict nRefPitches, int WSrc, const int * __restrict WRefs) {
    for (int y = 0; y < blockHeight; y++) {
        for (int x = 0; x < blockWidth; x++) {
            const PixelType *pSrc = (const PixelType * __restrict)pSrc8;
            PixelType *pDst = (PixelType * __restrict)pDst8;

            int sum = 128 + pSrc[x] * WSrc;

            for (int r = 0; r < radius * 2; r++) {
                const PixelType *pRef = (const PixelType * __restrict)pRefs8[r];
                sum += pRef[x] * WRefs[r];
            }

            pDst[x] = sum >> 8;
        }

        pDst8 += nDstPitch;
        pSrc8 += nSrcPitch;
        for (int r = 0; r < radius * 2; r++)
            pRefs8[r] += nRefPitches[r];
    }
}


#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)

#if defined(MVTOOLS_ARM)
#include "sse2neon.h"
#else
#include <emmintrin.h>

DenoiseFunction selectDegrainFunctionAVX2(unsigned radius, unsigned width, unsigned height, unsigned bits);
#endif

// XXX Moves the pointers passed in pRefs. This is okay because they are not
// used after this function is done with them.
// 8-bit only: pixels are widened to 16 bits, weighted, summed, rounded
// (+128, >>8) and packed back down.
template <unsigned radius, unsigned blockWidth, unsigned blockHeight>
static void Degrain_sse2(uint8_t *pDst, int nDstPitch, const uint8_t *pSrc, int nSrcPitch, const uint8_t **pRefs, const int *nRefPitches, int WSrc, const int *WRefs) {
    static_assert(blockWidth >= 4, "");

    __m128i zero = _mm_setzero_si128();
    __m128i wsrc = _mm_set1_epi16(WSrc);
    __m128i wrefs[12];

    // We intentionally jump by 2 (here and below), as it delineates groups of
    // backward/forward and ALSO produces testably faster code.
    for(int i = 0; i < radius * 2; i += 2) {
        wrefs[i] = _mm_set1_epi16(WRefs[i]);
        wrefs[i + 1] = _mm_set1_epi16(WRefs[i + 1]);
    }

    __m128i src, accum, refs[12];

    for (int y = 0; y < blockHeight; y++) {
        for (int x = 0; x < blockWidth; x += 8) {
            // pDst[x] = (pRefF[x]*WRefF + pSrc[x]*WSrc + pRefB[x]*WRefB +
            //            pRefF2[x]*WRefF2 + pRefB2[x]*WRefB2 + pRefF3[x]*WRefF3 + pRefB3[x]*WRefB3
            //            pRefF4[x]*WRefF4 + pRefB4[x]*WRefB4 + pRefF5[x]*WRefF5 + pRefB5[x]*WRefB5
            //            pRefF6[x]*WRefF6 + pRefB6[x]*WRefB6 + 128)>>8;

            if (blockWidth == 4) {
                src = _mm_cvtsi32_si128(*(const int *)pSrc);
                for(int i = 0; i < radius * 2; i += 2) {
                    refs[i] = _mm_cvtsi32_si128(*(const int *)pRefs[i]);
                    refs[i + 1] = _mm_cvtsi32_si128(*(const int *)pRefs[i + 1]);
                }
            } else {
                src = _mm_loadl_epi64((const __m128i *)(pSrc + x));
                for(int i = 0; i < radius * 2; i += 2) {
                    refs[i] = _mm_loadl_epi64((const __m128i *)(pRefs[i] + x));
                    refs[i + 1] = _mm_loadl_epi64((const __m128i *)(pRefs[i + 1] + x));
                }
            }

            src = _mm_unpacklo_epi8(src, zero);
            src = _mm_mullo_epi16(src, wsrc);

            for(int i = 0; i < radius * 2; i += 2) {
                refs[i] = _mm_unpacklo_epi8(refs[i], zero);
                refs[i + 1] = _mm_unpacklo_epi8(refs[i + 1], zero);

                refs[i] = _mm_mullo_epi16(refs[i], wrefs[i]);
                refs[i + 1] = _mm_mullo_epi16(refs[i + 1], wrefs[i + 1]);
            }

            accum = _mm_set1_epi16(128);
            accum = _mm_add_epi16(accum, src);

            for(int i = 0; i < radius * 2; i += 2) {
                accum = _mm_add_epi16(accum, refs[i]);
                accum = _mm_add_epi16(accum, refs[i + 1]);
            }

            accum = _mm_srli_epi16(accum, 8);
            accum = _mm_packus_epi16(accum, zero);

            if (blockWidth == 4)
                *(int *)pDst = _mm_cvtsi128_si32(accum);
            else
                _mm_storel_epi64((__m128i *)(pDst + x), accum);
        }
        pDst += nDstPitch;
        pSrc += nSrcPitch;
        for(int i = 0; i < radius * 2; i += 2) {
            pRefs[i] += nRefPitches[i];
            pRefs[i + 1] += nRefPitches[i + 1];
        }
    }
}

// Clamps each pDst pixel into [pSrc - nLimit, pSrc + nLimit] using saturating
// byte arithmetic. NOTE(review): loads are aligned (_mm_load_si128), so both
// buffers are presumably 16-byte aligned — confirm at the call sites.
static void LimitChanges_sse2(uint8_t *pDst, intptr_t nDstPitch, const uint8_t *pSrc, intptr_t nSrcPitch, intptr_t nWidth, intptr_t nHeight, intptr_t nLimit) {
    __m128i bytes_limit = _mm_set1_epi8(nLimit);

    for (int y = 0; y < nHeight; y++) {
        for (int x = 0; x < nWidth; x += 16) {
            __m128i m0 = _mm_load_si128((const __m128i *)&pSrc[x]);
            __m128i m1 = _mm_load_si128((const __m128i *)&pDst[x]);

            __m128i lower = _mm_subs_epu8(m0, bytes_limit);
            __m128i upper = _mm_adds_epu8(m0, bytes_limit);

            m0 = _mm_min_epu8(_mm_max_epu8(lower, m1), upper);

            _mm_store_si128((__m128i *)&pDst[x], m0);
        }

        pSrc += nSrcPitch;
        pDst += nDstPitch;
    }
}

#endif // MVTOOLS_X86


typedef void (*LimitFunction)(uint8_t *pDst, intptr_t nDstPitch, const uint8_t *pSrc, intptr_t nSrcPitch, intptr_t nWidth, intptr_t nHeight, intptr_t nLimit);


// Scalar version of LimitChanges: clamp pDst into [pSrc - nLimit, pSrc + nLimit].
template <typename PixelType>
static void LimitChanges_C(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc8, intptr_t nSrcPitch, intptr_t nWidth, intptr_t nHeight, intptr_t nLimit) {
    for (int h = 0; h < nHeight; h++) {
        for (int i = 0; i < nWidth; i++) {
            const PixelType *pSrc = (const PixelType *)pSrc8;
            PixelType *pDst = (PixelType *)pDst8;

            pDst[i] = (PixelType)VSMIN(VSMAX(pDst[i], (pSrc[i] - nLimit)), (pSrc[i] + nLimit));
        }
        pDst8 += nDstPitch;
        pSrc8 += nSrcPitch;
    }
}


// Maps a block's SAD to a blend weight in [0, 256): 0 at/above the threshold,
// approaching 256 as the SAD approaches 0.
static inline int DegrainWeight(int64_t thSAD, int64_t blockSAD) {
    if (blockSAD >= thSAD)
        return 0;

    return int((thSAD - blockSAD) * (thSAD + blockSAD) * 256 / (double)(thSAD * thSAD + blockSAD * blockSAD));
}


// Picks the pixels and weight for one reference: the motion-compensated block
// when the vector is usable, otherwise the co-located source with weight 0.
static inline void useBlock(const uint8_t *&p, int &np, int &WRef, int isUsable, const FakeGroupOfPlanes *fgop, int i, MVPlane * const *pPlane, const uint8_t **pSrcCur, int xx, const int *nSrcPitch, int nLogPel, int plane, int xSubUV, int ySubUV, const int64_t *thSAD) {
    if (isUsable) {
        const FakeBlockData *block = fgopGetBlock(fgop, 0, i);
        int blx = (block->x << nLogPel) + block->vector.x;
        int bly = (block->y << nLogPel) + block->vector.y;
        p = mvpGetPointer(pPlane[plane], plane ? blx >> xSubUV : blx, plane ? bly >> ySubUV : bly);
        np = pPlane[plane]->nPitch;
        int64_t blockSAD = block->vector.sad;
        WRef = DegrainWeight(thSAD[plane], blockSAD);
    } else {
        p = pSrcCur[plane] + xx;
        np = nSrcPitch[plane];
        WRef = 0;
    }
}


// Rescales WRefs so that WSrc plus all WRefs sums to 256, with WSrc taking
// whatever remains after rounding.
// NOTE(review): WSum starts at WSrc + 1, not WSrc — presumably a deliberate
// bias so the scaled reference weights can never quite consume all 256;
// confirm before changing.
template <unsigned radius>
static inline void normaliseWeights(int &WSrc, int *WRefs) {
    // normalise weights to 256
    WSrc = 256;
    int WSum = WSrc + 1;
    for (int r = 0; r < radius * 2; r++)
        WSum += WRefs[r];

    double scale = 256.0 / WSum;

    for (int r = 0; r < radius * 2; r++) {
        WRefs[r] = WRefs[r] * scale;
        WSrc -= WRefs[r];
    }
}


#endif // MVDEGRAINS_H
--------------------------------------------------------------------------------
/src/MVDegrains_AVX2.cpp:
--------------------------------------------------------------------------------
#include <cstdint>
#include <unordered_map>

#include "MVDegrains.h"

enum InstructionSets {
    Scalar,
    SSE2,
    AVX2,
};

// opt can fit in four bits, if the width and height need more than eight bits each.
#define KEY(width, height, bits, opt) (unsigned)(width) << 24 | (height) << 16 | (bits) << 8 | (opt)

#if defined(MVTOOLS_X86)
#define DEGRAIN_AVX2(radius, width, height) \
    { KEY(width, height, 8, AVX2), Degrain_avx2<radius, width, height> },

// All block sizes with an AVX2 implementation for one temporal radius.
#define DEGRAIN_LEVEL_AVX2(radius) \
    {\
        DEGRAIN_AVX2(radius, 8, 2)\
        DEGRAIN_AVX2(radius, 8, 4)\
        DEGRAIN_AVX2(radius, 8, 8)\
        DEGRAIN_AVX2(radius, 8, 16)\
        DEGRAIN_AVX2(radius, 16, 1)\
        DEGRAIN_AVX2(radius, 16, 2)\
        DEGRAIN_AVX2(radius, 16, 4)\
        DEGRAIN_AVX2(radius, 16, 8)\
        DEGRAIN_AVX2(radius, 16, 16)\
        DEGRAIN_AVX2(radius, 16, 32)\
        DEGRAIN_AVX2(radius, 32, 8)\
        DEGRAIN_AVX2(radius, 32, 16)\
        DEGRAIN_AVX2(radius, 32, 32)\
        DEGRAIN_AVX2(radius, 32, 64)\
        DEGRAIN_AVX2(radius, 64, 16)\
        DEGRAIN_AVX2(radius, 64, 32)\
        DEGRAIN_AVX2(radius, 64, 64)\
        DEGRAIN_AVX2(radius, 64, 128)\
        DEGRAIN_AVX2(radius, 128, 32)\
        DEGRAIN_AVX2(radius, 128, 64)\
        DEGRAIN_AVX2(radius, 128, 128)\
    }
#else
#define DEGRAIN_AVX2(radius, width, height)
#define DEGRAIN_LEVEL_AVX2(radius)
#endif


#if defined(MVTOOLS_X86)

#include <immintrin.h>

// XXX Moves the pointers passed in pRefs. This is okay because they are not
// used after this function is done with them.
// AVX2 variant of Degrain_sse2: same rounding (+128, >>8), but 16 pixels per
// iteration. Width-8 blocks process two rows at once (pitchMul == 2) so a
// full 16-lane register is still used.
template <unsigned radius, unsigned blockWidth, unsigned blockHeight>
static void Degrain_avx2(uint8_t *pDst, int nDstPitch, const uint8_t *pSrc, int nSrcPitch, const uint8_t **pRefs, const int *nRefPitches, int WSrc, const int *WRefs) {
    static_assert(blockWidth >= 16 || (blockWidth == 8 && blockHeight >= 2), "");

    __m256i zero = _mm256_setzero_si256();
    __m256i wsrc = _mm256_set1_epi16(WSrc);

    __m256i wrefs[12];
    for(int i = 0; i < radius * 2; i += 2) {
        wrefs[i] = _mm256_set1_epi16(WRefs[i]);
        wrefs[i + 1] = _mm256_set1_epi16(WRefs[i + 1]);
    }
    __m256i src, accum, refs[12];

    int pitchMul = blockWidth == 8 ? 2 : 1;

    for (int y = 0; y < blockHeight; y += pitchMul) {
        for (int x = 0; x < blockWidth; x += 16 / pitchMul) {
            if (blockWidth == 8) {
                // Pack two consecutive rows of 8 pixels into one register.
                src = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(pSrc + x)), _mm_loadl_epi64((const __m128i *)(pSrc + nSrcPitch + x))));
                for(int i = 0; i < radius * 2; i += 2) {
                    refs[i] = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(pRefs[i] + x)), _mm_loadl_epi64((const __m128i *)(pRefs[i] + nRefPitches[i] + x))));
                    refs[i + 1] = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(pRefs[i + 1] + x)), _mm_loadl_epi64((const __m128i *)(pRefs[i + 1] + nRefPitches[i + 1] + x))));
                }
            } else {
                src = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(pSrc + x)));
                for(int i = 0; i < radius * 2; i += 2) {
                    refs[i] = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(pRefs[i] + x)));
                    refs[i + 1] = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(pRefs[i + 1] + x)));
                }
            }

            src = _mm256_mullo_epi16(src, wsrc);
            for(int i = 0; i < radius * 2; i += 2) {
                refs[i] = _mm256_mullo_epi16(refs[i], wrefs[i]);
                refs[i + 1] = _mm256_mullo_epi16(refs[i + 1], wrefs[i + 1]);
            }

            accum = _mm256_set1_epi16(128);
            accum = _mm256_add_epi16(accum, src);

            for(int i = 0; i < radius * 2; i += 2) {
                accum = _mm256_add_epi16(accum, refs[i]);
                accum = _mm256_add_epi16(accum, refs[i + 1]);
            }
            accum = _mm256_srli_epi16(accum, 8);
            accum = _mm256_packus_epi16(accum, zero);

            if (blockWidth == 8) {
                _mm_storel_epi64((__m128i *)(pDst + x), _mm256_castsi256_si128(accum));
                _mm_storel_epi64((__m128i *)(pDst + nDstPitch + x), _mm256_extractf128_si256(accum, 1));
            } else {
                // Gather the packed bytes from both lanes into the low half.
                accum = _mm256_permute4x64_epi64(accum, _MM_SHUFFLE(0, 0, 2, 0));
                _mm_storeu_si128((__m128i *)(pDst + x), _mm256_castsi256_si128(accum));
            }
        }

        pDst += nDstPitch * pitchMul;
        pSrc += nSrcPitch * pitchMul;

        for(int i = 0; i < radius * 2; i += 2) {
            pRefs[i] += nRefPitches[i] * pitchMul;
            pRefs[i + 1] += nRefPitches[i + 1] * pitchMul;
        }
    }
}
#endif

// One dispatch table per temporal radius (1..6).
static const std::unordered_map<uint32_t, DenoiseFunction> degrain_functions[6] = {
    DEGRAIN_LEVEL_AVX2(1),
    DEGRAIN_LEVEL_AVX2(2),
    DEGRAIN_LEVEL_AVX2(3),
    DEGRAIN_LEVEL_AVX2(4),
    DEGRAIN_LEVEL_AVX2(5),
    DEGRAIN_LEVEL_AVX2(6),
};

// Returns the AVX2 degrain kernel for the given configuration, or nullptr
// when no AVX2 implementation exists for it.
DenoiseFunction selectDegrainFunctionAVX2(unsigned radius, unsigned width, unsigned height, unsigned bits) {
    try {
        return degrain_functions[radius - 1].at(KEY(width, height, bits, AVX2));
    } catch (std::out_of_range &) {
        return nullptr;
    }
}
--------------------------------------------------------------------------------
/src/MVFinest.c:
--------------------------------------------------------------------------------
// Pixels flow motion function
// Copyright(c)2005 A.G.Balakhnin aka Fizick

// See legal notice in Copying.txt for more information

// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; version 2 of the License.
9 | // 10 | // This program is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | // GNU General Public License for more details. 14 | // 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 18 | // http://www.gnu.org/copyleft/gpl.html . 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #include "MaskFun.h" 25 | #include "CommonMacros.h" 26 | 27 | 28 | 29 | typedef struct MVFinestData { 30 | VSNode *super; 31 | VSVideoInfo vi; 32 | 33 | int opt; 34 | 35 | int nWidth; 36 | int nHeight; 37 | int nSuperHPad; 38 | int nSuperVPad; 39 | int nSuperPel; 40 | int nSuperModeYUV; 41 | int nSuperLevels; 42 | int nPel; 43 | int xRatioUV; 44 | int yRatioUV; 45 | } MVFinestData; 46 | 47 | 48 | static const VSFrame *VS_CC mvfinestGetFrame(int n, int activationReason, void *instanceData, void **frameData, VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi) { 49 | (void)frameData; 50 | 51 | MVFinestData *d = (MVFinestData *)instanceData; 52 | 53 | if (activationReason == arInitial) { 54 | vsapi->requestFrameFilter(n, d->super, frameCtx); 55 | } else if (activationReason == arAllFramesReady) { 56 | const VSFrame *ref = vsapi->getFrameFilter(n, d->super, frameCtx); 57 | VSFrame *dst = vsapi->newVideoFrame(&d->vi.format, d->vi.width, d->vi.height, ref, core); 58 | 59 | uint8_t *pDst[3]; 60 | const uint8_t *pRef[3]; 61 | int nDstPitches[3], nRefPitches[3]; 62 | 63 | for (int i = 0; i < d->vi.format.numPlanes; i++) { 64 | pDst[i] = vsapi->getWritePtr(dst, i); 65 | pRef[i] = vsapi->getReadPtr(ref, i); 66 | nDstPitches[i] = vsapi->getStride(dst, i); 67 | nRefPitches[i] = vsapi->getStride(ref, i); 68 | } 69 | 70 | int bitsPerSample = d->vi.format.bitsPerSample; 71 | int 
        // Bytes per sample of the super clip; scales the bitblt width from
        // pixels to bytes below.
        bytesPerSample = d->vi.format.bytesPerSample;

        if (d->nPel == 1) { // simply copy top lines
            for (int i = 0; i < d->vi.format.numPlanes; i++)
                vsh_bitblt(pDst[i], nDstPitches[i], pRef[i], nRefPitches[i], d->vi.width * bytesPerSample, d->vi.height);
        } else {
            // Wrap the super frame's planes in an MVGroupOfFrames so the
            // per-subpel plane pointers can be queried via mvpGetAbsolutePointer.
            MVGroupOfFrames pRefGOF = { 0 };
            mvgofInit(&pRefGOF, d->nSuperLevels, d->nWidth, d->nHeight, d->nSuperPel, d->nSuperHPad, d->nSuperVPad, d->nSuperModeYUV, d->opt, d->xRatioUV, d->yRatioUV, bitsPerSample);

            // Attaches the reference frame's plane pointers/pitches; presumably
            // no pixel data is copied here — TODO confirm against mvgofUpdate.
            mvgofUpdate(&pRefGOF, (uint8_t **)pRef, nRefPitches);

            MVPlane **pPlanes = pRefGOF.frames[0]->planes;


            // merge refined planes to big single plane
            for (int i = 0; i < 3; i++) {
                if (pPlanes[i]) {
                    if (d->nPel == 2) {
                        // 2x2 subpel grid -> single output plane; pointers are
                        // passed row-major: (x, y) = (0,0), (1,0), (0,1), (1,1).
                        Merge4PlanesToBig(pDst[i], nDstPitches[i],
                                          mvpGetAbsolutePointer(pPlanes[i], 0, 0),
                                          mvpGetAbsolutePointer(pPlanes[i], 1, 0),
                                          mvpGetAbsolutePointer(pPlanes[i], 0, 1),
                                          mvpGetAbsolutePointer(pPlanes[i], 1, 1),
                                          pPlanes[i]->nPaddedWidth, pPlanes[i]->nPaddedHeight,
                                          pPlanes[i]->nPitch, bitsPerSample);
                    } else if (d->nPel == 4) {
                        // 4x4 subpel grid, row-major: y = 0..3, x = 0..3.
                        Merge16PlanesToBig(pDst[i], nDstPitches[i],
                                           mvpGetAbsolutePointer(pPlanes[i], 0, 0),
                                           mvpGetAbsolutePointer(pPlanes[i], 1, 0),
                                           mvpGetAbsolutePointer(pPlanes[i], 2, 0),
                                           mvpGetAbsolutePointer(pPlanes[i], 3, 0),
                                           mvpGetAbsolutePointer(pPlanes[i], 0, 1),
                                           mvpGetAbsolutePointer(pPlanes[i], 1, 1),
                                           mvpGetAbsolutePointer(pPlanes[i], 2, 1),
                                           mvpGetAbsolutePointer(pPlanes[i], 3, 1),
                                           mvpGetAbsolutePointer(pPlanes[i], 0, 2),
                                           mvpGetAbsolutePointer(pPlanes[i], 1, 2),
                                           mvpGetAbsolutePointer(pPlanes[i], 2, 2),
                                           mvpGetAbsolutePointer(pPlanes[i], 3, 2),
                                           mvpGetAbsolutePointer(pPlanes[i], 0, 3),
                                           mvpGetAbsolutePointer(pPlanes[i], 1, 3),
                                           mvpGetAbsolutePointer(pPlanes[i], 2, 3),
                                           mvpGetAbsolutePointer(pPlanes[i], 3, 3),
                                           pPlanes[i]->nPaddedWidth, pPlanes[i]->nPaddedHeight,
                                           pPlanes[i]->nPitch, bitsPerSample);
                    }
                }
            }

            mvgofDeinit(&pRefGOF);
        }

        vsapi->freeFrame(ref);

        return dst;
    }

    return 0;
}


// Filter teardown: release the super clip node and the instance data.
static void VS_CC mvfinestFree(void *instanceData, VSCore *core, const VSAPI *vsapi) {
    (void)core;

    MVFinestData *d = (MVFinestData *)instanceData;

    vsapi->freeNode(d->super);
    free(d);
}


// mv.Finest entry point: validates the super clip, reads the Super_* frame
// properties stamped on its first frame, and creates a filter whose output
// dimensions are the padded super dimensions multiplied by the subpel factor.
static void VS_CC mvfinestCreate(const VSMap *in, VSMap *out, void *userData, VSCore *core, const VSAPI *vsapi) {
    (void)userData;

    MVFinestData d;
    MVFinestData *data;

    int err;

    // opt defaults to 1 (use optimised code paths) when not supplied.
    d.opt = !!vsapi->mapGetInt(in, "opt", 0, &err);
    if (err)
        d.opt = 1;


    d.super = vsapi->mapGetNode(in, "super", 0, 0);
    d.vi = *vsapi->getVideoInfo(d.super);

    if (!vsh_isConstantVideoFormat(&d.vi) || d.vi.format.bitsPerSample > 16 || d.vi.format.sampleType != stInteger || d.vi.format.subSamplingW > 1 || d.vi.format.subSamplingH > 1 || (d.vi.format.colorFamily != cfYUV && d.vi.format.colorFamily != cfGray)) {
        vsapi->mapSetError(out, "Finest: input clip must be GRAY, 420, 422, 440, or 444, up to 16 bits, with constant dimensions.");
        vsapi->freeNode(d.super);
        return;
    }

    // Fetch frame 0 eagerly ("evil") because the Super_* parameters are only
    // available as frame properties, not clip-level metadata.
#define ERROR_SIZE 1024
    char errorMsg[ERROR_SIZE] = "Finest: failed to retrieve first frame from super clip. Error message: ";
    size_t errorLen = strlen(errorMsg);
    const VSFrame *evil = vsapi->getFrame(0, d.super, errorMsg + errorLen, ERROR_SIZE - errorLen);
#undef ERROR_SIZE
    if (!evil) {
        vsapi->mapSetError(out, errorMsg);
        vsapi->freeNode(d.super);
        return;
    }
    const VSMap *props = vsapi->getFramePropertiesRO(evil);
    int evil_err[6];
    d.nHeight = vsapi->mapGetIntSaturated(props, "Super_height", 0, &evil_err[0]);
    d.nSuperHPad = vsapi->mapGetIntSaturated(props, "Super_hpad", 0, &evil_err[1]);
    d.nSuperVPad = vsapi->mapGetIntSaturated(props, "Super_vpad", 0, &evil_err[2]);
    d.nSuperPel = vsapi->mapGetIntSaturated(props, "Super_pel", 0, &evil_err[3]);
    d.nSuperModeYUV = vsapi->mapGetIntSaturated(props, "Super_modeyuv", 0, &evil_err[4]);
    d.nSuperLevels = vsapi->mapGetIntSaturated(props, "Super_levels", 0, &evil_err[5]);
    vsapi->freeFrame(evil);

    // Any missing property means the clip didn't come from mv.Super (or the
    // stamped first frame was trimmed away).
    for (int i = 0; i < 6; i++)
        if (evil_err[i]) {
            vsapi->mapSetError(out, "Finest: required properties not found in first frame of super clip. Maybe clip didn't come from mv.Super? Was the first frame trimmed away?");
            vsapi->freeNode(d.super);
            return;
        }

    d.nPel = d.nSuperPel;
    int nSuperWidth = d.vi.width;
    // Original (unpadded) frame width recovered from the super width.
    d.nWidth = nSuperWidth - 2 * d.nSuperHPad;

    d.xRatioUV = 1 << d.vi.format.subSamplingW;
    d.yRatioUV = 1 << d.vi.format.subSamplingH;

    // Output frame: padded dimensions scaled by the subpel factor.
    d.vi.width = (d.nWidth + 2 * d.nSuperHPad) * d.nSuperPel;
    d.vi.height = (d.nHeight + 2 * d.nSuperVPad) * d.nSuperPel;


    data = (MVFinestData *)malloc(sizeof(d));
    *data = d;

    VSFilterDependency deps[1] = {
        {data->super, rpStrictSpatial},
    };

    vsapi->createVideoFilter(out, "Finest", &data->vi, mvfinestGetFrame, mvfinestFree, fmParallel, deps, ARRAY_SIZE(deps), data, core);
}


// Registers the Finest filter with the plugin.
void mvfinestRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi) {
    vspapi->registerFunction("Finest",
                             "super:vnode;"
                             "opt:int:opt;",
                             "clip:vnode;",
                             mvfinestCreate, 0, plugin);
}
-------------------------------------------------------------------------------- /src/MVFlowFPSHelper.c: --------------------------------------------------------------------------------

// NOTE(review): the system header names were lost in extraction — restore from VCS.
#include
#include

#include "MaskFun.h"
#include "SimpleResize.h"

#include "MVFlowFPSHelper.h"


// Helper filter for mv.FlowFPS: upsamples the per-block motion vectors of the
// vector clip to full-resolution VX/VY maps and attaches them to the frame as
// binary frame properties (prop_VXFullY etc.), so the consumer doesn't redo it.
const VSFrame *VS_CC mvflowfpshelperGetFrame(int n, int activationReason, void *instanceData, void **frameData, VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi) {
    (void)frameData;

    MVFlowFPSHelperData *d = (MVFlowFPSHelperData *)instanceData;

    if (activationReason == arInitial) {
        vsapi->requestFrameFilter(n, d->vectors, frameCtx);
    } else if (activationReason == arAllFramesReady) {
        const VSFrame *src = vsapi->getFrameFilter(n, d->vectors, frameCtx);

        FakeGroupOfPlanes fgop;

        fgopInit(&fgop, &d->vectors_data);

        const VSMap *mvprops = vsapi->getFramePropertiesRO(src);
        // Deserialize the motion vectors stored in the frame property.
        fgopUpdate(&fgop, (const uint8_t *)vsapi->mapGetData(mvprops, prop_MVTools_vectors, 0, NULL));

        // Vectors count as usable unless a scene change was detected
        // (thresholds thscd1/thscd2).
        int isUsable = fgopIsUsable(&fgop, d->thscd1, d->thscd2);

        if (isUsable) {
            VSFrame *dst = vsapi->copyFrame(src, core);
            vsapi->freeFrame(src);

            VSMap *props = vsapi->getFramePropertiesRW(dst);

            const int xRatioUV = d->vectors_data.xRatioUV;
            const int yRatioUV = d->vectors_data.yRatioUV;
            const int nBlkX = d->vectors_data.nBlkX;
            const int nBlkY = d->vectors_data.nBlkY;
            const int nHeightP = d->nHeightP;
            const int nHeightPUV = d->nHeightPUV;
            const int VPitchY = d->VPitchY;
            const int VPitchUV = d->VPitchUV;
            const int nBlkXP = d->nBlkXP;
            const int nBlkYP = d->nBlkYP;
            SimpleResize *upsizer = &d->upsizer;
            SimpleResize *upsizerUV = &d->upsizerUV;

            int full_size_y = nHeightP * VPitchY * sizeof(int16_t);
            int small_size = nBlkXP * nBlkYP * sizeof(int16_t);

            // NOTE(review): malloc results are not checked anywhere in this
            // function; a failed allocation would crash below.
            int16_t *VXFullY = (int16_t *)malloc(full_size_y);
            int16_t *VYFullY = (int16_t *)malloc(full_size_y);
            int16_t *VXSmallY = (int16_t *)malloc(small_size);
            int16_t *VYSmallY = (int16_t *)malloc(small_size);

            // make vector vx and vy small masks
            MakeVectorSmallMasks(&fgop, nBlkX, nBlkY, VXSmallY, nBlkXP, VYSmallY, nBlkXP);

            // Pad the small masks from nBlkX x nBlkY out to nBlkXP x nBlkYP.
            CheckAndPadSmallY(VXSmallY, VYSmallY, nBlkXP, nBlkYP, nBlkX, nBlkY);

            // Upsample block-level vectors to per-pixel resolution.
            upsizer->simpleResize_int16_t(upsizer, VXFullY, VPitchY, VXSmallY, nBlkXP, 1);
            upsizer->simpleResize_int16_t(upsizer, VYFullY, VPitchY, VYSmallY, nBlkXP, 0);

            // mapSetData copies the buffers, so they can be freed right after.
            vsapi->mapSetData(props, prop_VXFullY, (const char *)VXFullY, full_size_y, dtBinary, maReplace);
            vsapi->mapSetData(props, prop_VYFullY, (const char *)VYFullY, full_size_y, dtBinary, maReplace);

            free(VXFullY);
            free(VYFullY);

            if (d->supervi->format.colorFamily != cfGray) {
                int full_size_uv = nHeightPUV * VPitchUV * sizeof(int16_t);

                int16_t *VXFullUV = (int16_t *)malloc(full_size_uv);
                int16_t *VYFullUV = (int16_t *)malloc(full_size_uv);
                int16_t *VXSmallUV = (int16_t *)malloc(small_size);
                int16_t *VYSmallUV = (int16_t *)malloc(small_size);

                // Scale the luma vectors down by the chroma subsampling ratios.
                VectorSmallMaskYToHalfUV(VXSmallY, nBlkXP, nBlkYP, VXSmallUV, xRatioUV);
                VectorSmallMaskYToHalfUV(VYSmallY, nBlkXP, nBlkYP, VYSmallUV, yRatioUV);

                upsizerUV->simpleResize_int16_t(upsizerUV, VXFullUV, VPitchUV, VXSmallUV, nBlkXP, 1);
                upsizerUV->simpleResize_int16_t(upsizerUV, VYFullUV, VPitchUV, VYSmallUV, nBlkXP, 0);

                free(VXSmallUV);
                free(VYSmallUV);

                vsapi->mapSetData(props, prop_VXFullUV, (const char *)VXFullUV, full_size_uv, dtBinary, maReplace);
                vsapi->mapSetData(props, prop_VYFullUV, (const char *)VYFullUV, full_size_uv, dtBinary, maReplace);

                free(VXFullUV);
                free(VYFullUV);
            }

            free(VXSmallY);
            free(VYSmallY);


            fgopDeinit(&fgop);

            return dst;
        } else { // poor estimation
            // Scene change: pass the vector frame through unchanged.
            fgopDeinit(&fgop);

            return src;
        }
    }

    return NULL;
}


// Helper teardown: release the vector clip node and the instance data.
void VS_CC mvflowfpshelperFree(void *instanceData, VSCore *core, const VSAPI *vsapi) {
    (void)core;

    MVFlowFPSHelperData *d = (MVFlowFPSHelperData *)instanceData;

    vsapi->freeNode(d->vectors);

    free(d);
}
-------------------------------------------------------------------------------- /src/MVFlowFPSHelper.h: --------------------------------------------------------------------------------
#ifndef MVFLOWFPSHELPER_H
#define MVFLOWFPSHELPER_H

#ifdef __cplusplus
extern "C" {
#endif

// NOTE(review): the system header name was lost in extraction — restore from VCS.
#include

#include "MVAnalysisData.h"
#include "SimpleResize.h"

// Instance data shared between MVFlowFPS and its helper filter.
typedef struct MVFlowFPSHelperData {
    VSNode *vectors;             // vector clip produced by mv.Analyse
    const VSVideoInfo *vi;

    const VSVideoInfo *supervi;  // video info of the super clip

    int64_t thscd1;              // scene-change detection thresholds
    int thscd2;

    MVAnalysisData vectors_data;

    int nHeightP;
    int nHeightPUV;
    int VPitchY;
    int VPitchUV;
    int nBlkXP;          // padded block-grid dimensions
    int nBlkYP;

    SimpleResize upsizer;    // block-grid -> full-resolution resizers
    SimpleResize upsizerUV;
} MVFlowFPSHelperData;


// Frame property names under which the upsampled vector maps are stored.
static const char prop_VXFullY[] = "VXFullY";
static const char prop_VYFullY[] = "VYFullY";
static const char prop_VXFullUV[] = "VXFullUV";
static const char prop_VYFullUV[] = "VYFullUV";


const VSFrame *VS_CC mvflowfpshelperGetFrame(int n, int activationReason, void *instanceData, void **frameData, VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi);
void VS_CC mvflowfpshelperFree(void *instanceData, VSCore *core, const VSAPI *vsapi);

#ifdef __cplusplus
} // extern "C"
#endif

#endif
-------------------------------------------------------------------------------- /src/MVFrame.h: --------------------------------------------------------------------------------
#ifndef MVTOOLS_MVFRAME_H
#define MVTOOLS_MVFRAME_H

#ifdef __cplusplus
extern "C" {
#endif


// NOTE(review): the system header name was lost in extraction — restore from VCS.
#include

// Bitmask selecting which planes an operation applies to.
typedef enum MVPlaneSet {
    YPLANE = (1 << 0),
    UPLANE = (1 << 1),
    VPLANE = (1 << 2),
    YUPLANES = YPLANE | UPLANE,
    YVPLANES = YPLANE | VPLANE,
    UVPLANES = UPLANE | VPLANE,
    YUVPLANES = YPLANE | UPLANE | VPLANE
} MVPlaneSet;


// Subpel interpolation type used when refining planes.
typedef enum SharpParam {
    SharpBilinear = 0,
    SharpBicubic = 1,
    SharpWiener = 2
} SharpParam;


// Downscaling filter used when building the pyramid levels.
typedef enum RfilterParam {
    RfilterSimple = 0,
    RfilterTriangle = 1,
    RfilterBilinear = 2,
    RfilterQuadratic = 3,
    RfilterCubic = 4
} RfilterParam;


int PlaneHeightLuma(int src_height, int level, int yRatioUV, int vpad);

int PlaneWidthLuma(int src_width, int level, int xRatioUV, int hpad);

unsigned int PlaneSuperOffset(int chroma, int src_height, int level, int pel, int vpad, int plane_pitch, int yRatioUV);


// One plane of one pyramid level, including padding and subpel sub-planes.
typedef struct MVPlane {
    uint8_t **pPlane;    // one pointer per subpel position (nPel * nPel entries)
    int nWidth;
    int nHeight;
    int nPaddedWidth;
    int nPaddedHeight;
    int nPitch;
    int nHPadding;
    int nVPadding;
    int nOffsetPadding;
    int nHPaddingPel;
    int nVPaddingPel;
    int bitsPerSample;
    int bytesPerSample;

    int nPel;

    int opt;

    // Lazy-evaluation flags so padding/refining/filling happen at most once.
    int isPadded;
    int isRefined;
    int isFilled;
} MVPlane;

void mvpInit(MVPlane *mvp, int nWidth, int nHeight, int nPel, int nHPad, int nVPad, int opt, int bitsPerSample);

void mvpDeinit(MVPlane *mvp);

void mvpResetState(MVPlane *mvp);

void mvpUpdate(MVPlane *mvp, uint8_t *pSrc, int _nPitch);

void mvpFillPlane(MVPlane *mvp, const uint8_t *pNewPlane, int nNewPitch);

void mvpPad(MVPlane *mvp);

void mvpRefine(MVPlane *mvp, int sharp);

void mvpRefineExt(MVPlane *mvp, const uint8_t *pSrc2x, int nSrc2xPitch, int isExtPadded);

void mvpReduceTo(MVPlane *mvp, MVPlane *pReducedPlane, int rfilter);

const uint8_t *mvpGetAbsolutePointer(const MVPlane *mvp, int nX, int nY);

const uint8_t *mvpGetAbsolutePointerPel1(const MVPlane *mvp, int nX, int nY);

const uint8_t *mvpGetAbsolutePointerPel2(const MVPlane *mvp, int nX, int nY);

const uint8_t *mvpGetAbsolutePointerPel4(const MVPlane *mvp, int nX, int nY);

const uint8_t *mvpGetPointer(const MVPlane *mvp, int nX, int nY);

const uint8_t *mvpGetPointerPel1(const MVPlane *mvp, int nX, int nY);

const uint8_t *mvpGetPointerPel2(const MVPlane *mvp, int nX, int nY);

const uint8_t *mvpGetPointerPel4(const MVPlane *mvp, int nX, int nY);

const uint8_t *mvpGetAbsolutePelPointer(const MVPlane *mvp, int nX, int nY);


// Up to three MVPlanes (Y, U, V) at one pyramid level.
typedef struct MVFrame {
    MVPlane *planes[3];

    int nMode;
} MVFrame;


void mvfInit(MVFrame *mvf, int nWidth, int nHeight, int nPel, int nHPad, int nVPad, int nMode, int opt, int xRatioUV, int yRatioUV, int bitsPerSample);

void mvfDeinit(MVFrame *mvf);

void mvfUpdate(MVFrame *mvf, uint8_t **pSrc, int *pitch);

void mvfFillPlane(MVFrame *mvf, const uint8_t *pNewPlane, int nNewPitch, int plane);

void mvfRefine(MVFrame *mvf, MVPlaneSet nMode, int sharp);

void mvfPad(MVFrame *mvf, MVPlaneSet nMode);

void mvfResetState(MVFrame *mvf);

void mvfReduceTo(MVFrame *mvf, MVFrame *pFrame, MVPlaneSet nMode, int rfilter);


// The full pyramid: nLevelCount MVFrames of decreasing resolution.
typedef struct MVGroupOfFrames {
    int nLevelCount;
    MVFrame **frames;

    int nWidth[3];
    int nHeight[3];
    int nPel;
    int nHPad[3];
    int nVPad[3];
    int xRatioUV;
    int yRatioUV;
} MVGroupOfFrames;


void mvgofInit(MVGroupOfFrames *mvgof, int nLevelCount, int nWidth, int nHeight, int nPel, int nHPad, int nVPad, int nMode, int opt, int xRatioUV, int yRatioUV, int bitsPerSample);

void mvgofDeinit(MVGroupOfFrames *mvgof);

void mvgofUpdate(MVGroupOfFrames *mvgof, uint8_t **pSrc, int *pitch);

MVFrame *mvgofGetFrame(MVGroupOfFrames *mvgof, int nLevel);

void mvgofSetPlane(MVGroupOfFrames *mvgof, const uint8_t *pNewSrc, int nNewPitch, int plane);

void mvgofRefine(MVGroupOfFrames *mvgof, MVPlaneSet nMode, int sharp);

void mvgofPad(MVGroupOfFrames *mvgof, MVPlaneSet nMode);

void mvgofReduce(MVGroupOfFrames *mvgof, MVPlaneSet nMode, int rfilter);

void mvgofResetState(MVGroupOfFrames *mvgof);

#ifdef __cplusplus
} // extern "C"
#endif

#endif // MVTOOLS_MVFRAME_H
-------------------------------------------------------------------------------- /src/MVFrame_AVX2.cpp: --------------------------------------------------------------------------------
#if defined(MVTOOLS_X86)

// NOTE(review): the system header names were lost in extraction — restore from VCS.
#include
#include

#define zeroes
_mm256_setzero_si256()

/* TODO: port these
extern "C" void VerticalBicubic_iSSE(uint8_t *pDst, const uint8_t *pSrc, intptr_t nDstPitch,
                                     intptr_t nWidth, intptr_t nHeight);
extern "C" void HorizontalBicubic_iSSE(uint8_t *pDst, const uint8_t *pSrc, intptr_t nDstPitch,
                                       intptr_t nWidth, intptr_t nHeight);
extern "C" void RB2F_iSSE(uint8_t *pDst, const uint8_t *pSrc, intptr_t nDstPitch,
                          intptr_t nSrcPitch, intptr_t nWidth, intptr_t nHeight);
extern "C" void RB2FilteredVerticalLine_SSE(uint8_t *pDst, const uint8_t *pSrc, intptr_t nSrcPitch, intptr_t nWidthMMX);
extern "C" void RB2FilteredHorizontalInplaceLine_SSE(uint8_t *pSrc, intptr_t nWidthMMX);
*/

// pDst = rounded average of pSrc1 and pSrc2, 8-bit pixels, all three sharing
// one pitch. NOTE(review): x advances by 32 regardless of nWidth, so loads and
// stores can run past nWidth — presumably safe because plane pitches include
// padding; confirm against mvpInit's allocation.
void Average2_avx2(uint8_t *pDst, const uint8_t *pSrc1, const uint8_t *pSrc2, intptr_t nPitch, intptr_t nWidth, intptr_t nHeight) {
    for (int y = 0; y < nHeight; y++) {
        for (int x = 0; x < nWidth; x += 32) {
            __m256i m0 = _mm256_loadu_si256((const __m256i *)&pSrc1[x]);
            __m256i m1 = _mm256_loadu_si256((const __m256i *)&pSrc2[x]);

            m0 = _mm256_avg_epu8(m0, m1);
            _mm256_storeu_si256((__m256i *)&pDst[x], m0);
        }

        pSrc1 += nPitch;
        pSrc2 += nPitch;
        pDst += nPitch;
    }
}


// Half-pel vertical bilinear: each output row is the rounded average of the
// source row and the row below; the last row is copied as-is. 8-bit only
// (bitsPerSample is ignored).
void VerticalBilinear_avx2(uint8_t *pDst, const uint8_t *pSrc, intptr_t nPitch,
                           intptr_t nWidth, intptr_t nHeight, intptr_t bitsPerSample) {
    (void)bitsPerSample;

    for (int y = 0; y < nHeight - 1; y++) {
        for (int x = 0; x < nWidth; x += 32) {
            __m256i m0 = _mm256_loadu_si256((const __m256i *)&pSrc[x]);
            __m256i m1 = _mm256_loadu_si256((const __m256i *)&pSrc[x + nPitch]);

            m0 = _mm256_avg_epu8(m0, m1);
            _mm256_storeu_si256((__m256i *)&pDst[x], m0);
        }

        pSrc += nPitch;
        pDst += nPitch;
    }

    // Bottom row has no row below it: copy unchanged.
    for (int x = 0; x < nWidth; x++)
        pDst[x] = pSrc[x];
}


// Half-pel horizontal bilinear: each output pixel is the rounded average of
// the pixel and its right neighbour; the last column is copied as-is.
void HorizontalBilinear_avx2(uint8_t *pDst, const uint8_t *pSrc, intptr_t nPitch,
                             intptr_t nWidth, intptr_t nHeight, intptr_t bitsPerSample) {
    (void)bitsPerSample;

    for (int y = 0; y < nHeight; y++) {
        for (int x = 0; x < nWidth; x += 32) {
            __m256i m0 = _mm256_loadu_si256((const __m256i *)&pSrc[x]);
            __m256i m1 = _mm256_loadu_si256((const __m256i *)&pSrc[x + 1]);

            m0 = _mm256_avg_epu8(m0, m1);
            _mm256_storeu_si256((__m256i *)&pDst[x], m0);
        }

        // Rightmost pixel has no right neighbour: copy unchanged.
        pDst[nWidth - 1] = pSrc[nWidth - 1];

        pSrc += nPitch;
        pDst += nPitch;
    }
}


// Half-pel diagonal bilinear: (a + b + c + d + 2) >> 2 over the 2x2
// neighbourhood, computed in 16-bit to avoid overflow, then packed back to
// 8-bit (packus + permute4x64 compacts lanes 0 and 2 of the 256-bit result).
// Last column uses a vertical 2-tap average, last row a horizontal one.
void DiagonalBilinear_avx2(uint8_t *pDst, const uint8_t *pSrc, intptr_t nPitch,
                           intptr_t nWidth, intptr_t nHeight, intptr_t bitsPerSample) {
    (void)bitsPerSample;

    for (int y = 0; y < nHeight - 1; y++) {
        for (int x = 0; x < nWidth; x += 16) {
            __m256i m0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x]));
            __m256i m1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x + 1]));
            __m256i m2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x + nPitch]));
            __m256i m3 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x + nPitch + 1]));

            m0 = _mm256_add_epi16(m0, m1);
            m2 = _mm256_add_epi16(m2, m3);
            m0 = _mm256_add_epi16(m0, _mm256_set1_epi16(2)); // rounding bias
            m0 = _mm256_add_epi16(m0, m2);

            m0 = _mm256_srli_epi16(m0, 2);

            m0 = _mm256_packus_epi16(m0, m0);
            m0 = _mm256_permute4x64_epi64(m0, _MM_SHUFFLE(0, 0, 2, 0));
            _mm_storeu_si128((__m128i *)&pDst[x], _mm256_castsi256_si128(m0));
        }

        pDst[nWidth - 1] = (pSrc[nWidth - 1] + pSrc[nWidth - 1 + nPitch] + 1) >> 1;

        pSrc += nPitch;
        pDst += nPitch;
    }

    // Bottom row: only the horizontal neighbour is available.
    for (int x = 0; x < nWidth; x += 32) {
        __m256i m0 = _mm256_loadu_si256((const __m256i *)&pSrc[x]);
        __m256i m1 = _mm256_loadu_si256((const __m256i *)&pSrc[x + 1]);

        m0 = _mm256_avg_epu8(m0, m1);
        _mm256_storeu_si256((__m256i *)&pDst[x], m0);
    }

    pDst[nWidth - 1] = pSrc[nWidth - 1];
}

// Half-pel vertical 6-tap Wiener filter: rows [2, nHeight-4) use the kernel
// (1, -5, 20, 20, -5, 1) / 32 (the arithmetic below computes
// 5*(4*(c0+c1) - (p1+n1)) + (p2+n2) + 16, then >> 5); the top two and bottom
// three rows fall back to a 2-tap average, and the final row is copied.
void VerticalWiener_avx2(uint8_t *pDst, const uint8_t *pSrc, intptr_t nPitch,
                         intptr_t nWidth, intptr_t nHeight, intptr_t bitsPerSample) {
    (void)bitsPerSample;

    for (int y = 0; y < 2; y++) {
        for (int x = 0; x < nWidth; x += 32) {
            __m256i m0 = _mm256_loadu_si256((const __m256i *)&pSrc[x]);
            __m256i m1 = _mm256_loadu_si256((const __m256i *)&pSrc[x + nPitch]);

            m0 = _mm256_avg_epu8(m0, m1);
            _mm256_storeu_si256((__m256i *)&pDst[x], m0);
        }

        pSrc += nPitch;
        pDst += nPitch;
    }

    for (int y = 2; y < nHeight - 4; y++) {
        for (int x = 0; x < nWidth; x += 16) {
            __m256i m0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x - nPitch * 2]));
            __m256i m1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x - nPitch]));
            __m256i m2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x]));
            __m256i m3 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x + nPitch]));
            __m256i m4 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x + nPitch * 2]));
            __m256i m5 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x + nPitch * 3]));

            m2 = _mm256_add_epi16(m2, m3);
            m2 = _mm256_slli_epi16(m2, 2);      // 4 * (c0 + c1)

            m1 = _mm256_add_epi16(m1, m4);      // p1 + n1

            m2 = _mm256_sub_epi16(m2, m1);
            m3 = _mm256_slli_epi16(m2, 2);
            m2 = _mm256_add_epi16(m2, m3);      // 5 * (4*(c0+c1) - (p1+n1))

            m0 = _mm256_add_epi16(m0, m5);      // p2 + n2
            m0 = _mm256_add_epi16(m0, m2);
            m0 = _mm256_add_epi16(m0, _mm256_set1_epi16(16)); // rounding bias

            m0 = _mm256_srai_epi16(m0, 5);
            m0 = _mm256_packus_epi16(m0, m0);
            m0 = _mm256_permute4x64_epi64(m0, _MM_SHUFFLE(0, 0, 2, 0));
            _mm_storeu_si128((__m128i *)&pDst[x], _mm256_castsi256_si128(m0));
        }

        pSrc += nPitch;
        pDst += nPitch;
    }

    for (int y = nHeight - 4; y < nHeight - 1; y++) {
        for (int x = 0; x < nWidth; x += 32) {
            __m256i m0 = _mm256_loadu_si256((const __m256i *)&pSrc[x]);
            __m256i m1 = _mm256_loadu_si256((const __m256i *)&pSrc[x + nPitch]);

            m0 = _mm256_avg_epu8(m0, m1);
            _mm256_storeu_si256((__m256i *)&pDst[x], m0);
        }

        pSrc += nPitch;
        pDst += nPitch;
    }

    // Bottom row copied unchanged.
    for (int x = 0; x < nWidth; x++)
        pDst[x] = pSrc[x];
}


// Half-pel horizontal 6-tap Wiener filter: same (1, -5, 20, 20, -5, 1) / 32
// kernel as VerticalWiener_avx2 but along x; the first two and last few
// columns fall back to a 2-tap average, and the last column is copied.
void HorizontalWiener_avx2(uint8_t *pDst, const uint8_t *pSrc, intptr_t nPitch,
                           intptr_t nWidth, intptr_t nHeight, intptr_t bitsPerSample) {
    (void)bitsPerSample;

    for (int y = 0; y < nHeight; y++) {
        pDst[0] = (pSrc[0] + pSrc[1] + 1) >> 1;
        pDst[1] = (pSrc[1] + pSrc[2] + 1) >> 1;

        for (int x = 2; x < nWidth - 4; x += 16) {
            __m256i m0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x - 2]));
            __m256i m1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x - 1]));
            __m256i m2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x]));
            __m256i m3 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x + 1]));
            __m256i m4 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x + 2]));
            __m256i m5 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x + 3]));

            m2 = _mm256_add_epi16(m2, m3);
            m2 = _mm256_slli_epi16(m2, 2);

            m1 = _mm256_add_epi16(m1, m4);

            m2 = _mm256_sub_epi16(m2, m1);
            m3 = _mm256_slli_epi16(m2, 2);
            m2 = _mm256_add_epi16(m2, m3);

            m0 = _mm256_add_epi16(m0, m5);
            m0 = _mm256_add_epi16(m0, m2);
            m0 = _mm256_add_epi16(m0, _mm256_set1_epi16(16));

            m0 = _mm256_srai_epi16(m0, 5);
            m0 = _mm256_packus_epi16(m0, m0);
            m0 = _mm256_permute4x64_epi64(m0, _MM_SHUFFLE(0, 0, 2, 0));
            _mm_storeu_si128((__m128i *)&pDst[x], _mm256_castsi256_si128(m0));
        }

        // NOTE(review): the vector loop may already have covered some of
        // these columns (it steps by 16 from x = 2), in which case this
        // 2-tap pass overwrites them — matches the original behavior.
        for (int x = nWidth - 4; x < nWidth - 1; x++)
            pDst[x] = (pSrc[x] + pSrc[x + 1] + 1) >> 1;

        pDst[nWidth - 1] = pSrc[nWidth - 1];

        pDst += nPitch;
        pSrc += nPitch;
    }
}

#endif // MVTOOLS_X86
-------------------------------------------------------------------------------- /src/MVSCDetection.c: --------------------------------------------------------------------------------
// Author: Manao
// Copyright(c)2006 A.G.Balakhnin aka Fizick - YUY2
// See legal notice in Copying.txt for more information
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
// http://www.gnu.org/copyleft/gpl.html .
19 | 20 | #include 21 | #include 22 | 23 | #include "Fakery.h" 24 | #include "MVAnalysisData.h" 25 | #include "CommonMacros.h" 26 | 27 | 28 | 29 | 30 | typedef struct MVSCDetectionData { 31 | VSNode *node; 32 | const VSVideoInfo *vi; 33 | 34 | VSNode *vectors; 35 | 36 | int64_t thscd1; 37 | int thscd2; 38 | 39 | MVAnalysisData vectors_data; 40 | } MVSCDetectionData; 41 | 42 | 43 | static const VSFrame *VS_CC mvscdetectionGetFrame(int n, int activationReason, void *instanceData, void **frameData, VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi) { 44 | (void)frameData; 45 | 46 | MVSCDetectionData *d = (MVSCDetectionData *)instanceData; 47 | 48 | if (activationReason == arInitial) { 49 | vsapi->requestFrameFilter(n, d->vectors, frameCtx); 50 | vsapi->requestFrameFilter(n, d->node, frameCtx); 51 | } else if (activationReason == arAllFramesReady) { 52 | const VSFrame *src = vsapi->getFrameFilter(n, d->node, frameCtx); 53 | VSFrame *dst = vsapi->copyFrame(src, core); 54 | vsapi->freeFrame(src); 55 | 56 | const VSFrame *mvn = vsapi->getFrameFilter(n, d->vectors, frameCtx); 57 | FakeGroupOfPlanes fgop; 58 | fgopInit(&fgop, &d->vectors_data); 59 | const VSMap *mvprops = vsapi->getFramePropertiesRO(mvn); 60 | fgopUpdate(&fgop, (const uint8_t *)vsapi->mapGetData(mvprops, prop_MVTools_vectors, 0, NULL)); 61 | vsapi->freeFrame(mvn); 62 | 63 | const char *propNames[2] = { "_SceneChangePrev", "_SceneChangeNext" }; 64 | VSMap *props = vsapi->getFramePropertiesRW(dst); 65 | vsapi->mapSetInt(props, propNames[!!d->vectors_data.isBackward], !fgopIsUsable(&fgop, d->thscd1, d->thscd2), maReplace); 66 | 67 | fgopDeinit(&fgop); 68 | 69 | return dst; 70 | } 71 | 72 | return NULL; 73 | } 74 | 75 | 76 | static void VS_CC mvscdetectionFree(void *instanceData, VSCore *core, const VSAPI *vsapi) { 77 | (void)core; 78 | 79 | MVSCDetectionData *d = (MVSCDetectionData *)instanceData; 80 | 81 | vsapi->freeNode(d->node); 82 | vsapi->freeNode(d->vectors); 83 | free(d); 84 | } 85 | 86 | 87 | 
static void VS_CC mvscdetectionCreate(const VSMap *in, VSMap *out, void *userData, VSCore *core, const VSAPI *vsapi) { 88 | (void)userData; 89 | 90 | MVSCDetectionData d; 91 | MVSCDetectionData *data; 92 | 93 | int err; 94 | 95 | d.thscd1 = vsapi->mapGetInt(in, "thscd1", 0, &err); 96 | if (err) 97 | d.thscd1 = MV_DEFAULT_SCD1; 98 | 99 | d.thscd2 = vsapi->mapGetIntSaturated(in, "thscd2", 0, &err); 100 | if (err) 101 | d.thscd2 = MV_DEFAULT_SCD2; 102 | 103 | 104 | d.vectors = vsapi->mapGetNode(in, "vectors", 0, NULL); 105 | 106 | 107 | #define ERROR_SIZE 512 108 | char error[ERROR_SIZE + 1] = { 0 }; 109 | const char *filter_name = "SCDetection"; 110 | 111 | adataFromVectorClip(&d.vectors_data, d.vectors, filter_name, "vectors", vsapi, error, ERROR_SIZE); 112 | 113 | scaleThSCD(&d.thscd1, &d.thscd2, &d.vectors_data, filter_name, error, ERROR_SIZE); 114 | #undef ERROR_SIZE 115 | 116 | if (error[0]) { 117 | vsapi->mapSetError(out, error); 118 | 119 | vsapi->freeNode(d.vectors); 120 | return; 121 | } 122 | 123 | 124 | d.node = vsapi->mapGetNode(in, "clip", 0, NULL); 125 | d.vi = vsapi->getVideoInfo(d.node); 126 | 127 | 128 | data = (MVSCDetectionData *)malloc(sizeof(d)); 129 | *data = d; 130 | 131 | VSFilterDependency deps[2] = { 132 | {data->node, rpStrictSpatial}, 133 | {data->vectors, rpStrictSpatial}, 134 | }; 135 | vsapi->createVideoFilter(out, "SCDetection", data->vi, mvscdetectionGetFrame, mvscdetectionFree, fmParallel, deps, ARRAY_SIZE(deps), data, core); 136 | } 137 | 138 | 139 | void mvscdetectionRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi) { 140 | vspapi->registerFunction("SCDetection", 141 | "clip:vnode;" 142 | "vectors:vnode;" 143 | "thscd1:int:opt;" 144 | "thscd2:int:opt;", 145 | "clip:vnode;", 146 | mvscdetectionCreate, 0, plugin); 147 | } 148 | -------------------------------------------------------------------------------- /src/MVSuper.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 
#include

#include
#include

#include "MVFrame.h"
#include "CommonMacros.h"



// Instance data for mv.Super.
typedef struct MVSuperData {
    VSNode *node;
    VSVideoInfo vi;

    VSNode *pelclip; // upsized source clip with doubled frame width and heigth (used for pel=2)

    int nHPad;       // horizontal/vertical padding around each level
    int nVPad;
    int nPel;        // subpel accuracy: 1, 2 or 4
    int nLevels;     // number of pyramid levels
    int sharp;       // subpel interpolation type (SharpParam)
    int rfilter; // frame reduce filter mode
    int opt;

    int nWidth;      // original clip dimensions
    int nHeight;

    int yRatioUV;    // chroma subsampling ratios
    int xRatioUV;
    int chroma;
    int usePelClip;
    int nSuperWidth;  // output (super) frame dimensions
    int nSuperHeight;

    MVPlaneSet nModeYUV;

    int isPelClipPadded;
} MVSuperData;


// Builds one super frame: fills level 0 from the source frame, reduces it
// into the coarser pyramid levels, pads every level, and refines the subpel
// planes (either computed with the configured `sharp` filter or taken from
// the user-supplied pelclip). All levels live packed inside one tall frame.
static const VSFrame *VS_CC mvsuperGetFrame(int n, int activationReason, void *instanceData, void **frameData, VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi) {
    (void)frameData;

    MVSuperData *d = (MVSuperData *)instanceData;

    if (activationReason == arInitial) {
        vsapi->requestFrameFilter(n, d->node, frameCtx);
        if (d->usePelClip)
            vsapi->requestFrameFilter(n, d->pelclip, frameCtx);
    } else if (activationReason == arAllFramesReady) {
        const VSFrame *src = vsapi->getFrameFilter(n, d->node, frameCtx);

        const uint8_t *pSrc[3] = { NULL };
        uint8_t *pDst[3] = { NULL };
        const uint8_t *pSrcPel[3] = { NULL };
        int nSrcPitch[3] = { 0 };
        int nDstPitch[3] = { 0 };
        int nSrcPelPitch[3] = { 0 };

        const VSFrame *srcPel = NULL;
        if (d->usePelClip)
            srcPel = vsapi->getFrameFilter(n, d->pelclip, frameCtx);

        VSFrame *dst = vsapi->newVideoFrame(&d->vi.format, d->vi.width, d->vi.height, src, core);

        for (int plane = 0; plane < d->vi.format.numPlanes; plane++) {
            pSrc[plane] = vsapi->getReadPtr(src, plane);
            nSrcPitch[plane] = vsapi->getStride(src, plane);

            pDst[plane] = vsapi->getWritePtr(dst, plane);
            nDstPitch[plane] = vsapi->getStride(dst, plane);

            // Zero the whole destination first; not every byte of the super
            // frame is written by the fill/reduce/pad steps below.
            memset(pDst[plane], 0, nDstPitch[plane] * vsapi->getFrameHeight(dst, plane));
        }

        // The GOF points directly into the destination frame's memory.
        MVGroupOfFrames pSrcGOF;
        mvgofInit(&pSrcGOF, d->nLevels, d->nWidth, d->nHeight, d->nPel, d->nHPad, d->nVPad, d->nModeYUV, d->opt, d->xRatioUV, d->yRatioUV, d->vi.format.bitsPerSample);

        mvgofUpdate(&pSrcGOF, pDst, nDstPitch);

        MVPlaneSet planes[3] = { YPLANE, UPLANE, VPLANE };

        for (int plane = 0; plane < d->vi.format.numPlanes; plane++)
            mvfFillPlane(pSrcGOF.frames[0], pSrc[plane], nSrcPitch[plane], plane);

        mvgofReduce(&pSrcGOF, d->nModeYUV, d->rfilter);
        mvgofPad(&pSrcGOF, d->nModeYUV);

        if (d->usePelClip) {
            // Take the subpel planes from the user-supplied pelclip instead
            // of interpolating them.
            MVFrame *srcFrames = pSrcGOF.frames[0];

            for (int plane = 0; plane < d->vi.format.numPlanes; plane++) {
                pSrcPel[plane] = vsapi->getReadPtr(srcPel, plane);
                nSrcPelPitch[plane] = vsapi->getStride(srcPel, plane);

                MVPlane *srcPlane = srcFrames->planes[plane];
                if (d->nModeYUV & planes[plane])
                    mvpRefineExt(srcPlane, pSrcPel[plane], nSrcPelPitch[plane], d->isPelClipPadded);
            }
        } else
            mvgofRefine(&pSrcGOF, d->nModeYUV, d->sharp);

        vsapi->freeFrame(src);
        if (d->usePelClip)
            vsapi->freeFrame(srcPel);

        mvgofDeinit(&pSrcGOF);

        // Stamp the Super_* parameters on frame 0 only; downstream filters
        // (e.g. mv.Finest, mv.Analyse) read them from there.
        if (n == 0) {
            VSMap *props = vsapi->getFramePropertiesRW(dst);

            vsapi->mapSetInt(props, "Super_height", d->nHeight, maReplace);
            vsapi->mapSetInt(props, "Super_hpad", d->nHPad, maReplace);
            vsapi->mapSetInt(props, "Super_vpad", d->nVPad, maReplace);
            vsapi->mapSetInt(props, "Super_pel", d->nPel, maReplace);
            vsapi->mapSetInt(props, "Super_modeyuv", d->nModeYUV, maReplace);
            vsapi->mapSetInt(props, "Super_levels", d->nLevels, maReplace);
        }

        return dst;
    }

    return 0;
}


// Filter teardown: release both nodes and the instance data.
static void VS_CC mvsuperFree(void *instanceData, VSCore *core, const VSAPI *vsapi) {
    (void)core;

    MVSuperData
/* Free callback for the Super filter: releases the clip nodes acquired in
 * mvsuperCreate and the instance data itself. */
static void VS_CC mvsuperFree(void *instanceData, VSCore *core, const VSAPI *vsapi) {
    (void)core;

    MVSuperData *d = (MVSuperData *)instanceData;

    vsapi->freeNode(d->node);
    vsapi->freeNode(d->pelclip); // may be NULL when no pelclip was given -- presumably freeNode(NULL) is a no-op; TODO confirm against the VS API
    free(d);
}


/* Creation callback for the Super filter.
 * Reads the user parameters (with defaults), validates them, computes the
 * dimensions of the "super" frame (padded, possibly pel-refined, with all
 * pyramid levels stacked vertically), and registers the video filter. */
static void VS_CC mvsuperCreate(const VSMap *in, VSMap *out, void *userData, VSCore *core, const VSAPI *vsapi) {
    (void)userData;

    MVSuperData d;
    MVSuperData *data;

    int err;

    /* Optional parameters with their defaults. */
    d.nHPad = vsapi->mapGetIntSaturated(in, "hpad", 0, &err);
    if (err)
        d.nHPad = 16;

    d.nVPad = vsapi->mapGetIntSaturated(in, "vpad", 0, &err);
    if (err)
        d.nVPad = 16;

    d.nPel = vsapi->mapGetIntSaturated(in, "pel", 0, &err);
    if (err)
        d.nPel = 2;

    /* On error this is 0, which means "use the maximum" (see nLevelsMax below). */
    d.nLevels = vsapi->mapGetIntSaturated(in, "levels", 0, &err);

    d.chroma = !!vsapi->mapGetInt(in, "chroma", 0, &err);
    if (err)
        d.chroma = 1;

    d.sharp = vsapi->mapGetIntSaturated(in, "sharp", 0, &err); // pel2 interpolation type
    if (err)
        d.sharp = SharpWiener;

    d.rfilter = vsapi->mapGetIntSaturated(in, "rfilter", 0, &err);
    if (err)
        d.rfilter = RfilterBilinear;

    d.opt = !!vsapi->mapGetInt(in, "opt", 0, &err);
    if (err)
        d.opt = 1;


    /* Parameter validation. */
    if ((d.nPel != 1) && (d.nPel != 2) && (d.nPel != 4)) {
        vsapi->mapSetError(out, "Super: pel must be 1, 2, or 4.");
        return;
    }

    if (d.sharp < SharpBilinear || d.sharp > SharpWiener) {
        vsapi->mapSetError(out, "Super: sharp must be between 0 and 2 (inclusive).");
        return;
    }

    if (d.rfilter < RfilterSimple || d.rfilter > RfilterCubic) {
        vsapi->mapSetError(out, "Super: rfilter must be between 0 and 4 (inclusive).");
        return;
    }


    d.node = vsapi->mapGetNode(in, "clip", 0, 0);

    // Make a copy of the video info, so we can reference
    // it and modify it below.
    d.vi = *vsapi->getVideoInfo(d.node);

    d.nWidth = d.vi.width;
    d.nHeight = d.vi.height;

    /* Input format validation: integer samples up to 16 bits, GRAY or YUV
     * with at most 2x subsampling in each direction, constant format/size. */
    if (!vsh_isConstantVideoFormat(&d.vi) || d.vi.format.bitsPerSample > 16 || d.vi.format.sampleType != stInteger ||
        d.vi.format.subSamplingW > 1 || d.vi.format.subSamplingH > 1 || (d.vi.format.colorFamily != cfYUV && d.vi.format.colorFamily != cfGray)) {
        vsapi->mapSetError(out, "Super: input clip must be GRAY, 420, 422, 440, or 444, up to 16 bits, with constant dimensions.");
        vsapi->freeNode(d.node);
        return;
    }

    if (d.vi.format.colorFamily == cfGray)
        d.chroma = 0;

    d.nModeYUV = d.chroma ? YUVPLANES : YPLANE;


    d.xRatioUV = 1 << d.vi.format.subSamplingW;
    d.yRatioUV = 1 << d.vi.format.subSamplingH;

    /* Count how many pyramid levels fit before a plane becomes too small. */
    int nLevelsMax = 0;
    while (PlaneHeightLuma(d.vi.height, nLevelsMax, d.yRatioUV, d.nVPad) >= d.yRatioUV * 2 &&
           PlaneWidthLuma(d.vi.width, nLevelsMax, d.xRatioUV, d.nHPad) >= d.xRatioUV * 2) // at least two pixels width and height of chroma
    {
        nLevelsMax++;
    }
    if (d.nLevels <= 0 || d.nLevels > nLevelsMax)
        d.nLevels = nLevelsMax;

    d.pelclip = vsapi->mapGetNode(in, "pelclip", 0, &err);
    const VSVideoInfo *pelvi = d.pelclip ? vsapi->getVideoInfo(d.pelclip) : NULL;

    if (d.pelclip && (!vsh_isConstantVideoFormat(pelvi) || !vsh_isSameVideoFormat(&pelvi->format, &d.vi.format))) {
        vsapi->mapSetError(out, "Super: pelclip must have the same format as the input clip, and it must have constant dimensions.");
        vsapi->freeNode(d.node);
        vsapi->freeNode(d.pelclip);
        return;
    }

    /* pelclip is accepted either at nPel times the clip's size (unpadded)
     * or at nPel times the padded size. */
    d.usePelClip = 0;
    if (d.pelclip && (d.nPel >= 2)) {
        if ((pelvi->width == d.vi.width * d.nPel) &&
            (pelvi->height == d.vi.height * d.nPel)) {
            d.usePelClip = 1;
            d.isPelClipPadded = 0;
        } else if ((pelvi->width == (d.vi.width + d.nHPad * 2) * d.nPel) &&
                   (pelvi->height == (d.vi.height + d.nVPad * 2) * d.nPel)) {
            d.usePelClip = 1;
            d.isPelClipPadded = 1;
        } else {
            vsapi->mapSetError(out, "Super: pelclip's dimensions must be multiples of the input clip's dimensions.");
            vsapi->freeNode(d.pelclip);
            vsapi->freeNode(d.node);
            return;
        }
    }

    /* The super frame is as wide as the padded source; its height covers all
     * levels stacked (PlaneSuperOffset past the last level / width). */
    d.nSuperWidth = d.nWidth + 2 * d.nHPad;
    d.nSuperHeight = PlaneSuperOffset(0, d.nHeight, d.nLevels, d.nPel, d.nVPad, d.nSuperWidth, d.yRatioUV) / d.nSuperWidth;
    if (d.yRatioUV == 2 && d.nSuperHeight & 1)
        d.nSuperHeight++; // even
    if (d.xRatioUV == 2 && d.nSuperWidth & 1)
        d.nSuperWidth++;
    d.vi.width = d.nSuperWidth;
    d.vi.height = d.nSuperHeight;


    data = (MVSuperData *)malloc(sizeof(d));
    *data = d;

    VSFilterDependency deps[1] = {
        {data->node, rpStrictSpatial}
    };

    vsapi->createVideoFilter(out, "Super", &data->vi, mvsuperGetFrame, mvsuperFree, fmParallel, deps, ARRAY_SIZE(deps), data, core);
}


/* Registers the Super filter with the plugin. */
void mvsuperRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi) {
    vspapi->registerFunction("Super",
                             "clip:vnode;"
                             "hpad:int:opt;"
                             "vpad:int:opt;"
                             "pel:int:opt;"
                             "levels:int:opt;"
                             "chroma:int:opt;"
                             "sharp:int:opt;"
                             "rfilter:int:opt;"
                             "pelclip:vnode:opt;"
                             "opt:int:opt;",
                             "clip:vnode;",
                             mvsuperCreate, 0, plugin);
}
// Create an overlay mask with the motion vectors

// See legal notice in Copying.txt for more information

// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
// http://www.gnu.org/copyleft/gpl.html .
19 | 20 | #ifndef MASKFUN_H 21 | #define MASKFUN_H 22 | 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #endif 26 | 27 | #include 28 | 29 | #include "Fakery.h" 30 | #include "MVFrame.h" 31 | 32 | void CheckAndPadSmallY(int16_t *VXSmallY, int16_t *VYSmallY, int nBlkXP, int nBlkYP, int nBlkX, int nBlkY); 33 | 34 | void CheckAndPadMaskSmall(uint8_t *MaskSmall, int nBlkXP, int nBlkYP, int nBlkX, int nBlkY); 35 | 36 | void MakeVectorOcclusionMaskTime(const FakeGroupOfPlanes *fgop, int isBackward, int nBlkX, int nBlkY, double dMaskNormDivider, double fGamma, int nPel, uint8_t *occMask, int occMaskPitch, int time256, int nBlkStepX, int nBlkStepY); 37 | 38 | void MakeSADMaskTime(const FakeGroupOfPlanes *fgop, int nBlkX, int nBlkY, double dSADNormFactor, double fGamma, int nPel, uint8_t *Mask, int MaskPitch, int time256, int nBlkStepX, int nBlkStepY, int bitsPerSample); 39 | 40 | void MakeVectorSmallMasks(const FakeGroupOfPlanes *fgop, int nX, int nY, int16_t *VXSmallY, int pitchVXSmallY, int16_t *VYSmallY, int pitchVYSmallY); 41 | void VectorSmallMaskYToHalfUV(int16_t *VSmallY, int nBlkX, int nBlkY, int16_t *VSmallUV, int ratioUV); 42 | 43 | void Merge4PlanesToBig(uint8_t *pel2Plane, int pel2Pitch, const uint8_t *pPlane0, const uint8_t *pPlane1, 44 | const uint8_t *pPlane2, const uint8_t *pPlane3, int width, int height, int pitch, int bitsPerSample); 45 | 46 | void Merge16PlanesToBig(uint8_t *pel4Plane, int pel4Pitch, 47 | const uint8_t *pPlane0, const uint8_t *pPlane1, const uint8_t *pPlane2, const uint8_t *pPlane3, 48 | const uint8_t *pPlane4, const uint8_t *pPlane5, const uint8_t *pPlane6, const uint8_t *pPlane7, 49 | const uint8_t *pPlane8, const uint8_t *pPlane9, const uint8_t *pPlane10, const uint8_t *pPlane11, 50 | const uint8_t *pPlane12, const uint8_t *pPlane13, const uint8_t *pPlane14, const uint8_t *pPlane15, 51 | int width, int height, int pitch, int bitsPerSample); 52 | 53 | uint8_t SADToMask(unsigned int sad, unsigned int sadnorm1024); 54 | 55 | void 
Blend(uint8_t *pdst, const uint8_t *psrc, const uint8_t *pref, int height, int width, int dst_pitch, int src_pitch, int ref_pitch, int time256, int bitsPerSample); 56 | 57 | 58 | typedef void (*FlowInterSimpleFunction)( 59 | uint8_t *pdst, int dst_pitch, 60 | const uint8_t *prefB, const uint8_t *prefF, int ref_pitch, 61 | const int16_t *VXFullB, const int16_t *VXFullF, 62 | const int16_t *VYFullB, const int16_t *VYFullF, 63 | const uint8_t *MaskB, const uint8_t *MaskF, int VPitch, 64 | int width, int height, 65 | int time256, int nPel); 66 | 67 | typedef void (*FlowInterFunction)( 68 | uint8_t *pdst, int dst_pitch, 69 | const uint8_t *prefB, const uint8_t *prefF, int ref_pitch, 70 | const int16_t *VXFullB, const int16_t *VXFullF, 71 | const int16_t *VYFullB, const int16_t *VYFullF, 72 | const uint8_t *MaskB, const uint8_t *MaskF, int VPitch, 73 | int width, int height, 74 | int time256, int nPel); 75 | 76 | typedef void (*FlowInterExtraFunction)( 77 | uint8_t *pdst, int dst_pitch, 78 | const uint8_t *prefB, const uint8_t *prefF, int ref_pitch, 79 | const int16_t *VXFullB, const int16_t *VXFullF, 80 | const int16_t *VYFullB, const int16_t *VYFullF, 81 | const uint8_t *MaskB, const uint8_t *MaskF, int VPitch, 82 | int width, int height, 83 | int time256, int nPel, 84 | const int16_t *VXFullBB, const int16_t *VXFullFF, 85 | const int16_t *VYFullBB, const int16_t *VYFullFF); 86 | 87 | void selectFlowInterFunctions(FlowInterSimpleFunction *simple, FlowInterFunction *regular, FlowInterExtraFunction *extra, int bitsPerSample, int opt); 88 | 89 | #ifdef __cplusplus 90 | } // extern "C" 91 | #endif 92 | 93 | #endif 94 | -------------------------------------------------------------------------------- /src/Overlap.cpp: -------------------------------------------------------------------------------- 1 | // Overlap copy (really addition) 2 | // Copyright(c)2006 A.G.Balakhnin aka Fizick 3 | 4 | // This program is free software; you can redistribute it and/or modify 5 | // it 
under the terms of the GNU General Public License as published by 6 | // the Free Software Foundation; either version 2 of the License, or 7 | // (at your option) any later version. 8 | // 9 | // This program is distributed in the hope that it will be useful, 10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | // GNU General Public License for more details. 13 | // 14 | // You should have received a copy of the GNU General Public License 15 | // along with this program; if not, write to the Free Software 16 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 17 | // http://www.gnu.org/copyleft/gpl.html . 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "CPU.h" 25 | #include "Overlap.h" 26 | 27 | #ifndef M_PI 28 | #define M_PI 3.14159265358979323846f 29 | #endif 30 | 31 | #ifndef min 32 | #define min(a, b) (((a) < (b)) ? (a) : (b)) 33 | #endif 34 | 35 | #ifndef max 36 | #define max(a, b) (((a) < (b)) ? 
(b) : (a)) 37 | #endif 38 | 39 | 40 | void overInit(OverlapWindows *over, int nx, int ny, int ox, int oy) { 41 | over->nx = nx; 42 | over->ny = ny; 43 | over->ox = ox; 44 | over->oy = oy; 45 | over->size = nx * ny; 46 | 47 | // windows 48 | over->fWin1UVx = (float *)malloc(nx * sizeof(float)); 49 | over->fWin1UVxfirst = (float *)malloc(nx * sizeof(float)); 50 | over->fWin1UVxlast = (float *)malloc(nx * sizeof(float)); 51 | for (int i = 0; i < ox; i++) { 52 | over->fWin1UVx[i] = cosf(M_PI * (i - ox + 0.5f) / (ox * 2)); 53 | over->fWin1UVx[i] = over->fWin1UVx[i] * over->fWin1UVx[i]; // left window (rised cosine) 54 | over->fWin1UVxfirst[i] = 1; // very first window 55 | over->fWin1UVxlast[i] = over->fWin1UVx[i]; // very last 56 | } 57 | for (int i = ox; i < nx - ox; i++) { 58 | over->fWin1UVx[i] = 1; 59 | over->fWin1UVxfirst[i] = 1; // very first window 60 | over->fWin1UVxlast[i] = 1; // very last 61 | } 62 | for (int i = nx - ox; i < nx; i++) { 63 | over->fWin1UVx[i] = cosf(M_PI * (i - nx + ox + 0.5f) / (ox * 2)); 64 | over->fWin1UVx[i] = over->fWin1UVx[i] * over->fWin1UVx[i]; // right window (falled cosine) 65 | over->fWin1UVxfirst[i] = over->fWin1UVx[i]; // very first window 66 | over->fWin1UVxlast[i] = 1; // very last 67 | } 68 | 69 | over->fWin1UVy = (float *)malloc(ny * sizeof(float)); 70 | over->fWin1UVyfirst = (float *)malloc(ny * sizeof(float)); 71 | over->fWin1UVylast = (float *)malloc(ny * sizeof(float)); 72 | for (int i = 0; i < oy; i++) { 73 | over->fWin1UVy[i] = cosf(M_PI * (i - oy + 0.5f) / (oy * 2)); 74 | over->fWin1UVy[i] = over->fWin1UVy[i] * over->fWin1UVy[i]; // left window (rised cosine) 75 | over->fWin1UVyfirst[i] = 1; // very first window 76 | over->fWin1UVylast[i] = over->fWin1UVy[i]; // very last 77 | } 78 | for (int i = oy; i < ny - oy; i++) { 79 | over->fWin1UVy[i] = 1; 80 | over->fWin1UVyfirst[i] = 1; // very first window 81 | over->fWin1UVylast[i] = 1; // very last 82 | } 83 | for (int i = ny - oy; i < ny; i++) { 84 | over->fWin1UVy[i] = 
cosf(M_PI * (i - ny + oy + 0.5f) / (oy * 2)); 85 | over->fWin1UVy[i] = over->fWin1UVy[i] * over->fWin1UVy[i]; // right window (falled cosine) 86 | over->fWin1UVyfirst[i] = over->fWin1UVy[i]; // very first window 87 | over->fWin1UVylast[i] = 1; // very last 88 | } 89 | 90 | 91 | over->Overlap9Windows = (int16_t *)malloc(over->size * 9 * sizeof(int16_t)); 92 | 93 | int16_t *winOverUVTL = over->Overlap9Windows; 94 | int16_t *winOverUVTM = over->Overlap9Windows + over->size; 95 | int16_t *winOverUVTR = over->Overlap9Windows + over->size * 2; 96 | int16_t *winOverUVML = over->Overlap9Windows + over->size * 3; 97 | int16_t *winOverUVMM = over->Overlap9Windows + over->size * 4; 98 | int16_t *winOverUVMR = over->Overlap9Windows + over->size * 5; 99 | int16_t *winOverUVBL = over->Overlap9Windows + over->size * 6; 100 | int16_t *winOverUVBM = over->Overlap9Windows + over->size * 7; 101 | int16_t *winOverUVBR = over->Overlap9Windows + over->size * 8; 102 | 103 | for (int j = 0; j < ny; j++) { 104 | for (int i = 0; i < nx; i++) { 105 | winOverUVTL[i] = (int)(over->fWin1UVyfirst[j] * over->fWin1UVxfirst[i] * 2048 + 0.5f); 106 | winOverUVTM[i] = (int)(over->fWin1UVyfirst[j] * over->fWin1UVx[i] * 2048 + 0.5f); 107 | winOverUVTR[i] = (int)(over->fWin1UVyfirst[j] * over->fWin1UVxlast[i] * 2048 + 0.5f); 108 | winOverUVML[i] = (int)(over->fWin1UVy[j] * over->fWin1UVxfirst[i] * 2048 + 0.5f); 109 | winOverUVMM[i] = (int)(over->fWin1UVy[j] * over->fWin1UVx[i] * 2048 + 0.5f); 110 | winOverUVMR[i] = (int)(over->fWin1UVy[j] * over->fWin1UVxlast[i] * 2048 + 0.5f); 111 | winOverUVBL[i] = (int)(over->fWin1UVylast[j] * over->fWin1UVxfirst[i] * 2048 + 0.5f); 112 | winOverUVBM[i] = (int)(over->fWin1UVylast[j] * over->fWin1UVx[i] * 2048 + 0.5f); 113 | winOverUVBR[i] = (int)(over->fWin1UVylast[j] * over->fWin1UVxlast[i] * 2048 + 0.5f); 114 | } 115 | winOverUVTL += nx; 116 | winOverUVTM += nx; 117 | winOverUVTR += nx; 118 | winOverUVML += nx; 119 | winOverUVMM += nx; 120 | winOverUVMR += nx; 121 | 
winOverUVBL += nx; 122 | winOverUVBM += nx; 123 | winOverUVBR += nx; 124 | } 125 | } 126 | 127 | 128 | void overDeinit(OverlapWindows *over) { 129 | free(over->Overlap9Windows); 130 | free(over->fWin1UVx); 131 | free(over->fWin1UVxfirst); 132 | free(over->fWin1UVxlast); 133 | free(over->fWin1UVy); 134 | free(over->fWin1UVyfirst); 135 | free(over->fWin1UVylast); 136 | } 137 | 138 | 139 | int16_t *overGetWindow(const OverlapWindows *over, int i) { 140 | return over->Overlap9Windows + over->size * i; 141 | } 142 | 143 | 144 | template 145 | void overlaps_c(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc8, intptr_t nSrcPitch, int16_t *pWin, intptr_t nWinPitch) { 146 | /* pWin from 0 to 2048 */ 147 | for (unsigned j = 0; j < blockHeight; j++) { 148 | for (unsigned i = 0; i < blockWidth; i++) { 149 | PixelType2 *pDst = (PixelType2 *)pDst8; 150 | const PixelType *pSrc = (const PixelType *)pSrc8; 151 | 152 | pDst[i] += ((pSrc[i] * pWin[i]) >> 6); 153 | } 154 | pDst8 += nDstPitch; 155 | pSrc8 += nSrcPitch; 156 | pWin += nWinPitch; 157 | } 158 | } 159 | 160 | 161 | #if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM) 162 | 163 | #if defined(MVTOOLS_ARM) 164 | #include "sse2neon.h" 165 | #else 166 | #include 167 | #endif 168 | 169 | 170 | #define zeroes _mm_setzero_si128() 171 | 172 | 173 | template 174 | struct OverlapsWrapper { 175 | static_assert(blockWidth >= 8, ""); 176 | 177 | static void overlaps_sse2(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc, intptr_t nSrcPitch, int16_t *pWin, intptr_t nWinPitch) { 178 | /* pWin from 0 to 2048 */ 179 | for (unsigned y = 0; y < blockHeight; y++) { 180 | for (unsigned x = 0; x < blockWidth; x += 8) { 181 | uint16_t *pDst = (uint16_t *)pDst8; 182 | 183 | __m128i src = _mm_loadl_epi64((const __m128i *)&pSrc[x]); 184 | __m128i win = _mm_loadu_si128((const __m128i *)&pWin[x]); 185 | __m128i dst = _mm_loadu_si128((__m128i *)&pDst[x]); 186 | 187 | src = _mm_unpacklo_epi8(src, zeroes); 188 | 189 | __m128i lo = 
_mm_mullo_epi16(src, win); 190 | __m128i hi = _mm_mulhi_epi16(src, win); 191 | lo = _mm_srli_epi16(lo, 6); 192 | hi = _mm_slli_epi16(hi, 10); 193 | dst = _mm_adds_epu16(dst, _mm_or_si128(lo, hi)); 194 | _mm_storeu_si128((__m128i *)&pDst[x], dst); 195 | } 196 | 197 | pDst8 += nDstPitch; 198 | pSrc += nSrcPitch; 199 | pWin += nWinPitch; 200 | } 201 | } 202 | 203 | }; 204 | 205 | 206 | template 207 | struct OverlapsWrapper<4, blockHeight> { 208 | 209 | static void overlaps_sse2(uint8_t *pDst, intptr_t nDstPitch, const uint8_t *pSrc, intptr_t nSrcPitch, int16_t *pWin, intptr_t nWinPitch) { 210 | /* pWin from 0 to 2048 */ 211 | for (unsigned y = 0; y < blockHeight; y++) { 212 | __m128i src = _mm_cvtsi32_si128(*(const int *)pSrc); 213 | __m128i win = _mm_loadl_epi64((const __m128i *)pWin); 214 | __m128i dst = _mm_loadl_epi64((const __m128i *)pDst); 215 | 216 | src = _mm_unpacklo_epi8(src, zeroes); 217 | 218 | __m128i lo = _mm_mullo_epi16(src, win); 219 | __m128i hi = _mm_mulhi_epi16(src, win); 220 | lo = _mm_srli_epi16(lo, 6); 221 | hi = _mm_slli_epi16(hi, 10); 222 | dst = _mm_adds_epu16(dst, _mm_or_si128(lo, hi)); 223 | _mm_storel_epi64((__m128i *)pDst, dst); 224 | 225 | pDst += nDstPitch; 226 | pSrc += nSrcPitch; 227 | pWin += nWinPitch; 228 | } 229 | } 230 | 231 | }; 232 | 233 | 234 | #undef zeroes 235 | 236 | 237 | #endif 238 | 239 | 240 | // opt can fit in four bits, if the width and height need more than eight bits each. 
241 | #define KEY(width, height, bits, opt) (unsigned)(width) << 24 | (height) << 16 | (bits) << 8 | (opt) 242 | 243 | #if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM) 244 | #define OVERS_SSE2(width, height) \ 245 | { KEY(width, height, 8, MVOPT_SSE2), OverlapsWrapper::overlaps_sse2 }, 246 | #else 247 | #define OVERS_SSE2(width, height) 248 | #endif 249 | 250 | #define OVERS(width, height) \ 251 | { KEY(width, height, 8, MVOPT_SCALAR), overlaps_c }, \ 252 | { KEY(width, height, 16, MVOPT_SCALAR), overlaps_c }, 253 | 254 | static const std::unordered_map overlaps_functions = { 255 | OVERS(2, 2) 256 | OVERS(2, 4) 257 | OVERS(4, 2) 258 | OVERS(4, 4) 259 | OVERS(4, 8) 260 | OVERS(8, 1) 261 | OVERS(8, 2) 262 | OVERS(8, 4) 263 | OVERS(8, 8) 264 | OVERS(8, 16) 265 | OVERS(16, 1) 266 | OVERS(16, 2) 267 | OVERS(16, 4) 268 | OVERS(16, 8) 269 | OVERS(16, 16) 270 | OVERS(16, 32) 271 | OVERS(32, 8) 272 | OVERS(32, 16) 273 | OVERS(32, 32) 274 | OVERS(32, 64) 275 | OVERS(64, 16) 276 | OVERS(64, 32) 277 | OVERS(64, 64) 278 | OVERS(64, 128) 279 | OVERS(128, 32) 280 | OVERS(128, 64) 281 | OVERS(128, 128) 282 | OVERS_SSE2(4, 2) 283 | OVERS_SSE2(4, 4) 284 | OVERS_SSE2(4, 8) 285 | OVERS_SSE2(8, 1) 286 | OVERS_SSE2(8, 2) 287 | OVERS_SSE2(8, 4) 288 | OVERS_SSE2(8, 8) 289 | OVERS_SSE2(8, 16) 290 | OVERS_SSE2(16, 1) 291 | OVERS_SSE2(16, 2) 292 | OVERS_SSE2(16, 4) 293 | OVERS_SSE2(16, 8) 294 | OVERS_SSE2(16, 16) 295 | OVERS_SSE2(16, 32) 296 | OVERS_SSE2(32, 8) 297 | OVERS_SSE2(32, 16) 298 | OVERS_SSE2(32, 32) 299 | OVERS_SSE2(32, 64) 300 | OVERS_SSE2(64, 16) 301 | OVERS_SSE2(64, 32) 302 | OVERS_SSE2(64, 64) 303 | OVERS_SSE2(64, 128) 304 | OVERS_SSE2(128, 32) 305 | OVERS_SSE2(128, 64) 306 | OVERS_SSE2(128, 128) 307 | }; 308 | 309 | OverlapsFunction selectOverlapsFunction(unsigned width, unsigned height, unsigned bits, int opt) { 310 | OverlapsFunction overs = overlaps_functions.at(KEY(width, height, bits, MVOPT_SCALAR)); 311 | 312 | #if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM) 313 | if 
(opt) { 314 | try { 315 | overs = overlaps_functions.at(KEY(width, height, bits, MVOPT_SSE2)); 316 | } catch (std::out_of_range &) { } 317 | #ifdef MVTOOLS_X86 318 | if (g_cpuinfo & X264_CPU_AVX2) { 319 | OverlapsFunction tmp = selectOverlapsFunctionAVX2(width, height, bits); 320 | if (tmp) 321 | overs = tmp; 322 | } 323 | #endif 324 | } 325 | #endif 326 | 327 | return overs; 328 | } 329 | 330 | #undef OVERS 331 | #undef OVERS_SSE2 332 | #undef KEY 333 | 334 | 335 | #define ToPixels(PixelType2, PixelType) \ 336 | void ToPixels_##PixelType2##_##PixelType(uint8_t *pDst8, int nDstPitch, const uint8_t *pSrc8, int nSrcPitch, int nWidth, int nHeight, int bitsPerSample) { \ 337 | int pixelMax = (1 << bitsPerSample) - 1; \ 338 | \ 339 | for (int h = 0; h < nHeight; h++) { \ 340 | for (int i = 0; i < nWidth; i++) { \ 341 | const PixelType2 *pSrc = (const PixelType2 *)pSrc8; \ 342 | PixelType *pDst = (PixelType *)pDst8; \ 343 | \ 344 | int a = (pSrc[i] + 16) >> 5; \ 345 | if (sizeof(PixelType) == 1) \ 346 | pDst[i] = a | ((255 - a) >> (sizeof(int) * 8 - 1)); \ 347 | else \ 348 | pDst[i] = min(pixelMax, a); \ 349 | } \ 350 | pDst8 += nDstPitch; \ 351 | pSrc8 += nSrcPitch; \ 352 | } \ 353 | } 354 | 355 | ToPixels(uint16_t, uint8_t) 356 | ToPixels(uint32_t, uint16_t) 357 | -------------------------------------------------------------------------------- /src/Overlap.h: -------------------------------------------------------------------------------- 1 | #ifndef OVERLAP_H 2 | #define OVERLAP_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include 9 | 10 | // top, middle, botom and left, middle, right windows 11 | #define OW_TL 0 12 | #define OW_TM 1 13 | #define OW_TR 2 14 | #define OW_ML 3 15 | #define OW_MM 4 16 | #define OW_MR 5 17 | #define OW_BL 6 18 | #define OW_BM 7 19 | #define OW_BR 8 20 | 21 | typedef struct OverlapWindows { 22 | int nx; // window sizes 23 | int ny; 24 | int ox; // overap sizes 25 | int oy; 26 | int size; // full window size= nx*ny 27 | 
28 | int16_t *Overlap9Windows; 29 | 30 | float *fWin1UVx; 31 | float *fWin1UVxfirst; 32 | float *fWin1UVxlast; 33 | float *fWin1UVy; 34 | float *fWin1UVyfirst; 35 | float *fWin1UVylast; 36 | } OverlapWindows; 37 | 38 | void overInit(OverlapWindows *over, int nx, int ny, int ox, int oy); 39 | 40 | void overDeinit(OverlapWindows *over); 41 | 42 | int16_t *overGetWindow(const OverlapWindows *over, int i); 43 | 44 | 45 | typedef void (*OverlapsFunction)(uint8_t *pDst, intptr_t nDstPitch, 46 | const uint8_t *pSrc, intptr_t nSrcPitch, 47 | int16_t *pWin, intptr_t nWinPitch); 48 | 49 | 50 | typedef void (*ToPixelsFunction)(uint8_t *pDst, int nDstPitch, 51 | const uint8_t *pSrc, int nSrcPitch, 52 | int width, int height, int bitsPerSample); 53 | 54 | void ToPixels_uint16_t_uint8_t(uint8_t *pDst8, int nDstPitch, const uint8_t *pSrc8, int nSrcPitch, int nWidth, int nHeight, int bitsPerSample); 55 | void ToPixels_uint32_t_uint16_t(uint8_t *pDst8, int nDstPitch, const uint8_t *pSrc8, int nSrcPitch, int nWidth, int nHeight, int bitsPerSample); 56 | 57 | OverlapsFunction selectOverlapsFunction(unsigned width, unsigned height, unsigned bits, int opt); 58 | 59 | #if defined(MVTOOLS_X86) 60 | OverlapsFunction selectOverlapsFunctionAVX2(unsigned width, unsigned height, unsigned bits); 61 | #endif 62 | 63 | #ifdef __cplusplus 64 | } // extern "C" 65 | #endif 66 | 67 | #endif // OVERLAP_H 68 | -------------------------------------------------------------------------------- /src/Overlap_AVX2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "Overlap.h" 5 | 6 | #if defined(MVTOOLS_X86) 7 | 8 | #include 9 | 10 | template 11 | static void overlaps_avx2(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc, intptr_t nSrcPitch, int16_t *pWin, intptr_t nWinPitch) { 12 | static_assert(blockWidth >= 16 || (blockWidth == 8 && blockHeight >= 2), ""); 13 | 14 | int pitchMul = blockWidth == 8 ? 
2 : 1; 15 | 16 | /* pWin from 0 to 2048 */ 17 | for (unsigned y = 0; y < blockHeight; y += pitchMul) { 18 | for (unsigned x = 0; x < blockWidth; x += 16 / pitchMul) { 19 | uint16_t *pDst = (uint16_t *)pDst8; 20 | 21 | __m256i src, win, dst; 22 | 23 | if (blockWidth == 8) { 24 | src = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(pSrc + x)), _mm_loadl_epi64((const __m128i *)(pSrc + nSrcPitch + x)))); 25 | win = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(pWin + x))), _mm_loadu_si128((const __m128i *)(pWin + nWinPitch + x)), 1); 26 | dst = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(pDst + x))), _mm_loadu_si128((const __m128i *)(pDst8 + nDstPitch + x * sizeof(uint16_t))), 1); 27 | } else { 28 | src = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(pSrc + x))); 29 | win = _mm256_loadu_si256((const __m256i *)(pWin + x)); 30 | dst = _mm256_loadu_si256((const __m256i *)(pDst + x)); 31 | } 32 | 33 | __m256i lo = _mm256_mullo_epi16(src, win); 34 | __m256i hi = _mm256_mulhi_epi16(src, win); 35 | lo = _mm256_srli_epi16(lo, 6); 36 | hi = _mm256_slli_epi16(hi, 10); 37 | dst = _mm256_adds_epu16(dst, _mm256_or_si256(lo, hi)); 38 | 39 | if (blockWidth == 8) { 40 | _mm_storeu_si128((__m128i *)(pDst + x), _mm256_castsi256_si128(dst)); 41 | _mm_storeu_si128((__m128i *)(pDst8 + nDstPitch + x * sizeof(uint16_t)), _mm256_extractf128_si256(dst, 1)); 42 | } else { 43 | _mm256_storeu_si256((__m256i *)(pDst + x), dst); 44 | } 45 | } 46 | 47 | pDst8 += nDstPitch * pitchMul; 48 | pSrc += nSrcPitch * pitchMul; 49 | pWin += nWinPitch * pitchMul; 50 | } 51 | } 52 | 53 | #endif 54 | 55 | 56 | enum InstructionSets { 57 | Scalar, 58 | SSE2, 59 | AVX2, 60 | }; 61 | 62 | 63 | // opt can fit in four bits, if the width and height need more than eight bits each. 
64 | #define KEY(width, height, bits, opt) (unsigned)(width) << 24 | (height) << 16 | (bits) << 8 | (opt) 65 | 66 | #if defined(MVTOOLS_X86) 67 | #define OVERS_AVX2(width, height) \ 68 | { KEY(width, height, 8, AVX2), overlaps_avx2 }, 69 | #else 70 | #define OVERS_AVX2(width, height) 71 | #endif 72 | 73 | static const std::unordered_map overlaps_functions = { 74 | OVERS_AVX2(8, 2) 75 | OVERS_AVX2(8, 4) 76 | OVERS_AVX2(8, 8) 77 | OVERS_AVX2(8, 16) 78 | OVERS_AVX2(16, 1) 79 | OVERS_AVX2(16, 2) 80 | OVERS_AVX2(16, 4) 81 | OVERS_AVX2(16, 8) 82 | OVERS_AVX2(16, 16) 83 | OVERS_AVX2(16, 32) 84 | OVERS_AVX2(32, 8) 85 | OVERS_AVX2(32, 16) 86 | OVERS_AVX2(32, 32) 87 | OVERS_AVX2(32, 64) 88 | OVERS_AVX2(64, 16) 89 | OVERS_AVX2(64, 32) 90 | OVERS_AVX2(64, 64) 91 | OVERS_AVX2(64, 128) 92 | OVERS_AVX2(128, 32) 93 | OVERS_AVX2(128, 64) 94 | OVERS_AVX2(128, 128) 95 | }; 96 | 97 | 98 | OverlapsFunction selectOverlapsFunctionAVX2(unsigned width, unsigned height, unsigned bits) { 99 | try { 100 | return overlaps_functions.at(KEY(width, height, bits, AVX2)); 101 | } catch (std::out_of_range &) { 102 | return nullptr; 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/PlaneOfBlocks.h: -------------------------------------------------------------------------------- 1 | // See legal notice in Copying.txt for more information 2 | 3 | // This program is free software; you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation; either version 2 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // This program is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 
12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with this program; if not, write to the Free Software 15 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 16 | // http://www.gnu.org/copyleft/gpl.html . 17 | 18 | #ifndef PLANEOFBLOCKS_H 19 | #define PLANEOFBLOCKS_H 20 | 21 | #include 22 | 23 | #include "Fakery.h" 24 | #include "MVFrame.h" 25 | #include "CopyCode.h" 26 | #include "SADFunctions.h" 27 | #include "CommonFunctions.h" 28 | #include "Luma.h" 29 | #include "DCTFFTW.h" 30 | 31 | #ifdef __cplusplus 32 | extern "C" { 33 | #endif 34 | 35 | #define MAX_PREDICTOR 5 // right now 5 should be enough (TSchniede) 36 | 37 | //#define ONLY_CHECK_NONDEFAULT_MV // make the check if it is no default reference (zero, global,...) 38 | 39 | 40 | typedef struct PlaneOfBlocks { 41 | 42 | /* fields set at initialization */ 43 | 44 | int nBlkX; /* width in number of blocks */ 45 | int nBlkY; /* height in number of blocks */ 46 | int nBlkSizeX; /* size of a block */ 47 | int nBlkSizeY; /* size of a block */ 48 | int nBlkCount; /* number of blocks in the plane */ 49 | int nPel; /* pel refinement accuracy */ 50 | int nLogPel; /* logarithm of the pel refinement accuracy */ 51 | int nScale; /* scaling factor of the plane */ 52 | int nLogScale; /* logarithm of the scaling factor */ 53 | int nOverlapX; // overlap size 54 | int nOverlapY; // overlap size 55 | int xRatioUV; 56 | int yRatioUV; 57 | int nLogxRatioUV; // log of xRatioUV (0 for 1 and 1 for 2) 58 | int nLogyRatioUV; // log of yRatioUV (0 for 1 and 1 for 2) 59 | int bytesPerSample; 60 | 61 | SADFunction SAD; /* function which computes the sad */ 62 | LUMAFunction LUMA; /* function which computes the mean luma */ 63 | COPYFunction BLITLUMA; 64 | COPYFunction BLITCHROMA; 65 | SADFunction SADCHROMA; 66 | SADFunction SATD; /* SATD function, (similar to SAD), used as replacement to dct */ 67 | 68 | VECTOR *vectors; /* motion vectors of the blocks */ 69 | /* before 
the search, contains the hierachal predictor */ 70 | /* after the search, contains the best motion vector */ 71 | 72 | int smallestPlane; /* say whether vectors can used predictors from a smaller plane */ 73 | int chroma; /* do we do chroma me */ 74 | 75 | /* working fields */ 76 | 77 | MVFrame *pSrcFrame; 78 | MVFrame *pRefFrame; 79 | 80 | int nSrcPitch[3]; 81 | const uint8_t *pSrc[3]; // the alignment of this array is important for speed for some reason (cacheline?) 82 | int nRefPitch[3]; 83 | 84 | VECTOR bestMV; /* best vector found so far during the search */ 85 | int64_t nMinCost; /* minimum cost ( sad + mv cost ) found so far */ 86 | VECTOR predictor; /* best predictor for the current vector */ 87 | 88 | VECTOR predictors[MAX_PREDICTOR]; /* set of predictors for the current block */ 89 | 90 | int nDxMin; /* minimum x coordinate for the vector */ 91 | int nDyMin; /* minimum y coordinate for the vector */ 92 | int nDxMax; /* maximum x corrdinate for the vector */ 93 | int nDyMax; /* maximum y coordinate for the vector */ 94 | 95 | int x[3]; /* absolute x coordinate of the origin of the block in the reference frame */ 96 | int y[3]; /* absolute y coordinate of the origin of the block in the reference frame */ 97 | int blkx; /* x coordinate in blocks */ 98 | int blky; /* y coordinate in blocks */ 99 | int blkIdx; /* index of the block */ 100 | int blkScanDir; // direction of scan (1 is left to rught, -1 is right to left) 101 | 102 | /* search parameters */ 103 | 104 | SearchType searchType; /* search type used */ 105 | int nSearchParam; /* additionnal parameter for this search */ 106 | int64_t nLambda; /* vector cost factor */ 107 | int64_t LSAD; // SAD limit for lambda using - Fizick 108 | int penaltyNew; // cost penalty factor for new candidates 109 | int penaltyZero; // cost penalty factor for zero vector 110 | int pglobal; // cost penalty factor for global predictor 111 | // int nLambdaLen; // penalty factor (lambda) for vector length 112 | int64_t badSAD; // 
SAD threshold for more wide search 113 | int badrange; // wide search radius 114 | int badcount; // number of bad blocks refined 115 | int tryMany; // try refine around many predictors 116 | 117 | VECTOR globalMVPredictor; // predictor of global motion vector 118 | VECTOR zeroMVfieldShifted; // zero motion vector for fieldbased video at finest level pel2 119 | 120 | DCTFFTW *DCT; 121 | uint8_t *dctSrc; 122 | uint8_t *dctRef; 123 | int dctpitch; 124 | int dctmode; 125 | int srcLuma; 126 | int refLuma; 127 | int sumLumaChange; 128 | int dctweight16; 129 | int *freqArray; // temporary array for global motion estimaton 130 | int freqSize; // size of freqArray 131 | int64_t verybigSAD; 132 | 133 | int nSrcPitch_temp[3]; 134 | uint8_t *pSrc_temp[3]; //for easy WRITE access to temp block 135 | } PlaneOfBlocks; 136 | 137 | 138 | void pobInit(PlaneOfBlocks *pob, int _nBlkX, int _nBlkY, int _nBlkSizeX, int _nBlkSizeY, int _nPel, int _nLevel, int nMotionFlags, int nCPUFlags, int _nOverlapX, int _nOverlapY, int _xRatioUV, int _yRatioUV, int bitsPerSample); 139 | 140 | void pobDeinit(PlaneOfBlocks *pob); 141 | 142 | void pobEstimateGlobalMVDoubled(PlaneOfBlocks *pob, VECTOR *globalMVec); 143 | 144 | MVArraySizeType pobGetArraySize(const PlaneOfBlocks *pob, int divideMode); 145 | 146 | void pobInterpolatePrediction(PlaneOfBlocks *pob, const PlaneOfBlocks *pob2); 147 | 148 | void pobRecalculateMVs(PlaneOfBlocks *pob, const FakeGroupOfPlanes *fgop, MVFrame *pSrcFrame, MVFrame *pRefFrame, SearchType st, int stp, int lambda, int pnew, uint8_t *out, int fieldShift, int64_t thSAD, DCTFFTW *DCT, int dctmode, int smooth, int meander); 149 | 150 | void pobSearchMVs(PlaneOfBlocks *pob, MVFrame *pSrcFrame, MVFrame *pRefFrame, SearchType st, int stp, int lambda, int lsad, int pnew, int plevel, uint8_t *out, VECTOR *globalMVec, int fieldShift, DCTFFTW *DCT, int dctmode, int *pmeanLumaChange, int pzero, int pglobal, int64_t badSAD, int badrange, int meander, int tryMany); 151 | 152 | 
MVArraySizeType pobWriteDefaultToArray(const PlaneOfBlocks *pob, uint8_t *array, int divideMode); 153 | 154 | #ifdef __cplusplus 155 | } 156 | #endif 157 | 158 | #endif 159 | -------------------------------------------------------------------------------- /src/SADFunctions.h: -------------------------------------------------------------------------------- 1 | // Functions that computes distances between blocks 2 | 3 | // See legal notice in Copying.txt for more information 4 | 5 | // This program is free software; you can redistribute it and/or modify 6 | // it under the terms of the GNU General Public License as published by 7 | // the Free Software Foundation; either version 2 of the License, or 8 | // (at your option) any later version. 9 | // 10 | // This program is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | // GNU General Public License for more details. 14 | // 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 18 | // http://www.gnu.org/copyleft/gpl.html . 
19 | 20 | #ifndef SADFUNCTIONS_H 21 | #define SADFUNCTIONS_H 22 | 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #endif 26 | 27 | #include 28 | 29 | 30 | typedef unsigned int (*SADFunction)(const uint8_t *pSrc, intptr_t nSrcPitch, 31 | const uint8_t *pRef, intptr_t nRefPitch); 32 | 33 | 34 | SADFunction selectSADFunction(unsigned width, unsigned height, unsigned bits, int opt, unsigned cpu); 35 | 36 | SADFunction selectSATDFunction(unsigned width, unsigned height, unsigned bits, int opt, unsigned cpu); 37 | 38 | 39 | #if defined(MVTOOLS_X86) 40 | SADFunction selectSADFunctionAVX2(unsigned width, unsigned height, unsigned bits); 41 | #endif 42 | 43 | #ifdef __cplusplus 44 | } // extern "C" 45 | #endif 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /src/SADFunctions_AVX2.cpp: -------------------------------------------------------------------------------- 1 | #if defined(MVTOOLS_X86) 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "SADFunctions.h" 8 | 9 | #define zeroes _mm256_setzero_si256() 10 | 11 | 12 | // This version used for width >= 32. 
13 | template 14 | struct SADWrapperU8_AVX2 { 15 | static_assert(width >= 32, ""); 16 | 17 | static unsigned int sad_u8_avx2(const uint8_t *pSrc, intptr_t nSrcPitch, const uint8_t *pRef, intptr_t nRefPitch) { 18 | (void)nSrcPitch; 19 | 20 | __m256i sum = zeroes; 21 | 22 | for (unsigned y = 0; y < height; y++) { 23 | for (unsigned x = 0; x < width; x += 32) { 24 | __m256i m2 = _mm256_loadu_si256((const __m256i *)&pSrc[x]); 25 | __m256i m3 = _mm256_loadu_si256((const __m256i *)&pRef[x]); 26 | 27 | __m256i diff = _mm256_sad_epu8(m2, m3); 28 | 29 | sum = _mm256_add_epi64(sum, diff); 30 | } 31 | 32 | pSrc += /*nSrcPitch*/ width; 33 | pRef += nRefPitch; 34 | } 35 | 36 | sum = _mm256_add_epi64(sum, _mm256_permute4x64_epi64(sum, _MM_SHUFFLE(0, 0, 3, 2))); 37 | sum = _mm256_add_epi64(sum, _mm256_shuffle_epi32(sum, _MM_SHUFFLE(0, 0, 3, 2))); 38 | return (unsigned)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum)); 39 | } 40 | 41 | }; 42 | 43 | template 44 | struct SADWrapperU8_AVX2<16, height> { 45 | static_assert(height >= 2, ""); 46 | 47 | static unsigned int sad_u8_avx2(const uint8_t *pSrc, intptr_t nSrcPitch, const uint8_t *pRef, intptr_t nRefPitch) { 48 | (void)nSrcPitch; 49 | 50 | __m256i sum = zeroes; 51 | 52 | for (int y = 0; (unsigned)y < height; y += 2) { 53 | __m256i m2 = _mm256_loadu_si256((const __m256i *)(pSrc + y * 16)); 54 | __m256i m3 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(pRef + y * nRefPitch))); 55 | m3 = _mm256_insertf128_si256(m3, _mm_loadu_si128((const __m128i *)(pRef + (y + 1) * nRefPitch)), 1); 56 | 57 | __m256i diff = _mm256_sad_epu8(m2, m3); 58 | sum = _mm256_add_epi64(sum, diff); 59 | } 60 | 61 | sum = _mm256_add_epi64(sum, _mm256_permute4x64_epi64(sum, _MM_SHUFFLE(0, 0, 3, 2))); 62 | sum = _mm256_add_epi64(sum, _mm256_shuffle_epi32(sum, _MM_SHUFFLE(0, 0, 3, 2))); 63 | return (unsigned)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum)); 64 | } 65 | }; 66 | 67 | 68 | // opt can fit in four bits, if the width and height need more than 
eight bits each. 69 | #define KEY(width, height, bits, opt) (unsigned)(width) << 24 | (height) << 16 | (bits) << 8 | (opt) 70 | 71 | 72 | #define SAD_U8_AVX2(width, height) \ 73 | { KEY(width, height, 8, 0), SADWrapperU8_AVX2::sad_u8_avx2 }, 74 | 75 | static const std::unordered_map sad_functions = { 76 | SAD_U8_AVX2(16, 2) 77 | SAD_U8_AVX2(16, 4) 78 | SAD_U8_AVX2(16, 4) 79 | SAD_U8_AVX2(16, 8) 80 | SAD_U8_AVX2(16, 16) 81 | SAD_U8_AVX2(16, 32) 82 | SAD_U8_AVX2(32, 8) 83 | SAD_U8_AVX2(32, 16) 84 | SAD_U8_AVX2(32, 32) 85 | SAD_U8_AVX2(32, 64) 86 | SAD_U8_AVX2(64, 16) 87 | SAD_U8_AVX2(64, 32) 88 | SAD_U8_AVX2(64, 64) 89 | SAD_U8_AVX2(64, 128) 90 | SAD_U8_AVX2(128, 32) 91 | SAD_U8_AVX2(128, 64) 92 | SAD_U8_AVX2(128, 128) 93 | }; 94 | 95 | SADFunction selectSADFunctionAVX2(unsigned width, unsigned height, unsigned bits) { 96 | try { 97 | return sad_functions.at(KEY(width, height, bits, 0)); 98 | } catch (const std::out_of_range &) { 99 | return nullptr; 100 | } 101 | } 102 | 103 | #endif 104 | -------------------------------------------------------------------------------- /src/SimpleResize.cpp: -------------------------------------------------------------------------------- 1 | // This used to contain code from the SimpleResize Avisynth plugin, written 2 | // by Tom Barry and modified by Fizick. All of that was rewritten by dubhater, 3 | // using code by anon32 for, ahem, inspiration. 4 | // Only the name and the basic algorithm remain. 
5 | 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | #include "CPU.h" 12 | #include "SimpleResize.h" 13 | 14 | 15 | #if defined(MVTOOLS_X86) 16 | void simpleResize_uint8_t_avx2(const SimpleResize *simple, 17 | uint8_t *dstp, int dst_stride, 18 | const uint8_t *srcp, int src_stride, 19 | int horizontal_vectors); 20 | void simpleResize_int16_t_avx2(const SimpleResize *simple, 21 | int16_t *dstp, int dst_stride, 22 | const int16_t *srcp, int src_stride, 23 | int horizontal_vectors); 24 | #endif 25 | 26 | 27 | static void InitTables(int *offsets, int *weights, int out, int in) { 28 | // We don't do shifts. 29 | float leftmost = 0.5f; // + shift 30 | float rightmost = in - 0.5f; // + shift 31 | 32 | int leftmost_idx = VSMAX((int)leftmost, 0); 33 | int rightmost_idx = VSMIN((int)rightmost, in - 1); 34 | 35 | for (int i = 0; i < out; i++) { 36 | float position = (i + 0.5f) * (float)in / (float)out; 37 | 38 | float weight; 39 | int offset; 40 | 41 | if (position <= leftmost) { 42 | offset = leftmost_idx; 43 | weight = 0.0f; 44 | } else if (position >= rightmost) { 45 | offset = rightmost_idx - 1; 46 | weight = 1.0f; 47 | } else { 48 | offset = (int)(position - leftmost); 49 | weight = position - leftmost - offset; 50 | } 51 | 52 | offsets[i] = offset; 53 | 54 | weights[i] = (int)(weight * simple_resize_weight_max); 55 | } 56 | } 57 | 58 | 59 | // Thread-safe. 60 | template 61 | static void simpleResize(const SimpleResize *simple, 62 | PixelType *dstp, int dst_stride, 63 | const PixelType *srcp, int src_stride, 64 | int horizontal_vectors) { 65 | 66 | // Apparently only 16 bit vectors need limiting. 67 | bool limit_vectors = sizeof(PixelType) == 2; 68 | 69 | int pel = simple->pel; 70 | int minimum = 0; 71 | int maximum = simple->limit_height * pel - 1; 72 | int horizontal_step = horizontal_vectors ? pel : 0; 73 | int vertical_step = horizontal_vectors ? 
0 : pel; 74 | 75 | PixelType *workp = (PixelType *)malloc(simple->src_width * sizeof(PixelType)); 76 | 77 | for (int y = 0; y < simple->dst_height; y++) { 78 | int weight_bottom = simple->vertical_weights[y]; 79 | int weight_top = simple_resize_weight_max - weight_bottom; 80 | 81 | const PixelType *srcp1 = srcp + simple->vertical_offsets[y] * src_stride; 82 | const PixelType *srcp2 = srcp1 + src_stride; 83 | 84 | /* vertical */ 85 | for (int x = 0; x < simple->src_width; x++) { 86 | workp[x] = (srcp1[x] * weight_top + srcp2[x] * weight_bottom + simple_resize_weight_half) >> simple_resize_weight_shift; 87 | } 88 | 89 | if (horizontal_vectors) { 90 | minimum = 0; 91 | maximum = simple->limit_width * pel - 1; 92 | } 93 | 94 | /* horizontal */ 95 | for (int x = 0; x < simple->dst_width; x++) { 96 | int weight_right = simple->horizontal_weights[x]; 97 | int weight_left = simple_resize_weight_max - weight_right; 98 | int offset = simple->horizontal_offsets[x]; 99 | 100 | int result = (workp[offset] * weight_left + workp[offset + 1] * weight_right + simple_resize_weight_half) >> simple_resize_weight_shift; 101 | 102 | if (limit_vectors) { 103 | result = std::max(minimum, std::min(result, maximum)); 104 | 105 | minimum -= horizontal_step; 106 | maximum -= horizontal_step; 107 | } 108 | 109 | dstp[x] = result; 110 | } 111 | 112 | dstp += dst_stride; 113 | 114 | if (limit_vectors) { 115 | minimum -= vertical_step; 116 | maximum -= vertical_step; 117 | } 118 | } 119 | 120 | free(workp); 121 | } 122 | 123 | 124 | void simpleInit(SimpleResize *simple, int dst_width, int dst_height, int src_width, int src_height, int limit_width, int limit_height, int pel, int opt) { 125 | simple->src_width = src_width; 126 | simple->src_height = src_height; 127 | simple->dst_width = dst_width; 128 | simple->dst_height = dst_height; 129 | 130 | simple->limit_width = limit_width; 131 | simple->limit_height = limit_height; 132 | simple->pel = pel; 133 | 134 | // Offset to first line of the pair. 
135 | simple->vertical_offsets = (int *)malloc(dst_height * sizeof(int)); 136 | // Weight of the second line of the pair. 137 | simple->vertical_weights = (int *)malloc(dst_height * sizeof(int)); 138 | 139 | simple->horizontal_offsets = (int *)malloc(dst_width * sizeof(int)); 140 | simple->horizontal_weights = (int *)malloc(dst_width * sizeof(int)); 141 | 142 | InitTables(simple->horizontal_offsets, simple->horizontal_weights, dst_width, src_width); 143 | InitTables(simple->vertical_offsets, simple->vertical_weights, dst_height, src_height); 144 | 145 | simple->simpleResize_uint8_t = simpleResize; 146 | simple->simpleResize_int16_t = simpleResize; 147 | 148 | if (opt) { 149 | #if defined(MVTOOLS_X86) 150 | if (g_cpuinfo & X264_CPU_AVX2) { 151 | simple->simpleResize_uint8_t = simpleResize_uint8_t_avx2; 152 | simple->simpleResize_int16_t = simpleResize_int16_t_avx2; 153 | 154 | for (int i = 0; i < dst_width; i++) { 155 | int w = simple->horizontal_weights[i]; 156 | simple->horizontal_weights[i] = (w << 16) | (simple_resize_weight_max - w); 157 | } 158 | } 159 | #endif 160 | } 161 | } 162 | 163 | 164 | void simpleDeinit(SimpleResize *simple) { 165 | free(simple->vertical_offsets); 166 | free(simple->vertical_weights); 167 | free(simple->horizontal_offsets); 168 | free(simple->horizontal_weights); 169 | memset(simple, 0, sizeof(SimpleResize)); 170 | } 171 | 172 | -------------------------------------------------------------------------------- /src/SimpleResize.h: -------------------------------------------------------------------------------- 1 | // See legal notice in Copying.txt for more information 2 | 3 | // This program is free software; you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation; either version 2 of the License, or 6 | // (at your option) any later version. 
7 | // 8 | // This program is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with this program; if not, write to the Free Software 15 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 16 | // http://www.gnu.org/copyleft/gpl.html . 17 | 18 | // I (Fizick) borrow code from Tom Barry's SimpleResize here 19 | 20 | #ifndef SIMPLERESIZE_H 21 | #define SIMPLERESIZE_H 22 | 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #endif 26 | 27 | 28 | #include 29 | 30 | 31 | enum { 32 | simple_resize_weight_shift = 14, 33 | simple_resize_weight_max = 1 << simple_resize_weight_shift, 34 | simple_resize_weight_half = simple_resize_weight_max / 2, 35 | }; 36 | 37 | 38 | typedef struct SimpleResize SimpleResize; 39 | 40 | 41 | typedef void (*ResizeFunction8)(const SimpleResize *simple, 42 | uint8_t *dstp, int dst_stride, 43 | const uint8_t *srcp, int src_stride, 44 | int horizontal_vectors); 45 | typedef void (*ResizeFunction16)(const SimpleResize *simple, 46 | int16_t *dstp, int dst_stride, 47 | const int16_t *srcp, int src_stride, 48 | int horizontal_vectors); 49 | 50 | 51 | typedef struct SimpleResize { 52 | int dst_width; 53 | int dst_height; 54 | int src_width; 55 | int src_height; 56 | 57 | // Used only to limit the vectors in the 16 bit resizer. 58 | // dst_width and dst_height are usually the padded dimensions. 59 | // The two below are the unpadded dimensions, i.e. the actual frame size. 
60 | int limit_width; 61 | int limit_height; 62 | int pel; 63 | 64 | int *vertical_offsets; 65 | int *vertical_weights; 66 | 67 | int *horizontal_offsets; 68 | int *horizontal_weights; 69 | 70 | ResizeFunction8 simpleResize_uint8_t; 71 | ResizeFunction16 simpleResize_int16_t; 72 | } SimpleResize; 73 | 74 | 75 | void simpleInit(SimpleResize *simple, int dst_width, int dst_height, int src_width, int src_height, int limit_width, int limit_height, int pel, int opt); 76 | void simpleDeinit(SimpleResize *simple); 77 | 78 | 79 | #ifdef __cplusplus 80 | } // extern "C" 81 | #endif 82 | 83 | #endif // SIMPLERESIZE_H 84 | -------------------------------------------------------------------------------- /src/SimpleResize_AVX2.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | 4 | #include "SimpleResize.h" 5 | 6 | 7 | #ifdef _WIN32 8 | #define FORCE_INLINE __forceinline 9 | #else 10 | #define FORCE_INLINE inline __attribute__((always_inline)) 11 | #endif 12 | 13 | #define zeroes _mm_setzero_si128() 14 | 15 | 16 | static FORCE_INLINE void simpleResize_uint8_t_vertical_4px_avx2(uint8_t *workp, const uint8_t *srcp1, const uint8_t *srcp2, int x, const __m128i &dwords_weights) { 17 | __m128i top = _mm_cvtsi32_si128(*(const int *)&srcp1[x]); 18 | __m128i bottom = _mm_cvtsi32_si128(*(const int *)&srcp2[x]); 19 | __m128i pixels = _mm_unpacklo_epi8(_mm_unpacklo_epi8(bottom, top), zeroes); 20 | 21 | __m128i dst = _mm_madd_epi16(pixels, dwords_weights); 22 | 23 | dst = _mm_add_epi32(dst, _mm_set1_epi32(simple_resize_weight_half)); 24 | dst = _mm_srli_epi32(dst, simple_resize_weight_shift); 25 | dst = _mm_packs_epi32(dst, dst); 26 | dst = _mm_packus_epi16(dst, dst); 27 | *(int *)&workp[x] = _mm_cvtsi128_si32(dst); 28 | } 29 | 30 | 31 | static FORCE_INLINE void simpleResize_uint8_t_horizontal_8px_avx2(const SimpleResize *simple, uint8_t *dstp, uint8_t *workp, int x, const __m256i &shuffle_mask) { 32 | __m256i dwords_weights_h = 
_mm256_loadu_si256((const __m256i *)&simple->horizontal_weights[x]); 33 | __m256i dwords_offsets = _mm256_loadu_si256((const __m256i *)&simple->horizontal_offsets[x]); 34 | __m256i pixels = _mm256_i32gather_epi32((const int *)workp, dwords_offsets, sizeof(uint8_t)); 35 | 36 | pixels = _mm256_shuffle_epi8(pixels, shuffle_mask); 37 | 38 | pixels = _mm256_madd_epi16(pixels, dwords_weights_h); 39 | pixels = _mm256_add_epi32(pixels, _mm256_set1_epi32(simple_resize_weight_half)); 40 | pixels = _mm256_srai_epi32(pixels, simple_resize_weight_shift); 41 | pixels = _mm256_packs_epi32(pixels, pixels); 42 | pixels = _mm256_permute4x64_epi64(pixels, 0xe8); // 0b11101000 43 | pixels = _mm256_packus_epi16(pixels, pixels); 44 | 45 | _mm_storel_epi64((__m128i *)&dstp[x], _mm256_castsi256_si128(pixels)); 46 | } 47 | 48 | 49 | // Thread-safe. 50 | void simpleResize_uint8_t_avx2(const SimpleResize *simple, 51 | uint8_t *dstp, int dst_stride, 52 | const uint8_t *srcp, int src_stride, 53 | int horizontal_vectors) { 54 | (void)horizontal_vectors; 55 | 56 | // Two additional bytes because of vpgatherdd. 
57 | uint8_t *workp = (uint8_t *)malloc(simple->src_width * sizeof(uint8_t) + 2); 58 | 59 | #define SHUFFLE_PATTERN -0x80, 13, -0x80, 12, -0x80, 9, -0x80, 8, -0x80, 5, -0x80, 4, -0x80, 1, -0x80, 0 60 | __m256i shuffle_mask = _mm256_set_epi8(SHUFFLE_PATTERN, SHUFFLE_PATTERN); 61 | #undef SHUFFLE_PATTERN 62 | 63 | for (int y = 0; y < simple->dst_height; y++) { 64 | int weight_bottom = simple->vertical_weights[y]; 65 | int weight_top = simple_resize_weight_max - weight_bottom; 66 | 67 | const uint8_t *srcp1 = srcp + simple->vertical_offsets[y] * src_stride; 68 | const uint8_t *srcp2 = srcp1 + src_stride; 69 | 70 | __m128i dwords_weights_v = _mm_set1_epi32((weight_top << 16) | weight_bottom); 71 | 72 | int pixels_per_iteration = 4; 73 | const int src_width_avx2 = simple->src_width & ~(pixels_per_iteration - 1); 74 | 75 | /* vertical */ 76 | for (int x = 0; x < src_width_avx2; x += pixels_per_iteration) 77 | simpleResize_uint8_t_vertical_4px_avx2(workp, srcp1, srcp2, x, dwords_weights_v); 78 | 79 | if (src_width_avx2 < simple->src_width) 80 | simpleResize_uint8_t_vertical_4px_avx2(workp, srcp1, srcp2, simple->src_width - pixels_per_iteration, dwords_weights_v); 81 | 82 | 83 | pixels_per_iteration = 8; 84 | const int dst_width_avx2 = simple->dst_width & ~(pixels_per_iteration - 1); 85 | 86 | /* horizontal */ 87 | for (int x = 0; x < dst_width_avx2; x += pixels_per_iteration) 88 | simpleResize_uint8_t_horizontal_8px_avx2(simple, dstp, workp, x, shuffle_mask); 89 | 90 | if (dst_width_avx2 < simple->dst_width) 91 | simpleResize_uint8_t_horizontal_8px_avx2(simple, dstp, workp, simple->dst_width - pixels_per_iteration, shuffle_mask); 92 | 93 | dstp += dst_stride; 94 | } 95 | 96 | free(workp); 97 | } 98 | 99 | 100 | static FORCE_INLINE void simpleResize_int16_t_vertical_8px_avx2(int16_t *workp, const int16_t *srcp1, const int16_t *srcp2, int x, const __m128i &dwords_weights) { 101 | __m128i top = _mm_loadu_si128((const __m128i *)&srcp1[x]); 102 | __m128i bottom = 
_mm_loadu_si128((const __m128i *)&srcp2[x]); 103 | __m128i pixels_lo = _mm_unpacklo_epi16(bottom, top); 104 | __m128i pixels_hi = _mm_unpackhi_epi16(bottom, top); 105 | 106 | __m128i dst_lo = _mm_madd_epi16(pixels_lo, dwords_weights); 107 | __m128i dst_hi = _mm_madd_epi16(pixels_hi, dwords_weights); 108 | dst_lo = _mm_add_epi32(dst_lo, _mm_set1_epi32(simple_resize_weight_half)); 109 | dst_hi = _mm_add_epi32(dst_hi, _mm_set1_epi32(simple_resize_weight_half)); 110 | dst_lo = _mm_srai_epi32(dst_lo, simple_resize_weight_shift); 111 | dst_hi = _mm_srai_epi32(dst_hi, simple_resize_weight_shift); 112 | __m128i dst = _mm_packs_epi32(dst_lo, dst_hi); 113 | _mm_storeu_si128((__m128i *)&workp[x], dst); 114 | } 115 | 116 | 117 | static FORCE_INLINE void simpleResize_int16_t_horizontal_8px_avx2(const SimpleResize *simple, int16_t *dstp, int16_t *workp, int x, __m256i &minimum, __m256i &maximum, const __m256i &horizontal_step) { 118 | __m256i dwords_weights_h = _mm256_loadu_si256((const __m256i *)&simple->horizontal_weights[x]); 119 | __m256i dwords_offsets = _mm256_loadu_si256((const __m256i *)&simple->horizontal_offsets[x]); 120 | __m256i pixels = _mm256_i32gather_epi32((const int *)workp, dwords_offsets, sizeof(int16_t)); 121 | pixels = _mm256_madd_epi16(pixels, dwords_weights_h); 122 | pixels = _mm256_add_epi32(pixels, _mm256_set1_epi32(simple_resize_weight_half)); 123 | pixels = _mm256_srai_epi32(pixels, simple_resize_weight_shift); 124 | 125 | pixels = _mm256_max_epi32(minimum, 126 | _mm256_min_epi32(pixels, maximum)); 127 | 128 | pixels = _mm256_packs_epi32(pixels, pixels); 129 | 130 | minimum = _mm256_sub_epi32(minimum, horizontal_step); 131 | maximum = _mm256_sub_epi32(maximum, horizontal_step); 132 | 133 | _mm_storeu_si128((__m128i *)&dstp[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(pixels, 0xe8))); // 0b11101000 134 | } 135 | 136 | 137 | // Thread-safe. 
138 | void simpleResize_int16_t_avx2(const SimpleResize *simple, 139 | int16_t *dstp, int dst_stride, 140 | const int16_t *srcp, int src_stride, 141 | int horizontal_vectors) { 142 | int16_t *workp = (int16_t *)malloc(simple->src_width * sizeof(int16_t)); 143 | 144 | const int pixels_per_iteration = 8; 145 | 146 | int pel = simple->pel; 147 | __m256i minimum = _mm256_setzero_si256(); 148 | __m256i maximum = _mm256_set1_epi32(simple->limit_height * pel - 1); 149 | __m256i horizontal_step = _mm256_set1_epi32(horizontal_vectors ? pel * pixels_per_iteration : 0); 150 | __m256i vertical_step = _mm256_set1_epi32(horizontal_vectors ? 0 : pel); 151 | 152 | __m256i initial_horizontal_minimum = _mm256_set_epi32(-7 * pel, 153 | -6 * pel, 154 | -5 * pel, 155 | -4 * pel, 156 | -3 * pel, 157 | -2 * pel, 158 | -1 * pel, 159 | 0 * pel); 160 | __m256i initial_horizontal_maximum = _mm256_set_epi32((simple->limit_width - 7) * pel - 1, 161 | (simple->limit_width - 6) * pel - 1, 162 | (simple->limit_width - 5) * pel - 1, 163 | (simple->limit_width - 4) * pel - 1, 164 | (simple->limit_width - 3) * pel - 1, 165 | (simple->limit_width - 2) * pel - 1, 166 | (simple->limit_width - 1) * pel - 1, 167 | (simple->limit_width - 0) * pel - 1); 168 | 169 | for (int y = 0; y < simple->dst_height; y++) { 170 | int weight_bottom = simple->vertical_weights[y]; 171 | int weight_top = simple_resize_weight_max - weight_bottom; 172 | 173 | const int16_t *srcp1 = srcp + simple->vertical_offsets[y] * src_stride; 174 | const int16_t *srcp2 = srcp1 + src_stride; 175 | 176 | __m128i dwords_weights_v = _mm_set1_epi32((weight_top << 16) | weight_bottom); 177 | 178 | const int src_width_sse2 = simple->src_width & ~(pixels_per_iteration - 1); 179 | 180 | /* vertical */ 181 | for (int x = 0; x < src_width_sse2; x += pixels_per_iteration) 182 | simpleResize_int16_t_vertical_8px_avx2(workp, srcp1, srcp2, x, dwords_weights_v); 183 | 184 | if (src_width_sse2 < simple->src_width) 185 | 
simpleResize_int16_t_vertical_8px_avx2(workp, srcp1, srcp2, simple->src_width - pixels_per_iteration, dwords_weights_v); 186 | 187 | 188 | if (horizontal_vectors) { 189 | minimum = initial_horizontal_minimum; 190 | maximum = initial_horizontal_maximum; 191 | } 192 | 193 | 194 | const int dst_width_avx2 = simple->dst_width & ~(pixels_per_iteration - 1); 195 | 196 | /* horizontal */ 197 | for (int x = 0; x < dst_width_avx2; x += pixels_per_iteration) 198 | simpleResize_int16_t_horizontal_8px_avx2(simple, dstp, workp, x, minimum, maximum, horizontal_step); 199 | 200 | if (dst_width_avx2 < simple->dst_width) { 201 | if (horizontal_vectors) { 202 | __m256i step_back = _mm256_set1_epi32((pixels_per_iteration - (simple->dst_width - dst_width_avx2)) * pel); 203 | minimum = _mm256_add_epi32(minimum, step_back); 204 | maximum = _mm256_add_epi32(maximum, step_back); 205 | } 206 | 207 | simpleResize_int16_t_horizontal_8px_avx2(simple, dstp, workp, simple->dst_width - pixels_per_iteration, minimum, maximum, horizontal_step); 208 | } 209 | 210 | dstp += dst_stride; 211 | 212 | minimum = _mm256_sub_epi32(minimum, vertical_step); 213 | maximum = _mm256_sub_epi32(maximum, vertical_step); 214 | } 215 | 216 | free(workp); 217 | } 218 | -------------------------------------------------------------------------------- /src/asm/aarch64-asm.S: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * asm.S: AArch64 utility macros 3 | ***************************************************************************** 4 | * Copyright (C) 2008-2024 x264 project 5 | * 6 | * Authors: Mans Rullgard 7 | * David Conrad 8 | * Janne Grunau 9 | * 10 | * This program is free software; you can redistribute it and/or modify 11 | * it under the terms of the GNU General Public License as published by 12 | * the Free Software Foundation; either version 2 of the License, or 13 | * (at your option) any 
later version. 14 | * 15 | * This program is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with this program; if not, write to the Free Software 22 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 23 | * 24 | * This program is also available under a commercial proprietary license. 25 | * For more information, contact us at licensing@x264.com. 26 | *****************************************************************************/ 27 | 28 | /* 29 | #include "config.h" 30 | */ 31 | #define GLUE(a, b) a ## b 32 | #define JOIN(a, b) GLUE(a, b) 33 | 34 | #ifdef PREFIX 35 | # define BASE _mvtools_ 36 | # define SYM_PREFIX _ 37 | #else 38 | # define BASE mvtools_ 39 | # define SYM_PREFIX 40 | #endif 41 | 42 | #ifdef BIT_DEPTH 43 | # define EXTERN_ASM JOIN(JOIN(BASE, BIT_DEPTH), _) 44 | #else 45 | # define EXTERN_ASM BASE 46 | #endif 47 | 48 | #define X(s) JOIN(EXTERN_ASM, s) 49 | #define X264(s) JOIN(BASE, s) 50 | #define EXT(s) JOIN(SYM_PREFIX, s) 51 | 52 | #ifdef __ELF__ 53 | # define ELF 54 | #else 55 | # define ELF # 56 | #endif 57 | 58 | #ifdef __MACH__ 59 | # define MACH 60 | #else 61 | # define MACH # 62 | #endif 63 | 64 | #if HAVE_AS_FUNC 65 | # define FUNC 66 | #else 67 | # define FUNC # 68 | #endif 69 | 70 | .macro function name, export=0, align=2 71 | .macro endfunc 72 | .if \export 73 | ELF .size EXTERN_ASM\name, . - EXTERN_ASM\name 74 | .else 75 | ELF .size \name, . 
- \name 76 | .endif 77 | FUNC .endfunc 78 | .purgem endfunc 79 | .endm 80 | .text 81 | .align \align 82 | .if \export 83 | .global EXTERN_ASM\name 84 | ELF .type EXTERN_ASM\name, %function 85 | FUNC .func EXTERN_ASM\name 86 | EXTERN_ASM\name: 87 | .else 88 | ELF .type \name, %function 89 | FUNC .func \name 90 | \name: 91 | .endif 92 | .endm 93 | 94 | .macro const name, align=2 95 | .macro endconst 96 | ELF .size \name, . - \name 97 | .purgem endconst 98 | .endm 99 | ELF .section .rodata 100 | MACH .const_data 101 | .align \align 102 | \name: 103 | .endm 104 | 105 | .macro movrel rd, val, offset=0 106 | #if defined(__APPLE__) 107 | .if \offset < 0 108 | adrp \rd, \val@PAGE 109 | add \rd, \rd, \val@PAGEOFF 110 | sub \rd, \rd, -(\offset) 111 | .else 112 | adrp \rd, \val+(\offset)@PAGE 113 | add \rd, \rd, \val+(\offset)@PAGEOFF 114 | .endif 115 | #elif defined(PIC) && defined(_WIN32) 116 | .if \offset < 0 117 | adrp \rd, \val 118 | add \rd, \rd, :lo12:\val 119 | sub \rd, \rd, -(\offset) 120 | .else 121 | adrp \rd, \val+(\offset) 122 | add \rd, \rd, :lo12:\val+(\offset) 123 | .endif 124 | #elif defined(PIC) 125 | adrp \rd, \val+(\offset) 126 | add \rd, \rd, :lo12:\val+(\offset) 127 | #else 128 | ldr \rd, =\val+\offset 129 | #endif 130 | .endm 131 | 132 | #define FDEC_STRIDE 32 133 | #define FENC_STRIDE 16 134 | 135 | 136 | .macro SUMSUB_AB sum, sub, a, b 137 | add \sum, \a, \b 138 | sub \sub, \a, \b 139 | .endm 140 | 141 | .macro unzip t1, t2, s1, s2 142 | uzp1 \t1, \s1, \s2 143 | uzp2 \t2, \s1, \s2 144 | .endm 145 | 146 | .macro transpose t1, t2, s1, s2 147 | trn1 \t1, \s1, \s2 148 | trn2 \t2, \s1, \s2 149 | .endm 150 | 151 | .macro transpose4x4.h v0, v1, v2, v3, t0, t1, t2, t3 152 | transpose \t0\().2s, \t2\().2s, \v0\().2s, \v2\().2s 153 | transpose \t1\().2s, \t3\().2s, \v1\().2s, \v3\().2s 154 | transpose \v0\().4h, \v1\().4h, \t0\().4h, \t1\().4h 155 | transpose \v2\().4h, \v3\().4h, \t2\().4h, \t3\().4h 156 | .endm 157 | 158 | .macro transpose4x8.h v0, v1, v2, 
v3, t0, t1, t2, t3 159 | transpose \t0\().4s, \t2\().4s, \v0\().4s, \v2\().4s 160 | transpose \t1\().4s, \t3\().4s, \v1\().4s, \v3\().4s 161 | transpose \v0\().8h, \v1\().8h, \t0\().8h, \t1\().8h 162 | transpose \v2\().8h, \v3\().8h, \t2\().8h, \t3\().8h 163 | .endm 164 | 165 | 166 | .macro transpose8x8.h r0, r1, r2, r3, r4, r5, r6, r7, r8, r9 167 | trn1 \r8\().8h, \r0\().8h, \r1\().8h 168 | trn2 \r9\().8h, \r0\().8h, \r1\().8h 169 | trn1 \r1\().8h, \r2\().8h, \r3\().8h 170 | trn2 \r3\().8h, \r2\().8h, \r3\().8h 171 | trn1 \r0\().8h, \r4\().8h, \r5\().8h 172 | trn2 \r5\().8h, \r4\().8h, \r5\().8h 173 | trn1 \r2\().8h, \r6\().8h, \r7\().8h 174 | trn2 \r7\().8h, \r6\().8h, \r7\().8h 175 | 176 | trn1 \r4\().4s, \r0\().4s, \r2\().4s 177 | trn2 \r2\().4s, \r0\().4s, \r2\().4s 178 | trn1 \r6\().4s, \r5\().4s, \r7\().4s 179 | trn2 \r7\().4s, \r5\().4s, \r7\().4s 180 | trn1 \r5\().4s, \r9\().4s, \r3\().4s 181 | trn2 \r9\().4s, \r9\().4s, \r3\().4s 182 | trn1 \r3\().4s, \r8\().4s, \r1\().4s 183 | trn2 \r8\().4s, \r8\().4s, \r1\().4s 184 | 185 | trn1 \r0\().2d, \r3\().2d, \r4\().2d 186 | trn2 \r4\().2d, \r3\().2d, \r4\().2d 187 | 188 | trn1 \r1\().2d, \r5\().2d, \r6\().2d 189 | trn2 \r5\().2d, \r5\().2d, \r6\().2d 190 | 191 | trn2 \r6\().2d, \r8\().2d, \r2\().2d 192 | trn1 \r2\().2d, \r8\().2d, \r2\().2d 193 | 194 | trn1 \r3\().2d, \r9\().2d, \r7\().2d 195 | trn2 \r7\().2d, \r9\().2d, \r7\().2d 196 | .endm 197 | 198 | .macro transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 199 | trn1 \t0\().16b, \r0\().16b, \r1\().16b 200 | trn2 \t1\().16b, \r0\().16b, \r1\().16b 201 | trn1 \r1\().16b, \r2\().16b, \r3\().16b 202 | trn2 \r3\().16b, \r2\().16b, \r3\().16b 203 | trn1 \r0\().16b, \r4\().16b, \r5\().16b 204 | trn2 \r5\().16b, \r4\().16b, \r5\().16b 205 | trn1 \r2\().16b, \r6\().16b, \r7\().16b 206 | trn2 \r7\().16b, \r6\().16b, \r7\().16b 207 | 208 | trn1 \r4\().8h, \r0\().8h, \r2\().8h 209 | trn2 \r2\().8h, \r0\().8h, \r2\().8h 210 | trn1 \r6\().8h, \r5\().8h, \r7\().8h 
211 | trn2 \r7\().8h, \r5\().8h, \r7\().8h 212 | trn1 \r5\().8h, \t1\().8h, \r3\().8h 213 | trn2 \t1\().8h, \t1\().8h, \r3\().8h 214 | trn1 \r3\().8h, \t0\().8h, \r1\().8h 215 | trn2 \t0\().8h, \t0\().8h, \r1\().8h 216 | 217 | trn1 \r0\().4s, \r3\().4s, \r4\().4s 218 | trn2 \r4\().4s, \r3\().4s, \r4\().4s 219 | 220 | trn1 \r1\().4s, \r5\().4s, \r6\().4s 221 | trn2 \r5\().4s, \r5\().4s, \r6\().4s 222 | 223 | trn2 \r6\().4s, \t0\().4s, \r2\().4s 224 | trn1 \r2\().4s, \t0\().4s, \r2\().4s 225 | 226 | trn1 \r3\().4s, \t1\().4s, \r7\().4s 227 | trn2 \r7\().4s, \t1\().4s, \r7\().4s 228 | .endm 229 | 230 | .macro transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7 231 | trn1 \t4\().16b, \r0\().16b, \r1\().16b 232 | trn2 \t5\().16b, \r0\().16b, \r1\().16b 233 | trn1 \t6\().16b, \r2\().16b, \r3\().16b 234 | trn2 \t7\().16b, \r2\().16b, \r3\().16b 235 | 236 | trn1 \r0\().8h, \t4\().8h, \t6\().8h 237 | trn2 \r2\().8h, \t4\().8h, \t6\().8h 238 | trn1 \r1\().8h, \t5\().8h, \t7\().8h 239 | trn2 \r3\().8h, \t5\().8h, \t7\().8h 240 | .endm 241 | 242 | .macro transpose_4x8.b r0, r1, r2, r3, t4, t5, t6, t7 243 | trn1 \t4\().8b, \r0\().8b, \r1\().8b 244 | trn2 \t5\().8b, \r0\().8b, \r1\().8b 245 | trn1 \t6\().8b, \r2\().8b, \r3\().8b 246 | trn2 \t7\().8b, \r2\().8b, \r3\().8b 247 | 248 | trn1 \r0\().4h, \t4\().4h, \t6\().4h 249 | trn2 \r2\().4h, \t4\().4h, \t6\().4h 250 | trn1 \r1\().4h, \t5\().4h, \t7\().4h 251 | trn2 \r3\().4h, \t5\().4h, \t7\().4h 252 | .endm 253 | -------------------------------------------------------------------------------- /src/asm/aarch64-pixel-a-common.S: -------------------------------------------------------------------------------- 1 | /**************************************************************************** 2 | * pixel-a-common.S: aarch64 pixel metrics 3 | ***************************************************************************** 4 | * Copyright (C) 2009-2024 x264 project 5 | * 6 | * Authors: David Conrad 7 | * Janne Grunau 8 | * David Chen 9 | * 10 
| * This program is free software; you can redistribute it and/or modify 11 | * it under the terms of the GNU General Public License as published by 12 | * the Free Software Foundation; either version 2 of the License, or 13 | * (at your option) any later version. 14 | * 15 | * This program is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with this program; if not, write to the Free Software 22 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 23 | * 24 | * This program is also available under a commercial proprietary license. 25 | * For more information, contact us at licensing@x264.com. 26 | *****************************************************************************/ 27 | 28 | // This file contains the NEON macros and constants that are intended to be used by 29 | // the SVE/SVE2 functions as well 30 | 31 | const mask_ac_4_8 32 | .short 0, -1, -1, -1, 0, -1, -1, -1 33 | .short 0, -1, -1, -1, -1, -1, -1, -1 34 | endconst 35 | 36 | .macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d 37 | SUMSUB_AB \s1, \d1, \a, \b 38 | SUMSUB_AB \s2, \d2, \c, \d 39 | .endm 40 | 41 | .macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4 42 | SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4 43 | SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4 44 | .endm 45 | -------------------------------------------------------------------------------- /src/asm/const-a.asm: -------------------------------------------------------------------------------- 1 | ;***************************************************************************** 2 | ;* const-a.asm: x86 global constants 3 | ;***************************************************************************** 4 | ;* Copyright (C) 2010-2022 x264 
project 5 | ;* 6 | ;* Authors: Loren Merritt 7 | ;* Fiona Glaser 8 | ;* 9 | ;* This program is free software; you can redistribute it and/or modify 10 | ;* it under the terms of the GNU General Public License as published by 11 | ;* the Free Software Foundation; either version 2 of the License, or 12 | ;* (at your option) any later version. 13 | ;* 14 | ;* This program is distributed in the hope that it will be useful, 15 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | ;* GNU General Public License for more details. 18 | ;* 19 | ;* You should have received a copy of the GNU General Public License 20 | ;* along with this program; if not, write to the Free Software 21 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 22 | ;* 23 | ;* This program is also available under a commercial proprietary license. 24 | ;* For more information, contact us at licensing@x264.com. 25 | ;***************************************************************************** 26 | 27 | %include "x86inc.asm" 28 | 29 | SECTION_RODATA 32 30 | 31 | const pb_1, times 32 db 1 32 | const hsub_mul, times 16 db 1, -1 33 | const pw_1, times 16 dw 1 34 | const pw_16, times 16 dw 16 35 | const pw_32, times 16 dw 32 36 | const pw_512, times 16 dw 512 37 | const pw_00ff, times 16 dw 0x00ff 38 | const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1) 39 | const pw_0to15, dw 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 40 | const pd_1, times 8 dd 1 41 | const pd_0123, dd 0,1,2,3 42 | const pd_4567, dd 4,5,6,7 43 | const deinterleave_shufd, dd 0,4,1,5,2,6,3,7 44 | const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 45 | const pb_unpackbd2, times 2 db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 46 | 47 | const pb_01, times 8 db 0,1 48 | const pb_0, times 16 db 0 49 | const pb_a1, times 16 db 0xa1 50 | const pb_3, times 16 db 3 51 | const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6 52 | 53 | const 
pw_2, times 8 dw 2 54 | const pw_m2, times 8 dw -2 55 | const pw_4, times 8 dw 4 56 | const pw_8, times 8 dw 8 57 | const pw_64, times 8 dw 64 58 | const pw_256, times 8 dw 256 59 | const pw_32_0, times 4 dw 32 60 | times 4 dw 0 61 | const pw_8000, times 8 dw 0x8000 62 | const pw_3fff, times 8 dw 0x3fff 63 | const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1 64 | const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1 65 | const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1 66 | const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0 67 | 68 | const pd_8, times 4 dd 8 69 | const pd_32, times 4 dd 32 70 | const pd_1024, times 4 dd 1024 71 | const pd_ffff, times 4 dd 0xffff 72 | const pw_ff00, times 8 dw 0xff00 73 | 74 | const popcnt_table 75 | %assign x 0 76 | %rep 256 77 | ; population count 78 | db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1) 79 | %assign x x+1 80 | %endrep 81 | 82 | const sw_64, dd 64 83 | -------------------------------------------------------------------------------- /src/asm/cpu-a.asm: -------------------------------------------------------------------------------- 1 | ;***************************************************************************** 2 | ;* cpu-a.asm: x86 cpu utilities 3 | ;***************************************************************************** 4 | ;* Copyright (C) 2003-2022 x264 project 5 | ;* 6 | ;* Authors: Laurent Aimar 7 | ;* Loren Merritt 8 | ;* Fiona Glaser 9 | ;* 10 | ;* This program is free software; you can redistribute it and/or modify 11 | ;* it under the terms of the GNU General Public License as published by 12 | ;* the Free Software Foundation; either version 2 of the License, or 13 | ;* (at your option) any later version. 14 | ;* 15 | ;* This program is distributed in the hope that it will be useful, 16 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | ;* GNU General Public License for more details. 
19 | ;* 20 | ;* You should have received a copy of the GNU General Public License 21 | ;* along with this program; if not, write to the Free Software 22 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 23 | ;* 24 | ;* This program is also available under a commercial proprietary license. 25 | ;* For more information, contact us at licensing@x264.com. 26 | ;***************************************************************************** 27 | 28 | %include "x86inc.asm" 29 | 30 | SECTION .text 31 | 32 | ;----------------------------------------------------------------------------- 33 | ; void cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx ) 34 | ;----------------------------------------------------------------------------- 35 | cglobal cpu_cpuid, 5,7 36 | push rbx 37 | push r4 38 | push r3 39 | push r2 40 | push r1 41 | mov eax, r0d 42 | xor ecx, ecx 43 | cpuid 44 | pop r4 45 | mov [r4], eax 46 | pop r4 47 | mov [r4], ebx 48 | pop r4 49 | mov [r4], ecx 50 | pop r4 51 | mov [r4], edx 52 | pop rbx 53 | RET 54 | 55 | ;----------------------------------------------------------------------------- 56 | ; uint64_t cpu_xgetbv( int xcr ) 57 | ;----------------------------------------------------------------------------- 58 | cglobal cpu_xgetbv 59 | movifnidn ecx, r0m 60 | xgetbv 61 | %if ARCH_X86_64 62 | shl rdx, 32 63 | or rax, rdx 64 | %endif 65 | ret 66 | 67 | ;----------------------------------------------------------------------------- 68 | ; void cpu_emms( void ) 69 | ;----------------------------------------------------------------------------- 70 | cglobal cpu_emms 71 | emms 72 | ret 73 | 74 | ;----------------------------------------------------------------------------- 75 | ; void cpu_sfence( void ) 76 | ;----------------------------------------------------------------------------- 77 | cglobal cpu_sfence 78 | sfence 79 | ret 80 | 81 | %if ARCH_X86_64 == 0 82 | 
;----------------------------------------------------------------------------- 83 | ; int cpu_cpuid_test( void ) 84 | ; return 0 if unsupported 85 | ;----------------------------------------------------------------------------- 86 | cglobal cpu_cpuid_test 87 | pushfd 88 | push ebx 89 | push ebp 90 | push esi 91 | push edi 92 | pushfd 93 | pop eax 94 | mov ebx, eax 95 | xor eax, 0x200000 96 | push eax 97 | popfd 98 | pushfd 99 | pop eax 100 | xor eax, ebx 101 | pop edi 102 | pop esi 103 | pop ebp 104 | pop ebx 105 | popfd 106 | ret 107 | %endif 108 | -------------------------------------------------------------------------------- /src/asm/pixel-32.asm: -------------------------------------------------------------------------------- 1 | ;***************************************************************************** 2 | ;* pixel-32.asm: x86_32 pixel metrics 3 | ;***************************************************************************** 4 | ;* Copyright (C) 2003-2022 x264 project 5 | ;* 6 | ;* Authors: Loren Merritt 7 | ;* Laurent Aimar 8 | ;* 9 | ;* This program is free software; you can redistribute it and/or modify 10 | ;* it under the terms of the GNU General Public License as published by 11 | ;* the Free Software Foundation; either version 2 of the License, or 12 | ;* (at your option) any later version. 13 | ;* 14 | ;* This program is distributed in the hope that it will be useful, 15 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | ;* GNU General Public License for more details. 18 | ;* 19 | ;* You should have received a copy of the GNU General Public License 20 | ;* along with this program; if not, write to the Free Software 21 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 22 | ;* 23 | ;* This program is also available under a commercial proprietary license. 24 | ;* For more information, contact us at licensing@x264.com. 
25 | ;***************************************************************************** 26 | 27 | %include "x86inc.asm" 28 | %include "x86util.asm" 29 | 30 | cextern pw_ppmmppmm 31 | cextern pw_pmpmpmpm 32 | 33 | SECTION .text 34 | INIT_MMX mmx2 35 | 36 | %if HIGH_BIT_DEPTH == 0 37 | 38 | %macro LOAD_DIFF_4x8P 1 ; dx 39 | LOAD_DIFF m0, m7, none, [r0+%1], [r2+%1] 40 | LOAD_DIFF m1, m6, none, [r0+%1+r1], [r2+%1+r3] 41 | LOAD_DIFF m2, m7, none, [r0+%1+r1*2], [r2+%1+r3*2] 42 | LOAD_DIFF m3, m6, none, [r0+%1+r4], [r2+%1+r5] 43 | lea r0, [r0+4*r1] 44 | lea r2, [r2+4*r3] 45 | LOAD_DIFF m4, m7, none, [r0+%1], [r2+%1] 46 | LOAD_DIFF m5, m6, none, [r0+%1+r1], [r2+%1+r3] 47 | LOAD_DIFF m6, m7, none, [r0+%1+r1*2], [r2+%1+r3*2] 48 | movq [spill], m5 49 | LOAD_DIFF m7, m5, none, [r0+%1+r4], [r2+%1+r5] 50 | movq m5, [spill] 51 | %endmacro 52 | 53 | %macro SUM4x8_MM 0 54 | movq [spill], m6 55 | movq [spill+8], m7 56 | ABSW2 m0, m1, m0, m1, m6, m7 57 | ABSW2 m2, m3, m2, m3, m6, m7 58 | paddw m0, m2 59 | paddw m1, m3 60 | movq m6, [spill] 61 | movq m7, [spill+8] 62 | ABSW2 m4, m5, m4, m5, m2, m3 63 | ABSW2 m6, m7, m6, m7, m2, m3 64 | paddw m4, m6 65 | paddw m5, m7 66 | paddw m0, m4 67 | paddw m1, m5 68 | paddw m0, m1 69 | %endmacro 70 | 71 | ;----------------------------------------------------------------------------- 72 | ; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) 73 | ;----------------------------------------------------------------------------- 74 | cglobal pixel_sa8d_8x8_internal 75 | push r0 76 | push r2 77 | sub esp, 0x74 78 | %define args esp+0x74 79 | %define spill esp+0x60 ; +16 80 | %define trans esp+0 ; +96 81 | LOAD_DIFF_4x8P 0 82 | HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 83 | 84 | movq [spill], m1 85 | TRANSPOSE4x4W 4, 5, 6, 7, 1 86 | movq [trans+0x00], m4 87 | movq [trans+0x08], m5 88 | movq [trans+0x10], m6 89 | movq [trans+0x18], m7 90 | movq m1, [spill] 91 | TRANSPOSE4x4W 0, 1, 2, 3, 4 92 | movq [trans+0x20], m0 93 | movq [trans+0x28], m1 94 | movq 
[trans+0x30], m2 95 | movq [trans+0x38], m3 96 | 97 | mov r0, [args+4] 98 | mov r2, [args] 99 | LOAD_DIFF_4x8P 4 100 | HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 101 | 102 | movq [spill], m7 103 | TRANSPOSE4x4W 0, 1, 2, 3, 7 104 | movq [trans+0x40], m0 105 | movq [trans+0x48], m1 106 | movq [trans+0x50], m2 107 | movq [trans+0x58], m3 108 | movq m7, [spill] 109 | TRANSPOSE4x4W 4, 5, 6, 7, 1 110 | movq m0, [trans+0x00] 111 | movq m1, [trans+0x08] 112 | movq m2, [trans+0x10] 113 | movq m3, [trans+0x18] 114 | 115 | HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 116 | SUM4x8_MM 117 | movq [trans], m0 118 | 119 | movq m0, [trans+0x20] 120 | movq m1, [trans+0x28] 121 | movq m2, [trans+0x30] 122 | movq m3, [trans+0x38] 123 | movq m4, [trans+0x40] 124 | movq m5, [trans+0x48] 125 | movq m6, [trans+0x50] 126 | movq m7, [trans+0x58] 127 | 128 | HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 129 | SUM4x8_MM 130 | 131 | pavgw m0, [trans] 132 | add esp, 0x7c 133 | ret 134 | %undef args 135 | %undef spill 136 | %undef trans 137 | 138 | %macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op 139 | pxor %7, %7 140 | pshufw %4, %1, q1032 141 | pshufw %5, %2, q1032 142 | pshufw %6, %3, q1032 143 | paddusw %1, %4 144 | paddusw %2, %5 145 | paddusw %3, %6 146 | punpcklwd %1, %7 147 | punpcklwd %2, %7 148 | punpcklwd %3, %7 149 | pshufw %4, %1, q1032 150 | pshufw %5, %2, q1032 151 | pshufw %6, %3, q1032 152 | %8 %1, %4 153 | %8 %2, %5 154 | %8 %3, %6 155 | %endmacro 156 | 157 | %macro LOAD_4x8P 1 ; dx 158 | pxor m7, m7 159 | movd m6, [r0+%1+7*FENC_STRIDE] 160 | movd m0, [r0+%1+0*FENC_STRIDE] 161 | movd m1, [r0+%1+1*FENC_STRIDE] 162 | movd m2, [r0+%1+2*FENC_STRIDE] 163 | movd m3, [r0+%1+3*FENC_STRIDE] 164 | movd m4, [r0+%1+4*FENC_STRIDE] 165 | movd m5, [r0+%1+5*FENC_STRIDE] 166 | punpcklbw m6, m7 167 | punpcklbw m0, m7 168 | punpcklbw m1, m7 169 | movq [spill], m6 170 | punpcklbw m2, m7 171 | punpcklbw m3, m7 172 | movd m6, [r0+%1+6*FENC_STRIDE] 173 | punpcklbw m4, m7 174 | punpcklbw m5, m7 175 | punpcklbw m6, m7 176 | movq m7, 
[spill] 177 | %endmacro 178 | 179 | %macro HSUMSUB2 4 180 | pshufw m4, %1, %3 181 | pshufw m5, %2, %3 182 | pmullw %1, %4 183 | pmullw m5, %4 184 | paddw %1, m4 185 | paddw %2, m5 186 | %endmacro 187 | 188 | ;----------------------------------------------------------------------------- 189 | ; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res ) 190 | ;----------------------------------------------------------------------------- 191 | cglobal intra_sa8d_x3_8x8, 2,3 192 | SUB esp, 0x94 193 | %define edge esp+0x70 ; +32 194 | %define spill esp+0x60 ; +16 195 | %define trans esp+0 ; +96 196 | %define sum esp+0 ; +32 197 | 198 | pxor m7, m7 199 | movq m0, [r1+7] 200 | movq m2, [r1+16] 201 | movq m1, m0 202 | movq m3, m2 203 | punpcklbw m0, m7 204 | punpckhbw m1, m7 205 | punpcklbw m2, m7 206 | punpckhbw m3, m7 207 | movq m6, [pw_ppmmppmm] 208 | HSUMSUB2 m0, m2, q1032, m6 209 | HSUMSUB2 m1, m3, q1032, m6 210 | movq m6, [pw_pmpmpmpm] 211 | HSUMSUB2 m0, m2, q2301, m6 212 | HSUMSUB2 m1, m3, q2301, m6 213 | movq m4, m0 214 | movq m5, m2 215 | paddw m0, m1 216 | paddw m2, m3 217 | psubw m4, m1 218 | psubw m3, m5 219 | movq [edge+0], m0 220 | movq [edge+8], m4 221 | movq [edge+16], m2 222 | movq [edge+24], m3 223 | 224 | LOAD_4x8P 0 225 | HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 226 | 227 | movq [spill], m0 228 | TRANSPOSE4x4W 4, 5, 6, 7, 0 229 | movq [trans+0x00], m4 230 | movq [trans+0x08], m5 231 | movq [trans+0x10], m6 232 | movq [trans+0x18], m7 233 | movq m0, [spill] 234 | TRANSPOSE4x4W 0, 1, 2, 3, 4 235 | movq [trans+0x20], m0 236 | movq [trans+0x28], m1 237 | movq [trans+0x30], m2 238 | movq [trans+0x38], m3 239 | 240 | LOAD_4x8P 4 241 | HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 242 | 243 | movq [spill], m7 244 | TRANSPOSE4x4W 0, 1, 2, 3, 7 245 | movq [trans+0x40], m0 246 | movq [trans+0x48], m1 247 | movq [trans+0x50], m2 248 | movq [trans+0x58], m3 249 | movq m7, [spill] 250 | TRANSPOSE4x4W 4, 5, 6, 7, 0 251 | movq m0, [trans+0x00] 252 | movq m1, [trans+0x08] 253 
| movq m2, [trans+0x10] 254 | movq m3, [trans+0x18] 255 | 256 | HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 257 | 258 | movq [spill+0], m0 259 | movq [spill+8], m1 260 | ABSW2 m2, m3, m2, m3, m0, m1 261 | ABSW2 m4, m5, m4, m5, m0, m1 262 | paddw m2, m4 263 | paddw m3, m5 264 | ABSW2 m6, m7, m6, m7, m4, m5 265 | movq m0, [spill+0] 266 | movq m1, [spill+8] 267 | paddw m2, m6 268 | paddw m3, m7 269 | paddw m2, m3 270 | ABSW m1, m1, m4 271 | paddw m2, m1 ; 7x4 sum 272 | movq m7, m0 273 | movq m1, [edge+8] ; left bottom 274 | psllw m1, 3 275 | psubw m7, m1 276 | ABSW2 m0, m7, m0, m7, m5, m3 277 | paddw m0, m2 278 | paddw m7, m2 279 | movq [sum+0], m0 ; dc 280 | movq [sum+8], m7 ; left 281 | 282 | movq m0, [trans+0x20] 283 | movq m1, [trans+0x28] 284 | movq m2, [trans+0x30] 285 | movq m3, [trans+0x38] 286 | movq m4, [trans+0x40] 287 | movq m5, [trans+0x48] 288 | movq m6, [trans+0x50] 289 | movq m7, [trans+0x58] 290 | 291 | HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 292 | 293 | movd [sum+0x10], m0 294 | movd [sum+0x12], m1 295 | movd [sum+0x14], m2 296 | movd [sum+0x16], m3 297 | movd [sum+0x18], m4 298 | movd [sum+0x1a], m5 299 | movd [sum+0x1c], m6 300 | movd [sum+0x1e], m7 301 | 302 | movq [spill], m0 303 | movq [spill+8], m1 304 | ABSW2 m2, m3, m2, m3, m0, m1 305 | ABSW2 m4, m5, m4, m5, m0, m1 306 | paddw m2, m4 307 | paddw m3, m5 308 | paddw m2, m3 309 | movq m0, [spill] 310 | movq m1, [spill+8] 311 | ABSW2 m6, m7, m6, m7, m4, m5 312 | ABSW m1, m1, m3 313 | paddw m2, m7 314 | paddw m1, m6 315 | paddw m2, m1 ; 7x4 sum 316 | movq m1, m0 317 | 318 | movq m7, [edge+0] 319 | psllw m7, 3 ; left top 320 | 321 | mov r2, [edge+0] 322 | add r2, [edge+16] 323 | lea r2, [4*r2+32] 324 | and r2, 0xffc0 325 | movd m6, r2 ; dc 326 | 327 | psubw m1, m7 328 | psubw m0, m6 329 | ABSW2 m0, m1, m0, m1, m5, m6 330 | movq m3, [sum+0] ; dc 331 | paddw m0, m2 332 | paddw m1, m2 333 | movq m2, m0 334 | paddw m0, m3 335 | paddw m1, [sum+8] ; h 336 | psrlq m2, 16 337 | paddw m2, m3 338 | 339 | movq m3, 
[edge+16] ; top left 340 | movq m4, [edge+24] ; top right 341 | psllw m3, 3 342 | psllw m4, 3 343 | psubw m3, [sum+16] 344 | psubw m4, [sum+24] 345 | ABSW2 m3, m4, m3, m4, m5, m6 346 | paddw m2, m3 347 | paddw m2, m4 ; v 348 | 349 | SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, pavgw 350 | mov r2, r2m 351 | pxor m7, m7 352 | punpckldq m2, m1 353 | pavgw m0, m7 354 | pavgw m2, m7 355 | movd [r2+8], m0 ; dc 356 | movq [r2+0], m2 ; v, h 357 | ADD esp, 0x94 358 | RET 359 | %undef edge 360 | %undef spill 361 | %undef trans 362 | %undef sum 363 | 364 | 365 | 366 | ;----------------------------------------------------------------------------- 367 | ; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1, 368 | ; const uint8_t *pix2, intptr_t stride2, int sums[2][4] ) 369 | ;----------------------------------------------------------------------------- 370 | cglobal pixel_ssim_4x4x2_core, 0,5 371 | mov r1, r1m 372 | mov r3, r3m 373 | mov r4, 4 374 | pxor m0, m0 375 | .loop: 376 | mov r0, r0m 377 | mov r2, r2m 378 | add r0, r4 379 | add r2, r4 380 | pxor m1, m1 381 | pxor m2, m2 382 | pxor m3, m3 383 | pxor m4, m4 384 | %rep 4 385 | movd m5, [r0] 386 | movd m6, [r2] 387 | punpcklbw m5, m0 388 | punpcklbw m6, m0 389 | paddw m1, m5 390 | paddw m2, m6 391 | movq m7, m5 392 | pmaddwd m5, m5 393 | pmaddwd m7, m6 394 | pmaddwd m6, m6 395 | paddd m3, m5 396 | paddd m4, m7 397 | paddd m3, m6 398 | add r0, r1 399 | add r2, r3 400 | %endrep 401 | mov r0, r4m 402 | lea r0, [r0+r4*4] 403 | pshufw m5, m1, q0032 404 | pshufw m6, m2, q0032 405 | paddusw m1, m5 406 | paddusw m2, m6 407 | punpcklwd m1, m2 408 | pshufw m2, m1, q0032 409 | pshufw m5, m3, q0032 410 | pshufw m6, m4, q0032 411 | paddusw m1, m2 412 | paddd m3, m5 413 | paddd m4, m6 414 | punpcklwd m1, m0 415 | punpckldq m3, m4 416 | movq [r0+0], m1 417 | movq [r0+8], m3 418 | sub r4, 4 419 | jge .loop 420 | emms 421 | RET 422 | 423 | %endif ; !HIGH_BIT_DEPTH 424 | 
--------------------------------------------------------------------------------