├── .github └── workflows │ └── build-windows.yml ├── .gitignore ├── meson.build ├── meson_options.txt ├── readme.rst └── src ├── Bullshit.h ├── CPU.c ├── CPU.h ├── CommonFunctions.h ├── CommonMacros.h ├── CopyCode.cpp ├── CopyCode.h ├── DCTFFTW.cpp ├── DCTFFTW.h ├── EntryPoint.c ├── Fakery.c ├── Fakery.h ├── GroupOfPlanes.c ├── GroupOfPlanes.h ├── Luma.cpp ├── Luma.h ├── MVAnalyse.c ├── MVAnalysisData.c ├── MVAnalysisData.h ├── MVBlockFPS.c ├── MVCompensate.c ├── MVDegrains.cpp ├── MVDegrains.h ├── MVDegrains_AVX2.cpp ├── MVDepan.cpp ├── MVFinest.c ├── MVFlow.cpp ├── MVFlowBlur.c ├── MVFlowFPS.c ├── MVFlowFPSHelper.c ├── MVFlowFPSHelper.h ├── MVFlowInter.c ├── MVFrame.cpp ├── MVFrame.h ├── MVFrame_AVX2.cpp ├── MVMask.c ├── MVRecalculate.c ├── MVSCDetection.c ├── MVSuper.c ├── MaskFun.cpp ├── MaskFun.h ├── MaskFun_AVX2.cpp ├── Overlap.cpp ├── Overlap.h ├── Overlap_AVX2.cpp ├── PlaneOfBlocks.cpp ├── PlaneOfBlocks.h ├── SADFunctions.cpp ├── SADFunctions.h ├── SADFunctions_AVX2.cpp ├── SimpleResize.cpp ├── SimpleResize.h ├── SimpleResize_AVX2.cpp ├── asm ├── aarch64-asm.S ├── aarch64-pixel-a-common.S ├── aarch64-pixel-a.S ├── const-a.asm ├── cpu-a.asm ├── include │ ├── x86inc.asm │ └── x86util.asm ├── pixel-32.asm ├── pixel-a.asm └── sad-a.asm └── sse2neon.h /.github/workflows/build-windows.yml: -------------------------------------------------------------------------------- 1 | name: Build-Windows 2 | 3 | on: workflow_dispatch 4 | 5 | permissions: 6 | attestations: write 7 | contents: read 8 | id-token: write 9 | 10 | jobs: 11 | build-windows-x64: 12 | runs-on: windows-latest 13 | defaults: 14 | run: 15 | shell: msys2 {0} 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@v4 19 | with: 20 | submodules: recursive 21 | - name: Setup MSYS2 22 | uses: msys2/setup-msys2@v2 23 | with: 24 | msystem: MINGW64 25 | update: true 26 | install: >- 27 | base-devel 28 | mingw-w64-x86_64-jq 29 | mingw-w64-x86_64-gcc 30 | mingw-w64-x86_64-pkg-config 31 | 
mingw-w64-x86_64-vapoursynth 32 | mingw-w64-x86_64-meson 33 | mingw-w64-x86_64-ninja 34 | mingw-w64-x86_64-nasm 35 | mingw-w64-x86_64-fftw 36 | - name: Build vs-mvtools 37 | run: | 38 | meson setup build --buildtype release --prefer-static --default-library=static -Dcpp_link_args='-static' 39 | meson compile -vC build 40 | - name: Export version 41 | run: | 42 | echo "ARTIFACT_VERSION=$(meson introspect --projectinfo build | jq -r '.version')" >> $GITHUB_ENV 43 | - name: Upload 44 | uses: actions/upload-artifact@v4.3.3 45 | with: 46 | name: mvtools-windows-x64-${{ env.ARTIFACT_VERSION }} 47 | path: build/libmvtools.dll 48 | build-windows-x86: 49 | runs-on: windows-latest 50 | defaults: 51 | run: 52 | shell: msys2 {0} 53 | steps: 54 | - name: Checkout code 55 | uses: actions/checkout@v4 56 | with: 57 | submodules: recursive 58 | - name: Setup MSYS2 59 | uses: msys2/setup-msys2@v2 60 | with: 61 | msystem: MINGW32 62 | update: true 63 | install: >- 64 | base-devel 65 | mingw-w64-i686-jq 66 | mingw-w64-i686-gcc 67 | mingw-w64-i686-pkg-config 68 | mingw-w64-i686-vapoursynth 69 | mingw-w64-i686-meson 70 | mingw-w64-i686-ninja 71 | mingw-w64-i686-nasm 72 | mingw-w64-i686-fftw 73 | - name: Build vs-mvtools 74 | run: | 75 | meson setup build --buildtype release --prefer-static --default-library=static -Dcpp_link_args='-static' 76 | meson compile -vC build 77 | - name: Export version 78 | run: | 79 | echo "ARTIFACT_VERSION=$(meson introspect --projectinfo build | jq -r '.version')" >> $GITHUB_ENV 80 | - name: Upload 81 | uses: actions/upload-artifact@v4.3.3 82 | with: 83 | name: mvtools-windows-x86-${{ env.ARTIFACT_VERSION }} 84 | path: build/libmvtools.dll 85 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | orig/ 2 | *.zip 3 | releases/ 4 | test stuff/ 5 | *.o 6 | *.la 7 | *.lo 8 | *.so 9 | *.dll 10 | .libs 11 | Makefile 12 | Makefile.in 13 | 
aclocal.m4 14 | autom4te.cache 15 | compile 16 | config.guess 17 | config.log 18 | config.status 19 | config.sub 20 | configure 21 | depcomp 22 | install-sh 23 | libtool 24 | ltmain.sh 25 | missing 26 | .deps/ 27 | .dirstamp 28 | .cache/ 29 | .venv/ 30 | -------------------------------------------------------------------------------- /meson.build: -------------------------------------------------------------------------------- 1 | project('MVTools', 'c', 'cpp', 2 | version: '24', 3 | default_options: ['c_std=c99', 'cpp_std=c++11', 'buildtype=release', 'b_lto=true'], 4 | meson_version: '>=0.46') 5 | 6 | 7 | warnings = [ 8 | '-Wall', 9 | '-Wextra', 10 | '-Wshadow', 11 | ] 12 | 13 | cflags = [ 14 | warnings, 15 | '-fvisibility=hidden', 16 | '-DPACKAGE_VERSION="@0@"'.format(meson.project_version()), 17 | ] 18 | 19 | ldflags = [ 20 | ] 21 | 22 | nasm_flags = [ 23 | '-I@0@'.format(join_paths(meson.current_source_dir(), 'src/asm/include/')), 24 | '-w', 25 | '-Worphan-labels', 26 | '-Wunrecognized-char', 27 | '-Dprivate_prefix=mvtools', 28 | '-DHIGH_BIT_DEPTH=0', 29 | '-DBIT_DEPTH=8', 30 | ] 31 | 32 | 33 | vapoursynth_dep = dependency('vapoursynth').partial_dependency(includes: true, compile_args: true) 34 | 35 | 36 | sources = [ 37 | 'src/CopyCode.cpp', 38 | 'src/CPU.c', 39 | 'src/DCTFFTW.cpp', 40 | 'src/EntryPoint.c', 41 | 'src/Fakery.c', 42 | 'src/GroupOfPlanes.c', 43 | 'src/Luma.cpp', 44 | 'src/MaskFun.cpp', 45 | 'src/MVAnalyse.c', 46 | 'src/MVAnalysisData.c', 47 | 'src/MVBlockFPS.c', 48 | 'src/MVCompensate.c', 49 | 'src/MVDegrains.cpp', 50 | 'src/MVDepan.cpp', 51 | 'src/MVFinest.c', 52 | 'src/MVFlow.cpp', 53 | 'src/MVFlowBlur.c', 54 | 'src/MVFlowFPS.c', 55 | 'src/MVFlowFPSHelper.c', 56 | 'src/MVFlowInter.c', 57 | 'src/MVFrame.cpp', 58 | 'src/MVMask.c', 59 | 'src/MVRecalculate.c', 60 | 'src/MVSCDetection.c', 61 | 'src/MVSuper.c', 62 | 'src/Overlap.cpp', 63 | 'src/PlaneOfBlocks.cpp', 64 | 'src/SADFunctions.cpp', 65 | 'src/SimpleResize.cpp', 66 | ] 67 | 68 | 69 | 
debug_build = get_option('buildtype').startswith('debug') 70 | 71 | 72 | host_cpu_family = host_machine.cpu_family() 73 | 74 | if host_cpu_family == 'x86' 75 | host_bits = 32 76 | elif host_cpu_family == 'x86_64' 77 | host_bits = 64 78 | elif host_cpu_family == 'arm' 79 | host_bits = 32 80 | elif host_cpu_family == 'aarch64' 81 | host_bits = 64 82 | endif 83 | 84 | 85 | host_system = host_machine.system() 86 | 87 | if host_system == 'windows' or host_system == 'cygwin' 88 | if host_cpu_family == 'x86' 89 | cflags += '-mstackrealign' 90 | ldflags += '-Wl,--kill-at' 91 | nasm_flags += '-DPREFIX' 92 | endif 93 | 94 | nasm_flags += ['-f', 'win@0@'.format(host_bits)] 95 | elif host_system == 'linux' or host_system == 'bsd' # The BSDs are close enough, right? 96 | if debug_build 97 | nasm_flags += '-gdwarf' 98 | endif 99 | 100 | nasm_flags += ['-f', 'elf@0@'.format(host_bits)] 101 | elif host_system == 'darwin' 102 | if debug_build 103 | nasm_flags += '-gdwarf' 104 | endif 105 | 106 | nasm_flags += ['-DPREFIX', '-f', 'macho@0@'.format(host_bits)] 107 | cflags += ['-DPREFIX'] 108 | else 109 | error('Unknown host system "@0@".'.format(host_system)) 110 | endif 111 | 112 | 113 | helper_libs = [] 114 | 115 | 116 | if host_cpu_family.startswith('x86') 117 | cflags += ['-mfpmath=sse', '-msse2', '-DMVTOOLS_X86=1'] 118 | 119 | 120 | nasm_sources = [ 121 | 'src/asm/const-a.asm', 122 | 'src/asm/cpu-a.asm', 123 | 'src/asm/pixel-a.asm', 124 | 'src/asm/sad-a.asm', 125 | ] 126 | 127 | 128 | if host_cpu_family == 'x86' 129 | nasm_flags += '-DARCH_X86_64=0' 130 | 131 | nasm_sources += [ 132 | 'src/asm/pixel-32.asm', 133 | ] 134 | else 135 | nasm_flags += ['-DARCH_X86_64=1', '-DPIC'] 136 | endif 137 | 138 | nasm = find_program(get_option('with_nasm')) 139 | 140 | outputname = '@BASENAME@.o' 141 | if host_system == 'windows' 142 | outputname = '@BASENAME@.obj' 143 | endif 144 | 145 | nasm_gen = generator(nasm, 146 | output: outputname, 147 | arguments: nasm_flags + ['@INPUT@', '-o', 
'@OUTPUT@']) 148 | 149 | sources += nasm_gen.process(nasm_sources) 150 | 151 | 152 | libavx2_sources = [ 153 | 'src/MaskFun_AVX2.cpp', 154 | 'src/MVDegrains_AVX2.cpp', 155 | 'src/MVFrame_AVX2.cpp', 156 | 'src/Overlap_AVX2.cpp', 157 | 'src/SADFunctions_AVX2.cpp', 158 | 'src/SimpleResize_AVX2.cpp', 159 | ] 160 | 161 | helper_libs += static_library('avx2', 162 | libavx2_sources, 163 | dependencies: vapoursynth_dep, 164 | cpp_args: [cflags, '-mavx2', '-mtune=haswell'], 165 | install: false) 166 | endif 167 | 168 | if host_cpu_family.startswith('arm') or host_cpu_family.startswith('aarch64') 169 | cflags += ['-DMVTOOLS_ARM=1'] 170 | 171 | if host_cpu_family.startswith('aarch64') 172 | asm_sources = [ 173 | 'src/asm/aarch64-pixel-a.S', 174 | ] 175 | 176 | sources += asm_sources 177 | endif 178 | endif 179 | 180 | 181 | cxx = meson.get_compiler('cpp') 182 | 183 | 184 | deps = [ 185 | vapoursynth_dep, 186 | dependency('fftw3f'), 187 | cxx.find_library('m', required: false), 188 | ] 189 | 190 | shared_module('mvtools', 191 | sources, 192 | dependencies: deps, 193 | link_args: ldflags, 194 | c_args: cflags, 195 | cpp_args: cflags, 196 | link_with: helper_libs, 197 | install: true) 198 | -------------------------------------------------------------------------------- /meson_options.txt: -------------------------------------------------------------------------------- 1 | option('with_nasm', 2 | type: 'string', 3 | value: 'nasm', 4 | description: 'Location of the NASM executable. Only relevant on x86 hosts.') 5 | -------------------------------------------------------------------------------- /readme.rst: -------------------------------------------------------------------------------- 1 | Description 2 | =========== 3 | 4 | MVTools is a set of filters for motion estimation and compensation. 5 | 6 | This is a port of version 2.5.11.20 of the Avisynth plugin. 
7 | 8 | Some changes from version 2.5.11.9 of the SVP fork have been incorporated as well (http://www.svp-team.com/wiki/Download). 9 | 10 | The filter DepanEstimate was ported from the Avisynth plugin DepanEstimate, version 1.10. 11 | 12 | The filters DepanCompensate and DepanStabilise were ported from the Avisynth plugin Depan, version 1.13.1. 13 | 14 | 15 | Differences 16 | =========== 17 | 18 | * All: 19 | * Free multithreading, courtesy of VapourSynth. 20 | 21 | * Parameters are all lowercase now. 22 | 23 | * YUY2 is not supported. 24 | 25 | * Grayscale, 4:2:0, 4:2:2, 4:4:0, and 4:4:4 are supported, except for DepanCompensate and DepanStabilise, which don't support 4:4:0. 26 | 27 | * Up to 16 bits per sample are supported. 28 | 29 | * The audio is definitely not killed. 30 | 31 | * No "planar" parameter. 32 | 33 | * "isse" parameter renamed to "opt". 34 | 35 | * Analyse: 36 | * No "temporal" parameter, as it's sort of incompatible with multithreading. 37 | 38 | * No "outfile" parameter. 39 | 40 | * No "sadx264" parameter. If opt is True, the best functions imported from x264 will be selected automatically. Otherwise, only C functions will be used. 41 | 42 | * New parameters "fields" and "tff". 43 | 44 | * The optimised SAD, SATD, and SSD functions from x264 have been updated to the latest versions (as of September 2014). 45 | 46 | * Block sizes of 64x32, 64x64, 128x64, and 128x128 are supported. 47 | 48 | * The "dct" parameter can be 5..10 even with blocks larger than 16x16. 49 | 50 | * Recalculate: 51 | * Same as Analyse. 52 | 53 | * Compensate: 54 | * No "recursion" parameter. It was dodgy. 55 | 56 | * New parameter "tff". 57 | 58 | * Flow 59 | * New parameter "tff". 60 | 61 | * SCDetection: 62 | * No "ysc" parameter. The input frames are returned unchanged, with the ``_SceneChangePrev`` or ``_SceneChangeNext`` property attached. 63 | 64 | * No "isse" parameter. It wasn't used. 65 | 66 | * DepanAnalyse: 67 | * Formerly "MDepan". 
68 | 69 | * New parameters "fields" and "tff". 70 | 71 | * No "log", "range", "isse" parameters. 72 | 73 | * DepanEstimate: 74 | * New parameters "fields" and "tff". 75 | 76 | * No "range", "log", "debug", "extlog" parameters. 77 | 78 | * DepanCompensate: 79 | * Formerly "DePan". 80 | 81 | * No "inputlog" parameter. 82 | 83 | * DepanStabilise: 84 | * Formerly "DePanStabilize". 85 | 86 | * No "inputlog" parameter. 87 | 88 | * Methods -1 and 2 unavailable. 89 | 90 | 91 | Usage 92 | ===== 93 | :: 94 | 95 | mv.Super(clip clip[, int hpad=16, int vpad=16, int pel=2, int levels=0, bint chroma=True, int sharp=2, int rfilter=2, clip pelclip=None, bint opt=True]) 96 | 97 | mv.Analyse(clip super[, int blksize=8, int blksizev=blksize, int levels=0, int search=4, int searchparam=2, int pelsearch=0, bint isb=False, int lambda, bint chroma=True, int delta=1, bint truemotion=True, int lsad, int plevel, int global, int pnew, int pzero=pnew, int pglobal=0, int overlap=0, int overlapv=overlap, bint divide=False, int badsad=10000, int badrange=24, bint opt=True, bint meander=True, bint trymany=False, bint fields=False, bint tff, int search_coarse=3, int dct=0]) 98 | 99 | mv.Recalculate(clip super, clip vectors[, int blksize=8, int blksizev=blksize, int search=4, int searchparam=2, int lambda, bint chroma=True, bint truemotion=True, int pnew, int overlap=0, int overlapv=overlap, bint divide=False, bint opt=True, bint meander=True, bint fields=False, bint tff, int dct=0]) 100 | 101 | mv.Compensate(clip clip, clip super, clip vectors[, int scbehavior=1, int thsad=10000, bint fields=False, float time=100.0, int thscd1=400, int thscd2=130, bint opt=True, bint tff]) 102 | 103 | mv.Degrain1(clip clip, clip super, clip mvbw, clip mvfw[, int thsad=400, int thsadc=thsad, int plane=4, int limit=255, int limitc=limit, int thscd1=400, int thscd2=130, bint opt=True]) 104 | 105 | mv.Degrain2(clip clip, clip super, clip mvbw, clip mvfw, clip mvbw2, clip mvfw2[, int thsad=400, int thsadc=thsad, int 
plane=4, int limit=255, int limitc=limit, int thscd1=400, int thscd2=130, bint opt=True]) 106 | 107 | mv.Degrain3(clip clip, clip super, clip mvbw, clip mvfw, clip mvbw2, clip mvfw2, clip mvbw3, clip mvfw3[, int thsad=400, int thsadc=thsad, int plane=4, int limit=255, int limitc=limit, int thscd1=400, int thscd2=130, bint opt=True]) 108 | 109 | mv.Mask(clip clip, clip vectors[, float ml=100.0, float gamma=1.0, int kind=0, float time=100.0, int ysc=0, int thscd1=400, int thscd2=130, bint opt=True]) 110 | 111 | mv.Finest(clip super[, bint opt=True]) 112 | 113 | mv.Flow(clip clip, clip super, clip vectors[, float time=100.0, int mode=0, bint fields=False, int thscd1=400, int thscd2=130, bint opt=True, bint tff]) 114 | 115 | mv.FlowBlur(clip clip, clip super, clip mvbw, clip mvfw[, float blur=50.0, int prec=1, int thscd1=400, int thscd2=130, bint opt=True]) 116 | 117 | mv.FlowInter(clip clip, clip super, clip mvbw, clip mvfw[, float time=50.0, float ml=100.0, bint blend=True, int thscd1=400, int thscd2=130, bint opt=True]) 118 | 119 | mv.FlowFPS(clip clip, clip super, clip mvbw, clip mvfw[, int num=25, int den=1, int mask=2, float ml=100.0, bint blend=True, int thscd1=400, int thscd2=130, bint opt=True]) 120 | 121 | mv.BlockFPS(clip clip, clip super, clip mvbw, clip mvfw[, int num=25, int den=1, int mode=3, float ml=100.0, bint blend=True, int thscd1=400, int thscd2=130, bint opt=True]) 122 | 123 | mv.SCDetection(clip clip, clip vectors[, int thscd1=400, int thscd2=130]) 124 | 125 | mv.DepanAnalyse(clip clip, clip vectors[, clip mask, bint zoom=True, bint rot=True, float pixaspect=1.0, float error=15.0, bint info=False, float wrong=10.0, float zerow=0.05, int thscd1=400, int thscd2=130, bint fields=False, bint tff]) 126 | 127 | mv.DepanEstimate(clip clip[, float trust=4.0, int winx=0, int winy=0, int wleft=-1, int wtop=-1, int dxmax=-1, int dymax=-1, float zoommax=1.0, float stab=1.0, float pixaspect=1.0, bint info=False, bint show=False, bint fields=False, bint tff]) 
128 | 129 | mv.DepanCompensate(clip clip, clip data[, float offset=0.0, int subpixel=2, float pixaspect=1.0, bint matchfields=True, int mirror=0, int blur=0, bint info=False, bint fields=False, bint tff]) 130 | 131 | mv.DepanStabilise(clip clip, clip data[, float cutoff=1.0, float damping=0.9, float initzoom=1.0, bint addzoom=False, int prev=0, int next=0, int mirror=0, int blur=0, float dxmax=60.0, float dymax=30.0, float zoommax=1.05, float rotmax=1.0, int subpixel=2, float pixaspect=1.0, int fitlast=0, float tzoom=3.0, bint info=False, int method=0, bint fields=False]) 132 | 133 | 134 | If *fields* is True, it is assumed that the clip named *clip* first went through std.SeparateFields. 135 | 136 | For information about the other parameters, consult the Avisynth plugins' documentation at http://avisynth.org.ru/mvtools/mvtools2.html or http://www.avisynth.nl/users/fizick/depan/depan.html. This will not be necessary in the future. 137 | 138 | 139 | Compilation 140 | =========== 141 | 142 | FFTW3 configured for 32 bit floats is required ("fftw3f"). 143 | 144 | :: 145 | 146 | meson setup build 147 | ninja -C build 148 | 149 | 150 | License 151 | ======= 152 | 153 | GPL 2, like the Avisynth plugins. 154 | -------------------------------------------------------------------------------- /src/Bullshit.h: -------------------------------------------------------------------------------- 1 | #ifndef BULLSHIT_H 2 | #define BULLSHIT_H 3 | 4 | #if defined(_MSC_VER) && _MSC_VER < 1900 5 | // Don't forget to zero the last byte. _snprintf doesn't do it if the string doesn't fit. 
6 | #define snprintf _snprintf 7 | #endif 8 | 9 | #endif // BULLSHIT_H 10 | -------------------------------------------------------------------------------- /src/CPU.c: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * cpu.c: h264 encoder library 3 | ***************************************************************************** 4 | * Copyright (C) 2003 Laurent Aimar 5 | * $Id: cpu.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $ 6 | * 7 | * Authors: Laurent Aimar 8 | * 9 | * This program is free software; you can redistribute it and/or modify 10 | * it under the terms of the GNU General Public License as published by 11 | * the Free Software Foundation; either version 2 of the License, or 12 | * (at your option) any later version. 13 | * 14 | * This program is distributed in the hope that it will be useful, 15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | * GNU General Public License for more details. 18 | * 19 | * You should have received a copy of the GNU General Public License 20 | * along with this program; if not, write to the Free Software 21 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
22 | *****************************************************************************/ 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | #include "CPU.h" 29 | 30 | 31 | #if defined(MVTOOLS_X86) 32 | 33 | uint32_t cpu_detect(void) { 34 | uint32_t cpu = 0; 35 | uint32_t eax, ebx, ecx, edx; 36 | uint32_t vendor[4] = { 0 }; 37 | uint32_t max_extended_cap, max_basic_cap; 38 | int cache; 39 | 40 | 41 | mvtools_cpu_cpuid(0, &eax, vendor + 0, vendor + 2, vendor + 1); 42 | max_basic_cap = eax; 43 | if (max_basic_cap == 0) 44 | return 0; 45 | 46 | mvtools_cpu_cpuid(1, &eax, &ebx, &ecx, &edx); 47 | if (edx & 0x00800000) 48 | cpu |= X264_CPU_MMX; 49 | else 50 | return cpu; 51 | if (edx & 0x02000000) 52 | cpu |= X264_CPU_MMX2 | X264_CPU_SSE; 53 | if (edx & 0x00008000) 54 | cpu |= X264_CPU_CMOV; 55 | else 56 | return cpu; 57 | if (edx & 0x04000000) 58 | cpu |= X264_CPU_SSE2; 59 | if (ecx & 0x00000001) 60 | cpu |= X264_CPU_SSE3; 61 | if (ecx & 0x00000200) 62 | cpu |= X264_CPU_SSSE3; 63 | if (ecx & 0x00080000) 64 | cpu |= X264_CPU_SSE4; 65 | if (ecx & 0x00100000) 66 | cpu |= X264_CPU_SSE42; 67 | 68 | if (ecx & 0x08000000) { /* XGETBV supported and XSAVE enabled by OS */ 69 | uint64_t xcr0 = mvtools_cpu_xgetbv(0); 70 | if ((xcr0 & 0x6) == 0x6) { /* XMM/YMM state */ 71 | if (ecx & 0x10000000) 72 | cpu |= X264_CPU_AVX; 73 | if (ecx & 0x00001000) 74 | cpu |= X264_CPU_FMA3; 75 | 76 | if (max_basic_cap >= 7) { 77 | mvtools_cpu_cpuid(7, &eax, &ebx, &ecx, &edx); 78 | if (ebx & 0x00000020) 79 | cpu |= X264_CPU_AVX2; 80 | } 81 | } 82 | } 83 | 84 | if (cpu & X264_CPU_SSSE3) 85 | cpu |= X264_CPU_SSE2_IS_FAST; 86 | 87 | mvtools_cpu_cpuid(0x80000000, &eax, &ebx, &ecx, &edx); 88 | max_extended_cap = eax; 89 | 90 | if (max_extended_cap >= 0x80000001) { 91 | mvtools_cpu_cpuid(0x80000001, &eax, &ebx, &ecx, &edx); 92 | 93 | if (ecx & 0x00000020) 94 | cpu |= X264_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */ 95 | if (ecx & 0x00000040) /* SSE4a, AMD only */ 96 | { 97 | int family 
= ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); 98 | cpu |= X264_CPU_SSE2_IS_FAST; /* Phenom and later CPUs have fast SSE units */ 99 | if (family == 0x14) { 100 | cpu &= ~X264_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */ 101 | cpu |= X264_CPU_SSE2_IS_SLOW; /* Bobcat has 64-bit SIMD units */ 102 | cpu |= X264_CPU_SLOW_PALIGNR; /* palignr is insanely slow on Bobcat */ 103 | } 104 | if (family == 0x16) { 105 | cpu |= X264_CPU_SLOW_PSHUFB; /* Jaguar's pshufb isn't that slow, but it's slow enough 106 | * compared to alternate instruction sequences that this 107 | * is equal or faster on almost all such functions. */ 108 | } 109 | } 110 | 111 | if (cpu & X264_CPU_AVX) { 112 | if (ecx & 0x00000800) /* XOP */ 113 | cpu |= X264_CPU_XOP; 114 | if (ecx & 0x00010000) /* FMA4 */ 115 | cpu |= X264_CPU_FMA4; 116 | } 117 | 118 | if (!strcmp((char *)vendor, "AuthenticAMD")) { 119 | if (edx & 0x00400000) 120 | cpu |= X264_CPU_MMX2; 121 | if (!(cpu & X264_CPU_LZCNT)) 122 | cpu |= X264_CPU_SLOW_CTZ; 123 | if ((cpu & X264_CPU_SSE2) && !(cpu & X264_CPU_SSE2_IS_FAST)) 124 | cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */ 125 | } 126 | } 127 | 128 | if (!strcmp((char *)vendor, "GenuineIntel")) { 129 | mvtools_cpu_cpuid(1, &eax, &ebx, &ecx, &edx); 130 | int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); 131 | int model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0); 132 | if (family == 6) { 133 | /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah") 134 | * theoretically support sse2, but it's significantly slower than mmx for 135 | * almost all of x264's functions, so let's just pretend they don't. 
*/ 136 | if (model == 9 || model == 13 || model == 14) { 137 | cpu &= ~(X264_CPU_SSE2 | X264_CPU_SSE3); 138 | assert(!(cpu & (X264_CPU_SSSE3 | X264_CPU_SSE4))); 139 | } 140 | /* Detect Atom CPU */ 141 | else if (model == 28) { 142 | cpu |= X264_CPU_SLOW_ATOM; 143 | cpu |= X264_CPU_SLOW_CTZ; 144 | cpu |= X264_CPU_SLOW_PSHUFB; 145 | } 146 | /* Conroe has a slow shuffle unit. Check the model number to make sure not 147 | * to include crippled low-end Penryns and Nehalems that don't have SSE4. */ 148 | else if ((cpu & X264_CPU_SSSE3) && !(cpu & X264_CPU_SSE4) && model < 23) 149 | cpu |= X264_CPU_SLOW_SHUFFLE; 150 | } 151 | } 152 | 153 | if ((!strcmp((char *)vendor, "GenuineIntel") || !strcmp((char *)vendor, "CyrixInstead")) && !(cpu & X264_CPU_SSE42)) { 154 | /* cacheline size is specified in 3 places, any of which may be missing */ 155 | mvtools_cpu_cpuid(1, &eax, &ebx, &ecx, &edx); 156 | cache = (ebx & 0xff00) >> 5; // cflush size 157 | if (!cache && max_extended_cap >= 0x80000006) { 158 | mvtools_cpu_cpuid(0x80000006, &eax, &ebx, &ecx, &edx); 159 | cache = ecx & 0xff; // cacheline size 160 | } 161 | if (!cache && max_basic_cap >= 2) { 162 | // Cache and TLB Information 163 | static const unsigned char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 }; 164 | static const unsigned char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67, 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 }; 165 | uint32_t buf[4]; 166 | int max, i = 0; 167 | do { 168 | mvtools_cpu_cpuid(2, buf + 0, buf + 1, buf + 2, buf + 3); 169 | max = buf[0] & 0xff; 170 | buf[0] &= ~0xff; 171 | for (int j = 0; j < 4; j++) 172 | if (!(buf[j] >> 31)) 173 | while (buf[j]) { 174 | if (strchr((const char *)cache32_ids, buf[j] & 0xff)) 175 | cache = 32; 176 | if (strchr((const char *)cache64_ids, buf[j] & 0xff)) 177 | cache = 64; 178 | buf[j] >>= 8; 179 | } 180 | } while (++i < max); 181 | } 182 | 183 | if (cache == 32) 184 | 
cpu |= X264_CPU_CACHELINE_32; 185 | else if (cache == 64) 186 | cpu |= X264_CPU_CACHELINE_64; 187 | //else 188 | // x264_log( NULL, X264_LOG_WARNING, "unable to determine cacheline size\n" ); 189 | } 190 | 191 | return cpu; 192 | } 193 | 194 | #elif defined(MVTOOLS_ARM) 195 | 196 | uint32_t cpu_detect(void) { 197 | return ~0; // we just assume NEON is available, as there is no instruction to check 198 | } 199 | 200 | #else // not MVTOOLS_X86 or MVTOOLS_ARM 201 | 202 | uint32_t cpu_detect(void) { 203 | return 0; 204 | } 205 | 206 | #endif 207 | -------------------------------------------------------------------------------- /src/CPU.h: -------------------------------------------------------------------------------- 1 | #ifndef MVT_CPU_H 2 | #define MVT_CPU_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include 9 | 10 | 11 | #if defined(MVTOOLS_X86) 12 | 13 | #define X264_CPU_CMOV 0x0000001 14 | #define X264_CPU_MMX 0x0000002 15 | #define X264_CPU_MMX2 0x0000004 /* MMX2 aka MMXEXT aka ISSE */ 16 | #define X264_CPU_MMXEXT X264_CPU_MMX2 17 | #define X264_CPU_SSE 0x0000008 18 | #define X264_CPU_SSE2 0x0000010 19 | #define X264_CPU_SSE3 0x0000020 20 | #define X264_CPU_SSSE3 0x0000040 21 | #define X264_CPU_SSE4 0x0000080 /* SSE4.1 */ 22 | #define X264_CPU_SSE42 0x0000100 /* SSE4.2 */ 23 | #define X264_CPU_LZCNT 0x0000200 /* Phenom support for "leading zero count" instruction. */ 24 | #define X264_CPU_AVX 0x0000400 /* AVX support: requires OS support even if YMM registers aren't used. 
*/ 25 | #define X264_CPU_XOP 0x0000800 /* AMD XOP */ 26 | #define X264_CPU_FMA4 0x0001000 /* AMD FMA4 */ 27 | #define X264_CPU_FMA3 0x0002000 /* FMA3 */ 28 | #define X264_CPU_AVX2 0x0004000 /* AVX2 */ 29 | #define X264_CPU_BMI1 0x0008000 /* BMI1 */ 30 | #define X264_CPU_BMI2 0x0010000 /* BMI2 */ 31 | /* x86 modifiers */ 32 | #define X264_CPU_CACHELINE_32 0x0020000 /* avoid memory loads that span the border between two cachelines */ 33 | #define X264_CPU_CACHELINE_64 0x0040000 /* 32/64 is the size of a cacheline in bytes */ 34 | #define X264_CPU_SSE2_IS_SLOW 0x0080000 /* avoid most SSE2 functions on Athlon64 */ 35 | #define X264_CPU_SSE2_IS_FAST 0x0100000 /* a few functions are only faster on Core2 and Phenom */ 36 | #define X264_CPU_SLOW_SHUFFLE 0x0200000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */ 37 | #define X264_CPU_STACK_MOD4 0x0400000 /* if stack is only mod4 and not mod16 */ 38 | #define X264_CPU_SLOW_CTZ 0x0800000 /* BSR/BSF x86 instructions are really slow on some CPUs */ 39 | #define X264_CPU_SLOW_ATOM 0x1000000 /* The Atom is terrible: slow SSE unaligned loads, slow 40 | * SIMD multiplies, slow SIMD variable shifts, slow pshufb, 41 | * cacheline split penalties -- gather everything here that 42 | * isn't shared by other CPUs to avoid making half a dozen 43 | * new SLOW flags. 
*/ 44 | #define X264_CPU_SLOW_PSHUFB 0x2000000 /* such as on the Intel Atom */ 45 | #define X264_CPU_SLOW_PALIGNR 0x4000000 /* such as on the AMD Bobcat */ 46 | 47 | void mvtools_cpu_emms(); 48 | void mvtools_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx); 49 | uint64_t mvtools_cpu_xgetbv(int xcr); 50 | 51 | #endif // MVTOOLS_X86 52 | 53 | uint32_t cpu_detect(void); 54 | 55 | enum { 56 | MVOPT_SCALAR = 0, 57 | #ifdef MVTOOLS_X86 58 | MVOPT_SSE2 = 1, 59 | MVOPT_AVX2 = 2, 60 | #elif MVTOOLS_ARM 61 | MVOPT_NEON = 1, 62 | MVOPT_SSE2 = 1, // SSE2 is converted to Neon 63 | #endif // MVTOOLS_X86 64 | }; 65 | 66 | extern uint32_t g_cpuinfo; 67 | 68 | #ifdef __cplusplus 69 | } // extern "C" 70 | #endif 71 | 72 | #endif // MVT_CPU_H 73 | -------------------------------------------------------------------------------- /src/CommonFunctions.h: -------------------------------------------------------------------------------- 1 | #ifndef __COMMON_F__ 2 | #define __COMMON_F__ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include 9 | 10 | 11 | // returns a > 0 ? 
a : 0 12 | inline static int satz(int a) { 13 | return ~(a >> (sizeof(int) * 8 - 1)) & a; 14 | } 15 | 16 | // returns maximum(a, b) 17 | inline static int imax(int a, int b) { 18 | return a + satz(b - a); 19 | } 20 | 21 | // returns minimum(a, b) 22 | inline static int imin(int a, int b) { 23 | return a - satz(a - b); 24 | } 25 | 26 | /* returns the biggest integer x such as 2^x <= i */ 27 | inline static int ilog2(int i) { 28 | int result = 0; 29 | while (i > 1) { 30 | i /= 2; 31 | result++; 32 | } 33 | return result; 34 | } 35 | 36 | /* computes 2^i */ 37 | inline static int iexp2(int i) { 38 | return 1 << satz(i); 39 | // int result = 1; 40 | // while ( i > 0 ) { result *= 2; i--; } 41 | // return result; 42 | } 43 | 44 | // general common divisor (from wikipedia) 45 | inline static int64_t gcd(int64_t u, int64_t v) { 46 | int shift; 47 | 48 | /* GCD(0,x) := x */ 49 | if (u == 0 || v == 0) 50 | return u | v; 51 | 52 | /* Let shift := lg K, where K is the greatest power of 2 53 | dividing both u and v. */ 54 | for (shift = 0; ((u | v) & 1) == 0; ++shift) { 55 | u >>= 1; 56 | v >>= 1; 57 | } 58 | 59 | while ((u & 1) == 0) 60 | u >>= 1; 61 | 62 | /* From here on, u is always odd. */ 63 | do { 64 | while ((v & 1) == 0) /* Loop X */ 65 | v >>= 1; 66 | 67 | /* Now u and v are both odd, so diff(u, v) is even. 68 | Let u = min(u, v), v = diff(u, v)/2. 
*/ 69 | if (u < v) { 70 | v -= u; 71 | } else { 72 | int64_t diff = u - v; 73 | u = v; 74 | v = diff; 75 | } 76 | v >>= 1; 77 | } while (v != 0); 78 | 79 | return u << shift; 80 | } 81 | 82 | #ifdef __cplusplus 83 | } // extern "C" 84 | #endif 85 | 86 | #endif 87 | -------------------------------------------------------------------------------- /src/CommonMacros.h: -------------------------------------------------------------------------------- 1 | #ifndef __COMMON_M__ 2 | #define __COMMON_M__ 3 | 4 | #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(*arr)) 5 | 6 | #endif // __COMMON_M__ 7 | -------------------------------------------------------------------------------- /src/CopyCode.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "CopyCode.h" 6 | #include "CPU.h" 7 | 8 | template 9 | void copyBlock(uint8_t * __restrict pDst, intptr_t nDstPitch, const uint8_t * __restrict pSrc, intptr_t nSrcPitch) { 10 | int unroll = (height >= 8 ? 8 : (height >= 4 ? 4 : (height >= 2 ? 2 : 1))) / ((width + 15) / 16); 11 | unroll = unroll < 1 ? 
1 : unroll; 12 | 13 | for (unsigned j = 0; j < height; j += unroll) { 14 | memcpy(pDst + 0 * nDstPitch, pSrc + 0 * nSrcPitch, width); 15 | if (unroll > 1) { 16 | memcpy(pDst + 1 * nDstPitch, pSrc + 1 * nSrcPitch, width); 17 | } 18 | if (unroll > 2) { 19 | memcpy(pDst + 2 * nDstPitch, pSrc + 2 * nSrcPitch, width); 20 | memcpy(pDst + 3 * nDstPitch, pSrc + 3 * nSrcPitch, width); 21 | } 22 | if (unroll > 4) { 23 | memcpy(pDst + 4 * nDstPitch, pSrc + 4 * nSrcPitch, width); 24 | memcpy(pDst + 5 * nDstPitch, pSrc + 5 * nSrcPitch, width); 25 | memcpy(pDst + 6 * nDstPitch, pSrc + 6 * nSrcPitch, width); 26 | memcpy(pDst + 7 * nDstPitch, pSrc + 7 * nSrcPitch, width); 27 | } 28 | pDst += nDstPitch * unroll; 29 | pSrc += nSrcPitch * unroll; 30 | } 31 | } 32 | 33 | 34 | #define KEY(width, height, bits) (width) << 16 | (height) << 8 | (bits) 35 | #define COPY(width, height) \ 36 | { KEY(width, height, 8), copyBlock }, \ 37 | { KEY(width, height, 16), copyBlock }, 38 | 39 | static const std::unordered_map copy_functions = { 40 | COPY(2, 2) 41 | COPY(2, 4) 42 | COPY(4, 2) 43 | COPY(4, 4) 44 | COPY(4, 8) 45 | COPY(8, 1) 46 | COPY(8, 2) 47 | COPY(8, 4) 48 | COPY(8, 8) 49 | COPY(8, 16) 50 | COPY(16, 1) 51 | COPY(16, 2) 52 | COPY(16, 4) 53 | COPY(16, 8) 54 | COPY(16, 16) 55 | COPY(16, 32) 56 | COPY(32, 8) 57 | COPY(32, 16) 58 | COPY(32, 32) 59 | COPY(32, 64) 60 | COPY(64, 16) 61 | COPY(64, 32) 62 | COPY(64, 64) 63 | COPY(64, 128) 64 | COPY(128, 32) 65 | COPY(128, 64) 66 | COPY(128, 128) 67 | }; 68 | 69 | COPYFunction selectCopyFunction(unsigned width, unsigned height, unsigned bits) { 70 | return copy_functions.at(KEY(width, height, bits)); 71 | } 72 | 73 | #undef COPY 74 | #undef KEY 75 | 76 | -------------------------------------------------------------------------------- /src/CopyCode.h: -------------------------------------------------------------------------------- 1 | #ifndef COPYCODE_H 2 | #define COPYCODE_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include 
9 | 10 | 11 | typedef void (*COPYFunction)(uint8_t *pDst, intptr_t nDstPitch, 12 | const uint8_t *pSrc, intptr_t nSrcPitch); 13 | 14 | 15 | COPYFunction selectCopyFunction(unsigned width, unsigned height, unsigned bits); 16 | 17 | #ifdef __cplusplus 18 | } // extern "C" 19 | #endif 20 | 21 | #endif // COPYCODE_H 22 | -------------------------------------------------------------------------------- /src/DCTFFTW.cpp: -------------------------------------------------------------------------------- 1 | // DCT calculation with fftw (real) 2 | // Copyright(c)2006 A.G.Balakhnin aka Fizick 3 | // See legal notice in Copying.txt for more information 4 | 5 | // This program is free software; you can redistribute it and/or modify 6 | // it under the terms of the GNU General Public License as published by 7 | // the Free Software Foundation; either version 2 of the License, or 8 | // (at your option) any later version. 9 | // 10 | // This program is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | // GNU General Public License for more details. 14 | // 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 18 | // http://www.gnu.org/copyleft/gpl.html . 
19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #include "DCTFFTW.h" 25 | 26 | 27 | static const float sqrt_2_div_2 = 0.70710678118654752440084436210485f; 28 | 29 | 30 | template 31 | static void Float2Pixels_C(const DCTFFTW *dct, uint8_t *dstp8, int dst_pitch, float *realdata) { 32 | PixelType *dstp = (PixelType *)dstp8; 33 | dst_pitch /= sizeof(PixelType); 34 | 35 | PixelType *dstp_orig = dstp; 36 | float *realdata_orig = realdata; 37 | 38 | int pixelMax = (1 << dct->bitsPerSample) - 1; 39 | int pixelHalf = 1 << (dct->bitsPerSample - 1); 40 | 41 | for (int j = 0; j < dct->sizey; j++) { 42 | for (int i = 0; i < dct->sizex; i++) { 43 | float f = realdata[i] * sqrt_2_div_2; // to be compatible with integer DCTINT8 44 | int integ = (int)(nearbyintf(f)); 45 | dstp[i] = std::min(pixelMax, std::max(0, (integ >> dct->dctshift) + pixelHalf)); 46 | } 47 | dstp += dst_pitch; 48 | realdata += dct->sizex; 49 | } 50 | 51 | float f = realdata_orig[0] * 0.5f; // to be compatible with integer DCTINT8 52 | int integ = (int)(nearbyintf(f)); 53 | dstp_orig[0] = std::min(pixelMax, std::max(0, (integ >> dct->dctshift0) + pixelHalf)); // DC 54 | } 55 | 56 | 57 | #if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM) 58 | 59 | #if defined(MVTOOLS_ARM) 60 | #include "sse2neon.h" 61 | #else 62 | #include 63 | #endif 64 | 65 | template 66 | static void Float2Pixels_SSE2(const DCTFFTW *dct, uint8_t *dstp8, int dst_pitch, float *realdata) { 67 | PixelType *dstp = (PixelType *)dstp8; 68 | dst_pitch /= sizeof(PixelType); 69 | 70 | unsigned width = dct->sizex; 71 | unsigned height = dct->sizey; 72 | 73 | PixelType *dstp_orig = dstp; 74 | float *realdata_orig = realdata; 75 | 76 | int pixel_max, pixel_half, pixel_min; 77 | __m128i words_pixel_max, words_pixel_half, words_pixel_min; 78 | 79 | if (sizeof(PixelType) == 1) { 80 | pixel_max = 255; 81 | pixel_half = 128; 82 | pixel_min = 0; 83 | 84 | words_pixel_max = _mm_set1_epi16(pixel_max); 85 | words_pixel_half = _mm_set1_epi16(pixel_half); 86 
| words_pixel_min = _mm_set1_epi16(pixel_min); 87 | } else { 88 | pixel_max = (1 << dct->bitsPerSample) - 1; 89 | pixel_half = 1 << (dct->bitsPerSample - 1); 90 | pixel_min = 0; 91 | 92 | // Shitty because of pminsw/pmaxsw. 93 | words_pixel_max = _mm_set1_epi16(pixel_max - pixel_half); 94 | words_pixel_half = _mm_set1_epi16(pixel_half); 95 | words_pixel_min = _mm_set1_epi16(pixel_min - pixel_half); 96 | } 97 | 98 | __m128i dwords_dctshift = _mm_cvtsi32_si128(dct->dctshift); 99 | 100 | for (unsigned y = 0; y < height; y++) { 101 | for (unsigned x = 0; x < width; x += 4) { 102 | __m128 f = _mm_load_ps(&realdata[x]); 103 | f = _mm_mul_ps(f, _mm_set1_ps(sqrt_2_div_2)); 104 | 105 | __m128i i = _mm_cvtps_epi32(f); 106 | i = _mm_sra_epi32(i, dwords_dctshift); 107 | i = _mm_packs_epi32(i, i); 108 | 109 | if (sizeof(PixelType) == 1) { 110 | i = _mm_add_epi16(i, words_pixel_half); 111 | i = _mm_packus_epi16(i, i); 112 | *(int *)(dstp + x) = _mm_cvtsi128_si32(i); 113 | } else { 114 | i = _mm_min_epi16(i, words_pixel_max); 115 | i = _mm_max_epi16(i, words_pixel_min); 116 | i = _mm_add_epi16(i, words_pixel_half); 117 | _mm_storel_epi64((__m128i *)&dstp[x], i); 118 | } 119 | } 120 | 121 | dstp += dst_pitch; 122 | realdata += width; 123 | } 124 | 125 | int i = _mm_cvtss_si32(_mm_set_ss(realdata_orig[0] * 0.5f)); 126 | dstp_orig[0] = std::max(0, std::min((i >> dct->dctshift0) + pixel_half, pixel_max)); 127 | } 128 | 129 | #endif // MVTOOLS_X86 130 | 131 | 132 | std::mutex g_fftw_plans_mutex; 133 | 134 | 135 | void dctInit(DCTFFTW *dct, int sizex, int sizey, int bitsPerSample, int opt) { 136 | dct->sizex = sizex; 137 | dct->sizey = sizey; 138 | dct->bitsPerSample = bitsPerSample; 139 | 140 | int size2d = sizey * sizex; 141 | 142 | int cursize = 1; 143 | dct->dctshift = 0; 144 | while (cursize < size2d) { 145 | dct->dctshift++; 146 | cursize = (cursize << 1); 147 | } 148 | 149 | dct->dctshift0 = dct->dctshift + 2; 150 | 151 | dct->fSrc = (float *)fftwf_malloc(sizeof(float) * 
size2d); 152 | dct->fSrcDCT = (float *)fftwf_malloc(sizeof(float) * size2d); 153 | 154 | if (bitsPerSample == 8) 155 | dct->Float2Pixels = Float2Pixels_C; 156 | else 157 | dct->Float2Pixels = Float2Pixels_C; 158 | 159 | if (opt) { 160 | #if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM) 161 | if (bitsPerSample == 8) 162 | dct->Float2Pixels = Float2Pixels_SSE2; 163 | else 164 | dct->Float2Pixels = Float2Pixels_SSE2; 165 | #endif 166 | } 167 | 168 | { 169 | std::lock_guard guard(g_fftw_plans_mutex); 170 | dct->dctplan = fftwf_plan_r2r_2d(sizey, sizex, dct->fSrc, dct->fSrcDCT, 171 | FFTW_REDFT10, FFTW_REDFT10, FFTW_ESTIMATE); // direct fft 172 | } 173 | } 174 | 175 | 176 | void dctDeinit(DCTFFTW *dct) { 177 | { 178 | std::lock_guard guard(g_fftw_plans_mutex); 179 | fftwf_destroy_plan(dct->dctplan); 180 | } 181 | fftwf_free(dct->fSrc); 182 | fftwf_free(dct->fSrcDCT); 183 | } 184 | 185 | 186 | // put source data to real array for FFT 187 | template 188 | static void Pixels2Float(const DCTFFTW *dct, const uint8_t *srcp8, int src_pitch, float *realdata) { 189 | for (int j = 0; j < dct->sizey; j++) { 190 | for (int i = 0; i < dct->sizex; i++) { 191 | PixelType *srcp = (PixelType *)srcp8; 192 | realdata[i] = srcp[i]; 193 | } 194 | srcp8 += src_pitch; 195 | realdata += dct->sizex; 196 | } 197 | } 198 | 199 | 200 | void dctBytes2D(DCTFFTW *dct, const uint8_t *srcp, int src_pitch, uint8_t *dctp, int dct_pitch) { 201 | if (dct->bitsPerSample == 8) { 202 | Pixels2Float(dct, srcp, src_pitch, dct->fSrc); 203 | } else { 204 | Pixels2Float(dct, srcp, src_pitch, dct->fSrc); 205 | } 206 | fftwf_execute_r2r(dct->dctplan, dct->fSrc, dct->fSrcDCT); 207 | dct->Float2Pixels(dct, dctp, dct_pitch, dct->fSrcDCT); 208 | } 209 | -------------------------------------------------------------------------------- /src/DCTFFTW.h: -------------------------------------------------------------------------------- 1 | // DCT calculation with fftw (real) 2 | // Copyright(c)2006 A.G.Balakhnin aka Fizick 3 | 
// See legal notice in Copying.txt for more information 4 | 5 | // This program is free software; you can redistribute it and/or modify 6 | // it under the terms of the GNU General Public License as published by 7 | // the Free Software Foundation; either version 2 of the License, or 8 | // (at your option) any later version. 9 | // 10 | // This program is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | // GNU General Public License for more details. 14 | // 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 18 | // http://www.gnu.org/copyleft/gpl.html . 19 | 20 | #ifndef DCTFFTW_H 21 | #define DCTFFTW_H 22 | 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #endif 26 | 27 | #include 28 | 29 | #include 30 | 31 | 32 | typedef struct DCTFFTW DCTFFTW; 33 | 34 | typedef void (*Float2PixelsFunction)(const DCTFFTW *dct, uint8_t *dstp, int dst_pitch, float *realdata); 35 | 36 | 37 | typedef struct DCTFFTW { 38 | int sizex; 39 | int sizey; 40 | int bitsPerSample; 41 | 42 | float *fSrc; 43 | fftwf_plan dctplan; 44 | float *fSrcDCT; 45 | 46 | int dctshift; 47 | int dctshift0; 48 | 49 | Float2PixelsFunction Float2Pixels; 50 | } DCTFFTW; 51 | 52 | 53 | void dctInit(DCTFFTW *dct, int sizex, int sizey, int bitsPerSample, int opt); 54 | 55 | void dctDeinit(DCTFFTW *dct); 56 | 57 | void dctBytes2D(DCTFFTW *dct, const uint8_t *srcp, int src_pitch, uint8_t *dctp, int dct_pitch); 58 | 59 | #ifdef __cplusplus 60 | } // extern "C" 61 | #endif 62 | 63 | #endif // DCTFFTW_H 64 | -------------------------------------------------------------------------------- /src/EntryPoint.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 
4 | #include "CPU.h" 5 | 6 | 7 | // Extra indirection to keep the parameter lists with the respective filters. 8 | 9 | 10 | void mvsuperRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 11 | void mvanalyseRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 12 | void mvdegrainsRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 13 | void mvcompensateRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 14 | void mvrecalculateRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 15 | void mvmaskRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 16 | void mvfinestRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 17 | void mvflowRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 18 | void mvflowblurRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 19 | void mvflowinterRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 20 | void mvflowfpsRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 21 | void mvblockfpsRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 22 | void mvscdetectionRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 23 | void mvdepanRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi); 24 | 25 | 26 | uint32_t g_cpuinfo = 0; 27 | 28 | VS_EXTERNAL_API(void) 29 | VapourSynthPluginInit2(VSPlugin *plugin, const VSPLUGINAPI *vspapi) { 30 | const int packageVersion = atoi(PACKAGE_VERSION); 31 | 32 | vspapi->configPlugin("com.nodame.mvtools", "mv", "MVTools v" PACKAGE_VERSION, VS_MAKE_VERSION(packageVersion, 0), VS_MAKE_VERSION(VAPOURSYNTH_API_MAJOR, VAPOURSYNTH_API_MINOR), 0, plugin); 33 | 34 | mvsuperRegister(plugin, vspapi); 35 | mvanalyseRegister(plugin, vspapi); 36 | mvdegrainsRegister(plugin, vspapi); 37 | mvcompensateRegister(plugin, vspapi); 38 | mvrecalculateRegister(plugin, vspapi); 39 | mvmaskRegister(plugin, vspapi); 40 | mvfinestRegister(plugin, vspapi); 41 | mvflowRegister(plugin, vspapi); 42 | mvflowblurRegister(plugin, vspapi); 43 | mvflowinterRegister(plugin, vspapi); 44 | mvflowfpsRegister(plugin, 
vspapi); 45 | mvblockfpsRegister(plugin, vspapi); 46 | mvscdetectionRegister(plugin, vspapi); 47 | mvdepanRegister(plugin, vspapi); 48 | 49 | g_cpuinfo = cpu_detect(); 50 | } 51 | -------------------------------------------------------------------------------- /src/Fakery.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | 5 | #include "CommonFunctions.h" 6 | #include "Fakery.h" 7 | 8 | 9 | // FakeBlockData 10 | 11 | void fbdUpdate(FakeBlockData *fbd, const VECTOR *array) { 12 | fbd->vector = *array; 13 | } 14 | 15 | 16 | // FakePlaneOfBlocks 17 | 18 | void fpobInit(FakePlaneOfBlocks *fpob, int sizeX, int sizeY, int pel, int nOverlapX, int nOverlapY, int nBlkX, int nBlkY) { 19 | fpob->nBlkSizeX = sizeX; 20 | fpob->nBlkSizeY = sizeY; 21 | fpob->nOverlapX = nOverlapX; 22 | fpob->nOverlapY = nOverlapY; 23 | fpob->nBlkX = nBlkX; 24 | fpob->nBlkY = nBlkY; 25 | fpob->nBlkCount = fpob->nBlkX * fpob->nBlkY; 26 | fpob->nPel = pel; 27 | 28 | fpob->blocks = (FakeBlockData *)malloc(fpob->nBlkCount * sizeof(FakeBlockData)); 29 | 30 | for (int j = 0, blkIdx = 0; j < fpob->nBlkY; j++) { 31 | for (int i = 0; i < fpob->nBlkX; i++, blkIdx++) { 32 | fpob->blocks[blkIdx].x = i * (fpob->nBlkSizeX - fpob->nOverlapX); 33 | fpob->blocks[blkIdx].y = j * (fpob->nBlkSizeY - fpob->nOverlapY); 34 | } 35 | } 36 | } 37 | 38 | 39 | void fpobDeinit(FakePlaneOfBlocks *fpob) { 40 | free(fpob->blocks); 41 | } 42 | 43 | 44 | void fpobUpdate(FakePlaneOfBlocks *fpob, const uint8_t *array) { 45 | const VECTOR *blocks = (const VECTOR *)array; 46 | 47 | for (int i = 0; i < fpob->nBlkCount; i++) 48 | fbdUpdate(&fpob->blocks[i], &blocks[i]); 49 | } 50 | 51 | 52 | int fpobIsSceneChange(const FakePlaneOfBlocks *fpob, int64_t nTh1, int nTh2) { 53 | int sum = 0; 54 | for (int i = 0; i < fpob->nBlkCount; i++) 55 | sum += (fpob->blocks[i].vector.sad > nTh1) ? 
1 : 0; 56 | 57 | return (sum > nTh2); 58 | } 59 | 60 | 61 | const FakeBlockData *fpobGetBlock(const FakePlaneOfBlocks *fpob, int i) { 62 | return &fpob->blocks[i]; 63 | } 64 | 65 | 66 | // FakeGroupOfPlanes 67 | 68 | void fgopInit(FakeGroupOfPlanes *fgop, const MVAnalysisData *ad) { 69 | fgop->nLvCount = ad->nLvCount; 70 | int nBlkX1 = ad->nBlkX; 71 | int nBlkY1 = ad->nBlkY; 72 | int nWidth_B = (ad->nBlkSizeX - ad->nOverlapX) * nBlkX1 + ad->nOverlapX; 73 | int nHeight_B = (ad->nBlkSizeY - ad->nOverlapY) * nBlkY1 + ad->nOverlapY; 74 | 75 | fgop->planes = (FakePlaneOfBlocks **)malloc(ad->nLvCount * sizeof(FakePlaneOfBlocks *)); 76 | 77 | fgop->planes[0] = (FakePlaneOfBlocks *)malloc(sizeof(FakePlaneOfBlocks)); 78 | fpobInit(fgop->planes[0], ad->nBlkSizeX, ad->nBlkSizeY, ad->nPel, ad->nOverlapX, ad->nOverlapY, nBlkX1, nBlkY1); 79 | 80 | for (int i = 1; i < ad->nLvCount; i++) { 81 | nBlkX1 = ((nWidth_B >> i) - ad->nOverlapX) / (ad->nBlkSizeX - ad->nOverlapX); 82 | nBlkY1 = ((nHeight_B >> i) - ad->nOverlapY) / (ad->nBlkSizeY - ad->nOverlapY); 83 | 84 | fgop->planes[i] = (FakePlaneOfBlocks *)malloc(sizeof(FakePlaneOfBlocks)); 85 | fpobInit(fgop->planes[i], ad->nBlkSizeX, ad->nBlkSizeY, 1, ad->nOverlapX, ad->nOverlapY, nBlkX1, nBlkY1); // fixed bug with nOverlapX in v1.10.2 86 | } 87 | } 88 | 89 | 90 | void fgopDeinit(FakeGroupOfPlanes *fgop) { 91 | if (fgop->planes) { 92 | for (int i = 0; i < fgop->nLvCount; i++) { 93 | fpobDeinit(fgop->planes[i]); 94 | free(fgop->planes[i]); 95 | } 96 | 97 | free(fgop->planes); 98 | fgop->planes = 0; //v1.2.1 99 | } 100 | } 101 | 102 | 103 | static inline int fgopGetValidity(const uint8_t *array) { 104 | MVArraySizeType validity; 105 | memcpy(&validity, array + sizeof(MVArraySizeType), sizeof(validity)); 106 | return (validity == 1); 107 | } 108 | 109 | 110 | void fgopUpdate(FakeGroupOfPlanes *fgop, const uint8_t *array) { 111 | fgop->validity = fgopGetValidity(array); 112 | 113 | const uint8_t *pA = array + 2 * sizeof(MVArraySizeType); 
114 | for (int i = fgop->nLvCount - 1; i >= 0; i--) { 115 | fpobUpdate(fgop->planes[i], pA + sizeof(MVArraySizeType)); 116 | 117 | MVArraySizeType size; 118 | memcpy(&size, pA, sizeof(size)); 119 | pA += size; 120 | } 121 | } 122 | 123 | 124 | int fgopIsSceneChange(const FakeGroupOfPlanes *fgop, int64_t nThSCD1, int nThSCD2) { 125 | return fpobIsSceneChange(fgop->planes[0], nThSCD1, nThSCD2); 126 | } 127 | 128 | 129 | int fgopIsValid(const FakeGroupOfPlanes *fgop) { 130 | return fgop->validity; 131 | } 132 | 133 | 134 | const FakePlaneOfBlocks *fgopGetPlane(const FakeGroupOfPlanes *fgop, int i) { 135 | return fgop->planes[i]; 136 | } 137 | 138 | 139 | const FakeBlockData *fgopGetBlock(const FakeGroupOfPlanes *fgop, int nLevel, int nBlk) { 140 | return fpobGetBlock(fgopGetPlane(fgop, nLevel), nBlk); 141 | } 142 | 143 | 144 | int fgopIsUsable(const FakeGroupOfPlanes *fgop, int64_t thscd1, int thscd2) { 145 | return !fgopIsSceneChange(fgop, thscd1, thscd2) && fgopIsValid(fgop); 146 | } 147 | -------------------------------------------------------------------------------- /src/Fakery.h: -------------------------------------------------------------------------------- 1 | #ifndef MVTOOLS_FAKERY_H 2 | #define MVTOOLS_FAKERY_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | 9 | #include "MVAnalysisData.h" 10 | 11 | 12 | typedef struct FakeBlockData { 13 | int x; 14 | int y; 15 | VECTOR vector; 16 | } FakeBlockData; 17 | 18 | 19 | typedef struct FakePlaneOfBlocks { 20 | int nBlkX; 21 | int nBlkY; 22 | int nBlkSizeX; 23 | int nBlkSizeY; 24 | int nBlkCount; 25 | int nPel; 26 | int nOverlapX; 27 | int nOverlapY; 28 | 29 | FakeBlockData *blocks; 30 | } FakePlaneOfBlocks; 31 | 32 | 33 | typedef struct FakeGroupOfPlanes { 34 | int nLvCount; 35 | int validity; 36 | 37 | FakePlaneOfBlocks **planes; 38 | } FakeGroupOfPlanes; 39 | 40 | 41 | // FakeBlockData 42 | 43 | void fbdUpdate(FakeBlockData *fbd, const VECTOR *array); 44 | 45 | 46 | // FakePlaneOfBlocks 47 | 48 | 
// Initialise the block grid for one pyramid level.
void fpobInit(FakePlaneOfBlocks *fpob, int sizeX, int sizeY, int pel, int nOverlapX, int nOverlapY, int nBlkX, int nBlkY);

// Free the block grid.
void fpobDeinit(FakePlaneOfBlocks *fpob);

// Refresh every block's vector from a packed VECTOR array.
void fpobUpdate(FakePlaneOfBlocks *fpob, const uint8_t *array);

// Nonzero when more than nTh2 blocks have SAD above nTh1.
int fpobIsSceneChange(const FakePlaneOfBlocks *fpob, int64_t nTh1, int nTh2);

const FakeBlockData *fpobGetBlock(const FakePlaneOfBlocks *fpob, int i);


// FakeGroupOfPlanes

// Build the plane pyramid matching the analysis parameters.
void fgopInit(FakeGroupOfPlanes *fgop, const MVAnalysisData *ad);

// Free the pyramid (safe to call when planes is already NULL).
void fgopDeinit(FakeGroupOfPlanes *fgop);

// Unpack validity and all levels from the analysis array.
void fgopUpdate(FakeGroupOfPlanes *fgop, const uint8_t *array);

// Scene change test on the finest plane (planes[0]).
int fgopIsSceneChange(const FakeGroupOfPlanes *fgop, int64_t nThSCD1, int nThSCD2);

int fgopIsValid(const FakeGroupOfPlanes *fgop);

const FakePlaneOfBlocks *fgopGetPlane(const FakeGroupOfPlanes *fgop, int i);

const FakeBlockData *fgopGetBlock(const FakeGroupOfPlanes *fgop, int nLevel, int nBlk);

// Usable = valid and not a scene change.
int fgopIsUsable(const FakeGroupOfPlanes *fgop, int64_t thscd1, int thscd2);


#ifdef __cplusplus
} // extern "C"
#endif

#endif // MVTOOLS_FAKERY_H
--------------------------------------------------------------------------------
/src/GroupOfPlanes.c:
--------------------------------------------------------------------------------
// Author: Manao
// Copyright(c)2006 A.G.Balakhnin aka Fizick - overlap, global MV, divide
// See legal notice in Copying.txt for more information
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
9 | // 10 | // This program is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | // GNU General Public License for more details. 14 | // 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 18 | // http://www.gnu.org/copyleft/gpl.html . 19 | 20 | #include 21 | 22 | #include "GroupOfPlanes.h" 23 | 24 | 25 | void gopInit(GroupOfPlanes *gop, int nBlkSizeX, int nBlkSizeY, int nLevelCount, int nPel, int nMotionFlags, int nCPUFlags, int nOverlapX, int nOverlapY, int nBlkX, int nBlkY, int xRatioUV, int yRatioUV, int divideExtra, int bitsPerSample) { 26 | gop->nBlkSizeX = nBlkSizeX; 27 | gop->nBlkSizeY = nBlkSizeY; 28 | gop->nLevelCount = nLevelCount; 29 | gop->nOverlapX = nOverlapX; 30 | gop->nOverlapY = nOverlapY; 31 | gop->xRatioUV = xRatioUV; 32 | gop->yRatioUV = yRatioUV; 33 | gop->divideExtra = divideExtra; 34 | 35 | gop->planes = (PlaneOfBlocks **)malloc(gop->nLevelCount * sizeof(PlaneOfBlocks *)); 36 | 37 | int nBlkXCurrent = nBlkX; 38 | int nBlkYCurrent = nBlkY; 39 | 40 | int nPelCurrent = nPel; 41 | int nMotionFlagsCurrent = nMotionFlags; 42 | 43 | int nWidth_B = (gop->nBlkSizeX - gop->nOverlapX) * nBlkX + gop->nOverlapX; 44 | int nHeight_B = (gop->nBlkSizeY - gop->nOverlapY) * nBlkY + gop->nOverlapY; 45 | 46 | for (int i = 0; i < gop->nLevelCount; i++) { 47 | if (i == gop->nLevelCount - 1) 48 | nMotionFlagsCurrent |= MOTION_SMALLEST_PLANE; 49 | nBlkXCurrent = ((nWidth_B >> i) - gop->nOverlapX) / (gop->nBlkSizeX - gop->nOverlapX); 50 | nBlkYCurrent = ((nHeight_B >> i) - gop->nOverlapY) / (gop->nBlkSizeY - gop->nOverlapY); 51 | 52 | gop->planes[i] = (PlaneOfBlocks *)malloc(sizeof(PlaneOfBlocks)); 53 | pobInit(gop->planes[i], nBlkXCurrent, nBlkYCurrent, 
gop->nBlkSizeX, gop->nBlkSizeY, nPelCurrent, i, nMotionFlagsCurrent, nCPUFlags, gop->nOverlapX, gop->nOverlapY, gop->xRatioUV, gop->yRatioUV, bitsPerSample); 54 | nPelCurrent = 1; 55 | } 56 | } 57 | 58 | 59 | void gopDeinit(GroupOfPlanes *gop) { 60 | for (int i = 0; i < gop->nLevelCount; i++) { 61 | pobDeinit(gop->planes[i]); 62 | free(gop->planes[i]); 63 | } 64 | 65 | free(gop->planes); 66 | } 67 | 68 | 69 | void gopSearchMVs(GroupOfPlanes *gop, MVGroupOfFrames *pSrcGOF, MVGroupOfFrames *pRefGOF, 70 | SearchType searchType, int nSearchParam, int nPelSearch, int nLambda, 71 | int lsad, int pnew, int plevel, int global, 72 | uint8_t *out, int fieldShift, DCTFFTW *DCT, int dctmode, 73 | int pzero, int pglobal, int64_t badSAD, int badrange, int meander, int tryMany, 74 | SearchType coarseSearchType) { 75 | int i; 76 | 77 | // write group's size 78 | MVArraySizeType size = gopGetArraySize(gop); 79 | memcpy(out, &size, sizeof(size)); 80 | 81 | // write validity : 1 in that case 82 | MVArraySizeType validity = 1; 83 | memcpy(out + sizeof(size), &validity, sizeof(validity)); 84 | 85 | out += sizeof(size) + sizeof(validity); 86 | 87 | int fieldShiftCur = (gop->nLevelCount - 1 == 0) ? fieldShift : 0; // may be non zero for finest level only 88 | 89 | VECTOR globalMV = zeroMV; // create and init global motion vector as zero 90 | 91 | if (!global) 92 | pglobal = pzero; 93 | 94 | int meanLumaChange = 0; 95 | 96 | // Search the motion vectors, for the low details interpolations first 97 | SearchType searchTypeSmallest = (gop->nLevelCount == 1 || searchType == SearchHorizontal || searchType == SearchVertical) ? searchType : coarseSearchType; // full search for smallest coarse plane 98 | int nSearchParamSmallest = (gop->nLevelCount == 1) ? 
nPelSearch : nSearchParam; 99 | int tryManyLevel = tryMany && gop->nLevelCount > 1; 100 | pobSearchMVs(gop->planes[gop->nLevelCount - 1], 101 | pSrcGOF->frames[gop->nLevelCount - 1], 102 | pRefGOF->frames[gop->nLevelCount - 1], 103 | searchTypeSmallest, nSearchParamSmallest, nLambda, lsad, pnew, plevel, 104 | out, &globalMV, fieldShiftCur, DCT, dctmode, &meanLumaChange, 105 | pzero, pglobal, badSAD, badrange, meander, tryManyLevel); 106 | // Refining the search until we reach the highest detail interpolation. 107 | 108 | out += pobGetArraySize(gop->planes[gop->nLevelCount - 1], gop->divideExtra); 109 | 110 | for (i = gop->nLevelCount - 2; i >= 0; i--) { 111 | SearchType searchTypeLevel = (i == 0 || searchType == SearchHorizontal || searchType == SearchVertical) ? searchType : coarseSearchType; // full search for coarse planes 112 | int nSearchParamLevel = (i == 0) ? nPelSearch : nSearchParam; // special case for finest level 113 | if (global) { 114 | pobEstimateGlobalMVDoubled(gop->planes[i + 1], &globalMV); // get updated global MV (doubled) 115 | } 116 | pobInterpolatePrediction(gop->planes[i], gop->planes[i + 1]); 117 | fieldShiftCur = (i == 0) ? 
fieldShift : 0; // may be non zero for finest level only 118 | tryManyLevel = tryMany && i > 0; // not for finest level to not decrease speed 119 | pobSearchMVs(gop->planes[i], pSrcGOF->frames[i], pRefGOF->frames[i], 120 | searchTypeLevel, nSearchParamLevel, nLambda, lsad, pnew, plevel, 121 | out, &globalMV, fieldShiftCur, DCT, dctmode, &meanLumaChange, 122 | pzero, pglobal, badSAD, badrange, meander, tryManyLevel); 123 | out += pobGetArraySize(gop->planes[i], gop->divideExtra); 124 | } 125 | } 126 | 127 | 128 | void gopRecalculateMVs(GroupOfPlanes *gop, FakeGroupOfPlanes *fgop, MVGroupOfFrames *pSrcGOF, MVGroupOfFrames *pRefGOF, 129 | SearchType searchType, int nSearchParam, int nLambda, 130 | int pnew, 131 | uint8_t *out, int fieldShift, int64_t thSAD, DCTFFTW *DCT, int dctmode, int smooth, int meander) { 132 | // write group's size 133 | MVArraySizeType size = gopGetArraySize(gop); 134 | memcpy(out, &size, sizeof(size)); 135 | 136 | // write validity : 1 in that case 137 | MVArraySizeType validity = 1; 138 | memcpy(out + sizeof(size), &validity, sizeof(validity)); 139 | 140 | out += sizeof(size) + sizeof(validity); 141 | 142 | // Search the motion vectors, for the low details interpolations first 143 | // Refining the search until we reach the highest detail interpolation. 
144 | pobRecalculateMVs(gop->planes[0], fgop, pSrcGOF->frames[0], pRefGOF->frames[0], 145 | searchType, nSearchParam, nLambda, pnew, 146 | out, fieldShift, thSAD, DCT, dctmode, smooth, meander); 147 | } 148 | 149 | 150 | void gopWriteDefaultToArray(GroupOfPlanes *gop, uint8_t *array) { 151 | // write group's size 152 | MVArraySizeType size = gopGetArraySize(gop); 153 | memcpy(array, &size, sizeof(size)); 154 | 155 | // write validity : unvalid in that case 156 | MVArraySizeType validity = 0; 157 | memcpy(array + sizeof(size), &validity, sizeof(validity)); 158 | 159 | array += sizeof(size) + sizeof(validity); 160 | 161 | // write planes 162 | for (int i = gop->nLevelCount - 1; i >= 0; i--) 163 | array += pobWriteDefaultToArray(gop->planes[i], array, gop->divideExtra); 164 | } 165 | 166 | 167 | MVArraySizeType gopGetArraySize(GroupOfPlanes *gop) { 168 | MVArraySizeType size = 2 * sizeof(MVArraySizeType); // size, validity 169 | for (int i = gop->nLevelCount - 1; i >= 0; i--) 170 | size += pobGetArraySize(gop->planes[i], gop->divideExtra); 171 | 172 | 173 | return size; 174 | } 175 | 176 | 177 | // FIND MEDIAN OF 3 ELEMENTS 178 | // 179 | static inline int Median3(int a, int b, int c) { 180 | // b a c || c a b 181 | if (((b <= a) && (a <= c)) || ((c <= a) && (a <= b))) 182 | return a; 183 | 184 | // a b c || c b a 185 | else if (((a <= b) && (b <= c)) || ((c <= b) && (b <= a))) 186 | return b; 187 | 188 | // b c a || a c b 189 | else 190 | return c; 191 | } 192 | 193 | 194 | static void GetMedian(int *vx, int *vy, int vx1, int vy1, int vx2, int vy2, int vx3, int vy3) { // existant median vector (not mixed) 195 | *vx = Median3(vx1, vx2, vx3); 196 | *vy = Median3(vy1, vy2, vy3); 197 | if ((*vx == vx1 && *vy == vy1) || (*vx == vx2 && *vy == vy2) || (*vx == vx3 && *vy == vy3)) 198 | return; 199 | else { 200 | *vx = vx1; 201 | *vy = vy1; 202 | } 203 | } 204 | 205 | 206 | void gopExtraDivide(GroupOfPlanes *gop, uint8_t *out) { 207 | out += 2 * sizeof(MVArraySizeType); // 
skip full size and validity 208 | for (int i = gop->nLevelCount - 1; i >= 1; i--) // skip all levels up to finest estimated 209 | out += pobGetArraySize(gop->planes[i], 0); 210 | 211 | MVArraySizeType size; 212 | memcpy(&size, out, sizeof(size)); 213 | 214 | const VECTOR *blocks_in = (const VECTOR *)(out + sizeof(size)); // finest estimated plane 215 | VECTOR *blocks_out = (VECTOR *)(out + size + sizeof(MVArraySizeType)); // position for divided subblocks data 216 | 217 | int nBlkY = gop->planes[0]->nBlkY; 218 | int nBlkX = gop->planes[0]->nBlkX; 219 | 220 | // top blocks 221 | for (int bx = 0; bx < nBlkX; bx++) { 222 | VECTOR block = blocks_in[bx]; 223 | block.sad >>= 2; 224 | 225 | blocks_out[bx * 2] = block; // top left subblock 226 | blocks_out[bx * 2 + 1] = block; // top right subblock 227 | blocks_out[bx * 2 + nBlkX * 2] = block; // bottom left subblock 228 | blocks_out[bx * 2 + nBlkX * 2 + 1] = block; // bottom right subblock 229 | } 230 | 231 | blocks_out += nBlkX * 4; 232 | blocks_in += nBlkX; 233 | 234 | // middle blocks 235 | for (int by = 1; by < nBlkY - 1; by++) { 236 | int bx = 0; 237 | 238 | VECTOR block = blocks_in[bx]; 239 | block.sad >>= 2; 240 | 241 | blocks_out[bx * 2] = block; // top left subblock 242 | blocks_out[bx * 2 + 1] = block; // top right subblock 243 | blocks_out[bx * 2 + nBlkX * 2] = block; // bottom left subblock 244 | blocks_out[bx * 2 + nBlkX * 2 + 1] = block; // bottom right subblock 245 | 246 | for (bx = 1; bx < nBlkX - 1; bx++) { 247 | block = blocks_in[bx]; 248 | block.sad >>= 2; 249 | 250 | blocks_out[bx * 2] = block; // top left subblock 251 | blocks_out[bx * 2 + 1] = block; // top right subblock 252 | blocks_out[bx * 2 + nBlkX * 2] = block; // bottom left subblock 253 | blocks_out[bx * 2 + nBlkX * 2 + 1] = block; // bottom right subblock 254 | 255 | if (gop->divideExtra > 1) { 256 | GetMedian(&blocks_out[bx * 2].x, &blocks_out[bx * 2].y, 257 | blocks_in[bx].x, blocks_in[bx].y, 258 | blocks_in[bx - 1].x, blocks_in[bx - 1].y, 
259 | blocks_in[bx - nBlkX].x, blocks_in[bx - nBlkX].y); 260 | 261 | GetMedian(&blocks_out[bx * 2 + 1].x, &blocks_out[bx * 2 + 1].y, 262 | blocks_in[bx].x, blocks_in[bx].y, 263 | blocks_in[bx + 1].x, blocks_in[bx + 1].y, 264 | blocks_in[bx - nBlkX].x, blocks_in[bx - nBlkX].y); 265 | 266 | GetMedian(&blocks_out[bx * 2 + nBlkX * 2].x, &blocks_out[bx * 2 + nBlkX * 2].y, 267 | blocks_in[bx].x, blocks_in[bx].y, 268 | blocks_in[bx - 1].x, blocks_in[bx - 1].y, 269 | blocks_in[bx + nBlkX].x, blocks_in[bx + nBlkX].y); 270 | 271 | GetMedian(&blocks_out[bx * 2 + nBlkX * 2 + 1].x, &blocks_out[bx * 2 + nBlkX * 2 + 1].y, 272 | blocks_in[bx].x, blocks_in[bx].y, 273 | blocks_in[bx + 1].x, blocks_in[bx + 1].y, 274 | blocks_in[bx + nBlkX].x, blocks_in[bx + nBlkX].y); 275 | } 276 | } 277 | 278 | bx = nBlkX - 1; 279 | 280 | block = blocks_in[bx]; 281 | block.sad >>= 2; 282 | 283 | blocks_out[bx * 2] = block; // top left subblock 284 | blocks_out[bx * 2 + 1] = block; // top right subblock 285 | blocks_out[bx * 2 + nBlkX * 2] = block; // bottom left subblock 286 | blocks_out[bx * 2 + nBlkX * 2 + 1] = block; // bottom right subblock 287 | 288 | blocks_out += nBlkX * 4; 289 | blocks_in += nBlkX; 290 | } 291 | 292 | // bottom blocks 293 | for (int bx = 0; bx < nBlkX; bx++) { 294 | VECTOR block = blocks_in[bx]; 295 | block.sad >>= 2; 296 | 297 | blocks_out[bx * 2] = block; // top left subblock 298 | blocks_out[bx * 2 + 1] = block; // top right subblock 299 | blocks_out[bx * 2 + nBlkX * 2] = block; // bottom left subblock 300 | blocks_out[bx * 2 + nBlkX * 2 + 1] = block; // bottom right subblock 301 | } 302 | } 303 | -------------------------------------------------------------------------------- /src/GroupOfPlanes.h: -------------------------------------------------------------------------------- 1 | // See legal notice in Copying.txt for more information 2 | 3 | // This program is free software; you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public 
License as published by 5 | // the Free Software Foundation; either version 2 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // This program is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with this program; if not, write to the Free Software 15 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 16 | // http://www.gnu.org/copyleft/gpl.html . 17 | 18 | #ifndef GROUPOFPLANES_H 19 | #define GROUPOFPLANES_H 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | #include "DCTFFTW.h" 26 | #include "Fakery.h" 27 | #include "MVFrame.h" 28 | #include "PlaneOfBlocks.h" 29 | 30 | 31 | typedef struct GroupOfPlanes { 32 | int nBlkSizeX; 33 | int nBlkSizeY; 34 | int nLevelCount; 35 | int nOverlapX; 36 | int nOverlapY; 37 | int xRatioUV; 38 | int yRatioUV; 39 | int divideExtra; 40 | 41 | PlaneOfBlocks **planes; 42 | } GroupOfPlanes; 43 | 44 | 45 | void gopInit(GroupOfPlanes *gop, int nBlkSizeX, int nBlkSizeY, int nLevelCount, int nPel, int nMotionFlags, int nCPUFlags, int nOverlapX, int nOverlapY, int nBlkX, int nBlkY, int xRatioUV, int yRatioUV, int divideExtra, int bitsPerSample); 46 | 47 | void gopDeinit(GroupOfPlanes *gop); 48 | 49 | void gopSearchMVs(GroupOfPlanes *gop, MVGroupOfFrames *pSrcGOF, MVGroupOfFrames *pRefGOF, SearchType searchType, int nSearchParam, int nPelSearch, int nLambda, int lsad, int pnew, int plevel, int global, uint8_t *out, int fieldShift, DCTFFTW *DCT, int dctmode, int pzero, int pglobal, int64_t badSAD, int badrange, int meander, int tryMany, SearchType coarseSearchType); 50 | 51 | void gopRecalculateMVs(GroupOfPlanes *gop, FakeGroupOfPlanes *fgop, MVGroupOfFrames *pSrcGOF, MVGroupOfFrames *pRefGOF, 
SearchType searchType, int nSearchParam, int nLambda, int pnew, uint8_t *out, int fieldShift, int64_t thSAD, DCTFFTW *DCT, int dctmode, int smooth, int meander); 52 | 53 | void gopWriteDefaultToArray(GroupOfPlanes *gop, uint8_t *array); 54 | 55 | MVArraySizeType gopGetArraySize(GroupOfPlanes *gop); 56 | 57 | void gopExtraDivide(GroupOfPlanes *gop, uint8_t *out); 58 | 59 | #ifdef __cplusplus 60 | } // extern "C" 61 | #endif 62 | 63 | #endif 64 | -------------------------------------------------------------------------------- /src/Luma.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "Luma.h" 6 | 7 | 8 | enum InstructionSets { 9 | Scalar, 10 | SSE2, 11 | }; 12 | 13 | 14 | template 15 | unsigned int luma_c(const uint8_t *pSrc8, intptr_t nSrcPitch) { 16 | unsigned int meanLuma = 0; 17 | for (unsigned j = 0; j < height; j++) { 18 | for (unsigned i = 0; i < width; i++) { 19 | const PixelType *pSrc = (const PixelType *)pSrc8; 20 | meanLuma += pSrc[i]; 21 | } 22 | pSrc8 += nSrcPitch; 23 | } 24 | return meanLuma; 25 | } 26 | 27 | 28 | #if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM) 29 | 30 | #if defined(MVTOOLS_ARM) 31 | #include "sse2neon.h" 32 | #else 33 | #include 34 | #endif 35 | 36 | 37 | #define zeroes _mm_setzero_si128() 38 | 39 | 40 | template 41 | unsigned int luma_sse2(const uint8_t *pSrc, intptr_t nSrcPitch) { 42 | __m128i sum = zeroes; 43 | 44 | for (unsigned y = 0; y < height; y++) { 45 | for (unsigned x = 0; x < width; x += 16) { 46 | __m128i src; 47 | if (width == 4) 48 | src = _mm_cvtsi32_si128(*(const int *)pSrc); 49 | else if (width == 8) 50 | src = _mm_loadl_epi64((const __m128i *)pSrc); 51 | else 52 | src = _mm_loadu_si128((const __m128i *)&pSrc[x]); 53 | 54 | sum = _mm_add_epi64(sum, _mm_sad_epu8(src, zeroes)); 55 | } 56 | 57 | pSrc += nSrcPitch; 58 | } 59 | 60 | if (width >= 16) 61 | sum = _mm_add_epi64(sum, _mm_srli_si128(sum, 8)); 62 | 63 | return 
(unsigned)_mm_cvtsi128_si32(sum); 64 | } 65 | 66 | 67 | #undef zeroes 68 | 69 | 70 | #endif // MVTOOLS_X86 71 | 72 | 73 | // opt can fit in four bits, if the width and height need more than eight bits each. 74 | #define KEY(width, height, bits, opt) (unsigned)(width) << 24 | (height) << 16 | (bits) << 8 | (opt) 75 | 76 | #if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM) 77 | #define LUMA_SSE2(width, height) \ 78 | { KEY(width, height, 8, SSE2), luma_sse2 }, 79 | #else 80 | #define LUMA_SSE2(width, height) 81 | #endif 82 | 83 | #define LUMA(width, height) \ 84 | { KEY(width, height, 8, Scalar), luma_c }, \ 85 | { KEY(width, height, 16, Scalar), luma_c }, 86 | 87 | static const std::unordered_map luma_functions = { 88 | LUMA(4, 4) 89 | LUMA(8, 4) 90 | LUMA(8, 8) 91 | LUMA(16, 2) 92 | LUMA(16, 8) 93 | LUMA(16, 16) 94 | LUMA(32, 16) 95 | LUMA(32, 32) 96 | LUMA(64, 32) 97 | LUMA(64, 64) 98 | LUMA(128, 64) 99 | LUMA(128, 128) 100 | LUMA_SSE2(4, 4) 101 | LUMA_SSE2(8, 4) 102 | LUMA_SSE2(8, 8) 103 | LUMA_SSE2(16, 2) 104 | LUMA_SSE2(16, 8) 105 | LUMA_SSE2(16, 16) 106 | LUMA_SSE2(32, 16) 107 | LUMA_SSE2(32, 32) 108 | LUMA_SSE2(64, 32) 109 | LUMA_SSE2(64, 64) 110 | LUMA_SSE2(128, 64) 111 | LUMA_SSE2(128, 128) 112 | }; 113 | 114 | LUMAFunction selectLumaFunction(unsigned width, unsigned height, unsigned bits, int opt) { 115 | LUMAFunction luma = luma_functions.at(KEY(width, height, bits, Scalar)); 116 | 117 | #if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM) 118 | if (opt) { 119 | try { 120 | luma = luma_functions.at(KEY(width, height, bits, SSE2)); 121 | } catch (std::out_of_range &) { } 122 | } 123 | #endif 124 | 125 | return luma; 126 | } 127 | 128 | #undef LUMA 129 | #undef LUMA_SSE2 130 | #undef KEY 131 | -------------------------------------------------------------------------------- /src/Luma.h: -------------------------------------------------------------------------------- 1 | #ifndef LUMA_H 2 | #define LUMA_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 
| #include 9 | 10 | 11 | typedef unsigned int (*LUMAFunction)(const uint8_t *pSrc, intptr_t nSrcPitch); 12 | 13 | 14 | LUMAFunction selectLumaFunction(unsigned width, unsigned height, unsigned bits, int opt); 15 | 16 | #ifdef __cplusplus 17 | } // extern "C" 18 | #endif 19 | 20 | #endif // LUMA_H 21 | -------------------------------------------------------------------------------- /src/MVAnalysisData.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "Bullshit.h" 4 | #include "MVAnalysisData.h" 5 | 6 | 7 | void scaleThSCD(int64_t *thscd1, int *thscd2, const MVAnalysisData *ad, const char *filter_name, char *error, size_t error_size) { 8 | if (error_size) { 9 | if (error[0]) 10 | return; 11 | error[0] = '\0'; 12 | } 13 | 14 | int maxSAD = 8 * 8 * 255; 15 | 16 | if (*thscd1 > maxSAD) { 17 | snprintf(error, error_size, "%s: thscd1 can be at most %d.", filter_name, maxSAD); 18 | return; 19 | } 20 | 21 | // SCD thresholds 22 | int referenceBlockSize = 8 * 8; 23 | *thscd1 = *thscd1 * (ad->nBlkSizeX * ad->nBlkSizeY) / referenceBlockSize; 24 | if (ad->nMotionFlags & MOTION_USE_CHROMA_MOTION) 25 | *thscd1 += *thscd1 / (ad->xRatioUV * ad->yRatioUV) * 2; 26 | 27 | int pixelMax = (1 << ad->bitsPerSample) - 1; 28 | *thscd1 = (int64_t)((double)*thscd1 * pixelMax / 255.0 + 0.5); 29 | 30 | *thscd2 = *thscd2 * ad->nBlkX * ad->nBlkY / 256; 31 | } 32 | 33 | 34 | void adataFromVectorClip(struct MVAnalysisData *ad, VSNode *clip, const char *filter_name, const char *vector_name, const VSAPI *vsapi, char *error, size_t error_size) { 35 | if (error_size) { 36 | if (error[0]) 37 | return; 38 | error[0] = '\0'; 39 | } 40 | 41 | char errorMsg[1024]; 42 | const VSFrame *evil = vsapi->getFrame(0, clip, errorMsg, 1024); 43 | if (!evil) { 44 | snprintf(error, error_size, "%s: Failed to retrieve first frame from %s. 
Error message: %s", filter_name, vector_name, errorMsg); 45 | return; 46 | } 47 | 48 | const VSMap *props = vsapi->getFramePropertiesRO(evil); 49 | int err; 50 | const char *data = vsapi->mapGetData(props, prop_MVTools_MVAnalysisData, 0, &err); 51 | if (err) { 52 | snprintf(error, error_size, "%s: Property '%s' not found in first frame of %s.", filter_name, prop_MVTools_MVAnalysisData, vector_name); 53 | return; 54 | } 55 | 56 | int data_size = vsapi->mapGetDataSize(props, prop_MVTools_MVAnalysisData, 0, NULL); 57 | if (data_size != sizeof(MVAnalysisData)) { 58 | snprintf(error, error_size, "%s: Property '%s' in first frame of %s has wrong size (%d instead of %d).", filter_name, prop_MVTools_MVAnalysisData, vector_name, data_size, (int)sizeof(MVAnalysisData)); 59 | return; 60 | } 61 | 62 | memcpy(ad, data, sizeof(MVAnalysisData)); 63 | 64 | vsapi->freeFrame(evil); 65 | } 66 | 67 | 68 | void adataCheckSimilarity(const MVAnalysisData *ad1, const MVAnalysisData *ad2, const char *filter_name1, const char *filter_name2, const char *vector_name, char *error, size_t error_size) { 69 | if (error_size) { 70 | if (error[0]) 71 | return; 72 | error[0] = '\0'; 73 | } 74 | 75 | if (ad1->nWidth != ad2->nWidth) 76 | snprintf(error, error_size, "%s: %s and %s have different widths.", filter_name1, filter_name2, vector_name); 77 | 78 | if (ad1->nHeight != ad2->nHeight) 79 | snprintf(error, error_size, "%s: %s and %s have different heights.", filter_name1, filter_name2, vector_name); 80 | 81 | if (ad1->nBlkSizeX != ad2->nBlkSizeX || ad1->nBlkSizeY != ad2->nBlkSizeY) 82 | snprintf(error, error_size, "%s: %s and %s have different block sizes.", filter_name1, filter_name2, vector_name); 83 | 84 | if (ad1->nPel != ad2->nPel) 85 | snprintf(error, error_size, "%s: %s and %s have different pel precision.", filter_name1, filter_name2, vector_name); 86 | 87 | if (ad1->nOverlapX != ad2->nOverlapX || ad1->nOverlapY != ad2->nOverlapY) 88 | snprintf(error, error_size, "%s: %s and %s have 
different overlap.", filter_name1, filter_name2, vector_name); 89 | 90 | if (ad1->xRatioUV != ad2->xRatioUV) 91 | snprintf(error, error_size, "%s: %s and %s have different horizontal subsampling.", filter_name1, filter_name2, vector_name); 92 | 93 | if (ad1->yRatioUV != ad2->yRatioUV) 94 | snprintf(error, error_size, "%s: %s and %s have different vertical subsampling.", filter_name1, filter_name2, vector_name); 95 | 96 | if (ad1->bitsPerSample != ad2->bitsPerSample) 97 | snprintf(error, error_size, "%s: %s and %s have different bit depths.", filter_name1, filter_name2, vector_name); 98 | } 99 | -------------------------------------------------------------------------------- /src/MVAnalysisData.h: -------------------------------------------------------------------------------- 1 | // Define the BlockData class 2 | 3 | // I borrowed a lot of code from XviD's sources here, so I thank all the developpers 4 | // of this wonderful codec 5 | 6 | // See legal notice in Copying.txt for more information 7 | 8 | // This program is free software; you can redistribute it and/or modify 9 | // it under the terms of the GNU General Public License as published by 10 | // the Free Software Foundation; either version 2 of the License, or 11 | // (at your option) any later version. 12 | // 13 | // This program is distributed in the hope that it will be useful, 14 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | // GNU General Public License for more details. 17 | // 18 | // You should have received a copy of the GNU General Public License 19 | // along with this program; if not, write to the Free Software 20 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 21 | // http://www.gnu.org/copyleft/gpl.html . 

#ifndef MVANALYSISDATA_H
#define MVANALYSISDATA_H

#ifdef __cplusplus
extern "C" {
#endif

#include <stdint.h>
#include <stddef.h>

#include <VapourSynth4.h>


// Frame-property keys used to pass analysis metadata and vectors between filters.
static const char prop_MVTools_MVAnalysisData[] = "MVTools_MVAnalysisData";
static const char prop_MVTools_vectors[] = "MVTools_vectors";


// One motion vector together with the SAD of the block it belongs to.
typedef struct VECTOR {
    int x;
    int y;
    int64_t sad;
} VECTOR;


// Type of the size fields in the arrays of VECTOR.
typedef int MVArraySizeType;


/*! \brief Search type : defines the algorithm used for minimizing the SAD */
typedef enum SearchType {
    SearchOnetime,
    SearchNstep,
    SearchLogarithmic,
    SearchExhaustive,
    SearchHex2,
    SearchUnevenMultiHexagon,
    SearchHorizontal,
    SearchVertical
} SearchType;


#define MOTION_USE_SIMD 0x00000001
#define MOTION_IS_BACKWARD 0x00000002
#define MOTION_SMALLEST_PLANE 0x00000004
#define MOTION_USE_CHROMA_MOTION 0x00000008
//force MVAnalyse to use a different function for SAD / SADCHROMA (debug)
#define MOTION_USE_SSD 0x00000010
#define MOTION_USE_SATD 0x00000020


#define MV_DEFAULT_SCD1 400 // increased in v1.4.1
#define MV_DEFAULT_SCD2 130

//#define MV_BUFFER_FRAMES 10

// Zero motion with sad == -1, i.e. "no valid SAD computed yet".
static const VECTOR zeroMV = { 0, 0, -1 };


#define MVANALYSIS_DATA_VERSION 5

// Analysis configuration attached to every vector clip; filters consuming
// vectors read this back via adataFromVectorClip().
typedef struct MVAnalysisData {
    /*! \brief Unique identifier, not very useful */
    int nMagicKey; // placed to head in v.1.2.6

    int nVersion; // MVAnalysisData and outfile format version - added in v1.2.6

    /*! \brief size of a block, in pixel */
    int nBlkSizeX; // horizontal block size

    int nBlkSizeY; // vertical block size - v1.7

    /*! \brief pixel refinement of the motion estimation */
    int nPel;

    /*! \brief number of level for the hierarchal search */
    int nLvCount;

    /*! \brief difference between the index of the reference and the index of the current frame */
    int nDeltaFrame;

    /*! \brief direction of the search ( forward / backward ) */
    int isBackward;

    int nCPUFlags;

    /*! \brief diverse flags to set up the search */
    int nMotionFlags;

    /*! \brief Width of the frame */
    int nWidth;

    /*! \brief Height of the frame */
    int nHeight;

    int nOverlapX; // overlap block size - v1.1

    int nOverlapY; // vertical overlap - v1.7

    int nBlkX; // number of blocks along X

    int nBlkY; // number of blocks along Y

    int bitsPerSample;

    int yRatioUV; // ratio of luma plane height to chroma plane height

    int xRatioUV; // ratio of luma plane width to chroma plane width

    int nHPadding; // Horizontal padding - v1.8.1

    int nVPadding; // Vertical padding - v1.8.1
} MVAnalysisData;


void scaleThSCD(int64_t *thscd1, int *thscd2, const MVAnalysisData *ad, const char *filter_name, char *error, size_t error_size);

void adataFromVectorClip(struct MVAnalysisData *ad, VSNode *clip, const char *filter_name, const char *vector_name, const VSAPI *vsapi, char *error, size_t error_size);

void adataCheckSimilarity(const MVAnalysisData *ad1, const MVAnalysisData *ad2, const char *filter_name1, const char *filter_name2, const char *vector_name, char *error, size_t error_size);


//#define MOTION_DELTA_FRAME_BUFFER 5


#ifdef __cplusplus
} // extern "C"
#endif

#endif // MVANALYSISDATA_H
--------------------------------------------------------------------------------
/src/MVDegrains.h:
--------------------------------------------------------------------------------
#ifndef MVDEGRAINS_H
#define MVDEGRAINS_H

#include <cstdint>
#include <unordered_map>

#include "Fakery.h"
#include
"MVFrame.h"

// Index of each reference frame inside the pRefs/WRefs arrays: backward and
// forward references alternate, ordered by temporal distance.
enum VectorOrder {
    Backward1 = 0,
    Forward1,
    Backward2,
    Forward2,
    Backward3,
    Forward3,
    Backward4,
    Forward4,
    Backward5,
    Forward5,
    Backward6,
    Forward6
};


// Weighted blend of the source block and 2*radius reference blocks into pDst.
typedef void (*DenoiseFunction)(uint8_t *pDst, int nDstPitch, const uint8_t *pSrc, int nSrcPitch, const uint8_t **_pRefs, const int *nRefPitches, int WSrc, const int *WRefs);


// XXX Moves the pointers passed in pRefs. This is okay because they are not
// used after this function is done with them.
// Weights are fixed-point with a 256 total (see normaliseWeights); the +128
// before the >>8 rounds to nearest.
template <unsigned radius, unsigned blockWidth, unsigned blockHeight, typename PixelType>
static void Degrain_C(uint8_t * __restrict pDst8, int nDstPitch, const uint8_t * __restrict pSrc8, int nSrcPitch, const uint8_t ** __restrict pRefs8, const int * __restrict nRefPitches, int WSrc, const int * __restrict WRefs) {
    for (int y = 0; y < blockHeight; y++) {
        for (int x = 0; x < blockWidth; x++) {
            const PixelType *pSrc = (const PixelType * __restrict)pSrc8;
            PixelType *pDst = (PixelType * __restrict)pDst8;

            int sum = 128 + pSrc[x] * WSrc;

            for (int r = 0; r < radius * 2; r++) {
                const PixelType *pRef = (const PixelType * __restrict)pRefs8[r];
                sum += pRef[x] * WRefs[r];
            }

            pDst[x] = sum >> 8;
        }

        pDst8 += nDstPitch;
        pSrc8 += nSrcPitch;
        for (int r = 0; r < radius * 2; r++)
            pRefs8[r] += nRefPitches[r];
    }
}


#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)

#if defined(MVTOOLS_ARM)
#include "sse2neon.h"
#else
#include <emmintrin.h>

DenoiseFunction selectDegrainFunctionAVX2(unsigned radius, unsigned width, unsigned height, unsigned bits);
#endif

// XXX Moves the pointers passed in pRefs. This is okay because they are not
// used after this function is done with them.
// 8-bit only: pixels are widened to 16 bits, weighted, summed, rounded
// (+128, >>8) and packed back down.
template <unsigned radius, unsigned blockWidth, unsigned blockHeight>
static void Degrain_sse2(uint8_t *pDst, int nDstPitch, const uint8_t *pSrc, int nSrcPitch, const uint8_t **pRefs, const int *nRefPitches, int WSrc, const int *WRefs) {
    static_assert(blockWidth >= 4, "");

    __m128i zero = _mm_setzero_si128();
    __m128i wsrc = _mm_set1_epi16(WSrc);
    __m128i wrefs[12];

    // We intentionally jump by 2 (here and below), as it delineates groups of
    // backward/forward and ALSO produces testably faster code.
    for(int i = 0; i < radius * 2; i += 2) {
        wrefs[i] = _mm_set1_epi16(WRefs[i]);
        wrefs[i + 1] = _mm_set1_epi16(WRefs[i + 1]);
    }

    __m128i src, accum, refs[12];

    for (int y = 0; y < blockHeight; y++) {
        for (int x = 0; x < blockWidth; x += 8) {
            // pDst[x] = (pRefF[x]*WRefF + pSrc[x]*WSrc + pRefB[x]*WRefB +
            //            pRefF2[x]*WRefF2 + pRefB2[x]*WRefB2 + pRefF3[x]*WRefF3 + pRefB3[x]*WRefB3
            //            pRefF4[x]*WRefF4 + pRefB4[x]*WRefB4 + pRefF5[x]*WRefF5 + pRefB5[x]*WRefB5
            //            pRefF6[x]*WRefF6 + pRefB6[x]*WRefB6 + 128)>>8;

            if (blockWidth == 4) {
                src = _mm_cvtsi32_si128(*(const int *)pSrc);
                for(int i = 0; i < radius * 2; i += 2) {
                    refs[i] = _mm_cvtsi32_si128(*(const int *)pRefs[i]);
                    refs[i + 1] = _mm_cvtsi32_si128(*(const int *)pRefs[i + 1]);
                }
            } else {
                src = _mm_loadl_epi64((const __m128i *)(pSrc + x));
                for(int i = 0; i < radius * 2; i += 2) {
                    refs[i] = _mm_loadl_epi64((const __m128i *)(pRefs[i] + x));
                    refs[i + 1] = _mm_loadl_epi64((const __m128i *)(pRefs[i + 1] + x));
                }
            }

            src = _mm_unpacklo_epi8(src, zero);
            src = _mm_mullo_epi16(src, wsrc);

            for(int i = 0; i < radius * 2; i += 2) {
                refs[i] = _mm_unpacklo_epi8(refs[i], zero);
                refs[i + 1] = _mm_unpacklo_epi8(refs[i + 1], zero);

                refs[i] = _mm_mullo_epi16(refs[i], wrefs[i]);
                refs[i + 1] = _mm_mullo_epi16(refs[i + 1], wrefs[i + 1]);
            }

            accum = _mm_set1_epi16(128);
            accum = _mm_add_epi16(accum, src);

            for(int i = 0; i < radius * 2; i += 2) {
                accum = _mm_add_epi16(accum, refs[i]);
                accum = _mm_add_epi16(accum, refs[i + 1]);
            }

            accum = _mm_srli_epi16(accum, 8);
            accum = _mm_packus_epi16(accum, zero);

            if (blockWidth == 4)
                *(int *)pDst = _mm_cvtsi128_si32(accum);
            else
                _mm_storel_epi64((__m128i *)(pDst + x), accum);
        }
        pDst += nDstPitch;
        pSrc += nSrcPitch;
        for(int i = 0; i < radius * 2; i += 2) {
            pRefs[i] += nRefPitches[i];
            pRefs[i + 1] += nRefPitches[i + 1];
        }
    }
}

// Clamps each pDst pixel into [pSrc - nLimit, pSrc + nLimit] using saturating
// byte arithmetic. NOTE(review): loads are aligned (_mm_load_si128), so both
// buffers are presumably 16-byte aligned — confirm at the call sites.
static void LimitChanges_sse2(uint8_t *pDst, intptr_t nDstPitch, const uint8_t *pSrc, intptr_t nSrcPitch, intptr_t nWidth, intptr_t nHeight, intptr_t nLimit) {
    __m128i bytes_limit = _mm_set1_epi8(nLimit);

    for (int y = 0; y < nHeight; y++) {
        for (int x = 0; x < nWidth; x += 16) {
            __m128i m0 = _mm_load_si128((const __m128i *)&pSrc[x]);
            __m128i m1 = _mm_load_si128((const __m128i *)&pDst[x]);

            __m128i lower = _mm_subs_epu8(m0, bytes_limit);
            __m128i upper = _mm_adds_epu8(m0, bytes_limit);

            m0 = _mm_min_epu8(_mm_max_epu8(lower, m1), upper);

            _mm_store_si128((__m128i *)&pDst[x], m0);
        }

        pSrc += nSrcPitch;
        pDst += nDstPitch;
    }
}

#endif // MVTOOLS_X86


typedef void (*LimitFunction)(uint8_t *pDst, intptr_t nDstPitch, const uint8_t *pSrc, intptr_t nSrcPitch, intptr_t nWidth, intptr_t nHeight, intptr_t nLimit);


// Scalar version of LimitChanges: clamp pDst into [pSrc - nLimit, pSrc + nLimit].
template <typename PixelType>
static void LimitChanges_C(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc8, intptr_t nSrcPitch, intptr_t nWidth, intptr_t nHeight, intptr_t nLimit) {
    for (int h = 0; h < nHeight; h++) {
        for (int i = 0; i < nWidth; i++) {
            const PixelType *pSrc = (const PixelType *)pSrc8;
            PixelType *pDst = (PixelType *)pDst8;

            pDst[i] = (PixelType)VSMIN(VSMAX(pDst[i], (pSrc[i] - nLimit)), (pSrc[i] + nLimit));
        }
        pDst8 += nDstPitch;
        pSrc8 += nSrcPitch;
    }
}


// Maps a block's SAD to a blend weight in [0, 256): 0 at/above the threshold,
// approaching 256 as the SAD approaches 0.
static inline int DegrainWeight(int64_t thSAD, int64_t blockSAD) {
    if (blockSAD >= thSAD)
        return 0;

    return int((thSAD - blockSAD) * (thSAD + blockSAD) * 256 / (double)(thSAD * thSAD + blockSAD * blockSAD));
}


// Picks the pixels and weight for one reference: the motion-compensated block
// when the vector is usable, otherwise the co-located source with weight 0.
static inline void useBlock(const uint8_t *&p, int &np, int &WRef, int isUsable, const FakeGroupOfPlanes *fgop, int i, MVPlane * const *pPlane, const uint8_t **pSrcCur, int xx, const int *nSrcPitch, int nLogPel, int plane, int xSubUV, int ySubUV, const int64_t *thSAD) {
    if (isUsable) {
        const FakeBlockData *block = fgopGetBlock(fgop, 0, i);
        int blx = (block->x << nLogPel) + block->vector.x;
        int bly = (block->y << nLogPel) + block->vector.y;
        p = mvpGetPointer(pPlane[plane], plane ? blx >> xSubUV : blx, plane ? bly >> ySubUV : bly);
        np = pPlane[plane]->nPitch;
        int64_t blockSAD = block->vector.sad;
        WRef = DegrainWeight(thSAD[plane], blockSAD);
    } else {
        p = pSrcCur[plane] + xx;
        np = nSrcPitch[plane];
        WRef = 0;
    }
}


// Rescales WRefs so that WSrc plus all WRefs sums to 256, with WSrc taking
// whatever remains after rounding.
// NOTE(review): WSum starts at WSrc + 1, not WSrc — presumably a deliberate
// bias so the scaled reference weights can never quite consume all 256;
// confirm before changing.
template <unsigned radius>
static inline void normaliseWeights(int &WSrc, int *WRefs) {
    // normalise weights to 256
    WSrc = 256;
    int WSum = WSrc + 1;
    for (int r = 0; r < radius * 2; r++)
        WSum += WRefs[r];

    double scale = 256.0 / WSum;

    for (int r = 0; r < radius * 2; r++) {
        WRefs[r] = WRefs[r] * scale;
        WSrc -= WRefs[r];
    }
}


#endif // MVDEGRAINS_H
--------------------------------------------------------------------------------
/src/MVDegrains_AVX2.cpp:
--------------------------------------------------------------------------------
#include <cstdint>
#include <unordered_map>

#include "MVDegrains.h"

enum InstructionSets {
    Scalar,
    SSE2,
    AVX2,
};

// opt can fit in four bits, if the width and height need more than eight bits each.
#define KEY(width, height, bits, opt) (unsigned)(width) << 24 | (height) << 16 | (bits) << 8 | (opt)

#if defined(MVTOOLS_X86)
#define DEGRAIN_AVX2(radius, width, height) \
    { KEY(width, height, 8, AVX2), Degrain_avx2<radius, width, height> },

// All block sizes with an AVX2 implementation for one temporal radius.
#define DEGRAIN_LEVEL_AVX2(radius) \
    {\
        DEGRAIN_AVX2(radius, 8, 2)\
        DEGRAIN_AVX2(radius, 8, 4)\
        DEGRAIN_AVX2(radius, 8, 8)\
        DEGRAIN_AVX2(radius, 8, 16)\
        DEGRAIN_AVX2(radius, 16, 1)\
        DEGRAIN_AVX2(radius, 16, 2)\
        DEGRAIN_AVX2(radius, 16, 4)\
        DEGRAIN_AVX2(radius, 16, 8)\
        DEGRAIN_AVX2(radius, 16, 16)\
        DEGRAIN_AVX2(radius, 16, 32)\
        DEGRAIN_AVX2(radius, 32, 8)\
        DEGRAIN_AVX2(radius, 32, 16)\
        DEGRAIN_AVX2(radius, 32, 32)\
        DEGRAIN_AVX2(radius, 32, 64)\
        DEGRAIN_AVX2(radius, 64, 16)\
        DEGRAIN_AVX2(radius, 64, 32)\
        DEGRAIN_AVX2(radius, 64, 64)\
        DEGRAIN_AVX2(radius, 64, 128)\
        DEGRAIN_AVX2(radius, 128, 32)\
        DEGRAIN_AVX2(radius, 128, 64)\
        DEGRAIN_AVX2(radius, 128, 128)\
    }
#else
#define DEGRAIN_AVX2(radius, width, height)
#define DEGRAIN_LEVEL_AVX2(radius)
#endif


#if defined(MVTOOLS_X86)

#include <immintrin.h>

// XXX Moves the pointers passed in pRefs. This is okay because they are not
// used after this function is done with them.
// AVX2 variant of Degrain_sse2: same rounding (+128, >>8), but 16 pixels per
// iteration. Width-8 blocks process two rows at once (pitchMul == 2) so a
// full 16-lane register is still used.
template <unsigned radius, unsigned blockWidth, unsigned blockHeight>
static void Degrain_avx2(uint8_t *pDst, int nDstPitch, const uint8_t *pSrc, int nSrcPitch, const uint8_t **pRefs, const int *nRefPitches, int WSrc, const int *WRefs) {
    static_assert(blockWidth >= 16 || (blockWidth == 8 && blockHeight >= 2), "");

    __m256i zero = _mm256_setzero_si256();
    __m256i wsrc = _mm256_set1_epi16(WSrc);

    __m256i wrefs[12];
    for(int i = 0; i < radius * 2; i += 2) {
        wrefs[i] = _mm256_set1_epi16(WRefs[i]);
        wrefs[i + 1] = _mm256_set1_epi16(WRefs[i + 1]);
    }
    __m256i src, accum, refs[12];

    int pitchMul = blockWidth == 8 ? 2 : 1;

    for (int y = 0; y < blockHeight; y += pitchMul) {
        for (int x = 0; x < blockWidth; x += 16 / pitchMul) {
            if (blockWidth == 8) {
                // Pack two consecutive rows of 8 pixels into one register.
                src = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(pSrc + x)), _mm_loadl_epi64((const __m128i *)(pSrc + nSrcPitch + x))));
                for(int i = 0; i < radius * 2; i += 2) {
                    refs[i] = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(pRefs[i] + x)), _mm_loadl_epi64((const __m128i *)(pRefs[i] + nRefPitches[i] + x))));
                    refs[i + 1] = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(pRefs[i + 1] + x)), _mm_loadl_epi64((const __m128i *)(pRefs[i + 1] + nRefPitches[i + 1] + x))));
                }
            } else {
                src = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(pSrc + x)));
                for(int i = 0; i < radius * 2; i += 2) {
                    refs[i] = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(pRefs[i] + x)));
                    refs[i + 1] = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(pRefs[i + 1] + x)));
                }
            }

            src = _mm256_mullo_epi16(src, wsrc);
            for(int i = 0; i < radius * 2; i += 2) {
                refs[i] = _mm256_mullo_epi16(refs[i], wrefs[i]);
                refs[i + 1] = _mm256_mullo_epi16(refs[i + 1], wrefs[i + 1]);
            }

            accum = _mm256_set1_epi16(128);
            accum = _mm256_add_epi16(accum, src);

            for(int i = 0; i < radius * 2; i += 2) {
                accum = _mm256_add_epi16(accum, refs[i]);
                accum = _mm256_add_epi16(accum, refs[i + 1]);
            }
            accum = _mm256_srli_epi16(accum, 8);
            accum = _mm256_packus_epi16(accum, zero);

            if (blockWidth == 8) {
                _mm_storel_epi64((__m128i *)(pDst + x), _mm256_castsi256_si128(accum));
                _mm_storel_epi64((__m128i *)(pDst + nDstPitch + x), _mm256_extractf128_si256(accum, 1));
            } else {
                // Gather the packed bytes from both lanes into the low half.
                accum = _mm256_permute4x64_epi64(accum, _MM_SHUFFLE(0, 0, 2, 0));
                _mm_storeu_si128((__m128i *)(pDst + x), _mm256_castsi256_si128(accum));
            }
        }

        pDst += nDstPitch * pitchMul;
        pSrc += nSrcPitch * pitchMul;

        for(int i = 0; i < radius * 2; i += 2) {
            pRefs[i] += nRefPitches[i] * pitchMul;
            pRefs[i + 1] += nRefPitches[i + 1] * pitchMul;
        }
    }
}
#endif

// One dispatch table per temporal radius (1..6).
static const std::unordered_map<uint32_t, DenoiseFunction> degrain_functions[6] = {
    DEGRAIN_LEVEL_AVX2(1),
    DEGRAIN_LEVEL_AVX2(2),
    DEGRAIN_LEVEL_AVX2(3),
    DEGRAIN_LEVEL_AVX2(4),
    DEGRAIN_LEVEL_AVX2(5),
    DEGRAIN_LEVEL_AVX2(6),
};

// Returns the AVX2 degrain kernel for the given configuration, or nullptr
// when no AVX2 implementation exists for it.
DenoiseFunction selectDegrainFunctionAVX2(unsigned radius, unsigned width, unsigned height, unsigned bits) {
    try {
        return degrain_functions[radius - 1].at(KEY(width, height, bits, AVX2));
    } catch (std::out_of_range &) {
        return nullptr;
    }
}
--------------------------------------------------------------------------------
/src/MVFinest.c:
--------------------------------------------------------------------------------
// Pixels flow motion function
// Copyright(c)2005 A.G.Balakhnin aka Fizick

// See legal notice in Copying.txt for more information

// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; version 2 of the License.
9 | // 10 | // This program is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | // GNU General Public License for more details. 14 | // 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 18 | // http://www.gnu.org/copyleft/gpl.html . 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #include "MaskFun.h" 25 | #include "CommonMacros.h" 26 | 27 | 28 | 29 | typedef struct MVFinestData { 30 | VSNode *super; 31 | VSVideoInfo vi; 32 | 33 | int opt; 34 | 35 | int nWidth; 36 | int nHeight; 37 | int nSuperHPad; 38 | int nSuperVPad; 39 | int nSuperPel; 40 | int nSuperModeYUV; 41 | int nSuperLevels; 42 | int nPel; 43 | int xRatioUV; 44 | int yRatioUV; 45 | } MVFinestData; 46 | 47 | 48 | static const VSFrame *VS_CC mvfinestGetFrame(int n, int activationReason, void *instanceData, void **frameData, VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi) { 49 | (void)frameData; 50 | 51 | MVFinestData *d = (MVFinestData *)instanceData; 52 | 53 | if (activationReason == arInitial) { 54 | vsapi->requestFrameFilter(n, d->super, frameCtx); 55 | } else if (activationReason == arAllFramesReady) { 56 | const VSFrame *ref = vsapi->getFrameFilter(n, d->super, frameCtx); 57 | VSFrame *dst = vsapi->newVideoFrame(&d->vi.format, d->vi.width, d->vi.height, ref, core); 58 | 59 | uint8_t *pDst[3]; 60 | const uint8_t *pRef[3]; 61 | int nDstPitches[3], nRefPitches[3]; 62 | 63 | for (int i = 0; i < d->vi.format.numPlanes; i++) { 64 | pDst[i] = vsapi->getWritePtr(dst, i); 65 | pRef[i] = vsapi->getReadPtr(ref, i); 66 | nDstPitches[i] = vsapi->getStride(dst, i); 67 | nRefPitches[i] = vsapi->getStride(ref, i); 68 | } 69 | 70 | int bitsPerSample = d->vi.format.bitsPerSample; 71 | int 
        // Bytes per sample of the super clip; scales the bitblt width from
        // pixels to bytes below.
        bytesPerSample = d->vi.format.bytesPerSample;

        if (d->nPel == 1) { // simply copy top lines
            for (int i = 0; i < d->vi.format.numPlanes; i++)
                vsh_bitblt(pDst[i], nDstPitches[i], pRef[i], nRefPitches[i], d->vi.width * bytesPerSample, d->vi.height);
        } else {
            // Wrap the super frame's planes in an MVGroupOfFrames so the
            // per-subpel plane pointers can be queried via mvpGetAbsolutePointer.
            MVGroupOfFrames pRefGOF = { 0 };
            mvgofInit(&pRefGOF, d->nSuperLevels, d->nWidth, d->nHeight, d->nSuperPel, d->nSuperHPad, d->nSuperVPad, d->nSuperModeYUV, d->opt, d->xRatioUV, d->yRatioUV, bitsPerSample);

            // Attaches the reference frame's plane pointers/pitches; presumably
            // no pixel data is copied here — TODO confirm against mvgofUpdate.
            mvgofUpdate(&pRefGOF, (uint8_t **)pRef, nRefPitches);

            MVPlane **pPlanes = pRefGOF.frames[0]->planes;


            // merge refined planes to big single plane
            for (int i = 0; i < 3; i++) {
                if (pPlanes[i]) {
                    if (d->nPel == 2) {
                        // 2x2 subpel grid -> single output plane; pointers are
                        // passed row-major: (x, y) = (0,0), (1,0), (0,1), (1,1).
                        Merge4PlanesToBig(pDst[i], nDstPitches[i],
                                          mvpGetAbsolutePointer(pPlanes[i], 0, 0),
                                          mvpGetAbsolutePointer(pPlanes[i], 1, 0),
                                          mvpGetAbsolutePointer(pPlanes[i], 0, 1),
                                          mvpGetAbsolutePointer(pPlanes[i], 1, 1),
                                          pPlanes[i]->nPaddedWidth, pPlanes[i]->nPaddedHeight,
                                          pPlanes[i]->nPitch, bitsPerSample);
                    } else if (d->nPel == 4) {
                        // 4x4 subpel grid, row-major: y = 0..3, x = 0..3.
                        Merge16PlanesToBig(pDst[i], nDstPitches[i],
                                           mvpGetAbsolutePointer(pPlanes[i], 0, 0),
                                           mvpGetAbsolutePointer(pPlanes[i], 1, 0),
                                           mvpGetAbsolutePointer(pPlanes[i], 2, 0),
                                           mvpGetAbsolutePointer(pPlanes[i], 3, 0),
                                           mvpGetAbsolutePointer(pPlanes[i], 0, 1),
                                           mvpGetAbsolutePointer(pPlanes[i], 1, 1),
                                           mvpGetAbsolutePointer(pPlanes[i], 2, 1),
                                           mvpGetAbsolutePointer(pPlanes[i], 3, 1),
                                           mvpGetAbsolutePointer(pPlanes[i], 0, 2),
                                           mvpGetAbsolutePointer(pPlanes[i], 1, 2),
                                           mvpGetAbsolutePointer(pPlanes[i], 2, 2),
                                           mvpGetAbsolutePointer(pPlanes[i], 3, 2),
                                           mvpGetAbsolutePointer(pPlanes[i], 0, 3),
                                           mvpGetAbsolutePointer(pPlanes[i], 1, 3),
                                           mvpGetAbsolutePointer(pPlanes[i], 2, 3),
                                           mvpGetAbsolutePointer(pPlanes[i], 3, 3),
                                           pPlanes[i]->nPaddedWidth, pPlanes[i]->nPaddedHeight,
                                           pPlanes[i]->nPitch, bitsPerSample);
                    }
                }
            }

            mvgofDeinit(&pRefGOF);
        }

        vsapi->freeFrame(ref);

        return dst;
    }

    return 0;
}


// Filter teardown: release the super clip node and the instance data.
static void VS_CC mvfinestFree(void *instanceData, VSCore *core, const VSAPI *vsapi) {
    (void)core;

    MVFinestData *d = (MVFinestData *)instanceData;

    vsapi->freeNode(d->super);
    free(d);
}


// mv.Finest entry point: validates the super clip, reads the Super_* frame
// properties stamped on its first frame, and creates a filter whose output
// dimensions are the padded super dimensions multiplied by the subpel factor.
static void VS_CC mvfinestCreate(const VSMap *in, VSMap *out, void *userData, VSCore *core, const VSAPI *vsapi) {
    (void)userData;

    MVFinestData d;
    MVFinestData *data;

    int err;

    // opt defaults to 1 (use optimised code paths) when not supplied.
    d.opt = !!vsapi->mapGetInt(in, "opt", 0, &err);
    if (err)
        d.opt = 1;


    d.super = vsapi->mapGetNode(in, "super", 0, 0);
    d.vi = *vsapi->getVideoInfo(d.super);

    if (!vsh_isConstantVideoFormat(&d.vi) || d.vi.format.bitsPerSample > 16 || d.vi.format.sampleType != stInteger || d.vi.format.subSamplingW > 1 || d.vi.format.subSamplingH > 1 || (d.vi.format.colorFamily != cfYUV && d.vi.format.colorFamily != cfGray)) {
        vsapi->mapSetError(out, "Finest: input clip must be GRAY, 420, 422, 440, or 444, up to 16 bits, with constant dimensions.");
        vsapi->freeNode(d.super);
        return;
    }

    // Fetch frame 0 eagerly ("evil") because the Super_* parameters are only
    // available as frame properties, not clip-level metadata.
#define ERROR_SIZE 1024
    char errorMsg[ERROR_SIZE] = "Finest: failed to retrieve first frame from super clip. Error message: ";
    size_t errorLen = strlen(errorMsg);
    const VSFrame *evil = vsapi->getFrame(0, d.super, errorMsg + errorLen, ERROR_SIZE - errorLen);
#undef ERROR_SIZE
    if (!evil) {
        vsapi->mapSetError(out, errorMsg);
        vsapi->freeNode(d.super);
        return;
    }
    const VSMap *props = vsapi->getFramePropertiesRO(evil);
    int evil_err[6];
    d.nHeight = vsapi->mapGetIntSaturated(props, "Super_height", 0, &evil_err[0]);
    d.nSuperHPad = vsapi->mapGetIntSaturated(props, "Super_hpad", 0, &evil_err[1]);
    d.nSuperVPad = vsapi->mapGetIntSaturated(props, "Super_vpad", 0, &evil_err[2]);
    d.nSuperPel = vsapi->mapGetIntSaturated(props, "Super_pel", 0, &evil_err[3]);
    d.nSuperModeYUV = vsapi->mapGetIntSaturated(props, "Super_modeyuv", 0, &evil_err[4]);
    d.nSuperLevels = vsapi->mapGetIntSaturated(props, "Super_levels", 0, &evil_err[5]);
    vsapi->freeFrame(evil);

    // Any missing property means the clip didn't come from mv.Super (or the
    // stamped first frame was trimmed away).
    for (int i = 0; i < 6; i++)
        if (evil_err[i]) {
            vsapi->mapSetError(out, "Finest: required properties not found in first frame of super clip. Maybe clip didn't come from mv.Super? Was the first frame trimmed away?");
            vsapi->freeNode(d.super);
            return;
        }

    d.nPel = d.nSuperPel;
    int nSuperWidth = d.vi.width;
    // Original (unpadded) frame width recovered from the super width.
    d.nWidth = nSuperWidth - 2 * d.nSuperHPad;

    d.xRatioUV = 1 << d.vi.format.subSamplingW;
    d.yRatioUV = 1 << d.vi.format.subSamplingH;

    // Output frame: padded dimensions scaled by the subpel factor.
    d.vi.width = (d.nWidth + 2 * d.nSuperHPad) * d.nSuperPel;
    d.vi.height = (d.nHeight + 2 * d.nSuperVPad) * d.nSuperPel;


    data = (MVFinestData *)malloc(sizeof(d));
    *data = d;

    VSFilterDependency deps[1] = {
        {data->super, rpStrictSpatial},
    };

    vsapi->createVideoFilter(out, "Finest", &data->vi, mvfinestGetFrame, mvfinestFree, fmParallel, deps, ARRAY_SIZE(deps), data, core);
}


// Registers the Finest filter with the plugin.
void mvfinestRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi) {
    vspapi->registerFunction("Finest",
                             "super:vnode;"
                             "opt:int:opt;",
                             "clip:vnode;",
                             mvfinestCreate, 0, plugin);
}
-------------------------------------------------------------------------------- /src/MVFlowFPSHelper.c: --------------------------------------------------------------------------------

// NOTE(review): the system header names were lost in extraction — restore from VCS.
#include
#include

#include "MaskFun.h"
#include "SimpleResize.h"

#include "MVFlowFPSHelper.h"


// Helper filter for mv.FlowFPS: upsamples the per-block motion vectors of the
// vector clip to full-resolution VX/VY maps and attaches them to the frame as
// binary frame properties (prop_VXFullY etc.), so the consumer doesn't redo it.
const VSFrame *VS_CC mvflowfpshelperGetFrame(int n, int activationReason, void *instanceData, void **frameData, VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi) {
    (void)frameData;

    MVFlowFPSHelperData *d = (MVFlowFPSHelperData *)instanceData;

    if (activationReason == arInitial) {
        vsapi->requestFrameFilter(n, d->vectors, frameCtx);
    } else if (activationReason == arAllFramesReady) {
        const VSFrame *src = vsapi->getFrameFilter(n, d->vectors, frameCtx);

        FakeGroupOfPlanes fgop;

        fgopInit(&fgop, &d->vectors_data);

        const VSMap *mvprops = vsapi->getFramePropertiesRO(src);
        // Deserialize the motion vectors stored in the frame property.
        fgopUpdate(&fgop, (const uint8_t *)vsapi->mapGetData(mvprops, prop_MVTools_vectors, 0, NULL));

        // Vectors count as usable unless a scene change was detected
        // (thresholds thscd1/thscd2).
        int isUsable = fgopIsUsable(&fgop, d->thscd1, d->thscd2);

        if (isUsable) {
            VSFrame *dst = vsapi->copyFrame(src, core);
            vsapi->freeFrame(src);

            VSMap *props = vsapi->getFramePropertiesRW(dst);

            const int xRatioUV = d->vectors_data.xRatioUV;
            const int yRatioUV = d->vectors_data.yRatioUV;
            const int nBlkX = d->vectors_data.nBlkX;
            const int nBlkY = d->vectors_data.nBlkY;
            const int nHeightP = d->nHeightP;
            const int nHeightPUV = d->nHeightPUV;
            const int VPitchY = d->VPitchY;
            const int VPitchUV = d->VPitchUV;
            const int nBlkXP = d->nBlkXP;
            const int nBlkYP = d->nBlkYP;
            SimpleResize *upsizer = &d->upsizer;
            SimpleResize *upsizerUV = &d->upsizerUV;

            int full_size_y = nHeightP * VPitchY * sizeof(int16_t);
            int small_size = nBlkXP * nBlkYP * sizeof(int16_t);

            // NOTE(review): malloc results are not checked anywhere in this
            // function; a failed allocation would crash below.
            int16_t *VXFullY = (int16_t *)malloc(full_size_y);
            int16_t *VYFullY = (int16_t *)malloc(full_size_y);
            int16_t *VXSmallY = (int16_t *)malloc(small_size);
            int16_t *VYSmallY = (int16_t *)malloc(small_size);

            // make vector vx and vy small masks
            MakeVectorSmallMasks(&fgop, nBlkX, nBlkY, VXSmallY, nBlkXP, VYSmallY, nBlkXP);

            // Pad the small masks from nBlkX x nBlkY out to nBlkXP x nBlkYP.
            CheckAndPadSmallY(VXSmallY, VYSmallY, nBlkXP, nBlkYP, nBlkX, nBlkY);

            // Upsample block-level vectors to per-pixel resolution.
            upsizer->simpleResize_int16_t(upsizer, VXFullY, VPitchY, VXSmallY, nBlkXP, 1);
            upsizer->simpleResize_int16_t(upsizer, VYFullY, VPitchY, VYSmallY, nBlkXP, 0);

            // mapSetData copies the buffers, so they can be freed right after.
            vsapi->mapSetData(props, prop_VXFullY, (const char *)VXFullY, full_size_y, dtBinary, maReplace);
            vsapi->mapSetData(props, prop_VYFullY, (const char *)VYFullY, full_size_y, dtBinary, maReplace);

            free(VXFullY);
            free(VYFullY);

            if (d->supervi->format.colorFamily != cfGray) {
                int full_size_uv = nHeightPUV * VPitchUV * sizeof(int16_t);

                int16_t *VXFullUV = (int16_t *)malloc(full_size_uv);
                int16_t *VYFullUV = (int16_t *)malloc(full_size_uv);
                int16_t *VXSmallUV = (int16_t *)malloc(small_size);
                int16_t *VYSmallUV = (int16_t *)malloc(small_size);

                // Scale the luma vectors down by the chroma subsampling ratios.
                VectorSmallMaskYToHalfUV(VXSmallY, nBlkXP, nBlkYP, VXSmallUV, xRatioUV);
                VectorSmallMaskYToHalfUV(VYSmallY, nBlkXP, nBlkYP, VYSmallUV, yRatioUV);

                upsizerUV->simpleResize_int16_t(upsizerUV, VXFullUV, VPitchUV, VXSmallUV, nBlkXP, 1);
                upsizerUV->simpleResize_int16_t(upsizerUV, VYFullUV, VPitchUV, VYSmallUV, nBlkXP, 0);

                free(VXSmallUV);
                free(VYSmallUV);

                vsapi->mapSetData(props, prop_VXFullUV, (const char *)VXFullUV, full_size_uv, dtBinary, maReplace);
                vsapi->mapSetData(props, prop_VYFullUV, (const char *)VYFullUV, full_size_uv, dtBinary, maReplace);

                free(VXFullUV);
                free(VYFullUV);
            }

            free(VXSmallY);
            free(VYSmallY);


            fgopDeinit(&fgop);

            return dst;
        } else { // poor estimation
            // Scene change: pass the vector frame through unchanged.
            fgopDeinit(&fgop);

            return src;
        }
    }

    return NULL;
}


// Helper teardown: release the vector clip node and the instance data.
void VS_CC mvflowfpshelperFree(void *instanceData, VSCore *core, const VSAPI *vsapi) {
    (void)core;

    MVFlowFPSHelperData *d = (MVFlowFPSHelperData *)instanceData;

    vsapi->freeNode(d->vectors);

    free(d);
}
-------------------------------------------------------------------------------- /src/MVFlowFPSHelper.h: --------------------------------------------------------------------------------
#ifndef MVFLOWFPSHELPER_H
#define MVFLOWFPSHELPER_H

#ifdef __cplusplus
extern "C" {
#endif

// NOTE(review): the system header name was lost in extraction — restore from VCS.
#include

#include "MVAnalysisData.h"
#include "SimpleResize.h"

// Instance data shared between MVFlowFPS and its helper filter.
typedef struct MVFlowFPSHelperData {
    VSNode *vectors;             // vector clip produced by mv.Analyse
    const VSVideoInfo *vi;

    const VSVideoInfo *supervi;  // video info of the super clip

    int64_t thscd1;              // scene-change detection thresholds
    int thscd2;

    MVAnalysisData vectors_data;

    int nHeightP;
    int nHeightPUV;
    int VPitchY;
    int VPitchUV;
    int nBlkXP;          // padded block-grid dimensions
    int nBlkYP;

    SimpleResize upsizer;    // block-grid -> full-resolution resizers
    SimpleResize upsizerUV;
} MVFlowFPSHelperData;


// Frame property names under which the upsampled vector maps are stored.
static const char prop_VXFullY[] = "VXFullY";
static const char prop_VYFullY[] = "VYFullY";
static const char prop_VXFullUV[] = "VXFullUV";
static const char prop_VYFullUV[] = "VYFullUV";


const VSFrame *VS_CC mvflowfpshelperGetFrame(int n, int activationReason, void *instanceData, void **frameData, VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi);
void VS_CC mvflowfpshelperFree(void *instanceData, VSCore *core, const VSAPI *vsapi);

#ifdef __cplusplus
} // extern "C"
#endif

#endif
-------------------------------------------------------------------------------- /src/MVFrame.h: --------------------------------------------------------------------------------
#ifndef MVTOOLS_MVFRAME_H
#define MVTOOLS_MVFRAME_H

#ifdef __cplusplus
extern "C" {
#endif


// NOTE(review): the system header name was lost in extraction — restore from VCS.
#include

// Bitmask selecting which planes an operation applies to.
typedef enum MVPlaneSet {
    YPLANE = (1 << 0),
    UPLANE = (1 << 1),
    VPLANE = (1 << 2),
    YUPLANES = YPLANE | UPLANE,
    YVPLANES = YPLANE | VPLANE,
    UVPLANES = UPLANE | VPLANE,
    YUVPLANES = YPLANE | UPLANE | VPLANE
} MVPlaneSet;


// Subpel interpolation type used when refining planes.
typedef enum SharpParam {
    SharpBilinear = 0,
    SharpBicubic = 1,
    SharpWiener = 2
} SharpParam;


// Downscaling filter used when building the pyramid levels.
typedef enum RfilterParam {
    RfilterSimple = 0,
    RfilterTriangle = 1,
    RfilterBilinear = 2,
    RfilterQuadratic = 3,
    RfilterCubic = 4
} RfilterParam;


int PlaneHeightLuma(int src_height, int level, int yRatioUV, int vpad);

int PlaneWidthLuma(int src_width, int level, int xRatioUV, int hpad);

unsigned int PlaneSuperOffset(int chroma, int src_height, int level, int pel, int vpad, int plane_pitch, int yRatioUV);


// One plane of one pyramid level, including padding and subpel sub-planes.
typedef struct MVPlane {
    uint8_t **pPlane;    // one pointer per subpel position (nPel * nPel entries)
    int nWidth;
    int nHeight;
    int nPaddedWidth;
    int nPaddedHeight;
    int nPitch;
    int nHPadding;
    int nVPadding;
    int nOffsetPadding;
    int nHPaddingPel;
    int nVPaddingPel;
    int bitsPerSample;
    int bytesPerSample;

    int nPel;

    int opt;

    // Lazy-evaluation flags so padding/refining/filling happen at most once.
    int isPadded;
    int isRefined;
    int isFilled;
} MVPlane;

void mvpInit(MVPlane *mvp, int nWidth, int nHeight, int nPel, int nHPad, int nVPad, int opt, int bitsPerSample);

void mvpDeinit(MVPlane *mvp);

void mvpResetState(MVPlane *mvp);

void mvpUpdate(MVPlane *mvp, uint8_t *pSrc, int _nPitch);

void mvpFillPlane(MVPlane *mvp, const uint8_t *pNewPlane, int nNewPitch);

void mvpPad(MVPlane *mvp);

void mvpRefine(MVPlane *mvp, int sharp);

void mvpRefineExt(MVPlane *mvp, const uint8_t *pSrc2x, int nSrc2xPitch, int isExtPadded);

void mvpReduceTo(MVPlane *mvp, MVPlane *pReducedPlane, int rfilter);

const uint8_t *mvpGetAbsolutePointer(const MVPlane *mvp, int nX, int nY);

const uint8_t *mvpGetAbsolutePointerPel1(const MVPlane *mvp, int nX, int nY);

const uint8_t *mvpGetAbsolutePointerPel2(const MVPlane *mvp, int nX, int nY);

const uint8_t *mvpGetAbsolutePointerPel4(const MVPlane *mvp, int nX, int nY);

const uint8_t *mvpGetPointer(const MVPlane *mvp, int nX, int nY);

const uint8_t *mvpGetPointerPel1(const MVPlane *mvp, int nX, int nY);

const uint8_t *mvpGetPointerPel2(const MVPlane *mvp, int nX, int nY);

const uint8_t *mvpGetPointerPel4(const MVPlane *mvp, int nX, int nY);

const uint8_t *mvpGetAbsolutePelPointer(const MVPlane *mvp, int nX, int nY);


// Up to three MVPlanes (Y, U, V) at one pyramid level.
typedef struct MVFrame {
    MVPlane *planes[3];

    int nMode;
} MVFrame;


void mvfInit(MVFrame *mvf, int nWidth, int nHeight, int nPel, int nHPad, int nVPad, int nMode, int opt, int xRatioUV, int yRatioUV, int bitsPerSample);

void mvfDeinit(MVFrame *mvf);

void mvfUpdate(MVFrame *mvf, uint8_t **pSrc, int *pitch);

void mvfFillPlane(MVFrame *mvf, const uint8_t *pNewPlane, int nNewPitch, int plane);

void mvfRefine(MVFrame *mvf, MVPlaneSet nMode, int sharp);

void mvfPad(MVFrame *mvf, MVPlaneSet nMode);

void mvfResetState(MVFrame *mvf);

void mvfReduceTo(MVFrame *mvf, MVFrame *pFrame, MVPlaneSet nMode, int rfilter);


// The full pyramid: nLevelCount MVFrames of decreasing resolution.
typedef struct MVGroupOfFrames {
    int nLevelCount;
    MVFrame **frames;

    int nWidth[3];
    int nHeight[3];
    int nPel;
    int nHPad[3];
    int nVPad[3];
    int xRatioUV;
    int yRatioUV;
} MVGroupOfFrames;


void mvgofInit(MVGroupOfFrames *mvgof, int nLevelCount, int nWidth, int nHeight, int nPel, int nHPad, int nVPad, int nMode, int opt, int xRatioUV, int yRatioUV, int bitsPerSample);

void mvgofDeinit(MVGroupOfFrames *mvgof);

void mvgofUpdate(MVGroupOfFrames *mvgof, uint8_t **pSrc, int *pitch);

MVFrame *mvgofGetFrame(MVGroupOfFrames *mvgof, int nLevel);

void mvgofSetPlane(MVGroupOfFrames *mvgof, const uint8_t *pNewSrc, int nNewPitch, int plane);

void mvgofRefine(MVGroupOfFrames *mvgof, MVPlaneSet nMode, int sharp);

void mvgofPad(MVGroupOfFrames *mvgof, MVPlaneSet nMode);

void mvgofReduce(MVGroupOfFrames *mvgof, MVPlaneSet nMode, int rfilter);

void mvgofResetState(MVGroupOfFrames *mvgof);

#ifdef __cplusplus
} // extern "C"
#endif

#endif // MVTOOLS_MVFRAME_H
-------------------------------------------------------------------------------- /src/MVFrame_AVX2.cpp: --------------------------------------------------------------------------------
#if defined(MVTOOLS_X86)

// NOTE(review): the system header names were lost in extraction — restore from VCS.
#include
#include

#define zeroes
_mm256_setzero_si256()

/* TODO: port these
extern "C" void VerticalBicubic_iSSE(uint8_t *pDst, const uint8_t *pSrc, intptr_t nDstPitch,
                                     intptr_t nWidth, intptr_t nHeight);
extern "C" void HorizontalBicubic_iSSE(uint8_t *pDst, const uint8_t *pSrc, intptr_t nDstPitch,
                                       intptr_t nWidth, intptr_t nHeight);
extern "C" void RB2F_iSSE(uint8_t *pDst, const uint8_t *pSrc, intptr_t nDstPitch,
                          intptr_t nSrcPitch, intptr_t nWidth, intptr_t nHeight);
extern "C" void RB2FilteredVerticalLine_SSE(uint8_t *pDst, const uint8_t *pSrc, intptr_t nSrcPitch, intptr_t nWidthMMX);
extern "C" void RB2FilteredHorizontalInplaceLine_SSE(uint8_t *pSrc, intptr_t nWidthMMX);
*/

// pDst = rounded average of pSrc1 and pSrc2, 8-bit pixels, all three sharing
// one pitch. NOTE(review): x advances by 32 regardless of nWidth, so loads and
// stores can run past nWidth — presumably safe because plane pitches include
// padding; confirm against mvpInit's allocation.
void Average2_avx2(uint8_t *pDst, const uint8_t *pSrc1, const uint8_t *pSrc2, intptr_t nPitch, intptr_t nWidth, intptr_t nHeight) {
    for (int y = 0; y < nHeight; y++) {
        for (int x = 0; x < nWidth; x += 32) {
            __m256i m0 = _mm256_loadu_si256((const __m256i *)&pSrc1[x]);
            __m256i m1 = _mm256_loadu_si256((const __m256i *)&pSrc2[x]);

            m0 = _mm256_avg_epu8(m0, m1);
            _mm256_storeu_si256((__m256i *)&pDst[x], m0);
        }

        pSrc1 += nPitch;
        pSrc2 += nPitch;
        pDst += nPitch;
    }
}


// Half-pel vertical bilinear: each output row is the rounded average of the
// source row and the row below; the last row is copied as-is. 8-bit only
// (bitsPerSample is ignored).
void VerticalBilinear_avx2(uint8_t *pDst, const uint8_t *pSrc, intptr_t nPitch,
                           intptr_t nWidth, intptr_t nHeight, intptr_t bitsPerSample) {
    (void)bitsPerSample;

    for (int y = 0; y < nHeight - 1; y++) {
        for (int x = 0; x < nWidth; x += 32) {
            __m256i m0 = _mm256_loadu_si256((const __m256i *)&pSrc[x]);
            __m256i m1 = _mm256_loadu_si256((const __m256i *)&pSrc[x + nPitch]);

            m0 = _mm256_avg_epu8(m0, m1);
            _mm256_storeu_si256((__m256i *)&pDst[x], m0);
        }

        pSrc += nPitch;
        pDst += nPitch;
    }

    // Bottom row has no row below it: copy unchanged.
    for (int x = 0; x < nWidth; x++)
        pDst[x] = pSrc[x];
}


// Half-pel horizontal bilinear: each output pixel is the rounded average of
// the pixel and its right neighbour; the last column is copied as-is.
void HorizontalBilinear_avx2(uint8_t *pDst, const uint8_t *pSrc, intptr_t nPitch,
                             intptr_t nWidth, intptr_t nHeight, intptr_t bitsPerSample) {
    (void)bitsPerSample;

    for (int y = 0; y < nHeight; y++) {
        for (int x = 0; x < nWidth; x += 32) {
            __m256i m0 = _mm256_loadu_si256((const __m256i *)&pSrc[x]);
            __m256i m1 = _mm256_loadu_si256((const __m256i *)&pSrc[x + 1]);

            m0 = _mm256_avg_epu8(m0, m1);
            _mm256_storeu_si256((__m256i *)&pDst[x], m0);
        }

        // Rightmost pixel has no right neighbour: copy unchanged.
        pDst[nWidth - 1] = pSrc[nWidth - 1];

        pSrc += nPitch;
        pDst += nPitch;
    }
}


// Half-pel diagonal bilinear: (a + b + c + d + 2) >> 2 over the 2x2
// neighbourhood, computed in 16-bit to avoid overflow, then packed back to
// 8-bit (packus + permute4x64 compacts lanes 0 and 2 of the 256-bit result).
// Last column uses a vertical 2-tap average, last row a horizontal one.
void DiagonalBilinear_avx2(uint8_t *pDst, const uint8_t *pSrc, intptr_t nPitch,
                           intptr_t nWidth, intptr_t nHeight, intptr_t bitsPerSample) {
    (void)bitsPerSample;

    for (int y = 0; y < nHeight - 1; y++) {
        for (int x = 0; x < nWidth; x += 16) {
            __m256i m0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x]));
            __m256i m1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x + 1]));
            __m256i m2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x + nPitch]));
            __m256i m3 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x + nPitch + 1]));

            m0 = _mm256_add_epi16(m0, m1);
            m2 = _mm256_add_epi16(m2, m3);
            m0 = _mm256_add_epi16(m0, _mm256_set1_epi16(2)); // rounding bias
            m0 = _mm256_add_epi16(m0, m2);

            m0 = _mm256_srli_epi16(m0, 2);

            m0 = _mm256_packus_epi16(m0, m0);
            m0 = _mm256_permute4x64_epi64(m0, _MM_SHUFFLE(0, 0, 2, 0));
            _mm_storeu_si128((__m128i *)&pDst[x], _mm256_castsi256_si128(m0));
        }

        pDst[nWidth - 1] = (pSrc[nWidth - 1] + pSrc[nWidth - 1 + nPitch] + 1) >> 1;

        pSrc += nPitch;
        pDst += nPitch;
    }

    // Bottom row: only the horizontal neighbour is available.
    for (int x = 0; x < nWidth; x += 32) {
        __m256i m0 = _mm256_loadu_si256((const __m256i *)&pSrc[x]);
        __m256i m1 = _mm256_loadu_si256((const __m256i *)&pSrc[x + 1]);

        m0 = _mm256_avg_epu8(m0, m1);
        _mm256_storeu_si256((__m256i *)&pDst[x], m0);
    }

    pDst[nWidth - 1] = pSrc[nWidth - 1];
}

// Half-pel vertical 6-tap Wiener filter: rows [2, nHeight-4) use the kernel
// (1, -5, 20, 20, -5, 1) / 32 (the arithmetic below computes
// 5*(4*(c0+c1) - (p1+n1)) + (p2+n2) + 16, then >> 5); the top two and bottom
// three rows fall back to a 2-tap average, and the final row is copied.
void VerticalWiener_avx2(uint8_t *pDst, const uint8_t *pSrc, intptr_t nPitch,
                         intptr_t nWidth, intptr_t nHeight, intptr_t bitsPerSample) {
    (void)bitsPerSample;

    for (int y = 0; y < 2; y++) {
        for (int x = 0; x < nWidth; x += 32) {
            __m256i m0 = _mm256_loadu_si256((const __m256i *)&pSrc[x]);
            __m256i m1 = _mm256_loadu_si256((const __m256i *)&pSrc[x + nPitch]);

            m0 = _mm256_avg_epu8(m0, m1);
            _mm256_storeu_si256((__m256i *)&pDst[x], m0);
        }

        pSrc += nPitch;
        pDst += nPitch;
    }

    for (int y = 2; y < nHeight - 4; y++) {
        for (int x = 0; x < nWidth; x += 16) {
            __m256i m0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x - nPitch * 2]));
            __m256i m1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x - nPitch]));
            __m256i m2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x]));
            __m256i m3 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x + nPitch]));
            __m256i m4 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x + nPitch * 2]));
            __m256i m5 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x + nPitch * 3]));

            m2 = _mm256_add_epi16(m2, m3);
            m2 = _mm256_slli_epi16(m2, 2);      // 4 * (c0 + c1)

            m1 = _mm256_add_epi16(m1, m4);      // p1 + n1

            m2 = _mm256_sub_epi16(m2, m1);
            m3 = _mm256_slli_epi16(m2, 2);
            m2 = _mm256_add_epi16(m2, m3);      // 5 * (4*(c0+c1) - (p1+n1))

            m0 = _mm256_add_epi16(m0, m5);      // p2 + n2
            m0 = _mm256_add_epi16(m0, m2);
            m0 = _mm256_add_epi16(m0, _mm256_set1_epi16(16)); // rounding bias

            m0 = _mm256_srai_epi16(m0, 5);
            m0 = _mm256_packus_epi16(m0, m0);
            m0 = _mm256_permute4x64_epi64(m0, _MM_SHUFFLE(0, 0, 2, 0));
            _mm_storeu_si128((__m128i *)&pDst[x], _mm256_castsi256_si128(m0));
        }

        pSrc += nPitch;
        pDst += nPitch;
    }

    for (int y = nHeight - 4; y < nHeight - 1; y++) {
        for (int x = 0; x < nWidth; x += 32) {
            __m256i m0 = _mm256_loadu_si256((const __m256i *)&pSrc[x]);
            __m256i m1 = _mm256_loadu_si256((const __m256i *)&pSrc[x + nPitch]);

            m0 = _mm256_avg_epu8(m0, m1);
            _mm256_storeu_si256((__m256i *)&pDst[x], m0);
        }

        pSrc += nPitch;
        pDst += nPitch;
    }

    // Bottom row copied unchanged.
    for (int x = 0; x < nWidth; x++)
        pDst[x] = pSrc[x];
}


// Half-pel horizontal 6-tap Wiener filter: same (1, -5, 20, 20, -5, 1) / 32
// kernel as VerticalWiener_avx2 but along x; the first two and last few
// columns fall back to a 2-tap average, and the last column is copied.
void HorizontalWiener_avx2(uint8_t *pDst, const uint8_t *pSrc, intptr_t nPitch,
                           intptr_t nWidth, intptr_t nHeight, intptr_t bitsPerSample) {
    (void)bitsPerSample;

    for (int y = 0; y < nHeight; y++) {
        pDst[0] = (pSrc[0] + pSrc[1] + 1) >> 1;
        pDst[1] = (pSrc[1] + pSrc[2] + 1) >> 1;

        for (int x = 2; x < nWidth - 4; x += 16) {
            __m256i m0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x - 2]));
            __m256i m1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x - 1]));
            __m256i m2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x]));
            __m256i m3 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x + 1]));
            __m256i m4 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x + 2]));
            __m256i m5 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)&pSrc[x + 3]));

            m2 = _mm256_add_epi16(m2, m3);
            m2 = _mm256_slli_epi16(m2, 2);

            m1 = _mm256_add_epi16(m1, m4);

            m2 = _mm256_sub_epi16(m2, m1);
            m3 = _mm256_slli_epi16(m2, 2);
            m2 = _mm256_add_epi16(m2, m3);

            m0 = _mm256_add_epi16(m0, m5);
            m0 = _mm256_add_epi16(m0, m2);
            m0 = _mm256_add_epi16(m0, _mm256_set1_epi16(16));

            m0 = _mm256_srai_epi16(m0, 5);
            m0 = _mm256_packus_epi16(m0, m0);
            m0 = _mm256_permute4x64_epi64(m0, _MM_SHUFFLE(0, 0, 2, 0));
            _mm_storeu_si128((__m128i *)&pDst[x], _mm256_castsi256_si128(m0));
        }

        // NOTE(review): the vector loop may already have covered some of
        // these columns (it steps by 16 from x = 2), in which case this
        // 2-tap pass overwrites them — matches the original behavior.
        for (int x = nWidth - 4; x < nWidth - 1; x++)
            pDst[x] = (pSrc[x] + pSrc[x + 1] + 1) >> 1;

        pDst[nWidth - 1] = pSrc[nWidth - 1];

        pDst += nPitch;
        pSrc += nPitch;
    }
}

#endif // MVTOOLS_X86
-------------------------------------------------------------------------------- /src/MVSCDetection.c: --------------------------------------------------------------------------------
// Author: Manao
// Copyright(c)2006 A.G.Balakhnin aka Fizick - YUY2
// See legal notice in Copying.txt for more information
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
// http://www.gnu.org/copyleft/gpl.html .
19 | 20 | #include 21 | #include 22 | 23 | #include "Fakery.h" 24 | #include "MVAnalysisData.h" 25 | #include "CommonMacros.h" 26 | 27 | 28 | 29 | 30 | typedef struct MVSCDetectionData { 31 | VSNode *node; 32 | const VSVideoInfo *vi; 33 | 34 | VSNode *vectors; 35 | 36 | int64_t thscd1; 37 | int thscd2; 38 | 39 | MVAnalysisData vectors_data; 40 | } MVSCDetectionData; 41 | 42 | 43 | static const VSFrame *VS_CC mvscdetectionGetFrame(int n, int activationReason, void *instanceData, void **frameData, VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi) { 44 | (void)frameData; 45 | 46 | MVSCDetectionData *d = (MVSCDetectionData *)instanceData; 47 | 48 | if (activationReason == arInitial) { 49 | vsapi->requestFrameFilter(n, d->vectors, frameCtx); 50 | vsapi->requestFrameFilter(n, d->node, frameCtx); 51 | } else if (activationReason == arAllFramesReady) { 52 | const VSFrame *src = vsapi->getFrameFilter(n, d->node, frameCtx); 53 | VSFrame *dst = vsapi->copyFrame(src, core); 54 | vsapi->freeFrame(src); 55 | 56 | const VSFrame *mvn = vsapi->getFrameFilter(n, d->vectors, frameCtx); 57 | FakeGroupOfPlanes fgop; 58 | fgopInit(&fgop, &d->vectors_data); 59 | const VSMap *mvprops = vsapi->getFramePropertiesRO(mvn); 60 | fgopUpdate(&fgop, (const uint8_t *)vsapi->mapGetData(mvprops, prop_MVTools_vectors, 0, NULL)); 61 | vsapi->freeFrame(mvn); 62 | 63 | const char *propNames[2] = { "_SceneChangePrev", "_SceneChangeNext" }; 64 | VSMap *props = vsapi->getFramePropertiesRW(dst); 65 | vsapi->mapSetInt(props, propNames[!!d->vectors_data.isBackward], !fgopIsUsable(&fgop, d->thscd1, d->thscd2), maReplace); 66 | 67 | fgopDeinit(&fgop); 68 | 69 | return dst; 70 | } 71 | 72 | return NULL; 73 | } 74 | 75 | 76 | static void VS_CC mvscdetectionFree(void *instanceData, VSCore *core, const VSAPI *vsapi) { 77 | (void)core; 78 | 79 | MVSCDetectionData *d = (MVSCDetectionData *)instanceData; 80 | 81 | vsapi->freeNode(d->node); 82 | vsapi->freeNode(d->vectors); 83 | free(d); 84 | } 85 | 86 | 87 | 
static void VS_CC mvscdetectionCreate(const VSMap *in, VSMap *out, void *userData, VSCore *core, const VSAPI *vsapi) { 88 | (void)userData; 89 | 90 | MVSCDetectionData d; 91 | MVSCDetectionData *data; 92 | 93 | int err; 94 | 95 | d.thscd1 = vsapi->mapGetInt(in, "thscd1", 0, &err); 96 | if (err) 97 | d.thscd1 = MV_DEFAULT_SCD1; 98 | 99 | d.thscd2 = vsapi->mapGetIntSaturated(in, "thscd2", 0, &err); 100 | if (err) 101 | d.thscd2 = MV_DEFAULT_SCD2; 102 | 103 | 104 | d.vectors = vsapi->mapGetNode(in, "vectors", 0, NULL); 105 | 106 | 107 | #define ERROR_SIZE 512 108 | char error[ERROR_SIZE + 1] = { 0 }; 109 | const char *filter_name = "SCDetection"; 110 | 111 | adataFromVectorClip(&d.vectors_data, d.vectors, filter_name, "vectors", vsapi, error, ERROR_SIZE); 112 | 113 | scaleThSCD(&d.thscd1, &d.thscd2, &d.vectors_data, filter_name, error, ERROR_SIZE); 114 | #undef ERROR_SIZE 115 | 116 | if (error[0]) { 117 | vsapi->mapSetError(out, error); 118 | 119 | vsapi->freeNode(d.vectors); 120 | return; 121 | } 122 | 123 | 124 | d.node = vsapi->mapGetNode(in, "clip", 0, NULL); 125 | d.vi = vsapi->getVideoInfo(d.node); 126 | 127 | 128 | data = (MVSCDetectionData *)malloc(sizeof(d)); 129 | *data = d; 130 | 131 | VSFilterDependency deps[2] = { 132 | {data->node, rpStrictSpatial}, 133 | {data->vectors, rpStrictSpatial}, 134 | }; 135 | vsapi->createVideoFilter(out, "SCDetection", data->vi, mvscdetectionGetFrame, mvscdetectionFree, fmParallel, deps, ARRAY_SIZE(deps), data, core); 136 | } 137 | 138 | 139 | void mvscdetectionRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi) { 140 | vspapi->registerFunction("SCDetection", 141 | "clip:vnode;" 142 | "vectors:vnode;" 143 | "thscd1:int:opt;" 144 | "thscd2:int:opt;", 145 | "clip:vnode;", 146 | mvscdetectionCreate, 0, plugin); 147 | } 148 | -------------------------------------------------------------------------------- /src/MVSuper.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 
#include

#include
#include

#include "MVFrame.h"
#include "CommonMacros.h"



// Instance data for mv.Super.
typedef struct MVSuperData {
    VSNode *node;
    VSVideoInfo vi;

    VSNode *pelclip; // upsized source clip with doubled frame width and heigth (used for pel=2)

    int nHPad;       // horizontal/vertical padding around each level
    int nVPad;
    int nPel;        // subpel accuracy: 1, 2 or 4
    int nLevels;     // number of pyramid levels
    int sharp;       // subpel interpolation type (SharpParam)
    int rfilter; // frame reduce filter mode
    int opt;

    int nWidth;      // original clip dimensions
    int nHeight;

    int yRatioUV;    // chroma subsampling ratios
    int xRatioUV;
    int chroma;
    int usePelClip;
    int nSuperWidth;  // output (super) frame dimensions
    int nSuperHeight;

    MVPlaneSet nModeYUV;

    int isPelClipPadded;
} MVSuperData;


// Builds one super frame: fills level 0 from the source frame, reduces it
// into the coarser pyramid levels, pads every level, and refines the subpel
// planes (either computed with the configured `sharp` filter or taken from
// the user-supplied pelclip). All levels live packed inside one tall frame.
static const VSFrame *VS_CC mvsuperGetFrame(int n, int activationReason, void *instanceData, void **frameData, VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi) {
    (void)frameData;

    MVSuperData *d = (MVSuperData *)instanceData;

    if (activationReason == arInitial) {
        vsapi->requestFrameFilter(n, d->node, frameCtx);
        if (d->usePelClip)
            vsapi->requestFrameFilter(n, d->pelclip, frameCtx);
    } else if (activationReason == arAllFramesReady) {
        const VSFrame *src = vsapi->getFrameFilter(n, d->node, frameCtx);

        const uint8_t *pSrc[3] = { NULL };
        uint8_t *pDst[3] = { NULL };
        const uint8_t *pSrcPel[3] = { NULL };
        int nSrcPitch[3] = { 0 };
        int nDstPitch[3] = { 0 };
        int nSrcPelPitch[3] = { 0 };

        const VSFrame *srcPel = NULL;
        if (d->usePelClip)
            srcPel = vsapi->getFrameFilter(n, d->pelclip, frameCtx);

        VSFrame *dst = vsapi->newVideoFrame(&d->vi.format, d->vi.width, d->vi.height, src, core);

        for (int plane = 0; plane < d->vi.format.numPlanes; plane++) {
            pSrc[plane] = vsapi->getReadPtr(src, plane);
            nSrcPitch[plane] = vsapi->getStride(src, plane);

            pDst[plane] = vsapi->getWritePtr(dst, plane);
            nDstPitch[plane] = vsapi->getStride(dst, plane);

            // Zero the whole destination first; not every byte of the super
            // frame is written by the fill/reduce/pad steps below.
            memset(pDst[plane], 0, nDstPitch[plane] * vsapi->getFrameHeight(dst, plane));
        }

        // The GOF points directly into the destination frame's memory.
        MVGroupOfFrames pSrcGOF;
        mvgofInit(&pSrcGOF, d->nLevels, d->nWidth, d->nHeight, d->nPel, d->nHPad, d->nVPad, d->nModeYUV, d->opt, d->xRatioUV, d->yRatioUV, d->vi.format.bitsPerSample);

        mvgofUpdate(&pSrcGOF, pDst, nDstPitch);

        MVPlaneSet planes[3] = { YPLANE, UPLANE, VPLANE };

        for (int plane = 0; plane < d->vi.format.numPlanes; plane++)
            mvfFillPlane(pSrcGOF.frames[0], pSrc[plane], nSrcPitch[plane], plane);

        mvgofReduce(&pSrcGOF, d->nModeYUV, d->rfilter);
        mvgofPad(&pSrcGOF, d->nModeYUV);

        if (d->usePelClip) {
            // Take the subpel planes from the user-supplied pelclip instead
            // of interpolating them.
            MVFrame *srcFrames = pSrcGOF.frames[0];

            for (int plane = 0; plane < d->vi.format.numPlanes; plane++) {
                pSrcPel[plane] = vsapi->getReadPtr(srcPel, plane);
                nSrcPelPitch[plane] = vsapi->getStride(srcPel, plane);

                MVPlane *srcPlane = srcFrames->planes[plane];
                if (d->nModeYUV & planes[plane])
                    mvpRefineExt(srcPlane, pSrcPel[plane], nSrcPelPitch[plane], d->isPelClipPadded);
            }
        } else
            mvgofRefine(&pSrcGOF, d->nModeYUV, d->sharp);

        vsapi->freeFrame(src);
        if (d->usePelClip)
            vsapi->freeFrame(srcPel);

        mvgofDeinit(&pSrcGOF);

        // Stamp the Super_* parameters on frame 0 only; downstream filters
        // (e.g. mv.Finest, mv.Analyse) read them from there.
        if (n == 0) {
            VSMap *props = vsapi->getFramePropertiesRW(dst);

            vsapi->mapSetInt(props, "Super_height", d->nHeight, maReplace);
            vsapi->mapSetInt(props, "Super_hpad", d->nHPad, maReplace);
            vsapi->mapSetInt(props, "Super_vpad", d->nVPad, maReplace);
            vsapi->mapSetInt(props, "Super_pel", d->nPel, maReplace);
            vsapi->mapSetInt(props, "Super_modeyuv", d->nModeYUV, maReplace);
            vsapi->mapSetInt(props, "Super_levels", d->nLevels, maReplace);
        }

        return dst;
    }

    return 0;
}


// Filter teardown: release both nodes and the instance data.
static void VS_CC mvsuperFree(void *instanceData, VSCore *core, const VSAPI *vsapi) {
    (void)core;

    MVSuperData
/* Free callback for the Super filter: releases the clip nodes acquired in
 * mvsuperCreate and the instance data itself. */
static void VS_CC mvsuperFree(void *instanceData, VSCore *core, const VSAPI *vsapi) {
    (void)core;

    MVSuperData *d = (MVSuperData *)instanceData;

    vsapi->freeNode(d->node);
    vsapi->freeNode(d->pelclip); // may be NULL when no pelclip was given -- presumably freeNode(NULL) is a no-op; TODO confirm against the VS API
    free(d);
}


/* Creation callback for the Super filter.
 * Reads the user parameters (with defaults), validates them, computes the
 * dimensions of the "super" frame (padded, possibly pel-refined, with all
 * pyramid levels stacked vertically), and registers the video filter. */
static void VS_CC mvsuperCreate(const VSMap *in, VSMap *out, void *userData, VSCore *core, const VSAPI *vsapi) {
    (void)userData;

    MVSuperData d;
    MVSuperData *data;

    int err;

    /* Optional parameters with their defaults. */
    d.nHPad = vsapi->mapGetIntSaturated(in, "hpad", 0, &err);
    if (err)
        d.nHPad = 16;

    d.nVPad = vsapi->mapGetIntSaturated(in, "vpad", 0, &err);
    if (err)
        d.nVPad = 16;

    d.nPel = vsapi->mapGetIntSaturated(in, "pel", 0, &err);
    if (err)
        d.nPel = 2;

    /* On error this is 0, which means "use the maximum" (see nLevelsMax below). */
    d.nLevels = vsapi->mapGetIntSaturated(in, "levels", 0, &err);

    d.chroma = !!vsapi->mapGetInt(in, "chroma", 0, &err);
    if (err)
        d.chroma = 1;

    d.sharp = vsapi->mapGetIntSaturated(in, "sharp", 0, &err); // pel2 interpolation type
    if (err)
        d.sharp = SharpWiener;

    d.rfilter = vsapi->mapGetIntSaturated(in, "rfilter", 0, &err);
    if (err)
        d.rfilter = RfilterBilinear;

    d.opt = !!vsapi->mapGetInt(in, "opt", 0, &err);
    if (err)
        d.opt = 1;


    /* Parameter validation. */
    if ((d.nPel != 1) && (d.nPel != 2) && (d.nPel != 4)) {
        vsapi->mapSetError(out, "Super: pel must be 1, 2, or 4.");
        return;
    }

    if (d.sharp < SharpBilinear || d.sharp > SharpWiener) {
        vsapi->mapSetError(out, "Super: sharp must be between 0 and 2 (inclusive).");
        return;
    }

    if (d.rfilter < RfilterSimple || d.rfilter > RfilterCubic) {
        vsapi->mapSetError(out, "Super: rfilter must be between 0 and 4 (inclusive).");
        return;
    }


    d.node = vsapi->mapGetNode(in, "clip", 0, 0);

    // Make a copy of the video info, so we can reference
    // it and modify it below.
    d.vi = *vsapi->getVideoInfo(d.node);

    d.nWidth = d.vi.width;
    d.nHeight = d.vi.height;

    /* Input format validation: integer samples up to 16 bits, GRAY or YUV
     * with at most 2x subsampling in each direction, constant format/size. */
    if (!vsh_isConstantVideoFormat(&d.vi) || d.vi.format.bitsPerSample > 16 || d.vi.format.sampleType != stInteger ||
        d.vi.format.subSamplingW > 1 || d.vi.format.subSamplingH > 1 || (d.vi.format.colorFamily != cfYUV && d.vi.format.colorFamily != cfGray)) {
        vsapi->mapSetError(out, "Super: input clip must be GRAY, 420, 422, 440, or 444, up to 16 bits, with constant dimensions.");
        vsapi->freeNode(d.node);
        return;
    }

    if (d.vi.format.colorFamily == cfGray)
        d.chroma = 0;

    d.nModeYUV = d.chroma ? YUVPLANES : YPLANE;


    d.xRatioUV = 1 << d.vi.format.subSamplingW;
    d.yRatioUV = 1 << d.vi.format.subSamplingH;

    /* Count how many pyramid levels fit before a plane becomes too small. */
    int nLevelsMax = 0;
    while (PlaneHeightLuma(d.vi.height, nLevelsMax, d.yRatioUV, d.nVPad) >= d.yRatioUV * 2 &&
           PlaneWidthLuma(d.vi.width, nLevelsMax, d.xRatioUV, d.nHPad) >= d.xRatioUV * 2) // at least two pixels width and height of chroma
    {
        nLevelsMax++;
    }
    if (d.nLevels <= 0 || d.nLevels > nLevelsMax)
        d.nLevels = nLevelsMax;

    d.pelclip = vsapi->mapGetNode(in, "pelclip", 0, &err);
    const VSVideoInfo *pelvi = d.pelclip ? vsapi->getVideoInfo(d.pelclip) : NULL;

    if (d.pelclip && (!vsh_isConstantVideoFormat(pelvi) || !vsh_isSameVideoFormat(&pelvi->format, &d.vi.format))) {
        vsapi->mapSetError(out, "Super: pelclip must have the same format as the input clip, and it must have constant dimensions.");
        vsapi->freeNode(d.node);
        vsapi->freeNode(d.pelclip);
        return;
    }

    /* pelclip is accepted either at nPel times the clip's size (unpadded)
     * or at nPel times the padded size. */
    d.usePelClip = 0;
    if (d.pelclip && (d.nPel >= 2)) {
        if ((pelvi->width == d.vi.width * d.nPel) &&
            (pelvi->height == d.vi.height * d.nPel)) {
            d.usePelClip = 1;
            d.isPelClipPadded = 0;
        } else if ((pelvi->width == (d.vi.width + d.nHPad * 2) * d.nPel) &&
                   (pelvi->height == (d.vi.height + d.nVPad * 2) * d.nPel)) {
            d.usePelClip = 1;
            d.isPelClipPadded = 1;
        } else {
            vsapi->mapSetError(out, "Super: pelclip's dimensions must be multiples of the input clip's dimensions.");
            vsapi->freeNode(d.pelclip);
            vsapi->freeNode(d.node);
            return;
        }
    }

    /* The super frame is as wide as the padded source; its height covers all
     * levels stacked (PlaneSuperOffset past the last level / width). */
    d.nSuperWidth = d.nWidth + 2 * d.nHPad;
    d.nSuperHeight = PlaneSuperOffset(0, d.nHeight, d.nLevels, d.nPel, d.nVPad, d.nSuperWidth, d.yRatioUV) / d.nSuperWidth;
    if (d.yRatioUV == 2 && d.nSuperHeight & 1)
        d.nSuperHeight++; // even
    if (d.xRatioUV == 2 && d.nSuperWidth & 1)
        d.nSuperWidth++;
    d.vi.width = d.nSuperWidth;
    d.vi.height = d.nSuperHeight;


    data = (MVSuperData *)malloc(sizeof(d));
    *data = d;

    VSFilterDependency deps[1] = {
        {data->node, rpStrictSpatial}
    };

    vsapi->createVideoFilter(out, "Super", &data->vi, mvsuperGetFrame, mvsuperFree, fmParallel, deps, ARRAY_SIZE(deps), data, core);
}


/* Registers the Super filter with the plugin. */
void mvsuperRegister(VSPlugin *plugin, const VSPLUGINAPI *vspapi) {
    vspapi->registerFunction("Super",
                             "clip:vnode;"
                             "hpad:int:opt;"
                             "vpad:int:opt;"
                             "pel:int:opt;"
                             "levels:int:opt;"
                             "chroma:int:opt;"
                             "sharp:int:opt;"
                             "rfilter:int:opt;"
                             "pelclip:vnode:opt;"
                             "opt:int:opt;",
                             "clip:vnode;",
                             mvsuperCreate, 0, plugin);
}
// Create an overlay mask with the motion vectors

// See legal notice in Copying.txt for more information

// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
// http://www.gnu.org/copyleft/gpl.html .
19 | 20 | #ifndef MASKFUN_H 21 | #define MASKFUN_H 22 | 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #endif 26 | 27 | #include 28 | 29 | #include "Fakery.h" 30 | #include "MVFrame.h" 31 | 32 | void CheckAndPadSmallY(int16_t *VXSmallY, int16_t *VYSmallY, int nBlkXP, int nBlkYP, int nBlkX, int nBlkY); 33 | 34 | void CheckAndPadMaskSmall(uint8_t *MaskSmall, int nBlkXP, int nBlkYP, int nBlkX, int nBlkY); 35 | 36 | void MakeVectorOcclusionMaskTime(const FakeGroupOfPlanes *fgop, int isBackward, int nBlkX, int nBlkY, double dMaskNormDivider, double fGamma, int nPel, uint8_t *occMask, int occMaskPitch, int time256, int nBlkStepX, int nBlkStepY); 37 | 38 | void MakeSADMaskTime(const FakeGroupOfPlanes *fgop, int nBlkX, int nBlkY, double dSADNormFactor, double fGamma, int nPel, uint8_t *Mask, int MaskPitch, int time256, int nBlkStepX, int nBlkStepY, int bitsPerSample); 39 | 40 | void MakeVectorSmallMasks(const FakeGroupOfPlanes *fgop, int nX, int nY, int16_t *VXSmallY, int pitchVXSmallY, int16_t *VYSmallY, int pitchVYSmallY); 41 | void VectorSmallMaskYToHalfUV(int16_t *VSmallY, int nBlkX, int nBlkY, int16_t *VSmallUV, int ratioUV); 42 | 43 | void Merge4PlanesToBig(uint8_t *pel2Plane, int pel2Pitch, const uint8_t *pPlane0, const uint8_t *pPlane1, 44 | const uint8_t *pPlane2, const uint8_t *pPlane3, int width, int height, int pitch, int bitsPerSample); 45 | 46 | void Merge16PlanesToBig(uint8_t *pel4Plane, int pel4Pitch, 47 | const uint8_t *pPlane0, const uint8_t *pPlane1, const uint8_t *pPlane2, const uint8_t *pPlane3, 48 | const uint8_t *pPlane4, const uint8_t *pPlane5, const uint8_t *pPlane6, const uint8_t *pPlane7, 49 | const uint8_t *pPlane8, const uint8_t *pPlane9, const uint8_t *pPlane10, const uint8_t *pPlane11, 50 | const uint8_t *pPlane12, const uint8_t *pPlane13, const uint8_t *pPlane14, const uint8_t *pPlane15, 51 | int width, int height, int pitch, int bitsPerSample); 52 | 53 | uint8_t SADToMask(unsigned int sad, unsigned int sadnorm1024); 54 | 55 | void 
Blend(uint8_t *pdst, const uint8_t *psrc, const uint8_t *pref, int height, int width, int dst_pitch, int src_pitch, int ref_pitch, int time256, int bitsPerSample); 56 | 57 | 58 | typedef void (*FlowInterSimpleFunction)( 59 | uint8_t *pdst, int dst_pitch, 60 | const uint8_t *prefB, const uint8_t *prefF, int ref_pitch, 61 | const int16_t *VXFullB, const int16_t *VXFullF, 62 | const int16_t *VYFullB, const int16_t *VYFullF, 63 | const uint8_t *MaskB, const uint8_t *MaskF, int VPitch, 64 | int width, int height, 65 | int time256, int nPel); 66 | 67 | typedef void (*FlowInterFunction)( 68 | uint8_t *pdst, int dst_pitch, 69 | const uint8_t *prefB, const uint8_t *prefF, int ref_pitch, 70 | const int16_t *VXFullB, const int16_t *VXFullF, 71 | const int16_t *VYFullB, const int16_t *VYFullF, 72 | const uint8_t *MaskB, const uint8_t *MaskF, int VPitch, 73 | int width, int height, 74 | int time256, int nPel); 75 | 76 | typedef void (*FlowInterExtraFunction)( 77 | uint8_t *pdst, int dst_pitch, 78 | const uint8_t *prefB, const uint8_t *prefF, int ref_pitch, 79 | const int16_t *VXFullB, const int16_t *VXFullF, 80 | const int16_t *VYFullB, const int16_t *VYFullF, 81 | const uint8_t *MaskB, const uint8_t *MaskF, int VPitch, 82 | int width, int height, 83 | int time256, int nPel, 84 | const int16_t *VXFullBB, const int16_t *VXFullFF, 85 | const int16_t *VYFullBB, const int16_t *VYFullFF); 86 | 87 | void selectFlowInterFunctions(FlowInterSimpleFunction *simple, FlowInterFunction *regular, FlowInterExtraFunction *extra, int bitsPerSample, int opt); 88 | 89 | #ifdef __cplusplus 90 | } // extern "C" 91 | #endif 92 | 93 | #endif 94 | -------------------------------------------------------------------------------- /src/Overlap.cpp: -------------------------------------------------------------------------------- 1 | // Overlap copy (really addition) 2 | // Copyright(c)2006 A.G.Balakhnin aka Fizick 3 | 4 | // This program is free software; you can redistribute it and/or modify 5 | // it 
under the terms of the GNU General Public License as published by 6 | // the Free Software Foundation; either version 2 of the License, or 7 | // (at your option) any later version. 8 | // 9 | // This program is distributed in the hope that it will be useful, 10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | // GNU General Public License for more details. 13 | // 14 | // You should have received a copy of the GNU General Public License 15 | // along with this program; if not, write to the Free Software 16 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 17 | // http://www.gnu.org/copyleft/gpl.html . 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "CPU.h" 25 | #include "Overlap.h" 26 | 27 | #ifndef M_PI 28 | #define M_PI 3.14159265358979323846f 29 | #endif 30 | 31 | #ifndef min 32 | #define min(a, b) (((a) < (b)) ? (a) : (b)) 33 | #endif 34 | 35 | #ifndef max 36 | #define max(a, b) (((a) < (b)) ? 
(b) : (a)) 37 | #endif 38 | 39 | 40 | void overInit(OverlapWindows *over, int nx, int ny, int ox, int oy) { 41 | over->nx = nx; 42 | over->ny = ny; 43 | over->ox = ox; 44 | over->oy = oy; 45 | over->size = nx * ny; 46 | 47 | // windows 48 | over->fWin1UVx = (float *)malloc(nx * sizeof(float)); 49 | over->fWin1UVxfirst = (float *)malloc(nx * sizeof(float)); 50 | over->fWin1UVxlast = (float *)malloc(nx * sizeof(float)); 51 | for (int i = 0; i < ox; i++) { 52 | over->fWin1UVx[i] = cosf(M_PI * (i - ox + 0.5f) / (ox * 2)); 53 | over->fWin1UVx[i] = over->fWin1UVx[i] * over->fWin1UVx[i]; // left window (rised cosine) 54 | over->fWin1UVxfirst[i] = 1; // very first window 55 | over->fWin1UVxlast[i] = over->fWin1UVx[i]; // very last 56 | } 57 | for (int i = ox; i < nx - ox; i++) { 58 | over->fWin1UVx[i] = 1; 59 | over->fWin1UVxfirst[i] = 1; // very first window 60 | over->fWin1UVxlast[i] = 1; // very last 61 | } 62 | for (int i = nx - ox; i < nx; i++) { 63 | over->fWin1UVx[i] = cosf(M_PI * (i - nx + ox + 0.5f) / (ox * 2)); 64 | over->fWin1UVx[i] = over->fWin1UVx[i] * over->fWin1UVx[i]; // right window (falled cosine) 65 | over->fWin1UVxfirst[i] = over->fWin1UVx[i]; // very first window 66 | over->fWin1UVxlast[i] = 1; // very last 67 | } 68 | 69 | over->fWin1UVy = (float *)malloc(ny * sizeof(float)); 70 | over->fWin1UVyfirst = (float *)malloc(ny * sizeof(float)); 71 | over->fWin1UVylast = (float *)malloc(ny * sizeof(float)); 72 | for (int i = 0; i < oy; i++) { 73 | over->fWin1UVy[i] = cosf(M_PI * (i - oy + 0.5f) / (oy * 2)); 74 | over->fWin1UVy[i] = over->fWin1UVy[i] * over->fWin1UVy[i]; // left window (rised cosine) 75 | over->fWin1UVyfirst[i] = 1; // very first window 76 | over->fWin1UVylast[i] = over->fWin1UVy[i]; // very last 77 | } 78 | for (int i = oy; i < ny - oy; i++) { 79 | over->fWin1UVy[i] = 1; 80 | over->fWin1UVyfirst[i] = 1; // very first window 81 | over->fWin1UVylast[i] = 1; // very last 82 | } 83 | for (int i = ny - oy; i < ny; i++) { 84 | over->fWin1UVy[i] = 
cosf(M_PI * (i - ny + oy + 0.5f) / (oy * 2)); 85 | over->fWin1UVy[i] = over->fWin1UVy[i] * over->fWin1UVy[i]; // right window (falled cosine) 86 | over->fWin1UVyfirst[i] = over->fWin1UVy[i]; // very first window 87 | over->fWin1UVylast[i] = 1; // very last 88 | } 89 | 90 | 91 | over->Overlap9Windows = (int16_t *)malloc(over->size * 9 * sizeof(int16_t)); 92 | 93 | int16_t *winOverUVTL = over->Overlap9Windows; 94 | int16_t *winOverUVTM = over->Overlap9Windows + over->size; 95 | int16_t *winOverUVTR = over->Overlap9Windows + over->size * 2; 96 | int16_t *winOverUVML = over->Overlap9Windows + over->size * 3; 97 | int16_t *winOverUVMM = over->Overlap9Windows + over->size * 4; 98 | int16_t *winOverUVMR = over->Overlap9Windows + over->size * 5; 99 | int16_t *winOverUVBL = over->Overlap9Windows + over->size * 6; 100 | int16_t *winOverUVBM = over->Overlap9Windows + over->size * 7; 101 | int16_t *winOverUVBR = over->Overlap9Windows + over->size * 8; 102 | 103 | for (int j = 0; j < ny; j++) { 104 | for (int i = 0; i < nx; i++) { 105 | winOverUVTL[i] = (int)(over->fWin1UVyfirst[j] * over->fWin1UVxfirst[i] * 2048 + 0.5f); 106 | winOverUVTM[i] = (int)(over->fWin1UVyfirst[j] * over->fWin1UVx[i] * 2048 + 0.5f); 107 | winOverUVTR[i] = (int)(over->fWin1UVyfirst[j] * over->fWin1UVxlast[i] * 2048 + 0.5f); 108 | winOverUVML[i] = (int)(over->fWin1UVy[j] * over->fWin1UVxfirst[i] * 2048 + 0.5f); 109 | winOverUVMM[i] = (int)(over->fWin1UVy[j] * over->fWin1UVx[i] * 2048 + 0.5f); 110 | winOverUVMR[i] = (int)(over->fWin1UVy[j] * over->fWin1UVxlast[i] * 2048 + 0.5f); 111 | winOverUVBL[i] = (int)(over->fWin1UVylast[j] * over->fWin1UVxfirst[i] * 2048 + 0.5f); 112 | winOverUVBM[i] = (int)(over->fWin1UVylast[j] * over->fWin1UVx[i] * 2048 + 0.5f); 113 | winOverUVBR[i] = (int)(over->fWin1UVylast[j] * over->fWin1UVxlast[i] * 2048 + 0.5f); 114 | } 115 | winOverUVTL += nx; 116 | winOverUVTM += nx; 117 | winOverUVTR += nx; 118 | winOverUVML += nx; 119 | winOverUVMM += nx; 120 | winOverUVMR += nx; 121 | 
winOverUVBL += nx; 122 | winOverUVBM += nx; 123 | winOverUVBR += nx; 124 | } 125 | } 126 | 127 | 128 | void overDeinit(OverlapWindows *over) { 129 | free(over->Overlap9Windows); 130 | free(over->fWin1UVx); 131 | free(over->fWin1UVxfirst); 132 | free(over->fWin1UVxlast); 133 | free(over->fWin1UVy); 134 | free(over->fWin1UVyfirst); 135 | free(over->fWin1UVylast); 136 | } 137 | 138 | 139 | int16_t *overGetWindow(const OverlapWindows *over, int i) { 140 | return over->Overlap9Windows + over->size * i; 141 | } 142 | 143 | 144 | template 145 | void overlaps_c(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc8, intptr_t nSrcPitch, int16_t *pWin, intptr_t nWinPitch) { 146 | /* pWin from 0 to 2048 */ 147 | for (unsigned j = 0; j < blockHeight; j++) { 148 | for (unsigned i = 0; i < blockWidth; i++) { 149 | PixelType2 *pDst = (PixelType2 *)pDst8; 150 | const PixelType *pSrc = (const PixelType *)pSrc8; 151 | 152 | pDst[i] += ((pSrc[i] * pWin[i]) >> 6); 153 | } 154 | pDst8 += nDstPitch; 155 | pSrc8 += nSrcPitch; 156 | pWin += nWinPitch; 157 | } 158 | } 159 | 160 | 161 | #if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM) 162 | 163 | #if defined(MVTOOLS_ARM) 164 | #include "sse2neon.h" 165 | #else 166 | #include 167 | #endif 168 | 169 | 170 | #define zeroes _mm_setzero_si128() 171 | 172 | 173 | template 174 | struct OverlapsWrapper { 175 | static_assert(blockWidth >= 8, ""); 176 | 177 | static void overlaps_sse2(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc, intptr_t nSrcPitch, int16_t *pWin, intptr_t nWinPitch) { 178 | /* pWin from 0 to 2048 */ 179 | for (unsigned y = 0; y < blockHeight; y++) { 180 | for (unsigned x = 0; x < blockWidth; x += 8) { 181 | uint16_t *pDst = (uint16_t *)pDst8; 182 | 183 | __m128i src = _mm_loadl_epi64((const __m128i *)&pSrc[x]); 184 | __m128i win = _mm_loadu_si128((const __m128i *)&pWin[x]); 185 | __m128i dst = _mm_loadu_si128((__m128i *)&pDst[x]); 186 | 187 | src = _mm_unpacklo_epi8(src, zeroes); 188 | 189 | __m128i lo = 
_mm_mullo_epi16(src, win); 190 | __m128i hi = _mm_mulhi_epi16(src, win); 191 | lo = _mm_srli_epi16(lo, 6); 192 | hi = _mm_slli_epi16(hi, 10); 193 | dst = _mm_adds_epu16(dst, _mm_or_si128(lo, hi)); 194 | _mm_storeu_si128((__m128i *)&pDst[x], dst); 195 | } 196 | 197 | pDst8 += nDstPitch; 198 | pSrc += nSrcPitch; 199 | pWin += nWinPitch; 200 | } 201 | } 202 | 203 | }; 204 | 205 | 206 | template 207 | struct OverlapsWrapper<4, blockHeight> { 208 | 209 | static void overlaps_sse2(uint8_t *pDst, intptr_t nDstPitch, const uint8_t *pSrc, intptr_t nSrcPitch, int16_t *pWin, intptr_t nWinPitch) { 210 | /* pWin from 0 to 2048 */ 211 | for (unsigned y = 0; y < blockHeight; y++) { 212 | __m128i src = _mm_cvtsi32_si128(*(const int *)pSrc); 213 | __m128i win = _mm_loadl_epi64((const __m128i *)pWin); 214 | __m128i dst = _mm_loadl_epi64((const __m128i *)pDst); 215 | 216 | src = _mm_unpacklo_epi8(src, zeroes); 217 | 218 | __m128i lo = _mm_mullo_epi16(src, win); 219 | __m128i hi = _mm_mulhi_epi16(src, win); 220 | lo = _mm_srli_epi16(lo, 6); 221 | hi = _mm_slli_epi16(hi, 10); 222 | dst = _mm_adds_epu16(dst, _mm_or_si128(lo, hi)); 223 | _mm_storel_epi64((__m128i *)pDst, dst); 224 | 225 | pDst += nDstPitch; 226 | pSrc += nSrcPitch; 227 | pWin += nWinPitch; 228 | } 229 | } 230 | 231 | }; 232 | 233 | 234 | #undef zeroes 235 | 236 | 237 | #endif 238 | 239 | 240 | // opt can fit in four bits, if the width and height need more than eight bits each. 
241 | #define KEY(width, height, bits, opt) (unsigned)(width) << 24 | (height) << 16 | (bits) << 8 | (opt) 242 | 243 | #if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM) 244 | #define OVERS_SSE2(width, height) \ 245 | { KEY(width, height, 8, MVOPT_SSE2), OverlapsWrapper::overlaps_sse2 }, 246 | #else 247 | #define OVERS_SSE2(width, height) 248 | #endif 249 | 250 | #define OVERS(width, height) \ 251 | { KEY(width, height, 8, MVOPT_SCALAR), overlaps_c }, \ 252 | { KEY(width, height, 16, MVOPT_SCALAR), overlaps_c }, 253 | 254 | static const std::unordered_map overlaps_functions = { 255 | OVERS(2, 2) 256 | OVERS(2, 4) 257 | OVERS(4, 2) 258 | OVERS(4, 4) 259 | OVERS(4, 8) 260 | OVERS(8, 1) 261 | OVERS(8, 2) 262 | OVERS(8, 4) 263 | OVERS(8, 8) 264 | OVERS(8, 16) 265 | OVERS(16, 1) 266 | OVERS(16, 2) 267 | OVERS(16, 4) 268 | OVERS(16, 8) 269 | OVERS(16, 16) 270 | OVERS(16, 32) 271 | OVERS(32, 8) 272 | OVERS(32, 16) 273 | OVERS(32, 32) 274 | OVERS(32, 64) 275 | OVERS(64, 16) 276 | OVERS(64, 32) 277 | OVERS(64, 64) 278 | OVERS(64, 128) 279 | OVERS(128, 32) 280 | OVERS(128, 64) 281 | OVERS(128, 128) 282 | OVERS_SSE2(4, 2) 283 | OVERS_SSE2(4, 4) 284 | OVERS_SSE2(4, 8) 285 | OVERS_SSE2(8, 1) 286 | OVERS_SSE2(8, 2) 287 | OVERS_SSE2(8, 4) 288 | OVERS_SSE2(8, 8) 289 | OVERS_SSE2(8, 16) 290 | OVERS_SSE2(16, 1) 291 | OVERS_SSE2(16, 2) 292 | OVERS_SSE2(16, 4) 293 | OVERS_SSE2(16, 8) 294 | OVERS_SSE2(16, 16) 295 | OVERS_SSE2(16, 32) 296 | OVERS_SSE2(32, 8) 297 | OVERS_SSE2(32, 16) 298 | OVERS_SSE2(32, 32) 299 | OVERS_SSE2(32, 64) 300 | OVERS_SSE2(64, 16) 301 | OVERS_SSE2(64, 32) 302 | OVERS_SSE2(64, 64) 303 | OVERS_SSE2(64, 128) 304 | OVERS_SSE2(128, 32) 305 | OVERS_SSE2(128, 64) 306 | OVERS_SSE2(128, 128) 307 | }; 308 | 309 | OverlapsFunction selectOverlapsFunction(unsigned width, unsigned height, unsigned bits, int opt) { 310 | OverlapsFunction overs = overlaps_functions.at(KEY(width, height, bits, MVOPT_SCALAR)); 311 | 312 | #if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM) 313 | if 
(opt) { 314 | try { 315 | overs = overlaps_functions.at(KEY(width, height, bits, MVOPT_SSE2)); 316 | } catch (std::out_of_range &) { } 317 | #ifdef MVTOOLS_X86 318 | if (g_cpuinfo & X264_CPU_AVX2) { 319 | OverlapsFunction tmp = selectOverlapsFunctionAVX2(width, height, bits); 320 | if (tmp) 321 | overs = tmp; 322 | } 323 | #endif 324 | } 325 | #endif 326 | 327 | return overs; 328 | } 329 | 330 | #undef OVERS 331 | #undef OVERS_SSE2 332 | #undef KEY 333 | 334 | 335 | #define ToPixels(PixelType2, PixelType) \ 336 | void ToPixels_##PixelType2##_##PixelType(uint8_t *pDst8, int nDstPitch, const uint8_t *pSrc8, int nSrcPitch, int nWidth, int nHeight, int bitsPerSample) { \ 337 | int pixelMax = (1 << bitsPerSample) - 1; \ 338 | \ 339 | for (int h = 0; h < nHeight; h++) { \ 340 | for (int i = 0; i < nWidth; i++) { \ 341 | const PixelType2 *pSrc = (const PixelType2 *)pSrc8; \ 342 | PixelType *pDst = (PixelType *)pDst8; \ 343 | \ 344 | int a = (pSrc[i] + 16) >> 5; \ 345 | if (sizeof(PixelType) == 1) \ 346 | pDst[i] = a | ((255 - a) >> (sizeof(int) * 8 - 1)); \ 347 | else \ 348 | pDst[i] = min(pixelMax, a); \ 349 | } \ 350 | pDst8 += nDstPitch; \ 351 | pSrc8 += nSrcPitch; \ 352 | } \ 353 | } 354 | 355 | ToPixels(uint16_t, uint8_t) 356 | ToPixels(uint32_t, uint16_t) 357 | -------------------------------------------------------------------------------- /src/Overlap.h: -------------------------------------------------------------------------------- 1 | #ifndef OVERLAP_H 2 | #define OVERLAP_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include 9 | 10 | // top, middle, botom and left, middle, right windows 11 | #define OW_TL 0 12 | #define OW_TM 1 13 | #define OW_TR 2 14 | #define OW_ML 3 15 | #define OW_MM 4 16 | #define OW_MR 5 17 | #define OW_BL 6 18 | #define OW_BM 7 19 | #define OW_BR 8 20 | 21 | typedef struct OverlapWindows { 22 | int nx; // window sizes 23 | int ny; 24 | int ox; // overap sizes 25 | int oy; 26 | int size; // full window size= nx*ny 27 | 
28 | int16_t *Overlap9Windows; 29 | 30 | float *fWin1UVx; 31 | float *fWin1UVxfirst; 32 | float *fWin1UVxlast; 33 | float *fWin1UVy; 34 | float *fWin1UVyfirst; 35 | float *fWin1UVylast; 36 | } OverlapWindows; 37 | 38 | void overInit(OverlapWindows *over, int nx, int ny, int ox, int oy); 39 | 40 | void overDeinit(OverlapWindows *over); 41 | 42 | int16_t *overGetWindow(const OverlapWindows *over, int i); 43 | 44 | 45 | typedef void (*OverlapsFunction)(uint8_t *pDst, intptr_t nDstPitch, 46 | const uint8_t *pSrc, intptr_t nSrcPitch, 47 | int16_t *pWin, intptr_t nWinPitch); 48 | 49 | 50 | typedef void (*ToPixelsFunction)(uint8_t *pDst, int nDstPitch, 51 | const uint8_t *pSrc, int nSrcPitch, 52 | int width, int height, int bitsPerSample); 53 | 54 | void ToPixels_uint16_t_uint8_t(uint8_t *pDst8, int nDstPitch, const uint8_t *pSrc8, int nSrcPitch, int nWidth, int nHeight, int bitsPerSample); 55 | void ToPixels_uint32_t_uint16_t(uint8_t *pDst8, int nDstPitch, const uint8_t *pSrc8, int nSrcPitch, int nWidth, int nHeight, int bitsPerSample); 56 | 57 | OverlapsFunction selectOverlapsFunction(unsigned width, unsigned height, unsigned bits, int opt); 58 | 59 | #if defined(MVTOOLS_X86) 60 | OverlapsFunction selectOverlapsFunctionAVX2(unsigned width, unsigned height, unsigned bits); 61 | #endif 62 | 63 | #ifdef __cplusplus 64 | } // extern "C" 65 | #endif 66 | 67 | #endif // OVERLAP_H 68 | -------------------------------------------------------------------------------- /src/Overlap_AVX2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "Overlap.h" 5 | 6 | #if defined(MVTOOLS_X86) 7 | 8 | #include 9 | 10 | template 11 | static void overlaps_avx2(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc, intptr_t nSrcPitch, int16_t *pWin, intptr_t nWinPitch) { 12 | static_assert(blockWidth >= 16 || (blockWidth == 8 && blockHeight >= 2), ""); 13 | 14 | int pitchMul = blockWidth == 8 ? 
2 : 1; 15 | 16 | /* pWin from 0 to 2048 */ 17 | for (unsigned y = 0; y < blockHeight; y += pitchMul) { 18 | for (unsigned x = 0; x < blockWidth; x += 16 / pitchMul) { 19 | uint16_t *pDst = (uint16_t *)pDst8; 20 | 21 | __m256i src, win, dst; 22 | 23 | if (blockWidth == 8) { 24 | src = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(pSrc + x)), _mm_loadl_epi64((const __m128i *)(pSrc + nSrcPitch + x)))); 25 | win = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(pWin + x))), _mm_loadu_si128((const __m128i *)(pWin + nWinPitch + x)), 1); 26 | dst = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(pDst + x))), _mm_loadu_si128((const __m128i *)(pDst8 + nDstPitch + x * sizeof(uint16_t))), 1); 27 | } else { 28 | src = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(pSrc + x))); 29 | win = _mm256_loadu_si256((const __m256i *)(pWin + x)); 30 | dst = _mm256_loadu_si256((const __m256i *)(pDst + x)); 31 | } 32 | 33 | __m256i lo = _mm256_mullo_epi16(src, win); 34 | __m256i hi = _mm256_mulhi_epi16(src, win); 35 | lo = _mm256_srli_epi16(lo, 6); 36 | hi = _mm256_slli_epi16(hi, 10); 37 | dst = _mm256_adds_epu16(dst, _mm256_or_si256(lo, hi)); 38 | 39 | if (blockWidth == 8) { 40 | _mm_storeu_si128((__m128i *)(pDst + x), _mm256_castsi256_si128(dst)); 41 | _mm_storeu_si128((__m128i *)(pDst8 + nDstPitch + x * sizeof(uint16_t)), _mm256_extractf128_si256(dst, 1)); 42 | } else { 43 | _mm256_storeu_si256((__m256i *)(pDst + x), dst); 44 | } 45 | } 46 | 47 | pDst8 += nDstPitch * pitchMul; 48 | pSrc += nSrcPitch * pitchMul; 49 | pWin += nWinPitch * pitchMul; 50 | } 51 | } 52 | 53 | #endif 54 | 55 | 56 | enum InstructionSets { 57 | Scalar, 58 | SSE2, 59 | AVX2, 60 | }; 61 | 62 | 63 | // opt can fit in four bits, if the width and height need more than eight bits each. 
64 | #define KEY(width, height, bits, opt) (unsigned)(width) << 24 | (height) << 16 | (bits) << 8 | (opt) 65 | 66 | #if defined(MVTOOLS_X86) 67 | #define OVERS_AVX2(width, height) \ 68 | { KEY(width, height, 8, AVX2), overlaps_avx2 }, 69 | #else 70 | #define OVERS_AVX2(width, height) 71 | #endif 72 | 73 | static const std::unordered_map overlaps_functions = { 74 | OVERS_AVX2(8, 2) 75 | OVERS_AVX2(8, 4) 76 | OVERS_AVX2(8, 8) 77 | OVERS_AVX2(8, 16) 78 | OVERS_AVX2(16, 1) 79 | OVERS_AVX2(16, 2) 80 | OVERS_AVX2(16, 4) 81 | OVERS_AVX2(16, 8) 82 | OVERS_AVX2(16, 16) 83 | OVERS_AVX2(16, 32) 84 | OVERS_AVX2(32, 8) 85 | OVERS_AVX2(32, 16) 86 | OVERS_AVX2(32, 32) 87 | OVERS_AVX2(32, 64) 88 | OVERS_AVX2(64, 16) 89 | OVERS_AVX2(64, 32) 90 | OVERS_AVX2(64, 64) 91 | OVERS_AVX2(64, 128) 92 | OVERS_AVX2(128, 32) 93 | OVERS_AVX2(128, 64) 94 | OVERS_AVX2(128, 128) 95 | }; 96 | 97 | 98 | OverlapsFunction selectOverlapsFunctionAVX2(unsigned width, unsigned height, unsigned bits) { 99 | try { 100 | return overlaps_functions.at(KEY(width, height, bits, AVX2)); 101 | } catch (std::out_of_range &) { 102 | return nullptr; 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/PlaneOfBlocks.h: -------------------------------------------------------------------------------- 1 | // See legal notice in Copying.txt for more information 2 | 3 | // This program is free software; you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation; either version 2 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // This program is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 
12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with this program; if not, write to the Free Software 15 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 16 | // http://www.gnu.org/copyleft/gpl.html . 17 | 18 | #ifndef PLANEOFBLOCKS_H 19 | #define PLANEOFBLOCKS_H 20 | 21 | #include 22 | 23 | #include "Fakery.h" 24 | #include "MVFrame.h" 25 | #include "CopyCode.h" 26 | #include "SADFunctions.h" 27 | #include "CommonFunctions.h" 28 | #include "Luma.h" 29 | #include "DCTFFTW.h" 30 | 31 | #ifdef __cplusplus 32 | extern "C" { 33 | #endif 34 | 35 | #define MAX_PREDICTOR 5 // right now 5 should be enough (TSchniede) 36 | 37 | //#define ONLY_CHECK_NONDEFAULT_MV // make the check if it is no default reference (zero, global,...) 38 | 39 | 40 | typedef struct PlaneOfBlocks { 41 | 42 | /* fields set at initialization */ 43 | 44 | int nBlkX; /* width in number of blocks */ 45 | int nBlkY; /* height in number of blocks */ 46 | int nBlkSizeX; /* size of a block */ 47 | int nBlkSizeY; /* size of a block */ 48 | int nBlkCount; /* number of blocks in the plane */ 49 | int nPel; /* pel refinement accuracy */ 50 | int nLogPel; /* logarithm of the pel refinement accuracy */ 51 | int nScale; /* scaling factor of the plane */ 52 | int nLogScale; /* logarithm of the scaling factor */ 53 | int nOverlapX; // overlap size 54 | int nOverlapY; // overlap size 55 | int xRatioUV; 56 | int yRatioUV; 57 | int nLogxRatioUV; // log of xRatioUV (0 for 1 and 1 for 2) 58 | int nLogyRatioUV; // log of yRatioUV (0 for 1 and 1 for 2) 59 | int bytesPerSample; 60 | 61 | SADFunction SAD; /* function which computes the sad */ 62 | LUMAFunction LUMA; /* function which computes the mean luma */ 63 | COPYFunction BLITLUMA; 64 | COPYFunction BLITCHROMA; 65 | SADFunction SADCHROMA; 66 | SADFunction SATD; /* SATD function, (similar to SAD), used as replacement to dct */ 67 | 68 | VECTOR *vectors; /* motion vectors of the blocks */ 69 | /* before 
the search, contains the hierachal predictor */ 70 | /* after the search, contains the best motion vector */ 71 | 72 | int smallestPlane; /* say whether vectors can used predictors from a smaller plane */ 73 | int chroma; /* do we do chroma me */ 74 | 75 | /* working fields */ 76 | 77 | MVFrame *pSrcFrame; 78 | MVFrame *pRefFrame; 79 | 80 | int nSrcPitch[3]; 81 | const uint8_t *pSrc[3]; // the alignment of this array is important for speed for some reason (cacheline?) 82 | int nRefPitch[3]; 83 | 84 | VECTOR bestMV; /* best vector found so far during the search */ 85 | int64_t nMinCost; /* minimum cost ( sad + mv cost ) found so far */ 86 | VECTOR predictor; /* best predictor for the current vector */ 87 | 88 | VECTOR predictors[MAX_PREDICTOR]; /* set of predictors for the current block */ 89 | 90 | int nDxMin; /* minimum x coordinate for the vector */ 91 | int nDyMin; /* minimum y coordinate for the vector */ 92 | int nDxMax; /* maximum x corrdinate for the vector */ 93 | int nDyMax; /* maximum y coordinate for the vector */ 94 | 95 | int x[3]; /* absolute x coordinate of the origin of the block in the reference frame */ 96 | int y[3]; /* absolute y coordinate of the origin of the block in the reference frame */ 97 | int blkx; /* x coordinate in blocks */ 98 | int blky; /* y coordinate in blocks */ 99 | int blkIdx; /* index of the block */ 100 | int blkScanDir; // direction of scan (1 is left to rught, -1 is right to left) 101 | 102 | /* search parameters */ 103 | 104 | SearchType searchType; /* search type used */ 105 | int nSearchParam; /* additionnal parameter for this search */ 106 | int64_t nLambda; /* vector cost factor */ 107 | int64_t LSAD; // SAD limit for lambda using - Fizick 108 | int penaltyNew; // cost penalty factor for new candidates 109 | int penaltyZero; // cost penalty factor for zero vector 110 | int pglobal; // cost penalty factor for global predictor 111 | // int nLambdaLen; // penalty factor (lambda) for vector length 112 | int64_t badSAD; // 
SAD threshold for more wide search 113 | int badrange; // wide search radius 114 | int badcount; // number of bad blocks refined 115 | int tryMany; // try refine around many predictors 116 | 117 | VECTOR globalMVPredictor; // predictor of global motion vector 118 | VECTOR zeroMVfieldShifted; // zero motion vector for fieldbased video at finest level pel2 119 | 120 | DCTFFTW *DCT; 121 | uint8_t *dctSrc; 122 | uint8_t *dctRef; 123 | int dctpitch; 124 | int dctmode; 125 | int srcLuma; 126 | int refLuma; 127 | int sumLumaChange; 128 | int dctweight16; 129 | int *freqArray; // temporary array for global motion estimaton 130 | int freqSize; // size of freqArray 131 | int64_t verybigSAD; 132 | 133 | int nSrcPitch_temp[3]; 134 | uint8_t *pSrc_temp[3]; //for easy WRITE access to temp block 135 | } PlaneOfBlocks; 136 | 137 | 138 | void pobInit(PlaneOfBlocks *pob, int _nBlkX, int _nBlkY, int _nBlkSizeX, int _nBlkSizeY, int _nPel, int _nLevel, int nMotionFlags, int nCPUFlags, int _nOverlapX, int _nOverlapY, int _xRatioUV, int _yRatioUV, int bitsPerSample); 139 | 140 | void pobDeinit(PlaneOfBlocks *pob); 141 | 142 | void pobEstimateGlobalMVDoubled(PlaneOfBlocks *pob, VECTOR *globalMVec); 143 | 144 | MVArraySizeType pobGetArraySize(const PlaneOfBlocks *pob, int divideMode); 145 | 146 | void pobInterpolatePrediction(PlaneOfBlocks *pob, const PlaneOfBlocks *pob2); 147 | 148 | void pobRecalculateMVs(PlaneOfBlocks *pob, const FakeGroupOfPlanes *fgop, MVFrame *pSrcFrame, MVFrame *pRefFrame, SearchType st, int stp, int lambda, int pnew, uint8_t *out, int fieldShift, int64_t thSAD, DCTFFTW *DCT, int dctmode, int smooth, int meander); 149 | 150 | void pobSearchMVs(PlaneOfBlocks *pob, MVFrame *pSrcFrame, MVFrame *pRefFrame, SearchType st, int stp, int lambda, int lsad, int pnew, int plevel, uint8_t *out, VECTOR *globalMVec, int fieldShift, DCTFFTW *DCT, int dctmode, int *pmeanLumaChange, int pzero, int pglobal, int64_t badSAD, int badrange, int meander, int tryMany); 151 | 152 | 
MVArraySizeType pobWriteDefaultToArray(const PlaneOfBlocks *pob, uint8_t *array, int divideMode); 153 | 154 | #ifdef __cplusplus 155 | } 156 | #endif 157 | 158 | #endif 159 | -------------------------------------------------------------------------------- /src/SADFunctions.h: -------------------------------------------------------------------------------- 1 | // Functions that computes distances between blocks 2 | 3 | // See legal notice in Copying.txt for more information 4 | 5 | // This program is free software; you can redistribute it and/or modify 6 | // it under the terms of the GNU General Public License as published by 7 | // the Free Software Foundation; either version 2 of the License, or 8 | // (at your option) any later version. 9 | // 10 | // This program is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | // GNU General Public License for more details. 14 | // 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 18 | // http://www.gnu.org/copyleft/gpl.html . 
19 | 20 | #ifndef SADFUNCTIONS_H 21 | #define SADFUNCTIONS_H 22 | 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #endif 26 | 27 | #include 28 | 29 | 30 | typedef unsigned int (*SADFunction)(const uint8_t *pSrc, intptr_t nSrcPitch, 31 | const uint8_t *pRef, intptr_t nRefPitch); 32 | 33 | 34 | SADFunction selectSADFunction(unsigned width, unsigned height, unsigned bits, int opt, unsigned cpu); 35 | 36 | SADFunction selectSATDFunction(unsigned width, unsigned height, unsigned bits, int opt, unsigned cpu); 37 | 38 | 39 | #if defined(MVTOOLS_X86) 40 | SADFunction selectSADFunctionAVX2(unsigned width, unsigned height, unsigned bits); 41 | #endif 42 | 43 | #ifdef __cplusplus 44 | } // extern "C" 45 | #endif 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /src/SADFunctions_AVX2.cpp: -------------------------------------------------------------------------------- 1 | #if defined(MVTOOLS_X86) 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "SADFunctions.h" 8 | 9 | #define zeroes _mm256_setzero_si256() 10 | 11 | 12 | // This version used for width >= 32. 
13 | template 14 | struct SADWrapperU8_AVX2 { 15 | static_assert(width >= 32, ""); 16 | 17 | static unsigned int sad_u8_avx2(const uint8_t *pSrc, intptr_t nSrcPitch, const uint8_t *pRef, intptr_t nRefPitch) { 18 | (void)nSrcPitch; 19 | 20 | __m256i sum = zeroes; 21 | 22 | for (unsigned y = 0; y < height; y++) { 23 | for (unsigned x = 0; x < width; x += 32) { 24 | __m256i m2 = _mm256_loadu_si256((const __m256i *)&pSrc[x]); 25 | __m256i m3 = _mm256_loadu_si256((const __m256i *)&pRef[x]); 26 | 27 | __m256i diff = _mm256_sad_epu8(m2, m3); 28 | 29 | sum = _mm256_add_epi64(sum, diff); 30 | } 31 | 32 | pSrc += /*nSrcPitch*/ width; 33 | pRef += nRefPitch; 34 | } 35 | 36 | sum = _mm256_add_epi64(sum, _mm256_permute4x64_epi64(sum, _MM_SHUFFLE(0, 0, 3, 2))); 37 | sum = _mm256_add_epi64(sum, _mm256_shuffle_epi32(sum, _MM_SHUFFLE(0, 0, 3, 2))); 38 | return (unsigned)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum)); 39 | } 40 | 41 | }; 42 | 43 | template 44 | struct SADWrapperU8_AVX2<16, height> { 45 | static_assert(height >= 2, ""); 46 | 47 | static unsigned int sad_u8_avx2(const uint8_t *pSrc, intptr_t nSrcPitch, const uint8_t *pRef, intptr_t nRefPitch) { 48 | (void)nSrcPitch; 49 | 50 | __m256i sum = zeroes; 51 | 52 | for (int y = 0; (unsigned)y < height; y += 2) { 53 | __m256i m2 = _mm256_loadu_si256((const __m256i *)(pSrc + y * 16)); 54 | __m256i m3 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(pRef + y * nRefPitch))); 55 | m3 = _mm256_insertf128_si256(m3, _mm_loadu_si128((const __m128i *)(pRef + (y + 1) * nRefPitch)), 1); 56 | 57 | __m256i diff = _mm256_sad_epu8(m2, m3); 58 | sum = _mm256_add_epi64(sum, diff); 59 | } 60 | 61 | sum = _mm256_add_epi64(sum, _mm256_permute4x64_epi64(sum, _MM_SHUFFLE(0, 0, 3, 2))); 62 | sum = _mm256_add_epi64(sum, _mm256_shuffle_epi32(sum, _MM_SHUFFLE(0, 0, 3, 2))); 63 | return (unsigned)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum)); 64 | } 65 | }; 66 | 67 | 68 | // opt can fit in four bits, if the width and height need more than 
eight bits each. 69 | #define KEY(width, height, bits, opt) (unsigned)(width) << 24 | (height) << 16 | (bits) << 8 | (opt) 70 | 71 | 72 | #define SAD_U8_AVX2(width, height) \ 73 | { KEY(width, height, 8, 0), SADWrapperU8_AVX2::sad_u8_avx2 }, 74 | 75 | static const std::unordered_map sad_functions = { 76 | SAD_U8_AVX2(16, 2) 77 | SAD_U8_AVX2(16, 4) 78 | SAD_U8_AVX2(16, 4) 79 | SAD_U8_AVX2(16, 8) 80 | SAD_U8_AVX2(16, 16) 81 | SAD_U8_AVX2(16, 32) 82 | SAD_U8_AVX2(32, 8) 83 | SAD_U8_AVX2(32, 16) 84 | SAD_U8_AVX2(32, 32) 85 | SAD_U8_AVX2(32, 64) 86 | SAD_U8_AVX2(64, 16) 87 | SAD_U8_AVX2(64, 32) 88 | SAD_U8_AVX2(64, 64) 89 | SAD_U8_AVX2(64, 128) 90 | SAD_U8_AVX2(128, 32) 91 | SAD_U8_AVX2(128, 64) 92 | SAD_U8_AVX2(128, 128) 93 | }; 94 | 95 | SADFunction selectSADFunctionAVX2(unsigned width, unsigned height, unsigned bits) { 96 | try { 97 | return sad_functions.at(KEY(width, height, bits, 0)); 98 | } catch (const std::out_of_range &) { 99 | return nullptr; 100 | } 101 | } 102 | 103 | #endif 104 | -------------------------------------------------------------------------------- /src/SimpleResize.cpp: -------------------------------------------------------------------------------- 1 | // This used to contain code from the SimpleResize Avisynth plugin, written 2 | // by Tom Barry and modified by Fizick. All of that was rewritten by dubhater, 3 | // using code by anon32 for, ahem, inspiration. 4 | // Only the name and the basic algorithm remain. 
5 | 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | #include "CPU.h" 12 | #include "SimpleResize.h" 13 | 14 | 15 | #if defined(MVTOOLS_X86) 16 | void simpleResize_uint8_t_avx2(const SimpleResize *simple, 17 | uint8_t *dstp, int dst_stride, 18 | const uint8_t *srcp, int src_stride, 19 | int horizontal_vectors); 20 | void simpleResize_int16_t_avx2(const SimpleResize *simple, 21 | int16_t *dstp, int dst_stride, 22 | const int16_t *srcp, int src_stride, 23 | int horizontal_vectors); 24 | #endif 25 | 26 | 27 | static void InitTables(int *offsets, int *weights, int out, int in) { 28 | // We don't do shifts. 29 | float leftmost = 0.5f; // + shift 30 | float rightmost = in - 0.5f; // + shift 31 | 32 | int leftmost_idx = VSMAX((int)leftmost, 0); 33 | int rightmost_idx = VSMIN((int)rightmost, in - 1); 34 | 35 | for (int i = 0; i < out; i++) { 36 | float position = (i + 0.5f) * (float)in / (float)out; 37 | 38 | float weight; 39 | int offset; 40 | 41 | if (position <= leftmost) { 42 | offset = leftmost_idx; 43 | weight = 0.0f; 44 | } else if (position >= rightmost) { 45 | offset = rightmost_idx - 1; 46 | weight = 1.0f; 47 | } else { 48 | offset = (int)(position - leftmost); 49 | weight = position - leftmost - offset; 50 | } 51 | 52 | offsets[i] = offset; 53 | 54 | weights[i] = (int)(weight * simple_resize_weight_max); 55 | } 56 | } 57 | 58 | 59 | // Thread-safe. 60 | template 61 | static void simpleResize(const SimpleResize *simple, 62 | PixelType *dstp, int dst_stride, 63 | const PixelType *srcp, int src_stride, 64 | int horizontal_vectors) { 65 | 66 | // Apparently only 16 bit vectors need limiting. 67 | bool limit_vectors = sizeof(PixelType) == 2; 68 | 69 | int pel = simple->pel; 70 | int minimum = 0; 71 | int maximum = simple->limit_height * pel - 1; 72 | int horizontal_step = horizontal_vectors ? pel : 0; 73 | int vertical_step = horizontal_vectors ? 
0 : pel; 74 | 75 | PixelType *workp = (PixelType *)malloc(simple->src_width * sizeof(PixelType)); 76 | 77 | for (int y = 0; y < simple->dst_height; y++) { 78 | int weight_bottom = simple->vertical_weights[y]; 79 | int weight_top = simple_resize_weight_max - weight_bottom; 80 | 81 | const PixelType *srcp1 = srcp + simple->vertical_offsets[y] * src_stride; 82 | const PixelType *srcp2 = srcp1 + src_stride; 83 | 84 | /* vertical */ 85 | for (int x = 0; x < simple->src_width; x++) { 86 | workp[x] = (srcp1[x] * weight_top + srcp2[x] * weight_bottom + simple_resize_weight_half) >> simple_resize_weight_shift; 87 | } 88 | 89 | if (horizontal_vectors) { 90 | minimum = 0; 91 | maximum = simple->limit_width * pel - 1; 92 | } 93 | 94 | /* horizontal */ 95 | for (int x = 0; x < simple->dst_width; x++) { 96 | int weight_right = simple->horizontal_weights[x]; 97 | int weight_left = simple_resize_weight_max - weight_right; 98 | int offset = simple->horizontal_offsets[x]; 99 | 100 | int result = (workp[offset] * weight_left + workp[offset + 1] * weight_right + simple_resize_weight_half) >> simple_resize_weight_shift; 101 | 102 | if (limit_vectors) { 103 | result = std::max(minimum, std::min(result, maximum)); 104 | 105 | minimum -= horizontal_step; 106 | maximum -= horizontal_step; 107 | } 108 | 109 | dstp[x] = result; 110 | } 111 | 112 | dstp += dst_stride; 113 | 114 | if (limit_vectors) { 115 | minimum -= vertical_step; 116 | maximum -= vertical_step; 117 | } 118 | } 119 | 120 | free(workp); 121 | } 122 | 123 | 124 | void simpleInit(SimpleResize *simple, int dst_width, int dst_height, int src_width, int src_height, int limit_width, int limit_height, int pel, int opt) { 125 | simple->src_width = src_width; 126 | simple->src_height = src_height; 127 | simple->dst_width = dst_width; 128 | simple->dst_height = dst_height; 129 | 130 | simple->limit_width = limit_width; 131 | simple->limit_height = limit_height; 132 | simple->pel = pel; 133 | 134 | // Offset to first line of the pair. 
135 | simple->vertical_offsets = (int *)malloc(dst_height * sizeof(int)); 136 | // Weight of the second line of the pair. 137 | simple->vertical_weights = (int *)malloc(dst_height * sizeof(int)); 138 | 139 | simple->horizontal_offsets = (int *)malloc(dst_width * sizeof(int)); 140 | simple->horizontal_weights = (int *)malloc(dst_width * sizeof(int)); 141 | 142 | InitTables(simple->horizontal_offsets, simple->horizontal_weights, dst_width, src_width); 143 | InitTables(simple->vertical_offsets, simple->vertical_weights, dst_height, src_height); 144 | 145 | simple->simpleResize_uint8_t = simpleResize; 146 | simple->simpleResize_int16_t = simpleResize; 147 | 148 | if (opt) { 149 | #if defined(MVTOOLS_X86) 150 | if (g_cpuinfo & X264_CPU_AVX2) { 151 | simple->simpleResize_uint8_t = simpleResize_uint8_t_avx2; 152 | simple->simpleResize_int16_t = simpleResize_int16_t_avx2; 153 | 154 | for (int i = 0; i < dst_width; i++) { 155 | int w = simple->horizontal_weights[i]; 156 | simple->horizontal_weights[i] = (w << 16) | (simple_resize_weight_max - w); 157 | } 158 | } 159 | #endif 160 | } 161 | } 162 | 163 | 164 | void simpleDeinit(SimpleResize *simple) { 165 | free(simple->vertical_offsets); 166 | free(simple->vertical_weights); 167 | free(simple->horizontal_offsets); 168 | free(simple->horizontal_weights); 169 | memset(simple, 0, sizeof(SimpleResize)); 170 | } 171 | 172 | -------------------------------------------------------------------------------- /src/SimpleResize.h: -------------------------------------------------------------------------------- 1 | // See legal notice in Copying.txt for more information 2 | 3 | // This program is free software; you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation; either version 2 of the License, or 6 | // (at your option) any later version. 
7 | // 8 | // This program is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with this program; if not, write to the Free Software 15 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 16 | // http://www.gnu.org/copyleft/gpl.html . 17 | 18 | // I (Fizick) borrow code from Tom Barry's SimpleResize here 19 | 20 | #ifndef SIMPLERESIZE_H 21 | #define SIMPLERESIZE_H 22 | 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #endif 26 | 27 | 28 | #include 29 | 30 | 31 | enum { 32 | simple_resize_weight_shift = 14, 33 | simple_resize_weight_max = 1 << simple_resize_weight_shift, 34 | simple_resize_weight_half = simple_resize_weight_max / 2, 35 | }; 36 | 37 | 38 | typedef struct SimpleResize SimpleResize; 39 | 40 | 41 | typedef void (*ResizeFunction8)(const SimpleResize *simple, 42 | uint8_t *dstp, int dst_stride, 43 | const uint8_t *srcp, int src_stride, 44 | int horizontal_vectors); 45 | typedef void (*ResizeFunction16)(const SimpleResize *simple, 46 | int16_t *dstp, int dst_stride, 47 | const int16_t *srcp, int src_stride, 48 | int horizontal_vectors); 49 | 50 | 51 | typedef struct SimpleResize { 52 | int dst_width; 53 | int dst_height; 54 | int src_width; 55 | int src_height; 56 | 57 | // Used only to limit the vectors in the 16 bit resizer. 58 | // dst_width and dst_height are usually the padded dimensions. 59 | // The two below are the unpadded dimensions, i.e. the actual frame size. 
60 | int limit_width; 61 | int limit_height; 62 | int pel; 63 | 64 | int *vertical_offsets; 65 | int *vertical_weights; 66 | 67 | int *horizontal_offsets; 68 | int *horizontal_weights; 69 | 70 | ResizeFunction8 simpleResize_uint8_t; 71 | ResizeFunction16 simpleResize_int16_t; 72 | } SimpleResize; 73 | 74 | 75 | void simpleInit(SimpleResize *simple, int dst_width, int dst_height, int src_width, int src_height, int limit_width, int limit_height, int pel, int opt); 76 | void simpleDeinit(SimpleResize *simple); 77 | 78 | 79 | #ifdef __cplusplus 80 | } // extern "C" 81 | #endif 82 | 83 | #endif // SIMPLERESIZE_H 84 | -------------------------------------------------------------------------------- /src/SimpleResize_AVX2.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | 4 | #include "SimpleResize.h" 5 | 6 | 7 | #ifdef _WIN32 8 | #define FORCE_INLINE __forceinline 9 | #else 10 | #define FORCE_INLINE inline __attribute__((always_inline)) 11 | #endif 12 | 13 | #define zeroes _mm_setzero_si128() 14 | 15 | 16 | static FORCE_INLINE void simpleResize_uint8_t_vertical_4px_avx2(uint8_t *workp, const uint8_t *srcp1, const uint8_t *srcp2, int x, const __m128i &dwords_weights) { 17 | __m128i top = _mm_cvtsi32_si128(*(const int *)&srcp1[x]); 18 | __m128i bottom = _mm_cvtsi32_si128(*(const int *)&srcp2[x]); 19 | __m128i pixels = _mm_unpacklo_epi8(_mm_unpacklo_epi8(bottom, top), zeroes); 20 | 21 | __m128i dst = _mm_madd_epi16(pixels, dwords_weights); 22 | 23 | dst = _mm_add_epi32(dst, _mm_set1_epi32(simple_resize_weight_half)); 24 | dst = _mm_srli_epi32(dst, simple_resize_weight_shift); 25 | dst = _mm_packs_epi32(dst, dst); 26 | dst = _mm_packus_epi16(dst, dst); 27 | *(int *)&workp[x] = _mm_cvtsi128_si32(dst); 28 | } 29 | 30 | 31 | static FORCE_INLINE void simpleResize_uint8_t_horizontal_8px_avx2(const SimpleResize *simple, uint8_t *dstp, uint8_t *workp, int x, const __m256i &shuffle_mask) { 32 | __m256i dwords_weights_h = 
_mm256_loadu_si256((const __m256i *)&simple->horizontal_weights[x]); 33 | __m256i dwords_offsets = _mm256_loadu_si256((const __m256i *)&simple->horizontal_offsets[x]); 34 | __m256i pixels = _mm256_i32gather_epi32((const int *)workp, dwords_offsets, sizeof(uint8_t)); 35 | 36 | pixels = _mm256_shuffle_epi8(pixels, shuffle_mask); 37 | 38 | pixels = _mm256_madd_epi16(pixels, dwords_weights_h); 39 | pixels = _mm256_add_epi32(pixels, _mm256_set1_epi32(simple_resize_weight_half)); 40 | pixels = _mm256_srai_epi32(pixels, simple_resize_weight_shift); 41 | pixels = _mm256_packs_epi32(pixels, pixels); 42 | pixels = _mm256_permute4x64_epi64(pixels, 0xe8); // 0b11101000 43 | pixels = _mm256_packus_epi16(pixels, pixels); 44 | 45 | _mm_storel_epi64((__m128i *)&dstp[x], _mm256_castsi256_si128(pixels)); 46 | } 47 | 48 | 49 | // Thread-safe. 50 | void simpleResize_uint8_t_avx2(const SimpleResize *simple, 51 | uint8_t *dstp, int dst_stride, 52 | const uint8_t *srcp, int src_stride, 53 | int horizontal_vectors) { 54 | (void)horizontal_vectors; 55 | 56 | // Two additional bytes because of vpgatherdd. 
57 | uint8_t *workp = (uint8_t *)malloc(simple->src_width * sizeof(uint8_t) + 2); 58 | 59 | #define SHUFFLE_PATTERN -0x80, 13, -0x80, 12, -0x80, 9, -0x80, 8, -0x80, 5, -0x80, 4, -0x80, 1, -0x80, 0 60 | __m256i shuffle_mask = _mm256_set_epi8(SHUFFLE_PATTERN, SHUFFLE_PATTERN); 61 | #undef SHUFFLE_PATTERN 62 | 63 | for (int y = 0; y < simple->dst_height; y++) { 64 | int weight_bottom = simple->vertical_weights[y]; 65 | int weight_top = simple_resize_weight_max - weight_bottom; 66 | 67 | const uint8_t *srcp1 = srcp + simple->vertical_offsets[y] * src_stride; 68 | const uint8_t *srcp2 = srcp1 + src_stride; 69 | 70 | __m128i dwords_weights_v = _mm_set1_epi32((weight_top << 16) | weight_bottom); 71 | 72 | int pixels_per_iteration = 4; 73 | const int src_width_avx2 = simple->src_width & ~(pixels_per_iteration - 1); 74 | 75 | /* vertical */ 76 | for (int x = 0; x < src_width_avx2; x += pixels_per_iteration) 77 | simpleResize_uint8_t_vertical_4px_avx2(workp, srcp1, srcp2, x, dwords_weights_v); 78 | 79 | if (src_width_avx2 < simple->src_width) 80 | simpleResize_uint8_t_vertical_4px_avx2(workp, srcp1, srcp2, simple->src_width - pixels_per_iteration, dwords_weights_v); 81 | 82 | 83 | pixels_per_iteration = 8; 84 | const int dst_width_avx2 = simple->dst_width & ~(pixels_per_iteration - 1); 85 | 86 | /* horizontal */ 87 | for (int x = 0; x < dst_width_avx2; x += pixels_per_iteration) 88 | simpleResize_uint8_t_horizontal_8px_avx2(simple, dstp, workp, x, shuffle_mask); 89 | 90 | if (dst_width_avx2 < simple->dst_width) 91 | simpleResize_uint8_t_horizontal_8px_avx2(simple, dstp, workp, simple->dst_width - pixels_per_iteration, shuffle_mask); 92 | 93 | dstp += dst_stride; 94 | } 95 | 96 | free(workp); 97 | } 98 | 99 | 100 | static FORCE_INLINE void simpleResize_int16_t_vertical_8px_avx2(int16_t *workp, const int16_t *srcp1, const int16_t *srcp2, int x, const __m128i &dwords_weights) { 101 | __m128i top = _mm_loadu_si128((const __m128i *)&srcp1[x]); 102 | __m128i bottom = 
_mm_loadu_si128((const __m128i *)&srcp2[x]); 103 | __m128i pixels_lo = _mm_unpacklo_epi16(bottom, top); 104 | __m128i pixels_hi = _mm_unpackhi_epi16(bottom, top); 105 | 106 | __m128i dst_lo = _mm_madd_epi16(pixels_lo, dwords_weights); 107 | __m128i dst_hi = _mm_madd_epi16(pixels_hi, dwords_weights); 108 | dst_lo = _mm_add_epi32(dst_lo, _mm_set1_epi32(simple_resize_weight_half)); 109 | dst_hi = _mm_add_epi32(dst_hi, _mm_set1_epi32(simple_resize_weight_half)); 110 | dst_lo = _mm_srai_epi32(dst_lo, simple_resize_weight_shift); 111 | dst_hi = _mm_srai_epi32(dst_hi, simple_resize_weight_shift); 112 | __m128i dst = _mm_packs_epi32(dst_lo, dst_hi); 113 | _mm_storeu_si128((__m128i *)&workp[x], dst); 114 | } 115 | 116 | 117 | static FORCE_INLINE void simpleResize_int16_t_horizontal_8px_avx2(const SimpleResize *simple, int16_t *dstp, int16_t *workp, int x, __m256i &minimum, __m256i &maximum, const __m256i &horizontal_step) { 118 | __m256i dwords_weights_h = _mm256_loadu_si256((const __m256i *)&simple->horizontal_weights[x]); 119 | __m256i dwords_offsets = _mm256_loadu_si256((const __m256i *)&simple->horizontal_offsets[x]); 120 | __m256i pixels = _mm256_i32gather_epi32((const int *)workp, dwords_offsets, sizeof(int16_t)); 121 | pixels = _mm256_madd_epi16(pixels, dwords_weights_h); 122 | pixels = _mm256_add_epi32(pixels, _mm256_set1_epi32(simple_resize_weight_half)); 123 | pixels = _mm256_srai_epi32(pixels, simple_resize_weight_shift); 124 | 125 | pixels = _mm256_max_epi32(minimum, 126 | _mm256_min_epi32(pixels, maximum)); 127 | 128 | pixels = _mm256_packs_epi32(pixels, pixels); 129 | 130 | minimum = _mm256_sub_epi32(minimum, horizontal_step); 131 | maximum = _mm256_sub_epi32(maximum, horizontal_step); 132 | 133 | _mm_storeu_si128((__m128i *)&dstp[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(pixels, 0xe8))); // 0b11101000 134 | } 135 | 136 | 137 | // Thread-safe. 
138 | void simpleResize_int16_t_avx2(const SimpleResize *simple, 139 | int16_t *dstp, int dst_stride, 140 | const int16_t *srcp, int src_stride, 141 | int horizontal_vectors) { 142 | int16_t *workp = (int16_t *)malloc(simple->src_width * sizeof(int16_t)); 143 | 144 | const int pixels_per_iteration = 8; 145 | 146 | int pel = simple->pel; 147 | __m256i minimum = _mm256_setzero_si256(); 148 | __m256i maximum = _mm256_set1_epi32(simple->limit_height * pel - 1); 149 | __m256i horizontal_step = _mm256_set1_epi32(horizontal_vectors ? pel * pixels_per_iteration : 0); 150 | __m256i vertical_step = _mm256_set1_epi32(horizontal_vectors ? 0 : pel); 151 | 152 | __m256i initial_horizontal_minimum = _mm256_set_epi32(-7 * pel, 153 | -6 * pel, 154 | -5 * pel, 155 | -4 * pel, 156 | -3 * pel, 157 | -2 * pel, 158 | -1 * pel, 159 | 0 * pel); 160 | __m256i initial_horizontal_maximum = _mm256_set_epi32((simple->limit_width - 7) * pel - 1, 161 | (simple->limit_width - 6) * pel - 1, 162 | (simple->limit_width - 5) * pel - 1, 163 | (simple->limit_width - 4) * pel - 1, 164 | (simple->limit_width - 3) * pel - 1, 165 | (simple->limit_width - 2) * pel - 1, 166 | (simple->limit_width - 1) * pel - 1, 167 | (simple->limit_width - 0) * pel - 1); 168 | 169 | for (int y = 0; y < simple->dst_height; y++) { 170 | int weight_bottom = simple->vertical_weights[y]; 171 | int weight_top = simple_resize_weight_max - weight_bottom; 172 | 173 | const int16_t *srcp1 = srcp + simple->vertical_offsets[y] * src_stride; 174 | const int16_t *srcp2 = srcp1 + src_stride; 175 | 176 | __m128i dwords_weights_v = _mm_set1_epi32((weight_top << 16) | weight_bottom); 177 | 178 | const int src_width_sse2 = simple->src_width & ~(pixels_per_iteration - 1); 179 | 180 | /* vertical */ 181 | for (int x = 0; x < src_width_sse2; x += pixels_per_iteration) 182 | simpleResize_int16_t_vertical_8px_avx2(workp, srcp1, srcp2, x, dwords_weights_v); 183 | 184 | if (src_width_sse2 < simple->src_width) 185 | 
simpleResize_int16_t_vertical_8px_avx2(workp, srcp1, srcp2, simple->src_width - pixels_per_iteration, dwords_weights_v); 186 | 187 | 188 | if (horizontal_vectors) { 189 | minimum = initial_horizontal_minimum; 190 | maximum = initial_horizontal_maximum; 191 | } 192 | 193 | 194 | const int dst_width_avx2 = simple->dst_width & ~(pixels_per_iteration - 1); 195 | 196 | /* horizontal */ 197 | for (int x = 0; x < dst_width_avx2; x += pixels_per_iteration) 198 | simpleResize_int16_t_horizontal_8px_avx2(simple, dstp, workp, x, minimum, maximum, horizontal_step); 199 | 200 | if (dst_width_avx2 < simple->dst_width) { 201 | if (horizontal_vectors) { 202 | __m256i step_back = _mm256_set1_epi32((pixels_per_iteration - (simple->dst_width - dst_width_avx2)) * pel); 203 | minimum = _mm256_add_epi32(minimum, step_back); 204 | maximum = _mm256_add_epi32(maximum, step_back); 205 | } 206 | 207 | simpleResize_int16_t_horizontal_8px_avx2(simple, dstp, workp, simple->dst_width - pixels_per_iteration, minimum, maximum, horizontal_step); 208 | } 209 | 210 | dstp += dst_stride; 211 | 212 | minimum = _mm256_sub_epi32(minimum, vertical_step); 213 | maximum = _mm256_sub_epi32(maximum, vertical_step); 214 | } 215 | 216 | free(workp); 217 | } 218 | -------------------------------------------------------------------------------- /src/asm/aarch64-asm.S: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | * asm.S: AArch64 utility macros 3 | ***************************************************************************** 4 | * Copyright (C) 2008-2024 x264 project 5 | * 6 | * Authors: Mans Rullgard 7 | * David Conrad 8 | * Janne Grunau 9 | * 10 | * This program is free software; you can redistribute it and/or modify 11 | * it under the terms of the GNU General Public License as published by 12 | * the Free Software Foundation; either version 2 of the License, or 13 | * (at your option) any 
later version. 14 | * 15 | * This program is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with this program; if not, write to the Free Software 22 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 23 | * 24 | * This program is also available under a commercial proprietary license. 25 | * For more information, contact us at licensing@x264.com. 26 | *****************************************************************************/ 27 | 28 | /* 29 | #include "config.h" 30 | */ 31 | #define GLUE(a, b) a ## b 32 | #define JOIN(a, b) GLUE(a, b) 33 | 34 | #ifdef PREFIX 35 | # define BASE _mvtools_ 36 | # define SYM_PREFIX _ 37 | #else 38 | # define BASE mvtools_ 39 | # define SYM_PREFIX 40 | #endif 41 | 42 | #ifdef BIT_DEPTH 43 | # define EXTERN_ASM JOIN(JOIN(BASE, BIT_DEPTH), _) 44 | #else 45 | # define EXTERN_ASM BASE 46 | #endif 47 | 48 | #define X(s) JOIN(EXTERN_ASM, s) 49 | #define X264(s) JOIN(BASE, s) 50 | #define EXT(s) JOIN(SYM_PREFIX, s) 51 | 52 | #ifdef __ELF__ 53 | # define ELF 54 | #else 55 | # define ELF # 56 | #endif 57 | 58 | #ifdef __MACH__ 59 | # define MACH 60 | #else 61 | # define MACH # 62 | #endif 63 | 64 | #if HAVE_AS_FUNC 65 | # define FUNC 66 | #else 67 | # define FUNC # 68 | #endif 69 | 70 | .macro function name, export=0, align=2 71 | .macro endfunc 72 | .if \export 73 | ELF .size EXTERN_ASM\name, . - EXTERN_ASM\name 74 | .else 75 | ELF .size \name, . 
- \name 76 | .endif 77 | FUNC .endfunc 78 | .purgem endfunc 79 | .endm 80 | .text 81 | .align \align 82 | .if \export 83 | .global EXTERN_ASM\name 84 | ELF .type EXTERN_ASM\name, %function 85 | FUNC .func EXTERN_ASM\name 86 | EXTERN_ASM\name: 87 | .else 88 | ELF .type \name, %function 89 | FUNC .func \name 90 | \name: 91 | .endif 92 | .endm 93 | 94 | .macro const name, align=2 95 | .macro endconst 96 | ELF .size \name, . - \name 97 | .purgem endconst 98 | .endm 99 | ELF .section .rodata 100 | MACH .const_data 101 | .align \align 102 | \name: 103 | .endm 104 | 105 | .macro movrel rd, val, offset=0 106 | #if defined(__APPLE__) 107 | .if \offset < 0 108 | adrp \rd, \val@PAGE 109 | add \rd, \rd, \val@PAGEOFF 110 | sub \rd, \rd, -(\offset) 111 | .else 112 | adrp \rd, \val+(\offset)@PAGE 113 | add \rd, \rd, \val+(\offset)@PAGEOFF 114 | .endif 115 | #elif defined(PIC) && defined(_WIN32) 116 | .if \offset < 0 117 | adrp \rd, \val 118 | add \rd, \rd, :lo12:\val 119 | sub \rd, \rd, -(\offset) 120 | .else 121 | adrp \rd, \val+(\offset) 122 | add \rd, \rd, :lo12:\val+(\offset) 123 | .endif 124 | #elif defined(PIC) 125 | adrp \rd, \val+(\offset) 126 | add \rd, \rd, :lo12:\val+(\offset) 127 | #else 128 | ldr \rd, =\val+\offset 129 | #endif 130 | .endm 131 | 132 | #define FDEC_STRIDE 32 133 | #define FENC_STRIDE 16 134 | 135 | 136 | .macro SUMSUB_AB sum, sub, a, b 137 | add \sum, \a, \b 138 | sub \sub, \a, \b 139 | .endm 140 | 141 | .macro unzip t1, t2, s1, s2 142 | uzp1 \t1, \s1, \s2 143 | uzp2 \t2, \s1, \s2 144 | .endm 145 | 146 | .macro transpose t1, t2, s1, s2 147 | trn1 \t1, \s1, \s2 148 | trn2 \t2, \s1, \s2 149 | .endm 150 | 151 | .macro transpose4x4.h v0, v1, v2, v3, t0, t1, t2, t3 152 | transpose \t0\().2s, \t2\().2s, \v0\().2s, \v2\().2s 153 | transpose \t1\().2s, \t3\().2s, \v1\().2s, \v3\().2s 154 | transpose \v0\().4h, \v1\().4h, \t0\().4h, \t1\().4h 155 | transpose \v2\().4h, \v3\().4h, \t2\().4h, \t3\().4h 156 | .endm 157 | 158 | .macro transpose4x8.h v0, v1, v2, 
v3, t0, t1, t2, t3 159 | transpose \t0\().4s, \t2\().4s, \v0\().4s, \v2\().4s 160 | transpose \t1\().4s, \t3\().4s, \v1\().4s, \v3\().4s 161 | transpose \v0\().8h, \v1\().8h, \t0\().8h, \t1\().8h 162 | transpose \v2\().8h, \v3\().8h, \t2\().8h, \t3\().8h 163 | .endm 164 | 165 | 166 | .macro transpose8x8.h r0, r1, r2, r3, r4, r5, r6, r7, r8, r9 167 | trn1 \r8\().8h, \r0\().8h, \r1\().8h 168 | trn2 \r9\().8h, \r0\().8h, \r1\().8h 169 | trn1 \r1\().8h, \r2\().8h, \r3\().8h 170 | trn2 \r3\().8h, \r2\().8h, \r3\().8h 171 | trn1 \r0\().8h, \r4\().8h, \r5\().8h 172 | trn2 \r5\().8h, \r4\().8h, \r5\().8h 173 | trn1 \r2\().8h, \r6\().8h, \r7\().8h 174 | trn2 \r7\().8h, \r6\().8h, \r7\().8h 175 | 176 | trn1 \r4\().4s, \r0\().4s, \r2\().4s 177 | trn2 \r2\().4s, \r0\().4s, \r2\().4s 178 | trn1 \r6\().4s, \r5\().4s, \r7\().4s 179 | trn2 \r7\().4s, \r5\().4s, \r7\().4s 180 | trn1 \r5\().4s, \r9\().4s, \r3\().4s 181 | trn2 \r9\().4s, \r9\().4s, \r3\().4s 182 | trn1 \r3\().4s, \r8\().4s, \r1\().4s 183 | trn2 \r8\().4s, \r8\().4s, \r1\().4s 184 | 185 | trn1 \r0\().2d, \r3\().2d, \r4\().2d 186 | trn2 \r4\().2d, \r3\().2d, \r4\().2d 187 | 188 | trn1 \r1\().2d, \r5\().2d, \r6\().2d 189 | trn2 \r5\().2d, \r5\().2d, \r6\().2d 190 | 191 | trn2 \r6\().2d, \r8\().2d, \r2\().2d 192 | trn1 \r2\().2d, \r8\().2d, \r2\().2d 193 | 194 | trn1 \r3\().2d, \r9\().2d, \r7\().2d 195 | trn2 \r7\().2d, \r9\().2d, \r7\().2d 196 | .endm 197 | 198 | .macro transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 199 | trn1 \t0\().16b, \r0\().16b, \r1\().16b 200 | trn2 \t1\().16b, \r0\().16b, \r1\().16b 201 | trn1 \r1\().16b, \r2\().16b, \r3\().16b 202 | trn2 \r3\().16b, \r2\().16b, \r3\().16b 203 | trn1 \r0\().16b, \r4\().16b, \r5\().16b 204 | trn2 \r5\().16b, \r4\().16b, \r5\().16b 205 | trn1 \r2\().16b, \r6\().16b, \r7\().16b 206 | trn2 \r7\().16b, \r6\().16b, \r7\().16b 207 | 208 | trn1 \r4\().8h, \r0\().8h, \r2\().8h 209 | trn2 \r2\().8h, \r0\().8h, \r2\().8h 210 | trn1 \r6\().8h, \r5\().8h, \r7\().8h 
211 | trn2 \r7\().8h, \r5\().8h, \r7\().8h 212 | trn1 \r5\().8h, \t1\().8h, \r3\().8h 213 | trn2 \t1\().8h, \t1\().8h, \r3\().8h 214 | trn1 \r3\().8h, \t0\().8h, \r1\().8h 215 | trn2 \t0\().8h, \t0\().8h, \r1\().8h 216 | 217 | trn1 \r0\().4s, \r3\().4s, \r4\().4s 218 | trn2 \r4\().4s, \r3\().4s, \r4\().4s 219 | 220 | trn1 \r1\().4s, \r5\().4s, \r6\().4s 221 | trn2 \r5\().4s, \r5\().4s, \r6\().4s 222 | 223 | trn2 \r6\().4s, \t0\().4s, \r2\().4s 224 | trn1 \r2\().4s, \t0\().4s, \r2\().4s 225 | 226 | trn1 \r3\().4s, \t1\().4s, \r7\().4s 227 | trn2 \r7\().4s, \t1\().4s, \r7\().4s 228 | .endm 229 | 230 | .macro transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7 231 | trn1 \t4\().16b, \r0\().16b, \r1\().16b 232 | trn2 \t5\().16b, \r0\().16b, \r1\().16b 233 | trn1 \t6\().16b, \r2\().16b, \r3\().16b 234 | trn2 \t7\().16b, \r2\().16b, \r3\().16b 235 | 236 | trn1 \r0\().8h, \t4\().8h, \t6\().8h 237 | trn2 \r2\().8h, \t4\().8h, \t6\().8h 238 | trn1 \r1\().8h, \t5\().8h, \t7\().8h 239 | trn2 \r3\().8h, \t5\().8h, \t7\().8h 240 | .endm 241 | 242 | .macro transpose_4x8.b r0, r1, r2, r3, t4, t5, t6, t7 243 | trn1 \t4\().8b, \r0\().8b, \r1\().8b 244 | trn2 \t5\().8b, \r0\().8b, \r1\().8b 245 | trn1 \t6\().8b, \r2\().8b, \r3\().8b 246 | trn2 \t7\().8b, \r2\().8b, \r3\().8b 247 | 248 | trn1 \r0\().4h, \t4\().4h, \t6\().4h 249 | trn2 \r2\().4h, \t4\().4h, \t6\().4h 250 | trn1 \r1\().4h, \t5\().4h, \t7\().4h 251 | trn2 \r3\().4h, \t5\().4h, \t7\().4h 252 | .endm 253 | -------------------------------------------------------------------------------- /src/asm/aarch64-pixel-a-common.S: -------------------------------------------------------------------------------- 1 | /**************************************************************************** 2 | * pixel-a-common.S: aarch64 pixel metrics 3 | ***************************************************************************** 4 | * Copyright (C) 2009-2024 x264 project 5 | * 6 | * Authors: David Conrad 7 | * Janne Grunau 8 | * David Chen 9 | * 10 
| * This program is free software; you can redistribute it and/or modify 11 | * it under the terms of the GNU General Public License as published by 12 | * the Free Software Foundation; either version 2 of the License, or 13 | * (at your option) any later version. 14 | * 15 | * This program is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with this program; if not, write to the Free Software 22 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 23 | * 24 | * This program is also available under a commercial proprietary license. 25 | * For more information, contact us at licensing@x264.com. 26 | *****************************************************************************/ 27 | 28 | // This file contains the NEON macros and constants that are intended to be used by 29 | // the SVE/SVE2 functions as well 30 | 31 | const mask_ac_4_8 32 | .short 0, -1, -1, -1, 0, -1, -1, -1 33 | .short 0, -1, -1, -1, -1, -1, -1, -1 34 | endconst 35 | 36 | .macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d 37 | SUMSUB_AB \s1, \d1, \a, \b 38 | SUMSUB_AB \s2, \d2, \c, \d 39 | .endm 40 | 41 | .macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4 42 | SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4 43 | SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4 44 | .endm 45 | -------------------------------------------------------------------------------- /src/asm/const-a.asm: -------------------------------------------------------------------------------- 1 | ;***************************************************************************** 2 | ;* const-a.asm: x86 global constants 3 | ;***************************************************************************** 4 | ;* Copyright (C) 2010-2022 x264 
project 5 | ;* 6 | ;* Authors: Loren Merritt 7 | ;* Fiona Glaser 8 | ;* 9 | ;* This program is free software; you can redistribute it and/or modify 10 | ;* it under the terms of the GNU General Public License as published by 11 | ;* the Free Software Foundation; either version 2 of the License, or 12 | ;* (at your option) any later version. 13 | ;* 14 | ;* This program is distributed in the hope that it will be useful, 15 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | ;* GNU General Public License for more details. 18 | ;* 19 | ;* You should have received a copy of the GNU General Public License 20 | ;* along with this program; if not, write to the Free Software 21 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 22 | ;* 23 | ;* This program is also available under a commercial proprietary license. 24 | ;* For more information, contact us at licensing@x264.com. 25 | ;***************************************************************************** 26 | 27 | %include "x86inc.asm" 28 | 29 | SECTION_RODATA 32 30 | 31 | const pb_1, times 32 db 1 32 | const hsub_mul, times 16 db 1, -1 33 | const pw_1, times 16 dw 1 34 | const pw_16, times 16 dw 16 35 | const pw_32, times 16 dw 32 36 | const pw_512, times 16 dw 512 37 | const pw_00ff, times 16 dw 0x00ff 38 | const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1) 39 | const pw_0to15, dw 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 40 | const pd_1, times 8 dd 1 41 | const pd_0123, dd 0,1,2,3 42 | const pd_4567, dd 4,5,6,7 43 | const deinterleave_shufd, dd 0,4,1,5,2,6,3,7 44 | const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 45 | const pb_unpackbd2, times 2 db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 46 | 47 | const pb_01, times 8 db 0,1 48 | const pb_0, times 16 db 0 49 | const pb_a1, times 16 db 0xa1 50 | const pb_3, times 16 db 3 51 | const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6 52 | 53 | const 
pw_2, times 8 dw 2 54 | const pw_m2, times 8 dw -2 55 | const pw_4, times 8 dw 4 56 | const pw_8, times 8 dw 8 57 | const pw_64, times 8 dw 64 58 | const pw_256, times 8 dw 256 59 | const pw_32_0, times 4 dw 32 60 | times 4 dw 0 61 | const pw_8000, times 8 dw 0x8000 62 | const pw_3fff, times 8 dw 0x3fff 63 | const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1 64 | const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1 65 | const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1 66 | const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0 67 | 68 | const pd_8, times 4 dd 8 69 | const pd_32, times 4 dd 32 70 | const pd_1024, times 4 dd 1024 71 | const pd_ffff, times 4 dd 0xffff 72 | const pw_ff00, times 8 dw 0xff00 73 | 74 | const popcnt_table 75 | %assign x 0 76 | %rep 256 77 | ; population count 78 | db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1) 79 | %assign x x+1 80 | %endrep 81 | 82 | const sw_64, dd 64 83 | -------------------------------------------------------------------------------- /src/asm/cpu-a.asm: -------------------------------------------------------------------------------- 1 | ;***************************************************************************** 2 | ;* cpu-a.asm: x86 cpu utilities 3 | ;***************************************************************************** 4 | ;* Copyright (C) 2003-2022 x264 project 5 | ;* 6 | ;* Authors: Laurent Aimar 7 | ;* Loren Merritt 8 | ;* Fiona Glaser 9 | ;* 10 | ;* This program is free software; you can redistribute it and/or modify 11 | ;* it under the terms of the GNU General Public License as published by 12 | ;* the Free Software Foundation; either version 2 of the License, or 13 | ;* (at your option) any later version. 14 | ;* 15 | ;* This program is distributed in the hope that it will be useful, 16 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | ;* GNU General Public License for more details. 
19 | ;* 20 | ;* You should have received a copy of the GNU General Public License 21 | ;* along with this program; if not, write to the Free Software 22 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 23 | ;* 24 | ;* This program is also available under a commercial proprietary license. 25 | ;* For more information, contact us at licensing@x264.com. 26 | ;***************************************************************************** 27 | 28 | %include "x86inc.asm" 29 | 30 | SECTION .text 31 | 32 | ;----------------------------------------------------------------------------- 33 | ; void cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx ) 34 | ;----------------------------------------------------------------------------- 35 | cglobal cpu_cpuid, 5,7 36 | push rbx 37 | push r4 38 | push r3 39 | push r2 40 | push r1 41 | mov eax, r0d 42 | xor ecx, ecx 43 | cpuid 44 | pop r4 45 | mov [r4], eax 46 | pop r4 47 | mov [r4], ebx 48 | pop r4 49 | mov [r4], ecx 50 | pop r4 51 | mov [r4], edx 52 | pop rbx 53 | RET 54 | 55 | ;----------------------------------------------------------------------------- 56 | ; uint64_t cpu_xgetbv( int xcr ) 57 | ;----------------------------------------------------------------------------- 58 | cglobal cpu_xgetbv 59 | movifnidn ecx, r0m 60 | xgetbv 61 | %if ARCH_X86_64 62 | shl rdx, 32 63 | or rax, rdx 64 | %endif 65 | ret 66 | 67 | ;----------------------------------------------------------------------------- 68 | ; void cpu_emms( void ) 69 | ;----------------------------------------------------------------------------- 70 | cglobal cpu_emms 71 | emms 72 | ret 73 | 74 | ;----------------------------------------------------------------------------- 75 | ; void cpu_sfence( void ) 76 | ;----------------------------------------------------------------------------- 77 | cglobal cpu_sfence 78 | sfence 79 | ret 80 | 81 | %if ARCH_X86_64 == 0 82 | 
;----------------------------------------------------------------------------- 83 | ; int cpu_cpuid_test( void ) 84 | ; return 0 if unsupported 85 | ;----------------------------------------------------------------------------- 86 | cglobal cpu_cpuid_test 87 | pushfd 88 | push ebx 89 | push ebp 90 | push esi 91 | push edi 92 | pushfd 93 | pop eax 94 | mov ebx, eax 95 | xor eax, 0x200000 96 | push eax 97 | popfd 98 | pushfd 99 | pop eax 100 | xor eax, ebx 101 | pop edi 102 | pop esi 103 | pop ebp 104 | pop ebx 105 | popfd 106 | ret 107 | %endif 108 | -------------------------------------------------------------------------------- /src/asm/pixel-32.asm: -------------------------------------------------------------------------------- 1 | ;***************************************************************************** 2 | ;* pixel-32.asm: x86_32 pixel metrics 3 | ;***************************************************************************** 4 | ;* Copyright (C) 2003-2022 x264 project 5 | ;* 6 | ;* Authors: Loren Merritt 7 | ;* Laurent Aimar 8 | ;* 9 | ;* This program is free software; you can redistribute it and/or modify 10 | ;* it under the terms of the GNU General Public License as published by 11 | ;* the Free Software Foundation; either version 2 of the License, or 12 | ;* (at your option) any later version. 13 | ;* 14 | ;* This program is distributed in the hope that it will be useful, 15 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | ;* GNU General Public License for more details. 18 | ;* 19 | ;* You should have received a copy of the GNU General Public License 20 | ;* along with this program; if not, write to the Free Software 21 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 22 | ;* 23 | ;* This program is also available under a commercial proprietary license. 24 | ;* For more information, contact us at licensing@x264.com. 
25 | ;***************************************************************************** 26 | 27 | %include "x86inc.asm" 28 | %include "x86util.asm" 29 | 30 | cextern pw_ppmmppmm 31 | cextern pw_pmpmpmpm 32 | 33 | SECTION .text 34 | INIT_MMX mmx2 35 | 36 | %if HIGH_BIT_DEPTH == 0 37 | 38 | %macro LOAD_DIFF_4x8P 1 ; dx 39 | LOAD_DIFF m0, m7, none, [r0+%1], [r2+%1] 40 | LOAD_DIFF m1, m6, none, [r0+%1+r1], [r2+%1+r3] 41 | LOAD_DIFF m2, m7, none, [r0+%1+r1*2], [r2+%1+r3*2] 42 | LOAD_DIFF m3, m6, none, [r0+%1+r4], [r2+%1+r5] 43 | lea r0, [r0+4*r1] 44 | lea r2, [r2+4*r3] 45 | LOAD_DIFF m4, m7, none, [r0+%1], [r2+%1] 46 | LOAD_DIFF m5, m6, none, [r0+%1+r1], [r2+%1+r3] 47 | LOAD_DIFF m6, m7, none, [r0+%1+r1*2], [r2+%1+r3*2] 48 | movq [spill], m5 49 | LOAD_DIFF m7, m5, none, [r0+%1+r4], [r2+%1+r5] 50 | movq m5, [spill] 51 | %endmacro 52 | 53 | %macro SUM4x8_MM 0 54 | movq [spill], m6 55 | movq [spill+8], m7 56 | ABSW2 m0, m1, m0, m1, m6, m7 57 | ABSW2 m2, m3, m2, m3, m6, m7 58 | paddw m0, m2 59 | paddw m1, m3 60 | movq m6, [spill] 61 | movq m7, [spill+8] 62 | ABSW2 m4, m5, m4, m5, m2, m3 63 | ABSW2 m6, m7, m6, m7, m2, m3 64 | paddw m4, m6 65 | paddw m5, m7 66 | paddw m0, m4 67 | paddw m1, m5 68 | paddw m0, m1 69 | %endmacro 70 | 71 | ;----------------------------------------------------------------------------- 72 | ; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) 73 | ;----------------------------------------------------------------------------- 74 | cglobal pixel_sa8d_8x8_internal 75 | push r0 76 | push r2 77 | sub esp, 0x74 78 | %define args esp+0x74 79 | %define spill esp+0x60 ; +16 80 | %define trans esp+0 ; +96 81 | LOAD_DIFF_4x8P 0 82 | HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 83 | 84 | movq [spill], m1 85 | TRANSPOSE4x4W 4, 5, 6, 7, 1 86 | movq [trans+0x00], m4 87 | movq [trans+0x08], m5 88 | movq [trans+0x10], m6 89 | movq [trans+0x18], m7 90 | movq m1, [spill] 91 | TRANSPOSE4x4W 0, 1, 2, 3, 4 92 | movq [trans+0x20], m0 93 | movq [trans+0x28], m1 94 | movq 
[trans+0x30], m2 95 | movq [trans+0x38], m3 96 | 97 | mov r0, [args+4] 98 | mov r2, [args] 99 | LOAD_DIFF_4x8P 4 100 | HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 101 | 102 | movq [spill], m7 103 | TRANSPOSE4x4W 0, 1, 2, 3, 7 104 | movq [trans+0x40], m0 105 | movq [trans+0x48], m1 106 | movq [trans+0x50], m2 107 | movq [trans+0x58], m3 108 | movq m7, [spill] 109 | TRANSPOSE4x4W 4, 5, 6, 7, 1 110 | movq m0, [trans+0x00] 111 | movq m1, [trans+0x08] 112 | movq m2, [trans+0x10] 113 | movq m3, [trans+0x18] 114 | 115 | HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 116 | SUM4x8_MM 117 | movq [trans], m0 118 | 119 | movq m0, [trans+0x20] 120 | movq m1, [trans+0x28] 121 | movq m2, [trans+0x30] 122 | movq m3, [trans+0x38] 123 | movq m4, [trans+0x40] 124 | movq m5, [trans+0x48] 125 | movq m6, [trans+0x50] 126 | movq m7, [trans+0x58] 127 | 128 | HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 129 | SUM4x8_MM 130 | 131 | pavgw m0, [trans] 132 | add esp, 0x7c 133 | ret 134 | %undef args 135 | %undef spill 136 | %undef trans 137 | 138 | %macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op 139 | pxor %7, %7 140 | pshufw %4, %1, q1032 141 | pshufw %5, %2, q1032 142 | pshufw %6, %3, q1032 143 | paddusw %1, %4 144 | paddusw %2, %5 145 | paddusw %3, %6 146 | punpcklwd %1, %7 147 | punpcklwd %2, %7 148 | punpcklwd %3, %7 149 | pshufw %4, %1, q1032 150 | pshufw %5, %2, q1032 151 | pshufw %6, %3, q1032 152 | %8 %1, %4 153 | %8 %2, %5 154 | %8 %3, %6 155 | %endmacro 156 | 157 | %macro LOAD_4x8P 1 ; dx 158 | pxor m7, m7 159 | movd m6, [r0+%1+7*FENC_STRIDE] 160 | movd m0, [r0+%1+0*FENC_STRIDE] 161 | movd m1, [r0+%1+1*FENC_STRIDE] 162 | movd m2, [r0+%1+2*FENC_STRIDE] 163 | movd m3, [r0+%1+3*FENC_STRIDE] 164 | movd m4, [r0+%1+4*FENC_STRIDE] 165 | movd m5, [r0+%1+5*FENC_STRIDE] 166 | punpcklbw m6, m7 167 | punpcklbw m0, m7 168 | punpcklbw m1, m7 169 | movq [spill], m6 170 | punpcklbw m2, m7 171 | punpcklbw m3, m7 172 | movd m6, [r0+%1+6*FENC_STRIDE] 173 | punpcklbw m4, m7 174 | punpcklbw m5, m7 175 | punpcklbw m6, m7 176 | movq m7, 
[spill] 177 | %endmacro 178 | 179 | %macro HSUMSUB2 4 180 | pshufw m4, %1, %3 181 | pshufw m5, %2, %3 182 | pmullw %1, %4 183 | pmullw m5, %4 184 | paddw %1, m4 185 | paddw %2, m5 186 | %endmacro 187 | 188 | ;----------------------------------------------------------------------------- 189 | ; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res ) 190 | ;----------------------------------------------------------------------------- 191 | cglobal intra_sa8d_x3_8x8, 2,3 192 | SUB esp, 0x94 193 | %define edge esp+0x70 ; +32 194 | %define spill esp+0x60 ; +16 195 | %define trans esp+0 ; +96 196 | %define sum esp+0 ; +32 197 | 198 | pxor m7, m7 199 | movq m0, [r1+7] 200 | movq m2, [r1+16] 201 | movq m1, m0 202 | movq m3, m2 203 | punpcklbw m0, m7 204 | punpckhbw m1, m7 205 | punpcklbw m2, m7 206 | punpckhbw m3, m7 207 | movq m6, [pw_ppmmppmm] 208 | HSUMSUB2 m0, m2, q1032, m6 209 | HSUMSUB2 m1, m3, q1032, m6 210 | movq m6, [pw_pmpmpmpm] 211 | HSUMSUB2 m0, m2, q2301, m6 212 | HSUMSUB2 m1, m3, q2301, m6 213 | movq m4, m0 214 | movq m5, m2 215 | paddw m0, m1 216 | paddw m2, m3 217 | psubw m4, m1 218 | psubw m3, m5 219 | movq [edge+0], m0 220 | movq [edge+8], m4 221 | movq [edge+16], m2 222 | movq [edge+24], m3 223 | 224 | LOAD_4x8P 0 225 | HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 226 | 227 | movq [spill], m0 228 | TRANSPOSE4x4W 4, 5, 6, 7, 0 229 | movq [trans+0x00], m4 230 | movq [trans+0x08], m5 231 | movq [trans+0x10], m6 232 | movq [trans+0x18], m7 233 | movq m0, [spill] 234 | TRANSPOSE4x4W 0, 1, 2, 3, 4 235 | movq [trans+0x20], m0 236 | movq [trans+0x28], m1 237 | movq [trans+0x30], m2 238 | movq [trans+0x38], m3 239 | 240 | LOAD_4x8P 4 241 | HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 242 | 243 | movq [spill], m7 244 | TRANSPOSE4x4W 0, 1, 2, 3, 7 245 | movq [trans+0x40], m0 246 | movq [trans+0x48], m1 247 | movq [trans+0x50], m2 248 | movq [trans+0x58], m3 249 | movq m7, [spill] 250 | TRANSPOSE4x4W 4, 5, 6, 7, 0 251 | movq m0, [trans+0x00] 252 | movq m1, [trans+0x08] 253 
| movq m2, [trans+0x10] 254 | movq m3, [trans+0x18] 255 | 256 | HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 257 | 258 | movq [spill+0], m0 259 | movq [spill+8], m1 260 | ABSW2 m2, m3, m2, m3, m0, m1 261 | ABSW2 m4, m5, m4, m5, m0, m1 262 | paddw m2, m4 263 | paddw m3, m5 264 | ABSW2 m6, m7, m6, m7, m4, m5 265 | movq m0, [spill+0] 266 | movq m1, [spill+8] 267 | paddw m2, m6 268 | paddw m3, m7 269 | paddw m2, m3 270 | ABSW m1, m1, m4 271 | paddw m2, m1 ; 7x4 sum 272 | movq m7, m0 273 | movq m1, [edge+8] ; left bottom 274 | psllw m1, 3 275 | psubw m7, m1 276 | ABSW2 m0, m7, m0, m7, m5, m3 277 | paddw m0, m2 278 | paddw m7, m2 279 | movq [sum+0], m0 ; dc 280 | movq [sum+8], m7 ; left 281 | 282 | movq m0, [trans+0x20] 283 | movq m1, [trans+0x28] 284 | movq m2, [trans+0x30] 285 | movq m3, [trans+0x38] 286 | movq m4, [trans+0x40] 287 | movq m5, [trans+0x48] 288 | movq m6, [trans+0x50] 289 | movq m7, [trans+0x58] 290 | 291 | HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 292 | 293 | movd [sum+0x10], m0 294 | movd [sum+0x12], m1 295 | movd [sum+0x14], m2 296 | movd [sum+0x16], m3 297 | movd [sum+0x18], m4 298 | movd [sum+0x1a], m5 299 | movd [sum+0x1c], m6 300 | movd [sum+0x1e], m7 301 | 302 | movq [spill], m0 303 | movq [spill+8], m1 304 | ABSW2 m2, m3, m2, m3, m0, m1 305 | ABSW2 m4, m5, m4, m5, m0, m1 306 | paddw m2, m4 307 | paddw m3, m5 308 | paddw m2, m3 309 | movq m0, [spill] 310 | movq m1, [spill+8] 311 | ABSW2 m6, m7, m6, m7, m4, m5 312 | ABSW m1, m1, m3 313 | paddw m2, m7 314 | paddw m1, m6 315 | paddw m2, m1 ; 7x4 sum 316 | movq m1, m0 317 | 318 | movq m7, [edge+0] 319 | psllw m7, 3 ; left top 320 | 321 | mov r2, [edge+0] 322 | add r2, [edge+16] 323 | lea r2, [4*r2+32] 324 | and r2, 0xffc0 325 | movd m6, r2 ; dc 326 | 327 | psubw m1, m7 328 | psubw m0, m6 329 | ABSW2 m0, m1, m0, m1, m5, m6 330 | movq m3, [sum+0] ; dc 331 | paddw m0, m2 332 | paddw m1, m2 333 | movq m2, m0 334 | paddw m0, m3 335 | paddw m1, [sum+8] ; h 336 | psrlq m2, 16 337 | paddw m2, m3 338 | 339 | movq m3, 
[edge+16] ; top left 340 | movq m4, [edge+24] ; top right 341 | psllw m3, 3 342 | psllw m4, 3 343 | psubw m3, [sum+16] 344 | psubw m4, [sum+24] 345 | ABSW2 m3, m4, m3, m4, m5, m6 346 | paddw m2, m3 347 | paddw m2, m4 ; v 348 | 349 | SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, pavgw 350 | mov r2, r2m 351 | pxor m7, m7 352 | punpckldq m2, m1 353 | pavgw m0, m7 354 | pavgw m2, m7 355 | movd [r2+8], m0 ; dc 356 | movq [r2+0], m2 ; v, h 357 | ADD esp, 0x94 358 | RET 359 | %undef edge 360 | %undef spill 361 | %undef trans 362 | %undef sum 363 | 364 | 365 | 366 | ;----------------------------------------------------------------------------- 367 | ; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1, 368 | ; const uint8_t *pix2, intptr_t stride2, int sums[2][4] ) 369 | ;----------------------------------------------------------------------------- 370 | cglobal pixel_ssim_4x4x2_core, 0,5 371 | mov r1, r1m 372 | mov r3, r3m 373 | mov r4, 4 374 | pxor m0, m0 375 | .loop: 376 | mov r0, r0m 377 | mov r2, r2m 378 | add r0, r4 379 | add r2, r4 380 | pxor m1, m1 381 | pxor m2, m2 382 | pxor m3, m3 383 | pxor m4, m4 384 | %rep 4 385 | movd m5, [r0] 386 | movd m6, [r2] 387 | punpcklbw m5, m0 388 | punpcklbw m6, m0 389 | paddw m1, m5 390 | paddw m2, m6 391 | movq m7, m5 392 | pmaddwd m5, m5 393 | pmaddwd m7, m6 394 | pmaddwd m6, m6 395 | paddd m3, m5 396 | paddd m4, m7 397 | paddd m3, m6 398 | add r0, r1 399 | add r2, r3 400 | %endrep 401 | mov r0, r4m 402 | lea r0, [r0+r4*4] 403 | pshufw m5, m1, q0032 404 | pshufw m6, m2, q0032 405 | paddusw m1, m5 406 | paddusw m2, m6 407 | punpcklwd m1, m2 408 | pshufw m2, m1, q0032 409 | pshufw m5, m3, q0032 410 | pshufw m6, m4, q0032 411 | paddusw m1, m2 412 | paddd m3, m5 413 | paddd m4, m6 414 | punpcklwd m1, m0 415 | punpckldq m3, m4 416 | movq [r0+0], m1 417 | movq [r0+8], m3 418 | sub r4, 4 419 | jge .loop 420 | emms 421 | RET 422 | 423 | %endif ; !HIGH_BIT_DEPTH 424 | 
--------------------------------------------------------------------------------