├── img
    ├── mask.png
    ├── coppers.png
    ├── rainbow.png
    ├── rainbow2.png
    ├── weights.png
    ├── mpv-shot0001.jpg
    ├── mpv-shot0002.jpg
    ├── mpv-shot0003.jpg
    ├── mpv-shot0004.jpg
    ├── coppers.php
    └── rainbow.php
├── newhash
    ├── newhash.hh
    ├── endian.hh
    ├── newhash.cc
    └── simd.hh
├── make-reencoded.sh
├── reencode.sh
├── blur.hh
├── README.md
└── crt-filter.cc


/img/mask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bisqwit/crt-filter/HEAD/img/mask.png


--------------------------------------------------------------------------------
/img/coppers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bisqwit/crt-filter/HEAD/img/coppers.png


--------------------------------------------------------------------------------
/img/rainbow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bisqwit/crt-filter/HEAD/img/rainbow.png


--------------------------------------------------------------------------------
/img/rainbow2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bisqwit/crt-filter/HEAD/img/rainbow2.png


--------------------------------------------------------------------------------
/img/weights.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bisqwit/crt-filter/HEAD/img/weights.png


--------------------------------------------------------------------------------
/img/mpv-shot0001.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bisqwit/crt-filter/HEAD/img/mpv-shot0001.jpg


--------------------------------------------------------------------------------
/img/mpv-shot0002.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bisqwit/crt-filter/HEAD/img/mpv-shot0002.jpg


--------------------------------------------------------------------------------
/img/mpv-shot0003.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bisqwit/crt-filter/HEAD/img/mpv-shot0003.jpg


--------------------------------------------------------------------------------
/img/mpv-shot0004.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bisqwit/crt-filter/HEAD/img/mpv-shot0004.jpg


--------------------------------------------------------------------------------
/newhash/newhash.hh:
--------------------------------------------------------------------------------
 1 | #ifndef bqtNewhashHH
 2 | #define bqtNewhashHH
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | #include "endian.hh"
 9 | 
10 | typedef std::uint_least32_t newhash_t;
11 | 
12 | extern newhash_t newhash_calc(const unsigned char* buf, unsigned long size);
13 | extern newhash_t newhash_calc_upd(newhash_t c, const unsigned char* buf, unsigned long size);
14 | 
15 | #ifdef __cplusplus
16 | }
17 | #endif
18 | 
19 | #endif
20 | 


--------------------------------------------------------------------------------
/make-reencoded.sh:
--------------------------------------------------------------------------------
 1 | parallel -j6 -- \
 2 | 	"./reencode.sh 200 sync-gwbasic.avi sync-gwbasic-fancy.mkv 640 200   60 2880 2160" \
 3 | 	"./reencode.sh 400 tp5.avi tp5-fancy.mkv                   2880 400  60 2880 2160" \
 4 | 	"./reencode.sh 400 sync-qbasic.avi sync-qbasic-fancy.mkv   2880 400  60 2880 2160" \
 5 | 	"./reencode.sh 400 sync-editv2.avi sync-editv2-fancy.mkv   2880 400  60 2880 2160" \
 6 | 	"./reencode.sh 400 sync-bc3.avi sync-bc3-fancy.mkv         2880 400  60 2880 2160" \
 7 | 	"./reencode.sh 400 sync-bp7.avi sync-bp7-fancy.mkv         2880 400  60 2880 2160" \
 8 | 	"./reencode.sh 400 bp7conf.avi bp7conf-fancy.mkv           2880 400  60 2880 2160" \
 9 | 	"./reencode.sh 350 q.avi q-fancy.mkv                       640 350   60 2880 2160"
10 | 


--------------------------------------------------------------------------------
/reencode.sh:
--------------------------------------------------------------------------------
 1 | scanlines=$1
 2 | outputfile="$3"
 3 | 
 4 | w=$4
 5 | h=$5
 6 | 
 7 | ow=$7
 8 | oh=$8
 9 | r=$6
10 | 
11 | f="$2"
12 | 
13 | ffmpeg -i "$f" -sws_flags lanczos -vf scale=$w:$h -pix_fmt bgra \
14 | 	-f rawvideo -threads 14 -r $r -y /dev/stdout \
15 | | ./crt-filter $w $h $ow $oh $scanlines \
16 | | ffmpeg -f rawvideo -pixel_format bgra -video_size $ow"x"$oh \
17 | 	 -framerate $r -i /dev/stdin \
18 | 	 -c:v h264 -pix_fmt yuv444p -crf 14 -threads 14 \
19 | 	 -g $((r/2)) -preset veryslow "$outputfile"
20 | 
21 | 
22 | # The first ffmpeg just converts the colorspace into BGRA.
23 | # It should not change the resolution.
24 | 
25 | # The second one does the rescaling.
26 | 
27 | # The third one compresses as H.264 — again, without rescaling.
28 | 


--------------------------------------------------------------------------------
/img/coppers.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | $c=1;
 4 | function eq($n)
 5 | {
 6 |   global $c;
 7 |   return exp(-0.5*($n-0.5)*($n-0.5)/($c*$c));
 8 | }
 9 | 
10 | $w = 80;
11 | $h = 392;
12 | 
13 | $im    = ImageCreateTrueColor($w*3+4, $h);
14 | $lines = 4;
15 | 
16 | $s=0;
17 | foreach(Array(0.1, 0.3, 0.5) as $c)
18 | {
19 |   for($y=0; $y<$h; ++$y)
20 |   {
21 |     $line = $y*$lines/$h;
22 |     $eq   = eq($line - (int)$line);
23 |     
24 |     $color = ImageColorAllocate($im, $eq*200, $eq*50, $eq*255);
25 |     ImageLine($im, $s, $y, $s+$w, $y, $color);
26 |   }
27 |   for($l=0; $l<$lines; ++$l)
28 |   {
29 |     for($d = 0; $d < 4; ++$d)
30 |     {
31 |       $y = ($l + $d/4) * $h/$lines;
32 |       
33 |       $wid = 1/16;
34 |       if($d) $wid /= 4;
35 |       if($d==2) $wid *= 2;
36 |       
37 |       ImageLine($im, $s+0, $y-1, $s+$w*$wid, $y-1, 0xAA55AA);
38 |       ImageLine($im, $s+0, $y  , $s+$w*$wid, $y  , 0xFFFFFF);
39 |       ImageLine($im, $s+0, $y+1, $s+$w*$wid, $y+1, 0xAA55AA);
40 |       
41 |       ImageString($im, 2, $s+$w*$wid+3, $y, $l+$d/4, 0xFFFFFF);
42 |     }
43 |   }
44 |   ImageString($im, 3, $s+$w*0.4, 0, "c = $c", 0xFFFF55);
45 | 
46 |   $s += $w+2;
47 | }
48 | 
49 | ImagePng($im, 'test3.png');
50 | 
51 | 


--------------------------------------------------------------------------------
/blur.hh:
--------------------------------------------------------------------------------
 1 | #include <cmath>
 2 | 
 3 | /* blur(): Really fast O(n) gaussian blur algorithm (gaussBlur_4)
 4 |  * By Ivan Kuckir with ideas from Wojciech Jarosz
 5 |  * Adapted from http://blog.ivank.net/fastest-gaussian-blur.html
 6 |  *
 7 |  * input:  The two-dimensional array of input signal. Must contain w*h elements.
 8 |  * output: Where the two-dimensional array of blurred signal will be written
 9 |  * temp:   Another array, for temporary use. Same size as input and output.
10 |  * w:      Width of array
11 |  * h:      Height of array.
12 |  * sigma:  Blurring kernel size. Must be smaller than w and h.
13 |  * n_boxes: Controls the blurring quality. 1 = box filter. 3 = pretty good filter.
14 |  *          Higher number = diminishingly better results, but linearly slower.
15 |  * elem_t: Type of elements. Should be integer type.
16 |  */
17 | template<unsigned n_boxes, typename elem_t>
18 | void blur(const elem_t* input, elem_t* output, elem_t* temp,
19 |           unsigned w,unsigned h,float sigma)
20 | {
21 |     auto wIdeal = std::sqrt((12*sigma*sigma/n_boxes)+1);  // Ideal averaging filter width
22 |     unsigned wl = wIdeal; if(wl%2==0) --wl;
23 |     unsigned wu = wl+2;
24 |     auto mIdeal = (12*sigma*sigma - n_boxes*wl*wl - 4*n_boxes*wl - 3*n_boxes)/(-4.*wl - 4);
25 |     unsigned m = std::round(mIdeal);
26 |     const elem_t* data = input;
27 |     for(unsigned n=0; n<n_boxes; ++n)
28 |     {
29 |         unsigned r = ((n<m ? wl : wu) - 1)/2; // IDK should this be float?
30 |         // boxBlur_4:
31 |         float iarr = 1.f / (r+r+1);
32 |         // boxBlurH_4 (blur horizontally for each row):
33 |         const elem_t* scl = data; elem_t* tcl = temp;
34 |         for(unsigned i=0; i<h; ++i)
35 |         {
36 |             auto ti = i*w, li = ti, ri = ti+r;
37 |             auto fv = scl[ti], lv = scl[ti+w-1]; int val = 0;
38 |             #pragma omp simd reduction(+:val)
39 |             for(unsigned j=0; j<r; j++) val += scl[ti+j];
40 |             val += (r+1)*fv;
41 |             for(unsigned j=0  ; j<=r ; j++) { val += scl[ri++] - fv       ;   tcl[ti++] = std::round(val*iarr); }
42 |             for(unsigned j=r+1; j<w-r; j++) { val += scl[ri++] - scl[li++];   tcl[ti++] = std::round(val*iarr); }
43 |             for(unsigned j=w-r; j<w  ; j++) { val += lv        - scl[li++];   tcl[ti++] = std::round(val*iarr); }
44 |         }
45 |         // boxBlurT_4 (blur vertically for each column)
46 |         scl = temp; tcl = output;
47 |         for(unsigned i=0; i<w; ++i)
48 |         {
49 |             auto ti = i, li = ti, ri = ti+r*w;
50 |             auto fv = scl[ti], lv = scl[ti+w*(h-1)]; int val = 0;
51 |             #pragma omp simd reduction(+:val)
52 |             for(unsigned j=0; j<r;  ++j) val += scl[ti + j*w];
53 |             val += (r+1)*fv;
54 |             for(unsigned j=0; j<=r; ++j)    { val += scl[ri] - fv     ;  tcl[ti] = std::round(val*iarr);  ri+=w; ti+=w; }
55 |             for(unsigned j=r+1; j<h-r; ++j) { val += scl[ri] - scl[li];  tcl[ti] = std::round(val*iarr);  li+=w; ri+=w; ti+=w; }
56 |             for(unsigned j=h-r; j<h; ++j)   { val += lv      - scl[li];  tcl[ti] = std::round(val*iarr);  li+=w; ti+=w; }
57 |         }
58 |         data = output;
59 |     }
60 | }
61 | 


--------------------------------------------------------------------------------
/img/rainbow.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | $a = 2126;
  4 | $b = 7152;
  5 | $c = 722;
  6 | 
  7 | $abc = $a+$b+$c+0.0;
  8 | 
  9 | function hsv_to_bgr($h,$s, $v)
 10 | {
 11 |     $v *= 255.0;
 12 | 
 13 |     if ($s == 0.0)
 14 |         return Array($v,$v,$v);
 15 |     else
 16 |     {
 17 |         while ($h < 0) $h += 360;
 18 |         $h = fmod($h, 360.0) / 60.0;
 19 |         $i = (int)$h;
 20 |         $frac = $h - $i;
 21 |         $p    = $v - $v*$s;
 22 |         $qt   = $v*$s*$frac;
 23 |         
 24 |         $bgr = Array();
 25 |         $bgr[ ((10-$i)%6)>>1 ] = ($v); // max  
 26 |         $bgr[ ( (7-$i)%6)>>1 ] = ($p); // min
 27 |         $bgr[    ($i+1)%3 ] = (
 28 |             ($i&1) ? $v - $qt    // max downto min
 29 |                    : $p + $qt ); // min   upto max
 30 |         return $bgr;
 31 |     }
 32 | }
 33 | 
 34 | $dither = Array(1,49,13,61,4,52,16,64,33,17,45,29,36,20,48,32,9,57,5,53,12,60,8,56,41,25,37,21,44,28,40,24,3,51,15,63,2,50,14,62,35,19,47,31,34,18,46,30,11,59,7,55,10,58,6,54,43,27,39,23,42,26,38,22);
 35 | foreach($dither as &$v)$v /= 64.; unset($v);
 36 | 
 37 | function clamp($pix, $x,$y)
 38 | {
 39 |   global $gamma,$dither;
 40 | 
 41 |   if($pix[0] < 0) $pix[0] = 0;
 42 |   if($pix[1] < 0) $pix[1] = 0;
 43 |   if($pix[2] < 0) $pix[2] = 0;
 44 |   $pix[0] = pow($pix[0]/255, $gamma)*255;
 45 |   $pix[1] = pow($pix[1]/255, $gamma)*255;
 46 |   $pix[2] = pow($pix[2]/255, $gamma)*255;
 47 | 
 48 |   $d = $dither[($x&7)*8+($y&7)];
 49 |   $pix[0] += $d;
 50 |   $pix[1] += $d;
 51 |   $pix[2] += $d;
 52 |   if($pix[0] > 255) $pix[0] = 255;
 53 |   if($pix[1] > 255) $pix[1] = 255;
 54 |   if($pix[2] > 255) $pix[2] = 255;
 55 |   
 56 |   return (int)$pix[2] * 65536
 57 |        + (int)$pix[1] * 256
 58 |        + (int)$pix[0];
 59 | }
 60 | function clamp2($pix, $x,$y)
 61 | {
 62 |   global $a,$b,$c,$abc;
 63 |   $lum = $pix[2]*$a + $pix[1]*$b + $pix[0]*$c;
 64 |   if($lum > 255*$abc) return 0xFFFFFF;
 65 |   if($lum <= 0)       return 0x000000;
 66 |   $lum /= ($abc*1.0);
 67 | 
 68 |   $aa=$a; $bb=$b; $cc=$c;
 69 |   #$aa=1111; $bb=3333; $cc=5334;
 70 |   for($round=0; $round<3; ++$round)
 71 |   {
 72 |     $excess = $aa*max(0, $pix[2]-255)
 73 |             + $bb*max(0, $pix[1]-255)
 74 |             + $cc*max(0, $pix[0]-255);
 75 |     // $excess is the amount of excess color energy that
 76 |     // we must dissipate.
 77 |     if($excess > 0)
 78 |     {
 79 |       // Check how much capacity there is on each channel.
 80 |       $capacity = 0;
 81 |       $cap2 = (255-$pix[2])*$aa;
 82 |       $cap1 = (255-$pix[1])*$bb;
 83 |       $cap0 = (255-$pix[0])*$cc;
 84 |       $capacity = max(0,$cap2) + max(0,$cap1) + max(0,$cap0);
 85 |       if($capacity > 0)
 86 |       {
 87 |         $distribute = min($capacity, $excess);
 88 |         $factor1    = $distribute/$capacity;
 89 | 
 90 |         // Add the color energy to capable channels
 91 |         if($cap2 > 0) $pix[2] += ($cap2*$factor1)/$aa;
 92 |         if($cap1 > 0) $pix[1] += ($cap1*$factor1)/$bb;
 93 |         if($cap0 > 0) $pix[0] += ($cap0*$factor1)/$cc;
 94 | 
 95 |         // And take it away from channels that had excess
 96 |         $factor2    = $distribute/$excess;
 97 |         if($cap2 < 0) $pix[2] += ($cap2*$factor2)/$aa;
 98 |         if($cap1 < 0) $pix[1] += ($cap1*$factor2)/$bb;
 99 |         if($cap0 < 0) $pix[0] += ($cap0*$factor2)/$cc;
100 |       }
101 |     }
102 |     
103 |     $debt = $aa*min(0, $pix[2])
104 |           + $bb*min(0, $pix[1])
105 |           + $cc*min(0, $pix[0]);
106 |     // $debt is the amount of debt color energy
107 |     // that we must borrow.
108 |     if($debt < 0)
109 |     {
110 |       // Check how much capacity there is on each channel.
111 |       $capacity = 0;
112 |       $cap2 = ($pix[2])*$aa;
113 |       $cap1 = ($pix[1])*$bb;
114 |       $cap0 = ($pix[0])*$cc;
115 |       $capacity = max(0,$cap2) + max(0,$cap1) + max(0,$cap0);
116 |       if($capacity > 0)
117 |       {
118 |         $distribute = min($capacity, $excess);
119 |         $factor1    = $distribute/$capacity;
120 | 
121 |         // Take away color energy from capable channels
122 |         if($cap2 > 0) $pix[2] -= ($cap2*$factor1)/$aa;
123 |         if($cap1 > 0) $pix[1] -= ($cap1*$factor1)/$bb;
124 |         if($cap0 > 0) $pix[0] -= ($cap0*$factor1)/$cc;
125 | 
126 |         // And give it to channels that need it
127 |         $factor2    = $distribute/$excess;
128 |         if($cap2 < 0) $pix[2] -= ($cap2*$factor2)/$aa;
129 |         if($cap1 < 0) $pix[1] -= ($cap1*$factor2)/$bb;
130 |         if($cap0 < 0) $pix[0] -= ($cap0*$factor2)/$cc;
131 |       }
132 |     }
133 |     if(!$excess && !$debt) break;
134 |   }
135 |   return clamp($pix,$x,$y);
136 | }
137 | 
// Size of one panel. The image holds two panels side by side:
// left = plain clamp(), right = luminance-preserving clamp2().
$w  = 848;
$h  = 480;
$im = ImageCreateTrueColor($w*2, $h);

$gamma  = 1/2.0;  // exponent used by clamp(); its inverse is applied below
$gamma2 = 2.0;    // shapes the top-to-bottom brightness ramp
$channels = Array(0, 1, 2);

for($y=0; $y<$h; ++$y)
{
  // Target luminance of this row: 0 at the top, approaching 1 at the bottom.
  #$bright = pow($y/$h, 2.0)*3;
  $bright = pow($y/$h, $gamma2)*1.0;

  for($x=0; $x<$w; ++$x)
  {
    // Hue sweeps 1.5 turns from left to right, starting at -180 degrees;
    // saturation rises from 0 and reaches 1 slightly past the middle.
    $hue = $x * 1.5*360 / $w - 180;
    $sat = min(1, $x*1.9/$w);
    $pix = hsv_to_bgr($hue, $sat, 0.1);

    // Scale to 0..1 and undo the gamma that clamp() is going to apply.
    foreach($channels as $ch)
      $pix[$ch] = pow($pix[$ch]/255., 1/$gamma);

    $lum = ($pix[2]*$a + $pix[1]*$b + $pix[0]*$c) / $abc;

    // Rescale so that the luminance of the pixel becomes $bright.
    foreach($channels as $ch)
      $pix[$ch] = ($pix[$ch] * $bright/$lum)*255;

    ImageSetPixel($im, $x,$y, clamp($pix, $x,$y));

    // NOTE(review): the right panel starts at $w+16 but the canvas is only
    // $w*2 wide, so its last 16 columns fall outside the image — confirm
    // whether that is intended.
    ImageSetPixel($im, $x+$w+16,$y, clamp2($pix, $x,$y));
  }
}
ImagePng($im, 'test.png');

print "done\n";
173 | 
174 | 


--------------------------------------------------------------------------------
/newhash/endian.hh:
--------------------------------------------------------------------------------
  1 | #ifndef bqtEndianHH
  2 | #define bqtEndianHH
  3 | 
  4 | #ifndef __STDC_CONSTANT_MACROS
  5 | #define __STDC_CONSTANT_MACROS /* for UINT16_C etc */
  6 | #endif
  7 | 
  8 | #include <cstdint>
  9 | 
 10 | #if defined(__x86_64)||defined(__i386)
 11 | #define LITTLE_ENDIAN_AND_UNALIGNED_ACCESS_OK
 12 | #else
 13 | #undef LITTLE_ENDIAN_AND_UNALIGNED_ACCESS_OK
 14 | #endif
 15 | 
 16 | #ifdef WIN32
 17 | # define LL_FMT "I64"
 18 | #else
 19 | # define LL_FMT "ll"
 20 | #endif
 21 | 
 22 | 
 23 | static inline std::uint_fast16_t get_8(const void* p)
 24 | {
 25 |     const unsigned char* data = (const unsigned char*)p;
 26 |     return data[0];
 27 | }
 28 | static inline std::uint_fast16_t get_16(const void* p)
 29 | {
 30 |   #ifdef LITTLE_ENDIAN_AND_UNALIGNED_ACCESS_OK
 31 |     return *(const std::uint_least16_t*)p;
 32 |   #else
 33 |     const unsigned char* data = (const unsigned char*)p;
 34 |     return get_8(data)  | (get_8(data+1) << UINT16_C(8));
 35 |   #endif
 36 | }
 37 | static inline std::uint_fast16_t R16r(const void* p)
 38 | {
 39 |   #ifdef BIG_ENDIAN_AND_UNALIGNED_ACCESS_OK
 40 |     return *(const std::uint_least16_t*)p;
 41 |   #else
 42 |     const unsigned char* data = (const unsigned char*)p;
 43 |     return get_8(data+1)  | (get_8(data) << UINT16_C(8));
 44 |   #endif
 45 | }
 46 | static inline std::uint_fast32_t R24(const void* p)
 47 | {
 48 |     /* Note: This might be faster if implemented through R32 and a bitwise and,
 49 |      * but we cannot do that because we don't know if the third byte is a valid
 50 |      * memory location.
 51 |      */
 52 |     const unsigned char* data = (const unsigned char*)p;
 53 |     return get_16(data) | (get_8(data+2) << UINT32_C(16));
 54 | }
 55 | static inline std::uint_fast32_t R24r(const void* p)
 56 | {
 57 |     const unsigned char* data = (const unsigned char*)p;
 58 |     return get_16(data+1) | (get_8(data) << UINT32_C(16));
 59 | }
 60 | static inline std::uint_fast32_t get_32(const void* p)
 61 | {
 62 |   #ifdef LITTLE_ENDIAN_AND_UNALIGNED_ACCESS_OK
 63 |     return *(const std::uint_least32_t*)p;
 64 |   #else
 65 |     const unsigned char* data = (const unsigned char*)p;
 66 |     return get_16(data) | (get_16(data+2) << UINT32_C(16));
 67 |   #endif
 68 | }
 69 | static inline std::uint_fast32_t R32r(const void* p)
 70 | {
 71 |   #ifdef BIG_ENDIAN_AND_UNALIGNED_ACCESS_OK
 72 |     return *(const std::uint_least32_t*)p;
 73 |   #else
 74 |     const unsigned char* data = (const unsigned char*)p;
 75 |     return get_16(data+2) | (get_16(data) << UINT32_C(16));
 76 |   #endif
 77 | }
 78 | 
 79 | #define L (std::uint_fast64_t)
 80 | 
 81 | static inline std::uint_fast64_t get_64(const void* p)
 82 | {
 83 |   #ifdef LITTLE_ENDIAN_AND_UNALIGNED_ACCESS_OK
 84 |     return *(const std::uint_least64_t*)p;
 85 |   #else
 86 |     const unsigned char* data = (const unsigned char*)p;
 87 |     return (L get_32(data)) | ((L get_32(data+4)) << UINT64_C(32));
 88 |   #endif
 89 | }
 90 | static inline std::uint_fast64_t R64r(const void* p)
 91 | {
 92 |   #ifdef BIG_ENDIAN_AND_UNALIGNED_ACCESS_OK
 93 |     return *(const std::uint_least64_t*)p;
 94 |   #else
 95 |     const unsigned char* data = (const unsigned char*)p;
 96 |     return (L get_32(data+4)) | ((L get_32(data)) << UINT64_C(32));
 97 |   #endif
 98 | }
 99 | 
100 | #undef L
101 | 
102 | static inline std::uint_fast64_t get_n(const void* p, unsigned bytes)
103 | {
104 |     const unsigned char* data = (const unsigned char*)p;
105 |     std::uint_fast64_t res(0);
106 |     switch(bytes)
107 |     {
108 |         case 8: return get_64(p);
109 |         case 4: return get_32(p);
110 |         case 2: return get_16(p);
111 |         case 7: res |= ((std::uint_fast64_t)get_8(data+6)) << 48; [[fallthrough]];
112 |         case 6: res |= ((std::uint_fast64_t)get_8(data+5)) << 40; [[fallthrough]];
113 |         case 5: res |= ((std::uint_fast64_t)get_16(data+3)) << 24; [[fallthrough]];
114 |         case 3: res |= ((std::uint_fast64_t)get_16(data+1)) << 8; [[fallthrough]];
115 |         case 1: res |= get_8(data);
116 |     }
117 |     return res;
118 | }
119 | 
/* Writers: store the low-order bytes of `value` at p in little-endian order.
 * They are `static inline` like the readers, so that a translation unit
 * which includes this header without using them does not end up with
 * unused function definitions.
 * The direct stores are compiled in only when
 * LITTLE_ENDIAN_AND_UNALIGNED_ACCESS_OK is defined.
 */

/* Writes one byte. */
static inline void put_8(void* p, std::uint_fast8_t value)
{
    unsigned char* data = (unsigned char*)p;
    data[0] = value;
}
/* Writes the low 16 bits of value. */
static inline void put_16(void* p, std::uint_fast16_t value)
{
  #ifdef LITTLE_ENDIAN_AND_UNALIGNED_ACCESS_OK
    *(std::uint_least16_t*)p = value;
  #else
    unsigned char* data = (unsigned char*)p;
    put_8(data+0, value   );
    put_8(data+1, value>>8);
  #endif
}
/* Writes the low 24 bits of value (three bytes). */
static inline void W24(void* p, std::uint_fast32_t value)
{
    unsigned char* data = (unsigned char*)p;
    put_16(data+0, value);
    put_8(data+2,  value >> UINT32_C(16));
}
/* Writes the low 32 bits of value. */
static inline void put_32(void* p, std::uint_fast32_t value)
{
  #ifdef LITTLE_ENDIAN_AND_UNALIGNED_ACCESS_OK
    *(std::uint_least32_t*)p = value;
  #else
    unsigned char* data = (unsigned char*)p;
    put_16(data+0, value);
    put_16(data+2, value >> UINT32_C(16));
  #endif
}
/* Writes all 64 bits of value. */
static inline void put_64(void* p, std::uint_fast64_t value)
{
  #ifdef LITTLE_ENDIAN_AND_UNALIGNED_ACCESS_OK
    *(std::uint_least64_t*)p = value;
  #else
    unsigned char* data = (unsigned char*)p;
    put_32(data+0, (value));
    put_32(data+4, (value >> UINT64_C(32)));
  #endif
}
161 | 
162 | static inline void put_n(void* p, std::uint_fast64_t value, unsigned bytes)
163 | {
164 |     unsigned char* data = (unsigned char*)p;
165 |     switch(bytes)
166 |     {
167 |         case 8: put_64(p, value); break;
168 |         case 7: put_8(data+6, value>>48);
169 |         case 6: put_8(data+5, value>>40);
170 |         case 5: put_8(data+4, value>>32);
171 |         case 4: put_32(p, value); break;
172 |         case 3: W24(p, value); break;
173 |         case 2: put_16(p, value); break;
174 |         case 1: put_8(p, value); break;
175 |     }
176 | }
177 | 
178 | #endif
179 | 


--------------------------------------------------------------------------------
/newhash/newhash.cc:
--------------------------------------------------------------------------------
  1 | #include "endian.hh"
  2 | #include "newhash.hh"
  3 | #include <algorithm>
  4 | 
  5 | #include "simd.hh"
  6 | 
  7 | #ifdef __GNUC__
  8 | # define likely(x)       __builtin_expect(!!(x), 1)
  9 | # define unlikely(x)     __builtin_expect(!!(x), 0)
 10 | #else
 11 | # define likely(x)   (x)
 12 | # define unlikely(x) (x)
 13 | #endif
 14 | 
 15 | /* Note: The differing algorithms in this
 16 |  *       file are not value-compatible.
 17 |  */
 18 | 
 19 | #if defined(__x86_64) || (defined(USE_MMX) && defined(__SSE2__))
 20 | /* On x86_64, we can use 64-bit registers. Which is fast.
 21 |  * On 32-bit, we can use MMX registers, if MMX is enabled
 22 |  *   However, SSE2 must ALSO be enabled, because otherwise
 23 |  *   we cannot do 64-bit sub/add efficiently.
 24 |  */
 25 | # define SIXTY_BIT_PLATFORM
 26 | #else
 27 | # undef SIXTY_BIT_PLATFORM
 28 | #endif
 29 | 
 30 | #if 0 && defined(__GNUC__) && defined(__LP64__) && !defined(__ICC)
 31 | # define HUNDREDTWENTYEIGHTBIT_PLATFORM
 32 | /*
 33 |  * 128-bit with SSE2 is not feasible, because SSE2 does not
 34 |  * have 128-bit add/sub ops. They cannot be even synthesized
 35 |  * from 64-bit adds/subs, because there's no carry update.
 36 |  * More importantly there's no 128-bit shift to left or right.
 37 |  *
 38 |  * __attribute__((mode(TI))) can be used to created a 128-bit
 39 |  * integer type on GCC, however, it does not work on ICC.
 40 |  */
 41 | 
 42 | #ifdef __SSE2__
 43 |  #include <xmmintrin.h>
 44 | #endif
 45 | 
 46 | #else
 47 | # undef HUNDREDTWENTYEIGHTBIT_PLATFORM
 48 | #endif
 49 | 
 50 | /* Based on Robert J. Jenkins Jr.'s "zobra" hash code
 51 |  * References:
 52 |  *   http://www.burtleburtle.net/bob/hash/evahash.html
 53 |  *   http://www.cris.com/~Ttwang/tech/inthash.htm
 54 |  *
 55 |  * Copyright (C) 2011 Joel Yliluoma (http://iki.fi/bisqwit/)
 56 |  */
 57 | 
 58 | template<typename T>
 59 | static inline T rol(T v, int n) { return (v<<n) | (v>>int( sizeof(T)*8 - n)); }
 60 | 
 61 | /* The mixing step */
 62 | #define mix32z(a,b,c)  \
 63 | do{ \
 64 |   a=(a-c) ^ rol(c,16); c += b; \
 65 |   b=(b-a) ^ rol(a,23); a += c; \
 66 |   c=(c-b) ^ rol(b,29); b += a; \
 67 |   a=(a-c) ^ rol(c,16); c += b; \
 68 |   b=(b-a) ^ rol(a,19); a += c; \
 69 |   c=(c-b) ^ rol(b,17); b += a; \
 70 | }while(0)
 71 | #define final32z(a,b,c)  \
 72 | do{ \
 73 |   c=(c^b) - rol(b, 5); \
 74 |   a=(a^c) - rol(c,10); \
 75 |   b=(b^a) - rol(a, 6); \
 76 |   c=(c^b) - rol(b, 9); \
 77 | }while(0)
 78 | 
 79 | #define mix64z(a,b,c)  \
 80 | do{ \
 81 |   a=(a-c) ^ rol(c, 2); c += b; \
 82 |   b=(b-a) ^ rol(a,22); a += c; \
 83 |   c=(c-b) ^ rol(b, 3); b += a; \
 84 |   a=(a-c) ^ rol(c,36); c += b; \
 85 |   b=(b-a) ^ rol(a,48); a += c; \
 86 |   c=(c-b) ^ rol(b,42); b += a; \
 87 | }while(0)
 88 | #define final64z(a,b,c)  \
 89 | do{ \
 90 |   c=(c^b) - rol(b,22); \
 91 |   a=(a^c) - rol(c, 3); \
 92 |   b=(b^a) - rol(a,58); \
 93 |   c=(c^b) - rol(b,48); \
 94 | }while(0)
 95 | 
 96 | #define mix128z(a,b,c)  \
 97 | do{ \
 98 |   a=(a-c) ^ rol(c, 79); c += b; \
 99 |   b=(b-a) ^ rol(a,124); a += c; \
100 |   c=(c-b) ^ rol(b, 60); b += a; \
101 |   a=(a-c) ^ rol(c, 74); c += b; \
102 |   b=(b-a) ^ rol(a,115); a += c; \
103 |   c=(c-b) ^ rol(b,101); b += a; \
104 | }while(0)
105 | #define final128z(a,b,c)  \
106 | do{ \
107 |   c=(c^b) - rol(b, 60); \
108 |   a=(a^c) - rol(c, 20); \
109 |   b=(b^a) - rol(a, 91); \
110 |   c=(c^b) - rol(b,106); \
111 | }while(0)
112 | 
113 | 
114 | #ifdef HUNDREDTWENTYEIGHTBIT_PLATFORM
115 | typedef unsigned int std::uint128_t __attribute__((mode(TI)));
116 | 
117 | class c128
118 | {
119 | public:
120 |     std::uint128_t value;
121 | public:
122 |     c128() : value()
123 |     {
124 |     }
125 |     c128(std::uint128_t v) : value(v) { }
126 |     c128(std::uint_fast64_t a, std::uint_fast64_t b)
127 |         : value(a)
128 |     {
129 |         value <<= 64;
130 |         value |= b;
131 |     }
132 |     c128(std::uint_least64_t a) : value(a)
133 |     {
134 |     }
135 |     c128(std::uint_least32_t a) : value(a)
136 |     {
137 |     }
138 |     #ifdef __SSE2__
139 |     c128(const __m128& b) : value(*(const std::uint128_t*)&b)
140 |     {
141 |     }
142 |     #endif
143 | 
144 |     c128& operator += (const c128& b) { value += b.value; return *this; }
145 |     c128& operator -= (const c128& b) { value -= b.value; return *this; }
146 |     c128& operator ^= (const c128& b)
147 |     {
148 |     #ifdef __SSE2__
149 |         *(__m128*)&value = _mm_xor_ps( *(const __m128*)&value, *(const __m128*)&b.value);
150 |     #else
151 |         value ^= b.value;
152 |     #endif
153 |         return *this;
154 |     }
155 |     c128& operator &= (const c128& b)
156 |     {
157 |     #ifdef __SSE2__
158 |         *(__m128*)&value = _mm_and_ps( *(const __m128*)&value, *(const __m128*)&b.value);
159 |     #else
160 |         value &= b.value;
161 |     #endif
162 |         return *this;
163 |     }
164 |     c128& operator |= (const c128& b)
165 |     {
166 |     #ifdef __SSE2__
167 |         *(__m128*)&value = _mm_or_ps( *(const __m128*)&value, *(const __m128*)&b.value);
168 |     #else
169 |         value |= b.value;
170 |     #endif
171 |         return *this;
172 |     }
173 |     c128& operator <<= (int nbits) { value <<= nbits; return *this; }
174 |     c128& operator >>= (int nbits) { value >>= nbits; return *this; }
175 | 
176 |     c128 operator+ (const c128& b) const { return value + b.value; }
177 |     c128 operator- (const c128& b) const { return value - b.value; }
178 |     c128 operator^ (const c128& b) const
179 |     {
180 |     #ifdef __SSE2__
181 |         return _mm_xor_ps( *(const __m128*)&value, *(const __m128*)&b.value);
182 |     #else
183 |         return value ^ b.value;
184 |     #endif
185 |     }
186 |     c128 operator& (const c128& b) const
187 |     {
188 |     #ifdef __SSE2__
189 |         return _mm_and_ps( *(const __m128*)&value, *(const __m128*)&b.value);
190 |     #else
191 |         return value & b.value;
192 |     #endif
193 |     }
194 |     c128 operator| (const c128& b) const
195 |     {
196 |     #ifdef __SSE2__
197 |         return _mm_or_ps( *(const __m128*)&value, *(const __m128*)&b.value);
198 |     #else
199 |         return value | b.value;
200 |     #endif
201 |     }
202 |     c128 operator<< (int nbits) const { return value << nbits; }
203 |     c128 operator>> (int nbits) const { return value >> nbits; }
204 |     c128 operator~ () const { return ~value; }
205 | };
206 | 
207 | c128 get_128(const void* p)
208 | {
209 |   #ifdef LITTLE_ENDIAN_AND_UNALIGNED_ACCESS_OK
210 |     return *(const std::uint128_t*)p;
211 |   #else
212 |     const unsigned char* data = (const unsigned char*)p;
213 |     c128 res( get_64(data) );
214 |     c128 res2( get_64(data + 8) );
215 |     res |= res2 << 64;
216 |     return res;
217 |   #endif
218 | }
219 | static inline c128 RnSubstitute(const void* p, unsigned bytes)
220 | {
221 |     const unsigned char* data = (const unsigned char*)p;
222 |     switch(bytes)
223 |     {
224 |         case 1: case 2: case 3: case 4:
225 |         case 5: case 6: case 7: case 8:
226 |             return get_n(p, bytes);
227 |         case 16: return get_128(p);
228 |     }
229 |     return c128(get_64(data)) | (c128(get_n(data+8, bytes-8)) << 64);
230 | }
231 | #define Rn RnSubstitute
232 | 
233 | #endif // 128bit
234 | 
235 | newhash_t newhash_calc(const unsigned char* buf, unsigned long size)
236 | {
237 |     return newhash_calc_upd(0, buf, size);
238 | }
/* Hash `size` bytes at `buf`, continuing from the seed / previous hash `c`.
 *
 * Parameters:
 *   c    - seed folded into the initial state (newhash_calc passes 0).
 *   buf  - start of the data to hash.
 *   size - number of bytes to hash; it is also mixed into the initial state.
 *
 * Returns the third accumulator after mixing, converted to newhash_t.
 *
 * One of three variants is compiled in, chosen by the platform macros.
 * All share the same shape: three accumulators a, b, c start from the same
 * value, the input is consumed three words at a time (48, 24 or 12 bytes
 * per round) with a mix after every round, and the remaining tail bytes are
 * added with get_n before a final mix. The final mix only runs when tail
 * bytes remain, so an input whose length is an exact multiple of the round
 * size (including 0) is returned without it.
 */
newhash_t newhash_calc_upd(newhash_t c, const unsigned char* buf, unsigned long size)
{
#ifdef HUNDREDTWENTYEIGHTBIT_PLATFORM
    /* The parameter is copied to c_cast and an inner scope is opened so that
     * the local accumulator named `c` below may shadow the parameter. */
    c128 c_cast = c; {
    unsigned long len = size;
    c128 a(UINT64_C(0x9e3779b97f4a7c15),UINT64_C(0xf39cc0605cedc834)); // 2^128 / ((1+sqrt(5))/2)
    a += c_cast + size;
    c128 b(a), c(a);
    /* Full rounds: three 16-byte words per iteration. */
    while(len >= 16*3)
    {
        a += (c128)get_128(buf+0);
        b += (c128)get_128(buf+16);
        c += (c128)get_128(buf+32);
        mix128z(a,b,c);
        buf += 48; len -= 48;
    }
    /*------------------------------------- handle the last 47 bytes */
    if(len > 0)
    {
        /* Whole words go in with get_128; the final partial word with get_n. */
        if(len >= 32)      { a += (c128)get_128(buf); b += (c128)get_128(buf+16); c += (c128)get_n(buf+32,len-32); }
        else if(len >= 16) { a += (c128)get_128(buf); b += (c128)get_n(buf+16, len-16); }
        else               { a += (c128)get_n(buf, len); }
        final128z(a,b,c);
    }
    /*-------------------------------------------- report the result */
    return c.value; /* Note: this returns just the lowest 32 bits of the hash */
   }
#elif defined(SIXTY_BIT_PLATFORM)
    /* Same parameter-shadowing arrangement as in the 128-bit variant. */
    c64 c_cast = (std::uint_fast64_t)c; {
    unsigned long len = size;
    /* NOTE(review): the last hex digit differs from the high word of the
     * 128-bit constant above (...7c13 here, ...7c15 there). Changing it would
     * change every hash value, so confirm intent before touching it. */
    c64 a(UINT64_C(0x9e3779b97f4a7c13)); // 2^64 / ((1+sqrt(5))/2)
    a += c_cast + c64(std::uint64_t(size));
    c64 b(a), c(a);
    /* Full rounds: three 8-byte words per iteration. */
    while(len >= 8*3)
    {
        a += (c64)get_64(buf+0);
        b += (c64)get_64(buf+8);
        c += (c64)get_64(buf+16);
        mix64z(a,b,c);
        buf += 24; len -= 24;
    }
    /*------------------------------------- handle the last 23 bytes */
    if(len > 0)
    {
        /* Whole words go in with get_64; the final partial word with get_n. */
        if(len >= 16)     { a += (c64)get_64(buf); b += (c64)get_64(buf+8); c += (c64)get_n(buf+16,len-16); }
        else if(len >= 8) { a += (c64)get_64(buf); b += (c64)get_n(buf+8, len-8); }
        else              { a += (c64)get_n(buf, len); }
        final64z(a,b,c);
    }
    /*-------------------------------------------- report the result */
  #ifdef USE_MMX
    /* The result is read out of the accumulator before MMX_clear() is called. */
    newhash_t result = get_32(&c.value); /* Note: this returns just the lowest 32 bits of the hash */
    MMX_clear();
    return result;
  #else
    return c.value; /* Note: this returns just the lowest 32 bits of the hash */
  #endif
   }
#else
    /* 32-bit variant: the parameter `c` itself serves as the third accumulator. */
    typedef std::uint_least32_t c32;
    c32 a,b;
    unsigned long len = size;
    c += size + UINT32_C(0x9e3779b9); // 2^32 / ((1+sqrt(5))/2)
    a = b = c;
    /* Full rounds: three 4-byte words per iteration. */
    while(len >= 4*3)
    {
        a += get_32(buf+0);
        b += get_32(buf+4);
        c += get_32(buf+8);
        mix32z(a,b,c);
        buf += 12; len -= 12;
    }
    /*------------------------------------- handle the last 11 bytes */
    if(len > 0)
    {
        /* Whole words go in with get_32; the final partial word with get_n. */
        if(len >= 8)      { a += (c32)get_32(buf); b += (c32)get_32(buf+4); c += (c32)get_n(buf+8,len-8); }
        else if(len >= 4) { a += (c32)get_32(buf); b += (c32)get_n(buf+4, len-4); }
        else              { a += (c32)get_n(buf, len); }
        final32z(a,b,c);
    }
    /*-------------------------------------------- report the result */
    return c;
#endif
}
323 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Bisqwit’s CRT filter
  2 | 
  3 | This is the CRT filter that I used in my ”What is That Editor” video,
  4 | at https://www.youtube.com/watch?v=ZMBQmhO8KqI.
  5 | 
  6 | It received some accolades, but I forgot to publish it.
  7 | Here it is finally.
  8 | 
  9 | ## To build
 10 | 
 11 | Run this command to build the filter:
 12 | 
 13 |     g++ -o crt-filter crt-filter.cc -fopenmp -Ofast -march=native -Wall -Wextra -std=c++17
 14 | 
 15 | ## Usage
 16 | 
 17 | The filter takes BGRA (RGB32) video (RAW!) from stdin,
 18 | and produces BGRA video (RAW!) into stdout.
 19 | 
 20 | The filter takes five commandline parameters:
 21 | 
 22 |     ./crt-filter <sourcewidth> <sourceheight> <outputwidth> <outputheight> <scanlines>
 23 | 
 24 | The sourcewidth and sourceheight denote the size of the original video.
 25 | The outputwidth and outputheight denote the size that you want to produce.
 26 | Generally speaking you want to produce as high quality as possible.
 27 | Vertical resolution is more important than horizontal resolution.
 28 | 
 29 | Scanlines is the number of scanlines you wish to simulate.
 30 | Generally that would be the same as the vertical resolution of the source video,
 31 | but that is not a requirement.
 32 | 
 33 | For best quality, the number of scanlines should be chosen
 34 | such that the intermediate height (see Constants)
 35 | is its integer multiple.
 36 | The intermediate width should ideally also be an integer
 37 | multiple of the source width. None of this is required though.
 38 | 
 39 | IMPORTANT: This filter does *not* decode or produce video formats like avi/mp4/mkv/whatever.
 40 | It only deals with raw video frames. You need to use an external program,
 41 | like ffmpeg, to perform the conversions.
 42 | See `make-reencoded.sh` and `reencode.sh` for a practical example.
 43 | 
 44 | ## Screenshots
 45 | 
 46 | (Click to enlarge the filtered pictures)
 47 | 
 48 | ![Original1](img/mpv-shot0001.jpg)
 49 | ![Filtered1](img/mpv-shot0002.jpg)
 50 | 
 51 | ![Original2](img/mpv-shot0003.jpg)
 52 | ![Filtered2](img/mpv-shot0004.jpg)
 53 | 
 54 | ## How it works
 55 | 
 56 | ### Constants
 57 | 
 58 | These constants specify the pixel grid (shadow mask) used by the simulated CRT monitor.
 59 | 
 60 | Currently they are hardcoded in the program,
 61 | but they are easy to find if you want to tweak the source code.
 62 | 
 63 | ![width](https://render.githubusercontent.com/render/math?math=\begin{align*}npix_{width}%26=640+%5C%5C+npix_{height}%26=400+%5C%5C+cellwidth_{red}%26=cellwidth_{green}=cellwidth_{blue}=2+%5C%5C+cellblank_{red}%26=cellblank_{green}=1+%5C%5C+cellblank_{blue}%26=2+%5C%5C+cellheight_{vert}%26=5+%5C%5C+cellblank_{vert}%26=1+%5C%5C+cellstagger%26=3+%5C%5C+intermediatewidth%26=npix_{width}\cdot%28cellwidth_{red}%2Bcellblank_{red}%2Bcellwidth_{green}%2Bcellblank_{green}%2Bcellwidth_{blue}%2Bcellblank_{blue}%29=6400+%5C%5C+intermediateheight%26=npix_{height}\cdot%28cellheight_{vert}%2Bcellblank_{vert}%29=2400\end{align*})
 64 | 
 65 | The cell widths and heights and staggering specify the geometry of the shadow
 66 | mask. See Filtering, below, for an example of what it looks like.
 67 | 
 68 | **NB: This page uses GitHub’s own LaTeX math renderer to show equations.
 69 | Unfortunately, this renderer produces transparent pictures with black text,
 70 | and has very poor usability on *dark mode.*
 71 | I am aware of this problem, but there is very little I can do about it,
 72 | until GitHub itself fixes it!
 73 | Sorry. Please view this site on desktop with non-dark mode.**
 74 | 
 75 | ### Hashing
 76 | 
 77 | The filter is designed for DOS videos, and specifically for sessions
 78 | involving the text mode. Because chances are that successive frames are
 79 | often identical, the filter calculates a hash of every source frame.
 80 | 
 81 | If the hash is found to be identical to some previous frame,
 82 | the filtered result of the previous frame is sent.
 83 | Otherwise, the new frame is processed, and saved into a cache with the hash of the input image.
 84 | 
 85 | Four previous unique frames are cached. This accounts e.g. for blinking cursors.
 86 | 
 87 | ### Converting into linear colors
 88 | 
 89 | First, the image is un-gammacorrected.
 90 | 
 91 | ![1/gamma](https://render.githubusercontent.com/render/math?math=value\leftarrow+value^{\gamma^{-1}}\text{ where }\gamma=2\text{ for every color channel }value\text{ in the picture})
 92 | 
 93 | ### Rescaling to scanline count
 94 | 
 95 | Then, the image is rescaled to the height of the given number of scanlines using a Lanczos filter.
 96 | Kernel size 2 was selected for the Lanczos filter.
 97 | 
 98 | If your source height is greater than the number of scanlines you specified, you will lose detail.
 99 | 
100 | ### Rescaling to intermediate size
101 | 
102 | Next, the image is rescaled to the intermediate width and height using a nearest-neighbor filter.
103 | 
104 | The scaling is performed first vertically and then horizontally.
105 | Before horizontal scaling, the brightness of each row of pixels
106 | is adjusted by a constant factor that is calculated by
107 | 
108 | ![formula](https://render.githubusercontent.com/render/math?math=e^{-0.5%28n-0.5%29^{2}c^{-2}}\text{ where }c=0.3\text{ and }n\text{ is+the+fractional+part+of+the+source+Y+coordinate.})
109 | 
110 | This formula produces a figure that sort of looks like a hill.
111 | It peaks in the middle and fades smoothly to the sides. 
112 | This hill represents the brightness of each scanline, as a function of distance from its beginning.
113 | Plotted in a graphing calculator, it looks like this.
114 | The c constant controls how steep that hill is. A small value like 0.1
115 | produces a very narrow hill with very sharp and narrow scanlines,
116 | and bigger values produce flatter hills and less pronounced scanlines.
117 | 0.3 looked like a good compromise.
118 | 
119 | This simulates the electron gun passing through in horizontal lines called scanlines,
120 | as it renders the picture line by line.
121 | 
122 | ![Gaussian](img/weights.png)
123 | ![Copper bars](img/coppers.png)
124 | 
125 | You can download the source code of the right-hand-side illustration in
126 | [img/coppers.php](img/coppers.php).
127 | 
128 | ### Filtering
129 | 
130 | Each color channel and each pixel of the picture — now intermediate width and height — is multiplied by a mask
131 | that is either one or zero, depending on whether that pixel belongs inside a
132 | cell of that color according to the hardcoded cell geometry.
133 | 
134 | The mask is a repeating pattern that essentially looks like this:
135 | 
136 | ![Mask](img/mask.png)
137 | 
138 | Red pixels denote 1 for red channel,
139 | green pixels denote 1 for green channel,
140 | blue pixels denote 1 for blue channel,
141 | and everything else for everyone is 0.
142 | 
143 | This simulates the shadow mask in front of the cathode ray tube.
144 | 
145 | The mask is generated procedurally from the cell parameters
146 | (see Constants).
147 | 
148 | ### Rescaling to target size
149 | 
150 | Then the image is rescaled to the target picture width and target picture height using a Lanczos filter.
151 | The scaling is performed first vertically and then horizontally.
152 | 
153 | A Lanczos filter was chosen because it is generally deemed the
154 | best compromise between blurring and fringing
155 | among several simple filters
156 | ([Wikipedia](https://en.wikipedia.org/wiki/Lanczos_resampling)).
157 | I have been using it for years for interpolating all sorts of signals
158 | from pictures to sounds.
159 | 
160 | ### Bloom
161 | 
162 | First, the brightness of each pixel is normalized so that the sum of masks
163 | and scanline magnitudes does not change the overall brightness of the picture.
164 | 
165 | Then, a copy is created of the picture.
166 | This copy is gamma-corrected and amplified with a significant factor, to promote bloom.
167 | 
168 | ![gamma](https://render.githubusercontent.com/render/math?math=value_{copy}=\frac{600}{255}value^\gamma\text{ for every color channel }value\text{ in the picture})
169 | 
170 | This copy is 2D-gaussian-blurred using a three-step box filter,
171 | where the blur width is set as output-width / 640.
172 | The blur algorithm is very fast and works in linear time,
173 | adapted from http://blog.ivank.net/fastest-gaussian-blur.html .
174 | 
175 | Then, the actual picture is gamma-corrected, this time without a brightening factor.
176 | 
177 | ![gamma](https://render.githubusercontent.com/render/math?math=value\leftarrow+value^\gamma\text{ for every color channel }value\text{ in the picture})
178 | 
179 | Then, the blurry copy is merged into the picture,
180 | by literally adding its pixel values into the target pixel values.
181 | 
182 | ![gamma](https://render.githubusercontent.com/render/math?math=value\leftarrow+value%2Bvalue_{copy}\text{ for every color channel }value\text{ in the picture})
183 | 
184 | Because of the combination of amplification and blurring,
185 | if there are isolated bright pixels in the scene,
186 | their power is spread out over a large area
187 | and thus does not contribute much to the final picture,
188 | but if there is a large cluster of bright pixels close by,
189 | they remain bright even after blurring,
190 | and will influence the final picture a lot.
191 | This produces a bloom effect.
192 | 
193 | ### Clamping
194 | 
195 | Finally, before quantizing the floating-point colors and sending the frame to output,
196 | each pixel is clamped to the target range using a desaturation formula.
197 | 
198 | #### The desaturation formula
199 | 
200 | The desaturation formula first calculates a luminosity value from the input R,G,B
201 | components using ITU coefficients (see [sRGB on Wikipedia](https://en.wikipedia.org/wiki/SRGB)):
202 | 
203 | ![luma calculation](https://render.githubusercontent.com/render/math?math=luma=0.2126\cdot+value_{red}%2B0.7152\cdot+value_{green}%2B0.0722\cdot+value_{blue})
204 | 
205 | * If the luminosity is less than 0, black is returned.
206 | * If the luminosity is more than 1, white is returned.
207 | * Otherwise, a saturation value is initialized as 1, and then adjusted by inspecting each color channel value separately:
208 | 
209 | ![adjust](https://render.githubusercontent.com/render/math?math=saturation\leftarrow\begin{cases}\min%28saturation,\frac{luma-1}{luma-value_{channel}}%29,+%26+\text{if }value_{channel}\gt+1%5C%5C%0D%0A\min%28saturation,\frac{luma}{luma-value_{channel}}%29,+%26+\text{if }value_{channel}\lt+0%5C%5Csaturation%26\text{otherwise}\end{cases})
210 | 
211 | After analyzing all color channels,
212 | if the saturation still remains as 1, the input color is returned verbatim.
213 | Otherwise each color channel is readjusted as:
214 | 
215 | ![adjust](https://render.githubusercontent.com/render/math?math=value_{channel}\prime=\min%281,\max%280,%28value_{channel}-luma%29\cdot+saturation%2Bluma%29%29)
216 | 
217 | The readjusted color channel values are then joined together to form the returned color.
218 | 
219 | The advantage of desaturation-aware clamping over naïve clamping
220 | is that it does a much better job at preserving energy.
221 | To illustrate, here is a picture with two color ramps.
222 | The brightness of the color ramp increases linearly along the Y axis.
223 | That is, top is darkest (0) and bottom is brightest (1, i.e. full).
224 | Every pixel on each scanline should be approximately the same brightness.
225 | 
226 | The brightness scaling in this illustration is done by simply multiplying
227 | the RGB color with the brightness value. At high brightness values, this
228 | produces colors that are impossible to show on the screen.
229 | 
230 | ![Rainbow illustration](img/rainbow.png)
231 | 
232 | In the leftside picture with naïve clamping (i.e. `if x>255, then set x to 255`),
233 | you can see that the further
234 | down you go in the picture, the more different the color brightnesses are.
235 | The blue stripe is much, much darker than anything else in the picture,
236 | even though it is fully saturated and as bright as your screen can make it.*
237 | 
238 | However, on the right side, with the desaturation aware clamping formula,
239 | every scanline remains at perfectly even brightness, even
240 | when you exceed the maximum possible brightness of the screen colors.
241 | 
242 | In the desaturation-aware algorithm, colors that are impossible 
243 | to show on screen due to excess brightness are approximated with 
244 | desaturated versions, that preserve the brightness perception 
245 | at the cost of color saturation.
246 | 
247 | (Note: “Perfectly” was a hyperbole.
248 | The colors are not quite the same brightness,
249 | because of differences in screen calibration and because of
250 | differences in human individual eyes. This is more of an illustration.)
251 | You can download the source code of this illustration in
252 | [img/rainbow.php](img/rainbow.php).
253 | 
254 | Note that this does *not* mean that all colors become more washed out.
255 | You may come to this mistaken conclusion, because this illustration is
256 | fixed for perceptual brightness. The only colors that will be
257 | desaturated are those that have out-of-range values
258 | (i.e. individual channel values are greater than 255 or smaller than 0);
259 | marked with crosshatch pattern in the below picture.
260 | Everything else is kept unchanged.
261 | 
262 | ![Rainbow with crosshatch](img/rainbow2.png)
263 | 
264 | *) Note that \#0000FF is not blue at brightness 1. While it is maximally bright
265 | fully saturated blue, its brightness is only about 10 % of the brightness of
266 | \#00FF00, maximally bright fully saturated green, and only about 7 % of the
267 | brightness of \#FFFFFF, a maximally bright white pixel (which does have
268 | brightness level of 1).
269 | 
270 | This is trivial to
271 | prove: \#FFFFFF is a color where you light up all the LEDs that comprise
272 | color \#0000FF, but you also light up all the LEDs that comprise \#FF0000
273 | and all the LEDs that comprise \#00FF00. Because there are three times as
274 | many LEDs shining as when just \#0000FF is shown, the brightness of \#FFFFFF
275 | cannot be the same, but has to be much higher. Therefore, \#0000FF cannot
276 | have brightness level of 1.
277 | 
278 | It is also worth noting that brightness is not the same as radiant energy.
279 | This has nothing to do with energy.
280 | The human eye is simply differently sensitive to different wavelengths
281 | of visible light; least of them to blue (see
282 | [V(λ)](https://en.wikipedia.org/wiki/Luminous_efficiency_function)).
283 | Brightness is a perception phenomenon.
284 | 


--------------------------------------------------------------------------------
/newhash/simd.hh:
--------------------------------------------------------------------------------
  1 | #if defined(__MMX__) && !(defined(__x86_64) || defined(_M_X64))
  2 | #define USE_MMX
  3 | #endif
  4 | #if defined(__SSE__)
  5 | #define USE_SSE
  6 | #endif
  7 | 
  8 | /* SIMD interface (MMX) written by Bisqwit
  9 |  * Copyright (C) 1992,2011 Joel Yliluoma (http://iki.fi/bisqwit/)
 10 |  */
 11 | 
 12 | #ifdef __3dNOW__
 13 | # include <mm3dnow.h> /* Note: not available on ICC */
 14 | #elif defined(__MMX__)
 15 | # include <mmintrin.h>
 16 | #endif
 17 | #ifdef __SSE__
 18 | #include <xmmintrin.h>
 19 |  #ifdef __ICC
 20 |  typedef __m128 __v4sf;
 21 |  #endif
 22 | #endif
 23 | 
 24 | struct c64_common
 25 | {
 26 |     static signed char clamp_s8(int_fast64_t v)
 27 |         { return v<-128 ? -128 : (v > 127 ? 127 : v); }
 28 |     static unsigned char clamp_u8(int_fast64_t v)
 29 |         { return v<0 ? 0 : (v > 255 ? 255 : v); }
 30 |     static short clamp_s16(int_fast64_t v)
 31 |         { return v<-32768 ? -32768 : (v > 32767 ? 32767 : v); }
 32 | 
 33 |     static inline std::uint_fast64_t expand32_8(std::uint_fast32_t a)
 34 |     {
 35 |         // 0000abcd -> 0a0b0c0d
 36 |         typedef std::uint_fast64_t v;
 37 |         return (a&0xFFU)
 38 |             | ((a&0xFF00U)<<8)    // base: 8+8 = 16
 39 |             | ((v)(a&0xFF0000U)<<16) // base: 16+16 = 32
 40 |             | ((v)(a&0xFF000000UL)<<24); // base: 24+24 = 48
 41 |     }
 42 |     static inline std::uint_fast64_t expand32_16(std::uint_fast32_t a)
 43 |     {
 44 |         // 0000abcd -> 00ab00cd
 45 |         typedef std::uint_fast64_t v;
 46 |         return (a&0xFFFFU)
 47 |          | ((v)(a&0xFFFF0000UL)<<16);   // base: 16+16 = 32
 48 |     }
 49 | };
 50 | 
 51 | #ifdef __MMX__
 52 | /* 64-bit integers that use MMX / 3Dnow operations where relevant */
 53 | struct c64_MMX: public c64_common
 54 | {
 55 |     typedef c64_MMX c64;
 56 |     typedef __m64 valuetype;
 57 | 
 58 |     valuetype value;
 59 | 
 60 |     inline c64_MMX() : value() { }
 61 |     inline c64_MMX(__m64 v) : value(v) { }
 62 |     inline c64_MMX(const std::uint64_t& v) : value( *(const __m64*)& v) { }
 63 |     inline c64_MMX(int v) : value(_m_from_int(v)) { }
 64 |     inline c64_MMX(short a,short b,short c, short d)
 65 |         : value(_mm_setr_pi16(a,b,c,d)) { }
 66 | 
 67 |     inline c64 operator<< (int b) const { if(b < 0) return *this >> -b; return shl64(b); }
 68 |     inline c64 operator>> (int b) const { if(b < 0) return *this << -b; return shr64(b); }
 69 |     c64& operator<<= (int n) { return *this = shl64(n); }
 70 |     c64& operator>>= (int n) { return *this = shr64(n); }
 71 | 
 72 |     operator bool() const
 73 |     {
 74 |         union m64union { struct { std::uint_least32_t a, b; }; __m64 c; };
 75 |         const m64union& tmp = (const m64union&) value;
 76 |         return !(tmp.a ^ tmp.b);
 77 |     } // TODO: verify
 78 | 
 79 |     c64 conv_s16_u8() const { return conv_s16_u8(*this); }
 80 |     c64 conv_s16_s8() const { return conv_s16_s8(*this); }
 81 | 
 82 |     void Get(const unsigned char* p)      { value = *(const __m64*)p; }
 83 |     void Put(      unsigned char* p)const { *(__m64*)p =  value; }
 84 | 
 85 |     void Init16(short a,short b,short c, short d)
 86 |         { value = _mm_setr_pi16(a,b,c,d); }
 87 |     void Init16(short a)
 88 |         { value = _mm_set1_pi16(a); }
 89 | 
 90 |     void GetD(const unsigned char* p)      { value = *(const __m64*)p; }
 91 | 
 92 |     template<int n>
 93 |     short Extract16() const { return ((const short*)&value)[n]; }
 94 |     template<int n>
 95 |     int Extract32() const { return ((const int*)&value)[n]; }
 96 | 
 97 |     short Extract88_from_1616lo() const
 98 |     {
 99 |         const unsigned char* data = (const unsigned char*)&value;
100 |         // bytes:  76543210
101 |         // shorts: 33221100
102 |         // take:        H L
103 |         return data[0] | *(const short*)(data+1);
104 |         //return data[0] | ((*(const unsigned int*)data) >> 8);
105 |     }
106 |     short Extract88_from_1616hi() const
107 |     {
108 |         const unsigned char* data = 4+(const unsigned char*)&value;
109 |         // bytes:  76543210
110 |         // shorts: 33221100
111 |         // take:    H L
112 |         return data[0] | *(const short*)(data+1);
113 |         //return data[0] | ((*(const unsigned int*)data) >> 8);
114 |     }
115 | 
116 | 
117 |     c64& operator&= (const c64& b) { value=_mm_and_si64(value,b.value); return *this; }
118 |     c64& operator|= (const c64& b) { value=_mm_or_si64(value,b.value); return *this; }
119 |     c64& operator^= (const c64& b) { value=_mm_xor_si64(value,b.value); return *this; }
120 |     c64& operator+= (const c64& b) { return *this = *this + b; }
121 |     c64& operator-= (const c64& b) { return *this = *this - b; }
122 | 
123 |     c64 operator~ () const {
124 |         static const std::uint_least64_t negpat = ~(std::uint_least64_t)0;
125 |         return c64(_mm_xor_si64(value, *(const __m64*)&negpat));
126 |     }
127 | 
128 |             /* psllqi: p = packed
129 |                        s = shift
130 |                        r = right, l = left
131 |                        l = shift in zero, a = shift in sign bit
132 |                        q = 64-bit, d = 32-bit, w = 16-bit
133 |                       [i = immed amount]
134 |              */
135 |     c64 operator& (const c64& b) const { return c64(_mm_and_si64(value,b.value)); }
136 |     c64 operator| (const c64& b) const { return c64(_mm_or_si64(value,b.value)); }
137 |     c64 operator^ (const c64& b) const { return c64(_mm_xor_si64(value,b.value)); }
138 | 
139 |     c64 operator- (const c64& b) const
140 |     {
141 |         #ifdef __SSE2__
142 |         return _mm_sub_si64(value, b.value);
143 |         #else
144 |         return (const std::uint64_t&)value - (const std::uint64_t&)b.value;
145 |         #endif
146 |     }
147 |     c64 operator+ (const c64& b) const
148 |     {
149 |         #ifdef __SSE2__
150 |         return _mm_add_si64(value, b.value);
151 |         #else
152 |         return (const std::uint64_t&)value + (const std::uint64_t&)b.value;
153 |         #endif
154 |     }
155 | 
156 | 
157 |     c64 shl64(int b) const { return _mm_slli_si64(value, b); }
158 |     c64 shr64(int b) const { return _mm_srli_si64(value, b); }
159 |     c64 shl16(int b) const { return _mm_slli_pi16(value, b); }
160 |     c64 shr16(int b) const { return _mm_srli_pi16(value, b); }
161 |     c64 sar32(int b) const { return _mm_srai_pi32(value, b); }
162 |     c64 sar16(int b) const { return _mm_srai_pi16(value, b); }
163 |     c64 add32(const c64& b) const { return _mm_add_pi32(value, b.value); }
164 |     c64 add16(const c64& b) const { return _mm_add_pi16(value, b.value); }
165 |     c64 sub32(const c64& b) const { return _mm_sub_pi32(value, b.value); }
166 |     c64 sub16(const c64& b) const { return _mm_sub_pi16(value, b.value); }
167 |     c64 mul16(const c64& b) const   { return _mm_mullo_pi16(value, b.value); }
168 |     c64 mul16hi(const c64& b) const { return _mm_mulhi_pi16(value, b.value); }
169 |     //c64 mul32(const c64& b) const { return _mm_mullo_pi32(value, b.value); }
170 |     c64 add8(const c64& b) const { return _mm_add_pi8(value, b.value); }
171 |     c64 sub8(const c64& b) const { return _mm_sub_pi8(value, b.value); }
172 | 
173 |     c64 unpacklbw(const c64& b) const { return _mm_unpacklo_pi8(b.value,value); }
174 |     c64 unpacklwd(const c64& b) const { return _mm_unpacklo_pi16(b.value,value); }
175 |     c64 unpackhbw(const c64& b) const { return _mm_unpackhi_pi8(b.value,value); }
176 |     c64 unpackhwd(const c64& b) const { return _mm_unpackhi_pi16(b.value,value); }
177 |     c64 unpackldq(const c64& b) const { return _mm_unpacklo_pi32(b.value,value); }
178 |     c64 unpackldq() const { return _mm_unpacklo_pi32(value,value); }
179 | 
180 |     c64 operator& (const std::uint64_t& v) { return c64(_mm_and_si64(value, *(const __m64*)& v)); }
181 | 
182 |     c64 conv_s32_s16(const c64& b) const { return _mm_packs_pi32(value, b.value); }
183 |     c64 conv_s16_u8(const c64& b) const { return _mm_packs_pu16(value, b.value); }
184 |     c64 conv_s16_s8(const c64& b) const { return _mm_packs_pi16(value, b.value); }
185 | };
186 | #endif
187 | 
188 | struct c64_nonMMX: public c64_common
189 | {
190 |     typedef c64_nonMMX c64;
191 |     typedef std::uint_least64_t valuetype;
192 |     valuetype value;
193 | 
194 |     inline c64_nonMMX() : value() { }
195 |     inline c64_nonMMX(std::uint64_t v) : value(v) { }
196 |     inline c64_nonMMX(int v) : value(v) { }
197 |     inline c64_nonMMX(short a,short b,short c, short d) : value()
198 |         { Init16(a,b,c,d); }
199 | 
200 |     c64 operator<< (int b) const { if(b < 0) return *this >> -b; return shl64(b); }
201 |     c64 operator>> (int b) const { if(b < 0) return *this << -b; return shr64(b); }
202 |     c64& operator<<= (int n) { return *this = shl64(n); }
203 |     c64& operator>>= (int n) { return *this = shr64(n); }
204 | 
205 |     operator bool() const { return value; }
206 | 
207 |     c64 conv_s16_u8() const { return conv_s16_u8(*this); }
208 |     c64 conv_s16_s8() const { return conv_s16_s8(*this); }
209 | 
210 |     void Init16(short a,short b,short c, short d)
211 |         { std::uint_fast64_t aa = (unsigned short)a,
212 |                         bb = (unsigned short)b,
213 |                         cc = (unsigned short)c,
214 |                         dd = (unsigned short)d;
215 |           value = aa | (bb << 16) | (cc << 32) | (dd << 48); }
216 |     void Init16(short a)
217 |         { Init16(a,a,a,a); }
218 |     void Init8(unsigned char a,unsigned char b,unsigned char c,unsigned char d,
219 |                unsigned char e,unsigned char f,unsigned char g,unsigned char h)
220 |     {
221 |         value = ((std::uint_fast64_t)(a | (b << 8) | (c << 16) | (d << 24)))
222 |               | (((std::uint_fast64_t)e) << 32)
223 |               | (((std::uint_fast64_t)f) << 40)
224 |               | (((std::uint_fast64_t)g) << 48)
225 |               | (((std::uint_fast64_t)h) << 56);
226 |     }
227 | 
228 |     void Get(const unsigned char* p)      { value = *(const std::uint_least64_t*)p; }
229 |     void Put(      unsigned char* p)const { *(std::uint_least64_t*)p =  value; }
230 | 
231 |     c64& operator&= (const c64& b) { value&=b.value; return *this; }
232 |     c64& operator|= (const c64& b) { value|=b.value; return *this; }
233 |     c64& operator^= (const c64& b) { value^=b.value; return *this; }
234 |     c64& operator+= (const c64& b) { value+=b.value; return *this; }
235 |     c64& operator-= (const c64& b) { value-=b.value; return *this; }
236 |     c64 operator& (const c64& b) const { return value & b.value; }
237 |     c64 operator| (const c64& b) const { return value | b.value; }
238 |     c64 operator^ (const c64& b) const { return value ^ b.value; }
239 |     c64 operator- (const c64& b) const { return value - b.value; }
240 |     c64 operator+ (const c64& b) const { return value + b.value; }
241 | 
242 |     c64 operator& (std::uint_fast64_t b) const { return value & b; }
243 | 
244 |     c64 operator~ () const { return ~value; }
245 | 
246 |     #define usimdsim(type, count, op) \
247 |         type* p = (type*)&res.value; \
248 |         for(int n=0; n<count; ++n) p[n] = (p[n] op b)
249 | 
250 |     #define simdsim(type, count, op) \
251 |         type* p = (type*)&res.value; \
252 |         const type* o = (const type*)&b.value; \
253 |         for(int n=0; n<count; ++n) p[n] = (p[n] op o[n])
254 | 
255 |     c64 shl64(int b) const { return value << b; }
256 |     c64 shr64(int b) const { return value >> b; }
257 |     c64 shl16(int b) const { c64 res = *this; usimdsim(short, 2, <<); return res; }
258 |     c64 shr16(int b) const { c64 res = *this; usimdsim(unsigned short, 2, >>); return res; }
259 |     c64 sar32(int b) const { c64 res = *this; usimdsim(int, 2, >>); return res; }
260 |     c64 sar16(int b) const { c64 res = *this; usimdsim(short, 2, >>); return res; }
261 | 
262 |     c64 add16(const c64& b) const { c64 res = *this; simdsim(short, 4, +); return res; }
263 |     c64 sub16(const c64& b) const { c64 res = *this; simdsim(short, 4, -); return res; }
264 |     c64 add32(const c64& b) const { c64 res = *this; simdsim(int,   2, +); return res; }
265 |     c64 sub32(const c64& b) const { c64 res = *this; simdsim(int,   2, -); return res; }
266 |     c64 mul16(const c64& b) const { c64 res = *this; simdsim(short, 4, *); return res; }
267 |     c64 mul16hi(const c64& b) const { c64 res = *this; simdsim(short, 4, *) >> 16; return res; }
268 |     c64 add8(const c64& b) const { c64 res = *this; simdsim(unsigned char, 8, +); return res; }
269 |     c64 sub8(const c64& b) const { c64 res = *this; simdsim(unsigned char, 8, -); return res; }
270 | 
271 |     #undef simdsim
272 |     #undef usimdsim
273 | 
274 |     c64 conv_s32_s16(const c64& b) const
275 |     {
276 |         c64 res; res.
277 |         Init16(clamp_s16(value & 0xFFFFFFFFU),
278 |                clamp_s16(value >> 32),
279 |                clamp_s16(b.value & 0xFFFFFFFFU),
280 |                clamp_s16(b.value >> 32));
281 |         return res;
282 |     }
283 |     c64 conv_s16_u8(const c64& b) const
284 |     {
285 |         c64 res; res.
286 |         Init8(clamp_u8(value & 0xFFFF),
287 |               clamp_u8((value >> 16) & 0xFFFF),
288 |               clamp_u8((value >> 32) & 0xFFFF),
289 |               clamp_u8((value >> 48) & 0xFFFF),
290 |               clamp_u8(b.value & 0xFFFF),
291 |               clamp_u8((b.value >> 16) & 0xFFFF),
292 |               clamp_u8((b.value >> 32) & 0xFFFF),
293 |               clamp_u8((b.value >> 48) & 0xFFFF));
294 |         return res;
295 |     }
296 |     c64 conv_s16_s8(const c64& b) const
297 |     {
298 |         c64 res; res.
299 |         Init8(clamp_s8(value & 0xFFFF),
300 |               clamp_s8((value >> 16) & 0xFFFF),
301 |               clamp_s8((value >> 32) & 0xFFFF),
302 |               clamp_s8((value >> 48) & 0xFFFF),
303 |               clamp_s8(b.value & 0xFFFF),
304 |               clamp_s8((b.value >> 16) & 0xFFFF),
305 |               clamp_s8((b.value >> 32) & 0xFFFF),
306 |               clamp_s8((b.value >> 48) & 0xFFFF));
307 |         return res;
308 |     }
309 | 
310 |     /* TODO: Verify that these are correct (though they should never be used anyway) */
311 |     c64 unpacklbw(const c64& p) const
312 |     {
313 |     #if defined(__MMX__) && !defined(__ICC)
314 |         /* ICC says [error: type of cast must be integral or enum]
315 |          * on the return value cast,
316 |          * so we cannot use this code on ICC. Fine for GCC. */
317 |         return (std::uint_least64_t)_m_punpcklbw(*(const __m64*)&p.value, *(const __m64*)&value);
318 |     #else
319 |         std::uint_fast64_t a=value, b=p.value;
320 |         return expand32_8(a) | (expand32_8(b) << 8);
321 |     #endif
322 |     }
323 |     c64 unpackhbw(const c64& p) const
324 |     {
325 |     #if defined(__MMX__) && !defined(__ICC)
326 |         return (std::uint_least64_t)_m_punpckhbw(*(const __m64*)&p.value, *(const __m64*)&value);
327 |     #else
328 |         std::uint_fast64_t a=value, b=p.value;
329 |         return expand32_8(a>>32) | (expand32_8(b>>32) << 8);
330 |     #endif
331 |     }
332 |     c64 unpacklwd(const c64& p) const
333 |     {
334 |     #if defined(__MMX__) && !defined(__ICC)
335 |         return (std::uint_least64_t)_m_punpcklwd(*(const __m64*)&p.value, *(const __m64*)&value);
336 |     #else
337 |         std::uint_fast64_t a=value, b=p.value;
338 |         return expand32_16(a) | (expand32_16(b) << 16);
339 |     #endif
340 |     }
341 |     c64 unpackhwd(const c64& p) const
342 |     {
343 |     #if defined(__MMX__) && !defined(__ICC)
344 |         return (std::uint_least64_t)_m_punpckhwd(*(const __m64*)&p.value, *(const __m64*)&value);
345 |     #else
346 |         std::uint_fast64_t a=value, b=p.value;
347 |         return expand32_16(a>>32) | (expand32_16(b>>32) << 16);
348 |     #endif
349 |     }
350 |     c64 unpackldq() const { return unpackldq(*this); }
351 |     c64 unpackldq(const c64& p) const
352 |     {
353 |     #if defined(__MMX__) && !defined(__ICC)
354 |         return (std::uint_least64_t)_m_punpckldq(*(const __m64*)&p.value, *(const __m64*)&value);
355 |     #else
356 |         return value | (p.value << 32);
357 |     #endif
358 |     }
359 | };
360 | 
361 | #ifdef USE_MMX
362 | typedef c64_MMX c64;
363 | #else
364 | typedef c64_nonMMX c64;
365 | #endif
366 | 
367 | static inline void MMX_clear()
368 | {
369 |     #ifdef __3dNOW__
370 |     _m_femms(); /* Note: not available on ICC or Valgrind */
371 |     //_mm_empty();
372 |     #elif defined(__MMX__)
373 |     _mm_empty();
374 |     #endif
375 | }
376 | 


--------------------------------------------------------------------------------
/crt-filter.cc:
--------------------------------------------------------------------------------
  1 | #include <cmath>
  2 | #include <cstdint>
  3 | #include <cstdlib>
  4 | #include <cstdio>
  5 | #include <vector>
  6 | #include <cerrno>
  7 | #include <unistd.h>
  8 | #include "blur.hh"
  9 | 
 10 | #define likely(x)       __builtin_expect(!!(x), 1)
 11 | #define unlikely(x)     __builtin_expect(!!(x), 0)
 12 | 
 13 | #include "newhash/newhash.cc"
 14 | 
 15 | /* Magnitude of scaled scanline, where n = 0..1 = position between scanlines */
 16 | inline constexpr float ScanlineMagnitude(float n) { float c = 0.3f; return std::exp(-(n-0.5f)*(n-0.5f)/(2.f*c*c)); }
 17 | 
 18 | constexpr unsigned NumHorizPixels   = 640;
 19 | constexpr unsigned CellWidth0 = 2, CellBlank0 = 1; // R
 20 | constexpr unsigned CellWidth1 = 2, CellBlank1 = 1; // G
 21 | constexpr unsigned CellWidth2 = 2, CellBlank2 = 2; // B
 22 | constexpr unsigned TotalHorizRes = NumHorizPixels * (CellWidth0 + CellBlank0 + CellWidth1 + CellBlank1 + CellWidth2 + CellBlank2);
 23 | constexpr unsigned Cell0Start = 0,                   Cell0End = Cell0Start + CellWidth0;
 24 | constexpr unsigned Cell1Start = Cell0End+CellBlank0, Cell1End = Cell1Start + CellWidth1;
 25 | constexpr unsigned Cell2Start = Cell1End+CellBlank1, Cell2End = Cell2Start + CellWidth2;
 26 | 
 27 | constexpr unsigned NumVertPixels    = 400;
 28 | constexpr unsigned CellHeight0 = 5; // Height of RGB triplet
 29 | constexpr unsigned CellHeight1 = 1; // Blank after RGB triplet
 30 | constexpr unsigned CellStagger = 3; // Offset of successive columns
 31 | constexpr unsigned TotalVertRes = NumVertPixels * (CellHeight0 + CellHeight1);
 32 | 
 33 | template<unsigned Start,unsigned End>
 34 | static inline float GetMask(unsigned x, unsigned y)
 35 | {
 36 |     constexpr unsigned cellwidth = CellWidth0 + CellBlank0 + CellWidth1 + CellBlank1 + CellWidth2 + CellBlank2;
 37 |     unsigned hpix = x / cellwidth, hmod = x % cellwidth;
 38 | 
 39 |     constexpr unsigned cellheight = CellHeight0 + CellHeight1;
 40 |     unsigned vmod = (y + CellStagger * hpix) % cellheight;
 41 |     return (vmod < CellHeight0) & (hmod >= Start) & (hmod < End);
 42 | }
 43 | 
 44 | template<unsigned Shift>
 45 | static void ConvertPlane(unsigned num, const std::uint32_t* pixels, float* output)
 46 | {
 47 |     #pragma omp simd
 48 |     for(unsigned n=0; n<num; ++n)
 49 |     {
 50 |         unsigned p = pixels[n];
 51 |         p >>= Shift;
 52 |         p &= 0xFF;
 53 |         output[n] = p;// * (1.f / 255.f);
 54 |     }
 55 | }
 56 | 
 57 | 
 58 | 
 59 | template<int Radius, bool check>
 60 | static inline float Lanczos_pi(float x_pi)
 61 | {
 62 |     if(unlikely(x_pi == (float)0.0)) return (float)1.0;
 63 |     if(check)
 64 |     {
 65 |         if (x_pi <= (float)(-Radius*M_PI)
 66 |          || x_pi >= (float)( Radius*M_PI)) return (float)0.0;
 67 |     }
 68 | 
 69 |     float x_pi_div_Radius = x_pi / Radius;
 70 | 
 71 |     //float a = sin(x_pi)            / x_pi;
 72 |     //float b = sin(x_pi_div_Radius) / x_pi_div_Radius;
 73 |     //return a * b;
 74 | 
 75 |     return std::sin(x_pi) * std::sin(x_pi_div_Radius) / (x_pi * x_pi_div_Radius);
 76 | }
 77 | template<int Radius>
 78 | static inline float Lanczos(float x)
 79 | {
 80 |     return Lanczos_pi<Radius,true>(x * (float)M_PI);
 81 | }
 82 | template<typename SrcTab, typename DestTab>
 83 | class Lanczos2DBase
 84 | {
 85 |     const SrcTab& src; DestTab& tgt;
 86 | 
 87 |     // Note: In this vocabulary,
 88 |     //  y denotes outer loop and
 89 |     //  x denotes inner loop, but
 90 |     // it could be vice versa.
 91 |     int xinc_src, yinc_src;
 92 |     int xinc_tgt, yinc_tgt;
 93 |     int ylimit;
 94 | public:
 95 |     Lanczos2DBase(
 96 |         const SrcTab& src_, DestTab& tgt_,
 97 |         int sxi,int syi, int txi,int tyi, int ylim)
 98 |         : src(src_), tgt(tgt_),
 99 |           xinc_src(sxi), yinc_src(syi),
100 |           xinc_tgt(txi), yinc_tgt(tyi),
101 |           ylimit(ylim)
102 |         { }
103 | 
104 |     void ScaleOne
105 |         (int srcpos, int tgtpos, int nmax,
106 |          const float contrib[], float density_rev) const
107 |     {
108 |         float res = 0.0;
109 |         int srctemp = srcpos;
110 | 
111 |         //#pragma omp parallel for reduction(+:r,g,b)
112 |         #pragma omp simd reduction(+:res)
113 |         for(int n=0; n<nmax; ++n)
114 |         {
115 |             res     += contrib[n] * src[srctemp];
116 |             srctemp += xinc_src;   // source x increment
117 |         }
118 | 
119 |         // Multiplication is faster than division, so we use the reciprocal.
120 |         res *= density_rev;
121 | 
122 |         tgt[tgtpos] = res;
123 |     }
124 | 
125 |     void StripeLoop(int tx, int sx, int nmax, const float contrib[], float density) const
126 |     {
127 |         int srcpos = sx * xinc_src; // source x pos at y = 0
128 |         int tgtpos = tx * xinc_tgt; // target x pos at y = 0
129 | 
130 |         /*
131 |         fprintf(stderr, "StripeLoop sx=%d, tx=%d, srcpos=%d, tgtpos=%d, srcsize=%d, tgtsize=%d, ylimit=%d\n",
132 |             sx,tx, srcpos, tgtpos,
133 |             (int)src.size(), (int)tgt.size(), ylimit);*/
134 | 
135 |         const float density_rev = (density == 0.0f || density == 1.0f) ? 1.0f : (1.0f / density);
136 | 
137 |         for(int y=ylimit; y-->0; )
138 |         {
139 |             /*fprintf(stderr, "- within: srcpos=%d, tgtpos=%d, y=%d\n", srcpos,tgtpos, y);*/
140 |             ScaleOne(srcpos, tgtpos, nmax, contrib, density_rev);
141 | 
142 |             srcpos += yinc_src;  // source y increment
143 |             tgtpos += yinc_tgt;  // target y increment
144 |         }
145 |     }
146 | };
147 | 
148 | template<typename SrcTab, typename DestTab>
149 | class HorizScaler: public Lanczos2DBase<SrcTab,DestTab>
150 | {
151 | public:
152 |     /*
153 |             <-------------->
154 |             <-------------->
155 |             <-------------->
156 |             <-------------->
157 |             <-------------->
158 |             <-------------->
159 | 
160 |             For each output column (out_size = {ow}),
161 |             {h} rows (source and target) get processed
162 | 
163 |             On each row,
164 |               {nmax} source columns get transformed
165 |               into 1 target column
166 | 
167 |             Target:
168 |                New column stride = {1}
169 |                New row    stride = {ow}
170 |             Source:
171 |                Next column stride = {1}
172 |                Next row    stride = {iw}
173 |     */
174 | 
175 |     HorizScaler(
176 |         int iw,int ow, int h,
177 |         const SrcTab& src, DestTab& tgt)
178 |         : Lanczos2DBase<SrcTab,DestTab>(
179 |             src,tgt,
180 |             1,  // xinc_src ok
181 |             iw, // yinc_src ok
182 |             1,  // xinc_tgt ok
183 |             ow, // yinc_tgt ok
184 |             h   // ylimit   ok
185 |          ) { }
186 | };
187 | 
188 | template<typename SrcTab, typename DestTab>
189 | class VertScaler: public Lanczos2DBase<SrcTab,DestTab>
190 | {
191 | public:
192 |     /*
193 |             ^^^^^^^^^^^^^^^^
194 |             ||||||||||||||||
195 |             ||||||||||||||||
196 |             ||||||||||||||||
197 |             ||||||||||||||||
198 |             vvvvvvvvvvvvvvvv
199 | 
200 |             For each output row (out_size = {oh}),
201 |             {w} columns (source and target) get processed
202 | 
203 |             On each column,
204 |               {nmax} source rows get transformed
205 |               into 1 target row
206 | 
207 |             Target:
208 |                New row    stride = {w}
209 |                New column stride = {1}
210 |             Source:
211 |                Next row    stride = {w}
212 |                Next column stride = {1}
213 |     */
214 | 
215 |     VertScaler(
216 |         int w,
217 |         const SrcTab& src, DestTab& tgt)
218 | 
219 |         : Lanczos2DBase<SrcTab,DestTab>(
220 |             src,tgt,
221 |             w, // xinc_src ok
222 |             1, // yinc_src ok
223 |             w, // xinc_tgt ok
224 |             1, // yinc_tgt ok
225 |             w  // ylimit   ok
226 |          ) { }
227 | };
228 | 
229 | /*template<typename SrcTab, typename DestTab>
230 | class ScalarScaler: private Lanczos2DBase<SrcTab, DestTab>
231 | {
232 | public:
233 |     ScalarScaler(const SrcTab& src, DestTab& tgt)
234 |         : Lanczos2DBase<SrcTab,DestTab>(src,tgt, 1,1,1,1,1) { }
235 | 
236 |     void StripeLoop(int tx, int sx, int nmax,
237 |                     const float contrib[], float density) const
238 |     {
239 |         const float density_rev = (density == 0.0 || density == 1.0)
240 |             ? 1.0
241 |             : (1.0 / density);
242 |         ScaleOne(sx, tx, nmax, contrib, density_rev);
243 |     }
244 | };*/
245 | 
/* Result of LanczosCoreCalc: the source window belonging to one output position. */
struct LanczosCoreCalcRes
{
    int start;      // index of the first source sample of the window
    int nmax;       // number of source samples in the window (end - start)
    float density;  // sum of the kernel weights stored into contrib[]
};
252 | 
253 | template<int FilterRadius>
254 | inline LanczosCoreCalcRes LanczosCoreCalc
255 |     (int in_size,
256 |      float center, float support, float scale,
257 |      float contrib[])
258 | {
259 |     const int start = std::max((int)(center-support+(float)0.5), 0);
260 |     const int end   = std::min((int)(center+support+(float)0.5), in_size);
261 |     const int nmax = end-start;
262 | 
263 |     const float scale_pi = scale * M_PI;
264 | 
265 |     const float s_min = -FilterRadius*M_PI;
266 |     const float s_max =  FilterRadius*M_PI;
267 | 
268 |     float s_pi     = (start-center+(float)0.5) * scale_pi;
269 | 
270 |     float density  = 0.0;
271 | 
272 |     { int n=0;
273 |       for(; n < nmax && unlikely(s_pi < s_min); ++n, s_pi += scale_pi)
274 |         {}
275 |       for(; n < nmax && likely(s_pi < s_max); ++n, s_pi += scale_pi)
276 |       {
277 |         float l = Lanczos_pi<FilterRadius,false> (s_pi);
278 |         contrib[n] = l;
279 |         density += l;
280 |       }
281 |     }
282 | 
283 |     LanczosCoreCalcRes res;
284 |     res.start   = start;
285 |     res.nmax    = nmax;
286 |     res.density = density;
287 |     return res;
288 | }
289 | 
290 | /* A generic Lanczos scaler suitable for
291 |  * converting something to something else
292 |  * at once.
293 |  * For image pixels, use Triplet<type>
294 |  * For stereo samples, use Triplet<type, 2>
295 |  * For mono samples, just use type
296 |  */
297 | template<typename Handler>
298 | static void LanczosScale(int in_size, int out_size, Handler& target)
299 | {
300 |     const int FilterRadius = 2;
301 |     const float blur         = 1.0f;
302 | 
303 |     const float factor       = out_size / (float)in_size;
304 |     const float scale        = std::min(factor, (float)1.0) / blur;
305 |     const float support      = FilterRadius / scale;
306 | 
307 |     const std::size_t contrib_size = std::min(in_size, 5+int(2*support));
308 |     float contrib[contrib_size];
309 | 
310 |     /*fprintf(stderr, "Scaling (%d->%d), contrib=%d\n",
311 |         in_size, out_size, (int)contrib_size);*/
312 | 
313 |     #pragma omp parallel for schedule(static)
314 |     for(int outpos=0; outpos<out_size; ++outpos)
315 |     {
316 | 
317 |         float center = (outpos+0.5f) / factor;
318 |         LanczosCoreCalcRes res = LanczosCoreCalc<FilterRadius>(in_size, center, support, scale, contrib);
319 |         target.StripeLoop(outpos, res.start, res.nmax, &contrib[0], res.density);
320 |     }
321 | }
322 | 
323 | 
324 | static void VLanczos(unsigned in_width,unsigned in_height, unsigned out_height, const float* in, float* out)
325 | {
326 |     VertScaler<const float*, float*> handler_y(in_width, in, out);
327 |     LanczosScale(in_height, out_height, handler_y);
328 | }
329 | static void HLanczos(unsigned in_width,unsigned in_height, unsigned out_width, const float* in, float* out)
330 | {
331 |     HorizScaler<const float*, float*> handler_x(in_width,out_width, in_height, in, out);
332 |     LanczosScale(in_width, out_width, handler_x);
333 | }
334 | 
335 | static std::uint32_t ClampWithDesaturation(int r,int g,int b)
336 | {
337 |     const int R = 2126, G = 7152, B = 722, sum=R+G+B;
338 |     int luma = r*R + g*G + b*B;
339 |     if(luma > 255*sum) { r=g=b=255; }
340 |     else if(luma <= 0) { r=g=b=0; }
341 |     else
342 |     {
343 |         // See explanations below on the uses of this function.
344 |         auto spread = [&r,&g,&b,R,G,B](auto&& test, auto&& cap, int sign)
345 |         {
346 |             // Is there load waiting to be shared?
347 |             int cr,cg,cb, work = R*std::max(0, test(r))
348 |                                + G*std::max(0, test(g))
349 |                                + B*std::max(0, test(b));
350 |             if(!work) return false;
351 |             // Are there capable load bearers?
352 |             if(int capacity = R*std::max(0, (cr = cap(r)))
353 |                             + G*std::max(0, (cg = cap(g)))
354 |                             + B*std::max(0, (cb = cap(b))))
355 |             {
356 |                 // Distribute the load (take it away & give to others).
357 |                 int act = std::min(work, capacity);
358 |                 r += cr * sign * act / (cr > 0 ? capacity : work);
359 |                 g += cg * sign * act / (cg > 0 ? capacity : work);
360 |                 b += cb * sign * act / (cb > 0 ? capacity : work);
361 |             }
362 |             return true;
363 |         };
364 |         // Find out the amount of excess color energy.
365 |         // Dissipate it to capable channels,
366 |         // and take it away from those that had excess.
367 |         for(int rounds=0; rounds<4; ++rounds)
368 |         {
369 |             bool excess = spread([](int c) { return c-255; }, // Amount of access
370 |                                  [](int c) { return 255-c; }, // Capacity for reception
371 |                                  1);
372 |             // Find out the amount of color energy debt.
373 |             // Borrow energy from capable channels,
374 |             // and give it to channels that need it.
375 |             bool debt  = spread([](int c) { return -c; }, // Amount of debt
376 |                                 [](int c) { return c; },  // Capacity for borrowing
377 |                                 -1);
378 |             if(!excess && !debt) break;
379 |         }
380 |         // Normally, 1 round should be fine. In case it isn't, we provide
381 |         // a few retry rounds. 2 round is at most needed, in my experiments.
382 |         // But just to be perfectly safe, in the unlikely case that one of
383 |         // the channels still needs clamping... We do it the traditional way.
384 |         // They need clamping if one of the values has bits other than 0-7 set.
385 |         if(unlikely((r | g | b) & ~0xFF))
386 |         {
387 |             r = std::clamp(r, 0, 255);
388 |             g = std::clamp(g, 0, 255);
389 |             b = std::clamp(b, 0, 255);
390 |         }
391 |     }
392 |     return unsigned(r)*65536u + unsigned(g)*256u + b;
393 | }
394 | 
395 | 
396 | void ConvertPicture(unsigned in_width,
397 |                     unsigned in_height,
398 |                     unsigned out_width,
399 |                     unsigned out_height,
400 |                     unsigned NumScanlines,
401 |                     const std::uint32_t* pixels,
402 |                     std::uint32_t* outpixels)
403 | {
404 |     std::vector<float> plane(NumScanlines * in_width * 3);
405 |     std::vector<float> tempplane(TotalVertRes * out_width * 3);
406 |     std::vector<float> resuplane(out_width * out_height * 3);
407 |     constexpr float Gamma = 2.0;
408 | 
409 |     if(in_height == NumScanlines)
410 |     {
411 |         ConvertPlane<16>(NumScanlines*in_width, pixels, &plane[NumScanlines*in_width*0 + 0]);
412 |         ConvertPlane< 8>(NumScanlines*in_width, pixels, &plane[NumScanlines*in_width*1 + 0]);
413 |         ConvertPlane< 0>(NumScanlines*in_width, pixels, &plane[NumScanlines*in_width*2 + 0]);
414 | 
415 |         #pragma omp parallel for simd schedule(static)
416 |         for(unsigned n=0; n<NumScanlines*in_width*3; ++n)
417 |             plane[n] = std::pow(plane[n] / 255.f, 1.0 / Gamma);
418 |     }
419 |     else
420 |     {
421 |         std::vector<float> indata(in_width * in_height * 3);
422 |         ConvertPlane<16>(in_height*in_width, pixels, &indata[in_height*in_width*0 + 0]);
423 |         ConvertPlane< 8>(in_height*in_width, pixels, &indata[in_height*in_width*1 + 0]);
424 |         ConvertPlane< 0>(in_height*in_width, pixels, &indata[in_height*in_width*2 + 0]);
425 | 
426 |         #pragma omp parallel for simd schedule(static)
427 |         for(unsigned n=0; n<in_height*in_width*3; ++n)
428 |             indata[n] = std::pow(indata[n] / 255.f, 1.0 / Gamma);
429 | 
430 |         #pragma omp parallel for schedule(dynamic)
431 |         for(unsigned n=0; n<3; ++n)
432 |             VLanczos(in_width,in_height, NumScanlines, &indata[in_height*in_width*n], &plane[NumScanlines*in_width*n]);
433 |     }
434 | 
435 |     #pragma omp parallel for schedule(static)
436 |     for(unsigned y=0; y<TotalVertRes; ++y)
437 |     {
438 |         float srcy_flt = y * float(float(NumScanlines) / TotalVertRes);
439 |         unsigned srcy = unsigned(srcy_flt);
440 | 
441 |         float ScaledScanline[TotalHorizRes/*in_width*/ * 3];
442 |         float XScaledScanline[TotalHorizRes * 3];
443 |         float XMask[TotalHorizRes * 3];
444 | 
445 |         float factor = ScanlineMagnitude(srcy_flt - srcy);
446 | 
447 |         #pragma omp simd collapse(2)
448 |         for(unsigned n=0; n<3; ++n)
449 |             for(unsigned x=0; x<in_width; ++x)
450 |             {
451 |                 ScaledScanline[x + in_width*n] = plane[NumScanlines*in_width*n + srcy*in_width + x] * factor;
452 |             }
453 | 
454 |         #pragma omp simd
455 |         for(unsigned x=0; x<TotalHorizRes; ++x)
456 |         {
457 |             XMask[x + TotalHorizRes*0] = GetMask<Cell0Start,Cell0End>(x,y);
458 |             XMask[x + TotalHorizRes*1] = GetMask<Cell1Start,Cell1End>(x,y);
459 |             XMask[x + TotalHorizRes*2] = GetMask<Cell2Start,Cell2End>(x,y);
460 |         }
461 | 
462 |         #pragma omp simd collapse(1)
463 |         for(unsigned n=0; n<3; ++n)
464 |             for(unsigned x=0; x<TotalHorizRes; ++x)
465 |                 XScaledScanline[x + TotalHorizRes*n] = ScaledScanline[x*in_width/TotalHorizRes + in_width*n];
466 | 
467 |         #pragma omp simd
468 |         for(unsigned x=0; x<TotalHorizRes; ++x)
469 |         {
470 |             XScaledScanline[x + TotalHorizRes*0] *= XMask[x + TotalHorizRes*0];
471 |             XScaledScanline[x + TotalHorizRes*1] *= XMask[x + TotalHorizRes*1];
472 |             XScaledScanline[x + TotalHorizRes*2] *= XMask[x + TotalHorizRes*2];
473 |         }
474 | 
475 |         for(unsigned n=0; n<3; ++n)
476 |             HLanczos(TotalHorizRes,1, out_width, &XScaledScanline[TotalHorizRes*n], &tempplane[TotalVertRes*out_width*n + y*out_width]);
477 |     }
478 | 
479 |     #pragma omp parallel for schedule(dynamic)
480 |     for(unsigned n=0; n<3; ++n)
481 |         VLanczos(out_width,TotalVertRes, out_height, &tempplane[TotalVertRes*out_width*n], &resuplane[out_height*out_width*n]);
482 | 
483 |     unsigned hpix = CellWidth0 + CellBlank0 + CellWidth1 + CellBlank1 + CellWidth2 + CellBlank2;
484 |     unsigned vpix = CellHeight0 + CellHeight1;
485 |     float sum = 0, sum2 = 0; unsigned facsum = 0, facsum2 = 0;
486 |     for(unsigned y=0; y<vpix; ++y)
487 |         for(unsigned x=0; x<hpix; ++x)
488 |             { facsum += 1; sum += GetMask<Cell0Start,Cell0End>(x,y) + GetMask<Cell1Start,Cell1End>(x,y) + GetMask<Cell2Start,Cell2End>(x,y); }
489 |     for(unsigned n=0; n<8; ++n)
490 |         { facsum2 += 1; sum2 += ScanlineMagnitude(n/8.f); }
491 |     float factor = facsum*facsum2 / (sum*sum2);
492 | 
493 |     #pragma omp parallel for simd schedule(static)
494 |     for(unsigned n=0; n<out_width*out_height*3; ++n) resuplane[n] = (resuplane[n] /*+ 0.075f*/) * factor;
495 | 
496 |     std::vector<short> resuplanes(out_width * out_height * 3);
497 |     std::vector<short> resuplanestmp(out_width * out_height * 3);
498 |     std::vector<short> resuplaneout(out_width * out_height * 3);
499 | 
500 |     #pragma omp parallel for simd schedule(static)
501 |     for(unsigned n=0; n<out_width*out_height*3; ++n)
502 |         resuplanes[n] = 600.f * std::pow(resuplane[n], Gamma);
503 | 
504 |     for(unsigned n=0; n<3; ++n)
505 |     {
506 |         blur<3>(&resuplanes[n*out_width*out_height],
507 |                 &resuplaneout[n*out_width*out_height],
508 |                 &resuplanestmp[n*out_width*out_height],
509 |                 out_width, out_height, out_width / 640.f);
510 |     }
511 | 
512 |     #pragma omp parallel for simd schedule(static)
513 |     for(unsigned n=0; n<out_width*out_height*3; ++n)
514 |         resuplanes[n] = 255.f * std::pow(resuplane[n], Gamma);
515 | 
516 |     #pragma omp parallel for schedule(static)
517 |     for(unsigned n=0; n<out_width*out_height; ++n)
518 |     {
519 |         outpixels[n] = ClampWithDesaturation(resuplanes[out_height*out_width*0+n] + resuplaneout[out_height*out_width*0+n],
520 |                                              resuplanes[out_height*out_width*1+n] + resuplaneout[out_height*out_width*1+n],
521 |                                              resuplanes[out_height*out_width*2+n] + resuplaneout[out_height*out_width*2+n]);
522 |     }
523 | }
524 | 
/* Writes exactly `length` bytes from `b` to `fd`, retrying after short
 * writes and after EAGAIN / EINTR.
 *
 * Returns the number of bytes written (== length) on success,
 *         0 when write() reports that nothing could be written, or
 *         -errno when write() fails with any other error.
 * A diagnostic is printed to stderr in the two failure cases. */
static long FullyWrite(int fd, const void* b, std::size_t length) // SafeWrite
{
    const unsigned char* buf = (const unsigned char*) b;
    const unsigned char* const origbuf = buf;
    for(;;)
    {
        // write() returns ssize_t; a narrower type would truncate large counts.
        const ssize_t result = write(fd, buf, length);
        if(result < 0)
        {
            if(errno == EAGAIN || errno == EINTR) continue;
            // perror() may itself change errno, so keep the original value.
            const int saved_errno = errno;
            std::perror("write");
            return -(long)saved_errno;
        }
        if(result == 0) { std::fprintf(stderr, "\33[1mwrite: EOF\33[m\n"); return 0; }
        length -= result;
        buf    += result;
        if(!length) return buf-origbuf;
    }
}
/* Reads exactly `length` bytes from `fd` into `b`, retrying after short
 * reads and after EAGAIN / EINTR.
 *
 * Returns the number of bytes read (== length) on success,
 *         0 when end of input is reached before `length` bytes arrived, or
 *         -errno when read() fails with any other error.
 * A diagnostic is printed to stderr in the two failure cases. */
static long FullyRead(int fd, void* b, std::size_t length) // SafeRead
{
    unsigned char* buf = (unsigned char*) b;
    unsigned char* const origbuf = buf;
    for(;;)
    {
        // read() returns ssize_t; a narrower type would truncate large counts.
        const ssize_t result = read(fd, buf, length);
        if(result < 0)
        {
            if(errno == EAGAIN || errno == EINTR) continue;
            // perror() may itself change errno, so keep the original value.
            const int saved_errno = errno;
            std::perror("read");
            return -(long)saved_errno;
        }
        if(result == 0) { std::fprintf(stderr, "\33[1mread: EOF\33[m\n"); return 0; }
        length -= result;
        buf    += result;
        if(!length) return buf-origbuf;
    }
}
555 | 
556 | int main(int argc, char** argv)
557 | {
558 |     if(argc != 6)
559 |     {
560 |         std::fprintf(stderr, "\33[1mInvalid parameters.\n"
561 |                              "crt-filter <in-width> <in-height> <out-width> <out-height> <numscanlines>\33[m\n");
562 |         return 1;
563 |     }
564 |     unsigned in_width  = std::atoi(argv[1]);
565 |     unsigned in_height = std::atoi(argv[2]);
566 |     unsigned out_width  = std::atoi(argv[3]);
567 |     unsigned out_height = std::atoi(argv[4]);
568 |     unsigned NumScanlines = std::atoi(argv[5]);
569 |     std::vector<std::uint32_t> inbuf(in_width*in_height);
570 |     std::vector<std::uint32_t> outbuf(out_width*out_height);
571 | 
572 |     constexpr unsigned NFrames = 4;
573 |     newhash_t                  hashes[NFrames];
574 |     std::vector<std::uint32_t> saved_outputs[NFrames];
575 |     std::vector<std::uint32_t> saved_inputs[NFrames];
576 |     for(;;)
577 |     {
578 |         if(FullyRead(0, &inbuf[0], inbuf.size()*4) < (long)inbuf.size()*4) break;
579 | 
580 |         newhash_t hash = newhash_calc((const unsigned char*)&inbuf[0],
581 |                                       inbuf.size()*sizeof(inbuf[0]));
582 |         bool found = false;
583 |         for(unsigned n=0; n<NFrames; ++n)
584 |             if(hash == hashes[n] && inbuf == saved_inputs[n])
585 |             {
586 |                 outbuf = saved_outputs[n];
587 |                 found  = true;
588 |                 break;
589 |             }
590 |         if(!found)
591 |         {
592 |             ConvertPicture(in_width, in_height, out_width, out_height, NumScanlines, &inbuf[0], &outbuf[0]);
593 | 
594 |             static unsigned n = 0;
595 |             saved_inputs[n]  = inbuf;
596 |             saved_outputs[n] = outbuf;
597 |             hashes[n]        = hash;
598 |             n = (n+1)%NFrames;
599 |         }
600 | 
601 |         if(FullyWrite(1, &outbuf[0], outbuf.size()*4) < (long)outbuf.size()*4) break;
602 |     }
603 |     return 0;
604 | }
605 | 


--------------------------------------------------------------------------------