├── img ├── mask.png ├── coppers.png ├── rainbow.png ├── rainbow2.png ├── weights.png ├── mpv-shot0001.jpg ├── mpv-shot0002.jpg ├── mpv-shot0003.jpg ├── mpv-shot0004.jpg ├── coppers.php └── rainbow.php ├── newhash ├── newhash.hh ├── endian.hh ├── newhash.cc └── simd.hh ├── make-reencoded.sh ├── reencode.sh ├── blur.hh ├── README.md └── crt-filter.cc /img/mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bisqwit/crt-filter/HEAD/img/mask.png -------------------------------------------------------------------------------- /img/coppers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bisqwit/crt-filter/HEAD/img/coppers.png -------------------------------------------------------------------------------- /img/rainbow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bisqwit/crt-filter/HEAD/img/rainbow.png -------------------------------------------------------------------------------- /img/rainbow2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bisqwit/crt-filter/HEAD/img/rainbow2.png -------------------------------------------------------------------------------- /img/weights.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bisqwit/crt-filter/HEAD/img/weights.png -------------------------------------------------------------------------------- /img/mpv-shot0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bisqwit/crt-filter/HEAD/img/mpv-shot0001.jpg -------------------------------------------------------------------------------- /img/mpv-shot0002.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bisqwit/crt-filter/HEAD/img/mpv-shot0002.jpg -------------------------------------------------------------------------------- /img/mpv-shot0003.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bisqwit/crt-filter/HEAD/img/mpv-shot0003.jpg -------------------------------------------------------------------------------- /img/mpv-shot0004.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bisqwit/crt-filter/HEAD/img/mpv-shot0004.jpg -------------------------------------------------------------------------------- /newhash/newhash.hh: -------------------------------------------------------------------------------- 1 | #ifndef bqtNewhashHH 2 | #define bqtNewhashHH 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include "endian.hh" 9 | 10 | typedef std::uint_least32_t newhash_t; 11 | 12 | extern newhash_t newhash_calc(const unsigned char* buf, unsigned long size); 13 | extern newhash_t newhash_calc_upd(newhash_t c, const unsigned char* buf, unsigned long size); 14 | 15 | #ifdef __cplusplus 16 | } 17 | #endif 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /make-reencoded.sh: -------------------------------------------------------------------------------- 1 | parallel -j6 -- \ 2 | "./reencode.sh 200 sync-gwbasic.avi sync-gwbasic-fancy.mkv 640 200 60 2880 2160" \ 3 | "./reencode.sh 400 tp5.avi tp5-fancy.mkv 2880 400 60 2880 2160" \ 4 | "./reencode.sh 400 sync-qbasic.avi sync-qbasic-fancy.mkv 2880 400 60 2880 2160" \ 5 | "./reencode.sh 400 sync-editv2.avi sync-editv2-fancy.mkv 2880 400 60 2880 2160" \ 6 | "./reencode.sh 400 sync-bc3.avi sync-bc3-fancy.mkv 2880 400 60 2880 2160" \ 7 | "./reencode.sh 400 sync-bp7.avi sync-bp7-fancy.mkv 2880 400 60 2880 2160" \ 8 | "./reencode.sh 400 bp7conf.avi bp7conf-fancy.mkv 2880 400 60 2880 2160" \ 
9 | "./reencode.sh 350 q.avi q-fancy.mkv 640 350 60 2880 2160" 10 | -------------------------------------------------------------------------------- /reencode.sh: -------------------------------------------------------------------------------- 1 | scanlines=$1 2 | outputfile="$3" 3 | 4 | w=$4 5 | h=$5 6 | 7 | ow=$7 8 | oh=$8 9 | r=$6 10 | 11 | f="$2" 12 | 13 | ffmpeg -i "$f" -sws_flags lanczos -vf scale=$w:$h -pix_fmt bgra \ 14 | -f rawvideo -threads 14 -r $r -y /dev/stdout \ 15 | | ./crt-filter $w $h $ow $oh $scanlines \ 16 | | ffmpeg -f rawvideo -pixel_format bgra -video_size $ow"x"$oh \ 17 | -framerate $r -i /dev/stdin \ 18 | -c:v h264 -pix_fmt yuv444p -crf 14 -threads 14 \ 19 | -g $((r/2)) -preset veryslow "$outputfile" 20 | 21 | 22 | # The first ffmpeg just converts the colorspace into BGRA. 23 | # It should not change the resolution. 24 | 25 | # The second one does the rescaling. 26 | 27 | # The third one compresses as H.264 — again, without rescaling. 28 | -------------------------------------------------------------------------------- /img/coppers.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | /* blur(): Really fast O(n) gaussian blur algorithm (gaussBlur_4) 4 | * By Ivan Kuckir with ideas from Wojciech Jarosz 5 | * Adapted from http://blog.ivank.net/fastest-gaussian-blur.html 6 | * 7 | * input: The two-dimensional array of input signal. Must contain w*h elements. 8 | * output: Where the two-dimensional array of blurred signal will be written 9 | * temp: Another array, for temporary use. Same size as input and output. 10 | * w: Width of array 11 | * h: Height of array. 12 | * sigma: Blurring kernel size. Must be smaller than w and h. 13 | * n_boxes: Controls the blurring quality. 1 = box filter. 3 = pretty good filter. 14 | * Higher number = diminishingly better results, but linearly slower. 15 | * elem_t: Type of elements. Should be integer type. 
16 | */ 17 | template 18 | void blur(const elem_t* input, elem_t* output, elem_t* temp, 19 | unsigned w,unsigned h,float sigma) 20 | { 21 | auto wIdeal = std::sqrt((12*sigma*sigma/n_boxes)+1); // Ideal averaging filter width 22 | unsigned wl = wIdeal; if(wl%2==0) --wl; 23 | unsigned wu = wl+2; 24 | auto mIdeal = (12*sigma*sigma - n_boxes*wl*wl - 4*n_boxes*wl - 3*n_boxes)/(-4.*wl - 4); 25 | unsigned m = std::round(mIdeal); 26 | const elem_t* data = input; 27 | for(unsigned n=0; n>1 ] = ($v); // max 26 | $bgr[ ( (7-$i)%6)>>1 ] = ($p); // min 27 | $bgr[ ($i+1)%3 ] = ( 28 | ($i&1) ? $v - $qt // max downto min 29 | : $p + $qt ); // min upto max 30 | return $bgr; 31 | } 32 | } 33 | 34 | $dither = Array(1,49,13,61,4,52,16,64,33,17,45,29,36,20,48,32,9,57,5,53,12,60,8,56,41,25,37,21,44,28,40,24,3,51,15,63,2,50,14,62,35,19,47,31,34,18,46,30,11,59,7,55,10,58,6,54,43,27,39,23,42,26,38,22); 35 | foreach($dither as &$v)$v /= 64.; unset($v); 36 | 37 | function clamp($pix, $x,$y) 38 | { 39 | global $gamma,$dither; 40 | 41 | if($pix[0] < 0) $pix[0] = 0; 42 | if($pix[1] < 0) $pix[1] = 0; 43 | if($pix[2] < 0) $pix[2] = 0; 44 | $pix[0] = pow($pix[0]/255, $gamma)*255; 45 | $pix[1] = pow($pix[1]/255, $gamma)*255; 46 | $pix[2] = pow($pix[2]/255, $gamma)*255; 47 | 48 | $d = $dither[($x&7)*8+($y&7)]; 49 | $pix[0] += $d; 50 | $pix[1] += $d; 51 | $pix[2] += $d; 52 | if($pix[0] > 255) $pix[0] = 255; 53 | if($pix[1] > 255) $pix[1] = 255; 54 | if($pix[2] > 255) $pix[2] = 255; 55 | 56 | return (int)$pix[2] * 65536 57 | + (int)$pix[1] * 256 58 | + (int)$pix[0]; 59 | } 60 | function clamp2($pix, $x,$y) 61 | { 62 | global $a,$b,$c,$abc; 63 | $lum = $pix[2]*$a + $pix[1]*$b + $pix[0]*$c; 64 | if($lum > 255*$abc) return 0xFFFFFF; 65 | if($lum <= 0) return 0x000000; 66 | $lum /= ($abc*1.0); 67 | 68 | $aa=$a; $bb=$b; $cc=$c; 69 | #$aa=1111; $bb=3333; $cc=5334; 70 | for($round=0; $round<3; ++$round) 71 | { 72 | $excess = $aa*max(0, $pix[2]-255) 73 | + $bb*max(0, $pix[1]-255) 74 | + $cc*max(0, 
$pix[0]-255); 75 | // $excess is the amount of excess color energy that 76 | // we must dissipate. 77 | if($excess > 0) 78 | { 79 | // Check how much capacity there is on each channel. 80 | $capacity = 0; 81 | $cap2 = (255-$pix[2])*$aa; 82 | $cap1 = (255-$pix[1])*$bb; 83 | $cap0 = (255-$pix[0])*$cc; 84 | $capacity = max(0,$cap2) + max(0,$cap1) + max(0,$cap0); 85 | if($capacity > 0) 86 | { 87 | $distribute = min($capacity, $excess); 88 | $factor1 = $distribute/$capacity; 89 | 90 | // Add the color energy to capable channels 91 | if($cap2 > 0) $pix[2] += ($cap2*$factor1)/$aa; 92 | if($cap1 > 0) $pix[1] += ($cap1*$factor1)/$bb; 93 | if($cap0 > 0) $pix[0] += ($cap0*$factor1)/$cc; 94 | 95 | // And take it away from channels that had excess 96 | $factor2 = $distribute/$excess; 97 | if($cap2 < 0) $pix[2] += ($cap2*$factor2)/$aa; 98 | if($cap1 < 0) $pix[1] += ($cap1*$factor2)/$bb; 99 | if($cap0 < 0) $pix[0] += ($cap0*$factor2)/$cc; 100 | } 101 | } 102 | 103 | $debt = $aa*min(0, $pix[2]) 104 | + $bb*min(0, $pix[1]) 105 | + $cc*min(0, $pix[0]); 106 | // $debt is the amount of debt color energy 107 | // that we must borrow. 108 | if($debt < 0) 109 | { 110 | // Check how much capacity there is on each channel. 
111 | $capacity = 0; 112 | $cap2 = ($pix[2])*$aa; 113 | $cap1 = ($pix[1])*$bb; 114 | $cap0 = ($pix[0])*$cc; 115 | $capacity = max(0,$cap2) + max(0,$cap1) + max(0,$cap0); 116 | if($capacity > 0) 117 | { 118 | $distribute = min($capacity, -$debt); // $debt is negative here; borrow at most its magnitude 119 | $factor1 = $distribute/$capacity; 120 | 121 | // Take away color energy from capable channels 122 | if($cap2 > 0) $pix[2] -= ($cap2*$factor1)/$aa; 123 | if($cap1 > 0) $pix[1] -= ($cap1*$factor1)/$bb; 124 | if($cap0 > 0) $pix[0] -= ($cap0*$factor1)/$cc; 125 | 126 | // And give it to channels that need it 127 | $factor2 = $distribute/(-$debt); // fraction of the debt that gets repaid; -$debt > 0 in this branch 128 | if($cap2 < 0) $pix[2] -= ($cap2*$factor2)/$aa; 129 | if($cap1 < 0) $pix[1] -= ($cap1*$factor2)/$bb; 130 | if($cap0 < 0) $pix[0] -= ($cap0*$factor2)/$cc; 131 | } 132 | } 133 | if(!$excess && !$debt) break; 134 | } 135 | return clamp($pix,$x,$y); 136 | } 137 | 138 | $w = 848; 139 | $h = 480; 140 | $im = ImageCreateTrueColor($w*2, $h); 141 | 142 | $gamma = 1/2.0; 143 | $gamma2 = 2.0; 144 | for($y=0; $y<$h; ++$y) 145 | { 146 | #$bright = pow($y/$h, 2.0)*3; 147 | $bright = pow($y/$h, $gamma2)*1.0; 148 | for($x=0; $x<$w; ++$x) 149 | { 150 | $pix = hsv_to_bgr($x * 1.5*360 / $w - 180, 151 | min(1, $x*1.9/$w), 152 | 0.1); 153 | $pix[0] = pow($pix[0]/255., 1/$gamma); 154 | $pix[1] = pow($pix[1]/255., 1/$gamma); 155 | $pix[2] = pow($pix[2]/255., 1/$gamma); 156 | 157 | $lum = ($pix[2]*$a + $pix[1]*$b + $pix[0]*$c) / $abc; 158 | 159 | $pix[0] = ($pix[0] * $bright/$lum)*255; 160 | $pix[1] = ($pix[1] * $bright/$lum)*255; 161 | $pix[2] = ($pix[2] * $bright/$lum)*255; 162 | 163 | $color = clamp($pix, $x,$y); 164 | ImageSetPixel($im, $x,$y, $color); 165 | 166 | $color = clamp2($pix, $x,$y); 167 | ImageSetPixel($im, $x+$w+16,$y, $color); 168 | } 169 | } 170 | ImagePng($im, 'test.png'); 171 | 172 | print "done\n"; 173 | 174 | -------------------------------------------------------------------------------- /newhash/endian.hh:
-------------------------------------------------------------------------------- 1 | #ifndef bqtEndianHH 2 | #define bqtEndianHH 3 | 4 | #ifndef __STDC_CONSTANT_MACROS 5 | #define __STDC_CONSTANT_MACROS /* for UINT16_C etc */ 6 | #endif 7 | 8 | #include 9 | 10 | #if defined(__x86_64)||defined(__i386) 11 | #define LITTLE_ENDIAN_AND_UNALIGNED_ACCESS_OK 12 | #else 13 | #undef LITTLE_ENDIAN_AND_UNALIGNED_ACCESS_OK 14 | #endif 15 | 16 | #ifdef WIN32 17 | # define LL_FMT "I64" 18 | #else 19 | # define LL_FMT "ll" 20 | #endif 21 | 22 | 23 | static inline std::uint_fast16_t get_8(const void* p) 24 | { 25 | const unsigned char* data = (const unsigned char*)p; 26 | return data[0]; 27 | } 28 | static inline std::uint_fast16_t get_16(const void* p) 29 | { 30 | #ifdef LITTLE_ENDIAN_AND_UNALIGNED_ACCESS_OK 31 | return *(const std::uint_least16_t*)p; 32 | #else 33 | const unsigned char* data = (const unsigned char*)p; 34 | return get_8(data) | (get_8(data+1) << UINT16_C(8)); 35 | #endif 36 | } 37 | static inline std::uint_fast16_t R16r(const void* p) 38 | { 39 | #ifdef BIG_ENDIAN_AND_UNALIGNED_ACCESS_OK 40 | return *(const std::uint_least16_t*)p; 41 | #else 42 | const unsigned char* data = (const unsigned char*)p; 43 | return get_8(data+1) | (get_8(data) << UINT16_C(8)); 44 | #endif 45 | } 46 | static inline std::uint_fast32_t R24(const void* p) 47 | { 48 | /* Note: This might be faster if implemented through R32 and a bitwise and, 49 | * but we cannot do that because we don't know if the third byte is a valid 50 | * memory location. 
51 | */ 52 | const unsigned char* data = (const unsigned char*)p; 53 | return get_16(data) | (get_8(data+2) << UINT32_C(16)); 54 | } 55 | static inline std::uint_fast32_t R24r(const void* p) 56 | { 57 | const unsigned char* data = (const unsigned char*)p; 58 | return R16r(data+1) | (get_8(data) << UINT32_C(16)); /* big-endian: data[0] is the most significant byte, so the low half must be read with R16r */ 59 | } 60 | static inline std::uint_fast32_t get_32(const void* p) 61 | { 62 | #ifdef LITTLE_ENDIAN_AND_UNALIGNED_ACCESS_OK 63 | return *(const std::uint_least32_t*)p; 64 | #else 65 | const unsigned char* data = (const unsigned char*)p; 66 | return get_16(data) | (get_16(data+2) << UINT32_C(16)); 67 | #endif 68 | } 69 | static inline std::uint_fast32_t R32r(const void* p) 70 | { 71 | #ifdef BIG_ENDIAN_AND_UNALIGNED_ACCESS_OK 72 | return *(const std::uint_least32_t*)p; 73 | #else 74 | const unsigned char* data = (const unsigned char*)p; 75 | return R16r(data+2) | (R16r(data) << UINT32_C(16)); /* both halves are big-endian, matching the native-read branch above */ 76 | #endif 77 | } 78 | 79 | #define L (std::uint_fast64_t) 80 | 81 | static inline std::uint_fast64_t get_64(const void* p) 82 | { 83 | #ifdef LITTLE_ENDIAN_AND_UNALIGNED_ACCESS_OK 84 | return *(const std::uint_least64_t*)p; 85 | #else 86 | const unsigned char* data = (const unsigned char*)p; 87 | return (L get_32(data)) | ((L get_32(data+4)) << UINT64_C(32)); 88 | #endif 89 | } 90 | static inline std::uint_fast64_t R64r(const void* p) 91 | { 92 | #ifdef BIG_ENDIAN_AND_UNALIGNED_ACCESS_OK 93 | return *(const std::uint_least64_t*)p; 94 | #else 95 | const unsigned char* data = (const unsigned char*)p; 96 | return (L R32r(data+4)) | ((L R32r(data)) << UINT64_C(32)); /* both halves are big-endian, matching the native-read branch above */ 97 | #endif 98 | } 99 | 100 | #undef L 101 | 102 | static inline std::uint_fast64_t get_n(const void* p, unsigned bytes) 103 | { 104 | const unsigned char* data = (const unsigned char*)p; 105 | std::uint_fast64_t res(0); 106 | switch(bytes) 107 | { 108 | case 8: return get_64(p); 109 | case 4: return get_32(p); 110 | case 2: return get_16(p); 111 | case 7: res |= ((std::uint_fast64_t)get_8(data+6))
<< 48; [[fallthrough]]; 112 | case 6: res |= ((std::uint_fast64_t)get_8(data+5)) << 40; [[fallthrough]]; 113 | case 5: res |= ((std::uint_fast64_t)get_16(data+3)) << 24; [[fallthrough]]; 114 | case 3: res |= ((std::uint_fast64_t)get_16(data+1)) << 8; [[fallthrough]]; 115 | case 1: res |= get_8(data); 116 | } 117 | return res; 118 | } 119 | 120 | static void put_8(void* p, std::uint_fast8_t value) 121 | { 122 | unsigned char* data = (unsigned char*)p; 123 | data[0] = value; 124 | } 125 | static void put_16(void* p, std::uint_fast16_t value) 126 | { 127 | #ifdef LITTLE_ENDIAN_AND_UNALIGNED_ACCESS_OK 128 | *(std::uint_least16_t*)p = value; 129 | #else 130 | unsigned char* data = (unsigned char*)p; 131 | put_8(data+0, value ); 132 | put_8(data+1, value>>8); 133 | #endif 134 | } 135 | static void W24(void* p, std::uint_fast32_t value) 136 | { 137 | unsigned char* data = (unsigned char*)p; 138 | put_16(data+0, value); 139 | put_8(data+2, value >> UINT32_C(16)); 140 | } 141 | static void put_32(void* p, std::uint_fast32_t value) 142 | { 143 | #ifdef LITTLE_ENDIAN_AND_UNALIGNED_ACCESS_OK 144 | *(std::uint_least32_t*)p = value; 145 | #else 146 | unsigned char* data = (unsigned char*)p; 147 | put_16(data+0, value); 148 | put_16(data+2, value >> UINT32_C(16)); 149 | #endif 150 | } 151 | static void put_64(void* p, std::uint_fast64_t value) 152 | { 153 | #ifdef LITTLE_ENDIAN_AND_UNALIGNED_ACCESS_OK 154 | *(std::uint_least64_t*)p = value; 155 | #else 156 | unsigned char* data = (unsigned char*)p; 157 | put_32(data+0, (value)); 158 | put_32(data+4, (value >> UINT64_C(32))); 159 | #endif 160 | } 161 | 162 | static inline void put_n(void* p, std::uint_fast64_t value, unsigned bytes) 163 | { 164 | unsigned char* data = (unsigned char*)p; 165 | switch(bytes) 166 | { 167 | case 8: put_64(p, value); break; 168 | case 7: put_8(data+6, value>>48); 169 | case 6: put_8(data+5, value>>40); 170 | case 5: put_8(data+4, value>>32); 171 | case 4: put_32(p, value); break; 172 | case 3: W24(p, 
value); break; 173 | case 2: put_16(p, value); break; 174 | case 1: put_8(p, value); break; 175 | } 176 | } 177 | 178 | #endif 179 | -------------------------------------------------------------------------------- /newhash/newhash.cc: -------------------------------------------------------------------------------- 1 | #include "endian.hh" 2 | #include "newhash.hh" 3 | #include 4 | 5 | #include "simd.hh" 6 | 7 | #ifdef __GNUC__ 8 | # define likely(x) __builtin_expect(!!(x), 1) 9 | # define unlikely(x) __builtin_expect(!!(x), 0) 10 | #else 11 | # define likely(x) (x) 12 | # define unlikely(x) (x) 13 | #endif 14 | 15 | /* Note: The differing algorithms in this 16 | * file are not value-compatible. 17 | */ 18 | 19 | #if defined(__x86_64) || (defined(USE_MMX) && defined(__SSE2__)) 20 | /* On x86_64, we can use 64-bit registers. Which is fast. 21 | * On 32-bit, we can use MMX registers, if MMX is enabled 22 | * However, SSE2 must ALSO be enabled, because otherwise 23 | * we cannot do 64-bit sub/add efficiently. 24 | */ 25 | # define SIXTY_BIT_PLATFORM 26 | #else 27 | # undef SIXTY_BIT_PLATFORM 28 | #endif 29 | 30 | #if 0 && defined(__GNUC__) && defined(__LP64__) && !defined(__ICC) 31 | # define HUNDREDTWENTYEIGHTBIT_PLATFORM 32 | /* 33 | * 128-bit with SSE2 is not feasible, because SSE2 does not 34 | * have 128-bit add/sub ops. They cannot be even synthesized 35 | * from 64-bit adds/subs, because there's no carry update. 36 | * More importantly there's no 128-bit shift to left or right. 37 | * 38 | * __attribute__((mode(TI))) can be used to created a 128-bit 39 | * integer type on GCC, however, it does not work on ICC. 40 | */ 41 | 42 | #ifdef __SSE2__ 43 | #include 44 | #endif 45 | 46 | #else 47 | # undef HUNDREDTWENTYEIGHTBIT_PLATFORM 48 | #endif 49 | 50 | /* Based on Robert J. 
Jenkins Jr.'s "zobra" hash code 51 | * References: 52 | * http://www.burtleburtle.net/bob/hash/evahash.html 53 | * http://www.cris.com/~Ttwang/tech/inthash.htm 54 | * 55 | * Copyright (C) 2011 Joel Yliluoma (http://iki.fi/bisqwit/) 56 | */ 57 | 58 | template 59 | static inline T rol(T v, int n) { return (v<>int( sizeof(T)*8 - n)); } 60 | 61 | /* The mixing step */ 62 | #define mix32z(a,b,c) \ 63 | do{ \ 64 | a=(a-c) ^ rol(c,16); c += b; \ 65 | b=(b-a) ^ rol(a,23); a += c; \ 66 | c=(c-b) ^ rol(b,29); b += a; \ 67 | a=(a-c) ^ rol(c,16); c += b; \ 68 | b=(b-a) ^ rol(a,19); a += c; \ 69 | c=(c-b) ^ rol(b,17); b += a; \ 70 | }while(0) 71 | #define final32z(a,b,c) \ 72 | do{ \ 73 | c=(c^b) - rol(b, 5); \ 74 | a=(a^c) - rol(c,10); \ 75 | b=(b^a) - rol(a, 6); \ 76 | c=(c^b) - rol(b, 9); \ 77 | }while(0) 78 | 79 | #define mix64z(a,b,c) \ 80 | do{ \ 81 | a=(a-c) ^ rol(c, 2); c += b; \ 82 | b=(b-a) ^ rol(a,22); a += c; \ 83 | c=(c-b) ^ rol(b, 3); b += a; \ 84 | a=(a-c) ^ rol(c,36); c += b; \ 85 | b=(b-a) ^ rol(a,48); a += c; \ 86 | c=(c-b) ^ rol(b,42); b += a; \ 87 | }while(0) 88 | #define final64z(a,b,c) \ 89 | do{ \ 90 | c=(c^b) - rol(b,22); \ 91 | a=(a^c) - rol(c, 3); \ 92 | b=(b^a) - rol(a,58); \ 93 | c=(c^b) - rol(b,48); \ 94 | }while(0) 95 | 96 | #define mix128z(a,b,c) \ 97 | do{ \ 98 | a=(a-c) ^ rol(c, 79); c += b; \ 99 | b=(b-a) ^ rol(a,124); a += c; \ 100 | c=(c-b) ^ rol(b, 60); b += a; \ 101 | a=(a-c) ^ rol(c, 74); c += b; \ 102 | b=(b-a) ^ rol(a,115); a += c; \ 103 | c=(c-b) ^ rol(b,101); b += a; \ 104 | }while(0) 105 | #define final128z(a,b,c) \ 106 | do{ \ 107 | c=(c^b) - rol(b, 60); \ 108 | a=(a^c) - rol(c, 20); \ 109 | b=(b^a) - rol(a, 91); \ 110 | c=(c^b) - rol(b,106); \ 111 | }while(0) 112 | 113 | 114 | #ifdef HUNDREDTWENTYEIGHTBIT_PLATFORM 115 | typedef unsigned int std::uint128_t __attribute__((mode(TI))); 116 | 117 | class c128 118 | { 119 | public: 120 | std::uint128_t value; 121 | public: 122 | c128() : value() 123 | { 124 | } 125 | c128(std::uint128_t 
v) : value(v) { } 126 | c128(std::uint_fast64_t a, std::uint_fast64_t b) 127 | : value(a) 128 | { 129 | value <<= 64; 130 | value |= b; 131 | } 132 | c128(std::uint_least64_t a) : value(a) 133 | { 134 | } 135 | c128(std::uint_least32_t a) : value(a) 136 | { 137 | } 138 | #ifdef __SSE2__ 139 | c128(const __m128& b) : value(*(const std::uint128_t*)&b) 140 | { 141 | } 142 | #endif 143 | 144 | c128& operator += (const c128& b) { value += b.value; return *this; } 145 | c128& operator -= (const c128& b) { value -= b.value; return *this; } 146 | c128& operator ^= (const c128& b) 147 | { 148 | #ifdef __SSE2__ 149 | *(__m128*)&value = _mm_xor_ps( *(const __m128*)&value, *(const __m128*)&b.value); 150 | #else 151 | value ^= b.value; 152 | #endif 153 | return *this; 154 | } 155 | c128& operator &= (const c128& b) 156 | { 157 | #ifdef __SSE2__ 158 | *(__m128*)&value = _mm_and_ps( *(const __m128*)&value, *(const __m128*)&b.value); 159 | #else 160 | value &= b.value; 161 | #endif 162 | return *this; 163 | } 164 | c128& operator |= (const c128& b) 165 | { 166 | #ifdef __SSE2__ 167 | *(__m128*)&value = _mm_or_ps( *(const __m128*)&value, *(const __m128*)&b.value); 168 | #else 169 | value |= b.value; 170 | #endif 171 | return *this; 172 | } 173 | c128& operator <<= (int nbits) { value <<= nbits; return *this; } 174 | c128& operator >>= (int nbits) { value >>= nbits; return *this; } 175 | 176 | c128 operator+ (const c128& b) const { return value + b.value; } 177 | c128 operator- (const c128& b) const { return value - b.value; } 178 | c128 operator^ (const c128& b) const 179 | { 180 | #ifdef __SSE2__ 181 | return _mm_xor_ps( *(const __m128*)&value, *(const __m128*)&b.value); 182 | #else 183 | return value ^ b.value; 184 | #endif 185 | } 186 | c128 operator& (const c128& b) const 187 | { 188 | #ifdef __SSE2__ 189 | return _mm_and_ps( *(const __m128*)&value, *(const __m128*)&b.value); 190 | #else 191 | return value & b.value; 192 | #endif 193 | } 194 | c128 operator| (const c128& b) 
const 195 | { 196 | #ifdef __SSE2__ 197 | return _mm_or_ps( *(const __m128*)&value, *(const __m128*)&b.value); 198 | #else 199 | return value | b.value; 200 | #endif 201 | } 202 | c128 operator<< (int nbits) const { return value << nbits; } 203 | c128 operator>> (int nbits) const { return value >> nbits; } 204 | c128 operator~ () const { return ~value; } 205 | }; 206 | 207 | c128 get_128(const void* p) 208 | { 209 | #ifdef LITTLE_ENDIAN_AND_UNALIGNED_ACCESS_OK 210 | return *(const std::uint128_t*)p; 211 | #else 212 | const unsigned char* data = (const unsigned char*)p; 213 | c128 res( get_64(data) ); 214 | c128 res2( get_64(data + 8) ); 215 | res |= res2 << 64; 216 | return res; 217 | #endif 218 | } 219 | static inline c128 RnSubstitute(const void* p, unsigned bytes) 220 | { 221 | const unsigned char* data = (const unsigned char*)p; 222 | switch(bytes) 223 | { 224 | case 1: case 2: case 3: case 4: 225 | case 5: case 6: case 7: case 8: 226 | return get_n(p, bytes); 227 | case 16: return get_128(p); 228 | } 229 | return c128(get_64(data)) | (c128(get_n(data+8, bytes-8)) << 64); 230 | } 231 | #define Rn RnSubstitute 232 | 233 | #endif // 128bit 234 | 235 | newhash_t newhash_calc(const unsigned char* buf, unsigned long size) 236 | { 237 | return newhash_calc_upd(0, buf, size); 238 | } 239 | newhash_t newhash_calc_upd(newhash_t c, const unsigned char* buf, unsigned long size) 240 | { 241 | #ifdef HUNDREDTWENTYEIGHTBIT_PLATFORM 242 | c128 c_cast = c; { 243 | unsigned long len = size; 244 | c128 a(UINT64_C(0x9e3779b97f4a7c15),UINT64_C(0xf39cc0605cedc834)); // 2^128 / ((1+sqrt(5))/2) 245 | a += c_cast + size; 246 | c128 b(a), c(a); 247 | while(len >= 16*3) 248 | { 249 | a += (c128)get_128(buf+0); 250 | b += (c128)get_128(buf+16); 251 | c += (c128)get_128(buf+32); 252 | mix128z(a,b,c); 253 | buf += 48; len -= 48; 254 | } 255 | /*------------------------------------- handle the last 47 bytes */ 256 | if(len > 0) 257 | { 258 | if(len >= 32) { a += (c128)get_128(buf); b += 
(c128)get_128(buf+16); c += (c128)get_n(buf+32,len-32); } 259 | else if(len >= 16) { a += (c128)get_128(buf); b += (c128)get_n(buf+16, len-16); } 260 | else { a += (c128)get_n(buf, len); } 261 | final128z(a,b,c); 262 | } 263 | /*-------------------------------------------- report the result */ 264 | return c.value; /* Note: this returns just the lowest 32 bits of the hash */ 265 | } 266 | #elif defined(SIXTY_BIT_PLATFORM) 267 | c64 c_cast = (std::uint_fast64_t)c; { 268 | unsigned long len = size; 269 | c64 a(UINT64_C(0x9e3779b97f4a7c13)); // 2^64 / ((1+sqrt(5))/2) 270 | a += c_cast + c64(std::uint64_t(size)); 271 | c64 b(a), c(a); 272 | while(len >= 8*3) 273 | { 274 | a += (c64)get_64(buf+0); 275 | b += (c64)get_64(buf+8); 276 | c += (c64)get_64(buf+16); 277 | mix64z(a,b,c); 278 | buf += 24; len -= 24; 279 | } 280 | /*------------------------------------- handle the last 23 bytes */ 281 | if(len > 0) 282 | { 283 | if(len >= 16) { a += (c64)get_64(buf); b += (c64)get_64(buf+8); c += (c64)get_n(buf+16,len-16); } 284 | else if(len >= 8) { a += (c64)get_64(buf); b += (c64)get_n(buf+8, len-8); } 285 | else { a += (c64)get_n(buf, len); } 286 | final64z(a,b,c); 287 | } 288 | /*-------------------------------------------- report the result */ 289 | #ifdef USE_MMX 290 | newhash_t result = get_32(&c.value); /* Note: this returns just the lowest 32 bits of the hash */ 291 | MMX_clear(); 292 | return result; 293 | #else 294 | return c.value; /* Note: this returns just the lowest 32 bits of the hash */ 295 | #endif 296 | } 297 | #else 298 | typedef std::uint_least32_t c32; 299 | c32 a,b; 300 | unsigned long len = size; 301 | c += size + UINT32_C(0x9e3779b9); // 2^32 / ((1+sqrt(5))/2 302 | a = b = c; 303 | while(len >= 4*3) 304 | { 305 | a += get_32(buf+0); 306 | b += get_32(buf+4); 307 | c += get_32(buf+8); 308 | mix32z(a,b,c); 309 | buf += 12; len -= 12; 310 | } 311 | /*------------------------------------- handle the last 11 bytes */ 312 | if(len > 0) 313 | { 314 | if(len >= 
8) { a += (c32)get_32(buf); b += (c32)get_32(buf+4); c += (c32)get_n(buf+8,len-8); } 315 | else if(len >= 4) { a += (c32)get_32(buf); b += (c32)get_n(buf+4, len-4); } 316 | else { a += (c32)get_n(buf, len); } 317 | final32z(a,b,c); 318 | } 319 | /*-------------------------------------------- report the result */ 320 | return c; 321 | #endif 322 | } 323 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bisqwit’s CRT filter 2 | 3 | This is the CRT filter that I used in my ”What is That Editor” video, 4 | at https://www.youtube.com/watch?v=ZMBQmhO8KqI. 5 | 6 | It received some accolades, but I forgot to publish it. 7 | Here it is finally. 8 | 9 | ## To build 10 | 11 | Run this command to build the filter: 12 | 13 | g++ -o crt-filter crt-filter.cc -fopenmp -Ofast -march=native -Wall -Wextra -std=c++17 14 | 15 | ## Usage 16 | 17 | The filter takes BGRA (RGB32) video (RAW!) from stdin, 18 | and produces BGRA video (RAW!) into stdout. 19 | 20 | The filter takes five commandline parameters: 21 | 22 | ./crt-filter <sourcewidth> <sourceheight> <outputwidth> <outputheight> <scanlines> 23 | 24 | The sourcewidth and sourceheight denote the size of the original video. 25 | The outputwidth and outputheight denote the size that you want to produce. 26 | Generally speaking you want to produce as high quality as possible. 27 | Vertical resolution is more important than horizontal resolution. 28 | 29 | Scanlines is the number of scanlines you wish to simulate. 30 | Generally that would be the same as the vertical resolution of the source video, 31 | but that is not a requirement. 32 | 33 | For best quality, the number of scanlines should be chosen 34 | such that the intermediate height (see Constants) 35 | is its integer multiple. 36 | The intermediate width should ideally also be an integer 37 | multiple of the source width. None of this is required though.
38 | 39 | IMPORTANT: This filter does *not* decode or produce video formats like avi/mp4/mkv/whatever. 40 | It only deals with raw video frames. You need to use an external program, 41 | like ffmpeg, to perform the conversions. 42 | See `make-reencoded.sh` and `reencode.sh` for a practical example. 43 | 44 | ## Screenshots 45 | 46 | (Click to enlarge the filtered pictures) 47 | 48 | ![Original1](img/mpv-shot0001.jpg) 49 | ![Filtered1](img/mpv-shot0002.jpg) 50 | 51 | ![Original2](img/mpv-shot0003.jpg) 52 | ![Filtered2](img/mpv-shot0004.jpg) 53 | 54 | ## How it works 55 | 56 | ### Constants 57 | 58 | These constants specify the pixel grid (shadow mask) used by the simulated CRT monitor. 59 | 60 | Currently they are hardcoded in the program, 61 | but they are easy to find if you want to tweak the source code. 62 | 63 | ![width](https://render.githubusercontent.com/render/math?math=\begin{align*}npix_{width}%26=640+%5C%5C+npix_{height}%26=400+%5C%5C+cellwidth_{red}%26=cellwidth_{green}=cellwidth_{blue}=2+%5C%5C+cellblank_{red}%26=cellblank_{green}=1+%5C%5C+cellblank_{blue}%26=2+%5C%5C+cellheight_{vert}%26=5+%5C%5C+cellblank_{vert}%26=1+%5C%5C+cellstagger%26=3+%5C%5C+intermediatewidth%26=npix_{width}\cdot%28cellwidth_{red}%2Bcellblank_{red}%2Bcellwidth_{green}%2Bcellblank_{green}%2Bcellwidth_{blue}%2Bcellblank_{blue}%29=6400+%5C%5C+intermediateheight%26=npix_{height}\cdot%28cellheight_{vert}%2Bcellblank_{vert}%29=2400\end{align*}) 64 | 65 | The cell widths and heights and staggering specify the geometry of the shadow 66 | mask. See Filtering, below, for an example of what it looks like. 67 | 68 | **NB: This page uses GitHub’s own LaTeX math renderer to show equations. 69 | Unfortunately, this renderer produces transparent pictures with black text, 70 | and has very poor usability on *dark mode.* 71 | I am aware of this problem, but there is very little I can do about it, 72 | until GitHub itself fixes it! 73 |
Please view this site on desktop with non-dark mode.** 74 | 75 | ### Hashing 76 | 77 | The filter is designed for DOS videos, and specifically for sessions 78 | involving the text mode. Because chances are that successive frames are 79 | often identical, the filter calculates a hash of every source frame. 80 | 81 | If the hash is found to be identical to some previous frame, 82 | the filtered result of the previous frame is sent. 83 | Otherwise, the new frame is processed, and saved into a cache with the hash of the input image. 84 | 85 | Four previous unique frames are cached. This accounts e.g. for blinking cursors. 86 | 87 | ### Converting into linear colors 88 | 89 | First, the image is un-gammacorrected. 90 | 91 | ![1/gamma](https://render.githubusercontent.com/render/math?math=value\leftarrow+value^{\gamma^{-1}}\text{ where }\gamma=2\text{ for every color channel }value\text{ in the picture}) 92 | 93 | ### Rescaling to scanline count 94 | 95 | Then, the image is rescaled to the height of number of given scanlines using a Lanczos filter. 96 | Kernel size 2 was was selected for the Lanczos filter. 97 | 98 | If your source height is greater than the number of scanlines you specified, you will lose detail. 99 | 100 | ### Rescaling to intermediate size 101 | 102 | Next, the image is rescaled to the intermediate width and height using a nearest-neighbor filter. 103 | 104 | The scaling is performed first vertically and then horizontally. 105 | Before horizontal scaling, the brightness of each row of pixels 106 | is adjusted by a constant factor that is calculated by 107 | 108 | ![formula](https://render.githubusercontent.com/render/math?math=e^{-0.5%28n-0.5%29^{2}c^{-2}}\text{ where }c=0.3\text{ and }n\text{ is+the+fractional+part+of+the+source+Y+coordinate.}) 109 | 110 | This formula produces a figure that sort of looks like a hill. 111 | It peaks in the middle and fades smoothly to the sides. 
112 | This hill represents the brightness of each scanline, as a function of distance from its beginning. 113 | Plotted in a graphing calculator, it looks like this. 114 | The c constant controls how steep that hill is. A small value like 0.1 115 | produces a very narrow hill with very sharp and narrow scanlines, 116 | and bigger values produce flatter hills and less pronounced scanlines. 117 | 0.3 looked like a good compromise. 118 | 119 | This simulates the electron gun passing through in horizontal lines called scanlines, 120 | as it renders the picture line by line. 121 | 122 | ![Gaussian](img/weights.png) 123 | ![Copper bars](img/coppers.png) 124 | 125 | You can download the source code of the right-hand-side illustration in 126 | [img/coppers.php](img/coppers.php). 127 | 128 | ### Filtering 129 | 130 | Each color channel and each pixel of the picture — now intermediate width and height — is multiplied by a mask 131 | that is either one or zero, depending on whether that pixel belongs inside a 132 | cell of that color according to the hardcoded cell geometry. 133 | 134 | The mask is a repeating pattern that essentially looks like this: 135 | 136 | ![Mask](img/mask.png) 137 | 138 | Red pixels denote 1 for red channel, 139 | green pixels denote 1 for green channel, 140 | blue pixels denote 1 for blue channel, 141 | and everything else for everyone is 0. 142 | 143 | This simulates the shadow mask in front of the cathode ray tube. 144 | 145 | The mask is generated procedurally from the cell parameters 146 | (see Constants). 147 | 148 | ### Rescaling to target size 149 | 150 | Then the image is rescaled to the target picture width and target picture height using a Lanczos filter. 151 | The scaling is performed first vertically and then horizontally.
152 | 153 | A Lanczos filter was chosen because it is generally deemed the 154 | best compromise between blurring and fringing 155 | among several simple filters 156 | ([Wikipedia](https://en.wikipedia.org/wiki/Lanczos_resampling)). 157 | I have been using it for years for interpolating all sorts of signals 158 | from pictures to sounds. 159 | 160 | ### Bloom 161 | 162 | First, the brightness of each pixel is normalized so that the sum of masks 163 | and scanline magnitudes does not change the overall brightness of the picture. 164 | 165 | Then, a copy is created of the picture. 166 | This copy is gamma-corrected and amplified with a significant factor, to promote bloom. 167 | 168 | ![gamma](https://render.githubusercontent.com/render/math?math=value_{copy}=\frac{600}{255}value^\gamma\text{ for every color channel }value\text{ in the picture}) 169 | 170 | This copy is 2D-gaussian-blurred using a three-step box filter, 171 | where the blur width is set as output-width / 640. 172 | The blur algorithm is very fast and works in linear time, 173 | adapted from http://blog.ivank.net/fastest-gaussian-blur.html . 174 | 175 | Then, the actual picture is gamma-corrected, this time without a brightening factor. 176 | 177 | ![gamma](https://render.githubusercontent.com/render/math?math=value\leftarrow+value^\gamma\text{ for every color channel }value\text{ in the picture}) 178 | 179 | Then, the blurry copy is merged into the picture, 180 | by literally adding its pixel values into the target pixel values. 
181 | 182 | ![gamma](https://render.githubusercontent.com/render/math?math=value\leftarrow+value%2Bvalue_{copy}\text{ for every color channel }value\text{ in the picture}) 183 | 184 | Because of the combination of amplification and blurring, 185 | if there are isolated bright pixels in the scene, 186 | their power is spread out over a large area 187 | and thus does not contribute much to the final picture, 188 | but if there is a large cluster of bright pixels close by, 189 | they remain bright even after blurring, 190 | and will influence the final picture a lot. 191 | This produces a bloom effect. 192 | 193 | ### Clamping 194 | 195 | Finally, before quantizing the floating-point colors and sending the frame to output, 196 | each pixel is clamped to the target range using a desaturation formula. 197 | 198 | #### The desaturation formula 199 | 200 | The desaturation formula first calculates a luminosity value from the input R,G,B 201 | components using ITU coefficients (see [sRGB on Wikipedia](https://en.wikipedia.org/wiki/SRGB)): 202 | 203 | ![luma calculation](https://render.githubusercontent.com/render/math?math=luma=0.2126\cdot+value_{red}%2B0.7152\cdot+value_{green}%2B0.0722\cdot+value_{blue}) 204 | 205 | * If the luminosity is less than 0, black is returned. 206 | * If the luminosity is more than 1, white is returned. 207 | * Otherwise, a saturation value is initialized as 1, and then adjusted by inspecting each color channel value separately: 208 | 209 | ![adjust](https://render.githubusercontent.com/render/math?math=saturation\leftarrow\begin{cases}\min%28saturation,\frac{luma-1}{luma-value_{channel}}%29,+%26+\text{if }value_{channel}\gt+1%5C%5C%0D%0A\min%28saturation,\frac{luma}{luma-value_{channel}}%29,+%26+\text{if }value_{channel}\lt+0%5C%5Csaturation%26\text{otherwise}\end{cases}) 210 | 211 | After analyzing all color channels, 212 | if the saturation still remains as 1, the input color is returned verbatim.
213 | Otherwise each color channel is readjusted as: 214 | 215 | ![adjust](https://render.githubusercontent.com/render/math?math=value_{channel}\prime=\min%281,\max%280,%28value_{channel}-luma%29\cdot+saturation%2Bluma%29%29) 216 | 217 | The readjusted color channel values are then joined together to form the returned color. 218 | 219 | The advantage of desaturation-aware clamping over naïve clamping 220 | is that it does a much better job at preserving energy. 221 | To illustrate, here is a picture with two color ramps. 222 | The brightness of the color ramp increases linearly along the Y axis. 223 | That is, top is darkest (0) and bottom is brightest (1, i.e. full). 224 | Every pixel on each scanline should be approximately the same brightness. 225 | 226 | The brightness scaling in this illustration is done by simply multiplying 227 | the RGB color with the brightness value. At high brightness values, this 228 | produces colors that are impossible to show on the screen. 229 | 230 | ![Rainbow illustration](img/rainbow.png) 231 | 232 | In the left-side picture with naïve clamping (i.e. `if x>255, then set x to 255`), 233 | you can see that the further 234 | down you go in the picture, the more different the color brightnesses are. 235 | The blue stripe is much, much darker than anything else in the picture, 236 | even though it is fully saturated and as bright as your screen can make it.* 237 | 238 | However, on the right side, with the desaturation-aware clamping formula, 239 | every scanline remains at perfectly even brightness, even 240 | when you exceed the maximum possible brightness of the screen colors. 241 | 242 | In the desaturation-aware algorithm, colors that are impossible 243 | to show on screen due to excess brightness are approximated with 244 | desaturated versions, that preserve the brightness perception 245 | at the cost of color saturation. 246 | 247 | (Note: “Perfectly” was a hyperbole.
248 | The colors are not quite the same brightness, 249 | because of differences in screen calibration and because of 250 | differences between individual human eyes. This is more of an illustration.) 251 | You can download the source code of this illustration in 252 | [img/rainbow.php](img/rainbow.php). 253 | 254 | Note that this does *not* mean that all colors become more washed out. 255 | You may come to this mistaken conclusion, because this illustration is 256 | fixed for perceptual brightness. The only colors that will be 257 | desaturated are those that have out-of-range values 258 | (i.e. individual channel values are greater than 255 or smaller than 0); 259 | marked with a crosshatch pattern in the picture below. 260 | Everything else is kept unchanged. 261 | 262 | ![Rainbow with crosshatch](img/rainbow2.png) 263 | 264 | *) Note that \#0000FF is not blue at brightness 1. While it is maximally bright 265 | fully saturated blue, its brightness is only about 10 % of the brightness of 266 | \#00FF00, maximally bright fully saturated green, and only about 7 % of the 267 | brightness of \#FFFFFF, a maximally bright white pixel (which does have 268 | brightness level of 1). 269 | 270 | This is trivial to 271 | prove: \#FFFFFF is a color where you light up all the LEDs that comprise 272 | color \#0000FF, but you also light up all the LEDs that comprise \#FF0000 273 | and all the LEDs that comprise \#00FF00. Because there are three times as 274 | many LEDs shining as when just \#0000FF is shown, the brightness of \#FFFFFF 275 | cannot be the same, but has to be much higher. Therefore, \#0000FF cannot 276 | have brightness level of 1. 277 | 278 | It is also worth noting that brightness is not the same as radiant energy. 279 | This has nothing to do with energy. 280 | The human eye is simply differently sensitive to different wavelengths 281 | of visible light; least of them to blue (see 282 | [V(λ)](https://en.wikipedia.org/wiki/Luminous_efficiency_function)).
283 | Brightness is a perception phenomenon. 284 | -------------------------------------------------------------------------------- /newhash/simd.hh: -------------------------------------------------------------------------------- 1 | #if defined(__MMX__) && !(defined(__x86_64) || defined(_M_X64)) 2 | #define USE_MMX 3 | #endif 4 | #if defined(__SSE__) 5 | #define USE_SSE 6 | #endif 7 | 8 | /* SIMD interface (MMX) written by Bisqwit 9 | * Copyright (C) 1992,2011 Joel Yliluoma (http://iki.fi/bisqwit/) 10 | */ 11 | 12 | #ifdef __3dNOW__ 13 | # include /* Note: not available on ICC */ 14 | #elif defined(__MMX__) 15 | # include 16 | #endif 17 | #ifdef __SSE__ 18 | #include 19 | #ifdef __ICC 20 | typedef __m128 __v4sf; 21 | #endif 22 | #endif 23 | 24 | struct c64_common 25 | { 26 | static signed char clamp_s8(int_fast64_t v) 27 | { return v<-128 ? -128 : (v > 127 ? 127 : v); } 28 | static unsigned char clamp_u8(int_fast64_t v) 29 | { return v<0 ? 0 : (v > 255 ? 255 : v); } 30 | static short clamp_s16(int_fast64_t v) 31 | { return v<-32768 ? -32768 : (v > 32767 ? 
32767 : v); } 32 | 33 | static inline std::uint_fast64_t expand32_8(std::uint_fast32_t a) 34 | { 35 | // 0000abcd -> 0a0b0c0d 36 | typedef std::uint_fast64_t v; 37 | return (a&0xFFU) 38 | | ((a&0xFF00U)<<8) // base: 8+8 = 16 39 | | ((v)(a&0xFF0000U)<<16) // base: 16+16 = 32 40 | | ((v)(a&0xFF000000UL)<<24); // base: 24+24 = 48 41 | } 42 | static inline std::uint_fast64_t expand32_16(std::uint_fast32_t a) 43 | { 44 | // 0000abcd -> 00ab00cd 45 | typedef std::uint_fast64_t v; 46 | return (a&0xFFFFU) 47 | | ((v)(a&0xFFFF0000UL)<<16); // base: 16+16 = 32 48 | } 49 | }; 50 | 51 | #ifdef __MMX__ 52 | /* 64-bit integers that use MMX / 3Dnow operations where relevant */ 53 | struct c64_MMX: public c64_common 54 | { 55 | typedef c64_MMX c64; 56 | typedef __m64 valuetype; 57 | 58 | valuetype value; 59 | 60 | inline c64_MMX() : value() { } 61 | inline c64_MMX(__m64 v) : value(v) { } 62 | inline c64_MMX(const std::uint64_t& v) : value( *(const __m64*)& v) { } 63 | inline c64_MMX(int v) : value(_m_from_int(v)) { } 64 | inline c64_MMX(short a,short b,short c, short d) 65 | : value(_mm_setr_pi16(a,b,c,d)) { } 66 | 67 | inline c64 operator<< (int b) const { if(b < 0) return *this >> -b; return shl64(b); } 68 | inline c64 operator>> (int b) const { if(b < 0) return *this << -b; return shr64(b); } 69 | c64& operator<<= (int n) { return *this = shl64(n); } 70 | c64& operator>>= (int n) { return *this = shr64(n); } 71 | 72 | operator bool() const // True iff any bit of the 64-bit value is set 73 | { 74 | union m64union { struct { std::uint_least32_t a, b; }; __m64 c; }; 75 | const m64union& tmp = (const m64union&) value; 76 | return (tmp.a | tmp.b) != 0; // was !(a ^ b), which only tested whether the two halves are equal 77 | } // Same non-zero test as c64_nonMMX::operator bool 78 | 79 | c64 conv_s16_u8() const { return conv_s16_u8(*this); } 80 | c64 conv_s16_s8() const { return conv_s16_s8(*this); } 81 | 82 | void Get(const unsigned char* p) { value = *(const __m64*)p; } 83 | void Put( unsigned char* p)const { *(__m64*)p = value; } 84 | 85 | void Init16(short a,short b,short c, short d) 86 | { value = _mm_setr_pi16(a,b,c,d); } 87 |
void Init16(short a) 88 | { value = _mm_set1_pi16(a); } 89 | 90 | void GetD(const unsigned char* p) { value = *(const __m64*)p; } 91 | 92 | template 93 | short Extract16() const { return ((const short*)&value)[n]; } 94 | template 95 | int Extract32() const { return ((const int*)&value)[n]; } 96 | 97 | short Extract88_from_1616lo() const 98 | { 99 | const unsigned char* data = (const unsigned char*)&value; 100 | // bytes: 76543210 101 | // shorts: 33221100 102 | // take: H L 103 | return data[0] | *(const short*)(data+1); 104 | //return data[0] | ((*(const unsigned int*)data) >> 8); 105 | } 106 | short Extract88_from_1616hi() const 107 | { 108 | const unsigned char* data = 4+(const unsigned char*)&value; 109 | // bytes: 76543210 110 | // shorts: 33221100 111 | // take: H L 112 | return data[0] | *(const short*)(data+1); 113 | //return data[0] | ((*(const unsigned int*)data) >> 8); 114 | } 115 | 116 | 117 | c64& operator&= (const c64& b) { value=_mm_and_si64(value,b.value); return *this; } 118 | c64& operator|= (const c64& b) { value=_mm_or_si64(value,b.value); return *this; } 119 | c64& operator^= (const c64& b) { value=_mm_xor_si64(value,b.value); return *this; } 120 | c64& operator+= (const c64& b) { return *this = *this + b; } 121 | c64& operator-= (const c64& b) { return *this = *this - b; } 122 | 123 | c64 operator~ () const { 124 | static const std::uint_least64_t negpat = ~(std::uint_least64_t)0; 125 | return c64(_mm_xor_si64(value, *(const __m64*)&negpat)); 126 | } 127 | 128 | /* psllqi: p = packed 129 | s = shift 130 | r = right, l = left 131 | l = shift in zero, a = shift in sign bit 132 | q = 64-bit, d = 32-bit, w = 16-bit 133 | [i = immed amount] 134 | */ 135 | c64 operator& (const c64& b) const { return c64(_mm_and_si64(value,b.value)); } 136 | c64 operator| (const c64& b) const { return c64(_mm_or_si64(value,b.value)); } 137 | c64 operator^ (const c64& b) const { return c64(_mm_xor_si64(value,b.value)); } 138 | 139 | c64 operator- (const c64& b) const 
140 | { 141 | #ifdef __SSE2__ 142 | return _mm_sub_si64(value, b.value); 143 | #else 144 | return (const std::uint64_t&)value - (const std::uint64_t&)b.value; 145 | #endif 146 | } 147 | c64 operator+ (const c64& b) const 148 | { 149 | #ifdef __SSE2__ 150 | return _mm_add_si64(value, b.value); 151 | #else 152 | return (const std::uint64_t&)value + (const std::uint64_t&)b.value; 153 | #endif 154 | } 155 | 156 | 157 | c64 shl64(int b) const { return _mm_slli_si64(value, b); } 158 | c64 shr64(int b) const { return _mm_srli_si64(value, b); } 159 | c64 shl16(int b) const { return _mm_slli_pi16(value, b); } 160 | c64 shr16(int b) const { return _mm_srli_pi16(value, b); } 161 | c64 sar32(int b) const { return _mm_srai_pi32(value, b); } 162 | c64 sar16(int b) const { return _mm_srai_pi16(value, b); } 163 | c64 add32(const c64& b) const { return _mm_add_pi32(value, b.value); } 164 | c64 add16(const c64& b) const { return _mm_add_pi16(value, b.value); } 165 | c64 sub32(const c64& b) const { return _mm_sub_pi32(value, b.value); } 166 | c64 sub16(const c64& b) const { return _mm_sub_pi16(value, b.value); } 167 | c64 mul16(const c64& b) const { return _mm_mullo_pi16(value, b.value); } 168 | c64 mul16hi(const c64& b) const { return _mm_mulhi_pi16(value, b.value); } 169 | //c64 mul32(const c64& b) const { return _mm_mullo_pi32(value, b.value); } 170 | c64 add8(const c64& b) const { return _mm_add_pi8(value, b.value); } 171 | c64 sub8(const c64& b) const { return _mm_sub_pi8(value, b.value); } 172 | 173 | c64 unpacklbw(const c64& b) const { return _mm_unpacklo_pi8(b.value,value); } 174 | c64 unpacklwd(const c64& b) const { return _mm_unpacklo_pi16(b.value,value); } 175 | c64 unpackhbw(const c64& b) const { return _mm_unpackhi_pi8(b.value,value); } 176 | c64 unpackhwd(const c64& b) const { return _mm_unpackhi_pi16(b.value,value); } 177 | c64 unpackldq(const c64& b) const { return _mm_unpacklo_pi32(b.value,value); } 178 | c64 unpackldq() const { return _mm_unpacklo_pi32(value,value); 
} 179 | 180 | c64 operator& (const std::uint64_t& v) { return c64(_mm_and_si64(value, *(const __m64*)& v)); } 181 | 182 | c64 conv_s32_s16(const c64& b) const { return _mm_packs_pi32(value, b.value); } 183 | c64 conv_s16_u8(const c64& b) const { return _mm_packs_pu16(value, b.value); } 184 | c64 conv_s16_s8(const c64& b) const { return _mm_packs_pi16(value, b.value); } 185 | }; 186 | #endif 187 | 188 | struct c64_nonMMX: public c64_common 189 | { 190 | typedef c64_nonMMX c64; 191 | typedef std::uint_least64_t valuetype; 192 | valuetype value; 193 | 194 | inline c64_nonMMX() : value() { } 195 | inline c64_nonMMX(std::uint64_t v) : value(v) { } 196 | inline c64_nonMMX(int v) : value(v) { } 197 | inline c64_nonMMX(short a,short b,short c, short d) : value() 198 | { Init16(a,b,c,d); } 199 | 200 | c64 operator<< (int b) const { if(b < 0) return *this >> -b; return shl64(b); } 201 | c64 operator>> (int b) const { if(b < 0) return *this << -b; return shr64(b); } 202 | c64& operator<<= (int n) { return *this = shl64(n); } 203 | c64& operator>>= (int n) { return *this = shr64(n); } 204 | 205 | operator bool() const { return value; } 206 | 207 | c64 conv_s16_u8() const { return conv_s16_u8(*this); } 208 | c64 conv_s16_s8() const { return conv_s16_s8(*this); } 209 | 210 | void Init16(short a,short b,short c, short d) 211 | { std::uint_fast64_t aa = (unsigned short)a, 212 | bb = (unsigned short)b, 213 | cc = (unsigned short)c, 214 | dd = (unsigned short)d; 215 | value = aa | (bb << 16) | (cc << 32) | (dd << 48); } 216 | void Init16(short a) 217 | { Init16(a,a,a,a); } 218 | void Init8(unsigned char a,unsigned char b,unsigned char c,unsigned char d, 219 | unsigned char e,unsigned char f,unsigned char g,unsigned char h) 220 | { // Packs eight bytes into value, a in the lowest byte and h in the highest 221 | value = ((std::uint_fast64_t)(a | (b << 8) | (c << 16))) | (((std::uint_fast64_t)d) << 24) // widen d first: (int)d << 24 overflows for d >= 128 and would sign-extend into bytes e..h 222 | | (((std::uint_fast64_t)e) << 32) 223 | | (((std::uint_fast64_t)f) << 40) 224 | | (((std::uint_fast64_t)g) << 48) 225 | | (((std::uint_fast64_t)h) << 56); 226 | } 227 | 228 |
void Get(const unsigned char* p) { value = *(const std::uint_least64_t*)p; } 229 | void Put( unsigned char* p)const { *(std::uint_least64_t*)p = value; } 230 | 231 | c64& operator&= (const c64& b) { value&=b.value; return *this; } 232 | c64& operator|= (const c64& b) { value|=b.value; return *this; } 233 | c64& operator^= (const c64& b) { value^=b.value; return *this; } 234 | c64& operator+= (const c64& b) { value+=b.value; return *this; } 235 | c64& operator-= (const c64& b) { value-=b.value; return *this; } 236 | c64 operator& (const c64& b) const { return value & b.value; } 237 | c64 operator| (const c64& b) const { return value | b.value; } 238 | c64 operator^ (const c64& b) const { return value ^ b.value; } 239 | c64 operator- (const c64& b) const { return value - b.value; } 240 | c64 operator+ (const c64& b) const { return value + b.value; } 241 | 242 | c64 operator& (std::uint_fast64_t b) const { return value & b; } 243 | 244 | c64 operator~ () const { return ~value; } 245 | 246 | #define usimdsim(type, count, op) \ 247 | type* p = (type*)&res.value; \ 248 | for(int n=0; n> b; } 257 | c64 shl16(int b) const { c64 res = *this; usimdsim(short, 2, <<); return res; } 258 | c64 shr16(int b) const { c64 res = *this; usimdsim(unsigned short, 2, >>); return res; } 259 | c64 sar32(int b) const { c64 res = *this; usimdsim(int, 2, >>); return res; } 260 | c64 sar16(int b) const { c64 res = *this; usimdsim(short, 2, >>); return res; } 261 | 262 | c64 add16(const c64& b) const { c64 res = *this; simdsim(short, 4, +); return res; } 263 | c64 sub16(const c64& b) const { c64 res = *this; simdsim(short, 4, -); return res; } 264 | c64 add32(const c64& b) const { c64 res = *this; simdsim(int, 2, +); return res; } 265 | c64 sub32(const c64& b) const { c64 res = *this; simdsim(int, 2, -); return res; } 266 | c64 mul16(const c64& b) const { c64 res = *this; simdsim(short, 4, *); return res; } 267 | c64 mul16hi(const c64& b) const { c64 res = *this; simdsim(short, 4, *) >> 16; 
return res; } 268 | c64 add8(const c64& b) const { c64 res = *this; simdsim(unsigned char, 8, +); return res; } 269 | c64 sub8(const c64& b) const { c64 res = *this; simdsim(unsigned char, 8, -); return res; } 270 | 271 | #undef simdsim 272 | #undef usimdsim 273 | 274 | c64 conv_s32_s16(const c64& b) const 275 | { 276 | c64 res; res. 277 | Init16(clamp_s16(value & 0xFFFFFFFFU), 278 | clamp_s16(value >> 32), 279 | clamp_s16(b.value & 0xFFFFFFFFU), 280 | clamp_s16(b.value >> 32)); 281 | return res; 282 | } 283 | c64 conv_s16_u8(const c64& b) const 284 | { 285 | c64 res; res. 286 | Init8(clamp_u8(value & 0xFFFF), 287 | clamp_u8((value >> 16) & 0xFFFF), 288 | clamp_u8((value >> 32) & 0xFFFF), 289 | clamp_u8((value >> 48) & 0xFFFF), 290 | clamp_u8(b.value & 0xFFFF), 291 | clamp_u8((b.value >> 16) & 0xFFFF), 292 | clamp_u8((b.value >> 32) & 0xFFFF), 293 | clamp_u8((b.value >> 48) & 0xFFFF)); 294 | return res; 295 | } 296 | c64 conv_s16_s8(const c64& b) const 297 | { 298 | c64 res; res. 299 | Init8(clamp_s8(value & 0xFFFF), 300 | clamp_s8((value >> 16) & 0xFFFF), 301 | clamp_s8((value >> 32) & 0xFFFF), 302 | clamp_s8((value >> 48) & 0xFFFF), 303 | clamp_s8(b.value & 0xFFFF), 304 | clamp_s8((b.value >> 16) & 0xFFFF), 305 | clamp_s8((b.value >> 32) & 0xFFFF), 306 | clamp_s8((b.value >> 48) & 0xFFFF)); 307 | return res; 308 | } 309 | 310 | /* TODO: Verify that these are correct (though they should never be used anyway) */ 311 | c64 unpacklbw(const c64& p) const 312 | { 313 | #if defined(__MMX__) && !defined(__ICC) 314 | /* ICC says [error: type of cast must be integral or enum] 315 | * on the return value cast, 316 | * so we cannot use this code on ICC. Fine for GCC. 
*/ 317 | return (std::uint_least64_t)_m_punpcklbw(*(const __m64*)&p.value, *(const __m64*)&value); 318 | #else 319 | std::uint_fast64_t a=value, b=p.value; 320 | return expand32_8(a) | (expand32_8(b) << 8); 321 | #endif 322 | } 323 | c64 unpackhbw(const c64& p) const 324 | { 325 | #if defined(__MMX__) && !defined(__ICC) 326 | return (std::uint_least64_t)_m_punpckhbw(*(const __m64*)&p.value, *(const __m64*)&value); 327 | #else 328 | std::uint_fast64_t a=value, b=p.value; 329 | return expand32_8(a>>32) | (expand32_8(b>>32) << 8); 330 | #endif 331 | } 332 | c64 unpacklwd(const c64& p) const 333 | { 334 | #if defined(__MMX__) && !defined(__ICC) 335 | return (std::uint_least64_t)_m_punpcklwd(*(const __m64*)&p.value, *(const __m64*)&value); 336 | #else 337 | std::uint_fast64_t a=value, b=p.value; 338 | return expand32_16(a) | (expand32_16(b) << 16); 339 | #endif 340 | } 341 | c64 unpackhwd(const c64& p) const 342 | { 343 | #if defined(__MMX__) && !defined(__ICC) 344 | return (std::uint_least64_t)_m_punpckhwd(*(const __m64*)&p.value, *(const __m64*)&value); 345 | #else 346 | std::uint_fast64_t a=value, b=p.value; 347 | return expand32_16(a>>32) | (expand32_16(b>>32) << 16); 348 | #endif 349 | } 350 | c64 unpackldq() const { return unpackldq(*this); } 351 | c64 unpackldq(const c64& p) const 352 | { 353 | #if defined(__MMX__) && !defined(__ICC) 354 | return (std::uint_least64_t)_m_punpckldq(*(const __m64*)&p.value, *(const __m64*)&value); 355 | #else 356 | return value | (p.value << 32); 357 | #endif 358 | } 359 | }; 360 | 361 | #ifdef USE_MMX 362 | typedef c64_MMX c64; 363 | #else 364 | typedef c64_nonMMX c64; 365 | #endif 366 | 367 | static inline void MMX_clear() 368 | { 369 | #ifdef __3dNOW__ 370 | _m_femms(); /* Note: not available on ICC or Valgrind */ 371 | //_mm_empty(); 372 | #elif defined(__MMX__) 373 | _mm_empty(); 374 | #endif 375 | } 376 | -------------------------------------------------------------------------------- /crt-filter.cc: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "blur.hh" 9 | 10 | #define likely(x) __builtin_expect(!!(x), 1) 11 | #define unlikely(x) __builtin_expect(!!(x), 0) 12 | 13 | #include "newhash/newhash.cc" 14 | 15 | /* Magnitude of scaled scanline, where n = 0..1 = position between scanlines */ 16 | inline constexpr float ScanlineMagnitude(float n) { float c = 0.3f; return std::exp(-(n-0.5f)*(n-0.5f)/(2.f*c*c)); } 17 | 18 | constexpr unsigned NumHorizPixels = 640; 19 | constexpr unsigned CellWidth0 = 2, CellBlank0 = 1; // R 20 | constexpr unsigned CellWidth1 = 2, CellBlank1 = 1; // G 21 | constexpr unsigned CellWidth2 = 2, CellBlank2 = 2; // B 22 | constexpr unsigned TotalHorizRes = NumHorizPixels * (CellWidth0 + CellBlank0 + CellWidth1 + CellBlank1 + CellWidth2 + CellBlank2); 23 | constexpr unsigned Cell0Start = 0, Cell0End = Cell0Start + CellWidth0; 24 | constexpr unsigned Cell1Start = Cell0End+CellBlank0, Cell1End = Cell1Start + CellWidth1; 25 | constexpr unsigned Cell2Start = Cell1End+CellBlank1, Cell2End = Cell2Start + CellWidth2; 26 | 27 | constexpr unsigned NumVertPixels = 400; 28 | constexpr unsigned CellHeight0 = 5; // Height of RGB triplet 29 | constexpr unsigned CellHeight1 = 1; // Blank after RGB triplet 30 | constexpr unsigned CellStagger = 3; // Offset of successive columns 31 | constexpr unsigned TotalVertRes = NumVertPixels * (CellHeight0 + CellHeight1); 32 | 33 | template 34 | static inline float GetMask(unsigned x, unsigned y) 35 | { 36 | constexpr unsigned cellwidth = CellWidth0 + CellBlank0 + CellWidth1 + CellBlank1 + CellWidth2 + CellBlank2; 37 | unsigned hpix = x / cellwidth, hmod = x % cellwidth; 38 | 39 | constexpr unsigned cellheight = CellHeight0 + CellHeight1; 40 | unsigned vmod = (y + CellStagger * hpix) % cellheight; 41 | return (vmod < CellHeight0) & (hmod >= Start) & (hmod < End); 42 | } 43 | 44 | 
template 45 | static void ConvertPlane(unsigned num, const std::uint32_t* pixels, float* output) 46 | { 47 | #pragma omp simd 48 | for(unsigned n=0; n>= Shift; 52 | p &= 0xFF; 53 | output[n] = p;// * (1.f / 255.f); 54 | } 55 | } 56 | 57 | 58 | 59 | template 60 | static inline float Lanczos_pi(float x_pi) 61 | { 62 | if(unlikely(x_pi == (float)0.0)) return (float)1.0; 63 | if(check) 64 | { 65 | if (x_pi <= (float)(-Radius*M_PI) 66 | || x_pi >= (float)( Radius*M_PI)) return (float)0.0; 67 | } 68 | 69 | float x_pi_div_Radius = x_pi / Radius; 70 | 71 | //float a = sin(x_pi) / x_pi; 72 | //float b = sin(x_pi_div_Radius) / x_pi_div_Radius; 73 | //return a * b; 74 | 75 | return std::sin(x_pi) * std::sin(x_pi_div_Radius) / (x_pi * x_pi_div_Radius); 76 | } 77 | template 78 | static inline float Lanczos(float x) 79 | { 80 | return Lanczos_pi(x * (float)M_PI); 81 | } 82 | template 83 | class Lanczos2DBase 84 | { 85 | const SrcTab& src; DestTab& tgt; 86 | 87 | // Note: In this vocabulary, 88 | // y denotes outer loop and 89 | // x denotes inner loop, but 90 | // it could be vice versa. 
91 | int xinc_src, yinc_src; 92 | int xinc_tgt, yinc_tgt; 93 | int ylimit; 94 | public: 95 | Lanczos2DBase( 96 | const SrcTab& src_, DestTab& tgt_, 97 | int sxi,int syi, int txi,int tyi, int ylim) 98 | : src(src_), tgt(tgt_), 99 | xinc_src(sxi), yinc_src(syi), 100 | xinc_tgt(txi), yinc_tgt(tyi), 101 | ylimit(ylim) 102 | { } 103 | 104 | void ScaleOne 105 | (int srcpos, int tgtpos, int nmax, 106 | const float contrib[], float density_rev) const 107 | { 108 | float res = 0.0; 109 | int srctemp = srcpos; 110 | 111 | //#pragma omp parallel for reduction(+:r,g,b) 112 | #pragma omp simd reduction(+:res) 113 | for(int n=0; n0; ) 138 | { 139 | /*fprintf(stderr, "- within: srcpos=%d, tgtpos=%d, y=%d\n", srcpos,tgtpos, y);*/ 140 | ScaleOne(srcpos, tgtpos, nmax, contrib, density_rev); 141 | 142 | srcpos += yinc_src; // source y increment 143 | tgtpos += yinc_tgt; // target y increment 144 | } 145 | } 146 | }; 147 | 148 | template 149 | class HorizScaler: public Lanczos2DBase 150 | { 151 | public: 152 | /* 153 | <--------------> 154 | <--------------> 155 | <--------------> 156 | <--------------> 157 | <--------------> 158 | <--------------> 159 | 160 | For each output column (out_size = {ow}), 161 | {h} rows (source and target) get processed 162 | 163 | On each row, 164 | {nmax} source columns get transformed 165 | into 1 target column 166 | 167 | Target: 168 | New column stride = {1} 169 | New row stride = {ow} 170 | Source: 171 | Next column stride = {1} 172 | Next row stride = {iw} 173 | */ 174 | 175 | HorizScaler( 176 | int iw,int ow, int h, 177 | const SrcTab& src, DestTab& tgt) 178 | : Lanczos2DBase( 179 | src,tgt, 180 | 1, // xinc_src ok 181 | iw, // yinc_src ok 182 | 1, // xinc_tgt ok 183 | ow, // yinc_tgt ok 184 | h // ylimit ok 185 | ) { } 186 | }; 187 | 188 | template 189 | class VertScaler: public Lanczos2DBase 190 | { 191 | public: 192 | /* 193 | ^^^^^^^^^^^^^^^^ 194 | |||||||||||||||| 195 | |||||||||||||||| 196 | |||||||||||||||| 197 | |||||||||||||||| 198 | 
vvvvvvvvvvvvvvvv 199 | 200 | For each output row (out_size = {oh}), 201 | {w} columns (source and target) get processed 202 | 203 | On each column, 204 | {nmax} source rows get transformed 205 | into 1 target row 206 | 207 | Target: 208 | New row stride = {w} 209 | New column stride = {1} 210 | Source: 211 | Next row stride = {w} 212 | Next column stride = {1} 213 | */ 214 | 215 | VertScaler( 216 | int w, 217 | const SrcTab& src, DestTab& tgt) 218 | 219 | : Lanczos2DBase( 220 | src,tgt, 221 | w, // xinc_src ok 222 | 1, // yinc_src ok 223 | w, // xinc_tgt ok 224 | 1, // yinc_tgt ok 225 | w // ylimit ok 226 | ) { } 227 | }; 228 | 229 | /*template 230 | class ScalarScaler: private Lanczos2DBase 231 | { 232 | public: 233 | ScalarScaler(const SrcTab& src, DestTab& tgt) 234 | : Lanczos2DBase(src,tgt, 1,1,1,1,1) { } 235 | 236 | void StripeLoop(int tx, int sx, int nmax, 237 | const float contrib[], float density) const 238 | { 239 | const float density_rev = (density == 0.0 || density == 1.0) 240 | ? 
1.0 241 | : (1.0 / density); 242 | ScaleOne(sx, tx, nmax, contrib, density_rev); 243 | } 244 | };*/ 245 | 246 | struct LanczosCoreCalcRes 247 | { 248 | int start; 249 | int nmax; 250 | float density; 251 | }; 252 | 253 | template 254 | inline LanczosCoreCalcRes LanczosCoreCalc 255 | (int in_size, 256 | float center, float support, float scale, 257 | float contrib[]) 258 | { 259 | const int start = std::max((int)(center-support+(float)0.5), 0); 260 | const int end = std::min((int)(center+support+(float)0.5), in_size); 261 | const int nmax = end-start; 262 | 263 | const float scale_pi = scale * M_PI; 264 | 265 | const float s_min = -FilterRadius*M_PI; 266 | const float s_max = FilterRadius*M_PI; 267 | 268 | float s_pi = (start-center+(float)0.5) * scale_pi; 269 | 270 | float density = 0.0; 271 | 272 | { int n=0; 273 | for(; n < nmax && unlikely(s_pi < s_min); ++n, s_pi += scale_pi) 274 | {} 275 | for(; n < nmax && likely(s_pi < s_max); ++n, s_pi += scale_pi) 276 | { 277 | float l = Lanczos_pi (s_pi); 278 | contrib[n] = l; 279 | density += l; 280 | } 281 | } 282 | 283 | LanczosCoreCalcRes res; 284 | res.start = start; 285 | res.nmax = nmax; 286 | res.density = density; 287 | return res; 288 | } 289 | 290 | /* A generic Lanczos scaler suitable for 291 | * converting something to something else 292 | * at once. 
293 | * For image pixels, use Triplet 294 | * For stereo samples, use Triplet 295 | * For mono samples, just use type 296 | */ 297 | template 298 | static void LanczosScale(int in_size, int out_size, Handler& target) 299 | { 300 | const int FilterRadius = 2; 301 | const float blur = 1.0f; 302 | 303 | const float factor = out_size / (float)in_size; 304 | const float scale = std::min(factor, (float)1.0) / blur; 305 | const float support = FilterRadius / scale; 306 | 307 | const std::size_t contrib_size = std::min(in_size, 5+int(2*support)); 308 | float contrib[contrib_size]; 309 | 310 | /*fprintf(stderr, "Scaling (%d->%d), contrib=%d\n", 311 | in_size, out_size, (int)contrib_size);*/ 312 | 313 | #pragma omp parallel for schedule(static) 314 | for(int outpos=0; outpos(in_size, center, support, scale, contrib); 319 | target.StripeLoop(outpos, res.start, res.nmax, &contrib[0], res.density); 320 | } 321 | } 322 | 323 | 324 | static void VLanczos(unsigned in_width,unsigned in_height, unsigned out_height, const float* in, float* out) 325 | { 326 | VertScaler handler_y(in_width, in, out); 327 | LanczosScale(in_height, out_height, handler_y); 328 | } 329 | static void HLanczos(unsigned in_width,unsigned in_height, unsigned out_width, const float* in, float* out) 330 | { 331 | HorizScaler handler_x(in_width,out_width, in_height, in, out); 332 | LanczosScale(in_width, out_width, handler_x); 333 | } 334 | 335 | static std::uint32_t ClampWithDesaturation(int r,int g,int b) 336 | { 337 | const int R = 2126, G = 7152, B = 722, sum=R+G+B; 338 | int luma = r*R + g*G + b*B; 339 | if(luma > 255*sum) { r=g=b=255; } 340 | else if(luma <= 0) { r=g=b=0; } 341 | else 342 | { 343 | // See explanations below on the uses of this function. 344 | auto spread = [&r,&g,&b,R,G,B](auto&& test, auto&& cap, int sign) 345 | { 346 | // Is there load waiting to be shared? 
347 | int cr,cg,cb, work = R*std::max(0, test(r)) 348 | + G*std::max(0, test(g)) 349 | + B*std::max(0, test(b)); 350 | if(!work) return false; 351 | // Are there capable load bearers? 352 | if(int capacity = R*std::max(0, (cr = cap(r))) 353 | + G*std::max(0, (cg = cap(g))) 354 | + B*std::max(0, (cb = cap(b)))) 355 | { 356 | // Distribute the load (take it away & give to others). 357 | int act = std::min(work, capacity); 358 | r += cr * sign * act / (cr > 0 ? capacity : work); 359 | g += cg * sign * act / (cg > 0 ? capacity : work); 360 | b += cb * sign * act / (cb > 0 ? capacity : work); 361 | } 362 | return true; 363 | }; 364 | // Find out the amount of excess color energy. 365 | // Dissipate it to capable channels, 366 | // and take it away from those that had excess. 367 | for(int rounds=0; rounds<4; ++rounds) 368 | { 369 | bool excess = spread([](int c) { return c-255; }, // Amount of access 370 | [](int c) { return 255-c; }, // Capacity for reception 371 | 1); 372 | // Find out the amount of color energy debt. 373 | // Borrow energy from capable channels, 374 | // and give it to channels that need it. 375 | bool debt = spread([](int c) { return -c; }, // Amount of debt 376 | [](int c) { return c; }, // Capacity for borrowing 377 | -1); 378 | if(!excess && !debt) break; 379 | } 380 | // Normally, 1 round should be fine. In case it isn't, we provide 381 | // a few retry rounds. 2 round is at most needed, in my experiments. 382 | // But just to be perfectly safe, in the unlikely case that one of 383 | // the channels still needs clamping... We do it the traditional way. 384 | // They need clamping if one of the values has bits other than 0-7 set. 
385 | if(unlikely((r | g | b) & ~0xFF)) 386 | { 387 | r = std::clamp(r, 0, 255); 388 | g = std::clamp(g, 0, 255); 389 | b = std::clamp(b, 0, 255); 390 | } 391 | } 392 | return unsigned(r)*65536u + unsigned(g)*256u + b; 393 | } 394 | 395 | 396 | void ConvertPicture(unsigned in_width, 397 | unsigned in_height, 398 | unsigned out_width, 399 | unsigned out_height, 400 | unsigned NumScanlines, 401 | const std::uint32_t* pixels, 402 | std::uint32_t* outpixels) 403 | { 404 | std::vector plane(NumScanlines * in_width * 3); 405 | std::vector tempplane(TotalVertRes * out_width * 3); 406 | std::vector resuplane(out_width * out_height * 3); 407 | constexpr float Gamma = 2.0; 408 | 409 | if(in_height == NumScanlines) 410 | { 411 | ConvertPlane<16>(NumScanlines*in_width, pixels, &plane[NumScanlines*in_width*0 + 0]); 412 | ConvertPlane< 8>(NumScanlines*in_width, pixels, &plane[NumScanlines*in_width*1 + 0]); 413 | ConvertPlane< 0>(NumScanlines*in_width, pixels, &plane[NumScanlines*in_width*2 + 0]); 414 | 415 | #pragma omp parallel for simd schedule(static) 416 | for(unsigned n=0; n indata(in_width * in_height * 3); 422 | ConvertPlane<16>(in_height*in_width, pixels, &indata[in_height*in_width*0 + 0]); 423 | ConvertPlane< 8>(in_height*in_width, pixels, &indata[in_height*in_width*1 + 0]); 424 | ConvertPlane< 0>(in_height*in_width, pixels, &indata[in_height*in_width*2 + 0]); 425 | 426 | #pragma omp parallel for simd schedule(static) 427 | for(unsigned n=0; n(x,y); 458 | XMask[x + TotalHorizRes*1] = GetMask(x,y); 459 | XMask[x + TotalHorizRes*2] = GetMask(x,y); 460 | } 461 | 462 | #pragma omp simd collapse(1) 463 | for(unsigned n=0; n<3; ++n) 464 | for(unsigned x=0; x(x,y) + GetMask(x,y) + GetMask(x,y); } 489 | for(unsigned n=0; n<8; ++n) 490 | { facsum2 += 1; sum2 += ScanlineMagnitude(n/8.f); } 491 | float factor = facsum*facsum2 / (sum*sum2); 492 | 493 | #pragma omp parallel for simd schedule(static) 494 | for(unsigned n=0; n resuplanes(out_width * out_height * 3); 497 | std::vector 
resuplanestmp(out_width * out_height * 3); 498 | std::vector resuplaneout(out_width * out_height * 3); 499 | 500 | #pragma omp parallel for simd schedule(static) 501 | for(unsigned n=0; n(&resuplanes[n*out_width*out_height], 507 | &resuplaneout[n*out_width*out_height], 508 | &resuplanestmp[n*out_width*out_height], 509 | out_width, out_height, out_width / 640.f); 510 | } 511 | 512 | #pragma omp parallel for simd schedule(static) 513 | for(unsigned n=0; n \33[m\n"); 562 | return 1; 563 | } 564 | unsigned in_width = std::atoi(argv[1]); 565 | unsigned in_height = std::atoi(argv[2]); 566 | unsigned out_width = std::atoi(argv[3]); 567 | unsigned out_height = std::atoi(argv[4]); 568 | unsigned NumScanlines = std::atoi(argv[5]); 569 | std::vector inbuf(in_width*in_height); 570 | std::vector outbuf(out_width*out_height); 571 | 572 | constexpr unsigned NFrames = 4; 573 | newhash_t hashes[NFrames]; 574 | std::vector saved_outputs[NFrames]; 575 | std::vector saved_inputs[NFrames]; 576 | for(;;) 577 | { 578 | if(FullyRead(0, &inbuf[0], inbuf.size()*4) < (long)inbuf.size()*4) break; 579 | 580 | newhash_t hash = newhash_calc((const unsigned char*)&inbuf[0], 581 | inbuf.size()*sizeof(inbuf[0])); 582 | bool found = false; 583 | for(unsigned n=0; n