├── .gitignore
├── 1_square
    ├── Makefile
    └── square.c
├── 2_norm
    ├── Makefile
    └── norm.c
├── 3_dot
    ├── Makefile
    └── dot.c
├── 4_invsqrt
    ├── Makefile
    └── invsqrt.c
├── 5_fn_like
    ├── Makefile
    └── fn_like.c
├── 6_cache
    ├── Makefile
    └── cache.c
├── 7_fwd
    ├── Makefile
    └── fwd.c
├── 8_batch
    ├── Makefile
    └── batch.c
├── 9_multisource
    ├── Makefile
    ├── multisource.c
    ├── myblas.c
    └── myblas.h
├── README.md
├── cuda
    ├── README.md
    └── square
    │   ├── Makefile
    │   └── cuda_square.cu
├── docker
    └── Dockerfile
├── dockerscript.sh
├── julia
    └── introduction.ipynb
├── julia_activity
    ├── .ipynb_checkpoints
    │   └── activity-checkpoint.ipynb
    ├── .jupyter
    │   └── desktop-workspaces
    │   │   └── default-37a8.jupyterlab-workspace
    └── activity.jl
├── julia_custom
    └── custom.jl
├── julia_fwd_and_batch
    └── fwd_and_batch.jl
├── mpi
    ├── README.md
    └── disclaimer.txt
└── openmp
    ├── README.md
    ├── parallel_for
        ├── Makefile
        ├── OldMakefile
        └── omp_parallel_for.c
    ├── parallel_for_nounroll
        ├── Makefile
        └── omp_parallel_for_nounroll.c
    └── parallel_simple
        ├── Makefile
        └── omp_parallel.c


/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.ll
3 | 


--------------------------------------------------------------------------------
/1_square/Makefile:
--------------------------------------------------------------------------------
 1 | all: square.o
 2 | 
 3 | clean:
 4 | 	rm -f *.o *.ll
 5 | 
 6 | %.o: %.c
 7 | 	../dockerscript.sh clang-12 /host/$^ -O3 -Xclang -load -Xclang /Enzyme/enzyme/build/Enzyme/ClangEnzyme-12.so -ffast-math -o /host/$@
 8 | 
 9 | run-%: %.o
10 | 	../dockerscript.sh /host/$^ 3.14
11 | 


--------------------------------------------------------------------------------
/1_square/square.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | 
 4 | // Function to differentiate
 5 | double square(double x) {
 6 |   return x * x;
 7 | }
 8 | 
 9 | double __enzyme_autodiff(void*, ...);
10 | int enzyme_const, enzyme_dup, enzyme_out;
11 | 
12 | int main(int argc, char *argv[]) {
13 |   double x = 20;
14 |   if (argc > 1) {
15 |     x = atof(argv[1]);
16 |   }
17 | 
18 |   double grad_x = __enzyme_autodiff((void*)square, x);
19 |   printf("Gradient square(%f) = %f\n", x, grad_x);
20 | 
21 |   return 0;
22 | }
23 | 


--------------------------------------------------------------------------------
/2_norm/Makefile:
--------------------------------------------------------------------------------
 1 | all: norm-O2enzyme.o norm-enzymeO2.o norm-O2enzyme.ll norm-enzymeO2.ll norm-unopt.ll
 2 | 
 3 | clean:
 4 | 	rm -f *.o *.ll
 5 | 
 6 | %-unopt.ll: %.c
 7 | 	../dockerscript.sh clang-12 /host/$^ -O1 -Xclang -disable-llvm-passes -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o /host/$@ -S -emit-llvm
 8 | 
 9 | %-enzymeO2.ll: %-unopt.ll
10 | 	../dockerscript.sh opt-12 /host/$^ -load /Enzyme/enzyme/build/Enzyme/LLVMEnzyme-12.so -enzyme -O2 -o /host/$@ -S
11 | 
12 | %-O2enzyme.ll: %-unopt.ll
13 | 	../dockerscript.sh opt-12 /host/$^ -load /Enzyme/enzyme/build/Enzyme/LLVMEnzyme-12.so -O2 -enzyme -o /host/$@ -S
14 | 
15 | %.o: %.ll
16 | 	../dockerscript.sh clang-12 -O2 /host/$^ -o /host/$@ -lm
17 | 
18 | run-%: %.o
19 | 	../dockerscript.sh /host/$^ 2000
20 | 


--------------------------------------------------------------------------------
/2_norm/norm.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <sys/time.h>
 4 | #include <stdlib.h>
 5 | #include <math.h>
 6 | #include <assert.h>
 7 | 
 8 | float tdiff(struct timeval *start, struct timeval *end) {
 9 |   return (end->tv_sec-start->tv_sec) + 1e-6*(end->tv_usec-start->tv_usec);
10 | }
11 | 
/*
 * Returns the plain sum A[0] + ... + A[n-1] (0 when n <= 0).
 *
 * NOTE(review): GCC documents `const` for functions that do not read memory
 * through pointer arguments, yet this function reads A[i]; `pure` may be the
 * accurate attribute. Confirm the stronger attribute is deliberate for the
 * tutorial before changing it.
 */
__attribute__((const,noinline))
double mag(const double *A, int n) {
  double total = 0;
  int idx = 0;
  while (idx < n) {
    total += A[idx];
    ++idx;
  }
  return total;
}
18 | 
/*
 * Writes out[i] = in[i] / mag(in, n) for every i in [0, n).
 * mag() is invoked once per element, exactly as written; any hoisting is left
 * to the optimizer.
 */
void normalize(double *__restrict__ out, const double *__restrict__ in, int n) {
  int i = 0;
  while (i < n) {
    out[i] = in[i] / mag(in, n);
    ++i;
  }
}
23 | 
24 | void __enzyme_autodiff(void*, ...);
25 | int enzyme_const, enzyme_dup, enzyme_out;
26 | 
27 | int main(int argc, char *argv[]) {
28 |   struct timeval start, end;
29 |   int n = 1000000;
30 |   int x = 20;
31 |   if (argc > 1) {
32 |     n = atoi(argv[1]);
33 |     if (argc > 2) {
34 |       x = atoi(argv[2]);
35 |     }
36 |   }
37 | 
38 |   double *A = (double*)malloc(sizeof(double) * n);
39 |   assert(A != 0);
40 | 
41 |   double *B = (double*)malloc(sizeof(double) * n);
42 |   assert(B != 0);
43 |   for(int i=0; i<n; i++)
44 |    B[i] = x; 
45 | 
46 | 
47 |   double *grad_A = (double*)malloc(sizeof(double) * n);
48 |   assert(grad_A != 0);
49 |   for(int i = 0; i < n; i++)
50 |       grad_A[i] = 1.0;
51 | 
52 |   double *grad_B = (double*)malloc(sizeof(double) * n);
53 |   assert(grad_B != 0);
54 |   for(int i = 0; i < n; i++)
55 |       grad_B[i] = 0.0;
56 | 
57 | 
58 |   gettimeofday(&start, NULL);
59 | 
60 |   normalize(A, B, n);
61 | 
62 |   gettimeofday(&end, NULL);
63 |   printf("Serial Normalize %0.6f %f\n", tdiff(&start, &end), A[n-1]);
64 | 
65 |   gettimeofday(&start, NULL);
66 | 
67 |   __enzyme_autodiff((void*)normalize,
68 |                     enzyme_dup, A, grad_A,
69 |                     enzyme_dup, B, grad_B,
70 |                     enzyme_const, n);
71 | 
72 |   gettimeofday(&end, NULL);
73 |   printf("Gradient Normalize %0.6f %f %f\n", tdiff(&start, &end), A[n-1], grad_B[0]);
74 | 
75 |   free(A);
76 |   free(B);
77 |   return 0;
78 | }
79 | 


--------------------------------------------------------------------------------
/3_dot/Makefile:
--------------------------------------------------------------------------------
 1 | all: dot.o
 2 | 
 3 | clean:
 4 | 	rm -f *.o *.ll
 5 | 
 6 | %.o: %.c
 7 | 	../dockerscript.sh clang-12 /host/$^ -O3 -Xclang -load -Xclang /Enzyme/enzyme/build/Enzyme/ClangEnzyme-12.so -ffast-math -o /host/$@
 8 | 
 9 | run-%: %.o
10 | 	../dockerscript.sh /host/$^
11 | 


--------------------------------------------------------------------------------
/3_dot/dot.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <sys/time.h>
 4 | #include <assert.h>
 5 | 
 6 | float tdiff(struct timeval *start, struct timeval *end) {
 7 |   return (end->tv_sec-start->tv_sec) + 1e-6*(end->tv_usec-start->tv_usec);
 8 | }
 9 | 
/* Returns C plus the sum of A[i] * B[i] over i in [0, n). */
double dot(double* __restrict__ A, double* __restrict__ B, double C, int n) {
  double acc = 0;
  int k = 0;
  while (k < n) {
    acc += A[k] * B[k];
    ++k;
  }
  return C + acc;
}
17 | 
18 | double __enzyme_autodiff(void*, ...);
19 | int enzyme_const, enzyme_dup, enzyme_out;
20 | 
21 | int main(int argc, char *argv[]) {
22 |   struct timeval start, end;
23 |   int n = 20000000;
24 |   int x = 20;
25 |   if (argc > 1) {
26 |     n = atoi(argv[1]);
27 |     if (argc > 2) {
28 |       x = atoi(argv[2]);
29 |     }
30 |   }
31 | 
32 |   double *A = (double*)malloc(sizeof(double) * n);
33 |   for(int i=0; i<n; i++)
34 |    A[i] = x / (i+1);
35 |   assert(A != 0);
36 | 
37 |   double *B = (double*)malloc(sizeof(double) * n);
38 |   assert(B != 0);
39 |   for(int i=0; i<n; i++)
40 |    B[i] = x + i;
41 | 
42 | 
43 |   double *grad_A = (double*)malloc(sizeof(double) * n);
44 |   assert(grad_A != 0);
45 |   for(int i = 0; i < n; i++)
46 |       grad_A[i] = 0.0;
47 | 
48 |   double *grad_B = (double*)malloc(sizeof(double) * n);
49 |   assert(grad_B != 0);
50 |   for(int i = 0; i < n; i++)
51 |       grad_B[i] = 0.0;
52 | 
53 |   double C = 1/x;
54 | 
55 |   gettimeofday(&start, NULL);
56 | 
57 |   double grad_C = __enzyme_autodiff((void*)dot,
58 |                     A, grad_A,
59 |                     B, grad_B,
60 |                     C,
61 |                     n);
62 | 
63 |   gettimeofday(&end, NULL);
64 |   printf("Grad Normalize time=%0.6f A'[0]=%f B'[0]=%f C'=%f\n", tdiff(&start, &end), grad_A[0], grad_B[0], grad_C);
65 | 
66 |   for(int i = 0; i < n; i++)
67 |       grad_A[i] = 0.0;
68 |   for(int i = 0; i < n; i++)
69 |       grad_B[i] = 0.0;
70 | 
71 |   gettimeofday(&start, NULL);
72 | 
73 |   grad_C =__enzyme_autodiff((void*)dot,
74 |                     enzyme_const, A,
75 |                     enzyme_dup, B, grad_B,
76 |                     enzyme_out, C,
77 |                     enzyme_const, n);
78 | 
79 |   gettimeofday(&end, NULL);
80 |   printf("Constant A time=%0.6f A'[0]=%f B'[0]=%f C'=%f\n", tdiff(&start, &end), grad_A[0], grad_B[0], grad_C);
81 | 
82 |   return 0;
83 | }
84 | 


--------------------------------------------------------------------------------
/4_invsqrt/Makefile:
--------------------------------------------------------------------------------
 1 | all: invsqrt-custom.o
 2 | 
 3 | clean:
 4 | 	rm -f *.o *.ll
 5 | 
 6 | %-nocustom.o: %.c
 7 | 	../dockerscript.sh clang-12 /host/$^ -O3 -Xclang -load -Xclang /Enzyme/enzyme/build/Enzyme/ClangEnzyme-12.so -ffast-math -o /host/$@
 8 | 
 9 | %-custom.o: %.c
10 | 	../dockerscript.sh clang-12 -DCUSTOM=1 /host/$^ -O3 -Xclang -load -Xclang /Enzyme/enzyme/build/Enzyme/ClangEnzyme-12.so -ffast-math -o /host/$@
11 | 
12 | run-%: %.o
13 | 	../dockerscript.sh /host/$^ 3
14 | 


--------------------------------------------------------------------------------
/4_invsqrt/invsqrt.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <assert.h>
 4 | 
 5 | // Fast inverse sqrt
 6 | // Code taken from https://en.wikipedia.org/wiki/Fast_inverse_square_root
 7 | #ifdef CUSTOM
 8 | __attribute__((noinline))
 9 | #endif
10 | float Q_rsqrt( float number )
11 | {
12 |   long i;
13 |   float x2, y;
14 |   const float threehalfs = 1.5F;
15 | 
16 |   x2 = number * 0.5F;
17 |   y  = number;
18 |   i  = * ( long * ) &y;                       // evil floating point bit level hacking
19 |   i  = 0x5f3759df - ( i >> 1 );               // what the [...]?
20 |   y  = * ( float * ) &i;
21 |   y  = y * ( threehalfs - ( x2 * y * y ) );   // 1st iteration
22 |   return y;
23 | }
24 | 
25 | 
/*
 * Sums the squares of A[0..n) and returns Q_rsqrt of that sum.
 * The double sum is narrowed to float at the Q_rsqrt call.
 */
double invmag(double* __restrict__ A, int n) {
  double energy = 0;
  int k = 0;
  while (k < n) {
    energy += A[k] * A[k];
    ++k;
  }
  return Q_rsqrt(energy);
}
33 | 
34 | #ifdef CUSTOM
35 | 
36 | 
// Augmented forward pass registered for Q_rsqrt.
// Return convention: { optional tape, original return (if pointer), shadow return (if pointer) };
// none of these apply here, so the function returns nothing.
void aug_rsqrt(float x) {
  // Nothing needs to be done in the augmented forward pass.
  (void)x;
}
41 | 
// Reverse pass registered for Q_rsqrt.
// Arguments: all pointers duplicated, gradient of the return, tape (if provided).
// Uses d/dx x^(-1/2) = -1/2 * x^(-3/2) = -rsqrt(x) / (2x), scaled by grad_out.
float rev_rsqrt(float x, float grad_out) {
  const float numerator = -grad_out * Q_rsqrt(x);
  const float denominator = 2 * x;
  return numerator / denominator;
}
47 | 
48 | void* __enzyme_register_gradient_rsqrt[3] = { (void*)Q_rsqrt, (void*)aug_rsqrt, (void*)rev_rsqrt };
49 | 
50 | #endif
51 | 
52 | 
53 | void __enzyme_autodiff(void*, ...);
54 | int enzyme_const, enzyme_dup, enzyme_out;
55 | 
56 | int main(int argc, char *argv[]) {
57 |   int n = 3;
58 |   if (argc > 1) {
59 |     n = atoi(argv[1]);
60 |   }
61 | 
62 | 
63 |   double *A = (double*)malloc(sizeof(double) * n);
64 |   for(int i=0; i<n; i++)
65 |    A[i] = (i+1);
66 |   assert(A != 0);
67 | 
68 |   double *grad_A = (double*)malloc(sizeof(double) * n);
69 |   assert(grad_A != 0);
70 |   for(int i=0; i<n; i++)
71 |    grad_A[i] = 0;
72 | 
73 |   __enzyme_autodiff((void*)invmag, A, grad_A, n);
74 |   printf("Gradient invmag(A)[0] = %f\n", grad_A[0]);
75 | 
76 |   return 0;
77 | }
78 | 


--------------------------------------------------------------------------------
/5_fn_like/Makefile:
--------------------------------------------------------------------------------
 1 | all: fn_like.o
 2 | 
 3 | clean:
 4 | 	rm -f *.o *.ll
 5 | 
 6 | %.o: %.c
 7 | 	../dockerscript.sh clang-12 /host/$^ -O3 -Xclang -load -Xclang /Enzyme/enzyme/build/Enzyme/ClangEnzyme-12.so -ffast-math -o /host/$@
 8 | 
 9 | run-%: %.o
10 | 	../dockerscript.sh /host/$^ 3.14
11 | 


--------------------------------------------------------------------------------
/5_fn_like/fn_like.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <math.h>
 3 | #include <assert.h>
 4 | 
 5 | 
 6 | double __enzyme_autodiff(void*, ...);
 7 | 
 8 | double log1p_like_function(double a) {
 9 |   return 2*a;
10 | }
11 | 
/* Thin wrapper: forwards a to log1p_like_function and returns its result. */
double test(double a) {
  const double value = log1p_like_function(a);
  return value;
}
15 | 
16 | void* __enzyme_function_like[2] = {(void*)log1p_like_function, "log1p"}; 
17 | 
18 | int main(int argc, char** argv) {
19 | 
20 |   double grad_out = __enzyme_autodiff(test, 2.0);
21 |   printf("Gradient of the log1p like function is %f", grad_out);
22 | 
23 |   return 0;
24 | }


--------------------------------------------------------------------------------
/6_cache/Makefile:
--------------------------------------------------------------------------------
 1 | all: cache-alias.o cache-noalias.o
 2 | 
 3 | clean:
 4 | 	rm -f *.o *.ll
 5 | 
 6 | %-alias.o: %.c
 7 | 	../dockerscript.sh clang-12 -Rpass=enzyme /host/$^ -O3 -Xclang -load -Xclang /Enzyme/enzyme/build/Enzyme/ClangEnzyme-12.so -ffast-math -o /host/$@
 8 | 
 9 | %-noalias.o: %.c
10 | 	../dockerscript.sh clang-12 -Rpass=enzyme -DNOALIAS=1 /host/$^ -O3 -Xclang -load -Xclang /Enzyme/enzyme/build/Enzyme/ClangEnzyme-12.so -ffast-math -o /host/$@
11 | 
12 | run-%: %.o
13 | 	../dockerscript.sh /host/$^
14 | 


--------------------------------------------------------------------------------
/6_cache/cache.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <assert.h>
 4 | #include <sys/time.h>
 5 | 
 6 | float tdiff(struct timeval *start, struct timeval *end) {
 7 |   return (end->tv_sec-start->tv_sec) + 1e-6*(end->tv_usec-start->tv_usec);
 8 | }
 9 | 
/*
 * Pointer qualifier for squareCopy: empty by default, __restrict__ when the
 * translation unit is built with -DNOALIAS.
 */
#ifndef NOALIAS
#define SQUARECOPY_QUAL
#else
#define SQUARECOPY_QUAL __restrict__
#endif

/* Stores the sum of in[i] * in[i] for i in [0, n) into *out. */
void squareCopy(double *SQUARECOPY_QUAL in, double *SQUARECOPY_QUAL out, int n) {
  double acc = 0;
  int k = 0;
  while (k < n) {
    acc += in[k] * in[k];
    ++k;
  }
  *out = acc;
}
31 | 
32 | void __enzyme_autodiff(void*, ...);
33 | int enzyme_const, enzyme_dup, enzyme_out;
34 | 
35 | int main(int argc, char *argv[]) {
36 |   struct timeval start, end;
37 | 
38 |   int n = 30000000;
39 |   if (argc > 1) {
40 |     n = atoi(argv[1]);
41 |   }
42 | 
43 | 
44 |   double *in = (double*)malloc(sizeof(double) * n);
45 |   assert(in != 0);
46 |   for(int i=0; i<n; i++)
47 |    in[i] = (i+1);
48 | 
49 |   double *out = (double*)malloc(sizeof(double));
50 |   assert(out != 0);
51 | 
52 |   double *grad_out = (double*)malloc(sizeof(double));
53 |   assert(grad_out != 0);
54 | 
55 |   *grad_out = 1;
56 | 
57 |   double *grad_in = (double*)malloc(sizeof(double) * n);
58 |   assert(grad_in != 0);
59 |   for(int i=0; i<n; i++)
60 |    grad_in[i] = 0;
61 | 
62 |   gettimeofday(&start, NULL);
63 | 
64 |   __enzyme_autodiff((void*)squareCopy, in, grad_in, out, grad_out, n);
65 | 
66 |   gettimeofday(&end, NULL);
67 | 
68 |   printf("Gradient time=%0.6f squareCopy(in)[0] = %f\n", tdiff(&start, &end), grad_in[0]);
69 | 
70 |   return 0;
71 | }
72 | 


--------------------------------------------------------------------------------
/7_fwd/Makefile:
--------------------------------------------------------------------------------
 1 | all: fwd.o
 2 | 
 3 | clean:
 4 | 	rm -f *.o *.ll
 5 | 
 6 | %.o: %.c
 7 | 	../dockerscript.sh clang-12 /host/$^ -O3 -Xclang -load -Xclang /Enzyme/enzyme/build/Enzyme/ClangEnzyme-12.so -ffast-math -o /host/$@
 8 | 
 9 | run-%: %.o
10 | 	../dockerscript.sh /host/$^ 3.14
11 | 


--------------------------------------------------------------------------------
/7_fwd/fwd.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <math.h>
 3 | #include <assert.h>
 4 | 
 5 | double __enzyme_fwddiff(void*, ...);
 6 | 
 7 | void compute_loops(float* a, float* b, float* ret) {
 8 |   double sum0 = 0.0;
 9 |   for (int i = 0; i < 100; i++) {
10 |     sum0 += *a + *b;
11 |   }
12 |   *ret = sum0;
13 | }
14 | 
15 | int main(int argc, char** argv) {
16 |   float a = 2.0;
17 |   float b = 3.0;
18 | 
19 |   float da = 1.0;//(float*) malloc(sizeof(float));
20 |   float db = 1.0;//(float*) malloc(sizeof(float));
21 | 
22 |   float ret = 0;
23 |   float dret = 1.0;
24 | 
25 |   __enzyme_fwddiff(compute_loops, &a, &da, &b, &db, &ret, &dret);
26 |   printf("ret %f, dret %f, da: %f, db: %f\n", ret, dret, da, db);
27 | 
28 |   return 0;
29 | }


--------------------------------------------------------------------------------
/8_batch/Makefile:
--------------------------------------------------------------------------------
 1 | all: batch.o
 2 | 
 3 | clean:
 4 | 	rm -f *.o *.ll
 5 | 
 6 | %.o: %.c
 7 | 	../dockerscript.sh clang-12 /host/$^ -O3 -Xclang -load -Xclang /Enzyme/enzyme/build/Enzyme/ClangEnzyme-12.so -ffast-math -o /host/$@
 8 | 
 9 | run-%: %.o
10 | 	../dockerscript.sh /host/$^ 3.14
11 | 


--------------------------------------------------------------------------------
/8_batch/batch.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | struct Vector {
 4 |   double x1, x2, x3, x4;
 5 | };
 6 | 
 7 | extern Vector __enzyme_batch(...);
 8 | 
 9 | extern int enzyme_width;
10 | extern int enzyme_vector;
11 | extern int enzyme_scalar;
12 | 
13 | double square(double x) { return x * x; }
14 | 
15 | Vector vecsquare(double x1, double x2, double x3, double x4) {
16 |   return __enzyme_batch(square, enzyme_width, 4, enzyme_vector, x1, x2, x3, x4);
17 | }
18 | int main() {
19 |   double vals[] = {23.1, 10.0, 100.0, 3.14};
20 |   double expected[] = {square(vals[0]), square(vals[1]), square(vals[2]),
21 |                        square(vals[3])};
22 |   Vector result = vecsquare(vals[0], vals[1], vals[2], vals[3]);
23 | }


--------------------------------------------------------------------------------
/9_multisource/Makefile:
--------------------------------------------------------------------------------
 1 | all: lib.exe
 2 | 
 3 | clean:
 4 | 	rm -f *.o *.ll *.exe
 5 | 
 6 | %.o: %.c
 7 | 	../dockerscript.sh clang-12 -c -fuse-ld=lld -flto /host/$^ -O2 -ffast-math -o /host/$@
 8 | 
 9 | lib.exe: myblas.o multisource.o
10 | 	../dockerscript.sh clang-12 -fuse-ld=lld -flto /host/myblas.o /host/multisource.o -O2 -ffast-math -o /host/$@ -Wl,-mllvm=-load=/Enzyme/enzyme/build/Enzyme/LLDEnzyme-12.so
11 | 
12 | run-%: %.exe
13 | 	../dockerscript.sh /host/$^ 3
14 | 


--------------------------------------------------------------------------------
/9_multisource/multisource.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <math.h>
 4 | #include <assert.h>
 5 | #include "myblas.h"
 6 | 
 7 | double dotabs(struct complex* alpha, struct complex* beta, int n) {
 8 |   struct complex prod = myblas_cdot(alpha, beta, n);
 9 |   return myblas_cabs(prod);
10 | }
11 | 
12 | void __enzyme_autodiff(void*, ...);
13 | int enzyme_const, enzyme_dup, enzyme_out;
14 | 
15 | int main(int argc, char *argv[]) {
16 |   int n = 3;
17 |   if (argc > 1) {
18 |     n = atoi(argv[1]);
19 |   }
20 | 
21 | 
22 |   struct complex *A = (struct complex*)malloc(sizeof(struct complex) * n);
23 |   assert(A != 0);
24 |   for(int i=0; i<n; i++)
25 |    A[i] = (struct complex){(i+1), (i+2)};
26 | 
27 |   struct complex *grad_A = (struct complex*)malloc(sizeof(struct complex) * n);
28 |   assert(grad_A != 0);
29 |   for(int i=0; i<n; i++)
30 |    grad_A[i] = (struct complex){0,0};
31 | 
32 | 
33 | 
34 |   struct complex *B = (struct complex*)malloc(sizeof(struct complex) * n);
35 |   assert(B != 0);
36 |   for(int i=0; i<n; i++)
37 |    B[i] = (struct complex){-3-i, 2*i};
38 | 
39 |   struct complex *grad_B = (struct complex*)malloc(sizeof(struct complex) * n);
40 |   assert(grad_B != 0);
41 |   for(int i=0; i<n; i++)
42 |    grad_B[i] = (struct complex){0,0};
43 | 
44 |   __enzyme_autodiff((void*)dotabs, A, grad_A, B, grad_B, n);
45 |   printf("Gradient dotabs(A)[0] = %f\n", grad_A[0].r);
46 | 
47 |   return 0;
48 | }
49 | 


--------------------------------------------------------------------------------
/9_multisource/myblas.c:
--------------------------------------------------------------------------------
 1 | #include "myblas.h"
 2 | 
 3 | struct complex myblas_cdot(struct complex* x, struct complex *y, int n) {
 4 |   struct complex sum = { 0, 0 };
 5 |   for (int i=0; i<n; i++) {
 6 |     sum.r += x[i].r * y[i].r - x[i].i * y[i].i;
 7 |     sum.i += x[i].r * y[i].i + x[i].i * y[i].r;
 8 |   }
 9 |   return sum;
10 | }
11 | 
12 | double myblas_cabs(struct complex x) {
13 |     return x.r * x.r + x.i * x.i;
14 | }
15 | 


--------------------------------------------------------------------------------
/9_multisource/myblas.h:
--------------------------------------------------------------------------------
/* Minimal complex-number helpers used by the multi-source example. */
#ifndef MYBLAS_H
#define MYBLAS_H

/* Complex number: r is the real part, i the imaginary part. */
struct complex {
    double r;
    double i;
};

/* Sum over i in [0, n) of the complex products x[i] * y[i]. */
struct complex myblas_cdot(struct complex* x, struct complex *y, int n);

/* Returns x.r * x.r + x.i * x.i. */
double myblas_cabs(struct complex x);

#endif /* MYBLAS_H */
9 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Enzyme Tutorial
 2 | 
 3 | 
 4 | ## Structure
 5 | 
 6 | 1. Introduction to Automatic Differentiation
 7 | 2. Compiler-based Automatic Differentiation with Enzyme
 8 | 3. My First Derivative
 9 | 4. Custom Compiler Analyses
10 | 5. Parallel Automatic Differentiation (CPU + GPU)
11 | 6. Multi-Language Automatic Differentiation
12 | 7. Using Enzyme as a Plugin and Future Work
13 | 


--------------------------------------------------------------------------------
/cuda/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EnzymeAD/Enzyme-Tutorial/803ddd188c65087b2ebe8aa19aa3983951bd1834/cuda/README.md


--------------------------------------------------------------------------------
/cuda/square/Makefile:
--------------------------------------------------------------------------------
 1 | # Environment Variables
 2 | CUDA_PATH ?= /usr/local/cuda-11.2
 3 | ENZYME_PATH ?= /home/lpaehler/Work/Dev-Tools/llvm-13/Enzyme/enzyme/build/Enzyme
 4 | CLANG_PATH ?= /home/lpaehler/Work/Dev-Tools/llvm-13/llvm-project/build/bin/clang++
 5 | SM_VERSION  = 70
 6 | 
 7 | CC := $(CLANG_PATH)
 8 | CFLAGS = -fno-vectorize -O2 -fno-unroll-loops -fPIC --cuda-path=$(CUDA_PATH) -L$(CUDA_PATH)/lib64 --cuda-gpu-arch=sm_$(SM_VERSION)\
 9 | 		--no-cuda-version-check -Xclang -load -Xclang $(ENZYME_PATH)
10 | LDFLAGS = -lcudart_static
11 | 
12 | all: cuda_square
13 | 
14 | cuda_square: cuda_square.o
15 | 	$(CC) $(CFLAGS) cuda_square.o -o cuda_square $(LDFLAGS)


--------------------------------------------------------------------------------
/cuda/square/cuda_square.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | void __device__ square_impl(double* x_in, double *x_out) {
 4 |     x_out[0] = x_in[0] * x_in[0];
 5 | }
 6 | 
 7 | typedef void (*f_ptr)(double*, double*);
 8 | 
 9 | extern void __device__ __enzyme_autodiff(f_ptr,
10 |     int, double*, double*,
11 |     int, double*, double*
12 | );
13 | 
// Kernel wrapper for the forward computation: forwards both pointers to square_impl.
void __global__ square(double* x_in, double *x_out) {
    square_impl(x_in, x_out);
}
17 | 
18 | int __device__ enzyme_dup;
19 | int __device__ enzyme_out;
20 | int __device__ enzyme_const;
21 | 
// Kernel that invokes Enzyme on square_impl; x and y are each passed as
// enzyme_dup together with their companion buffers d_x and d_y.
void __global__ square_grad(double* x, double *d_x, double *y, double *d_y) {
    __enzyme_autodiff(square_impl, enzyme_dup, x, d_x, enzyme_dup, y, d_y);
}
29 | 
// Host driver: allocates one double per device buffer, uploads the initial
// values, launches either the forward kernel (when built with -DFORWARD) or
// the gradient kernel, then copies everything back and prints it.
int main() {

    // Device pointers
    double *x, *d_x, *y, *d_y;

    // Allocate GPU device memory (two of these statements lacked their
    // terminating semicolon, which stopped the file from compiling)
    cudaMalloc(&x, sizeof(*x));
    cudaMalloc(&d_x, sizeof(*d_x));
    cudaMalloc(&y, sizeof(*y));
    cudaMalloc(&d_y, sizeof(*d_y));

    // Initialize host-side values
    double host_x = 1.4;
    double host_d_x = 0.0;
    double host_y = 0.0;   // initialised so the upload below never reads an indeterminate value
    double host_d_y = 1.0;

    // Copy data to device
    cudaMemcpy(x, &host_x, sizeof(*x), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, &host_d_x, sizeof(*d_x), cudaMemcpyHostToDevice);
    cudaMemcpy(y, &host_y, sizeof(*y), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, &host_d_y, sizeof(*d_y), cudaMemcpyHostToDevice);

#ifdef FORWARD
    // Forward pass only
    square<<<1, 1>>>(x, y);
#else
    // Forward and backward pass
    square_grad<<<1, 1>>>(x, d_x, y, d_y);
#endif

    // Wait for the kernel to finish
    cudaDeviceSynchronize();

    // Copy data from device to host
    cudaMemcpy(&host_x, x, sizeof(*x), cudaMemcpyDeviceToHost);
    cudaMemcpy(&host_d_x, d_x, sizeof(*d_x), cudaMemcpyDeviceToHost);
    cudaMemcpy(&host_y, y, sizeof(*y), cudaMemcpyDeviceToHost);
    cudaMemcpy(&host_d_y, d_y, sizeof(*d_y), cudaMemcpyDeviceToHost);

    // Print results
    printf("%f %f\n", host_x, host_y);
    printf("%f %f\n", host_d_x, host_d_y);

    // Release device memory
    cudaFree(x);
    cudaFree(d_x);
    cudaFree(y);
    cudaFree(d_y);

    return 0;
}


--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:20.04
 2 | 
 3 | ENV DEBIAN_FRONTEND=noninteractive
 4 | 
 5 | RUN apt-get -y update && apt-get install -y --no-install-recommends curl gnupg lsb-core software-properties-common \
 6 |     && curl -fsSL https://apt.llvm.org/llvm-snapshot.gpg.key|apt-key add - \
 7 |     && apt-add-repository "deb http://apt.llvm.org/`lsb_release -c | cut -f2`/ llvm-toolchain-`lsb_release -c | cut -f2`-12 main" \
 8 |     && apt-get install -y --no-install-recommends autoconf cmake ninja-build gcc g++ libtool gfortran llvm-12-dev lld-12 clang-12 libopenmpi-dev openmpi-bin git \
 9 |     && apt-get autoremove -y --purge \
10 |     && apt-get clean -y \
11 |     && rm -rf /var/lib/apt/lists/*
12 | 
13 | # Get & install Enzyme
14 | RUN git clone https://github.com/wsmoses/Enzyme.git \
15 |     && cd Enzyme/enzyme \
16 |     && mkdir build && cd build \
17 |     && cmake -G Ninja .. -DCMAKE_BUILD_TYPE=Debug \
18 |     && ninja
19 | 
20 | RUN update-alternatives --install /usr/bin/clang clang /usr/bin/clang-12 10 --slave /usr/bin/clang++ clang++ /usr/bin/clang++-12 \
21 |     && update-alternatives --install /usr/bin/opt opt /usr/bin/opt-12 10 \
22 |     && update-alternatives --install /usr/bin/lld lld /usr/bin/lld-12 10 \
23 |     && update-alternatives --install /usr/bin/llvm-symbolizer llvm-symbolizer /usr/bin/llvm-symbolizer-12 10
24 | 
25 | ENV DEBIAN_FRONTEND=
26 | 


--------------------------------------------------------------------------------
/dockerscript.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Runs the given command inside the wsmoses/enzyme image with the current
# directory mounted at /host. The working directory is quoted so paths that
# contain spaces are passed to docker as a single argument.
docker run -v "$(pwd)":/host wsmoses/enzyme "$@"
2 | 


--------------------------------------------------------------------------------
/julia/introduction.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "source": [
  6 |     "# Enzyme.jl\n",
  7 |     "\n",
  8 |     "Julia is a high-level programming language using LLVM as a compiler backend.\n",
  9 |     "Enzyme.jl uses Julia's GPU compiler infrastructure to provide a custom optimization\n",
 10 |     "pipeline that inserts the Enzyme LLVM pass. \n",
 11 |     "\n",
 12 |     "It then uses Orc (v2/v1) to JIT-compile the adjoints and calls them through Julia's foreign-function\n",
 13 |     "interface."
 14 |    ],
 15 |    "metadata": {}
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "source": [
 20 |     "function mysum(X)\n",
 21 |     "    acc = zero(eltype(X))\n",
 22 |     "    @simd for x in X\n",
 23 |     "       acc += x\n",
 24 |     "    end\n",
 25 |     "    acc\n",
 26 |     "end"
 27 |    ],
 28 |    "metadata": {}
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "source": [
 33 |     "# Installing Enzyme\n",
 34 |     "\n",
 35 |     "Tutorial tested with Julia 1.7-beta3\n",
 36 |     "\n",
 37 |     "Using the Julia package manager:\n",
 38 |     "```julia\n",
 39 |     "import Pkg\n",
 40 |     "Pkg.add(\"Enzyme\")\n",
 41 |     "```"
 42 |    ],
 43 |    "metadata": {}
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 37,
 48 |    "source": [
 49 |     "import Pkg\n",
 50 |     "Pkg.activate(; temp=true)\n",
 51 |     "Pkg.add(Pkg.PackageSpec(name=\"Enzyme\", rev=\"822afeff2c8a9b87c8fb93c6415cc3ffb19924e8\"))\n",
 52 |     "Pkg.add(\"BenchmarkTools\")\n",
 53 |     "Pkg.add(\"ForwardDiff\")"
 54 |    ],
 55 |    "outputs": [
 56 |     {
 57 |      "output_type": "stream",
 58 |      "name": "stdout",
 59 |      "text": [
 60 |       "\u001b[32m\u001b[1m  Activating\u001b[22m\u001b[39m new project at `/tmp/jl_yAfFxB`\n",
 61 |       "\u001b[32m\u001b[1m   Resolving\u001b[22m\u001b[39m package versions...\n",
 62 |       "\u001b[32m\u001b[1m    Updating\u001b[22m\u001b[39m `/tmp/jl_yAfFxB/Project.toml`\n",
 63 |       " \u001b[90m [7da242da] \u001b[39m\u001b[92m+ Enzyme v0.7.0 `https://github.com/wsmoses/Enzyme.jl.git#822afef`\u001b[39m\n",
 64 |       "\u001b[32m\u001b[1m    Updating\u001b[22m\u001b[39m `/tmp/jl_yAfFxB/Manifest.toml`\n",
 65 |       " \u001b[90m [79e6a3ab] \u001b[39m\u001b[92m+ Adapt v3.3.1\u001b[39m\n",
 66 |       " \u001b[90m [fa961155] \u001b[39m\u001b[92m+ CEnum v0.4.1\u001b[39m\n",
 67 |       " \u001b[90m [7da242da] \u001b[39m\u001b[92m+ Enzyme v0.7.0 `https://github.com/wsmoses/Enzyme.jl.git#822afef`\u001b[39m\n",
 68 |       " \u001b[90m [e2ba6199] \u001b[39m\u001b[92m+ ExprTools v0.1.6\u001b[39m\n",
 69 |       " \u001b[90m [61eb1bfa] \u001b[39m\u001b[92m+ GPUCompiler v0.13.7\u001b[39m\n",
 70 |       " \u001b[90m [692b3bcd] \u001b[39m\u001b[92m+ JLLWrappers v1.3.0\u001b[39m\n",
 71 |       " \u001b[90m [929cbde3] \u001b[39m\u001b[92m+ LLVM v4.6.0\u001b[39m\n",
 72 |       " \u001b[90m [d8793406] \u001b[39m\u001b[92m+ ObjectFile v0.3.7\u001b[39m\n",
 73 |       " \u001b[90m [21216c6a] \u001b[39m\u001b[92m+ Preferences v1.2.2\u001b[39m\n",
 74 |       " \u001b[90m [189a3867] \u001b[39m\u001b[92m+ Reexport v1.2.2\u001b[39m\n",
 75 |       " \u001b[90m [53d494c1] \u001b[39m\u001b[92m+ StructIO v0.3.0\u001b[39m\n",
 76 |       " \u001b[90m [a759f4b9] \u001b[39m\u001b[92m+ TimerOutputs v0.5.13\u001b[39m\n",
 77 |       " \u001b[90m [7cc45869] \u001b[39m\u001b[92m+ Enzyme_jll v0.0.21+0\u001b[39m\n",
 78 |       " \u001b[90m [dad2f222] \u001b[39m\u001b[92m+ LLVMExtra_jll v0.0.11+0\u001b[39m\n",
 79 |       " \u001b[90m [0dad84c5] \u001b[39m\u001b[92m+ ArgTools\u001b[39m\n",
 80 |       " \u001b[90m [56f22d72] \u001b[39m\u001b[92m+ Artifacts\u001b[39m\n",
 81 |       " \u001b[90m [2a0f44e3] \u001b[39m\u001b[92m+ Base64\u001b[39m\n",
 82 |       " \u001b[90m [ade2ca70] \u001b[39m\u001b[92m+ Dates\u001b[39m\n",
 83 |       " \u001b[90m [f43a241f] \u001b[39m\u001b[92m+ Downloads\u001b[39m\n",
 84 |       " \u001b[90m [b77e0a4c] \u001b[39m\u001b[92m+ InteractiveUtils\u001b[39m\n",
 85 |       " \u001b[90m [b27032c2] \u001b[39m\u001b[92m+ LibCURL\u001b[39m\n",
 86 |       " \u001b[90m [76f85450] \u001b[39m\u001b[92m+ LibGit2\u001b[39m\n",
 87 |       " \u001b[90m [8f399da3] \u001b[39m\u001b[92m+ Libdl\u001b[39m\n",
 88 |       " \u001b[90m [37e2e46d] \u001b[39m\u001b[92m+ LinearAlgebra\u001b[39m\n",
 89 |       " \u001b[90m [56ddb016] \u001b[39m\u001b[92m+ Logging\u001b[39m\n",
 90 |       " \u001b[90m [d6f4376e] \u001b[39m\u001b[92m+ Markdown\u001b[39m\n",
 91 |       " \u001b[90m [ca575930] \u001b[39m\u001b[92m+ NetworkOptions\u001b[39m\n",
 92 |       " \u001b[90m [44cfe95a] \u001b[39m\u001b[92m+ Pkg\u001b[39m\n",
 93 |       " \u001b[90m [de0858da] \u001b[39m\u001b[92m+ Printf\u001b[39m\n",
 94 |       " \u001b[90m [3fa0cd96] \u001b[39m\u001b[92m+ REPL\u001b[39m\n",
 95 |       " \u001b[90m [9a3f8284] \u001b[39m\u001b[92m+ Random\u001b[39m\n",
 96 |       " \u001b[90m [ea8e919c] \u001b[39m\u001b[92m+ SHA\u001b[39m\n",
 97 |       " \u001b[90m [9e88b42a] \u001b[39m\u001b[92m+ Serialization\u001b[39m\n",
 98 |       " \u001b[90m [6462fe0b] \u001b[39m\u001b[92m+ Sockets\u001b[39m\n",
 99 |       " \u001b[90m [fa267f1f] \u001b[39m\u001b[92m+ TOML\u001b[39m\n",
100 |       " \u001b[90m [a4e569a6] \u001b[39m\u001b[92m+ Tar\u001b[39m\n",
101 |       " \u001b[90m [8dfed614] \u001b[39m\u001b[92m+ Test\u001b[39m\n",
102 |       " \u001b[90m [cf7118a7] \u001b[39m\u001b[92m+ UUIDs\u001b[39m\n",
103 |       " \u001b[90m [4ec0a83e] \u001b[39m\u001b[92m+ Unicode\u001b[39m\n",
104 |       " \u001b[90m [e66e0078] \u001b[39m\u001b[92m+ CompilerSupportLibraries_jll\u001b[39m\n",
105 |       " \u001b[90m [deac9b47] \u001b[39m\u001b[92m+ LibCURL_jll\u001b[39m\n",
106 |       " \u001b[90m [29816b5a] \u001b[39m\u001b[92m+ LibSSH2_jll\u001b[39m\n",
107 |       " \u001b[90m [c8ffd9c3] \u001b[39m\u001b[92m+ MbedTLS_jll\u001b[39m\n",
108 |       " \u001b[90m [14a3606d] \u001b[39m\u001b[92m+ MozillaCACerts_jll\u001b[39m\n",
109 |       " \u001b[90m [4536629a] \u001b[39m\u001b[92m+ OpenBLAS_jll\u001b[39m\n",
110 |       " \u001b[90m [83775a58] \u001b[39m\u001b[92m+ Zlib_jll\u001b[39m\n",
111 |       " \u001b[90m [8e850b90] \u001b[39m\u001b[92m+ libblastrampoline_jll\u001b[39m\n",
112 |       " \u001b[90m [8e850ede] \u001b[39m\u001b[92m+ nghttp2_jll\u001b[39m\n",
113 |       " \u001b[90m [3f19e933] \u001b[39m\u001b[92m+ p7zip_jll\u001b[39m\n",
114 |       "\u001b[32m\u001b[1m   Resolving\u001b[22m\u001b[39m package versions...\n",
115 |       "\u001b[32m\u001b[1m    Updating\u001b[22m\u001b[39m `/tmp/jl_yAfFxB/Project.toml`\n",
116 |       " \u001b[90m [6e4b80f9] \u001b[39m\u001b[92m+ BenchmarkTools v1.2.0\u001b[39m\n",
117 |       "\u001b[32m\u001b[1m    Updating\u001b[22m\u001b[39m `/tmp/jl_yAfFxB/Manifest.toml`\n",
118 |       " \u001b[90m [6e4b80f9] \u001b[39m\u001b[92m+ BenchmarkTools v1.2.0\u001b[39m\n",
119 |       " \u001b[90m [682c06a0] \u001b[39m\u001b[92m+ JSON v0.21.2\u001b[39m\n",
120 |       " \u001b[90m [69de0a69] \u001b[39m\u001b[92m+ Parsers v2.1.2\u001b[39m\n",
121 |       " \u001b[90m [a63ad114] \u001b[39m\u001b[92m+ Mmap\u001b[39m\n",
122 |       " \u001b[90m [9abbd945] \u001b[39m\u001b[92m+ Profile\u001b[39m\n",
123 |       " \u001b[90m [2f01184e] \u001b[39m\u001b[92m+ SparseArrays\u001b[39m\n",
124 |       " \u001b[90m [10745b16] \u001b[39m\u001b[92m+ Statistics\u001b[39m\n",
125 |       "\u001b[32m\u001b[1m   Resolving\u001b[22m\u001b[39m package versions...\n",
126 |       "\u001b[32m\u001b[1m    Updating\u001b[22m\u001b[39m `/tmp/jl_yAfFxB/Project.toml`\n",
127 |       " \u001b[90m [f6369f11] \u001b[39m\u001b[92m+ ForwardDiff v0.10.23\u001b[39m\n",
128 |       "\u001b[32m\u001b[1m    Updating\u001b[22m\u001b[39m `/tmp/jl_yAfFxB/Manifest.toml`\n",
129 |       " \u001b[90m [d360d2e6] \u001b[39m\u001b[92m+ ChainRulesCore v1.11.1\u001b[39m\n",
130 |       " \u001b[90m [9e997f8a] \u001b[39m\u001b[92m+ ChangesOfVariables v0.1.1\u001b[39m\n",
131 |       " \u001b[90m [bbf7d656] \u001b[39m\u001b[92m+ CommonSubexpressions v0.3.0\u001b[39m\n",
132 |       " \u001b[90m [34da2185] \u001b[39m\u001b[92m+ Compat v3.40.0\u001b[39m\n",
133 |       " \u001b[90m [163ba53b] \u001b[39m\u001b[92m+ DiffResults v1.0.3\u001b[39m\n",
134 |       " \u001b[90m [b552c78f] \u001b[39m\u001b[92m+ DiffRules v1.4.0\u001b[39m\n",
135 |       " \u001b[90m [ffbed154] \u001b[39m\u001b[92m+ DocStringExtensions v0.8.6\u001b[39m\n",
136 |       " \u001b[90m [f6369f11] \u001b[39m\u001b[92m+ ForwardDiff v0.10.23\u001b[39m\n",
137 |       " \u001b[90m [3587e190] \u001b[39m\u001b[92m+ InverseFunctions v0.1.2\u001b[39m\n",
138 |       " \u001b[90m [92d709cd] \u001b[39m\u001b[92m+ IrrationalConstants v0.1.1\u001b[39m\n",
139 |       " \u001b[90m [2ab3a3ac] \u001b[39m\u001b[92m+ LogExpFunctions v0.3.5\u001b[39m\n",
140 |       " \u001b[90m [1914dd2f] \u001b[39m\u001b[92m+ MacroTools v0.5.9\u001b[39m\n",
141 |       " \u001b[90m [77ba4419] \u001b[39m\u001b[92m+ NaNMath v0.3.5\u001b[39m\n",
142 |       " \u001b[90m [276daf66] \u001b[39m\u001b[92m+ SpecialFunctions v1.8.1\u001b[39m\n",
143 |       " \u001b[90m [90137ffa] \u001b[39m\u001b[92m+ StaticArrays v1.2.13\u001b[39m\n",
144 |       " \u001b[90m [efe28fd5] \u001b[39m\u001b[92m+ OpenSpecFun_jll v0.5.5+0\u001b[39m\n",
145 |       " \u001b[90m [8bb1440f] \u001b[39m\u001b[92m+ DelimitedFiles\u001b[39m\n",
146 |       " \u001b[90m [8ba89e20] \u001b[39m\u001b[92m+ Distributed\u001b[39m\n",
147 |       " \u001b[90m [1a1011a3] \u001b[39m\u001b[92m+ SharedArrays\u001b[39m\n",
148 |       " \u001b[90m [05823500] \u001b[39m\u001b[92m+ OpenLibm_jll\u001b[39m\n"
149 |      ]
150 |     }
151 |    ],
152 |    "metadata": {}
153 |   },
154 |   {
155 |    "cell_type": "code",
156 |    "execution_count": 38,
157 |    "source": [
158 |     "using Enzyme\n",
159 |     "using ForwardDiff\n",
160 |     "using BenchmarkTools"
161 |    ],
162 |    "outputs": [],
163 |    "metadata": {}
164 |   },
165 |   {
166 |    "cell_type": "markdown",
167 |    "source": [
168 |     "# Activity annotations\n",
169 |     "- `Const`\n",
170 |     "- `Active`\n",
171 |     "- `Duplicated`\n",
172 |     "- `DuplicatedNoNeed`"
173 |    ],
174 |    "metadata": {}
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": 39,
179 |    "source": [
180 |     "square(x) = x^2"
181 |    ],
182 |    "outputs": [
183 |     {
184 |      "data": {
185 |       "text/plain": [
186 |        "square (generic function with 1 method)"
187 |       ]
188 |      },
189 |      "metadata": {},
190 |      "output_type": "display_data"
191 |     }
192 |    ],
193 |    "metadata": {}
194 |   },
195 |   {
196 |    "cell_type": "code",
197 |    "execution_count": 40,
198 |    "source": [
199 |     "autodiff(square, 1.0)"
200 |    ],
201 |    "outputs": [
202 |     {
203 |      "data": {
204 |       "text/plain": [
205 |        "()"
206 |       ]
207 |      },
208 |      "metadata": {},
209 |      "output_type": "display_data"
210 |     }
211 |    ],
212 |    "metadata": {}
213 |   },
214 |   {
215 |    "cell_type": "markdown",
216 |    "source": [
217 |     "Default activity for values is `Const`"
218 |    ],
219 |    "metadata": {}
220 |   },
221 |   {
222 |    "cell_type": "code",
223 |    "execution_count": 41,
224 |    "source": [
225 |     "autodiff(square, Const(1.0))"
226 |    ],
227 |    "outputs": [
228 |     {
229 |      "data": {
230 |       "text/plain": [
231 |        "()"
232 |       ]
233 |      },
234 |      "metadata": {},
235 |      "output_type": "display_data"
236 |     }
237 |    ],
238 |    "metadata": {}
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": 42,
243 |    "source": [
244 |     "autodiff(square, Active(1.0))"
245 |    ],
246 |    "outputs": [
247 |     {
248 |      "data": {
249 |       "text/plain": [
250 |        "(2.0,)"
251 |       ]
252 |      },
253 |      "metadata": {},
254 |      "output_type": "display_data"
255 |     }
256 |    ],
257 |    "metadata": {}
258 |   },
259 |   {
260 |    "cell_type": "markdown",
261 |    "source": [
262 |     "## Supporting mutating functions\n",
263 |     "\n",
264 |     "Enzyme can differentiate through mutating functions. This requires that the user pass in the shadow variables with the `Duplicated` or `DuplicatedNoNeed` activity annotation."
265 |    ],
266 |    "metadata": {}
267 |   },
268 |   {
269 |    "cell_type": "code",
270 |    "execution_count": 43,
271 |    "source": [
272 |     "function cube(y, x)\n",
273 |     "\ty[] = x[]^3\n",
274 |     "\treturn nothing\n",
275 |     "end"
276 |    ],
277 |    "outputs": [
278 |     {
279 |      "data": {
280 |       "text/plain": [
281 |        "cube (generic function with 1 method)"
282 |       ]
283 |      },
284 |      "metadata": {},
285 |      "output_type": "display_data"
286 |     }
287 |    ],
288 |    "metadata": {}
289 |   },
290 |   {
291 |    "cell_type": "code",
292 |    "execution_count": 44,
293 |    "source": [
294 |     "x = Ref(4.0)\n",
295 |     "y = Ref(0.0)\n",
296 |     "cube(y, x)\n",
297 |     "y[]"
298 |    ],
299 |    "outputs": [
300 |     {
301 |      "data": {
302 |       "text/plain": [
303 |        "64.0"
304 |       ]
305 |      },
306 |      "metadata": {},
307 |      "output_type": "display_data"
308 |     }
309 |    ],
310 |    "metadata": {}
311 |   },
312 |   {
313 |    "cell_type": "markdown",
314 |    "source": [
315 |     "\n",
316 |     "In order to calculate the gradient with respect to `x`, we have to seed the\n",
317 |     "shadow `dy` with `1.0`.\n"
318 |    ],
319 |    "metadata": {}
320 |   },
321 |   {
322 |    "cell_type": "code",
323 |    "execution_count": 45,
324 |    "source": [
325 |     "x = Ref(4.0)\n",
326 |     "dx = Ref(0.0)\n",
327 |     "\n",
328 |     "y = Ref(0.0)\n",
329 |     "dy = Ref(1.0)\n",
330 |     "\n",
331 |     "autodiff(cube, Duplicated(y, dy), Duplicated(x, dx))\n",
332 |     "y[], dy[], x[], dx[]"
333 |    ],
334 |    "outputs": [
335 |     {
336 |      "data": {
337 |       "text/plain": [
338 |        "(64.0, 0.0, 4.0, 48.0)"
339 |       ]
340 |      },
341 |      "metadata": {},
342 |      "output_type": "display_data"
343 |     }
344 |    ],
345 |    "metadata": {}
346 |   },
347 |   {
348 |    "cell_type": "markdown",
349 |    "source": [
350 |     "# Reflection"
351 |    ],
352 |    "metadata": {}
353 |   },
354 |   {
355 |    "cell_type": "code",
356 |    "execution_count": 46,
357 |    "source": [
358 |     "\n",
359 |     "Enzyme.Compiler.enzyme_code_llvm(cube, Const,\n",
360 |     "\tTuple{Enzyme.Duplicated{Base.RefValue{Float64}}, \n",
361 |     "\tDuplicated{Base.RefValue{Float64}}}, debuginfo=:none)"
362 |    ],
363 |    "outputs": [
364 |     {
365 |      "output_type": "stream",
366 |      "name": "stdout",
367 |      "text": [
368 |       "; Function Attrs: alwaysinline\n",
369 |       "define void @diffejulia_cube_9969wrap({}* %0, {}* %1, {}* %2, {}* %3) #3 {\n",
370 |       "entry:\n",
371 |       "  %\"'ipc6.i\" = bitcast {}* %3 to double*\n",
372 |       "  %4 = bitcast {}* %2 to double*\n",
373 |       "  %5 = load double, double* %4, align 8\n",
374 |       "  %6 = fmul double %5, %5\n",
375 |       "  %7 = fmul double %5, %6\n",
376 |       "  %\"'ipc.i\" = bitcast {}* %1 to double*\n",
377 |       "  %8 = bitcast {}* %0 to double*\n",
378 |       "  store double %7, double* %8, align 8\n",
379 |       "  %9 = load double, double* %\"'ipc.i\", align 8\n",
380 |       "  store double 0.000000e+00, double* %\"'ipc.i\", align 8\n",
381 |       "  %10 = load double, double* %\"'ipc6.i\", align 8\n",
382 |       "  %11 = fmul fast double %6, 3.000000e+00\n",
383 |       "  %reass.mul = fmul fast double %11, %9\n",
384 |       "  %12 = fadd fast double %reass.mul, %10\n",
385 |       "  store double %12, double* %\"'ipc6.i\", align 8\n",
386 |       "  ret void\n",
387 |       "}\n"
388 |      ]
389 |     }
390 |    ],
391 |    "metadata": {}
392 |   },
393 |   {
394 |    "cell_type": "markdown",
395 |    "source": [
396 |     "# Differentiating through control-flow\n",
397 |     "Let's differentiate through some control flow. This kind of scalar code is where one would normally use `ForwardDiff.jl`, since machine-learning-oriented toolkits like Zygote have unacceptable overheads."
398 |    ],
399 |    "metadata": {}
400 |   },
401 |   {
402 |    "cell_type": "code",
403 |    "execution_count": 47,
404 |    "source": [
405 |     "# Taylor series for `-log(1-x)`\n",
406 |     "# eval at -log(1-1/2) = -log(1/2)\n",
407 |     "function taylor(f::T, N=10^7) where T\n",
408 |     "    g = zero(T)\n",
409 |     "    for i in 1:N\n",
410 |     "        g += f^i / i\n",
411 |     "    end\n",
412 |     "    return g\n",
413 |     "end\n",
414 |     "\n",
415 |     "autodiff(taylor, Active(0.5), Const(10^8))\n"
416 |    ],
417 |    "outputs": [
418 |     {
419 |      "data": {
420 |       "text/plain": [
421 |        "(2.0,)"
422 |       ]
423 |      },
424 |      "metadata": {},
425 |      "output_type": "display_data"
426 |     }
427 |    ],
428 |    "metadata": {}
429 |   },
430 |   {
431 |    "cell_type": "code",
432 |    "execution_count": 48,
433 |    "source": [
434 |     "fwd_taylor(x) = ForwardDiff.derivative(taylor, 0.5)\n",
435 |     "\n",
436 |     "enz_taylor(x) = autodiff(taylor, Active(x))\n"
437 |    ],
438 |    "outputs": [
439 |     {
440 |      "data": {
441 |       "text/plain": [
442 |        "enz_taylor (generic function with 1 method)"
443 |       ]
444 |      },
445 |      "metadata": {},
446 |      "output_type": "display_data"
447 |     }
448 |    ],
449 |    "metadata": {}
450 |   },
451 |   {
452 |    "cell_type": "code",
453 |    "execution_count": 49,
454 |    "source": [
455 |     "\n",
456 |     "@benchmark fwd_taylor($(Ref(0.5))[])"
457 |    ],
458 |    "outputs": [
459 |     {
460 |      "data": {
461 |       "text/plain": [
462 |        "BenchmarkTools.Trial: 6 samples with 1 evaluation.\n",
463 |        " Range \u001b[90m(\u001b[39m\u001b[36m\u001b[1mmin\u001b[22m\u001b[39m … \u001b[35mmax\u001b[39m\u001b[90m):  \u001b[39m\u001b[36m\u001b[1m869.791 ms\u001b[22m\u001b[39m … \u001b[35m  1.031 s\u001b[39m  \u001b[90m┊\u001b[39m GC \u001b[90m(\u001b[39mmin … max\u001b[90m): \u001b[39m0.00% … 0.00%\n",
464 |        " Time  \u001b[90m(\u001b[39m\u001b[34m\u001b[1mmedian\u001b[22m\u001b[39m\u001b[90m):     \u001b[39m\u001b[34m\u001b[1m903.407 ms              \u001b[22m\u001b[39m\u001b[90m┊\u001b[39m GC \u001b[90m(\u001b[39mmedian\u001b[90m):    \u001b[39m0.00%\n",
465 |        " Time  \u001b[90m(\u001b[39m\u001b[32m\u001b[1mmean\u001b[22m\u001b[39m ± \u001b[32mσ\u001b[39m\u001b[90m):   \u001b[39m\u001b[32m\u001b[1m919.832 ms\u001b[22m\u001b[39m ± \u001b[32m61.069 ms\u001b[39m  \u001b[90m┊\u001b[39m GC \u001b[90m(\u001b[39mmean ± σ\u001b[90m):  \u001b[39m0.00% ± 0.00%\n",
466 |        "\n",
467 |        "  \u001b[39m█\u001b[39m \u001b[39m█\u001b[39m█\u001b[34m \u001b[39m\u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[32m \u001b[39m\u001b[39m \u001b[39m█\u001b[39m \u001b[39m \u001b[39m█\u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m█\u001b[39m \u001b[39m \n",
468 |        "  \u001b[39m█\u001b[39m▁\u001b[39m█\u001b[39m█\u001b[34m▁\u001b[39m\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[32m▁\u001b[39m\u001b[39m▁\u001b[39m█\u001b[39m▁\u001b[39m▁\u001b[39m█\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m█\u001b[39m \u001b[39m▁\n",
469 |        "  870 ms\u001b[90m          Histogram: frequency by time\u001b[39m          1.03 s \u001b[0m\u001b[1m<\u001b[22m\n",
470 |        "\n",
471 |        " Memory estimate\u001b[90m: \u001b[39m\u001b[33m0 bytes\u001b[39m, allocs estimate\u001b[90m: \u001b[39m\u001b[33m0\u001b[39m."
472 |       ]
473 |      },
474 |      "metadata": {},
475 |      "output_type": "display_data"
476 |     }
477 |    ],
478 |    "metadata": {}
479 |   },
480 |   {
481 |    "cell_type": "code",
482 |    "execution_count": 50,
483 |    "source": [
484 |     "@benchmark enz_taylor($(Ref(0.5))[])"
485 |    ],
486 |    "outputs": [
487 |     {
488 |      "data": {
489 |       "text/plain": [
490 |        "BenchmarkTools.Trial: 11 samples with 1 evaluation.\n",
491 |        " Range \u001b[90m(\u001b[39m\u001b[36m\u001b[1mmin\u001b[22m\u001b[39m … \u001b[35mmax\u001b[39m\u001b[90m):  \u001b[39m\u001b[36m\u001b[1m473.358 ms\u001b[22m\u001b[39m … \u001b[35m553.634 ms\u001b[39m  \u001b[90m┊\u001b[39m GC \u001b[90m(\u001b[39mmin … max\u001b[90m): \u001b[39m0.00% … 0.00%\n",
492 |        " Time  \u001b[90m(\u001b[39m\u001b[34m\u001b[1mmedian\u001b[22m\u001b[39m\u001b[90m):     \u001b[39m\u001b[34m\u001b[1m489.023 ms               \u001b[22m\u001b[39m\u001b[90m┊\u001b[39m GC \u001b[90m(\u001b[39mmedian\u001b[90m):    \u001b[39m0.00%\n",
493 |        " Time  \u001b[90m(\u001b[39m\u001b[32m\u001b[1mmean\u001b[22m\u001b[39m ± \u001b[32mσ\u001b[39m\u001b[90m):   \u001b[39m\u001b[32m\u001b[1m495.482 ms\u001b[22m\u001b[39m ± \u001b[32m 23.014 ms\u001b[39m  \u001b[90m┊\u001b[39m GC \u001b[90m(\u001b[39mmean ± σ\u001b[90m):  \u001b[39m0.00% ± 0.00%\n",
494 |        "\n",
495 |        "  \u001b[39m▁\u001b[39m▁\u001b[39m \u001b[39m \u001b[39m▁\u001b[39m \u001b[39m \u001b[39m \u001b[39m▁\u001b[39m \u001b[39m \u001b[34m█\u001b[39m\u001b[39m \u001b[39m \u001b[39m \u001b[39m▁\u001b[39m▁\u001b[32m \u001b[39m\u001b[39m \u001b[39m▁\u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m▁\u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m \u001b[39m▁\u001b[39m \u001b[39m \n",
496 |        "  \u001b[39m█\u001b[39m█\u001b[39m▁\u001b[39m▁\u001b[39m█\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m█\u001b[39m▁\u001b[39m▁\u001b[34m█\u001b[39m\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m█\u001b[39m█\u001b[32m▁\u001b[39m\u001b[39m▁\u001b[39m█\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m█\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m▁\u001b[39m█\u001b[39m \u001b[39m▁\n",
497 |        "  473 ms\u001b[90m           Histogram: frequency by time\u001b[39m          554 ms \u001b[0m\u001b[1m<\u001b[22m\n",
498 |        "\n",
499 |        " Memory estimate\u001b[90m: \u001b[39m\u001b[33m16 bytes\u001b[39m, allocs estimate\u001b[90m: \u001b[39m\u001b[33m1\u001b[39m."
500 |       ]
501 |      },
502 |      "metadata": {},
503 |      "output_type": "display_data"
504 |     }
505 |    ],
506 |    "metadata": {}
507 |   },
508 |   {
509 |    "cell_type": "markdown",
510 |    "source": [
511 |     "\n",
512 |     "# Differentiating through more complicated codes\n",
513 |     "\n",
514 |     "## A custom matrix multiply"
515 |    ],
516 |    "metadata": {}
517 |   },
518 |   {
519 |    "cell_type": "code",
520 |    "execution_count": 51,
521 |    "source": [
522 |     "\n",
523 |     "function mymul!(R, A, B)\n",
524 |     "    @assert axes(A,2) == axes(B,1)\n",
525 |     "    @inbounds @simd for i in eachindex(R)\n",
526 |     "        R[i] = 0\n",
527 |     "    end\n",
528 |     "    @inbounds for j in axes(B, 2), i in axes(A, 1)\n",
529 |     "        @inbounds @simd for k in axes(A,2)\n",
530 |     "            R[i,j] += A[i,k] * B[k,j]\n",
531 |     "        end\n",
532 |     "    end\n",
533 |     "    nothing\n",
534 |     "end"
535 |    ],
536 |    "outputs": [
537 |     {
538 |      "data": {
539 |       "text/plain": [
540 |        "mymul! (generic function with 1 method)"
541 |       ]
542 |      },
543 |      "metadata": {},
544 |      "output_type": "display_data"
545 |     }
546 |    ],
547 |    "metadata": {}
548 |   },
549 |   {
550 |    "cell_type": "code",
551 |    "execution_count": 52,
552 |    "source": [
553 |     "A = rand(1024, 64)\n",
554 |     "B = rand(64, 512)\n",
555 |     "\n",
556 |     "R = zeros(size(A,1), size(B,2))\n",
557 |     "∂z_∂R = rand(size(R)...)  # Some gradient/tangent passed to us\n",
558 |     "\n",
559 |     "∂z_∂A = zero(A)\n",
560 |     "∂z_∂B = zero(B)\n",
561 |     "\n",
562 |     "Enzyme.autodiff(mymul!, \n",
563 |     "\tDuplicated(R, ∂z_∂R),\n",
564 |     "\tDuplicated(A, ∂z_∂A),\n",
565 |     "\tDuplicated(B, ∂z_∂B))"
566 |    ],
567 |    "outputs": [
568 |     {
569 |      "data": {
570 |       "text/plain": [
571 |        "()"
572 |       ]
573 |      },
574 |      "metadata": {},
575 |      "output_type": "display_data"
576 |     }
577 |    ],
578 |    "metadata": {}
579 |   },
580 |   {
581 |    "cell_type": "markdown",
582 |    "source": [
583 |     "\n",
584 |     "Let's confirm the correctness of the result"
585 |    ],
586 |    "metadata": {}
587 |   },
588 |   {
589 |    "cell_type": "code",
590 |    "execution_count": 53,
591 |    "source": [
592 |     "R ≈ A * B"
593 |    ],
594 |    "outputs": [
595 |     {
596 |      "data": {
597 |       "text/plain": [
598 |        "true"
599 |       ]
600 |      },
601 |      "metadata": {},
602 |      "output_type": "display_data"
603 |     }
604 |    ],
605 |    "metadata": {}
606 |   },
607 |   {
608 |    "cell_type": "markdown",
609 |    "source": [
610 |     "and correctness of the gradients"
611 |    ],
612 |    "metadata": {}
613 |   },
614 |   {
615 |    "cell_type": "code",
616 |    "execution_count": 54,
617 |    "source": [
618 |     "∂z_∂A ≈ ∂z_∂R * B'"
619 |    ],
620 |    "outputs": [
621 |     {
622 |      "data": {
623 |       "text/plain": [
624 |        "true"
625 |       ]
626 |      },
627 |      "metadata": {},
628 |      "output_type": "display_data"
629 |     }
630 |    ],
631 |    "metadata": {}
632 |   },
633 |   {
634 |    "cell_type": "markdown",
635 |    "source": [
636 |     "# Some more fun"
637 |    ],
638 |    "metadata": {}
639 |   },
640 |   {
641 |    "cell_type": "code",
642 |    "execution_count": 55,
643 |    "source": [
644 |     "struct LList\n",
645 |     "    next::Union{LList,Nothing}\n",
646 |     "\tval::Float64\n",
647 |     "end \n",
648 |     "\n",
649 |     "function sumlist(n::LList)\n",
650 |     "    sum = 0.0\n",
651 |     "    while n !== nothing\n",
652 |     "        sum += n.val\n",
653 |     "        n = n.next\n",
654 |     "    end\n",
655 |     "    sum\n",
656 |     "end"
657 |    ],
658 |    "outputs": [
659 |     {
660 |      "data": {
661 |       "text/plain": [
662 |        "sumlist (generic function with 1 method)"
663 |       ]
664 |      },
665 |      "metadata": {},
666 |      "output_type": "display_data"
667 |     }
668 |    ],
669 |    "metadata": {}
670 |   },
671 |   {
672 |    "cell_type": "code",
673 |    "execution_count": 56,
674 |    "source": [
675 |     "regular = LList(LList(nothing, 1.0), 2.0)\n",
676 |     "shadow  = LList(LList(nothing, 0.0), 0.0)\n",
677 |     "autodiff(sumlist, Duplicated(regular, shadow))"
678 |    ],
679 |    "outputs": [
680 |     {
681 |      "data": {
682 |       "text/plain": [
683 |        "()"
684 |       ]
685 |      },
686 |      "metadata": {},
687 |      "output_type": "display_data"
688 |     }
689 |    ],
690 |    "metadata": {}
691 |   },
692 |   {
693 |    "cell_type": "code",
694 |    "execution_count": 57,
695 |    "source": [
696 |     "shadow.val ≈ 1.0"
697 |    ],
698 |    "outputs": [
699 |     {
700 |      "data": {
701 |       "text/plain": [
702 |        "true"
703 |       ]
704 |      },
705 |      "metadata": {},
706 |      "output_type": "display_data"
707 |     }
708 |    ],
709 |    "metadata": {}
710 |   },
711 |   {
712 |    "cell_type": "code",
713 |    "execution_count": 58,
714 |    "source": [
715 |     "shadow.next.val ≈ 1.0"
716 |    ],
717 |    "outputs": [
718 |     {
719 |      "data": {
720 |       "text/plain": [
721 |        "true"
722 |       ]
723 |      },
724 |      "metadata": {},
725 |      "output_type": "display_data"
726 |     }
727 |    ],
728 |    "metadata": {}
729 |   },
730 |   {
731 |    "cell_type": "markdown",
732 |    "source": [
733 |     "# Differentiating through Parallelism"
734 |    ],
735 |    "metadata": {}
736 |   },
737 |   {
738 |    "cell_type": "code",
739 |    "execution_count": 59,
740 |    "source": [
741 |     "function tasktest(M, x)\n",
742 |     "    xr = Ref(x)\n",
743 |     "    task = Threads.@spawn begin\n",
744 |     "        @inbounds M[1] = xr[]\n",
745 |     "    end\n",
746 |     "    @inbounds M[2] = x\n",
747 |     "    wait(task)\n",
748 |     "    nothing\n",
749 |     "end"
750 |    ],
751 |    "outputs": [
752 |     {
753 |      "data": {
754 |       "text/plain": [
755 |        "tasktest (generic function with 1 method)"
756 |       ]
757 |      },
758 |      "metadata": {},
759 |      "output_type": "display_data"
760 |     }
761 |    ],
762 |    "metadata": {}
763 |   },
764 |   {
765 |    "cell_type": "code",
766 |    "execution_count": 60,
767 |    "source": [
768 |     "R = Float64[0., 0.]\n",
769 |     "dR = Float64[2., 3.]\n",
770 |     "\n",
771 |     "Enzyme.autodiff(tasktest, Duplicated(R, dR), Active(2.0))"
772 |    ],
773 |    "outputs": [
774 |     {
775 |      "output_type": "stream",
776 |      "name": "stdout",
777 |      "text": [
778 |       "┌ Warning: active variables passeed by value to jl_new_task are not yet supported\n",
779 |       "└ @ Enzyme.Compiler /home/vchuravy/.julia/packages/Enzyme/2n29R/src/compiler.jl:212\n"
780 |      ]
781 |     },
782 |     {
783 |      "data": {
784 |       "text/plain": [
785 |        "(5.0,)"
786 |       ]
787 |      },
788 |      "metadata": {},
789 |      "output_type": "display_data"
790 |     }
791 |    ],
792 |    "metadata": {}
793 |   },
794 |   {
795 |    "cell_type": "code",
796 |    "execution_count": 61,
797 |    "source": [
798 |     "Float64[2.0, 2.0] ≈ R\n",
799 |     "Float64[0.0, 0.0] ≈ dR"
800 |    ],
801 |    "outputs": [
802 |     {
803 |      "data": {
804 |       "text/plain": [
805 |        "true"
806 |       ]
807 |      },
808 |      "metadata": {},
809 |      "output_type": "display_data"
810 |     }
811 |    ],
812 |    "metadata": {}
813 |   },
814 |   {
815 |    "cell_type": "markdown",
816 |    "source": [
817 |     "# Using the Enzyme API to integrate with a different language\n",
818 |     "\n",
819 |     "Enzyme exports a C-API (https://github.com/wsmoses/Enzyme/blob/main/enzyme/Enzyme/CApi.h)\n",
820 |     "\n",
821 |     "## Steps\n",
822 |     "\n",
823 |     "1. Obtain the LLVM IR of the code you want to differentiate\n",
824 |     "2. Run an early set of optimizations\n",
825 |     "3. Provide `TypeTree` information and activity for the input arguments\n",
826 |     "4. Register custom adjoints for runtime functions\n",
827 |     "5. Run `EnzymeCreateAugmentedPrimal`/`EnzymeCreatePrimalAndGradient` to synthesize gradients\n",
828 |     "6. Compile and Link gradient code -- maybe using Orc\n",
829 |     "7. Call from user program -- ABI can be finicky"
830 |    ],
831 |    "metadata": {}
832 |   }
833 |  ],
834 |  "metadata": {
835 |   "orig_nbformat": 4,
836 |   "language_info": {
837 |    "name": "python"
838 |   }
839 |  },
840 |  "nbformat": 4,
841 |  "nbformat_minor": 2
842 | }


--------------------------------------------------------------------------------
/julia_activity/.ipynb_checkpoints/activity-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "attachments": {},
  5 |    "cell_type": "markdown",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Installing Enzyme"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "import Pkg\n",
 18 |     "Pkg.add(\"Enzyme\")"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": null,
 24 |    "metadata": {},
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "using Enzyme"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "attachments": {},
 32 |    "cell_type": "markdown",
 33 |    "metadata": {},
 34 |    "source": [
 35 |     "## Activity Annotations\n",
 36 |     "\n",
 37 |     "* `Const`\n",
 38 |     "* `Active`\n",
 39 |     "* `Duplicated`\n",
 40 |     "* `DuplicatedNoNeed`"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "attachments": {},
 45 |    "cell_type": "markdown",
 46 |    "metadata": {},
 47 |    "source": [
 48 |     "square(x) = x^2"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": null,
 54 |    "metadata": {},
 55 |    "outputs": [],
 56 |    "source": [
 57 |     "autodiff(Reverse, square, 1.0)"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "attachments": {},
 62 |    "cell_type": "markdown",
 63 |    "metadata": {},
 64 |    "source": [
 65 |     "Default activity for values is `Const`"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "attachments": {},
 70 |    "cell_type": "markdown",
 71 |    "metadata": {},
 72 |    "source": [
 73 |     "autodiff(Reverse, square, Const(1.0))"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "attachments": {},
 78 |    "cell_type": "markdown",
 79 |    "metadata": {},
 80 |    "source": [
 81 |     "When adding the `Active` annotation Enzyme differentiates with respect to the argument"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "attachments": {},
 86 |    "cell_type": "markdown",
 87 |    "metadata": {},
 88 |    "source": [
 89 |     "autodiff(Reverse, square, Active(1.0))"
 90 |    ]
 91 |   }
 92 |  ],
 93 |  "metadata": {
 94 |   "kernelspec": {
 95 |    "display_name": "Python 3",
 96 |    "language": "python",
 97 |    "name": "python3"
 98 |   },
 99 |   "language_info": {
100 |    "name": "python",
101 |    "version": "3.11.1"
102 |   },
103 |   "orig_nbformat": 4,
104 |   "vscode": {
105 |    "interpreter": {
106 |     "hash": "5c7b89af1651d0b8571dde13640ecdccf7d5a6204171d6ab33e7c296e100e08a"
107 |    }
108 |   }
109 |  },
110 |  "nbformat": 4,
111 |  "nbformat_minor": 2
112 | }
113 | 


--------------------------------------------------------------------------------
/julia_activity/.jupyter/desktop-workspaces/default-37a8.jupyterlab-workspace:
--------------------------------------------------------------------------------
1 | {"data":{"layout-restorer:data":{"main":{"dock":{"type":"tab-area","currentIndex":0,"widgets":["notebook:activity.ipynb"]},"current":"notebook:activity.ipynb"},"down":{"size":0,"widgets":[]},"left":{"collapsed":false,"current":"filebrowser","widgets":["filebrowser","running-sessions","@jupyterlab/toc:plugin","extensionmanager.main-view"]},"right":{"collapsed":true,"widgets":["jp-property-inspector","debugger-sidebar"]},"relativeSizes":[0.15111378687537627,0.8488862131246238,0]},"notebook:activity.ipynb":{"data":{"path":"activity.ipynb","factory":"Notebook"}}},"metadata":{"id":"default"}}


--------------------------------------------------------------------------------
/julia_activity/activity.jl:
--------------------------------------------------------------------------------
 1 | using Enzyme
 2 | using Printf
 3 | 
 4 | # Defining the square function
 5 | square(x) = x^2;
 6 | 
 7 | # No activity annotations (the autodiff result is a nested container, not a Float64, hence `%s`)
 8 | result_1 = Enzyme.autodiff(Reverse, square, 1.0)
 9 | @printf("No annotations result: %s\n", result_1)
10 | 
11 | # No activity annotations = constant annotation
12 | result_2 = Enzyme.autodiff(Reverse, square, Const(1.0))
13 | @printf("Equals constant annotations: %s\n", result_2)
14 | 
15 | # Adding activity annotations
16 | result_3 = Enzyme.autodiff(Reverse, square, Active(1.0))
17 | @printf("Adding activity annotations: %s\n", result_3)


--------------------------------------------------------------------------------
/julia_custom/custom.jl:
--------------------------------------------------------------------------------
 1 | using Enzyme
 2 | using Enzyme: EnzymeRules
 3 | 
 4 | # Primal functions: the scalar square `f` and its in-place array counterpart `f_ip`
 5 | f(x) = x^2;
 6 | 
 7 | function f_ip(x)
 8 |     x[1] = x[1] * x[1]  # square the first element in place
 9 |     nothing
10 | end
11 |  
12 |  import .EnzymeRules: augmented_primal, reverse, Annotation, has_rrule, has_rrule_from_sig
13 |  using .EnzymeRules
14 | 
15 |  function augmented_primal(config::ConfigWidth{1}, func::Const{typeof(f)}, ::Type{<:Active}, x::Active)
16 |     # Forward sweep of the custom rule for `f`: the primal is evaluated only
17 |     # when the configuration asks for it; the two remaining AugmentedReturn
18 |     # fields are left as `nothing`.
19 |     primal = needs_primal(config) ? func.val(x.val) : nothing
20 |     return AugmentedReturn(primal, nothing, nothing)
21 | end
22 | 
23 | function reverse(config::ConfigWidth{1}, ::Const{typeof(f)}, dret::Active, tape, x::Active)
24 |     # Reverse sweep of the custom rule for `f`: the term 2*x*dret plus a
25 |     # constant offset, 10 when the primal is needed and 100 otherwise
26 |     # (presumably a marker showing which path of the custom rule ran).
27 |     offset = needs_primal(config) ? 10 : 100
28 |     return (offset + 2*x.val*dret.val,)
29 | end
30 | 
31 | function augmented_primal(::Config{false, false, 1}, func::Const{typeof(f_ip)}, ::Type{<:Const}, x::Duplicated)
32 |     saved = x.val[1]           # value before the update; returned as the tape
33 |     x.val[1] = saved * saved   # in-place primal: square the first entry
34 |     return AugmentedReturn(nothing, nothing, saved)
35 | end
36 | 
37 | function reverse(::Config{false, false, 1}, ::Const{typeof(f_ip)}, ::Type{<:Const}, tape, x::Duplicated)
38 |     x.dval[1] = 100 + tape * x.dval[1]  # taped value times the current shadow entry, offset by 100
39 |     return ()
40 | end
41 | 
42 | # Apply Enzyme in reverse mode to `f`, directly and nested inside a closure; [1][1] picks the first entry of the nested result
43 | Enzyme.autodiff(Enzyme.Reverse, f, Active(2.0))[1][1];
44 | Enzyme.autodiff(Enzyme.Reverse, x->f(x)^2, Active(2.0))[1][1];
45 | 
46 | x = [2.0];
47 | dx = [1.0];
48 | # In-place variant: `dx` is passed as the shadow of `x` via Duplicated
49 | Enzyme.autodiff(Enzyme.Reverse, f_ip, Duplicated(x, dx));
50 | 


--------------------------------------------------------------------------------
/julia_fwd_and_batch/fwd_and_batch.jl:
--------------------------------------------------------------------------------
 1 | using Enzyme
 2 | # Test function: writes x[1]*x[1] + x[2]*x[1] into y[1] and returns nothing.
 3 | function f(x::Array{Float64}, y::Array{Float64})
 4 |     a, b = x[1], x[2]
 5 |     y[1] = a * a + b * a
 6 |     return nothing
 7 | end;
 8 | 
 9 | # Forward mode: seed the input tangent dx (here the first unit vector, i.e. the direction of x[1])
10 | x  = [2.0, 2.0];
11 | dx = [1.0, 0.0];
12 | y  = [0.0];
13 | dy = [0.0];
14 | 
15 | # The shadow of the second Duplicated argument (dy) then stores the output tangent
16 | Enzyme.autodiff(Forward, f, Duplicated(x, dx), Duplicated(y, dy))
17 | 
18 | # To propagate several tangents at once and obtain the Hessian from a single autodiff call,
19 | # seed one tangent per batch entry in the following way
20 | y = [0.0];
21 | x = [2.0, 2.0];
22 | 
23 | vdy = ([0.0],[0.0]);
24 | vdx = ([1.0, 0.0], [0.0, 1.0]);
25 | 
26 | bx = [0.0, 0.0];
27 | by = [1.0];
28 | vdbx = ([0.0, 0.0], [0.0, 0.0]);
29 | vdby = ([0.0], [0.0]);
30 | 
31 | # The AD call then takes the following form: forward mode applied to a deferred autodiff of f
32 | Enzyme.autodiff(
33 |     Forward,
34 |     (x,y) -> Enzyme.autodiff_deferred(f, x, y),
35 |     BatchDuplicated(Duplicated(x, bx), Duplicated.(vdx, vdbx)),
36 |     BatchDuplicated(Duplicated(y, by), Duplicated.(vdy, vdby)),
37 | );
38 | 


--------------------------------------------------------------------------------
/mpi/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EnzymeAD/Enzyme-Tutorial/803ddd188c65087b2ebe8aa19aa3983951bd1834/mpi/README.md


--------------------------------------------------------------------------------
/mpi/disclaimer.txt:
--------------------------------------------------------------------------------
 1 | MPI_Allgather
 2 | MPI_Allreduce
 3 | MPI_Barrier
 4 | MPI_Bcast
 5 | MPI_Comm_free
 6 | MPI_Comm_rank
 7 | MPI_Comm_size
 8 | MPI_Comm_split
 9 | MPI_Finalize
10 | MPI_Gather
11 | MPI_Scatter
12 | MPI_Get_count
13 | MPI_Graph_create
14 | MPI_Init
15 | MPI_Intercomm_create
16 | MPI_Irecv
17 | MPI_Issend
18 | MPI_Probe
19 | MPI_Recv
20 | MPI_Reduce
21 | MPI_Send
22 | MPI_Test
23 | MPI_Wait
24 | 


--------------------------------------------------------------------------------
/openmp/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EnzymeAD/Enzyme-Tutorial/803ddd188c65087b2ebe8aa19aa3983951bd1834/openmp/README.md


--------------------------------------------------------------------------------
/openmp/parallel_for/Makefile:
--------------------------------------------------------------------------------
 1 | # Local (non-Docker) toolchain; override on the command line, e.g. `make CLANG=/path/to/clang`
 2 | CLANG = /home/lpaehler/Work/Dev-Tools/llvm-fortran/f18-llvm-project/build/bin/clang
 3 | LLVM_PATH = /home/lpaehler/Work/Dev-Tools/llvm-fortran/f18-llvm-project/build
 4 | ENZYME_PATH = /home/lpaehler/Work/AutomaticDifferentiation/Enzyme/build/Enzyme/LLVMEnzyme-13.so
 5 | LLVM13_PATH = /home/lpaehler/Work/AutomaticDifferentiation/llvm-project/build
 6 | 
 7 | # Default target: the only C source in this directory is omp_parallel_for.c
 8 | all: omp_parallel_for.o
 9 | 
10 | clean:
11 | 	rm -f *.o *.ll
12 | 
13 | %.o: %.c
14 | 	$(CLANG) $< -O3 -Xclang -load -Xclang $(ENZYME_PATH) -ffast-math -fopenmp -o $@
15 | 
16 | run-%: %.o
17 | 	./$<
18 | 


--------------------------------------------------------------------------------
/openmp/parallel_for/OldMakefile:
--------------------------------------------------------------------------------
 1 | all: omp_parallel_for.o
 2 | 
 3 | clean:
 4 | 	rm -f *.o *.ll
 5 | # dockerscript.sh lives at the repository root, two levels above this directory
 6 | %.o: %.c
 7 | 	../../dockerscript.sh clang-12 /host/$^ -O3 -Xclang -load -Xclang /Enzyme/enzyme/build/Enzyme/ClangEnzyme-12.so -ffast-math -fopenmp=libomp -o /host/$@
 8 | 
 9 | run-%: %.o
10 | 	../../dockerscript.sh /host/$^
11 | 


--------------------------------------------------------------------------------
/openmp/parallel_for/omp_parallel_for.c:
--------------------------------------------------------------------------------
 1 | #include <omp.h>
 2 | 
 3 | 
 4 | // Square the first npoints entries of x in place, inside an OpenMP parallel for
 5 | void omp(float *x, int npoints) {
 6 | #pragma omp parallel for
 7 |     for (int idx = 0; idx < npoints; ++idx) {
 8 |         const float v = x[idx];
 9 |         x[idx] = v * v;
10 |     }
11 | }
12 | 
13 | 
14 | double __enzyme_autodiff(void*, ...);
15 | 
16 | int main() {
17 | 
18 |     // Initialize array with the values 0.5, 1.5, 2.5, ...
19 |     float array[1000];
20 |     for(int i=0; i<1000; i++) {
21 |         array[i] = i + 0.5f;
22 |     }
23 | 
24 |     // Array that hosts the gradients, passed next to `array`; starts at one
25 |     float d_array[1000];
26 |     for(int i=0; i<1000; i++) {
27 |         d_array[i] = 1.0f;
28 |     }
29 | 
30 |     // With -DFORWARD run only the primal kernel omp(); otherwise differentiate it
31 | #ifdef FORWARD
32 |     omp(array, 1000);
33 | #else
34 |     __enzyme_autodiff((void*)omp, array, d_array, 1000);
35 | #endif
36 | 
37 |     return 0;
38 | }
39 | 


--------------------------------------------------------------------------------
/openmp/parallel_for_nounroll/Makefile:
--------------------------------------------------------------------------------
 1 | all: omp_parallel_for_nounroll.o
 2 | 
 3 | clean:
 4 | 	rm -f *.o *.ll
 5 | # dockerscript.sh lives at the repository root, two levels above this directory
 6 | %.o: %.c
 7 | 	../../dockerscript.sh clang-12 /host/$^ -O3 -Xclang -load -Xclang /Enzyme/enzyme/build/Enzyme/ClangEnzyme-12.so -ffast-math -fopenmp -o /host/$@
 8 | 
 9 | run-%: %.o
10 | 	../../dockerscript.sh /host/$^


--------------------------------------------------------------------------------
/openmp/parallel_for_nounroll/omp_parallel_for_nounroll.c:
--------------------------------------------------------------------------------
 1 | #include <omp.h>
 2 | 
 3 | 
 4 | // Square the first npoints entries of x in place; the loop carries a nounroll pragma
 5 | void omp(float *x, int npoints) {
 6 | #pragma omp parallel for
 7 | #pragma nounroll
 8 |     for (int idx = 0; idx < npoints; ++idx) {
 9 |         const float v = x[idx];
10 |         x[idx] = v * v;
11 |     }
12 | }
13 | 
14 | 
15 | double __enzyme_autodiff(void*, ...);
16 | 
17 | int main() {
18 | 
19 |     // Initialize array with the values 0.5, 1.5, 2.5, ...
20 |     float array[1000];
21 |     for(int i=0; i<1000; i++) {
22 |         array[i] = i + 0.5f;
23 |     }
24 | 
25 |     // Array that hosts the gradients, passed next to `array`; starts at one
26 |     float d_array[1000];
27 |     for(int i=0; i<1000; i++) {
28 |         d_array[i] = 1.0f;
29 |     }
30 | 
31 |     // With -DFORWARD run only the primal kernel omp(); otherwise differentiate it
32 | #ifdef FORWARD
33 |     omp(array, 1000);
34 | #else
35 |     __enzyme_autodiff((void*)omp, array, d_array, 1000);
36 | #endif
37 | 
38 |     return 0;
39 | }


--------------------------------------------------------------------------------
/openmp/parallel_simple/Makefile:
--------------------------------------------------------------------------------
 1 | all: omp_parallel.o
 2 | 
 3 | clean:
 4 | 	rm -f *.o *.ll
 5 | # dockerscript.sh lives at the repository root, two levels above this directory
 6 | %.o: %.c
 7 | 	../../dockerscript.sh clang-12 /host/$^ -O3 -Xclang -load -Xclang /Enzyme/enzyme/build/Enzyme/ClangEnzyme-12.so -ffast-math -fopenmp -o /host/$@
 8 | 
 9 | run-%: %.o
10 | 	../../dockerscript.sh /host/$^


--------------------------------------------------------------------------------
/openmp/parallel_simple/omp_parallel.c:
--------------------------------------------------------------------------------
 1 | #include <omp.h>
 2 | 
 3 | // Add i to x[istart+i] for every i in [0, ipoints). Only the slice starting at
 4 | // istart is read and written, so calls on disjoint slices do not touch each other.
 5 | void subdomain_change(float *x, int istart, int ipoints) {
 6 |     for (int i = 0; i < ipoints; i++) {
 7 |         x[istart+i] += i;
 8 |     }
 9 | }
10 | 
11 | // Split x[0 .. npoints) into one contiguous slice per OpenMP thread and let
12 | // each thread update its own slice through subdomain_change().
13 | void sub(float *x, int npoints) {
14 |     int iam, nt, ipoints, istart;
15 | #pragma omp parallel default(shared) private(iam, nt, ipoints, istart)
16 |     {
17 |         iam = omp_get_thread_num();
18 |         nt = omp_get_num_threads();
19 |         // Size of partition
20 |         ipoints = npoints / nt;
21 | 
22 |         // Starting array index
23 |         istart = ipoints * iam;
24 | 
25 |         // The last thread also takes the remainder of the division
26 |         if (iam == nt - 1) {
27 |             ipoints = npoints - istart;
28 |         }
29 |         subdomain_change(x, istart, ipoints);
30 |     }
31 | }
32 | 
33 | 
34 | void __enzyme_autodiff(void*, ...);
35 | 
36 | int main() {
37 |     // Number of entries in the primal array and in its gradient array
38 |     enum { N = 1000 };
39 | 
40 |     // Primal values start at zero
41 |     float array[N];
42 |     // Gradient values start at one
43 |     float d_array[N];
44 | 
45 |     for (int k = 0; k < N; ++k) {
46 |         array[k] = 0.0f;
47 |         d_array[k] = 1.0f;
48 |     }
49 | 
50 |     // Only the primal sub() is run here; the Enzyme call is disabled.
51 |     // To differentiate instead, replace the call below with
52 |     //     __enzyme_autodiff((void*)sub, array, d_array, 1000);
53 |     // Until then d_array is initialised but not used.
54 |     // (The disabled variant was selected with #ifdef FORWARD / #else.)
55 |     sub(array, N);
56 | 
57 |     return 0;
58 | }
59 | 


--------------------------------------------------------------------------------