├── .gitignore ├── AY07 ├── LectureFiniteDifferences.pdf ├── 1_OMP_static_schedule.cpp ├── 2_OMP_dynamic_schedule.cpp └── 3_OMP_reduction.cpp ├── AY04 ├── Debug │ ├── 1.cpp │ ├── README.md │ ├── 4.cpp │ ├── 2.cpp │ ├── 3.cpp │ └── 5.cpp ├── job.sh ├── 5valgrind.cpp ├── 2aritmetica_punteros.cpp ├── 1punteros.cpp ├── 4matrix_heap.cpp ├── README.md └── 3arrays.cpp ├── AY05 ├── job.sh ├── 1MPI_hello_world.cpp ├── 3MPI_dot.cpp ├── 4MPI_mat_vec.cpp ├── 2MPI_sum.cpp ├── 5MPI_variable_collective.cpp └── README.md ├── AY01 ├── Paralelo.py ├── README.md ├── Tutorial_joblib_2_reuse.ipynb ├── Tutorial_joblib_1_basics.ipynb └── Comparaciones.ipynb ├── AY03 ├── 1hello_world.cpp ├── README.md ├── 4punteros.cpp ├── 5funciones.cpp ├── 2variables.cpp ├── 6arrays.cpp └── 3flujos.cpp ├── AY06 ├── 1_OMP_helloworld.cpp ├── 2_OMP_loops.cpp ├── 3_OMP_matvec.cpp └── DataLocality.ipynb ├── AY09 ├── README.md └── 06_reduction_padding.ipynb ├── AY08 ├── PyOpenCL │ └── README.md └── Numba │ ├── 6_Numba_caching.ipynb │ ├── 4_Numba_race_condition.ipynb │ ├── 1_PythonDecorators.ipynb │ ├── 2_Numba_vector_addition.ipynb │ ├── 3_Numba_vector_addition_parallel.ipynb │ └── 5_Numba_data_types.ipynb ├── README.md └── AY02 └── Tutorial_joblib_5_shared_variables.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | *.out 3 | AY06/Ejemplo -------------------------------------------------------------------------------- /AY07/LectureFiniteDifferences.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShescBlank/IMT2112/HEAD/AY07/LectureFiniteDifferences.pdf -------------------------------------------------------------------------------- /AY04/Debug/1.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main() { 4 | 5 | printf("Hello World\n"); 6 | 7 | return 0; 8 | } 
-------------------------------------------------------------------------------- /AY04/Debug/README.md: -------------------------------------------------------------------------------- 1 | OJO: Los códigos de esta carpeta pueden contener errores y solo los usaremos para realizar el ejercicio de encontrar los problemas que hay o puede haber. -------------------------------------------------------------------------------- /AY04/Debug/4.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main() { int a = 100; for (int i = 0; i < a; i++) { printf("%d\n", i); } printf("Este código es un poco extraño\n"); printf("Por favor no entreguen algo así :cc\n"); return 0; } -------------------------------------------------------------------------------- /AY04/job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --partition=full 4 | 5 | #SBATCH --job-name=IMT2112 6 | #SBATCH --output=log.out 7 | 8 | #SBATCH --ntasks=1 9 | #SBATCH --cpus-per-task=1 10 | 11 | g++ 1punteros.cpp 12 | ./a.out -------------------------------------------------------------------------------- /AY05/job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --partition=full 4 | 5 | #SBATCH --job-name=IMT2112 6 | #SBATCH --output=log.out 7 | 8 | ### Notar que la línea siguiente define la cantidad de procesos que queremos utilizar (no agregamos -np en el mpirun): 9 | #SBATCH --ntasks=4 10 | #SBATCH --cpus-per-task=1 11 | 12 | mpic++ 1MPI_hello_world.cpp -std=c++11 13 | mpirun ./a.out -------------------------------------------------------------------------------- /AY04/Debug/2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main() { 4 | 5 | int count = 0; 6 | int lim1 = 10; 7 | int lim2 = 20; 8 | 9 | for (int i = 0; i < lim1; i++) 10 | { 11 | for (int j = 0; j < 
lim2; i++) 12 | { 13 | count++; 14 | } 15 | } 16 | 17 | printf("El resultado de la cuenta es: %d\n", count); 18 | 19 | return 0; 20 | } -------------------------------------------------------------------------------- /AY01/Paralelo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from math import sqrt 3 | from joblib import Parallel 4 | from joblib import delayed 5 | import time as tm 6 | 7 | print("Comenzando a calcular...") 8 | start = tm.time() 9 | parallel_pool = Parallel(n_jobs=4) 10 | parallel_sqrt = delayed(sqrt) 11 | parallel_tasks = [parallel_sqrt(i) for i in range(10000000)] 12 | parallel_results = parallel_pool(parallel_tasks) 13 | end = tm.time() 14 | print(f"Tiempo total: {end - start}s") -------------------------------------------------------------------------------- /AY04/Debug/3.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() { 5 | 6 | int length1 = 10; 7 | int length2 = 5; 8 | int** arrays = (int**)calloc(length1, sizeof(int*)); 9 | 10 | for (int i = 0; i < length1; ++i) 11 | { 12 | arrays[i] = (int*)malloc(length2 * sizeof(int)); 13 | 14 | for (int j = 0; j < length2; j++) 15 | { 16 | arrays[i][j] = j*i; 17 | } 18 | } 19 | 20 | free(arrays); 21 | 22 | return 0; 23 | } -------------------------------------------------------------------------------- /AY07/1_OMP_static_schedule.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Comandos útiles: 3 | Compilar código: g++ -o a1.out 1_OMP_static_schedule.cpp -fopenmp 4 | Correr código: ./a1.out 5 | Podemos correrlo con time (antes del ./a1.out) para ver cuánto se demora en total. 
6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include "iostream" 12 | #include // Sleep 13 | 14 | using namespace std; 15 | 16 | int main() { 17 | 18 | #pragma omp parallel for num_threads(4) // schedule(static) 19 | for (int i = 1; i < 20; i++) { 20 | int id = omp_get_thread_num(); 21 | sleep(i); 22 | cout << "Thread " << id << " durmió " << i << " segundos ZzZ..." << endl; 23 | } 24 | 25 | return 0; 26 | } 27 | -------------------------------------------------------------------------------- /AY07/2_OMP_dynamic_schedule.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Comandos útiles: 3 | Compilar código: g++ -o a2.out 2_OMP_dynamic_schedule.cpp -fopenmp 4 | Correr código: ./a2.out 5 | Podemos correrlo con time (antes del ./a2.out) para ver cuánto se demora en total. 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include "iostream" 12 | #include // Sleep 13 | 14 | using namespace std; 15 | 16 | int main() { 17 | 18 | #pragma omp parallel for num_threads(4) schedule(dynamic) 19 | for (int i = 1; i < 20; i++) { 20 | int id = omp_get_thread_num(); 21 | sleep(i); 22 | cout << "Thread " << id << " durmió " << i << " segundos ZzZ..." 
<< endl; 23 | } 24 | 25 | return 0; 26 | } 27 | -------------------------------------------------------------------------------- /AY03/1hello_world.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: g++ 1hello_world.cpp -o name_output 4 | - Correr: ./name_output 5 | */ 6 | 7 | // Incluimos la librería standard I/O 8 | #include // incluye al printf 9 | 10 | // Las dos líneas siguientes son para el cout 11 | #include 12 | using namespace std; 13 | 14 | // La función main es la que se ejecuta cuando se corre el código 15 | int main() { 16 | 17 | printf("Hello World\n"); // printf no tiene salto de línea 18 | 19 | // "\n" representa un salto de línea 20 | // "\t" representa un tab (4 espacios) 21 | 22 | cout << "Hello " << "World"; 23 | cout << " 2" << endl; // endl agrega el salto de línea 24 | 25 | // Debemos retornar un número porque así fue definido main 26 | return 0; 27 | } -------------------------------------------------------------------------------- /AY04/Debug/5.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int* create_array(int n) 5 | { 6 | int array[n]; 7 | for (int i = 0; i < n; i++) 8 | { 9 | array[i] = i; 10 | } 11 | return array; 12 | } 13 | 14 | int main() { 15 | 16 | // Creemos un array con la función anterior 17 | int n = 10; 18 | 19 | // Nos retorna un puntero de int 20 | int* array1 = create_array(n); 21 | 22 | // Imprimamos sus elementos para ver que todo esté bien 23 | for (int i = 0; i < n; i++) 24 | { 25 | printf("%d ", array1[i]); 26 | } 27 | printf("\n"); 28 | 29 | // Ahora creemos otro array 30 | int* array2 = create_array(n); 31 | 32 | // E imprimamos 33 | for (int i = 0; i < n; i++) 34 | { 35 | printf("%d ", array2[i]); 36 | } 37 | printf("\n"); 38 | 39 | return 0; 40 | } -------------------------------------------------------------------------------- /AY03/README.md: 
-------------------------------------------------------------------------------- 1 | ## Compilar Código: (desde la consola y deben ubicarse en la posición de los archivos (utilizando cd)) 2 | 3 | - g++ codigo.gcc (genera un archivo ejecutable a.out) 4 | - g++ codigo.gcc -o archivo (genera un archivo ejecutable archivo.out) 5 | 6 | 7 | ## Correr Código: 8 | - ```./archivo.out``` 9 | - ```valgrind ./archivo.out``` (Para debuggear) 10 | 11 | 12 | Operaciones de Calculo: + - * / %(modulo) 13 | 14 | ```x += 1``` es equivalente a ```x = x+1``` (lo mismo para otras operaciones excepto modulo) 15 | 16 | Para suma y resta esto es equivalente a ```x++``` o ```x--``` 17 | 18 | 19 | Operaciones Comparativas: == != < > <= >= 20 | 21 | 22 | Operaciones Logicas: &&(and) ||(or) 23 | 24 | 25 | printf: %i(int) $d(int) %f(float) %lf(double) 26 | 27 | 28 | sizeof(tipo_variable): devuelve el tamaño en bytes de ese tipo de variable 29 | 30 | 31 | ## Si quieren aprender más o profundizar sobre lo visto en la ayudantía les recomiendo el siguiente [enlace](https://github.com/DCCentral-de-Apuntes/intro-C) 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /AY03/4punteros.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: g++ 4punteros.cpp -o name_output 4 | - Correr: ./name_output 5 | */ 6 | 7 | #include 8 | 9 | int main() { 10 | // Declaremos un int y démosle un valor 11 | int a = 4; 12 | 13 | // Declaremos un puntero, en este caso es un puntero a un valor int 14 | int* b; 15 | 16 | // Hagamos que el valor de b sea el puntero de a (anteponer & para acceder al puntero de una variable) 17 | b = &a; 18 | 19 | printf("Mi valor es %d y mi puntero es %p\n", a, b); 20 | 21 | // Los punteros tienen muchas utilidades ya que podemos utilizarlos para acceder al valor guardado en memoria: 22 | // (* a la izquierda de la variable para acceder al valor 
apuntado) 23 | *b += 1; 24 | 25 | printf("Mi valor es %d y mi puntero es %p\n", a, b); 26 | 27 | // También podemos hacer punteros de punteros 28 | int** c = &b; 29 | 30 | // Nuevamente podemos utilizarlo para modificar a: 31 | *(*c) += 1; 32 | 33 | printf("Mi valor es %d y mi puntero es %p y el puntero de mi puntero es %p\n", a, b, c); 34 | 35 | return 0; 36 | } -------------------------------------------------------------------------------- /AY06/1_OMP_helloworld.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Comandos útiles: 3 | Compilar código: g++ -o name_output 1_OMP_helloworld.cpp -fopenmp 4 | Correr código: ./name_output 5 | */ 6 | 7 | #include 8 | #include 9 | 10 | int main() { 11 | 12 | // Trabajaremos con thread y usaremos OpenMP 13 | 14 | // Con la siguiente línea, generamos un bloque de código que será ejecutado por múltiple threads: 15 | #pragma omp parallel // en mi computador se utilizan 8 threads por defecto 16 | { 17 | printf("Hello World\n"); 18 | int id = omp_get_thread_num(); 19 | int total = omp_get_num_threads(); 20 | printf("Greetings from process %d out of %d \n", id, total); 21 | } 22 | 23 | printf("\n=================================================================\n"); 24 | 25 | // Pero también podemos decirle cuántos threads queremos 26 | #pragma omp parallel num_threads(1) 27 | { 28 | printf("Hello World\n"); 29 | int id = omp_get_thread_num(); 30 | int total = omp_get_num_threads(); 31 | printf("Greetings from process %d out of %d \n", id, total); 32 | } 33 | 34 | return 0; 35 | } 36 | 37 | -------------------------------------------------------------------------------- /AY03/5funciones.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: g++ 5funciones.cpp -o name_output 4 | - Correr: ./name_output 5 | */ 6 | 7 | #include 8 | 9 | int suma(int x1, int x2) 10 | { 11 | x1 += 1; 12 | x2 += 2; 13 | 
return x1 + x2; 14 | } 15 | 16 | void actualizar(int* x) // no retorna nada 17 | { 18 | *x = 1; // modifico la variable desde dentro de una función mediante su puntero 19 | } 20 | 21 | float promedio(float x, float y, float z) 22 | { 23 | return (x + y + z) / 3; 24 | } 25 | 26 | int main() { 27 | 28 | int a = 5; 29 | int b = 10; 30 | 31 | printf("a = %d (antes de actualizar())\n", a); 32 | 33 | actualizar(&a); 34 | 35 | printf("a = %d (después de actualizar())\n\n", a); 36 | 37 | int c = suma(a, b); // no modifica a y b ya que crea una copia de ellas en la función 38 | 39 | printf("Valores finales:\n"); 40 | printf("a = %d\n", a); // Notar que a y b no se modifican en la función suma 41 | printf("b = %d\n", b); 42 | printf("c = %d\n", c); 43 | printf("Promedio = %f\n", promedio(1.1, 5.7, 13.59)); 44 | 45 | return 0; 46 | } 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /AY05/1MPI_hello_world.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: mpic++ 1MPI_hello_world.cpp -std=c++11 4 | - Correr (con 2 procesos): mpirun -np 2 ./a.out 5 | Se puede cambiar el número de procesos con el que se corre 6 | */ 7 | 8 | #include // Nuevo include! 
9 | #include 10 | 11 | int main(int argc, char** argv) { 12 | // int a = 10; 13 | // printf("Valor de a: %d\n", a); 14 | 15 | // Initialize the MPI environment (Message Passing Interface) 16 | MPI_Init(NULL, NULL); 17 | 18 | // Get the number of processes 19 | int world_size; 20 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 21 | 22 | // El MPI_COMM_WORLD es el comunicador de nuestro grupo de procesos 23 | 24 | // Get the rank of the process 25 | int world_rank; 26 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 27 | 28 | // Get the name of the processor 29 | char processor_name[MPI_MAX_PROCESSOR_NAME]; 30 | int name_len; 31 | MPI_Get_processor_name(processor_name, &name_len); 32 | 33 | // Print off a hello world message from each process 34 | printf("Hello world from processor %s, rank %d out of %d processors\n", processor_name, world_rank, world_size); 35 | 36 | // Finalize the MPI environment. 37 | MPI_Finalize(); 38 | } -------------------------------------------------------------------------------- /AY01/README.md: -------------------------------------------------------------------------------- 1 | # AY01 2 | 3 | Introducción a la librería Joblib y comparaciones de rendimiento. 4 | 5 | Librerías utilizadas para esta ayudantía: 6 | - Numpy 7 | - Joblib 8 | 9 | Todas estas librerías se pueden instalar usando pip (Ej: ```pip install joblib```). 10 | 11 | También pueden usar Google Colab para correr todos los códigos. 12 | 13 | ## ¿Cómo observar la cantidad de procesos e hilos que está usando nuestro programa? 14 | 15 | Para mirar estas dos cantidades podemos utilizar el administrador de tareas (Windows) o monitor de actividad (Mac) de nuestro computador. En específico, para los procesos basta con fijarse en cuántas instancias de Python podemos ver abiertas y trabajando (básicamente, programas con el nombre de Python). 
En relación a la cantidad de hilos, en Mac es sencillo ya que el mismo monitor de actividad lo dice directamente, sin embargo, en Windows esta opción está un poco más escondida y hay que seguir los siguientes pasos para poder verla: 16 | 17 | - Abrir el Administrador de Tareas 18 | - Ir a la pestaña de detalles 19 | - Hacer click derecho en alguno de los nombres de las columnas y apretar "Seleccionar columnas" 20 | - Bajar en el listado hasta encontrar la opción de "Threads" o "Subprocesos" e incluirla en el listado 21 | - ¡Listo! Ahora podemos ver cuántos hilos está usando cada programa por separado -------------------------------------------------------------------------------- /AY06/2_OMP_loops.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Comandos útiles: 3 | Compilar código: g++ -o name_output 2_OMP_loops.cpp -fopenmp 4 | Correr código: ./name_output 5 | */ 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | int main() { 12 | 13 | // Qué pasa si sumamos en paralelo usando threads? 
14 | int x1 = 0; 15 | 16 | // Notar que agregamos "for" al pragma 17 | // Esto para repartir las iteraciones entre los threads 18 | #pragma omp parallel for num_threads(24) 19 | for (int i = 0; i < 10000; i++) { 20 | int id = omp_get_thread_num(); 21 | x1 += 1; 22 | //printf("Proceso %d sumó 1\n", id); 23 | } 24 | 25 | printf("\nSuma Final 1 = %d\n", x1); 26 | 27 | // Veamos otra posibilidad 28 | 29 | int x2 = 0; 30 | int n = 10; 31 | int* array = (int*) calloc(n, sizeof(int)); 32 | 33 | #pragma omp parallel for num_threads(n) 34 | for (int i = 0; i < 100000000; i++) { 35 | int id = omp_get_thread_num(); 36 | array[id] += 1; 37 | // printf("Proceso %d sumó 1\n", id); 38 | } 39 | 40 | for (int i = 0; i < n; i++) { 41 | x2 += array[i]; 42 | } 43 | 44 | printf("\nSuma Final 2 = %d\n", x2); 45 | 46 | 47 | // Nunca olviden de liberar memoria si es que la reservan :D 48 | free(array); 49 | 50 | return 0; 51 | } 52 | -------------------------------------------------------------------------------- /AY04/5valgrind.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: g++ -g 5valgrind.cpp -o name_output 4 | - Correr con Valgrind: valgrind ./name_output 5 | Valgrind se encargará de revisar nuestros errores 6 | pero debes compilar con -g para que nos pueda decir nuestros errores 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | int main() { 14 | // Este código contiene errores! 
15 | 16 | int N, begin, end; 17 | begin = 10 18 | end = 20; 19 | 20 | // Creemos un array 21 | int* array = (int*)calloc(end-begin, sizeof(int)); 22 | 23 | // rellenemos sus valores 24 | for (int i = 0; i < end-begin; i++) 25 | { 26 | array[i] = i; 27 | } 28 | 29 | // Imprimimamos sus resultados 30 | for (int i = 0; i < end-begin; i++) 31 | { 32 | printf("%d ", array[i]); 33 | } 34 | printf("\n"); 35 | 36 | // Y cambiemos el valor en la casilla N 37 | array[N] = 100; 38 | 39 | // Volvamos a imprimir 40 | for (int i = begin; i < end; i++) 41 | { 42 | printf("%d ", array[i]); 43 | } 44 | printf("\n"); 45 | 46 | // No olviden liberar la memoria reservada 47 | free(array); 48 | // Si liberaron todo en un código normal, Valgrind debería decir: 49 | // "All heap blocks were freed -- no leaks are possible" 50 | 51 | return 0; 52 | } -------------------------------------------------------------------------------- /AY03/2variables.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: g++ 2variables.cpp -o name_output 4 | - Correr: ./name_output 5 | */ 6 | 7 | #include 8 | #include 9 | using namespace std; 10 | 11 | int main() { 12 | 13 | // Para utilizar variables debemos definir su tipo 14 | 15 | // INT: 16 | printf("INT:\n"); 17 | int a = 9; 18 | int b; 19 | b = 10; 20 | printf("La suma de %i y %d es: %d\n", a, b, a+b); 21 | cout << "La multiplicación de " << a << " y " << b << " es: " << a*b << endl; 22 | 23 | // FLOAT: 24 | printf("\nFLOAT:\n"); 25 | float c = 10.2, d = 3.1; 26 | printf("c*d=%f\n", c*d); 27 | 28 | // CASTING: 29 | printf("\nCASTING:\n"); 30 | printf("División sin casting: %d/%d=%f\n", a, b, a/b); 31 | float result = (float)a/b; 32 | printf("División con casting: %d/%d=%f\n", a, b, result); 33 | 34 | // DOUBLE: 35 | printf("\nDOUBLE:\n"); 36 | float pi2 = 3.14159265358979323846; 37 | printf("%f\n", pi2); 38 | double pi = 3.14159265358979323846; 39 | printf("%lf\n", 
pi); 40 | 41 | // BOOLEANS: 42 | printf("\nBOOLEANS:\n"); 43 | bool e = true; 44 | bool f = false; 45 | bool g = !(e || f); 46 | bool h = g && f; 47 | printf("True: %d\n", e); 48 | printf("False: %d\n", f); 49 | printf("!(T || F)=%d\nF && F=%d\n", g, h); 50 | 51 | return 0; 52 | } -------------------------------------------------------------------------------- /AY04/2aritmetica_punteros.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: g++ 2aritmetica_punteros.cpp -o name_output 4 | - Correr: ./name_output 5 | */ 6 | 7 | #include 8 | #include 9 | 10 | int main() { 11 | // Creemos dos variables int 12 | int a = 1; 13 | int b = 2; 14 | printf("a = %d\n", a); 15 | printf("b = %d\n\n", b); 16 | 17 | // Veamos su puntero 18 | printf("Puntero de a: %p\n", &a); // Se puede observar que los punteros están en formato hexadecimal 19 | printf("Puntero de b: %p\n", &b); 20 | 21 | // La distancia de ambos punteros es 4, cuál es el tamaño en bytes de un int? 22 | printf("Tamaño en bytes de un int: %zu\n", sizeof(int)); 23 | 24 | // Qué ocurre si le sumamos 1 al puntero de a, llegamos a b? 
25 | printf("\nPuntero + 1 de a: %p\n", &a + 1); 26 | 27 | // Efectivamente llegamos, entonces podemos acceder al valor de b 28 | printf("Valor de b mediante el puntero de a: %d\n", *(&a + 1)); 29 | 30 | // Probemos con otros tipos de datos 31 | char c = 'c'; 32 | char d = 'd'; 33 | printf("\nCHAR\n"); 34 | printf("Puntero de c: %p\n", &c); 35 | printf("Puntero de d: %p\n", &d); 36 | printf("Tamaño en bytes de un char: %zu\n", sizeof(char)); 37 | 38 | 39 | double e = 0.1; 40 | double f = 0.123; 41 | printf("\nDOUBLE\n"); 42 | printf("Puntero de e: %p\n", &e); 43 | printf("Puntero de f: %p\n", &f); 44 | printf("Tamaño en bytes de un double: %zu\n", sizeof(double)); 45 | 46 | return 0; 47 | } -------------------------------------------------------------------------------- /AY04/1punteros.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: g++ 1punteros.cpp -o name_output 4 | - Correr: ./name_output 5 | */ 6 | 7 | #include 8 | 9 | int suma1(int x) 10 | { 11 | x += 1; 12 | return x; 13 | } 14 | 15 | void actualizar(int* x) // no retorna nada 16 | { 17 | *x += 1; // modifico la variable desde dentro de una función mediante su puntero 18 | // Notar que se usa * por la izquierda para acceder al valor apuntado 19 | } 20 | 21 | int main() { 22 | 23 | int a = 5; 24 | 25 | // Al crear la variable anterior, podemos acceder a su puntero: 26 | int* b = &a; 27 | 28 | // Recordar que para acceder al puntero se usa & 29 | // Y se debe declarar el tipo de puntero que estamos guardando, en este caso, puntero a int 30 | 31 | // Gracias al puntero, tenemos otra forma de modificar el valor de una variable 32 | // Esto será útil en arrays 33 | printf("Valor de a antes de actualizar: %d\n", a); 34 | actualizar(b); 35 | printf("Valor de a después de actualizar: %d\n\n", a); 36 | 37 | // Notar que si usamos la función suma, el valor de a no se modifica 38 | printf("Valor de a antes de suma1: %d\n", a); 
39 | suma1(a); 40 | printf("Valor de a después de suma1: %d\n\n", a); 41 | 42 | // Ahora, si podemos retornarlo y actualizar el valor 43 | printf("Valor de a antes de suma1 con retorno: %d\n", a); 44 | a = suma1(a); 45 | printf("Valor de a después de suma1 con retorno: %d\n", a); 46 | 47 | return 0; 48 | } 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /AY07/3_OMP_reduction.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Comandos útiles: 3 | Compilar código: g++ -o name_output 3_OMP_reduction.cpp -fopenmp 4 | Correr código: ./name_output 5 | */ 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | int main() { 12 | 13 | // Qué pasa si hacemos una reducción con OpenMP? 14 | int x1 = 0; 15 | 16 | // Notar que agregamos "for" al pragma 17 | // Esto para repartir las iteraciones entre los threads 18 | #pragma omp parallel for num_threads(24) 19 | for (int i = 0; i < 100000000; i++) { 20 | int id = omp_get_thread_num(); 21 | x1 += 1; 22 | //printf("Proceso %d sumó 1\n", id); 23 | } 24 | 25 | printf("\nSuma Final 1 = %d\n", x1); 26 | 27 | // Ahora, veamos algo que sí funciona: 28 | 29 | int x2 = 0; 30 | int n = 10; 31 | int* array = (int*) calloc(n, sizeof(int)); 32 | 33 | #pragma omp parallel for num_threads(n) 34 | for (int i = 0; i < 100000000; i++) { 35 | int id = omp_get_thread_num(); 36 | array[id] += 1; 37 | // printf("Proceso %d sumó 1\n", id); 38 | } 39 | 40 | for (int i = 0; i < n; i++) { 41 | x2 += array[i]; 42 | } 43 | 44 | printf("\nSuma Final 2 = %d\n", x2); 45 | 46 | // Por último, OpenMP también tiene la siguiente forma de hacerlo: 47 | int x3 = 0; 48 | 49 | // Agregamos el parámetro reduction al pragma 50 | #pragma omp parallel for num_threads(24) reduction(+:x3) 51 | for (int i = 0; i < 100000000; i++) { 52 | int id = omp_get_thread_num(); 53 | x3 += 1; 54 | //printf("Proceso %d sumó 1\n", id); 
55 | } 56 | 57 | printf("\nSuma Final 3 = %d\n", x3); 58 | 59 | 60 | // Nunca olviden de liberar memoria si es que la reservan :D 61 | free(array); 62 | 63 | return 0; 64 | } 65 | -------------------------------------------------------------------------------- /AY03/6arrays.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: g++ 6arrays.cpp -o name_output 4 | - Correr: ./name_output 5 | */ 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | int main() { 12 | 13 | // las listas no vienen nativas, existe array 14 | 15 | // Primera manera de hacerlo: dame un array de tamaño n de ints 16 | int n = 10; 17 | int array1[n]; 18 | 19 | // Segunda manera: puntero de int. calloc es resérvame 10 casilleros 20 | // de memoria del tamaño de un int. El (int*) es un casting de int a puntero de int 21 | int* array2 = (int*) calloc(10, sizeof(int)); // inicializa valores a 0 22 | 23 | // Tercera forma: en malloc solo le damos el tamaño total de lo que quiero reservar 24 | int* array3 = (int*) malloc(10*sizeof(int)); // no inicializa valores 25 | 26 | for (int i=0; i<10; i++) { 27 | // printf("%i ", array1[i]); 28 | // printf("%i ", array2[i]); 29 | // printf("%i ", array3[i]); 30 | // printf("\n"); 31 | } 32 | 33 | 34 | // Números random 35 | // srand((int) time(0)); // seteamos una semilla con el tiempo actual 36 | // esto significa que cambiará bastante seguido 37 | // y así obtendremos comportamientos distintos 38 | // al correr de nuevo el código. 
39 | //srand(1); 40 | 41 | for (int i=0;i<10;++i) { 42 | // array2[i] = rand(); // genera números random 43 | // array2[i] = rand()%10; // para generar números en algún rango (0 y 9) 44 | //array2[i] = rand()%201 - 100; // entre -100 y 100 45 | // printf("%i ", array2[i]); 46 | } 47 | 48 | printf("\n"); 49 | 50 | // Es necesario liberar las memorias que fueron reservadas con calloc o malloc (HEAP) 51 | free(array2); 52 | free(array3); 53 | // No es necesario al array1 porque este no reserva memoria 54 | 55 | return 0; 56 | } -------------------------------------------------------------------------------- /AY04/4matrix_heap.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: g++ 4matrix_heap -o name_output 4 | - Correr: ./name_output 5 | */ 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | void print_matrix(int N, int** matrix) 12 | { 13 | // Notar que es importante guardar siempre el largo de un array, para poder recorrerlo. 14 | // No tenemos una función directa como "len" para obtener su largo. 15 | for (int i = 0; i < N; i++) 16 | { 17 | for (int j = 0; j < N; j++) 18 | { 19 | printf("%d\t", matrix[i][j]); 20 | } 21 | printf("\n"); 22 | } 23 | } 24 | 25 | int main() { 26 | 27 | // Para hacer una matriz simplemente debemos hacer array de arrays (punteros de punteros) 28 | int N = 10; 29 | int** matrix = (int**)malloc(N * sizeof(int*)); // Reservamos un bloque de memoria que va a almacenar punteros de int 30 | for (int i = 0; i < N; i++) 31 | { 32 | matrix[i] = (int*)calloc(N, sizeof(int)); // Y ahora le damos valor a cada uno de esos punteros con un nuevo bloque 33 | // de memoria que busca almacenar ints. 34 | } 35 | 36 | // Rellenemos sus valores con números random 37 | srand((int) time(0)); 38 | for (int i = 0; i < N; i++) 39 | { 40 | for (int j = 0; j < N; j++) 41 | { 42 | matrix[i][j] = rand()%101; // Qué hace esto? 
43 | } 44 | } 45 | 46 | // Imprimamos nuestra matriz con la función definida 47 | print_matrix(N, matrix); 48 | 49 | // Finalmente, como usamos calloc y malloc, debemos liberar la memoria reservada 50 | // antes de acabar nuestro programa: 51 | for (int i = 0; i < N; i++) 52 | { 53 | free(matrix[i]); 54 | } 55 | free(matrix); 56 | // Recordar que siempre hay que liberar desde adentro hacia afuera para no perder las referencias. 57 | 58 | // Qué pasa si no liberamos la memoria o si liberamos matrix antes que cada bloque matrix[i]? 59 | 60 | return 0; 61 | } -------------------------------------------------------------------------------- /AY03/3flujos.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: g++ 3flujos.cpp -o name_output 4 | - Correr: ./name_output 5 | */ 6 | 7 | #include 8 | #include 9 | using namespace std; 10 | 11 | int main() { 12 | 13 | int a, b, c; 14 | a = 10; 15 | b = 2; 16 | c = 5; 17 | 18 | 19 | // IF, ELSE IF, ELSE 20 | printf("\nIF, ELSE IF, ELSE:\n"); 21 | if (a > c) 22 | { 23 | printf("a > c\n"); 24 | } 25 | else if (a == c) { 26 | printf("a = c\n"); 27 | } 28 | else { printf("a < c\n"); } 29 | 30 | 31 | // FOR 32 | printf("\nFOR: i++\n"); 33 | for (int i = 0; i < 10; i++) 34 | { 35 | cout << i << endl; 36 | } 37 | 38 | printf("\nFOR: i*=2\n"); 39 | for (int i = 1; i < 10; i*=2) 40 | { 41 | cout << i << endl; 42 | } 43 | 44 | printf("\nFOR: continue\n"); 45 | for (int i = 0; i < 5; i++) 46 | { 47 | for (int j = 0; j < 5; j++) 48 | { 49 | if (i == j) 50 | { 51 | continue; 52 | } 53 | printf("(%d, %d)\n", i, j); 54 | } 55 | } 56 | 57 | // WHILE 58 | printf("\nWHILE:\n"); 59 | while (c < 8) 60 | { 61 | cout << c++ << endl; // Ojo que acá se le está sumando 1 al valor de la variable c en cada llamado 62 | // cout << ++c << endl; // Qué diferencia hay con esta forma? 
63 | } 64 | cout << "Valor de c fuera del while: "<< c << endl; 65 | 66 | printf("\nWHILE: true y break\n"); 67 | while (true) 68 | { 69 | b -= 1; 70 | if (b == -10) 71 | { 72 | printf("Rompemos el while\n"); 73 | break; 74 | } 75 | printf("Siguiente iteración...\n"); 76 | } 77 | 78 | 79 | // SWITCH STATEMENT 80 | printf("\nSWITCH:\n"); 81 | int variable = 2; 82 | 83 | switch (variable) 84 | { 85 | case 0: 86 | cout << "Falso" << endl; 87 | break; 88 | 89 | case 1: 90 | cout << "Verdadero" << endl; 91 | break; 92 | 93 | default: 94 | cout << "Inválido" << endl; 95 | break; 96 | } 97 | 98 | printf("\nSWITCH: sin break\n"); 99 | variable = 0; 100 | 101 | switch (variable) 102 | { 103 | case 0: 104 | cout << "Falso" << endl; 105 | 106 | case 1: 107 | cout << "Verdadero" << endl; 108 | 109 | default: 110 | cout << "Inválido" << endl; 111 | } 112 | 113 | return 0; 114 | } -------------------------------------------------------------------------------- /AY09/README.md: -------------------------------------------------------------------------------- 1 | ## Ayudantía 9: Instalación PyOpenCL 2 | 3 | La instalación de OpenCL (y de PyOpenCL) varía según los dispositivos que tengamos por lo que puede no ser muy directa (además de que siempre existe la posibilidad de que no quede bien configurado). Es por esto que, si no queremos pasar por el proceso de instalación, podemos usar, como alternativa, la plataforma de Google Colab con la tarjeta gráfica NVIDIA TESLA T4 que nos presta Google. 4 | 5 | En particular, lo primero que se debe hacer en Google Colab es ir a 'Entorno de ejecución' -> 'Cambiar tipo de entorno de ejecución' -> 'T4 GPU'. 
Segundo, Google sacó hace un tiempo los drivers de OpenCL del entorno base de Colab, por lo que hay que instalarlos con los siguientes comandos: 6 | 7 | !sudo apt -y update 8 | !sudo apt install -y nvidia-cuda-toolkit 9 | 10 | # Para poder también correr nuestros códigos en la CPU del entorno de GPU de Colab: 11 | !sudo apt install -y pocl-opencl-icd 12 | 13 | # Finalmente, solo nos falta instalar PyOpenCL en el entorno: 14 | !pip install pyopencl 15 | 16 | Todos estos pasos se explican con mayor detalle en el notebook ```0_installation.ipynb```. 17 | 18 | Subiendo los códigos de la ayudantía a Colab y haciendo los pasos anteriores, no debería haber problemas corriendo los códigos con PyOpenCL. 19 | 20 | Cabe destacar que el entorno de GPU de Google Colab no es ilimitado y, por lo general, la sesión se corta después de 6 horas aproximadamente (puede variar bastante). Es importante tener esto en consideración al realizar alguna tarea del curso (también nos podemos cambiar de cuenta de Google cuando nos pasa esto). 21 | 22 | Por último, si alguien está interesado en instalar OpenCL en su computador, podemos conversarlo y ver la instalación juntos, ya que hay varias guías en internet sobre las formas de instalarlo para distintos dispositivos. 23 | 24 | ### Correr en GPU y CPU 25 | 26 | Para correr en ambos dispositivos, es necesario instalar los drivers de más arriba y definir un contexto para cada uno (antes utilizábamos ```ctx = cl.create_some_context()``` por defecto). Ahora: 27 | 28 | platforms_cuda = cl.get_platforms()[0] # GPU 29 | platforms_pocl = cl.get_platforms()[1] # CPU 30 | 31 | devices_gpu = platforms_cuda.get_devices(device_type=cl.device_type.GPU) 32 | devices_cpu = platforms_pocl.get_devices(device_type=cl.device_type.CPU) 33 | 34 | # Contexto para GPU: 35 | ctx_gpu = cl.Context(devices=devices_gpu) 36 | # Contexto para CPU: 37 | ctx_cpu = cl.Context(devices=devices_cpu) 38 | 39 | # Esto está asumiendo que la plataforma 0 es la de GPU y la 1 de CPU.
40 | # Pueden comprobar esto imprimiendo las plataformas antes de indexarlas. 41 | 42 | Luego, el resto de códigos se mantienen igual. Basta con cambiar el contexto para trabajar en CPU/GPU. 43 | -------------------------------------------------------------------------------- /AY08/PyOpenCL/README.md: -------------------------------------------------------------------------------- 1 | ## Ayudantía 8: Instalación PyOpenCL 2 | 3 | La instalación de OpenCL (y de PyOpenCL) varía según los dispositivos que tengamos por lo que puede no ser muy directa (además de que siempre existe la posibilidad de que no quede bien configurado). Es por esto que, si no queremos pasar por el proceso de instalación, podemos usar, como alternativa, la plataforma de Google Colab con la tarjeta gráfica NVIDIA TESLA T4 que nos presta Google. 4 | 5 | En particular, lo primero que se debe hacer en Google Colab es ir a 'Entorno de ejecución' -> 'Cambiar tipo de entorno de ejecución' -> 'T4 GPU'. Segundo, Google sacó hace un tiempo los drivers de OpenCL del entorno base de Colab, por lo que hay que instalarlos con los siguientes comandos: 6 | 7 | !sudo apt -y update 8 | !sudo apt install -y nvidia-cuda-toolkit 9 | 10 | # Para poder también correr nuestros códigos en la CPU del entorno de GPU de Colab: 11 | !sudo apt install -y pocl-opencl-icd 12 | 13 | # Finalmente, solo nos falta instalar PyOpenCL en el entorno: 14 | !pip install pyopencl 15 | 16 | Todos estos pasos se explican con mayor detalle en el notebook ```0_installation.ipynb```. 17 | 18 | Subiendo los códigos de la ayudantía a Colab y haciendo los pasos anteriores, no debería haber problemas corriendo los códigos con PyOpenCL. 19 | 20 | Cabe destacar que el entorno de GPU de Google Colab no es ilimitado y, por lo general, la sesión se corta después de 6 horas aproximadamente (puede variar bastante).
Es importante tener esto en consideración al realizar alguna tarea del curso (también nos podemos cambiar de cuenta de Google cuando nos pasa esto). 21 | 22 | Por último, si alguien está interesado en instalar OpenCL en su computador, podemos conversarlo y ver la instalación juntos, ya que hay varias guías en internet sobre las formas de instalarlo para distintos dispositivos. 23 | 24 | ### Correr en GPU y CPU 25 | 26 | Para correr en ambos dispositivos, es necesario instalar los drivers de más arriba y definir un contexto para cada uno (antes utilizábamos ```ctx = cl.create_some_context()``` por defecto). Ahora: 27 | 28 | platforms_cuda = cl.get_platforms()[0] # GPU 29 | platforms_pocl = cl.get_platforms()[1] # CPU 30 | 31 | devices_gpu = platforms_cuda.get_devices(device_type=cl.device_type.GPU) 32 | devices_cpu = platforms_pocl.get_devices(device_type=cl.device_type.CPU) 33 | 34 | # Contexto para GPU: 35 | ctx_gpu = cl.Context(devices=devices_gpu) 36 | # Contexto para CPU: 37 | ctx_cpu = cl.Context(devices=devices_cpu) 38 | 39 | # Esto está asumiendo que la plataforma 0 es la de GPU y la 1 de CPU. 40 | # Pueden comprobar esto imprimiendo las plataformas antes de indexarlas. 41 | 42 | Luego, el resto de códigos se mantienen igual. Basta con cambiar el contexto para trabajar en CPU/GPU. 43 | -------------------------------------------------------------------------------- /AY04/README.md: -------------------------------------------------------------------------------- 1 | Nuestros códigos los correremos en el clúster de ingeniería. Para conectarse debemos usar SSH, es decir, escribir en consola: 2 | 3 | ```ssh username@cluster.ing.uc.cl``` 4 | 5 | o antes también se usaba: (todavía me sigue funcionando) 6 | 7 | ```ssh username@mazinger.ing.puc.cl``` 8 | 9 | Siendo ```username``` el nombre de usuario de su correo UC. Luego de escribir lo anterior, les preguntará una contraseña (se las enviaré a sus mails). 
10 | 11 | Una vez dentro, se pueden usar los comandos típicos que usamos en consola: ```ls```, ```cd```, ```rm```, ```mkdir```, ```htop``` (para ver los procesos corriendo en el servidor), entre otros. 12 | 13 | Si quieren cambiar su contraseña del servidor, pueden usar el comando ```passwd```. 14 | 15 | Subir y recibir archivos: 16 | 17 | - Opción 1: ```scp -r local_dir username@cluster.ing.uc.cl:server_dir``` 18 | 19 | La línea anterior se corre en consola y el -r sirve para subir una carpeta (si se quita, se sube el archivo indicado en el path). Por ejemplo: tengo mi carpeta ```AY04``` y quiero subirla a mi carpeta del servidor, entonces, ubicado en el path de mi carpeta en la consola, escribo el comando ```scp -r AY04/ alberto.almuna@cluster.ing.uc.cl:~```, les preguntará por su contraseña. Este comando también se puede utilizar para enviar archivos desde el clúster a nuestro computador y solo basta con intercambiar el destino y origen (Ej: ```scp alberto.almuna@cluster.ing.uc.cl:~/log.out ./AY04```) 20 | 21 | - Opción 2: crear un repositorio de Github y clonarlo en su computador y en el servidor, para luego transferir archivos con push y pull. 22 | 23 | Correr código: 24 | 25 | Para correr nuestros códigos podemos hacer lo mismo que en nuestro computador o pedirle al servidor que lo agregue a su cola de trabajo (cuando queramos correr algo grande). 26 | 27 | La forma de compilar y correr nuestro código es la misma que ya hemos hecho. 28 | 29 | Y para agregarlo a la cola de trabajo (con el job.sh creado): 30 | 31 | - ```sbatch job.sh``` -> agrega el trabajo job.sh a la cola del clúster (este archivo corresponde a un set de instrucciones a realizar) 32 | 33 | En nuestro caso, una vez que termine el job, creará un archivo ```log.out``` con el resultado obtenido. Si queremos leerlo rápidamente, podemos usar el comando ```cat log.out```. También podemos enviarnos el resultado usando el comando ```scp``` como arriba. 
34 | 35 | Para obtener información de la cola podemos usar ```squeue```, ```top``` o ```htop```. 36 | 37 | Para más información sobre el clúster de Ing: https://deg.ing.uc.cl/informatica/cluster/ 38 | 39 | En particular, en la página https://deg.ing.uc.cl/informatica/cluster/trabajos-al-cluster/ en la sección de "Solicitud de CPU" se puede observar otro ejemplo de un archivo como ```job.sh```. 40 | 41 | # Video Clúster 42 | 43 | Hace un par de años hice un video que revisa todo lo importante relacionado al clúster (quizás ya toca hacerle una actualización, pero se los dejo disponible por si les sirve): 44 | 45 | https://youtu.be/LqeU8yo_b-w 46 | 47 | En este video uso el ssh con "mazinger.ing.puc.cl", pero hace poco se cambió a "cluster.ing.uc.cl". A mi todavía me funciona el anterior, pero igual les recomiendo utilizar la dirección más nueva. 48 | 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IMT2112-2024-2 2 | Repositorio para las ayudantías de Algoritmos Paralelos en Computación Científica. 3 | 4 | Mail: alberto.almuna@uc.cl 5 | 6 | Si quieres dar algún comentario u opinión sobre la ayudantía: [Formulario de Feedback](https://forms.gle/7B53rLqTXwvjFFhW8) 7 | 8 | ## Correr código de C/C++ en Windows 9 | 10 | Recomiendo instalar WSL (Windows Subsystem for Linux) y pueden seguir la siguiente [guía](https://docs.microsoft.com/en-us/windows/wsl/install) 11 | 12 | En teoría debería funcionar la instalación directa con 13 | ``` 14 | wsl --install 15 | ``` 16 | en la PowerShell con permisos de administrador, sin embargo, si no les funciona, recomiendo seguir la instalación manual que se menciona en el link anterior. 
17 | 18 | Una vez leída la guía anterior e instalado WSL2 junto a su distribución de Linux preferida (sugiero instalar Ubuntu que es la que viene por defecto en el comando anterior), recomiendo instalar la consola Windows Terminal (desde la Microsoft Store) ya que permite manejar distintos tipos de consolas dentro de la misma y, en general, es bastante cómoda. 19 | 20 | Una vez dentro de WSL, corran el siguiente comando: 21 | ``` 22 | cd ~ 23 | ``` 24 | ![image](https://user-images.githubusercontent.com/53873288/186964595-824dab4e-45f2-47a7-8d50-4fe40eb80e7a.png) 25 | 26 | para ir al directorio base. Si no hacen lo anterior, correrán los códigos desde Windows y no desde la distribución de Linux, lo que puede afectar el rendimiento de la ejecución. Entonces en este punto recomiendo crear las carpetas que utilizarán para el curso. 27 | 28 | A continuación, pueden instalar git y el compilador que utilizaremos con los siguientes comandos: 29 | ``` 30 | sudo apt-get update 31 | sudo apt-get install git 32 | sudo apt-get install g++ 33 | ``` 34 | 35 | Aunque Valgrind debería venir instalado, pueden intentar instalarlo con el siguiente comando: 36 | ``` 37 | sudo apt-get install valgrind 38 | ``` 39 | es una herramienta que nos ayudará a debuggear nuestro código ya que C/C++ no va a ser muy explícito en decirnos qué está fallando. 40 | 41 | Si utilizan VS Code, deben instalar la extensión 42 | ![image](https://user-images.githubusercontent.com/53873288/186965303-e73d7741-0dc4-48b5-89d3-ec318447505a.png) 43 | 44 | de esta manera, pueden correr el comando 45 | ``` 46 | code . 47 | ``` 48 | en la consola de WSL y les abrirá VS Code dentro del entorno de la distribución de Linux. 49 | 50 | Además, recomiendo instalar la extensión de C/C++ de VS Code, ya que contiene algunos atajos que nos ayudarán a programar más rápido!
51 | 52 | ## Correr código de C/C++ en Mac 53 | 54 | El año pasado me compartieron el siguiente [video](https://youtu.be/lGsyqgpMAYY?si=Nllk5YKNcEBY_p1E) para la instalación en Mac (M1/M2). No lo puedo probar, pero el año pasado no produjo problemas para la realización del curso. Por favor avísenme si es que encuentran que algún otro tutorial les funciona mejor o si este presenta problemas. 55 | 56 | ## Extras 57 | 58 | Por último, algunos comandos útiles para navegar dentro de la consola: 59 | - ```cd nombre_carpeta```: es para moverse por los directorios 60 | - ```cd ..```: es para devolverse una carpeta 61 | - ```mkdir nombre_carpeta```: es para crear una nueva carpeta 62 | - ```ls```: es para ver los archivos que se encuentran en el directorio actual 63 | 64 | ## Si quieren aprender más o profundizar sobre lo visto en la ayudantía les recomiendo el siguiente [enlace](https://github.com/DCCentral-de-Apuntes/intro-C). Es un taller muy completo para aprender C y que nos sirve para entender C++.
Además, cualquier problema que tengan con la instalación, no duden en escribirme para ver si lo podemos solucionar 🐧 65 | -------------------------------------------------------------------------------- /AY05/3MPI_dot.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: mpic++ 3MPI_dot.cpp -std=c++11 4 | - Correr (con 2 procesos): mpirun -np 2 ./a.out 5 | Se puede cambiar el número de procesos con el que se corre 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | using namespace std; 14 | 15 | int main() { 16 | 17 | // Iniciamos el entorno de MPI y obtenemos los valores usuales 18 | MPI_Init(NULL,NULL); 19 | int world_size, world_rank; 20 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 21 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 22 | 23 | // Que solo el proceso raíz imprima la cantidad de procesos totales: 24 | if (world_rank == 0) { 25 | printf("Cantidad de procesos: %i\n\n", world_size); 26 | } 27 | 28 | // Inicializamos algunas variables que utilizaremos 29 | int firstIndex, localSize, n, err; 30 | 31 | // Vamos a trabajar con vectores de largo n 32 | n = 7; 33 | 34 | // Calculamos el primer índice y el tamaño local de cada proceso 35 | localSize = n / world_size; 36 | firstIndex = world_rank*localSize; 37 | 38 | // Recordar que hay que agregar lo que sobra (por si la división no es exacta) 39 | if (world_rank == world_size-1) { 40 | localSize += n % world_size; 41 | } 42 | printf("Rank %i, local size: %i, first index %i \n", world_rank, localSize, firstIndex); 43 | 44 | // Cada proceso crea la porción del vector que usará: 45 | int localVec1[localSize]; 46 | int localVec2[localSize]; 47 | for (int i=0; i 8 | #include 9 | #include 10 | 11 | int main() { 12 | 13 | // las listas no vienen nativas, existe el concepto de array 14 | 15 | // Primera manera de hacerlo (stack): dame un array de tamaño n de ints 16 | int n = 10; 17 | int array1[n]; 
18 | for (int i=0; i<10; i++) { 19 | printf("%d ", array1[i]); // no inicializa valores (toma lo que hay directamente) 20 | } 21 | printf("\n"); 22 | 23 | // Segunda manera (heap): puntero de int. calloc es resérvame 10 casilleros 24 | // de memoria del tamaño de un int y dales valor 0. El (int*) es un casting de int a puntero de int. 25 | int* array2 = (int*) calloc(10, sizeof(int)); // inicializa valores a 0 26 | for (int i=0; i<10; i++) { 27 | printf("%d ", array2[i]); 28 | } 29 | printf("\n"); 30 | 31 | // Tercera forma (heap): en malloc solo le damos el tamaño total de lo que quiero reservar. 32 | // También es necesario hacer el casting a puntero de int! 33 | int* array3 = (int*) malloc(10 * sizeof(int)); // no inicializa valores (toma lo que hay directamente) 34 | for (int i=0; i<10; i++) { 35 | printf("%d ", array3[i]); 36 | } 37 | printf("\n"); 38 | 39 | 40 | // Generación de números aleatorios 41 | // srand((int) time(0)); // seteamos una semilla con el tiempo actual 42 | // para obtener resultados distintos en cada ejecución 43 | //srand(1); 44 | 45 | for (int i=0;i<10;++i) { 46 | // array2[i] = rand(); // genera números random 47 | // array2[i] = rand()%10; // para generar números en algún rango (0 y 9) 48 | // array2[i] = rand()%201 - 100; // entre -100 y 100 49 | // printf("%i ", array2[i]); 50 | } 51 | printf("\n"); 52 | 53 | 54 | 55 | // Algunos datos extras: 56 | // ====================================== 57 | // // Veamos los punteros de cada array: 58 | // printf("\nPunteros al array1:\n"); 59 | // printf("array1\t\t %p\n", array1); 60 | // printf("&array1\t\t %p\n", &array1); 61 | // printf("&array1[0]\t %p\n\n", &array1[0]); 62 | 63 | // printf("Punteros al array2:\n"); 64 | // printf("array2\t\t %p\n", array2); 65 | // printf("&array2\t\t %p\n", &array2); 66 | // printf("&array2[0]\t %p\n\n", &array2[0]); 67 | 68 | // printf("Punteros al array3:\n"); 69 | // printf("array3\t\t %p\n", array3); 70 | // printf("&array3\t\t %p\n", &array3); 71 | // 
printf("&array3[0]\t %p\n\n", &array3[0]); 72 | 73 | // // Acceso al elemento de un array: 74 | // array2[2] = 10; 75 | // printf("Le damos un valor a array[2] y accedemos a él:\n"); 76 | // printf("array[2] = %d\n", array2[2]); 77 | // printf("2[array] = %d\n\n", 2[array2]); 78 | // ====================================== 79 | 80 | // Es necesario liberar las memorias que fueron reservadas con calloc o malloc (heap) 81 | free(array2); 82 | free(array3); 83 | // No es necesario al array1 porque este no reserva memoria en el heap 84 | 85 | // La diferencia entre el primer arreglo y los dos siguientes es que: 86 | // - El primer arreglo vive en el Stack, que es básicamente la memoria asignada por mi 87 | // sistema operativo para mi programa. Es muy útil, aunque tiene algunas limitaciones. 88 | // - Los otros dos viven en el Heap, el que es memoria extra pedida por mi programa. 89 | // Este último espacio de memoria tiene la ventaja de que siempre que se le pida, va a crecer, y, si se 90 | // libera algún bloque ocupado, entonces se reduce. Luego podemos tener control de cuánta memoria 91 | // le pedimos. 
92 | 93 | return 0; 94 | } -------------------------------------------------------------------------------- /AY05/4MPI_mat_vec.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: mpic++ 4MPI_mat_vec.cpp -std=c++11 4 | - Correr (con 2 procesos): mpirun -np 2 ./a.out 5 | Se puede cambiar el número de procesos con el que se corre 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | using namespace std; 14 | 15 | 16 | int** matrix_generator(int filas, int columnas, int world_rank) { 17 | int** matrix = (int**) calloc(filas, sizeof(int*)); 18 | 19 | for (int i = 0; i < filas; i++) { 20 | matrix[i] = (int*) calloc(columnas, sizeof(int)); 21 | } 22 | 23 | for (int i = 0; i < filas; i++) { 24 | for (int j = 0; j < columnas; j++) { 25 | matrix[i][j] = i+j+world_rank; 26 | } 27 | } 28 | return matrix; 29 | } 30 | 31 | void free_matrix(int** matrix, int filas) { 32 | for (int i = 0; i < filas; i++) { 33 | free(matrix[i]); 34 | } 35 | free(matrix); 36 | } 37 | 38 | void print_matrix(int** matrix, int filas, int columnas) { 39 | printf("\n"); 40 | for (int i = 0; i < filas; i++) { 41 | for (int j = 0; j < columnas; j++) { 42 | printf("%i ", matrix[i][j]); 43 | } 44 | printf("\n"); 45 | } 46 | } 47 | 48 | void print_vector(int* vector, int n) { 49 | printf("\n"); 50 | for (int i = 0; i < n; i++) { 51 | printf("%i ", vector[i]); 52 | } 53 | printf("\n"); 54 | } 55 | 56 | 57 | int main() { 58 | MPI_Init(NULL,NULL); 59 | int world_size, world_rank; 60 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 61 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 62 | 63 | if (world_rank == 0) { 64 | printf("Cantidad de procesos: %i\n\n", world_size); 65 | } 66 | 67 | int firstIndex, localColumnas, n, err; 68 | 69 | n = 8; 70 | 71 | localColumnas = n / world_size; 72 | firstIndex = world_rank*localColumnas; 73 | 74 | if (world_rank == world_size-1) { 75 | localColumnas += n % 
world_size; 76 | } 77 | 78 | printf("Rank %i, local columnas: %i, first index %i \n", world_rank, localColumnas, firstIndex); 79 | 80 | int localVec[localColumnas]; 81 | 82 | for (int i=0; i 9 | using namespace std; 10 | #include 11 | 12 | int main() { 13 | 14 | // Iniciamos el entorno de MPI 15 | MPI_Init(NULL,NULL); 16 | 17 | // Obtenemos el número de procesos y el rango de cada uno: 18 | int world_size, world_rank; 19 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 20 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 21 | // Estos datos son importantes porque los utilizaremos para dividir bien el trabajo y manejar las comunicaciones. 22 | 23 | // Vamos a hacer una suma de 10 (sumlength) números 24 | int firstIndex, localSize; 25 | int sumlength = 10; 26 | 27 | // Obtenemos cuántos números sumará cada proceso y desde qué posición parte: 28 | localSize = sumlength / world_size; 29 | firstIndex = world_rank * (sumlength/world_size); 30 | 31 | // Notar que la división puede no ser exacta, por lo que hay que sumarle el resto al último proceso 32 | // (Puede ser otro proceso el encargado del resto, pero elegimos al último por comodidad). 
33 | if (world_rank == world_size-1){ 34 | localSize += sumlength % world_size; 35 | } 36 | 37 | // Imprimimos la información que tenemos hasta ahora: 38 | cout << "Rank: " << world_rank << ", first index: " << firstIndex << ", local size: " << localSize << endl; 39 | 40 | // Cada proceso crea su vector local que tendrá los valores a sumar: 41 | int localVector[localSize]; 42 | for (int n=0; n 9 | #include 10 | #include 11 | #include 12 | #include 13 | using namespace std; 14 | 15 | 16 | void print_vector(int* vector, int n, int rank, const char* text) { 17 | printf("\nRank %i, %s:\n", rank, text); 18 | for (int i = 0; i < n; i++) { 19 | printf("%i ", vector[i]); 20 | } 21 | printf("\n"); 22 | } 23 | 24 | /* 25 | A veces podemos querer utilizar alguna operación colectiva para facilitar el envío y recepción de datos, sin embargo, 26 | puede darse el caso que todos los procesos envían una cantidad distinta de elementos, es decir, no todos los trabajadores 27 | tienen los mismos tamaños locales. Para esto, existen operaciones colectivas variables que permiten enviar/recibir 28 | cantidades distintas de datos por cada proceso. Y, para utilizarlas, basta con otorgar un array de tamaños y otro de 29 | offsets (posiciones), los que representan la información de cada trabajador. 
30 | Página con todas las operaciones colectivas de MPI: https://learn.microsoft.com/en-us/message-passing-interface/mpi-collective-functions 31 | A continuación, presentamos un ejemplo de Allgatherv: 32 | */ 33 | 34 | int main() { 35 | MPI_Init(NULL,NULL); 36 | int world_size, world_rank; 37 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 38 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 39 | 40 | if (world_rank == 0) { 41 | printf("Cantidad de procesos: %i\n\n", world_size); 42 | } 43 | 44 | int first_index, local_size, err; 45 | 46 | // Vamos a hacer un ejemplo donde cada proceso tiene un vector de tamaño igual a su rango +1 47 | // Definamos las características de cada uno: 48 | local_size = world_rank + 1; 49 | first_index = 0; 50 | for (int i = 0; i < world_rank; i++) 51 | { 52 | first_index += i + 1; 53 | } 54 | printf("\nRank %i, local size: %i, first index %i \n", world_rank, local_size, first_index); 55 | 56 | // Ahora, creamos el vector local de cada proceso: 57 | int *local_vec = new int[local_size]; 58 | 59 | // Rellenamos el vector: 60 | for (int i = 0; i < local_size; i++) 61 | { 62 | local_vec[i] = i + first_index; 63 | } 64 | const char* text = "local vector"; 65 | print_vector(local_vec, local_size, world_rank, text); 66 | 67 | // Hasta el momento, cada proceso tiene su propio vector. 68 | // Y queremos reunir todas las partes en todos los procesos. 69 | // Utilicemos operaciones colectivas para realizar esto de manera directa! 
70 | 71 | // Calculamos el tamaño total del vector completo: 72 | int total_size = 0; 73 | for (int i = 1; i < world_size + 1; i++) 74 | { 75 | total_size += i; 76 | } 77 | 78 | // Necesitamos crear arrays que guarden la cantidad de elementos a recibir de cada proceso y los offsets correspondientes 79 | int recvcounts[world_size]; 80 | int offsets[world_size]; 81 | for (int i = 0; i < world_size; i++) 82 | { 83 | recvcounts[i] = i+1; 84 | offsets[i] = i; 85 | if (i > 0) 86 | { 87 | offsets[i] += offsets[i-1]; 88 | } 89 | } 90 | if (world_rank==0) { 91 | const char* text2 = "recvcounts vector"; 92 | print_vector(recvcounts, world_size, world_rank, text2); 93 | const char* text3 = "offsets vector"; 94 | print_vector(offsets, world_size, world_rank, text3); 95 | } 96 | 97 | 98 | // Creamos el vector completo 99 | int *full_vec = new int[total_size]; 100 | 101 | // Y utilizamos la operación colectiva AllGather pero variable: 102 | // (ya que no todos los procesos envían la misma cantida de elementos) 103 | err = MPI_Allgatherv(local_vec, local_size, MPI_INT, full_vec, recvcounts, offsets, MPI_INT, MPI_COMM_WORLD); 104 | // int MPI_Allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, 105 | // void *recvbuf, const int *recvcounts, const int *displs, 106 | // MPI_Datatype recvtype, MPI_Comm comm) 107 | 108 | const char* text4 = "full vector"; 109 | print_vector(full_vec, total_size, world_rank, text4); 110 | 111 | // No olvidemos de liberar la memoria reservada en el heap 112 | delete[] local_vec; 113 | delete[] full_vec; 114 | 115 | 116 | MPI_Finalize(); 117 | } -------------------------------------------------------------------------------- /AY05/README.md: -------------------------------------------------------------------------------- 1 | ## Instalación MPI en WSL 2 | 3 | Para instalar MPI en WSL, pueden usar los siguientes comandos en consola: 4 | 5 | sudo apt-get update 6 | sudo apt install openmpi-bin libopenmpi-dev 7 | 8 | # Clúster 9 | 10 | 
Nuestros códigos los correremos en el clúster de ingeniería. Para conectarse debemos usar SSH, es decir, escribir en consola: 11 | 12 | ```ssh username@cluster.ing.uc.cl``` 13 | 14 | o antes también se usaba: (todavía me sigue funcionando) 15 | 16 | ```ssh username@mazinger.ing.puc.cl``` 17 | 18 | Siendo ```username``` el nombre de usuario de su correo UC. Luego de escribir lo anterior, les preguntará una contraseña (se las enviaré a sus mails). 19 | 20 | Una vez dentro, se pueden usar los comandos típicos que usamos en consola: ```ls```, ```cd```, ```rm```, ```mkdir```, ```htop``` (para ver los procesos corriendo en el servidor), entre otros. 21 | 22 | Si quieren cambiar su contraseña del servidor, pueden usar el comando ```passwd```. 23 | 24 | ## Subir y recibir archivos 25 | 26 | - Opción 1: ```scp -r local_dir username@cluster.ing.uc.cl:server_dir``` 27 | 28 | La línea anterior se corre en consola y el -r sirve para subir una carpeta (si se quita, se sube el archivo indicado en el path). Por ejemplo: tengo mi carpeta ```AY05``` y quiero subirla a mi carpeta del servidor, entonces, ubicado en el path de mi carpeta en la consola, escribo el comando ```scp -r AY05/ alberto.almuna@cluster.ing.uc.cl:~```, les preguntará por su contraseña. Este comando también se puede utilizar para enviar archivos desde el clúster a nuestro computador y solo basta con intercambiar el destino y origen (Ej: ```scp alberto.almuna@cluster.ing.uc.cl:~/log.out ./AY05```) 29 | 30 | - Opción 2: crear un repositorio de Github y clonarlo en su computador y en el servidor, para luego transferir archivos con push y pull. 31 | 32 | ## Correr código 33 | 34 | Para correr nuestros códigos podemos hacer lo mismo que en nuestro computador o pedirle al servidor que lo agregue a su cola de trabajo (cuando queramos correr algo grande). 
35 | 36 | La forma de compilar y correr nuestro código con MPI es la siguiente: 37 | 38 | - Antes de poder compilar con ```mpic++``` en el clúster, es necesario correr el comando ```module load mpi/openmpi-x86_64``` en la consola del servidor (cada vez que nos conectemos es necesario volver a correr el comando anterior). Esto carga el módulo asociado a MPI, lo que significa que ahora podremos compilar y correr nuestros scripts de la misma manera que en nuestro computador personal. En caso de no correr la línea anterior luego de hacer login e intentar compilar, nos aparecerá un error de que no se encuentra el comando ```mpic++```. 39 | - ```mpic++ code.cpp -std=c++11``` -> compila el código y lo guarda en a.out 40 | - ```mpirun ./a.out``` -> corre el ejecutable a.out 41 | - ```mpirun -np 2 ./a.out``` -> corre el ejecutable a.out con 2 procesos 42 | 43 | Y para agregarlo a la cola de trabajo (con el job.sh creado): 44 | 45 | - ```sbatch job.sh``` -> agrega el trabajo job.sh a la cola de trabajo. Ojo que también es necesario correr el comando ```module load mpi/openmpi-x86_64``` antes de mandar el trabajo a la cola. 46 | 47 | En nuestro caso, una vez que termine el job, creará un archivo ```log.out``` con el resultado obtenido. Si queremos leerlo rápidamente, podemos usar el comando ```cat log.out```. 48 | 49 | Para obtener información de la cola podemos usar ```squeue```, ```top``` o ```htop```. 50 | 51 | Para más datos sobre el clúster de Ing: https://deg.ing.uc.cl/informatica/cluster/ 52 | 53 | ## Correr Python en el clúster 54 | 55 | En la tarea es necesario también correr un código de Python para generar la matriz pedida y, si queremos hacer un ejemplo bien grande, es buena idea correr este script también en el clúster. 
En dicho caso, puede ocurrir que no estén todas las librerías de Python necesarias directamente instaladas en el servidor, por lo que, para instalarlas, recomiendo seguir los pasos descritos en la documentación del clúster de ingeniería (https://dt.ing.uc.cl/recursos/cluster/ en la pestaña de "Software" y en la sección de Python). Acá se menciona que cada usuario debe usar Anaconda para administrar los paquetes extras y se indican las instrucciones para la instalación. 56 | 57 | ## Video Clúster 58 | 59 | Hace un par de años hice un video que revisa todo lo importante relacionado al clúster (quizás ya toca hacerle una actualización, pero se los dejo disponible por si les sirve): 60 | 61 | https://youtu.be/LqeU8yo_b-w 62 | 63 | ### IMPORTANTE: Cuando hice este video no era necesario correr el comando de ```module load mpi/openmpi-x86_64``` antes de querer compilar códigos de MPI, sin embargo, si ahora intentan ejecutar ```mpic++``` sin correr la línea anterior, el servidor les tirará un error porque no será capaz de encontrar el comando. 64 | 65 | Otro detalle es que uso el ssh con "mazinger.ing.puc.cl", pero hace poco se cambió a "cluster.ing.uc.cl". A mi todavía me funciona el anterior, pero igual les recomiendo utilizar la dirección más nueva. 66 | 67 | -------------------------------------------------------------------------------- /AY08/Numba/6_Numba_caching.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "770a296f", 6 | "metadata": {}, 7 | "source": [ 8 | "# Caching with Numba\n", 9 | "\n", 10 | " - Elwin van 't Wout\n", 11 | " - Pontificia Universidad Católica de Chile\n", 12 | " - IMT3870\n", 13 | " - 28-8-2023\n", 14 | "\n", 15 | "Cache a function optimised by Numba." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "id": "217c54e1", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import time\n", 26 | "import numpy as np\n", 27 | "from numba import jit" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "id": "6192058c", 33 | "metadata": {}, 34 | "source": [ 35 | "Numba uses JIT to compile functions when they are called for the first time. The compiled function will be immediately used for subsequent calls. This means that after restarting the kernel or shutting down the Jupyter notebook, the compiled function will be *lost*. Numba can store the compiled function on disk, called *caching* (not to be confused with the cache memory)." 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "id": "2f57b2ac", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "@jit(nopython=True, cache=False)\n", 46 | "def sum_vector_no_cache(a):\n", 47 | " s = np.sum(a)\n", 48 | " return s\n", 49 | "\n", 50 | "@jit(nopython=True, cache=True)\n", 51 | "def sum_vector_cached(a):\n", 52 | " s = np.sum(a)\n", 53 | " return s" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "id": "eb370ff0", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "n = int(1e7)\n", 64 | "vec = np.arange(n)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "c63e69dc", 70 | "metadata": {}, 71 | "source": [ 72 | "The first call is always slow because the code needs to be compiled." 
73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "id": "d171e3dc", 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "The 1st call to the Numba function took: 0.5009028911590576 seconds.\n", 86 | "The 2nd call to the Numba function took: 0.012819528579711914 seconds.\n", 87 | "The 3rd call to the Numba function took: 0.001155853271484375 seconds.\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "time_1 = time.time()\n", 93 | "sum_vector_no_cache(vec)\n", 94 | "time_2 = time.time()\n", 95 | "sum_vector_no_cache(vec)\n", 96 | "time_3 = time.time()\n", 97 | "sum_vector_no_cache(vec)\n", 98 | "time_4 = time.time()\n", 99 | "print(\"The 1st call to the Numba function took:\",time_2-time_1,\"seconds.\")\n", 100 | "print(\"The 2nd call to the Numba function took:\",time_3-time_2,\"seconds.\")\n", 101 | "print(\"The 3rd call to the Numba function took:\",time_4-time_3,\"seconds.\")" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 5, 107 | "id": "2b2876b2", 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "name": "stdout", 112 | "output_type": "stream", 113 | "text": [ 114 | "The 1st call to the Numba function took: 0.03046727180480957 seconds.\n", 115 | "The 2nd call to the Numba function took: 0.0 seconds.\n", 116 | "The 3rd call to the Numba function took: 0.0 seconds.\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "time_1 = time.time()\n", 122 | "sum_vector_cached(vec)\n", 123 | "time_2 = time.time()\n", 124 | "sum_vector_cached(vec)\n", 125 | "time_3 = time.time()\n", 126 | "sum_vector_cached(vec)\n", 127 | "time_4 = time.time()\n", 128 | "print(\"The 1st call to the Numba function took:\",time_2-time_1,\"seconds.\")\n", 129 | "print(\"The 2nd call to the Numba function took:\",time_3-time_2,\"seconds.\")\n", 130 | "print(\"The 3rd call to the Numba function took:\",time_4-time_3,\"seconds.\")" 131 | ] 132 | }, 133 | { 134 | 
"cell_type": "markdown", 135 | "id": "2f755ede", 136 | "metadata": {}, 137 | "source": [ 138 | "The very first time this Notebook is used, Numba needs to perform the optimisation and compilation, which takes time. Next time, the first call will be fast as well. It is slightly slower than the second and third call since the compiled code needs to be read from disk, but much quicker than without caching." 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "id": "5b86341a", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [] 148 | } 149 | ], 150 | "metadata": { 151 | "kernelspec": { 152 | "display_name": "Python 3", 153 | "language": "python", 154 | "name": "python3" 155 | }, 156 | "language_info": { 157 | "codemirror_mode": { 158 | "name": "ipython", 159 | "version": 3 160 | }, 161 | "file_extension": ".py", 162 | "mimetype": "text/x-python", 163 | "name": "python", 164 | "nbconvert_exporter": "python", 165 | "pygments_lexer": "ipython3", 166 | "version": "3.11.2" 167 | } 168 | }, 169 | "nbformat": 4, 170 | "nbformat_minor": 5 171 | } 172 | -------------------------------------------------------------------------------- /AY08/Numba/4_Numba_race_condition.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "68659313", 6 | "metadata": {}, 7 | "source": [ 8 | "# Race conditions in parallel Numba\n", 9 | "\n", 10 | " - Elwin van 't Wout\n", 11 | " - Pontificia Universidad Católica de Chile\n", 12 | " - IMT3870\n", 13 | " - 26-8-2024\n", 14 | "\n", 15 | "This tutorial shows a race condition in a parallel for-loop that leads to code that is not thread safe." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "id": "8cfe49b2", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from numba import njit, prange\n", 26 | "import numpy as np" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "9a315fde", 32 | "metadata": {}, 33 | "source": [ 34 | "The following code sums the elements of a vector." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "id": "ecdd83c9", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "@njit(parallel=True)\n", 45 | "def sum_vector(x):\n", 46 | " length = x.shape[0]\n", 47 | " s = 0\n", 48 | " for i in prange(length):\n", 49 | " s += x[i]\n", 50 | " return s " 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "id": "04b98951", 56 | "metadata": {}, 57 | "source": [ 58 | "The expected sum of a vector with elements $0,1,2,\\dots,n-1$ is $(n-1)n/2$." 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "id": "fc4d5095", 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "The vector with 100000 elements sums to 4999950000\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "n = int(1e5)\n", 77 | "vec = np.arange(n)\n", 78 | "sum_exact = int((n-1)*n/2)\n", 79 | "print(\"The vector with\",n,\"elements sums to\",sum_exact)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "id": "38a3e9ad", 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "The sum calculated by Numba is: 4999950000\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "sum_numba = sum_vector(vec)\n", 98 | "print(\"The sum calculated by Numba is:\",sum_numba)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "id": "33307306", 104 | "metadata": {}, 105 | "source": [ 106 | "Now, let us create an array with four elements, and calculate the sum of the 
input vector for each element of the output array. With Python broadcasting, you can add each element of the input array to the entire output array." 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 5, 112 | "id": "85447bdd", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "@njit(parallel=True)\n", 117 | "def sum_vector_in_array_race_condition(x):\n", 118 | " length = x.shape[0]\n", 119 | " s = np.zeros(4, dtype=np.int_)\n", 120 | " for i in prange(length):\n", 121 | " s[:] += x[i]\n", 122 | " return s" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 6, 128 | "id": "974abac2", 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "The sum calculated by Numba is: [1927748978 2055798753 1979814415 2010307473]\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "sum_race = sum_vector_in_array_race_condition(vec)\n", 141 | "print(\"The sum calculated by Numba is:\",sum_race)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "id": "797b1e6f", 147 | "metadata": {}, 148 | "source": [ 149 | "The four elements of the output array should all be the sum of the input vector, but this is not the case. The code is not thread safe! Each element is different because there is a race condition." 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "id": "ab24b8f0", 155 | "metadata": {}, 156 | "source": [ 157 | "The following adaptation solves the race condition because it changes the data types such that Numba understands the race condition and parallelises the code correctly. Specifically, you need to create a slice reference outside the loop." 
158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 7, 163 | "id": "9eb742e3", 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "@njit(parallel=True)\n", 168 | "def sum_vector_in_array_safe(x):\n", 169 | " length = x.shape[0]\n", 170 | " y = np.zeros(4, dtype=np.int_)\n", 171 | " s = y[:]\n", 172 | " for i in prange(length):\n", 173 | " s += x[i]\n", 174 | " return s" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 8, 180 | "id": "d86836cc", 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "The sum calculated by Numba is: [704982704 704982704 704982704 704982704]\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "sum_safe = sum_vector_in_array_safe(vec)\n", 193 | "print(\"The sum calculated by Numba is:\",sum_safe)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "id": "df608a2a", 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 3", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.11.2" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 5 226 | } 227 | -------------------------------------------------------------------------------- /AY06/3_OMP_matvec.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Comandos útiles: 3 | Compilar código: g++ -o name_output 3_OMP_matvec.cpp -fopenmp 4 | Correr código: ./name_output 5 | */ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | // Función que genera un vector de 
largo n con valores entre -rango y +rango 13 | float* vector_generator(int n, int range) 14 | { 15 | float* vector = (float*) calloc(n, sizeof(float)); 16 | 17 | for (int i=0; i( end - start ); 175 | 176 | // Vemos cuánto se tardó en calcular el resultado 177 | printf("Tiempo MATVEC normal: %.9f seconds\n", execution.count() * 1e-9); 178 | 179 | // Repitamos el cálculo anterior pero ahora con OpenMP 180 | start = std::chrono::high_resolution_clock::now(); 181 | float* result2 = mat_vec_par(mat, vec, n); 182 | end = std::chrono::high_resolution_clock::now(); 183 | execution = std::chrono::duration_cast( end - start ); 184 | 185 | // Vemos cuánto se tardó en calcular el resultado 186 | printf("Tiempo MATVEC paralelo: %.9f seconds\n", execution.count() * 1e-9); 187 | 188 | // No olvidemos liberar la memoria :D 189 | free(vec); 190 | free_matrix(mat, n); 191 | free(result); 192 | free(result2); 193 | 194 | break; 195 | } 196 | 197 | default: 198 | printf("default"); 199 | break; 200 | } 201 | 202 | return 0; 203 | } -------------------------------------------------------------------------------- /AY08/Numba/1_PythonDecorators.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "711ca4c9", 6 | "metadata": {}, 7 | "source": [ 8 | "# Python decorators\n", 9 | "\n", 10 | " - Elwin van 't Wout\n", 11 | " - Pontificia Universidad Católica de Chile\n", 12 | " - IMT3870\n", 13 | " - 26-8-2024\n", 14 | " \n", 15 | "This tutorial shows the functionality of Python *decorators*. A *decorator* is a programming construction that adapts functions." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "98df67de", 21 | "metadata": {}, 22 | "source": [ 23 | "A Python *function* can take Python objects as input and output. An often used construction is taking a number, or array of numbers, as input of a function, and another number, or array of numbers, as output. 
Following is an example." 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "id": "1f86e3de", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "def my_square(x):\n", 34 | " return x**2" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "id": "299ead5b", 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "4" 47 | ] 48 | }, 49 | "execution_count": 2, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "my_square(2)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "id": "9d5ec8a5", 61 | "metadata": {}, 62 | "source": [ 63 | "Python functions are objects themselves and can, therefore, be used as input and output of another Python function. The following example takes an arbitrary function, performs additional timing statistics, and returns this new function." 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "id": "44ad541a", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "import time\n", 74 | "\n", 75 | "def timer(fun):\n", 76 | " def function_execution(*args):\n", 77 | " print(\"Start execution of function\", fun.__name__, \"at\", time.asctime())\n", 78 | " start = time.perf_counter()\n", 79 | " output_value = fun(*args)\n", 80 | " finish = time.perf_counter()\n", 81 | " print(\"Finished execution in\", finish - start, \"seconds\")\n", 82 | " return output_value\n", 83 | " return function_execution" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 4, 89 | "id": "f23edf05", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "my_timed_square = timer(my_square)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 5, 99 | "id": "1af60fff", 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "Start execution of function my_square at Fri Oct 
18 10:10:52 2024\n", 107 | "Finished execution in 1.500000053056283e-06 seconds\n" 108 | ] 109 | }, 110 | { 111 | "data": { 112 | "text/plain": [ 113 | "4" 114 | ] 115 | }, 116 | "execution_count": 5, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "my_timed_square(2)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "8be2eda8", 128 | "metadata": {}, 129 | "source": [ 130 | "The idea of decorators is to simplify this process. Above, we needed to create a separate function `my_timed_square` to use the timer for the square operation. However, we might want to use the timing capabilities for other functions as well, like for calculating the cube of a number. The timing functionality can be reused for any function with a *decorator*." 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 6, 136 | "id": "7679f533", 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "@timer\n", 141 | "def my_cube(x):\n", 142 | " return x**3" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 7, 148 | "id": "aa7bc033", 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "Start execution of function my_cube at Fri Oct 18 10:11:06 2024\n", 156 | "Finished execution in 1.6999999843392288e-06 seconds\n" 157 | ] 158 | }, 159 | { 160 | "data": { 161 | "text/plain": [ 162 | "8" 163 | ] 164 | }, 165 | "execution_count": 7, 166 | "metadata": {}, 167 | "output_type": "execute_result" 168 | } 169 | ], 170 | "source": [ 171 | "my_cube(2)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "id": "12e956ba", 177 | "metadata": {}, 178 | "source": [ 179 | "Notice that we can call the cube function immediately, without creating an additional function." 
180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "id": "a888af5f", 185 | "metadata": {}, 186 | "source": [ 187 | "Notice that the decorator only takes the function on the next line, not all functions in a cell." 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 8, 193 | "id": "2bd25948", 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "@timer\n", 198 | "def my_fourth_power(x):\n", 199 | " return x**4\n", 200 | "\n", 201 | "def my_fifth_power(x):\n", 202 | " return x**5" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 9, 208 | "id": "49590e09", 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "Start execution of function my_fourth_power at Fri Oct 18 10:11:19 2024\n", 216 | "Finished execution in 1.2999998943996616e-06 seconds\n" 217 | ] 218 | }, 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "16" 223 | ] 224 | }, 225 | "execution_count": 9, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "my_fourth_power(2)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 10, 237 | "id": "69a98e85", 238 | "metadata": {}, 239 | "outputs": [ 240 | { 241 | "data": { 242 | "text/plain": [ 243 | "32" 244 | ] 245 | }, 246 | "execution_count": 10, 247 | "metadata": {}, 248 | "output_type": "execute_result" 249 | } 250 | ], 251 | "source": [ 252 | "my_fifth_power(2)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "id": "e0d816a9", 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [] 262 | } 263 | ], 264 | "metadata": { 265 | "kernelspec": { 266 | "display_name": "Python 3", 267 | "language": "python", 268 | "name": "python3" 269 | }, 270 | "language_info": { 271 | "codemirror_mode": { 272 | "name": "ipython", 273 | "version": 3 274 | }, 275 | "file_extension": ".py", 276 | "mimetype": 
"text/x-python", 277 | "name": "python", 278 | "nbconvert_exporter": "python", 279 | "pygments_lexer": "ipython3", 280 | "version": "3.11.2" 281 | } 282 | }, 283 | "nbformat": 4, 284 | "nbformat_minor": 5 285 | } 286 | -------------------------------------------------------------------------------- /AY01/Tutorial_joblib_2_reuse.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "id": "a7056a3f", 7 | "metadata": {}, 8 | "source": [ 9 | "# Ayudantía 1 - Notebook 2\n", 10 | "### Profesor: Elwin van 't Wout\n", 11 | "### Ayudante: Alberto Almuna Morales (alberto.almuna@uc.cl)" 12 | ] 13 | }, 14 | { 15 | "attachments": {}, 16 | "cell_type": "markdown", 17 | "id": "a032fff2", 18 | "metadata": {}, 19 | "source": [ 20 | "The library ```joblib``` provides functionality for parallel computing. In this notebook, let us use a parallel pool of workers for different tasks." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "id": "34a5963a", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import numpy as np" 31 | ] 32 | }, 33 | { 34 | "attachments": {}, 35 | "cell_type": "markdown", 36 | "id": "834bf49b", 37 | "metadata": {}, 38 | "source": [ 39 | "The `Parallel` class of `joblib` creates a pool of workers to which tasks can be assigned. This pool of workers can be reused for different sets of tasks." 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "id": "5225b2c9", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "from joblib import Parallel, delayed" 50 | ] 51 | }, 52 | { 53 | "attachments": {}, 54 | "cell_type": "markdown", 55 | "id": "46e35cb6", 56 | "metadata": {}, 57 | "source": [ 58 | "Let us create a pool with the maximum number of workers available on our machine. 
Specifying the number of jobs as minus one means the maximum number of workers that can automatically be found on the machine." 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "id": "bbd0465c", 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "Number of cores found by joblib: 8\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "from joblib import cpu_count\n", 77 | "print(\"Number of cores found by joblib:\", cpu_count())" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 4, 83 | "id": "3db86182", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "parallel_pool = Parallel(n_jobs=-1)" 88 | ] 89 | }, 90 | { 91 | "attachments": {}, 92 | "cell_type": "markdown", 93 | "id": "03c1b471", 94 | "metadata": {}, 95 | "source": [ 96 | "Let us create two different functions we like to perform: taking the square and the square root. For the square root, we can use the `Numpy` function, but for the square we create our own function." 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "id": "df3e02e7", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "def my_square(n):\n", 107 | " return n**2\n", 108 | "parallel_square = delayed(my_square)\n", 109 | "parallel_root = delayed(np.sqrt)" 110 | ] 111 | }, 112 | { 113 | "attachments": {}, 114 | "cell_type": "markdown", 115 | "id": "6fd79d41", 116 | "metadata": {}, 117 | "source": [ 118 | "Creating the tasks requires specifying the input variables. For the square, let us use a uniform sample for values between zero and one. For the square root, we'd like to use the previous output and check if the result is the input again. Hence, we first need to perform the square operations." 
119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 8, 124 | "id": "d011898d", 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "Input values are: [0. 0.11111111 0.22222222 0.33333333 0.44444444 0.55555556\n", 132 | " 0.66666667 0.77777778 0.88888889 1. ]\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "input_values = np.linspace(0,1,10)\n", 138 | "print(\"Input values are:\", input_values)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 9, 144 | "id": "a756c03c", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "parallel_tasks_square = [parallel_square(i) for i in input_values]" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 10, 154 | "id": "25f7702c", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "parallel_results_square = parallel_pool(parallel_tasks_square)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 11, 164 | "id": "e7a16225", 165 | "metadata": {}, 166 | "outputs": [ 167 | { 168 | "name": "stdout", 169 | "output_type": "stream", 170 | "text": [ 171 | "The squares of input values are: [0.0, 0.012345679012345678, 0.04938271604938271, 0.1111111111111111, 0.19753086419753085, 0.308641975308642, 0.4444444444444444, 0.6049382716049381, 0.7901234567901234, 1.0]\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "print(\"The squares of input values are:\", parallel_results_square)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 12, 182 | "id": "ed4f1782", 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "parallel_tasks_root = [parallel_root(i) for i in parallel_results_square]" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 13, 192 | "id": "61cdb6a0", 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "parallel_results_root = 
parallel_pool(parallel_tasks_root)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 14, 202 | "id": "58506510", 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "name": "stdout", 207 | "output_type": "stream", 208 | "text": [ 209 | "The square roots of the squares of the input values are: [0.0, 0.1111111111111111, 0.2222222222222222, 0.3333333333333333, 0.4444444444444444, 0.5555555555555556, 0.6666666666666666, 0.7777777777777777, 0.8888888888888888, 1.0]\n" 210 | ] 211 | } 212 | ], 213 | "source": [ 214 | "print(\"The square roots of the squares of the input values are:\", parallel_results_root)" 215 | ] 216 | }, 217 | { 218 | "attachments": {}, 219 | "cell_type": "markdown", 220 | "id": "ddaf4ddf", 221 | "metadata": {}, 222 | "source": [ 223 | "Notice that the square root of the square of the input values are indeed the input variables, perhaps with a small rounding error. Also, the same pool of workers was used twice: the tasks needed to be defined again but the same worker pool can be used many times. Reusing the same worker pool tends to be quicker since it can be initialized once and applied to different tasks." 
224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "id": "e41c04ac", 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [] 233 | } 234 | ], 235 | "metadata": { 236 | "kernelspec": { 237 | "display_name": "Python 3.10.4 64-bit", 238 | "language": "python", 239 | "name": "python3" 240 | }, 241 | "language_info": { 242 | "codemirror_mode": { 243 | "name": "ipython", 244 | "version": 3 245 | }, 246 | "file_extension": ".py", 247 | "mimetype": "text/x-python", 248 | "name": "python", 249 | "nbconvert_exporter": "python", 250 | "pygments_lexer": "ipython3", 251 | "version": "3.11.2" 252 | }, 253 | "vscode": { 254 | "interpreter": { 255 | "hash": "7600a12950a547366bb7a6732117e300ffd26224351912980486e1126c5d0f9a" 256 | } 257 | } 258 | }, 259 | "nbformat": 4, 260 | "nbformat_minor": 5 261 | } 262 | -------------------------------------------------------------------------------- /AY08/Numba/2_Numba_vector_addition.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "770a296f", 6 | "metadata": {}, 7 | "source": [ 8 | "# Basic usage of Numba\n", 9 | "\n", 10 | " - Elwin van 't Wout\n", 11 | " - Pontificia Universidad Católica de Chile\n", 12 | " - IMT3870\n", 13 | " - 26-8-2024\n", 14 | "\n", 15 | "Sum the values of a vector and compare the timing between Python, Numpy, and Numba." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "id": "217c54e1", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np\n", 26 | "from numba import jit" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "id": "14629e10", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "def sum_vector_python(a):\n", 37 | " s = 0\n", 38 | " for i in range(a.size):\n", 39 | " s += a[i]\n", 40 | " return s " 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "id": "2f57b2ac", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "def sum_vector_numpy(a):\n", 51 | " s = np.sum(a)\n", 52 | " return s " 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "id": "f366e150", 58 | "metadata": {}, 59 | "source": [ 60 | "For Numba, we can use exactly the same function as before but with the Numba decorator added.\n", 61 | "\n", 62 | "Use the option ```nopython=True``` to use the performance optimisation of Numba. Alternatively, one can use ```@njit```.\n", 63 | "\n", 64 | "Remark: earlier versions of Numba had the option of using ```nopython=False```, which basically ran the Python code as is, without any optimization. However, this option is deprecated, and more recent versions do not support this functionality anymore. Depending on the version, you may receive a warning or Numba just ignores the option." 
65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "id": "183a1d41", 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "@jit(nopython=True)\n", 75 | "def sum_vector_numba_nopython(a):\n", 76 | " s = 0\n", 77 | " for i in range(a.size):\n", 78 | " s += a[i]\n", 79 | " return s" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 5, 85 | "id": "d8a6f955-5a3f-4283-b222-66d19c76b91c", 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stderr", 90 | "output_type": "stream", 91 | "text": [ 92 | "C:\\Users\\alber\\AppData\\Local\\Temp\\ipykernel_11680\\3605788378.py:1: NumbaDeprecationWarning: \u001b[1mThe keyword argument 'nopython=False' was supplied. From Numba 0.59.0 the default is being changed to True and use of 'nopython=False' will raise a warning as the argument will have no effect. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n", 93 | " @jit(nopython=False)\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "@jit(nopython=False)\n", 99 | "def sum_vector_numba_python(a):\n", 100 | " s = 0\n", 101 | " for i in range(a.size):\n", 102 | " s += a[i]\n", 103 | " return s" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "id": "f2644c21", 109 | "metadata": {}, 110 | "source": [ 111 | "Let us create a vector with elements $0,1,2,\\dots,n-1$ and calculate the sum." 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 6, 117 | "id": "6d76abcf", 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "n = int(1e7)\n", 122 | "vec = np.arange(n)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "c63e69dc", 128 | "metadata": {}, 129 | "source": [ 130 | "Be careful with the timing of the Numba function. The first call is always slow because the code needs to be compiled." 
131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 11, 136 | "id": "d171e3dc", 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "The 1st call to the Numba function took: 0.0028438568115234375 seconds.\n", 144 | "The 2nd call to the Numba function took: 0.0 seconds.\n", 145 | "The 3rd call to the Numba function took: 0.009637832641601562 seconds.\n" 146 | ] 147 | } 148 | ], 149 | "source": [ 150 | "import time\n", 151 | "time_1 = time.time()\n", 152 | "sum_vector_numba_nopython(vec)\n", 153 | "time_2 = time.time()\n", 154 | "sum_vector_numba_nopython(vec)\n", 155 | "time_3 = time.time()\n", 156 | "sum_vector_numba_nopython(vec)\n", 157 | "time_4 = time.time()\n", 158 | "print(\"The 1st call to the Numba function took:\",time_2-time_1,\"seconds.\")\n", 159 | "print(\"The 2nd call to the Numba function took:\",time_3-time_2,\"seconds.\")\n", 160 | "print(\"The 3rd call to the Numba function took:\",time_4-time_3,\"seconds.\")" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 12, 166 | "id": "d5f596ee-9fba-44df-b3ce-d31c017181d8", 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "The 1st call to the Numba function took: 0.0029931068420410156 seconds.\n", 174 | "The 2nd call to the Numba function took: 0.005274295806884766 seconds.\n", 175 | "The 3rd call to the Numba function took: 0.00500035285949707 seconds.\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "import time\n", 181 | "time_1 = time.time()\n", 182 | "sum_vector_numba_python(vec)\n", 183 | "time_2 = time.time()\n", 184 | "sum_vector_numba_python(vec)\n", 185 | "time_3 = time.time()\n", 186 | "sum_vector_numba_python(vec)\n", 187 | "time_4 = time.time()\n", 188 | "print(\"The 1st call to the Numba function took:\",time_2-time_1,\"seconds.\")\n", 189 | "print(\"The 2nd call to the Numba function 
took:\",time_3-time_2,\"seconds.\")\n", 190 | "print(\"The 3rd call to the Numba function took:\",time_4-time_3,\"seconds.\")" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 13, 196 | "id": "c6814cce", 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "name": "stderr", 201 | "output_type": "stream", 202 | "text": [ 203 | "C:\\Users\\alber\\AppData\\Local\\Temp\\ipykernel_11680\\1815385635.py:4: RuntimeWarning: overflow encountered in scalar add\n", 204 | " s += a[i]\n" 205 | ] 206 | }, 207 | { 208 | "name": "stdout", 209 | "output_type": "stream", 210 | "text": [ 211 | "822 ms ± 32.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "%%timeit\n", 217 | "sum_vector_python(vec)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 14, 223 | "id": "728db4c7", 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "name": "stdout", 228 | "output_type": "stream", 229 | "text": [ 230 | "3.71 ms ± 154 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" 231 | ] 232 | } 233 | ], 234 | "source": [ 235 | "%%timeit\n", 236 | "sum_vector_numpy(vec)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 15, 242 | "id": "192b5184", 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "2.73 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" 250 | ] 251 | } 252 | ], 253 | "source": [ 254 | "%%timeit\n", 255 | "sum_vector_numba_nopython(vec)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 16, 261 | "id": "a2bb42c8-e541-411b-be97-e6e251881f99", 262 | "metadata": {}, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "2.68 ms ± 112 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)\n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "%%timeit\n", 274 | "sum_vector_numba_python(vec)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "id": "8c5670b1", 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "id": "aeff57f0", 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [] 292 | } 293 | ], 294 | "metadata": { 295 | "kernelspec": { 296 | "display_name": "Python 3", 297 | "language": "python", 298 | "name": "python3" 299 | }, 300 | "language_info": { 301 | "codemirror_mode": { 302 | "name": "ipython", 303 | "version": 3 304 | }, 305 | "file_extension": ".py", 306 | "mimetype": "text/x-python", 307 | "name": "python", 308 | "nbconvert_exporter": "python", 309 | "pygments_lexer": "ipython3", 310 | "version": "3.11.2" 311 | } 312 | }, 313 | "nbformat": 4, 314 | "nbformat_minor": 5 315 | } 316 | -------------------------------------------------------------------------------- /AY08/Numba/3_Numba_vector_addition_parallel.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "770a296f", 6 | "metadata": {}, 7 | "source": [ 8 | "# Parallel functionality of Numba\n", 9 | "\n", 10 | " - Elwin van 't Wout\n", 11 | " - Pontificia Universidad Católica de Chile\n", 12 | " - IMT3870\n", 13 | " - 26-8-2024\n", 14 | "\n", 15 | "Sum the values of a vector and compare the timing between parallelised versions." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "id": "217c54e1", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np\n", 26 | "from numba import njit, prange" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "id": "14629e10", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "def sum_vector_python(a):\n", 37 | " s = 0\n", 38 | " for i in range(a.size):\n", 39 | " s += a[i]\n", 40 | " return s " 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "id": "2f57b2ac", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "def sum_vector_numpy(a):\n", 51 | " s = np.sum(a)\n", 52 | " return s " 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "id": "f366e150", 58 | "metadata": {}, 59 | "source": [ 60 | "We can use Numba to optimize the Python code through its JIT capabilities. Moreover, Numba can automatically parallelise code through the multi-threading paradigm. As a serial baseline for comparison, first set the option ```parallel=False```." 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "id": "183a1d41", 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "@njit(parallel=False)\n", 71 | "def sum_vector_numba_serial(a):\n", 72 | " s = 0\n", 73 | " for i in range(a.size):\n", 74 | " s += a[i]\n", 75 | " return s " 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "id": "c5f85486", 81 | "metadata": {}, 82 | "source": [ 83 | "Adding the parallel option to the Numba decorator makes Numba search for parts of the code that can be parallelised. Add the option ```parallel=True``` for automatic parallelisation. In earlier versions, this will only work when ```nopython=True```."
84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "id": "b31b18fc", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "@njit(parallel=True)\n", 94 | "def sum_vector_numba_parallel(a):\n", 95 | " s = 0\n", 96 | " for i in range(a.size):\n", 97 | " s += a[i]\n", 98 | " return s " 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "id": "ad1e849b", 104 | "metadata": {}, 105 | "source": [ 106 | "Instead of letting Numba search for parallelisation opportunities, you can also explicitly state that a for loop needs to be parallelised. Use the function ```prange()``` instead of the standard ```range()``` in the for loop. In this case, Numba automatically detects that the variable ```s``` for the sum is a shared variable and solves issues with race conditions." 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 6, 112 | "id": "3b69d84b", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "@njit(parallel=True)\n", 117 | "def sum_vector_numba_prange(a):\n", 118 | " s = 0\n", 119 | " for i in prange(a.size):\n", 120 | " s += a[i]\n", 121 | " return s " 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "id": "f2644c21", 127 | "metadata": {}, 128 | "source": [ 129 | "Let us create a vector with elements $0,1,2,\\dots,n-1$ and calculate the sum." 
130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 7, 135 | "id": "6d76abcf", 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "n = int(1e7)\n", 140 | "vec = np.arange(n)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "id": "3a4a4710", 146 | "metadata": {}, 147 | "source": [ 148 | "Before performing the timings, call the Numba functions once, so that they are compiled" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 8, 154 | "id": "0188ef52", 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "Sum of vector with serial Numba: 49999995000000\n", 162 | "Sum of vector with parallel Numba: 49999995000000\n" 163 | ] 164 | }, 165 | { 166 | "name": "stderr", 167 | "output_type": "stream", 168 | "text": [ 169 | "c:\\Users\\alber\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\numba\\core\\typed_passes.py:334: NumbaPerformanceWarning: \u001b[1m\n", 170 | "The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.\n", 171 | "\n", 172 | "To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.\n", 173 | "\u001b[1m\n", 174 | "File \"..\\..\\..\\..\\..\\..\\..\\AppData\\Local\\Temp\\ipykernel_13076\\1770870890.py\", line 1:\u001b[0m\n", 175 | "\u001b[1m\u001b[0m\n", 176 | "\u001b[0m\n", 177 | " warnings.warn(errors.NumbaPerformanceWarning(msg,\n" 178 | ] 179 | }, 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "Sum of vector with prange Numba: 49999995000000\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "print(\"Sum of vector with serial Numba:\", sum_vector_numba_serial(vec))\n", 190 | "print(\"Sum of vector with parallel Numba:\", sum_vector_numba_parallel(vec))\n", 191 | "print(\"Sum of vector with prange Numba:\", 
sum_vector_numba_prange(vec))" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "id": "678670a6", 197 | "metadata": {}, 198 | "source": [ 199 | "Numba may give warnings when it cannot perform the requested optimisation of the code." 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 9, 205 | "id": "c6814cce", 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "name": "stderr", 210 | "output_type": "stream", 211 | "text": [ 212 | "C:\\Users\\alber\\AppData\\Local\\Temp\\ipykernel_13076\\1815385635.py:4: RuntimeWarning: overflow encountered in scalar add\n", 213 | " s += a[i]\n" 214 | ] 215 | }, 216 | { 217 | "name": "stdout", 218 | "output_type": "stream", 219 | "text": [ 220 | "843 ms ± 35.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "%%timeit\n", 226 | "sum_vector_python(vec)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 10, 232 | "id": "728db4c7", 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "name": "stdout", 237 | "output_type": "stream", 238 | "text": [ 239 | "3.64 ms ± 101 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" 240 | ] 241 | } 242 | ], 243 | "source": [ 244 | "%%timeit\n", 245 | "sum_vector_numpy(vec)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 11, 251 | "id": "192b5184", 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "2.6 ms ± 73.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" 259 | ] 260 | } 261 | ], 262 | "source": [ 263 | "%%timeit\n", 264 | "sum_vector_numba_serial(vec)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 12, 270 | "id": "a636b5ab", 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "name": "stdout", 275 | "output_type": "stream", 276 | "text": [ 277 | "2.59 ms ± 124 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "%%timeit\n", 283 | "sum_vector_numba_parallel(vec)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 13, 289 | "id": "842cbee4", 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "name": "stdout", 294 | "output_type": "stream", 295 | "text": [ 296 | "760 µs ± 41.3 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" 297 | ] 298 | } 299 | ], 300 | "source": [ 301 | "%%timeit\n", 302 | "sum_vector_numba_prange(vec)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "id": "4d2f75c6", 308 | "metadata": {}, 309 | "source": [ 310 | "The number of threads used by Numba is stored in global variables." 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 14, 316 | "id": "5e71e3e5", 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "name": "stdout", 321 | "output_type": "stream", 322 | "text": [ 323 | "The number of available CPUs detected by Numba is: 8\n", 324 | "The number of threads used by Numba is: 8\n" 325 | ] 326 | } 327 | ], 328 | "source": [ 329 | "from numba import config\n", 330 | "print(\"The number of available CPUs detected by Numba is:\", config.NUMBA_DEFAULT_NUM_THREADS)\n", 331 | "print(\"The number of threads used by Numba is:\", config.NUMBA_NUM_THREADS)" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "id": "b78e5700", 337 | "metadata": {}, 338 | "source": [ 339 | "The number of threads used by Numba can be changed manually." 
340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 15, 345 | "id": "9c501c38", 346 | "metadata": {}, 347 | "outputs": [ 348 | { 349 | "name": "stdout", 350 | "output_type": "stream", 351 | "text": [ 352 | "The current number of threads used by Numba is: 2\n" 353 | ] 354 | } 355 | ], 356 | "source": [ 357 | "from numba import set_num_threads, get_num_threads\n", 358 | "set_num_threads(2)\n", 359 | "print(\"The current number of threads used by Numba is:\", get_num_threads())" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "id": "9340dcab", 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [] 369 | } 370 | ], 371 | "metadata": { 372 | "kernelspec": { 373 | "display_name": "Python 3", 374 | "language": "python", 375 | "name": "python3" 376 | }, 377 | "language_info": { 378 | "codemirror_mode": { 379 | "name": "ipython", 380 | "version": 3 381 | }, 382 | "file_extension": ".py", 383 | "mimetype": "text/x-python", 384 | "name": "python", 385 | "nbconvert_exporter": "python", 386 | "pygments_lexer": "ipython3", 387 | "version": "3.11.2" 388 | } 389 | }, 390 | "nbformat": 4, 391 | "nbformat_minor": 5 392 | } 393 | -------------------------------------------------------------------------------- /AY01/Tutorial_joblib_1_basics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "id": "c10cb256", 7 | "metadata": {}, 8 | "source": [ 9 | "# Ayudantía 1 - Notebook 1\n", 10 | "### Profesor: Elwin van 't Wout\n", 11 | "### Ayudante: Alberto Almuna Morales (alberto.almuna@uc.cl)" 12 | ] 13 | }, 14 | { 15 | "attachments": {}, 16 | "cell_type": "markdown", 17 | "id": "a032fff2", 18 | "metadata": {}, 19 | "source": [ 20 | "The library ```joblib``` provides functionality for parallel computing. In this notebook, let us look into the basics of the library." 
21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "id": "34a5963a", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import numpy as np" 31 | ] 32 | }, 33 | { 34 | "attachments": {}, 35 | "cell_type": "markdown", 36 | "id": "834bf49b", 37 | "metadata": {}, 38 | "source": [ 39 | "The ```joblib``` library has the class ```Parallel``` which provides the basic structure for parallel computing. The class provides the functionality to create a pool of workers that can perform tasks in parallel. let us create such object." 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "id": "5225b2c9", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "from joblib import Parallel" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 4, 55 | "id": "a7d9cf75", 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "Parallel(n_jobs=1)\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "parallel_pool = Parallel()\n", 68 | "print(parallel_pool)" 69 | ] 70 | }, 71 | { 72 | "attachments": {}, 73 | "cell_type": "markdown", 74 | "id": "1df39af9", 75 | "metadata": {}, 76 | "source": [ 77 | "By default, the object is initialized with only a single job. This means that no parallelization will be performed, because only one worker was created. Let us specify the number of jobs explicitly upon creating the worker pool." 
78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 5, 83 | "id": "89d90ff4", 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "Parallel(n_jobs=2)\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "parallel_pool = Parallel(n_jobs=2)\n", 96 | "print(parallel_pool)" 97 | ] 98 | }, 99 | { 100 | "attachments": {}, 101 | "cell_type": "markdown", 102 | "id": "92f1bf4f", 103 | "metadata": {}, 104 | "source": [ 105 | "The number of workers can also be retrieved through the attribute ```n_jobs```." 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 6, 111 | "id": "7e8d8b69", 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "The number of workers is: 2\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "print(\"The number of workers is:\", parallel_pool.n_jobs)" 124 | ] 125 | }, 126 | { 127 | "attachments": {}, 128 | "cell_type": "markdown", 129 | "id": "dcb606d8", 130 | "metadata": {}, 131 | "source": [ 132 | "Having created a class that can create different workers, let us specify the tasks to be performed. The tasks can be specified by the decorator `delayed` from the `joblib` library. A *decorator* is a Python function that takes one function and returns another function." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 7, 138 | "id": "4818f706", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "from joblib import delayed" 143 | ] 144 | }, 145 | { 146 | "attachments": {}, 147 | "cell_type": "markdown", 148 | "id": "9f908a53", 149 | "metadata": {}, 150 | "source": [ 151 | "Here, we will calculate the square root of different values." 
152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 8, 157 | "id": "53a866e0", 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "parallel_sqrt = delayed(np.sqrt)" 162 | ] 163 | }, 164 | { 165 | "attachments": {}, 166 | "cell_type": "markdown", 167 | "id": "ae567efb", 168 | "metadata": {}, 169 | "source": [ 170 | "The function `parallel_sqrt` is now a function which can be interpreted by `joblib` as a parallel variant of the function `sqrt` of `Numpy`. It can be interpreted as a function that can assign the `Numpy` square-root function to the different workers in a parallel pool." 171 | ] 172 | }, 173 | { 174 | "attachments": {}, 175 | "cell_type": "markdown", 176 | "id": "96569b47", 177 | "metadata": {}, 178 | "source": [ 179 | "Before assigning the function to the workers, we need to specify the input variables for which the function needs to be called. Notice that we need to specify all tasks we like to perform but we do not have to specify which tasks needs to be assigned to which workers. This task assignment will be performed automatically by `joblib`." 
180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 9, 185 | "id": "89160a91", 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "parallel_tasks = [parallel_sqrt(i) for i in range(10)]" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 10, 195 | "id": "73f44597", 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/plain": [ 201 | "[(, (0,), {}),\n", 202 | " (, (1,), {}),\n", 203 | " (, (2,), {}),\n", 204 | " (, (3,), {}),\n", 205 | " (, (4,), {}),\n", 206 | " (, (5,), {}),\n", 207 | " (, (6,), {}),\n", 208 | " (, (7,), {}),\n", 209 | " (, (8,), {}),\n", 210 | " (, (9,), {})]" 211 | ] 212 | }, 213 | "execution_count": 10, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "parallel_tasks" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 11, 225 | "id": "83316ab1", 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "array = [i for i in range(10)]\n", 230 | "parallel_tasks_2 = map(parallel_sqrt, array)" 231 | ] 232 | }, 233 | { 234 | "attachments": {}, 235 | "cell_type": "markdown", 236 | "id": "af54dc33", 237 | "metadata": {}, 238 | "source": [ 239 | "With the list of all tasks created, we can ask the parallel pool of workers to perform all tasks in parallel." 
240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 12, 245 | "id": "3e401c94", 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "parallel_results = parallel_pool(parallel_tasks)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 13, 255 | "id": "c91a16ec", 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "parallel_results_2 = parallel_pool(parallel_tasks_2)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 14, 265 | "id": "8d961dcc", 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "name": "stdout", 270 | "output_type": "stream", 271 | "text": [ 272 | "[0.0, 1.0, 1.4142135623730951, 1.7320508075688772, 2.0, 2.23606797749979, 2.449489742783178, 2.6457513110645907, 2.8284271247461903, 3.0]\n" 273 | ] 274 | } 275 | ], 276 | "source": [ 277 | "print(parallel_results)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 15, 283 | "id": "5b14e197", 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "[0.0, 1.0, 1.4142135623730951, 1.7320508075688772, 2.0, 2.23606797749979, 2.449489742783178, 2.6457513110645907, 2.8284271247461903, 3.0]\n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "print(parallel_results_2)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 16, 301 | "id": "3be38762", 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "data": { 306 | "text/plain": [ 307 | "True" 308 | ] 309 | }, 310 | "execution_count": 16, 311 | "metadata": {}, 312 | "output_type": "execute_result" 313 | } 314 | ], 315 | "source": [ 316 | "parallel_results == parallel_results_2" 317 | ] 318 | }, 319 | { 320 | "attachments": {}, 321 | "cell_type": "markdown", 322 | "id": "a9889775", 323 | "metadata": {}, 324 | "source": [ 325 | "The output is indeed the square root of all input values." 
326 | ] 327 | }, 328 | { 329 | "attachments": {}, 330 | "cell_type": "markdown", 331 | "id": "938015bd", 332 | "metadata": {}, 333 | "source": [ 334 | "### Example of function with multiple arguments:" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 17, 340 | "id": "291c5270", 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "[5, 12, 21, 32, 45, 60, 77, 96, 117, 140]\n" 348 | ] 349 | } 350 | ], 351 | "source": [ 352 | "def my_task(n, m):\n", 353 | " return n*m\n", 354 | "\n", 355 | "n = [i for i in range(1, 11)]\n", 356 | "m = [i for i in range(5, 16)]\n", 357 | "\n", 358 | "tasks = [delayed(my_task)(i, j) for i, j in zip(n,m)]\n", 359 | "\n", 360 | "with Parallel(n_jobs=4) as parallel_pool:\n", 361 | " parallel_results = parallel_pool(tasks)\n", 362 | " print(parallel_results)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 18, 368 | "id": "03a830b3", 369 | "metadata": {}, 370 | "outputs": [ 371 | { 372 | "name": "stdout", 373 | "output_type": "stream", 374 | "text": [ 375 | "[5, 12, 21, 32, 45, 60, 77, 96, 117, 140]\n" 376 | ] 377 | } 378 | ], 379 | "source": [ 380 | "tasks_2 = map(delayed(my_task), n, m)\n", 381 | "\n", 382 | "with Parallel(n_jobs=4) as parallel_pool:\n", 383 | " parallel_results = parallel_pool(tasks_2)\n", 384 | " print(parallel_results)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "id": "de107ce8", 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [] 394 | } 395 | ], 396 | "metadata": { 397 | "kernelspec": { 398 | "display_name": "Python 3.10.4 64-bit", 399 | "language": "python", 400 | "name": "python3" 401 | }, 402 | "language_info": { 403 | "codemirror_mode": { 404 | "name": "ipython", 405 | "version": 3 406 | }, 407 | "file_extension": ".py", 408 | "mimetype": "text/x-python", 409 | "name": "python", 410 | "nbconvert_exporter": "python", 411 | 
"pygments_lexer": "ipython3", 412 | "version": "3.11.2" 413 | }, 414 | "vscode": { 415 | "interpreter": { 416 | "hash": "7600a12950a547366bb7a6732117e300ffd26224351912980486e1126c5d0f9a" 417 | } 418 | } 419 | }, 420 | "nbformat": 4, 421 | "nbformat_minor": 5 422 | } 423 | -------------------------------------------------------------------------------- /AY08/Numba/5_Numba_data_types.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "68659313", 6 | "metadata": {}, 7 | "source": [ 8 | "# Data types in Numba\n", 9 | "\n", 10 | " - Elwin van 't Wout\n", 11 | " - Pontificia Universidad Católica de Chile\n", 12 | " - IMT3870\n", 13 | " - 26-8-2024\n", 14 | "\n", 15 | "This tutorial shows the sensitivity of Numba to data types." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "id": "8cfe49b2", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from numba import njit\n", 26 | "import numpy as np" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "b3286f6e-2276-4de2-85df-84cc1d119a8f", 32 | "metadata": {}, 33 | "source": [ 34 | "Let us create a function that adds a given matrix to the input variable. Since the matrix is of integer type, we specify this explicitly as well." 
35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 13, 40 | "id": "fee72232-ec5b-4f01-9013-338d11840211", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "@njit\n", 45 | "def add_to_matrix(x):\n", 46 | " my_matrix = np.array([[11, 12, 13], [21, 22, 23]])\n", 47 | " sum = my_matrix + x\n", 48 | " return sum" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 14, 54 | "id": "1963d10a-e27b-434e-ab7e-3a1496f7c094", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "@njit\n", 59 | "def add_to_matrix_int(x):\n", 60 | " my_matrix = np.array([[11, 12, 13], [21, 22, 23]], dtype=int)\n", 61 | " sum = my_matrix + x\n", 62 | " return sum" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 15, 68 | "id": "db67d1f5-c2ef-4b0c-b772-4bee5768915e", 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "@njit\n", 73 | "def add_to_matrix_npint32(x):\n", 74 | " my_matrix = np.array([[11, 12, 13], [21, 22, 23]], dtype=np.int32)\n", 75 | " sum = my_matrix + x\n", 76 | " return sum" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 16, 82 | "id": "cf33edf1-b6d5-49df-b33b-e3e34e111605", 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "@njit\n", 87 | "def add_to_matrix_npint64(x):\n", 88 | " my_matrix = np.array([[11, 12, 13], [21, 22, 23]], dtype=np.int64)\n", 89 | " sum = my_matrix + x\n", 90 | " return sum" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "2de4757f-72bf-49d3-86e9-37a72546b302", 96 | "metadata": {}, 97 | "source": [ 98 | "Let's apply the functionality to a matrix with the same size and data type." 
99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 22, 104 | "id": "c409779c-f071-482d-a91e-88dc4f2ef1f8", 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "Matrix:\n", 112 | " [[1 2 3]\n", 113 | " [4 5 6]]\n", 114 | "Data type: int32\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "matrix_int_2_x_3 = np.array([[1, 2, 3], [4, 5, 6]], dtype=int)\n", 120 | "print(\"Matrix:\\n\", matrix_int_2_x_3)\n", 121 | "print(\"Data type:\", matrix_int_2_x_3.dtype)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 23, 127 | "id": "635f56d2-56fe-41c8-bb99-78e748c41ce7", 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "[[12 14 16]\n", 135 | " [25 27 29]]\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "result_int_2_x_3 = add_to_matrix(matrix_int_2_x_3)\n", 141 | "print(result_int_2_x_3)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 24, 147 | "id": "dddee2ac-2f07-4105-99a0-ddc55c87d7da", 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "ename": "TypingError", 152 | "evalue": "Failed in nopython mode pipeline (step: nopython frontend)\n\u001b[1m\u001b[1m\u001b[1mNo implementation of function Function() found for signature:\n \n >>> array(list(list(int64)), dtype=Function())\n \nThere are 2 candidate implementations:\n\u001b[1m - Of which 2 did not match due to:\n Overload in function 'impl_np_array': File: numba\\np\\arrayobj.py: Line 5242.\n With argument(s): '(list(list(int64)), dtype=Function())':\u001b[0m\n\u001b[1m Rejected as the implementation raised a specific error:\n TypingError: \u001b[1mThe argument \"dtype\" must be a data-type if it is provided\u001b[0m\u001b[0m\n raised from 
c:\\Users\\alber\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\numba\\np\\arrayobj.py:5250\n\u001b[0m\n\u001b[0m\u001b[1mDuring: resolving callee type: Function()\u001b[0m\n\u001b[0m\u001b[1mDuring: typing of call at C:\\Users\\alber\\AppData\\Local\\Temp\\ipykernel_5328\\785250876.py (3)\n\u001b[0m\n\u001b[1m\nFile \"..\\..\\..\\..\\..\\..\\..\\AppData\\Local\\Temp\\ipykernel_5328\\785250876.py\", line 3:\u001b[0m\n\u001b[1m\u001b[0m\n", 153 | "output_type": "error", 154 | "traceback": [ 155 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 156 | "\u001b[1;31mTypingError\u001b[0m Traceback (most recent call last)", 157 | "Cell \u001b[1;32mIn[24], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m result_int_2_x_3 \u001b[38;5;241m=\u001b[39m \u001b[43madd_to_matrix_int\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmatrix_int_2_x_3\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m(result_int_2_x_3)\n", 158 | "File \u001b[1;32mc:\\Users\\alber\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\numba\\core\\dispatcher.py:468\u001b[0m, in \u001b[0;36m_DispatcherBase._compile_for_args\u001b[1;34m(self, *args, **kws)\u001b[0m\n\u001b[0;32m 464\u001b[0m msg \u001b[38;5;241m=\u001b[39m (\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mstr\u001b[39m(e)\u001b[38;5;241m.\u001b[39mrstrip()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mThis error may have been caused \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 465\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mby the following 
argument(s):\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00margs_str\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 466\u001b[0m e\u001b[38;5;241m.\u001b[39mpatch_message(msg)\n\u001b[1;32m--> 468\u001b[0m \u001b[43merror_rewrite\u001b[49m\u001b[43m(\u001b[49m\u001b[43me\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtyping\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 469\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m errors\u001b[38;5;241m.\u001b[39mUnsupportedError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 470\u001b[0m \u001b[38;5;66;03m# Something unsupported is present in the user code, add help info\u001b[39;00m\n\u001b[0;32m 471\u001b[0m error_rewrite(e, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124munsupported_error\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", 159 | "File \u001b[1;32mc:\\Users\\alber\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\numba\\core\\dispatcher.py:409\u001b[0m, in \u001b[0;36m_DispatcherBase._compile_for_args..error_rewrite\u001b[1;34m(e, issue_type)\u001b[0m\n\u001b[0;32m 407\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[0;32m 408\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 409\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\u001b[38;5;241m.\u001b[39mwith_traceback(\u001b[38;5;28;01mNone\u001b[39;00m)\n", 160 | "\u001b[1;31mTypingError\u001b[0m: Failed in nopython mode pipeline (step: nopython frontend)\n\u001b[1m\u001b[1m\u001b[1mNo implementation of function Function() found for signature:\n \n >>> array(list(list(int64)), dtype=Function())\n \nThere are 2 candidate implementations:\n\u001b[1m - Of which 2 did not match due to:\n Overload in function 'impl_np_array': File: numba\\np\\arrayobj.py: Line 5242.\n With argument(s): '(list(list(int64)), dtype=Function())':\u001b[0m\n\u001b[1m Rejected 
as the implementation raised a specific error:\n TypingError: \u001b[1mThe argument \"dtype\" must be a data-type if it is provided\u001b[0m\u001b[0m\n raised from c:\\Users\\alber\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\numba\\np\\arrayobj.py:5250\n\u001b[0m\n\u001b[0m\u001b[1mDuring: resolving callee type: Function()\u001b[0m\n\u001b[0m\u001b[1mDuring: typing of call at C:\\Users\\alber\\AppData\\Local\\Temp\\ipykernel_5328\\785250876.py (3)\n\u001b[0m\n\u001b[1m\nFile \"..\\..\\..\\..\\..\\..\\..\\AppData\\Local\\Temp\\ipykernel_5328\\785250876.py\", line 3:\u001b[0m\n\u001b[1m\u001b[0m\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "result_int_2_x_3 = add_to_matrix_int(matrix_int_2_x_3)\n", 166 | "print(result_int_2_x_3)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 25, 172 | "id": "ddc209e9-878a-4215-81de-c52b042b8a10", 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "name": "stdout", 177 | "output_type": "stream", 178 | "text": [ 179 | "[[12 14 16]\n", 180 | " [25 27 29]]\n" 181 | ] 182 | } 183 | ], 184 | "source": [ 185 | "result_int_2_x_3 = add_to_matrix_npint32(matrix_int_2_x_3)\n", 186 | "print(result_int_2_x_3)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 26, 192 | "id": "1b8dbc2c-86df-40a6-894f-73bcdb8a47e4", 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "[[12 14 16]\n", 200 | " [25 27 29]]\n" 201 | ] 202 | } 203 | ], 204 | "source": [ 205 | "result_int_2_x_3 = add_to_matrix_npint64(matrix_int_2_x_3)\n", 206 | "print(result_int_2_x_3)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "id": "c45f83ed-dc30-4a26-b91b-f5df7ab0c2a8", 212 | "metadata": {}, 213 | "source": [ 214 | "The result is indeed as expected, the sum of the two matrices. However, the second implementation raises an error. Reading the error message suggests a problem with the data type. 
This is strange, because the matrix to add was defined with ```dtype=int``` as in the Numba function. Still it raises an error since ```int``` is a data type managed by Python and ```np.int64``` a data type managed by NumPy.\n", 215 | "\n", 216 | "Regardless of this specific example, it is recommended to check data types when using Numba. It is a common error to have incompatibilities in data type since Numba infers the data type from the function and optimises the code accordingly." 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "id": "df608a2a", 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [] 226 | } 227 | ], 228 | "metadata": { 229 | "kernelspec": { 230 | "display_name": "Python 3", 231 | "language": "python", 232 | "name": "python3" 233 | }, 234 | "language_info": { 235 | "codemirror_mode": { 236 | "name": "ipython", 237 | "version": 3 238 | }, 239 | "file_extension": ".py", 240 | "mimetype": "text/x-python", 241 | "name": "python", 242 | "nbconvert_exporter": "python", 243 | "pygments_lexer": "ipython3", 244 | "version": "3.11.2" 245 | } 246 | }, 247 | "nbformat": 4, 248 | "nbformat_minor": 5 249 | } 250 | -------------------------------------------------------------------------------- /AY02/Tutorial_joblib_5_shared_variables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "8807fd1d", 6 | "metadata": {}, 7 | "source": [ 8 | "# Multiprocessing tutorial 5\n", 9 | "\n", 10 | " - Author: Elwin van 't Wout\n", 11 | " - Affiliation: Pontificia Universidad Católica de Chile\n", 12 | " - Course: IMT3870\n", 13 | " - Date: 12-8-2024" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "801f0f24", 19 | "metadata": {}, 20 | "source": [ 21 | "The library `joblib` provides functionality for parallel computing. In this notebook, let us look into shared variables." 
22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "9dc4770f", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "from joblib import Parallel, delayed" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "id": "0483e6cc", 37 | "metadata": {}, 38 | "source": [ 39 | "In the *multiprocessing* model, each process has its own data space with private variables. Hence, no variables can be shared between different tasks. The `joblib` library does allow for the use of global variables in each process, but they should not be changed by the different processes." 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "85184253", 45 | "metadata": {}, 46 | "source": [ 47 | "## Reading a global variable" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "id": "b8ee2978", 53 | "metadata": {}, 54 | "source": [ 55 | "Let us create a global variable with a specific value and a function that reads it." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "2749eda5", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "MY_GLOBAL_VAR = 1.4" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "12a4452e", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "def return_global_var():\n", 76 | " return MY_GLOBAL_VAR" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "id": "a0fae770", 82 | "metadata": {}, 83 | "source": [ 84 | "Let us perform this task with multiple processes. That is, each process returns the global variable." 
85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "id": "5d7b95dd", 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "n_workers = 2" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "id": "adfb598c", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "tasks = [delayed(return_global_var)() for i in range(n_workers)]" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "5ca1a1e9", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "with Parallel(n_jobs=n_workers, batch_size=1, verbose=10, backend='loky') as parallel_pool:\n", 115 | " parallel_results = parallel_pool(tasks)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "id": "16550bd2", 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "print(parallel_results)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "id": "4c515f3e", 131 | "metadata": {}, 132 | "source": [ 133 | "Even though each process has an independent dataspace, the variables created earlier in the notebook can also be used. However, this does not mean that the variable is actually shared in the sense that both processes can access the same memory where the variable is stored. The `joblib` library made a copy of the global variable in each process. Hence, it cannot be changed by the individual processes." 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "d9e1bd9a", 139 | "metadata": {}, 140 | "source": [ 141 | "## Writing into a global variable" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "id": "b3555c7f", 147 | "metadata": {}, 148 | "source": [ 149 | "Let us try to overwrite a global variable with different values in each process." 
150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": "67926769", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "def change_global_var(n):\n", 160 | " MY_GLOBAL_VAR = n\n", 161 | " return MY_GLOBAL_VAR" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "id": "1c81c7f1", 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "tasks = [delayed(change_global_var)(i) for i in range(n_workers)]" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "id": "47d546c2", 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "with Parallel(n_jobs=n_workers, batch_size=1, verbose=10, backend='loky') as parallel_pool:\n", 182 | " parallel_results = parallel_pool(tasks)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "id": "6d047817", 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "print(parallel_results)\n", 193 | "print(MY_GLOBAL_VAR)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "id": "9a717543", 199 | "metadata": {}, 200 | "source": [ 201 | "The above results show that in each process, a local variable named `MY_GLOBAL_VAR` was created and returned to the main process. The global variable with the same name `MY_GLOBAL_VAR` was left unchanged. Notice that this is the expected behaviour of any Python function, not just for `joblib`." 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "id": "209e2d79", 207 | "metadata": {}, 208 | "source": [ 209 | "## Changing a global variable" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "id": "882fb8e4", 215 | "metadata": {}, 216 | "source": [ 217 | "Let us try to add a value to the global variable."
218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "id": "0244968a", 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "def add_to_global_var(n):\n", 228 | " MY_GLOBAL_VAR += n\n", 229 | " return MY_GLOBAL_VAR" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "id": "607919c7", 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "tasks = [delayed(add_to_global_var)(i) for i in range(n_workers)]" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "id": "0eb30038", 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "with Parallel(n_jobs=n_workers, batch_size=1, verbose=10, backend='loky') as parallel_pool:\n", 250 | " parallel_results = parallel_pool(tasks)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "id": "82f48a16", 256 | "metadata": {}, 257 | "source": [ 258 | "The `joblib` library throws an error. Here, the function tries to read and then write into the same variable `MY_GLOBAL_VAR`. The previous examples showed that either reading or writing is possible, but adding to a global variable fails. The reason is that the assignment inside the function makes Python treat `MY_GLOBAL_VAR` as a local variable, so reading it before it has been assigned raises an `UnboundLocalError`." 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "id": "8f7fcbfd", 264 | "metadata": {}, 265 | "source": [ 266 | "## Reading databases" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "id": "52092b7b", 272 | "metadata": {}, 273 | "source": [ 274 | "In data science, it is common to have a dataset that needs to be used by all processes. However, each process has its own data space. There are different ways to handle this situation. The easiest approach is to handle the data set as a global variable. This is sufficient if the processes only need to read the dataset but not adapt it."
275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "id": "1493f80d", 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "import pandas as pd\n", 285 | "import numpy as np" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "id": "f0467d68", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "my_global_df = pd.DataFrame(data=np.arange(100), columns=[\"my_data\"])" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "id": "41152db8", 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "my_global_df" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "id": "da67b62e", 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "def sum_data(start, end):\n", 316 | " my_local_data = my_global_df[\"my_data\"][start:end]\n", 317 | " return np.sum(my_local_data)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "id": "0d5d2231", 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "chunk_size = my_global_df.shape[0] // n_workers\n", 328 | "tasks = [delayed(sum_data)(i*chunk_size, (i+1)*chunk_size) for i in range(n_workers)]" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "id": "2db9e066", 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "with Parallel(n_jobs=n_workers, batch_size=1, verbose=10, backend='loky') as parallel_pool:\n", 339 | " parallel_results = parallel_pool(tasks)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "id": "ee2650d4", 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "print(parallel_results)" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "id": "accd3a23", 355 | "metadata": {}, 356 | "source": [ 357 | "The first process indeed summed all elements in the first 
half of the database, and the second process summed the second half. Although this works, both processes have a copy of the entire database, which is inefficient." 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "id": "3e97c44a", 363 | "metadata": {}, 364 | "source": [ 365 | "One way of distributing a database over different processes is by reading the necessary parts of the database in each process. For example, one worker reads the first half and the other worker the second half of a database from disk." 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "id": "bc2b718c", 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "my_global_df.to_excel(\"my_database.xlsx\", index=False)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "id": "3d7f4053", 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "def read_and_sum_data(start, end):\n", 386 | " my_local_df = pd.read_excel(\"my_database.xlsx\", header=0, skiprows=range(1,start+1), nrows=end-start)\n", 387 | " return my_local_df.shape, np.sum(my_local_df[\"my_data\"])" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "id": "0edad3bb", 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "chunk_size = my_global_df.shape[0] // n_workers\n", 398 | "tasks = [delayed(read_and_sum_data)(i*chunk_size, (i+1)*chunk_size) for i in range(n_workers)]" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "id": "8265eaeb", 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "with Parallel(n_jobs=n_workers, batch_size=1, verbose=10, backend='loky') as parallel_pool:\n", 409 | " parallel_results = parallel_pool(tasks)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "id": "62f3257c", 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | 
"print(parallel_results)" 420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "id": "6e813b4a", 425 | "metadata": {}, 426 | "source": [ 427 | "The result shows that the local data frames are half the size of the Excel file. Furthermore, the summations are correct." 428 | ] 429 | } 430 | ], 431 | "metadata": { 432 | "kernelspec": { 433 | "display_name": "Python 3 (ipykernel)", 434 | "language": "python", 435 | "name": "python3" 436 | }, 437 | "language_info": { 438 | "codemirror_mode": { 439 | "name": "ipython", 440 | "version": 3 441 | }, 442 | "file_extension": ".py", 443 | "mimetype": "text/x-python", 444 | "name": "python", 445 | "nbconvert_exporter": "python", 446 | "pygments_lexer": "ipython3", 447 | "version": "3.12.5" 448 | } 449 | }, 450 | "nbformat": 4, 451 | "nbformat_minor": 5 452 | } 453 | -------------------------------------------------------------------------------- /AY01/Comparaciones.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": { 7 | "id": "Fv1dVo4_y2rm" 8 | }, 9 | "source": [ 10 | "# Comparaciones de rendimiento\n", 11 | "\n", 12 | "Calculemos raíces cuadradas de varios números de distintas maneras:" 13 | ] 14 | }, 15 | { 16 | "attachments": {}, 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "Rugw0fMby__Z" 20 | }, 21 | "source": [ 22 | "### Python normal:" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": { 29 | "id": "RqDWRK7xyw0u" 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "from math import sqrt\n", 34 | "import time as tm" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "n = 10000000" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": { 50 | "colab": { 51 | "base_uri": "https://localhost:8080/" 52 | }, 53 | 
"id": "sh6FUabezDiR", 54 | "outputId": "f672cf37-2834-49d1-d586-a625f58b7ebe" 55 | }, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "Comenzando a calcular...\n", 62 | "Tiempo total: 0.9421744346618652s\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "print(\"Comenzando a calcular...\")\n", 68 | "start = tm.time()\n", 69 | "normal_results = [sqrt(i) for i in range(n)]\n", 70 | "end = tm.time()\n", 71 | "print(f\"Tiempo total: {end - start}s\")" 72 | ] 73 | }, 74 | { 75 | "attachments": {}, 76 | "cell_type": "markdown", 77 | "metadata": { 78 | "id": "xPOV-TYHzH_o" 79 | }, 80 | "source": [ 81 | "## Numpy:" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 4, 87 | "metadata": { 88 | "id": "x1JnDuDozG6d" 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "import numpy as np\n", 93 | "import time as tm" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 5, 99 | "metadata": { 100 | "colab": { 101 | "base_uri": "https://localhost:8080/" 102 | }, 103 | "id": "a2iwAUwLzOHB", 104 | "outputId": "763cac3a-3bcd-40a8-bb63-ffcaa6cae7a7" 105 | }, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "Comenzando a calcular...\n", 112 | "Tiempo total: 0.04117774963378906s\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "print(\"Comenzando a calcular...\")\n", 118 | "start = tm.time()\n", 119 | "data = np.arange(n)\n", 120 | "numpy_results = np.sqrt(data) # Ojo que le damos todo el array de datos directamente a la función\n", 121 | "# En general, al trabajar con Numpy, es muy buena idea hacer todo de manera vectorial/matricial.\n", 122 | "# Está muy optimizado para esto!\n", 123 | "end = tm.time()\n", 124 | "print(f\"Tiempo total: {end - start}s\")" 125 | ] 126 | }, 127 | { 128 | "attachments": {}, 129 | "cell_type": "markdown", 130 | "metadata": { 131 | "id": "KizYAhQ2zR9g" 132 | }, 133 | "source": [ 134 | "## Joblib" 135 | ] 136 | }, 137 
| { 138 | "cell_type": "code", 139 | "execution_count": 6, 140 | "metadata": { 141 | "id": "x4jm5YJpzQUD" 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "import numpy as np\n", 146 | "from math import sqrt\n", 147 | "from joblib import Parallel\n", 148 | "from joblib import delayed\n", 149 | "import time as tm" 150 | ] 151 | }, 152 | { 153 | "attachments": {}, 154 | "cell_type": "markdown", 155 | "metadata": { 156 | "id": "2dg3MYcI1GLw" 157 | }, 158 | "source": [ 159 | "Dos trabajadores:" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 7, 165 | "metadata": { 166 | "colab": { 167 | "base_uri": "https://localhost:8080/" 168 | }, 169 | "id": "l_JLrCVa09X5", 170 | "outputId": "11f283eb-aabd-4745-925f-d1083edcbeeb" 171 | }, 172 | "outputs": [ 173 | { 174 | "name": "stdout", 175 | "output_type": "stream", 176 | "text": [ 177 | "Comenzando a calcular...\n", 178 | "Tiempo total: 18.816686868667603s\n" 179 | ] 180 | } 181 | ], 182 | "source": [ 183 | "print(\"Comenzando a calcular...\")\n", 184 | "start = tm.time()\n", 185 | "parallel_pool = Parallel(n_jobs=2)\n", 186 | "parallel_sqrt = delayed(sqrt)\n", 187 | "parallel_tasks = [parallel_sqrt(i) for i in range(n)]\n", 188 | "parallel_results = parallel_pool(parallel_tasks)\n", 189 | "end = tm.time()\n", 190 | "print(f\"Tiempo total: {end - start}s\")" 191 | ] 192 | }, 193 | { 194 | "attachments": {}, 195 | "cell_type": "markdown", 196 | "metadata": { 197 | "id": "xkxI7K8P1IZF" 198 | }, 199 | "source": [ 200 | "Cuatro trabajadores:" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 8, 206 | "metadata": { 207 | "colab": { 208 | "base_uri": "https://localhost:8080/" 209 | }, 210 | "id": "dKiyfzwCzaFZ", 211 | "outputId": "0fc8e872-2786-4ffc-b145-9a924dc276ea" 212 | }, 213 | "outputs": [ 214 | { 215 | "name": "stdout", 216 | "output_type": "stream", 217 | "text": [ 218 | "Comenzando a calcular...\n", 219 | "Tiempo total: 16.809730291366577s\n" 220 | ] 221 | } 222 | 
], 223 | "source": [ 224 | "print(\"Comenzando a calcular...\")\n", 225 | "start = tm.time()\n", 226 | "parallel_pool = Parallel(n_jobs=4)\n", 227 | "parallel_sqrt = delayed(sqrt)\n", 228 | "parallel_tasks = [parallel_sqrt(i) for i in range(n)]\n", 229 | "parallel_results = parallel_pool(parallel_tasks)\n", 230 | "end = tm.time()\n", 231 | "print(f\"Tiempo total: {end - start}s\")" 232 | ] 233 | }, 234 | { 235 | "attachments": {}, 236 | "cell_type": "markdown", 237 | "metadata": { 238 | "id": "4zfFot501KeG" 239 | }, 240 | "source": [ 241 | "¿Qué ocurre si usamos la función raíz de numpy?" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 9, 247 | "metadata": { 248 | "colab": { 249 | "base_uri": "https://localhost:8080/" 250 | }, 251 | "id": "hIy5wAJKzXN2", 252 | "outputId": "bbd1530c-5bf4-4492-a561-e3330ff57aa6" 253 | }, 254 | "outputs": [ 255 | { 256 | "name": "stdout", 257 | "output_type": "stream", 258 | "text": [ 259 | "Comenzando a calcular...\n", 260 | "Tiempo total: 34.296361446380615s\n" 261 | ] 262 | } 263 | ], 264 | "source": [ 265 | "print(\"Comenzando a calcular...\")\n", 266 | "start = tm.time()\n", 267 | "parallel_pool = Parallel(n_jobs=2)\n", 268 | "parallel_sqrt = delayed(np.sqrt) # Notar la diferencia\n", 269 | "parallel_tasks = [parallel_sqrt(i) for i in range(n)]\n", 270 | "parallel_results = parallel_pool(parallel_tasks)\n", 271 | "end = tm.time()\n", 272 | "print(f\"Tiempo total: {end - start}s\")" 273 | ] 274 | }, 275 | { 276 | "attachments": {}, 277 | "cell_type": "markdown", 278 | "metadata": { 279 | "id": "CB7eJ_6b9Diw" 280 | }, 281 | "source": [ 282 | "Finalmente con batch_size fijo:" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 10, 288 | "metadata": { 289 | "colab": { 290 | "base_uri": "https://localhost:8080/" 291 | }, 292 | "id": "fGEG0I-VzhYS", 293 | "outputId": "3bab9c95-cd4a-4fe7-d9be-5c54447c6b86" 294 | }, 295 | "outputs": [ 296 | { 297 | "name": "stdout", 298 | 
"output_type": "stream", 299 | "text": [ 300 | "Comenzando a calcular...\n", 301 | "Tiempo total: 15.666585206985474s\n" 302 | ] 303 | } 304 | ], 305 | "source": [ 306 | "print(\"Comenzando a calcular...\")\n", 307 | "start = tm.time()\n", 308 | "parallel_pool = Parallel(n_jobs=2, batch_size=100000)\n", 309 | "parallel_sqrt = delayed(sqrt)\n", 310 | "parallel_tasks = [parallel_sqrt(i) for i in range(n)]\n", 311 | "parallel_results = parallel_pool(parallel_tasks)\n", 312 | "end = tm.time()\n", 313 | "print(f\"Tiempo total: {end - start}s\")" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 11, 319 | "metadata": { 320 | "colab": { 321 | "base_uri": "https://localhost:8080/" 322 | }, 323 | "id": "PdiInRDx9IFA", 324 | "outputId": "1a5efdd8-eaf9-43ba-b3f6-7aef7ea4fd56" 325 | }, 326 | "outputs": [ 327 | { 328 | "name": "stdout", 329 | "output_type": "stream", 330 | "text": [ 331 | "Comenzando a calcular...\n", 332 | "Tiempo total: 19.51024317741394s\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "print(\"Comenzando a calcular...\")\n", 338 | "start = tm.time()\n", 339 | "parallel_pool = Parallel(n_jobs=2, batch_size=500000)\n", 340 | "parallel_sqrt = delayed(sqrt)\n", 341 | "parallel_tasks = [parallel_sqrt(i) for i in range(n)]\n", 342 | "parallel_results = parallel_pool(parallel_tasks)\n", 343 | "end = tm.time()\n", 344 | "print(f\"Tiempo total: {end - start}s\")" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 12, 350 | "metadata": { 351 | "id": "i8dx3VA49wBZ" 352 | }, 353 | "outputs": [ 354 | { 355 | "name": "stdout", 356 | "output_type": "stream", 357 | "text": [ 358 | "Comenzando a calcular...\n", 359 | "Tiempo total: 20.722821712493896s\n" 360 | ] 361 | } 362 | ], 363 | "source": [ 364 | "print(\"Comenzando a calcular...\")\n", 365 | "start = tm.time()\n", 366 | "parallel_pool = Parallel(n_jobs=4, batch_size=int(n/4))\n", 367 | "parallel_sqrt = delayed(sqrt)\n", 368 | "parallel_tasks = [parallel_sqrt(i) 
for i in range(n)]\n", 369 | "parallel_results = parallel_pool(parallel_tasks)\n", 370 | "end = tm.time()\n", 371 | "print(f\"Tiempo total: {end - start}s\")" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "## Comparación rendimiento de appendear valores a una lista vs un array" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 13, 384 | "metadata": {}, 385 | "outputs": [ 386 | { 387 | "name": "stdout", 388 | "output_type": "stream", 389 | "text": [ 390 | "0.6783316135406494\n" 391 | ] 392 | } 393 | ], 394 | "source": [ 395 | "# Numpy array vs listas\n", 396 | "\n", 397 | "lista = []\n", 398 | "t1 = tm.time()\n", 399 | "for i in range(n):\n", 400 | " lista.append(i)\n", 401 | "t2 = tm.time()\n", 402 | "print(t2-t1)" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 15, 408 | "metadata": {}, 409 | "outputs": [ 410 | { 411 | "ename": "KeyboardInterrupt", 412 | "evalue": "", 413 | "output_type": "error", 414 | "traceback": [ 415 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 416 | "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 417 | "Cell \u001b[1;32mIn[15], line 4\u001b[0m\n\u001b[0;32m 2\u001b[0m t1 \u001b[38;5;241m=\u001b[39m tm\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(n):\n\u001b[1;32m----> 4\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mappend\u001b[49m\u001b[43m(\u001b[49m\u001b[43marray\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 5\u001b[0m t2 \u001b[38;5;241m=\u001b[39m tm\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28mprint\u001b[39m(t2\u001b[38;5;241m-\u001b[39mt1)\n", 418 | "File 
\u001b[1;32m<__array_function__ internals>:200\u001b[0m, in \u001b[0;36mappend\u001b[1;34m(*args, **kwargs)\u001b[0m\n", 419 | "File \u001b[1;32mc:\\Users\\alber\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\numpy\\lib\\function_base.py:5499\u001b[0m, in \u001b[0;36mappend\u001b[1;34m(arr, values, axis)\u001b[0m\n\u001b[0;32m 5497\u001b[0m values \u001b[38;5;241m=\u001b[39m ravel(values)\n\u001b[0;32m 5498\u001b[0m axis \u001b[38;5;241m=\u001b[39m arr\u001b[38;5;241m.\u001b[39mndim\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m\n\u001b[1;32m-> 5499\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mconcatenate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m)\u001b[49m\n", 420 | "File \u001b[1;32m<__array_function__ internals>:200\u001b[0m, in \u001b[0;36mconcatenate\u001b[1;34m(*args, **kwargs)\u001b[0m\n", 421 | "\u001b[1;31mKeyboardInterrupt\u001b[0m: " 422 | ] 423 | } 424 | ], 425 | "source": [ 426 | "array = np.array([])\n", 427 | "t1 = tm.time()\n", 428 | "for i in range(n):\n", 429 | " array = np.append(array, i)\n", 430 | "t2 = tm.time()\n", 431 | "print(t2-t1)\n", 432 | "\n", 433 | "# Lo detuve antes de que terminara porque ya era mucho, pero llegó a más de 42 minutos sin terminar." 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "En resumen:\n", 441 | "- Si necesitan ir agregando valores -> listas\n", 442 | "- Para realizar operaciones matriciales y de vectores -> NUMPY\n", 443 | " \n", 444 | "** Es común agregar valores a una lista y luego transformarla a un array para realizar operaciones con ella." 
445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "metadata": {}, 450 | "source": [] 451 | } 452 | ], 453 | "metadata": { 454 | "colab": { 455 | "provenance": [] 456 | }, 457 | "kernelspec": { 458 | "display_name": "Python 3", 459 | "name": "python3" 460 | }, 461 | "language_info": { 462 | "codemirror_mode": { 463 | "name": "ipython", 464 | "version": 3 465 | }, 466 | "file_extension": ".py", 467 | "mimetype": "text/x-python", 468 | "name": "python", 469 | "nbconvert_exporter": "python", 470 | "pygments_lexer": "ipython3", 471 | "version": "3.11.2" 472 | } 473 | }, 474 | "nbformat": 4, 475 | "nbformat_minor": 0 476 | } 477 | -------------------------------------------------------------------------------- /AY06/DataLocality.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "1e4d5e01", 6 | "metadata": {}, 7 | "source": [ 8 | "# Data locality\n", 9 | "\n", 10 | " - Author: Elwin van 't Wout\n", 11 | " - Affiliation: Pontificia Universidad Católica de Chile\n", 12 | " - Date: 18-8-2023\n", 13 | "\n", 14 | "Test the efficiency of Python with different memory access." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "id": "366bb889", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "f68d0297", 30 | "metadata": {}, 31 | "source": [ 32 | "## Loop strides\n", 33 | "\n", 34 | "Test the influence of the stride of the loop on the efficiency. First, create a large NumPy array with random numbers. 
Then, perform two different sums, with the same number of operators.\n", 35 | "\n", 36 | "A loop with ```range(0,N,1)``` has elements 0, 1, 2, 3, ..., N-1.\n", 37 | "\n", 38 | "A loop with ```range(0,N*s,s)``` has elements 0, s, 2s, 3s, ..., (N-1)s.\n" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "id": "77f64223", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "N = 1000000\n", 49 | "stride = 187" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "id": "59c4b340-1b0c-45bd-ad9c-8bfef7c547ba", 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "CPU times: user 8.67 ms, sys: 9.03 ms, total: 17.7 ms\n", 63 | "Wall time: 14.9 ms\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "%%time\n", 69 | "a = np.random.rand(N)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "id": "00134d7f", 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "109 ms ± 6.64 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "%%timeit\n", 88 | "\n", 89 | "sum1 = 0.0\n", 90 | "for n in range(0,N,1):\n", 91 | " sum1 += a[n]\n" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 5, 97 | "id": "fb9654ce-7477-476a-98a7-caf7a74db724", 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "CPU times: user 1.09 s, sys: 1.04 s, total: 2.13 s\n", 105 | "Wall time: 2.13 s\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "%%time\n", 111 | "b = np.random.rand(N*stride)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 6, 117 | "id": "15e86634", 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "149 ms ± 4.37 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "%%timeit\n", 130 | "\n", 131 | "sum2 = 0.0\n", 132 | "for n in range(0,N*stride,stride):\n", 133 | " sum2 += b[n]\n" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "ee5dd40c-eafd-4d71-ae92-9beaa206cfe4", 139 | "metadata": {}, 140 | "source": [ 141 | "This experiment shows two algorithms, each with the same number of the same operations: $N$ summations. However, the timing is different. This can only be due to different memory access." 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "id": "52d00beb", 147 | "metadata": {}, 148 | "source": [ 149 | "The efficiency of a NumPy array is different than for a Python list because both store the data in diferent formats." 
150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 7, 155 | "id": "2e1f06de", 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "name": "stdout", 160 | "output_type": "stream", 161 | "text": [ 162 | "CPU times: user 49.8 s, sys: 2.87 s, total: 52.7 s\n", 163 | "Wall time: 52.8 s\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "%%time\n", 169 | "c = [np.random.rand() for n in range(N*stride)]" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 8, 175 | "id": "999c41b5", 176 | "metadata": {}, 177 | "outputs": [ 178 | { 179 | "name": "stdout", 180 | "output_type": "stream", 181 | "text": [ 182 | "158 ms ± 9.64 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 183 | ] 184 | } 185 | ], 186 | "source": [ 187 | "%%timeit\n", 188 | "\n", 189 | "sum3 = 0.0\n", 190 | "for n in range(0,N*stride,stride):\n", 191 | " sum3 += c[n]\n" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "id": "a8efc106-5e43-41fd-8a06-0f235c84d8c4", 197 | "metadata": {}, 198 | "source": [ 199 | "In general, it is almost always more efficient to use Numpy functionality." 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 9, 205 | "id": "18400e83-d422-4c48-9af4-5aa2e6864e8f", 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "337 µs ± 15.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "%%timeit\n", 218 | "\n", 219 | "sum4 = np.sum(a)\n" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 10, 225 | "id": "58ca5191-232a-4780-ad1d-1c1ecd15269e", 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "9.11 ms ± 1.97 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "%%timeit\n", 238 | "\n", 239 | "sum5 = np.sum(b[::stride])\n" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 11, 245 | "id": "a0f83a6c", 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "name": "stdout", 250 | "output_type": "stream", 251 | "text": [ 252 | "153 ms ± 8.51 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 253 | ] 254 | } 255 | ], 256 | "source": [ 257 | "%%timeit\n", 258 | "\n", 259 | "sum6 = np.sum(c[::stride])\n" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "id": "c8ea3c7a", 265 | "metadata": {}, 266 | "source": [ 267 | "## Summing the elements of a multi-dimensional arrays\n", 268 | "\n", 269 | "The elements of a multidimensional arrays are stored in memory as a one-dimensional ordering. Hence, the order of accessing the elements has an impact on the timing. Let us create a 3-dimensional tensor and sum all elements. Again, the different implementations all require exactly the same number of the same operators ($N^3$ summations) but the memory access is different." 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 12, 275 | "id": "54ea1926-8578-420a-8200-b427f16b6802", 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "name": "stdout", 280 | "output_type": "stream", 281 | "text": [ 282 | "Created a random array of shape (250, 250, 250)\n" 283 | ] 284 | } 285 | ], 286 | "source": [ 287 | "N = 250\n", 288 | "\n", 289 | "a = np.random.rand(N,N,N)\n", 290 | "b = np.random.rand(N,N,N)\n", 291 | "c = np.random.rand(N,N,N)\n", 292 | "\n", 293 | "print(\"Created a random array of shape\",a.shape)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 13, 299 | "id": "d71730a7-fda0-470f-8b9f-9727ad8f6408", 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "2.39 s ± 292 ms per loop (mean ± std. 
dev. of 7 runs, 1 loop each)\n" 307 | ] 308 | } 309 | ], 310 | "source": [ 311 | "%%timeit\n", 312 | "\n", 313 | "sum1 = 0.0\n", 314 | "for i in range(N):\n", 315 | " for j in range(N):\n", 316 | " for k in range(N):\n", 317 | " sum1 += a[i,j,k]\n" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 14, 323 | "id": "bce7e19c-979a-4245-9955-b287af5251d5", 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "name": "stdout", 328 | "output_type": "stream", 329 | "text": [ 330 | "3.44 s ± 322 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 331 | ] 332 | } 333 | ], 334 | "source": [ 335 | "%%timeit\n", 336 | "\n", 337 | "sum2 = 0.0\n", 338 | "for k in range(N):\n", 339 | " for j in range(N):\n", 340 | " for i in range(N):\n", 341 | " sum2 += b[i,j,k]\n" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "id": "3aaf6986", 347 | "metadata": {}, 348 | "source": [ 349 | "Instead of using a loop, it is more efficient to use NumPy functions that are based on optimised algorithms and implementations." 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 15, 355 | "id": "aa3d3875-8633-4745-9ed9-46047af87f6f", 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "name": "stdout", 360 | "output_type": "stream", 361 | "text": [ 362 | "8.93 ms ± 108 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" 363 | ] 364 | } 365 | ], 366 | "source": [ 367 | "%%timeit\n", 368 | "\n", 369 | "sum3 = np.sum(c)\n" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "id": "b630997a", 375 | "metadata": {}, 376 | "source": [ 377 | "## Python broadcasting\n", 378 | "\n", 379 | "NumPy has optimised implementations for many algorithms. For example, summing a constant value to all elements in an array can be done with ```a+=1```. Even though the variable ```a``` has size ```(n,)``` and ```1``` is a scalar, the algorithm works. 
NumPy uses *broadcasting* which means that (if possible) the summation is performed for all elements." 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 16, 385 | "id": "40352069", 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "N = 1000000" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 17, 395 | "id": "a9b75dc2", 396 | "metadata": {}, 397 | "outputs": [ 398 | { 399 | "name": "stdout", 400 | "output_type": "stream", 401 | "text": [ 402 | "153 ms ± 2.39 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 403 | ] 404 | } 405 | ], 406 | "source": [ 407 | "%%timeit\n", 408 | "\n", 409 | "a = np.zeros(N)\n", 410 | "for n in range(N):\n", 411 | " a[n] += 1\n" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 18, 417 | "id": "ab78ac69", 418 | "metadata": {}, 419 | "outputs": [ 420 | { 421 | "name": "stdout", 422 | "output_type": "stream", 423 | "text": [ 424 | "837 µs ± 52.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" 425 | ] 426 | } 427 | ], 428 | "source": [ 429 | "%%timeit\n", 430 | "\n", 431 | "b = np.zeros(N)\n", 432 | "b += 1\n" 433 | ] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "id": "62814320", 438 | "metadata": {}, 439 | "source": [ 440 | "## Matrix-matrix multiplication\n", 441 | "\n", 442 | "Perform a matrix-matrix multiplication with different types of memory access." 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 19, 448 | "id": "51ce5ced", 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "n = 500\n", 453 | "\n", 454 | "A = np.random.rand(n,n)\n", 455 | "B = np.random.rand(n,n)" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 20, 461 | "id": "c9550d0d", 462 | "metadata": {}, 463 | "outputs": [ 464 | { 465 | "name": "stdout", 466 | "output_type": "stream", 467 | "text": [ 468 | "355 ms ± 22.4 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" 469 | ] 470 | } 471 | ], 472 | "source": [ 473 | "%%timeit\n", 474 | "\n", 475 | "C1 = np.zeros((n,n))\n", 476 | "for i in range(n):\n", 477 | " for j in range(n):\n", 478 | " C1[i,j] = np.dot(A[i,:],B[:,j])\n" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 21, 484 | "id": "f4e3eca4", 485 | "metadata": {}, 486 | "outputs": [ 487 | { 488 | "name": "stdout", 489 | "output_type": "stream", 490 | "text": [ 491 | "727 ms ± 28.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 492 | ] 493 | } 494 | ], 495 | "source": [ 496 | "%%timeit\n", 497 | "\n", 498 | "C2 = np.zeros((n,n))\n", 499 | "for j in range(n):\n", 500 | " for k in range(n):\n", 501 | " C2[:,j] += A[:,k]*B[k,j]\n" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 22, 507 | "id": "6e47fe63", 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "name": "stdout", 512 | "output_type": "stream", 513 | "text": [ 514 | "539 ms ± 16.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 515 | ] 516 | } 517 | ], 518 | "source": [ 519 | "%%timeit\n", 520 | "\n", 521 | "C3 = np.zeros((n,n))\n", 522 | "for i in range(n):\n", 523 | " for k in range(n):\n", 524 | " C3[i,:] += A[i,k]*B[k,:]\n" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 23, 530 | "id": "1c9b77fa", 531 | "metadata": {}, 532 | "outputs": [ 533 | { 534 | "name": "stdout", 535 | "output_type": "stream", 536 | "text": [ 537 | "3.28 ms ± 668 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" 538 | ] 539 | } 540 | ], 541 | "source": [ 542 | "%%timeit\n", 543 | "\n", 544 | "C4 = A @ B\n" 545 | ] 546 | }, 547 | { 548 | "cell_type": "markdown", 549 | "id": "cf893695", 550 | "metadata": {}, 551 | "source": [ 552 | "Consider a larger matrix size." 
553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": 24, 558 | "id": "5b40b11a", 559 | "metadata": {}, 560 | "outputs": [ 561 | { 562 | "name": "stdout", 563 | "output_type": "stream", 564 | "text": [ 565 | "Finished the loops with order (i,j,*) in 15.621845722198486 seconds.\n", 566 | "Finished the loops with order (j,k,*) in 58.30747675895691 seconds.\n", 567 | "Finished the loops with order (i,k,*) in 15.23149061203003 seconds.\n", 568 | "Finished the Numpy algorithm in 0.11636805534362793 seconds.\n" 569 | ] 570 | } 571 | ], 572 | "source": [ 573 | "import time\n", 574 | "\n", 575 | "n = 2000\n", 576 | "A = np.random.rand(n,n)\n", 577 | "B = np.random.rand(n,n)\n", 578 | "\n", 579 | "\n", 580 | "C1 = np.zeros((n,n))\n", 581 | "time_start = time.time()\n", 582 | "for i in range(n):\n", 583 | " for j in range(n):\n", 584 | " C1[i,j] = np.dot(A[i,:],B[:,j])\n", 585 | "time_end = time.time()\n", 586 | "print(\"Finished the loops with order (i,j,*) in\",time_end-time_start,\"seconds.\")\n", 587 | "\n", 588 | "C2 = np.zeros((n,n))\n", 589 | "time_start = time.time()\n", 590 | "for j in range(n):\n", 591 | " for k in range(n):\n", 592 | " C2[:,j] += A[:,k]*B[k,j]\n", 593 | "time_end = time.time()\n", 594 | "print(\"Finished the loops with order (j,k,*) in\",time_end-time_start,\"seconds.\")\n", 595 | "\n", 596 | "C3 = np.zeros((n,n))\n", 597 | "time_start = time.time()\n", 598 | "for i in range(n):\n", 599 | " for k in range(n):\n", 600 | " C3[i,:] += A[i,k]*B[k,:]\n", 601 | "time_end = time.time()\n", 602 | "print(\"Finished the loops with order (i,k,*) in\",time_end-time_start,\"seconds.\")\n", 603 | "\n", 604 | "C4 = np.zeros((n,n))\n", 605 | "time_start = time.time()\n", 606 | "C4 = A @ B\n", 607 | "time_end = time.time()\n", 608 | "print(\"Finished the Numpy algorithm in\",time_end-time_start,\"seconds.\")\n" 609 | ] 610 | }, 611 | { 612 | "cell_type": "markdown", 613 | "id": "f3d0579c-e1b8-4cdc-89db-d018c4c9127b", 614 | "metadata": { 615 
| "editable": true, 616 | "slideshow": { 617 | "slide_type": "" 618 | }, 619 | "tags": [] 620 | }, 621 | "source": [ 622 | "## Apending data\n", 623 | "\n", 624 | "Appending data to a Python list is efficient, but Numpy appends arrays by copying the data." 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 25, 630 | "id": "c4a5c1cd-c4cc-4174-8277-d03bcac1f093", 631 | "metadata": {}, 632 | "outputs": [], 633 | "source": [ 634 | "n = 100000" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 26, 640 | "id": "fe12194e-0116-4f0a-bfc2-96321644ba67", 641 | "metadata": {}, 642 | "outputs": [ 643 | { 644 | "name": "stdout", 645 | "output_type": "stream", 646 | "text": [ 647 | "3.57 ms ± 178 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" 648 | ] 649 | } 650 | ], 651 | "source": [ 652 | "%%timeit\n", 653 | "\n", 654 | "my_list = []\n", 655 | "\n", 656 | "for i in range(n):\n", 657 | " my_list.append(i)" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": 27, 663 | "id": "d88801e1-4a1a-48e6-b775-68ce1e0998e1", 664 | "metadata": {}, 665 | "outputs": [ 666 | { 667 | "name": "stdout", 668 | "output_type": "stream", 669 | "text": [ 670 | "1.66 s ± 22.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 671 | ] 672 | } 673 | ], 674 | "source": [ 675 | "%%timeit\n", 676 | "\n", 677 | "my_array1 = np.array([], dtype=int)\n", 678 | "\n", 679 | "for i in range(n):\n", 680 | " my_array1 = np.append(my_array1, i)" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": 28, 686 | "id": "4f8c9515-c166-4c68-aea0-84d8dd935671", 687 | "metadata": {}, 688 | "outputs": [ 689 | { 690 | "name": "stdout", 691 | "output_type": "stream", 692 | "text": [ 693 | "5.76 ms ± 187 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)\n" 694 | ] 695 | } 696 | ], 697 | "source": [ 698 | "%%timeit\n", 699 | "\n", 700 | "my_array2 = np.empty(n, dtype=int)\n", 701 | "\n", 702 | "for i in range(n):\n", 703 | " my_array2[i] = i" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": null, 709 | "id": "2b7f549d-49b4-4272-9801-96339d2b63ef", 710 | "metadata": {}, 711 | "outputs": [], 712 | "source": [] 713 | } 714 | ], 715 | "metadata": { 716 | "kernelspec": { 717 | "display_name": "Python 3 (ipykernel)", 718 | "language": "python", 719 | "name": "python3" 720 | }, 721 | "language_info": { 722 | "codemirror_mode": { 723 | "name": "ipython", 724 | "version": 3 725 | }, 726 | "file_extension": ".py", 727 | "mimetype": "text/x-python", 728 | "name": "python", 729 | "nbconvert_exporter": "python", 730 | "pygments_lexer": "ipython3", 731 | "version": "3.10.6" 732 | } 733 | }, 734 | "nbformat": 4, 735 | "nbformat_minor": 5 736 | } 737 | -------------------------------------------------------------------------------- /AY09/06_reduction_padding.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Calculating a vector sum in PyOpenCL\n", 7 | "\n", 8 | "Elwin van 't Wout\n", 9 | "\n", 10 | "PUC Chile\n", 11 | "\n", 12 | "25-9-2024" 13 | ], 14 | "metadata": { 15 | "id": "TG2oWDkjHT8D" 16 | } 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "KXKZENUruGaq" 22 | }, 23 | "source": [ 24 | "Calculate the sum of a vector with OpenCL.\n", 25 | "\n", 26 | "First, we need to configure the virtual machine and install PyOpenCL." 
27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 1, 32 | "metadata": { 33 | "colab": { 34 | "base_uri": "https://localhost:8080/" 35 | }, 36 | "id": "8lHkgpGAuBEn", 37 | "outputId": "715036bd-3fb0-416f-94a0-ddec0a123434" 38 | }, 39 | "outputs": [ 40 | { 41 | "output_type": "stream", 42 | "name": "stdout", 43 | "text": [ 44 | "\u001b[33m\r0% [Working]\u001b[0m\r \rHit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease\n", 45 | "Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 InRelease\n", 46 | "Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease\n", 47 | "Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease\n", 48 | "Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease\n", 49 | "Hit:6 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease\n", 50 | "Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease\n", 51 | "Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease\n", 52 | "Ign:9 https://r2u.stat.illinois.edu/ubuntu jammy InRelease\n", 53 | "Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease\n", 54 | "Hit:11 https://r2u.stat.illinois.edu/ubuntu jammy Release\n", 55 | "Reading package lists... Done\n", 56 | "Building dependency tree... Done\n", 57 | "Reading state information... Done\n", 58 | "54 packages can be upgraded. Run 'apt list --upgradable' to see them.\n", 59 | "\u001b[1;33mW: \u001b[0mSkipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)\u001b[0m\n", 60 | "Reading package lists... Done\n", 61 | "Building dependency tree... Done\n", 62 | "Reading state information... 
Done\n", 63 | "nvidia-cuda-toolkit is already the newest version (11.5.1-1ubuntu1).\n", 64 | "0 upgraded, 0 newly installed, 0 to remove and 54 not upgraded.\n", 65 | "Requirement already satisfied: pyopencl in /usr/local/lib/python3.10/dist-packages (2024.2.7)\n", 66 | "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from pyopencl) (1.26.4)\n", 67 | "Requirement already satisfied: platformdirs>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from pyopencl) (4.3.6)\n", 68 | "Requirement already satisfied: pytools>=2024.1.5 in /usr/local/lib/python3.10/dist-packages (from pyopencl) (2024.1.14)\n", 69 | "Requirement already satisfied: typing-extensions>=4 in /usr/local/lib/python3.10/dist-packages (from pytools>=2024.1.5->pyopencl) (4.12.2)\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "!sudo apt update\n", 75 | "!sudo apt install nvidia-cuda-toolkit -y\n", 76 | "!pip install pyopencl" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 2, 82 | "metadata": { 83 | "id": "lgP6978wudGd", 84 | "colab": { 85 | "base_uri": "https://localhost:8080/" 86 | }, 87 | "outputId": "52b475ee-2f4b-4f20-c30e-f433d2b657eb" 88 | }, 89 | "outputs": [ 90 | { 91 | "output_type": "stream", 92 | "name": "stderr", 93 | "text": [ 94 | "/usr/local/lib/python3.10/dist-packages/pytools/persistent_dict.py:63: RecommendedHashNotFoundWarning: Unable to import recommended hash 'siphash24.siphash13', falling back to 'hashlib.sha256'. 
Run 'python3 -m pip install siphash24' to install the recommended hash.\n", 95 | " warn(\"Unable to import recommended hash 'siphash24.siphash13', \"\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "import numpy as np\n", 101 | "import pyopencl as cl\n", 102 | "import pyopencl.array as cl_array" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 3, 108 | "metadata": { 109 | "id": "JCIaRG2KufBQ" 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "ctx = cl.create_some_context()\n", 114 | "queue = cl.CommandQueue(ctx)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 4, 120 | "metadata": { 121 | "colab": { 122 | "base_uri": "https://localhost:8080/" 123 | }, 124 | "id": "i4jmqmb-j_B7", 125 | "outputId": "b63bcb4e-1f54-449b-9c83-c7c4c5972049" 126 | }, 127 | "outputs": [ 128 | { 129 | "output_type": "stream", 130 | "name": "stdout", 131 | "text": [ 132 | "Platform name: NVIDIA CUDA\n", 133 | "Device name: Tesla T4\n", 134 | "Maximum work group size: 1024\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "platform = cl.get_platforms()[0]\n", 140 | "device = ctx.devices[0]\n", 141 | "print(\"Platform name:\", platform.name)\n", 142 | "print(\"Device name:\", device.name)\n", 143 | "print(\"Maximum work group size:\", device.max_work_group_size)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": { 149 | "id": "_pv020cIuoWa" 150 | }, 151 | "source": [ 152 | "In this tutorial, we like to calculate the sum of all elements in a vector of arbitrary size. In general, the size of the vector may not be a multiple of the desired workgroup size. In that case, the algorithm needs to be adapted to facilitate arbitrary workgroup and vector sizes. One option is called 'padding'.\n", 153 | "\n", 154 | "Let us first create the kernel to calculate the sum of an integer array. See tutorial 5 for an explanation on the kernel." 
155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 5, 160 | "metadata": { 161 | "id": "UtjvaSG-urrG" 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "kernel = \"\"\"\n", 166 | "__kernel void sum1(__global const long int *vec,\n", 167 | " __global long int *partial_sums)\n", 168 | "{\n", 169 | " int group_size = get_local_size(0);\n", 170 | " int local_id = get_local_id(0);\n", 171 | " int group_id = get_group_id(0);\n", 172 | " int global_id = get_global_id(0);\n", 173 | "\n", 174 | " if (local_id == 0){\n", 175 | " long int sum = 0;\n", 176 | " for(int i = 0; i < group_size; i++){\n", 177 | " sum += vec[global_id + i];\n", 178 | " }\n", 179 | " partial_sums[group_id] = sum;\n", 180 | " }\n", 181 | "}\n", 182 | "__kernel void sum2(__global long int *vec,\n", 183 | " __global long int *partial_sums)\n", 184 | "{\n", 185 | " int group_size = get_local_size(0);\n", 186 | " int local_id = get_local_id(0);\n", 187 | " int group_id = get_group_id(0);\n", 188 | " int global_id = get_global_id(0);\n", 189 | "\n", 190 | " int step = 2;\n", 191 | " while (step <= group_size) {\n", 192 | " if (local_id % step == 0) {\n", 193 | " vec[global_id] += vec[global_id + step / 2];\n", 194 | " }\n", 195 | " barrier(CLK_GLOBAL_MEM_FENCE);\n", 196 | " step *= 2;\n", 197 | " }\n", 198 | " if (local_id == 0){\n", 199 | " partial_sums[group_id] = vec[global_id];\n", 200 | " }\n", 201 | "}\n", 202 | "\"\"\"" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 6, 208 | "metadata": { 209 | "id": "RKRY4_iuwy32" 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "prg = cl.Program(ctx, kernel).build()" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "source": [ 219 | "The idea of 'padding' is to add dummy elements to the vector that will not change the final result. 
For example, if we'd like to calculate the sum of a vector, one can add an arbitrary number of elements with value zero, without changing the final result.\n", 220 | "\n", 221 | "Let us assume that we have a vector of dimension $d$ and a workgroup of size $s$. PyOpenCL needs a domain with workgroups of equal size. However, $d$ may not be a multiple of $s$. Hence, we create another vector with size $n$ such that $n \\geq d$ and $n \\mod d = 0$, that is, $n$ is a multiple of $d$. The following function provides an efficient routine to calculate the next multiple." 222 | ], 223 | "metadata": { 224 | "id": "W5PmWsU-CYXg" 225 | } 226 | }, 227 | { 228 | "cell_type": "code", 229 | "source": [ 230 | "def next_multiple(val, mul):\n", 231 | " \"\"\"Return the smallest value which is larger or equal to 'val' and a multiple of 'mul'.\"\"\"\n", 232 | " return -(-val // mul) * mul" 233 | ], 234 | "metadata": { 235 | "id": "ska0rPVSDd4o" 236 | }, 237 | "execution_count": 7, 238 | "outputs": [] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "source": [ 243 | "For example, if we have a vector of size 100 and like to use workgroup sizes of 32, we need 4 groups to cover the dimension. Notice that $4 \\cdot 32 = 128$ is the next multiple." 244 | ], 245 | "metadata": { 246 | "id": "e1jxH9G1DtJ2" 247 | } 248 | }, 249 | { 250 | "cell_type": "code", 251 | "source": [ 252 | "print(\"The next multiple of 32 larger or equal to 100 is: \", next_multiple(100, 32))" 253 | ], 254 | "metadata": { 255 | "colab": { 256 | "base_uri": "https://localhost:8080/" 257 | }, 258 | "id": "mfoOdfRsD8pa", 259 | "outputId": "addc1ad9-fb42-4c58-90f8-6013a17276cf" 260 | }, 261 | "execution_count": 8, 262 | "outputs": [ 263 | { 264 | "output_type": "stream", 265 | "name": "stdout", 266 | "text": [ 267 | "The next multiple of 32 larger or equal to 100 is: 128\n" 268 | ] 269 | } 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "source": [ 275 | "Let us choose a vector size and a workgroup size." 
276 | ], 277 | "metadata": { 278 | "id": "0m4tuU-bDjsM" 279 | } 280 | }, 281 | { 282 | "cell_type": "code", 283 | "source": [ 284 | "vector_size = 10000\n", 285 | "workgroup_size = 32" 286 | ], 287 | "metadata": { 288 | "id": "ml3YzUw5EO6z" 289 | }, 290 | "execution_count": 9, 291 | "outputs": [] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "source": [ 296 | "Let us calculate the next multiple of the workgroup size larger or equal to the vector dimension. This will be the size of our thread domain." 297 | ], 298 | "metadata": { 299 | "id": "eiriH6agEc7V" 300 | } 301 | }, 302 | { 303 | "cell_type": "code", 304 | "source": [ 305 | "global_size = next_multiple(vector_size, workgroup_size)\n", 306 | "n_workgroups = global_size // workgroup_size\n", 307 | "print(\"The global size of the domain is:\", global_size)\n", 308 | "print(\"The number of workgroups is:\", n_workgroups)" 309 | ], 310 | "metadata": { 311 | "colab": { 312 | "base_uri": "https://localhost:8080/" 313 | }, 314 | "id": "a7SE3GteEk2Z", 315 | "outputId": "533fd4a9-7efc-47c8-8bc0-b75303c4139d" 316 | }, 317 | "execution_count": 10, 318 | "outputs": [ 319 | { 320 | "output_type": "stream", 321 | "name": "stdout", 322 | "text": [ 323 | "The global size of the domain is: 10016\n", 324 | "The number of workgroups is: 313\n" 325 | ] 326 | } 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "source": [ 332 | "Let us create a vector with $n$ values from zero to $n-1$ for which we'd like to calculate the sum of its elements." 
333 | ], 334 | "metadata": { 335 | "id": "YX1T_i1XFMf4" 336 | } 337 | }, 338 | { 339 | "cell_type": "code", 340 | "source": [ 341 | "my_vector = np.arange(vector_size, dtype=np.int64)" 342 | ], 343 | "metadata": { 344 | "id": "KKgRPIcBFVp1" 345 | }, 346 | "execution_count": 11, 347 | "outputs": [] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "source": [ 352 | "The essential step of the 'padding' approach is to create another vector which will be given to the PyOpenCL kernel. That is, we need to add additional zero elements to fill the vector up until reaching the desired size. Remember that appending zero elements will not change the objective: calculating the sum of the vector." 353 | ], 354 | "metadata": { 355 | "id": "WgpRE1MbFhNe" 356 | } 357 | }, 358 | { 359 | "cell_type": "code", 360 | "source": [ 361 | "padding = np.zeros(global_size - vector_size, dtype=np.int64)\n", 362 | "np_vector = np.concatenate((my_vector, padding))\n", 363 | "print(\"The size of the padded vector:\", np_vector.size)\n", 364 | "print(\"The elements of the last workgroup:\", np_vector[-workgroup_size:])" 365 | ], 366 | "metadata": { 367 | "colab": { 368 | "base_uri": "https://localhost:8080/" 369 | }, 370 | "id": "C_X7AJ6JF0rP", 371 | "outputId": "6cb308d0-1e84-42ce-af9c-8b19a4e5c987" 372 | }, 373 | "execution_count": 12, 374 | "outputs": [ 375 | { 376 | "output_type": "stream", 377 | "name": "stdout", 378 | "text": [ 379 | "The size of the padded vector: 10016\n", 380 | "The elements of the last workgroup: [9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997\n", 381 | " 9998 9999 0 0 0 0 0 0 0 0 0 0 0 0\n", 382 | " 0 0 0 0]\n" 383 | ] 384 | } 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "source": [ 390 | "print(\"Sum of the original vector:\", np.sum(my_vector))\n", 391 | "print(\"Sum of the padded vector:\", np.sum(np_vector))" 392 | ], 393 | "metadata": { 394 | "colab": { 395 | "base_uri": "https://localhost:8080/" 396 | }, 397 | "id": 
"BcB2pNQHJJB8", 398 | "outputId": "db5f089c-8386-49c3-a3be-d989ec85faa4" 399 | }, 400 | "execution_count": 13, 401 | "outputs": [ 402 | { 403 | "output_type": "stream", 404 | "name": "stdout", 405 | "text": [ 406 | "Sum of the original vector: 49995000\n", 407 | "Sum of the padded vector: 49995000\n" 408 | ] 409 | } 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "source": [ 415 | "We can indeed see that the new vector has zero elements in the final workgroup. These are the padded values. Now, we are ready to launch the kernel for the padded vector. Notice that we need to provide the global size of the domain, not the dimension of the vector to the program." 416 | ], 417 | "metadata": { 418 | "id": "M5ZOmxBbGay0" 419 | } 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 14, 424 | "metadata": { 425 | "id": "C-ACLmhhyDbA" 426 | }, 427 | "outputs": [], 428 | "source": [ 429 | "cl_vector = cl_array.to_device(queue, np_vector)\n", 430 | "\n", 431 | "cl_partial_sums1 = cl_array.empty(queue, n_workgroups, dtype=np.int64)\n", 432 | "cl_partial_sums2 = cl_array.empty(queue, n_workgroups, dtype=np.int64)" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 15, 438 | "metadata": { 439 | "id": "8Tl44a3rK9pK" 440 | }, 441 | "outputs": [], 442 | "source": [ 443 | "event = prg.sum1(queue,\n", 444 | " (global_size,),\n", 445 | " (workgroup_size,),\n", 446 | " cl_vector.data,\n", 447 | " cl_partial_sums1.data\n", 448 | " )" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 16, 454 | "metadata": { 455 | "id": "zOgdjPGQw05a" 456 | }, 457 | "outputs": [], 458 | "source": [ 459 | "event = prg.sum2(queue,\n", 460 | " (global_size,),\n", 461 | " (workgroup_size,),\n", 462 | " cl_vector.data,\n", 463 | " cl_partial_sums2.data\n", 464 | " )" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 17, 470 | "metadata": { 471 | "id": "fZz_oO8L3QRN" 472 | }, 473 | "outputs": [], 474 | 
"source": [ 475 | "np_partial_sums1 = cl_partial_sums1.get()\n", 476 | "vector_sum1 = np.sum(np_partial_sums1)\n", 477 | "np_partial_sums2 = cl_partial_sums2.get()\n", 478 | "vector_sum2 = np.sum(np_partial_sums2)" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 18, 484 | "metadata": { 485 | "colab": { 486 | "base_uri": "https://localhost:8080/" 487 | }, 488 | "id": "Lh1Y_6560TxT", 489 | "outputId": "4eb90b66-a1b0-491a-844b-02521c4a8d7d" 490 | }, 491 | "outputs": [ 492 | { 493 | "output_type": "stream", 494 | "name": "stdout", 495 | "text": [ 496 | "The sum calculated by OpenCL: 49995000\n", 497 | "The sum calculated by OpenCL: 49995000\n", 498 | "The exact value of the sum: 49995000\n" 499 | ] 500 | } 501 | ], 502 | "source": [ 503 | "print(\"The sum calculated by OpenCL:\", vector_sum1)\n", 504 | "print(\"The sum calculated by OpenCL:\", vector_sum2)\n", 505 | "print(\"The exact value of the sum: \", vector_size*(vector_size-1)//2)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "markdown", 510 | "source": [ 511 | "The exact value of summing values ranging from 0 to $n-1$ is explicitly known: $n(n-1)/2$. The results implemented with OpenCL are correct, indeed." 512 | ], 513 | "metadata": { 514 | "id": "ZPYQyfdBIABF" 515 | } 516 | } 517 | ], 518 | "metadata": { 519 | "accelerator": "GPU", 520 | "colab": { 521 | "provenance": [] 522 | }, 523 | "kernelspec": { 524 | "display_name": "Python 3 (ipykernel)", 525 | "language": "python", 526 | "name": "python3" 527 | }, 528 | "language_info": { 529 | "codemirror_mode": { 530 | "name": "ipython", 531 | "version": 3 532 | }, 533 | "file_extension": ".py", 534 | "mimetype": "text/x-python", 535 | "name": "python", 536 | "nbconvert_exporter": "python", 537 | "pygments_lexer": "ipython3", 538 | "version": "3.10.5" 539 | } 540 | }, 541 | "nbformat": 4, 542 | "nbformat_minor": 0 543 | } --------------------------------------------------------------------------------