├── .gitignore ├── AY07 ├── LectureFiniteDifferences.pdf ├── 1_OMP_static_schedule.cpp ├── 2_OMP_dynamic_schedule.cpp └── 3_OMP_reduction.cpp ├── AY04 ├── Debug │ ├── 1.cpp │ ├── README.md │ ├── 4.cpp │ ├── 2.cpp │ ├── 3.cpp │ └── 5.cpp ├── job.sh ├── 5valgrind.cpp ├── 2aritmetica_punteros.cpp ├── 1punteros.cpp ├── 4matrix_heap.cpp ├── README.md └── 3arrays.cpp ├── AY05 ├── job.sh ├── 1MPI_hello_world.cpp ├── 3MPI_dot.cpp ├── 4MPI_mat_vec.cpp ├── 2MPI_sum.cpp ├── 5MPI_variable_collective.cpp └── README.md ├── AY01 ├── Paralelo.py ├── README.md ├── Tutorial_joblib_2_reuse.ipynb ├── Tutorial_joblib_1_basics.ipynb └── Comparaciones.ipynb ├── AY03 ├── 1hello_world.cpp ├── README.md ├── 4punteros.cpp ├── 5funciones.cpp ├── 2variables.cpp ├── 6arrays.cpp └── 3flujos.cpp ├── AY06 ├── 1_OMP_helloworld.cpp ├── 2_OMP_loops.cpp ├── 3_OMP_matvec.cpp └── DataLocality.ipynb ├── AY09 ├── README.md └── 06_reduction_padding.ipynb ├── AY08 ├── PyOpenCL │ └── README.md └── Numba │ ├── 6_Numba_caching.ipynb │ ├── 4_Numba_race_condition.ipynb │ ├── 1_PythonDecorators.ipynb │ ├── 2_Numba_vector_addition.ipynb │ ├── 3_Numba_vector_addition_parallel.ipynb │ └── 5_Numba_data_types.ipynb ├── README.md └── AY02 └── Tutorial_joblib_5_shared_variables.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | *.out 3 | AY06/Ejemplo -------------------------------------------------------------------------------- /AY07/LectureFiniteDifferences.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShescBlank/IMT2112/HEAD/AY07/LectureFiniteDifferences.pdf -------------------------------------------------------------------------------- /AY04/Debug/1.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main() { 4 | 5 | printf("Hello World\n"); 6 | 7 | return 0; 8 | } 
-------------------------------------------------------------------------------- /AY04/Debug/README.md: -------------------------------------------------------------------------------- 1 | OJO: Los códigos de esta carpeta pueden contener errores y solo los usaremos para realizar el ejercicio de encontrar los problemas que hay o puede haber. -------------------------------------------------------------------------------- /AY04/Debug/4.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main() { int a = 100; for (int i = 0; i < a; i++) { printf("%d\n", i); } printf("Este código es un poco extraño\n"); printf("Por favor no entreguen algo así :cc\n"); return 0; } -------------------------------------------------------------------------------- /AY04/job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --partition=full 4 | 5 | #SBATCH --job-name=IMT2112 6 | #SBATCH --output=log.out 7 | 8 | #SBATCH --ntasks=1 9 | #SBATCH --cpus-per-task=1 10 | 11 | g++ 1punteros.cpp 12 | ./a.out -------------------------------------------------------------------------------- /AY05/job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --partition=full 4 | 5 | #SBATCH --job-name=IMT2112 6 | #SBATCH --output=log.out 7 | 8 | ### Notar que la línea siguiente define la cantidad de procesos que queremos utilizar (no agregamos -np en el mpirun): 9 | #SBATCH --ntasks=4 10 | #SBATCH --cpus-per-task=1 11 | 12 | mpic++ 1MPI_hello_world.cpp -std=c++11 13 | mpirun ./a.out -------------------------------------------------------------------------------- /AY04/Debug/2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main() { 4 | 5 | int count = 0; 6 | int lim1 = 10; 7 | int lim2 = 20; 8 | 9 | for (int i = 0; i < lim1; i++) 10 | { 11 | for (int j = 0; j < 
lim2; i++) 12 | { 13 | count++; 14 | } 15 | } 16 | 17 | printf("El resultado de la cuenta es: %d\n", count); 18 | 19 | return 0; 20 | } -------------------------------------------------------------------------------- /AY01/Paralelo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from math import sqrt 3 | from joblib import Parallel 4 | from joblib import delayed 5 | import time as tm 6 | 7 | print("Comenzando a calcular...") 8 | start = tm.time() 9 | parallel_pool = Parallel(n_jobs=4) 10 | parallel_sqrt = delayed(sqrt) 11 | parallel_tasks = [parallel_sqrt(i) for i in range(10000000)] 12 | parallel_results = parallel_pool(parallel_tasks) 13 | end = tm.time() 14 | print(f"Tiempo total: {end - start}s") -------------------------------------------------------------------------------- /AY04/Debug/3.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() { 5 | 6 | int length1 = 10; 7 | int length2 = 5; 8 | int** arrays = (int**)calloc(length1, sizeof(int*)); 9 | 10 | for (int i = 0; i < length1; ++i) 11 | { 12 | arrays[i] = (int*)malloc(length2 * sizeof(int)); 13 | 14 | for (int j = 0; j < length2; j++) 15 | { 16 | arrays[i][j] = j*i; 17 | } 18 | } 19 | 20 | free(arrays); 21 | 22 | return 0; 23 | } -------------------------------------------------------------------------------- /AY07/1_OMP_static_schedule.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Comandos útiles: 3 | Compilar código: g++ -o a1.out 1_OMP_static_schedule.cpp -fopenmp 4 | Correr código: ./a1.out 5 | Podemos correrlo con time (antes del ./a1.out) para ver cuánto se demora en total. 
6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include "iostream" 12 | #include // Sleep 13 | 14 | using namespace std; 15 | 16 | int main() { 17 | 18 | #pragma omp parallel for num_threads(4) // schedule(static) 19 | for (int i = 1; i < 20; i++) { 20 | int id = omp_get_thread_num(); 21 | sleep(i); 22 | cout << "Thread " << id << " durmió " << i << " segundos ZzZ..." << endl; 23 | } 24 | 25 | return 0; 26 | } 27 | -------------------------------------------------------------------------------- /AY07/2_OMP_dynamic_schedule.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Comandos útiles: 3 | Compilar código: g++ -o a2.out 2_OMP_dynamic_schedule.cpp -fopenmp 4 | Correr código: ./a2.out 5 | Podemos correrlo con time (antes del ./a2.out) para ver cuánto se demora en total. 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include "iostream" 12 | #include // Sleep 13 | 14 | using namespace std; 15 | 16 | int main() { 17 | 18 | #pragma omp parallel for num_threads(4) schedule(dynamic) 19 | for (int i = 1; i < 20; i++) { 20 | int id = omp_get_thread_num(); 21 | sleep(i); 22 | cout << "Thread " << id << " durmió " << i << " segundos ZzZ..." 
<< endl; 23 | } 24 | 25 | return 0; 26 | } 27 | -------------------------------------------------------------------------------- /AY03/1hello_world.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: g++ 1hello_world.cpp -o name_output 4 | - Correr: ./name_output 5 | */ 6 | 7 | // Incluimos la librería standard I/O 8 | #include // incluye al printf 9 | 10 | // Las dos líneas siguientes son para el cout 11 | #include 12 | using namespace std; 13 | 14 | // La función main es la que se ejecuta cuando se corre el código 15 | int main() { 16 | 17 | printf("Hello World\n"); // printf no tiene salto de línea 18 | 19 | // "\n" representa un salto de línea 20 | // "\t" representa un tab (4 espacios) 21 | 22 | cout << "Hello " << "World"; 23 | cout << " 2" << endl; // endl agrega el salto de línea 24 | 25 | // Debemos retornar un número porque así fue definido main 26 | return 0; 27 | } -------------------------------------------------------------------------------- /AY04/Debug/5.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int* create_array(int n) 5 | { 6 | int array[n]; 7 | for (int i = 0; i < n; i++) 8 | { 9 | array[i] = i; 10 | } 11 | return array; 12 | } 13 | 14 | int main() { 15 | 16 | // Creemos un array con la función anterior 17 | int n = 10; 18 | 19 | // Nos retorna un puntero de int 20 | int* array1 = create_array(n); 21 | 22 | // Imprimamos sus elementos para ver que todo esté bien 23 | for (int i = 0; i < n; i++) 24 | { 25 | printf("%d ", array1[i]); 26 | } 27 | printf("\n"); 28 | 29 | // Ahora creemos otro array 30 | int* array2 = create_array(n); 31 | 32 | // E imprimamos 33 | for (int i = 0; i < n; i++) 34 | { 35 | printf("%d ", array2[i]); 36 | } 37 | printf("\n"); 38 | 39 | return 0; 40 | } -------------------------------------------------------------------------------- /AY03/README.md: 
-------------------------------------------------------------------------------- 1 | ## Compilar Código: (desde la consola y deben ubicarse en la posición de los archivos (utilizando cd)) 2 | 3 | - g++ codigo.gcc (genera un archivo ejecutable a.out) 4 | - g++ codigo.gcc -o archivo (genera un archivo ejecutable archivo.out) 5 | 6 | 7 | ## Correr Código: 8 | - ```./archivo.out``` 9 | - ```valgrind ./archivo.out``` (Para debuggear) 10 | 11 | 12 | Operaciones de Calculo: + - * / %(modulo) 13 | 14 | ```x += 1``` es equivalente a ```x = x+1``` (lo mismo para otras operaciones excepto modulo) 15 | 16 | Para suma y resta esto es equivalente a ```x++``` o ```x--``` 17 | 18 | 19 | Operaciones Comparativas: == != < > <= >= 20 | 21 | 22 | Operaciones Logicas: &&(and) ||(or) 23 | 24 | 25 | printf: %i(int) $d(int) %f(float) %lf(double) 26 | 27 | 28 | sizeof(tipo_variable): devuelve el tamaño en bytes de ese tipo de variable 29 | 30 | 31 | ## Si quieren aprender más o profundizar sobre lo visto en la ayudantía les recomiendo el siguiente [enlace](https://github.com/DCCentral-de-Apuntes/intro-C) 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /AY03/4punteros.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: g++ 4punteros.cpp -o name_output 4 | - Correr: ./name_output 5 | */ 6 | 7 | #include 8 | 9 | int main() { 10 | // Declaremos un int y démosle un valor 11 | int a = 4; 12 | 13 | // Declaremos un puntero, en este caso es un puntero a un valor int 14 | int* b; 15 | 16 | // Hagamos que el valor de b sea el puntero de a (anteponer & para acceder al puntero de una variable) 17 | b = &a; 18 | 19 | printf("Mi valor es %d y mi puntero es %p\n", a, b); 20 | 21 | // Los punteros tienen muchas utilidades ya que podemos utilizarlos para acceder al valor guardado en memoria: 22 | // (* a la izquierda de la variable para acceder al valor 
apuntado) 23 | *b += 1; 24 | 25 | printf("Mi valor es %d y mi puntero es %p\n", a, b); 26 | 27 | // También podemos hacer punteros de punteros 28 | int** c = &b; 29 | 30 | // Nuevamente podemos utilizarlo para modificar a: 31 | *(*c) += 1; 32 | 33 | printf("Mi valor es %d y mi puntero es %p y el puntero de mi puntero es %p\n", a, b, c); 34 | 35 | return 0; 36 | } -------------------------------------------------------------------------------- /AY06/1_OMP_helloworld.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Comandos útiles: 3 | Compilar código: g++ -o name_output 1_OMP_helloworld.cpp -fopenmp 4 | Correr código: ./name_output 5 | */ 6 | 7 | #include 8 | #include 9 | 10 | int main() { 11 | 12 | // Trabajaremos con thread y usaremos OpenMP 13 | 14 | // Con la siguiente línea, generamos un bloque de código que será ejecutado por múltiple threads: 15 | #pragma omp parallel // en mi computador se utilizan 8 threads por defecto 16 | { 17 | printf("Hello World\n"); 18 | int id = omp_get_thread_num(); 19 | int total = omp_get_num_threads(); 20 | printf("Greetings from process %d out of %d \n", id, total); 21 | } 22 | 23 | printf("\n=================================================================\n"); 24 | 25 | // Pero también podemos decirle cuántos threads queremos 26 | #pragma omp parallel num_threads(1) 27 | { 28 | printf("Hello World\n"); 29 | int id = omp_get_thread_num(); 30 | int total = omp_get_num_threads(); 31 | printf("Greetings from process %d out of %d \n", id, total); 32 | } 33 | 34 | return 0; 35 | } 36 | 37 | -------------------------------------------------------------------------------- /AY03/5funciones.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: g++ 5funciones.cpp -o name_output 4 | - Correr: ./name_output 5 | */ 6 | 7 | #include 8 | 9 | int suma(int x1, int x2) 10 | { 11 | x1 += 1; 12 | x2 += 2; 13 | 
return x1 + x2; 14 | } 15 | 16 | void actualizar(int* x) // no retorna nada 17 | { 18 | *x = 1; // modifico la variable desde dentro de una función mediante su puntero 19 | } 20 | 21 | float promedio(float x, float y, float z) 22 | { 23 | return (x + y + z) / 3; 24 | } 25 | 26 | int main() { 27 | 28 | int a = 5; 29 | int b = 10; 30 | 31 | printf("a = %d (antes de actualizar())\n", a); 32 | 33 | actualizar(&a); 34 | 35 | printf("a = %d (después de actualizar())\n\n", a); 36 | 37 | int c = suma(a, b); // no modifica a y b ya que crea una copia de ellas en la función 38 | 39 | printf("Valores finales:\n"); 40 | printf("a = %d\n", a); // Notar que a y b no se modifican en la función suma 41 | printf("b = %d\n", b); 42 | printf("c = %d\n", c); 43 | printf("Promedio = %f\n", promedio(1.1, 5.7, 13.59)); 44 | 45 | return 0; 46 | } 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /AY05/1MPI_hello_world.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: mpic++ 1MPI_hello_world.cpp -std=c++11 4 | - Correr (con 2 procesos): mpirun -np 2 ./a.out 5 | Se puede cambiar el número de procesos con el que se corre 6 | */ 7 | 8 | #include // Nuevo include! 
9 | #include 10 | 11 | int main(int argc, char** argv) { 12 | // int a = 10; 13 | // printf("Valor de a: %d\n", a); 14 | 15 | // Initialize the MPI environment (Message Passing Interface) 16 | MPI_Init(NULL, NULL); 17 | 18 | // Get the number of processes 19 | int world_size; 20 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 21 | 22 | // El MPI_COMM_WORLD es el comunicador de nuestro grupo de procesos 23 | 24 | // Get the rank of the process 25 | int world_rank; 26 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 27 | 28 | // Get the name of the processor 29 | char processor_name[MPI_MAX_PROCESSOR_NAME]; 30 | int name_len; 31 | MPI_Get_processor_name(processor_name, &name_len); 32 | 33 | // Print off a hello world message from each process 34 | printf("Hello world from processor %s, rank %d out of %d processors\n", processor_name, world_rank, world_size); 35 | 36 | // Finalize the MPI environment. 37 | MPI_Finalize(); 38 | } -------------------------------------------------------------------------------- /AY01/README.md: -------------------------------------------------------------------------------- 1 | # AY01 2 | 3 | Introducción a la librería Joblib y comparaciones de rendimiento. 4 | 5 | Librerías utilizadas para esta ayudantía: 6 | - Numpy 7 | - Joblib 8 | 9 | Todas estas librerías se pueden instalar usando pip (Ej: ```pip install joblib```). 10 | 11 | También pueden usar Google Colab para correr todos los códigos. 12 | 13 | ## ¿Cómo observar la cantidad de procesos e hilos que está usando nuestro programa? 14 | 15 | Para mirar estas dos cantidades podemos utilizar el administrador de tareas (Windows) o monitor de actividad (Mac) de nuestro computador. En específico, para los procesos basta con fijarse en cuántas instancias de Python podemos ver abiertas y trabajando (básicamente, programas con el nombre de Python). 
En relación a la cantidad de hilos, en Mac es sencillo ya que el mismo monitor de actividad lo dice directamente, sin embargo, en Windows esta opción está un poco más escondida y hay que seguir los siguientes pasos para poder verla: 16 | 17 | - Abrir el Administrador de Tareas 18 | - Ir a la pestaña de detalles 19 | - Hacer click derecho en alguno de los nombres de las columnas y apretar "Seleccionar columnas" 20 | - Bajar en el listado hasta encontrar la opción de "Threads" o "Subprocesos" e incluirla en el listado 21 | - ¡Listo! Ahora podemos ver cuántos hilos está usando cada programa por separado -------------------------------------------------------------------------------- /AY06/2_OMP_loops.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Comandos útiles: 3 | Compilar código: g++ -o name_output 2_OMP_loops.cpp -fopenmp 4 | Correr código: ./name_output 5 | */ 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | int main() { 12 | 13 | // Qué pasa si sumamos en paralelo usando threads? 
14 | int x1 = 0; 15 | 16 | // Notar que agregamos "for" al pragma 17 | // Esto para repartir las iteraciones entre los threads 18 | #pragma omp parallel for num_threads(24) 19 | for (int i = 0; i < 10000; i++) { 20 | int id = omp_get_thread_num(); 21 | x1 += 1; 22 | //printf("Proceso %d sumó 1\n", id); 23 | } 24 | 25 | printf("\nSuma Final 1 = %d\n", x1); 26 | 27 | // Veamos otra posibilidad 28 | 29 | int x2 = 0; 30 | int n = 10; 31 | int* array = (int*) calloc(n, sizeof(int)); 32 | 33 | #pragma omp parallel for num_threads(n) 34 | for (int i = 0; i < 100000000; i++) { 35 | int id = omp_get_thread_num(); 36 | array[id] += 1; 37 | // printf("Proceso %d sumó 1\n", id); 38 | } 39 | 40 | for (int i = 0; i < n; i++) { 41 | x2 += array[i]; 42 | } 43 | 44 | printf("\nSuma Final 2 = %d\n", x2); 45 | 46 | 47 | // Nunca olviden de liberar memoria si es que la reservan :D 48 | free(array); 49 | 50 | return 0; 51 | } 52 | -------------------------------------------------------------------------------- /AY04/5valgrind.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: g++ -g 5valgrind.cpp -o name_output 4 | - Correr con Valgrind: valgrind ./name_output 5 | Valgrind se encargará de revisar nuestros errores 6 | pero debes compilar con -g para que nos pueda decir nuestros errores 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | int main() { 14 | // Este código contiene errores! 
15 | 16 | int N, begin, end; 17 | begin = 10 18 | end = 20; 19 | 20 | // Creemos un array 21 | int* array = (int*)calloc(end-begin, sizeof(int)); 22 | 23 | // rellenemos sus valores 24 | for (int i = 0; i < end-begin; i++) 25 | { 26 | array[i] = i; 27 | } 28 | 29 | // Imprimimamos sus resultados 30 | for (int i = 0; i < end-begin; i++) 31 | { 32 | printf("%d ", array[i]); 33 | } 34 | printf("\n"); 35 | 36 | // Y cambiemos el valor en la casilla N 37 | array[N] = 100; 38 | 39 | // Volvamos a imprimir 40 | for (int i = begin; i < end; i++) 41 | { 42 | printf("%d ", array[i]); 43 | } 44 | printf("\n"); 45 | 46 | // No olviden liberar la memoria reservada 47 | free(array); 48 | // Si liberaron todo en un código normal, Valgrind debería decir: 49 | // "All heap blocks were freed -- no leaks are possible" 50 | 51 | return 0; 52 | } -------------------------------------------------------------------------------- /AY03/2variables.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: g++ 2variables.cpp -o name_output 4 | - Correr: ./name_output 5 | */ 6 | 7 | #include 8 | #include 9 | using namespace std; 10 | 11 | int main() { 12 | 13 | // Para utilizar variables debemos definir su tipo 14 | 15 | // INT: 16 | printf("INT:\n"); 17 | int a = 9; 18 | int b; 19 | b = 10; 20 | printf("La suma de %i y %d es: %d\n", a, b, a+b); 21 | cout << "La multiplicación de " << a << " y " << b << " es: " << a*b << endl; 22 | 23 | // FLOAT: 24 | printf("\nFLOAT:\n"); 25 | float c = 10.2, d = 3.1; 26 | printf("c*d=%f\n", c*d); 27 | 28 | // CASTING: 29 | printf("\nCASTING:\n"); 30 | printf("División sin casting: %d/%d=%f\n", a, b, a/b); 31 | float result = (float)a/b; 32 | printf("División con casting: %d/%d=%f\n", a, b, result); 33 | 34 | // DOUBLE: 35 | printf("\nDOUBLE:\n"); 36 | float pi2 = 3.14159265358979323846; 37 | printf("%f\n", pi2); 38 | double pi = 3.14159265358979323846; 39 | printf("%lf\n", 
pi); 40 | 41 | // BOOLEANS: 42 | printf("\nBOOLEANS:\n"); 43 | bool e = true; 44 | bool f = false; 45 | bool g = !(e || f); 46 | bool h = g && f; 47 | printf("True: %d\n", e); 48 | printf("False: %d\n", f); 49 | printf("!(T || F)=%d\nF && F=%d\n", g, h); 50 | 51 | return 0; 52 | } -------------------------------------------------------------------------------- /AY04/2aritmetica_punteros.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: g++ 2aritmetica_punteros.cpp -o name_output 4 | - Correr: ./name_output 5 | */ 6 | 7 | #include 8 | #include 9 | 10 | int main() { 11 | // Creemos dos variables int 12 | int a = 1; 13 | int b = 2; 14 | printf("a = %d\n", a); 15 | printf("b = %d\n\n", b); 16 | 17 | // Veamos su puntero 18 | printf("Puntero de a: %p\n", &a); // Se puede observar que los punteros están en formato hexadecimal 19 | printf("Puntero de b: %p\n", &b); 20 | 21 | // La distancia de ambos punteros es 4, cuál es el tamaño en bytes de un int? 22 | printf("Tamaño en bytes de un int: %zu\n", sizeof(int)); 23 | 24 | // Qué ocurre si le sumamos 1 al puntero de a, llegamos a b? 
25 | printf("\nPuntero + 1 de a: %p\n", &a + 1); 26 | 27 | // Efectivamente llegamos, entonces podemos acceder al valor de b 28 | printf("Valor de b mediante el puntero de a: %d\n", *(&a + 1)); 29 | 30 | // Probemos con otros tipos de datos 31 | char c = 'c'; 32 | char d = 'd'; 33 | printf("\nCHAR\n"); 34 | printf("Puntero de c: %p\n", &c); 35 | printf("Puntero de d: %p\n", &d); 36 | printf("Tamaño en bytes de un char: %zu\n", sizeof(char)); 37 | 38 | 39 | double e = 0.1; 40 | double f = 0.123; 41 | printf("\nDOUBLE\n"); 42 | printf("Puntero de e: %p\n", &e); 43 | printf("Puntero de f: %p\n", &f); 44 | printf("Tamaño en bytes de un double: %zu\n", sizeof(double)); 45 | 46 | return 0; 47 | } -------------------------------------------------------------------------------- /AY04/1punteros.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: g++ 1punteros.cpp -o name_output 4 | - Correr: ./name_output 5 | */ 6 | 7 | #include 8 | 9 | int suma1(int x) 10 | { 11 | x += 1; 12 | return x; 13 | } 14 | 15 | void actualizar(int* x) // no retorna nada 16 | { 17 | *x += 1; // modifico la variable desde dentro de una función mediante su puntero 18 | // Notar que se usa * por la izquierda para acceder al valor apuntado 19 | } 20 | 21 | int main() { 22 | 23 | int a = 5; 24 | 25 | // Al crear la variable anterior, podemos acceder a su puntero: 26 | int* b = &a; 27 | 28 | // Recordar que para acceder al puntero se usa & 29 | // Y se debe declarar el tipo de puntero que estamos guardando, en este caso, puntero a int 30 | 31 | // Gracias al puntero, tenemos otra forma de modificar el valor de una variable 32 | // Esto será útil en arrays 33 | printf("Valor de a antes de actualizar: %d\n", a); 34 | actualizar(b); 35 | printf("Valor de a después de actualizar: %d\n\n", a); 36 | 37 | // Notar que si usamos la función suma, el valor de a no se modifica 38 | printf("Valor de a antes de suma1: %d\n", a); 
39 | suma1(a); 40 | printf("Valor de a después de suma1: %d\n\n", a); 41 | 42 | // Ahora, si podemos retornarlo y actualizar el valor 43 | printf("Valor de a antes de suma1 con retorno: %d\n", a); 44 | a = suma1(a); 45 | printf("Valor de a después de suma1 con retorno: %d\n", a); 46 | 47 | return 0; 48 | } 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /AY07/3_OMP_reduction.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Comandos útiles: 3 | Compilar código: g++ -o name_output 3_OMP_reduction.cpp -fopenmp 4 | Correr código: ./name_output 5 | */ 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | int main() { 12 | 13 | // Qué pasa si hacemos una reducción con OpenMP? 14 | int x1 = 0; 15 | 16 | // Notar que agregamos "for" al pragma 17 | // Esto para repartir las iteraciones entre los threads 18 | #pragma omp parallel for num_threads(24) 19 | for (int i = 0; i < 100000000; i++) { 20 | int id = omp_get_thread_num(); 21 | x1 += 1; 22 | //printf("Proceso %d sumó 1\n", id); 23 | } 24 | 25 | printf("\nSuma Final 1 = %d\n", x1); 26 | 27 | // Ahora, veamos algo que sí funciona: 28 | 29 | int x2 = 0; 30 | int n = 10; 31 | int* array = (int*) calloc(n, sizeof(int)); 32 | 33 | #pragma omp parallel for num_threads(n) 34 | for (int i = 0; i < 100000000; i++) { 35 | int id = omp_get_thread_num(); 36 | array[id] += 1; 37 | // printf("Proceso %d sumó 1\n", id); 38 | } 39 | 40 | for (int i = 0; i < n; i++) { 41 | x2 += array[i]; 42 | } 43 | 44 | printf("\nSuma Final 2 = %d\n", x2); 45 | 46 | // Por último, OpenMP también tiene la siguiente forma de hacerlo: 47 | int x3 = 0; 48 | 49 | // Agregamos el parámetro reduction al pragma 50 | #pragma omp parallel for num_threads(24) reduction(+:x3) 51 | for (int i = 0; i < 100000000; i++) { 52 | int id = omp_get_thread_num(); 53 | x3 += 1; 54 | //printf("Proceso %d sumó 1\n", id); 
55 | } 56 | 57 | printf("\nSuma Final 3 = %d\n", x3); 58 | 59 | 60 | // Nunca olviden de liberar memoria si es que la reservan :D 61 | free(array); 62 | 63 | return 0; 64 | } 65 | -------------------------------------------------------------------------------- /AY03/6arrays.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: g++ 6arrays.cpp -o name_output 4 | - Correr: ./name_output 5 | */ 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | int main() { 12 | 13 | // las listas no vienen nativas, existe array 14 | 15 | // Primera manera de hacerlo: dame un array de tamaño n de ints 16 | int n = 10; 17 | int array1[n]; 18 | 19 | // Segunda manera: puntero de int. calloc es resérvame 10 casilleros 20 | // de memoria del tamaño de un int. El (int*) es un casting de int a puntero de int 21 | int* array2 = (int*) calloc(10, sizeof(int)); // inicializa valores a 0 22 | 23 | // Tercera forma: en malloc solo le damos el tamaño total de lo que quiero reservar 24 | int* array3 = (int*) malloc(10*sizeof(int)); // no inicializa valores 25 | 26 | for (int i=0; i<10; i++) { 27 | // printf("%i ", array1[i]); 28 | // printf("%i ", array2[i]); 29 | // printf("%i ", array3[i]); 30 | // printf("\n"); 31 | } 32 | 33 | 34 | // Números random 35 | // srand((int) time(0)); // seteamos una semilla con el tiempo actual 36 | // esto significa que cambiará bastante seguido 37 | // y así obtendremos comportamientos distintos 38 | // al correr de nuevo el código. 
39 | //srand(1); 40 | 41 | for (int i=0;i<10;++i) { 42 | // array2[i] = rand(); // genera números random 43 | // array2[i] = rand()%10; // para generar números en algún rango (0 y 9) 44 | //array2[i] = rand()%201 - 100; // entre -100 y 100 45 | // printf("%i ", array2[i]); 46 | } 47 | 48 | printf("\n"); 49 | 50 | // Es necesario liberar las memorias que fueron reservadas con calloc o malloc (HEAP) 51 | free(array2); 52 | free(array3); 53 | // No es necesario al array1 porque este no reserva memoria 54 | 55 | return 0; 56 | } -------------------------------------------------------------------------------- /AY04/4matrix_heap.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: g++ 4matrix_heap -o name_output 4 | - Correr: ./name_output 5 | */ 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | void print_matrix(int N, int** matrix) 12 | { 13 | // Notar que es importante guardar siempre el largo de un array, para poder recorrerlo. 14 | // No tenemos una función directa como "len" para obtener su largo. 15 | for (int i = 0; i < N; i++) 16 | { 17 | for (int j = 0; j < N; j++) 18 | { 19 | printf("%d\t", matrix[i][j]); 20 | } 21 | printf("\n"); 22 | } 23 | } 24 | 25 | int main() { 26 | 27 | // Para hacer una matriz simplemente debemos hacer array de arrays (punteros de punteros) 28 | int N = 10; 29 | int** matrix = (int**)malloc(N * sizeof(int*)); // Reservamos un bloque de memoria que va a almacenar punteros de int 30 | for (int i = 0; i < N; i++) 31 | { 32 | matrix[i] = (int*)calloc(N, sizeof(int)); // Y ahora le damos valor a cada uno de esos punteros con un nuevo bloque 33 | // de memoria que busca almacenar ints. 34 | } 35 | 36 | // Rellenemos sus valores con números random 37 | srand((int) time(0)); 38 | for (int i = 0; i < N; i++) 39 | { 40 | for (int j = 0; j < N; j++) 41 | { 42 | matrix[i][j] = rand()%101; // Qué hace esto? 
43 | } 44 | } 45 | 46 | // Imprimamos nuestra matriz con la función definida 47 | print_matrix(N, matrix); 48 | 49 | // Finalmente, como usamos calloc y malloc, debemos liberar la memoria reservada 50 | // antes de acabar nuestro programa: 51 | for (int i = 0; i < N; i++) 52 | { 53 | free(matrix[i]); 54 | } 55 | free(matrix); 56 | // Recordar que siempre hay que liberar desde adentro hacia afuera para no perder las referencias. 57 | 58 | // Qué pasa si no liberamos la memoria o si liberamos matrix antes que cada bloque matrix[i]? 59 | 60 | return 0; 61 | } -------------------------------------------------------------------------------- /AY03/3flujos.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: g++ 3flujos.cpp -o name_output 4 | - Correr: ./name_output 5 | */ 6 | 7 | #include 8 | #include 9 | using namespace std; 10 | 11 | int main() { 12 | 13 | int a, b, c; 14 | a = 10; 15 | b = 2; 16 | c = 5; 17 | 18 | 19 | // IF, ELSE IF, ELSE 20 | printf("\nIF, ELSE IF, ELSE:\n"); 21 | if (a > c) 22 | { 23 | printf("a > c\n"); 24 | } 25 | else if (a == c) { 26 | printf("a = c\n"); 27 | } 28 | else { printf("a < c\n"); } 29 | 30 | 31 | // FOR 32 | printf("\nFOR: i++\n"); 33 | for (int i = 0; i < 10; i++) 34 | { 35 | cout << i << endl; 36 | } 37 | 38 | printf("\nFOR: i*=2\n"); 39 | for (int i = 1; i < 10; i*=2) 40 | { 41 | cout << i << endl; 42 | } 43 | 44 | printf("\nFOR: continue\n"); 45 | for (int i = 0; i < 5; i++) 46 | { 47 | for (int j = 0; j < 5; j++) 48 | { 49 | if (i == j) 50 | { 51 | continue; 52 | } 53 | printf("(%d, %d)\n", i, j); 54 | } 55 | } 56 | 57 | // WHILE 58 | printf("\nWHILE:\n"); 59 | while (c < 8) 60 | { 61 | cout << c++ << endl; // Ojo que acá se le está sumando 1 al valor de la variable c en cada llamado 62 | // cout << ++c << endl; // Qué diferencia hay con esta forma? 
63 | } 64 | cout << "Valor de c fuera del while: "<< c << endl; 65 | 66 | printf("\nWHILE: true y break\n"); 67 | while (true) 68 | { 69 | b -= 1; 70 | if (b == -10) 71 | { 72 | printf("Rompemos el while\n"); 73 | break; 74 | } 75 | printf("Siguiente iteración...\n"); 76 | } 77 | 78 | 79 | // SWITCH STATEMENT 80 | printf("\nSWITCH:\n"); 81 | int variable = 2; 82 | 83 | switch (variable) 84 | { 85 | case 0: 86 | cout << "Falso" << endl; 87 | break; 88 | 89 | case 1: 90 | cout << "Verdadero" << endl; 91 | break; 92 | 93 | default: 94 | cout << "Inválido" << endl; 95 | break; 96 | } 97 | 98 | printf("\nSWITCH: sin break\n"); 99 | variable = 0; 100 | 101 | switch (variable) 102 | { 103 | case 0: 104 | cout << "Falso" << endl; 105 | 106 | case 1: 107 | cout << "Verdadero" << endl; 108 | 109 | default: 110 | cout << "Inválido" << endl; 111 | } 112 | 113 | return 0; 114 | } -------------------------------------------------------------------------------- /AY09/README.md: -------------------------------------------------------------------------------- 1 | ## Ayudantía 9: Instalación PyOpenCL 2 | 3 | La instalación de OpenCL (y de PyOpenCL) varía según los dispositivos que tengamos por lo que puede no ser muy directa (además de que siempre existe la posibilidad de que no quede bien configurado). Es por esto que, si no queremos pasar por el proceso de instalación, podemos usar, como alternativa, la plataforma de Google Colab con la tarjeta gráfica NVIDIA TESLA T4 que nos presta Google. 4 | 5 | En particular, lo primero que se debe hacer en Google Colab es ir a 'Entorno de ejecución' -> 'Cambiar tipo de entorno de ejecución' -> 'T4 GPU'. 
Segundo, Google sacó hace un tiempo los drivers de OpenCL del entorno base de Colab, por lo que hay que instalarlos con los siguientes comandos: 6 | 7 | !sudo apt -y update 8 | !sudo apt install -y nvidia-cuda-toolkit 9 | 10 | # Para poder también correr nuestros códigos en la CPU del entorno de GPU de Colab: 11 | !sudo apt install -y pocl-opencl-icd 12 | 13 | # Finalmente, solo nos falta instalar PyOpenCL en el entorno: 14 | !pip install pyopencl 15 | 16 | Todos estos pasos se explican con mayor detalle en el notebook ```0_installation.ipynb```. 17 | 18 | Subiendo los códigos de la ayudantía a Colab y haciendo los pasos anteriores, no debería haber problemas corriendo los códigos con PyOpenCL. 19 | 20 | Cabe destacar que el entorno de GPU de Google Colab no es ilimitado y, por lo general, la sesión se corta después de 6 horas aproximadamente (puede variar bastante). Es importante tener esto en consideración al realizar alguna tarea del curso (también nos podemos cambiar de cuenta de Google cuando nos pasa esto). 21 | 22 | Por último, si alguien está interesado en instalar OpenCL en su computador, podemos conversarlo y ver la instalación juntos, ya que hay varias guías en internet sobre las formas de instalarlo para distintos dispositivos. 23 | 24 | ### Correr en GPU y CPU 25 | 26 | Para correr en ambos dispositivos, es necesario instalar los drivers de más arriba y definir un contexto para cada uno (antes utilizábamos ```ctx = cl.create_some_context()``` por defecto). Ahora: 27 | 28 | platforms_cuda = cl.get_platforms()[0] # GPU 29 | platforms_pocl = cl.get_platforms()[1] # CPU 30 | 31 | devices_gpu = platforms_cuda.get_devices(device_type=cl.device_type.GPU) 32 | devices_cpu = platforms_pocl.get_devices(device_type=cl.device_type.CPU) 33 | 34 | # Contexto para GPU: 35 | ctx_gpu = cl.Context(devices=devices_gpu) 36 | # Contexto para CPU: 37 | ctx_cpu = cl.Context(devices=devices_cpu) 38 | 39 | # Esto está asumiendo que la plataforma 0 es la de GPU y la 1 de CPU.
40 | # Pueden comprobar esto imprimiendo las plataformas antes de indexarlas. 41 | 42 | Luego, el resto de códigos se mantienen igual. Basta con cambiar el contexto para trabajar en CPU/GPU. 43 | -------------------------------------------------------------------------------- /AY08/PyOpenCL/README.md: -------------------------------------------------------------------------------- 1 | ## Ayudantía 8: Instalación PyOpenCL 2 | 3 | La instalación de OpenCL (y de PyOpenCL) varía según los dispositivos que tengamos por lo que puede no ser muy directa (además de que siempre existe la posibilidad de que no quede bien configurado). Es por esto que, si no queremos pasar por el proceso de instalación, podemos usar, como alternativa, la plataforma de Google Colab con la tarjeta gráfica NVIDIA TESLA T4 que nos presta Google. 4 | 5 | En particular, lo primero que se debe hacer en Google Colab es ir a 'Entorno de ejecución' -> 'Cambiar tipo de entorno de ejecución' -> 'T4 GPU'. Segundo, Google sacó hace un tiempo los drivers de OpenCL del entorno base de Colab, por lo que hay que instalarlos con los siguientes comandos: 6 | 7 | !sudo apt -y update 8 | !sudo apt install -y nvidia-cuda-toolkit 9 | 10 | # Para poder también correr nuestros códigos en la CPU del entorno de GPU de Colab: 11 | !sudo apt install -y pocl-opencl-icd 12 | 13 | # Finalmente, solo nos falta instalar PyOpenCL en el entorno: 14 | !pip install pyopencl 15 | 16 | Todos estos pasos se explican con mayor detalle en el notebook ```0_installation.ipynb```. 17 | 18 | Subiendo los códigos de la ayudantía a Colab y haciendo los pasos anteriores, no debería haber problemas corriendo los códigos con PyOpenCL. 19 | 20 | Cabe destacar que el entorno de GPU de Google Colab no es ilimitado y, por lo general, la sesión se corta después de 6 horas aproximadamente (puede variar bastante).
Es importante tener esto en consideración al realizar alguna tarea del curso (también nos podemos cambiar de cuenta de Google cuando nos pasa esto). 21 | 22 | Por último, si alguien está interesado en instalar OpenCL en su computador, podemos conversarlo y ver la instalación juntos, ya que hay varias guías en internet sobre las formas de instalarlo para distintos dispositivos. 23 | 24 | ### Correr en GPU y CPU 25 | 26 | Para correr en ambos dispositivos, es necesario instalar los drivers de más arriba y definir un contexto para cada uno (antes utilizábamos ```ctx = cl.create_some_context()``` por defecto). Ahora: 27 | 28 | platforms_cuda = cl.get_platforms()[0] # GPU 29 | platforms_pocl = cl.get_platforms()[1] # CPU 30 | 31 | devices_gpu = platforms_cuda.get_devices(device_type=cl.device_type.GPU) 32 | devices_cpu = platforms_pocl.get_devices(device_type=cl.device_type.CPU) 33 | 34 | # Contexto para GPU: 35 | ctx_gpu = cl.Context(devices=devices_gpu) 36 | # Contexto para CPU: 37 | ctx_cpu = cl.Context(devices=devices_cpu) 38 | 39 | # Esto está asumiendo que la plataforma 0 es la de GPU y la 1 de CPU. 40 | # Pueden comprobar esto imprimiendo las plataformas antes de indexarlas. 41 | 42 | Luego, el resto de códigos se mantienen igual. Basta con cambiar el contexto para trabajar en CPU/GPU. 43 | -------------------------------------------------------------------------------- /AY04/README.md: -------------------------------------------------------------------------------- 1 | Nuestros códigos los correremos en el clúster de ingeniería. Para conectarse debemos usar SSH, es decir, escribir en consola: 2 | 3 | ```ssh username@cluster.ing.uc.cl``` 4 | 5 | o antes también se usaba: (todavía me sigue funcionando) 6 | 7 | ```ssh username@mazinger.ing.puc.cl``` 8 | 9 | Siendo ```username``` el nombre de usuario de su correo UC. Luego de escribir lo anterior, les preguntará una contraseña (se las enviaré a sus mails). 
10 | 11 | Una vez dentro, se pueden usar los comandos típicos que usamos en consola: ```ls```, ```cd```, ```rm```, ```mkdir```, ```htop``` (para ver los procesos corriendo en el servidor), entre otros. 12 | 13 | Si quieren cambiar su contraseña del servidor, pueden usar el comando ```passwd```. 14 | 15 | Subir y recibir archivos: 16 | 17 | - Opción 1: ```scp -r local_dir username@cluster.ing.uc.cl:server_dir``` 18 | 19 | La línea anterior se corre en consola y el -r sirve para subir una carpeta (si se quita, se sube el archivo indicado en el path). Por ejemplo: tengo mi carpeta ```AY04``` y quiero subirla a mi carpeta del servidor, entonces, ubicado en el path de mi carpeta en la consola, escribo el comando ```scp -r AY04/ alberto.almuna@cluster.ing.uc.cl:~```, les preguntará por su contraseña. Este comando también se puede utilizar para enviar archivos desde el clúster a nuestro computador y solo basta con intercambiar el destino y origen (Ej: ```scp alberto.almuna@cluster.ing.uc.cl:~/log.out ./AY04```) 20 | 21 | - Opción 2: crear un repositorio de Github y clonarlo en su computador y en el servidor, para luego transferir archivos con push y pull. 22 | 23 | Correr código: 24 | 25 | Para correr nuestros códigos podemos hacer lo mismo que en nuestro computador o pedirle al servidor que lo agregue a su cola de trabajo (cuando queramos correr algo grande). 26 | 27 | La forma de compilar y correr nuestro código es la misma que ya hemos hecho. 28 | 29 | Y para agregarlo a la cola de trabajo (con el job.sh creado): 30 | 31 | - ```sbatch job.sh``` -> agrega el trabajo job.sh a la cola del clúster (este archivo corresponde a un set de instrucciones a realizar) 32 | 33 | En nuestro caso, una vez que termine el job, creará un archivo ```log.out``` con el resultado obtenido. Si queremos leerlo rápidamente, podemos usar el comando ```cat log.out```. También podemos enviarnos el resultado usando el comando ```scp``` como arriba. 
34 | 35 | Para obtener información de la cola podemos usar ```squeue```, ```top``` o ```htop```. 36 | 37 | Para más información sobre el clúster de Ing: https://deg.ing.uc.cl/informatica/cluster/ 38 | 39 | En particular, en la página https://deg.ing.uc.cl/informatica/cluster/trabajos-al-cluster/ en la sección de "Solicitud de CPU" se puede observar otro ejemplo de un archivo como ```job.sh```. 40 | 41 | # Video Clúster 42 | 43 | Hace un par de años hice un video que revisa todo lo importante relacionado al clúster (quizás ya toca hacerle una actualización, pero se los dejo disponible por si les sirve): 44 | 45 | https://youtu.be/LqeU8yo_b-w 46 | 47 | En este video uso el ssh con "mazinger.ing.puc.cl", pero hace poco se cambió a "cluster.ing.uc.cl". A mi todavía me funciona el anterior, pero igual les recomiendo utilizar la dirección más nueva. 48 | 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IMT2112-2024-2 2 | Repositorio para las ayudantías de Algoritmos Paralelos en Computación Científica. 3 | 4 | Mail: alberto.almuna@uc.cl 5 | 6 | Si quieres dar algún comentario u opinión sobre la ayudantía: [Formulario de Feedback](https://forms.gle/7B53rLqTXwvjFFhW8) 7 | 8 | ## Correr código de C/C++ en Windows 9 | 10 | Recomiendo instalar WSL (Windows Subsystem for Linux) y pueden seguir la siguiente [guía](https://docs.microsoft.com/en-us/windows/wsl/install) 11 | 12 | En teoría debería funcionar la instalación directa con 13 | ``` 14 | wsl --install 15 | ``` 16 | en la PowerShell con permisos de administrador, sin embargo, si no les funciona, recomiendo seguir la instalación manual que se menciona en el link anterior. 
17 | 18 | Una vez leída la guía anterior e instalado WSL2 junto a su distribución de Linux preferida (sugiero instalar Ubuntu que es la que viene por defecto en el comando anterior), recomiendo instalar la consola Windows Terminal (desde la Microsoft Store) ya que permite manejar distintos tipos de consolas dentro de la misma y, en general, es bastante cómoda. 19 | 20 | Una vez dentro de WSL, corran el siguiente comando: 21 | ``` 22 | cd ~ 23 | ``` 24 | ![image](https://user-images.githubusercontent.com/53873288/186964595-824dab4e-45f2-47a7-8d50-4fe40eb80e7a.png) 25 | 26 | para ir al directorio base. Si no hacen lo anterior, correrán los códigos desde Windows y no desde la distribución de Linux, lo que puede afectar el rendimiento de la ejecución. Entonces en este punto recomiendo crear las carpetas que utilizarán para el curso. 27 | 28 | A continuación, pueden instalar git y el compilador que utilizaremos con los siguientes comandos: 29 | ``` 30 | sudo apt-get update 31 | sudo apt-get install git 32 | sudo apt-get install g++ 33 | ``` 34 | 35 | Aunque Valgrind debería venir instalado, pueden intentar instalarlo con el siguiente comando: 36 | ``` 37 | sudo apt-get install valgrind 38 | ``` 39 | es una herramienta que nos ayudará a debuggear nuestro código ya que C/C++ no va a ser muy explícito en decirnos qué está fallando. 40 | 41 | Si utilizan VS Code, deben instalar la extensión 42 | ![image](https://user-images.githubusercontent.com/53873288/186965303-e73d7741-0dc4-48b5-89d3-ec318447505a.png) 43 | 44 | de esta manera, pueden correr el comando 45 | ``` 46 | code . 47 | ``` 48 | en la consola de WSL y les abrirá VS Code dentro del entorno de la distribución de Linux. 49 | 50 | Además, recomiendo instalar la extensión de C/C++ de VS Code, ya que contiene algunos atajos que nos ayudarán a programar más rápido!
51 | 52 | ## Correr código de C/C++ en Mac 53 | 54 | El año pasado me compartieron el siguiente [video](https://youtu.be/lGsyqgpMAYY?si=Nllk5YKNcEBY_p1E) para la instalación en Mac (M1/M2). No lo puedo probar, pero el año pasado no produjo problemas para la realización del curso. Por favor avísenme si es que encuentran que algún otro tutorial les funciona mejor o si este presenta problemas. 55 | 56 | ## Extras 57 | 58 | Por último, algunos comandos útiles para navegar dentro de la consola: 59 | - ```cd nombre_carpeta```: es para moverse por los directorios 60 | - ```cd ..```: es para devolverse una carpeta 61 | - ```mkdir nombre_carpeta```: es para crear una nueva carpeta 62 | - ```ls```: es para ver los archivos que se encuentran en el directorio actual 63 | 64 | ## Si quieren aprender más o profundizar sobre lo visto en la ayudantía les recomiendo el siguiente [enlace](https://github.com/DCCentral-de-Apuntes/intro-C). Es un taller muy completo para aprender C y que nos sirve para entender C++.
Además, cualquier problema que tengan con la instalación, no duden en escribirme para ver si lo podemos solucionar 🐧 65 | -------------------------------------------------------------------------------- /AY05/3MPI_dot.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: mpic++ 3MPI_dot.cpp -std=c++11 4 | - Correr (con 2 procesos): mpirun -np 2 ./a.out 5 | Se puede cambiar el número de procesos con el que se corre 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | using namespace std; 14 | 15 | int main() { 16 | 17 | // Iniciamos el entorno de MPI y obtenemos los valores usuales 18 | MPI_Init(NULL,NULL); 19 | int world_size, world_rank; 20 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 21 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 22 | 23 | // Que solo el proceso raíz imprima la cantidad de procesos totales: 24 | if (world_rank == 0) { 25 | printf("Cantidad de procesos: %i\n\n", world_size); 26 | } 27 | 28 | // Inicializamos algunas variables que utilizaremos 29 | int firstIndex, localSize, n, err; 30 | 31 | // Vamos a trabajar con vectores de largo n 32 | n = 7; 33 | 34 | // Calculamos el primer índice y el tamaño local de cada proceso 35 | localSize = n / world_size; 36 | firstIndex = world_rank*localSize; 37 | 38 | // Recordar que hay que agregar lo que sobra (por si la división no es exacta) 39 | if (world_rank == world_size-1) { 40 | localSize += n % world_size; 41 | } 42 | printf("Rank %i, local size: %i, first index %i \n", world_rank, localSize, firstIndex); 43 | 44 | // Cada proceso crea la porción del vector que usará: 45 | int localVec1[localSize]; 46 | int localVec2[localSize]; 47 | for (int i=0; i 8 | #include 9 | #include 10 | 11 | int main() { 12 | 13 | // las listas no vienen nativas, existe el concepto de array 14 | 15 | // Primera manera de hacerlo (stack): dame un array de tamaño n de ints 16 | int n = 10; 17 | int array1[n]; 
18 | for (int i=0; i<10; i++) { 19 | printf("%d ", array1[i]); // no inicializa valores (toma lo que hay directamente) 20 | } 21 | printf("\n"); 22 | 23 | // Segunda manera (heap): puntero de int. calloc es resérvame 10 casilleros 24 | // de memoria del tamaño de un int y dales valor 0. El (int*) es un casting de int a puntero de int. 25 | int* array2 = (int*) calloc(10, sizeof(int)); // inicializa valores a 0 26 | for (int i=0; i<10; i++) { 27 | printf("%d ", array2[i]); 28 | } 29 | printf("\n"); 30 | 31 | // Tercera forma (heap): en malloc solo le damos el tamaño total de lo que quiero reservar. 32 | // También es necesario hacer el casting a puntero de int! 33 | int* array3 = (int*) malloc(10 * sizeof(int)); // no inicializa valores (toma lo que hay directamente) 34 | for (int i=0; i<10; i++) { 35 | printf("%d ", array3[i]); 36 | } 37 | printf("\n"); 38 | 39 | 40 | // Generación de números aleatorios 41 | // srand((int) time(0)); // seteamos una semilla con el tiempo actual 42 | // para obtener resultados distintos en cada ejecución 43 | //srand(1); 44 | 45 | for (int i=0;i<10;++i) { 46 | // array2[i] = rand(); // genera números random 47 | // array2[i] = rand()%10; // para generar números en algún rango (0 y 9) 48 | // array2[i] = rand()%201 - 100; // entre -100 y 100 49 | // printf("%i ", array2[i]); 50 | } 51 | printf("\n"); 52 | 53 | 54 | 55 | // Algunos datos extras: 56 | // ====================================== 57 | // // Veamos los punteros de cada array: 58 | // printf("\nPunteros al array1:\n"); 59 | // printf("array1\t\t %p\n", array1); 60 | // printf("&array1\t\t %p\n", &array1); 61 | // printf("&array1[0]\t %p\n\n", &array1[0]); 62 | 63 | // printf("Punteros al array2:\n"); 64 | // printf("array2\t\t %p\n", array2); 65 | // printf("&array2\t\t %p\n", &array2); 66 | // printf("&array2[0]\t %p\n\n", &array2[0]); 67 | 68 | // printf("Punteros al array3:\n"); 69 | // printf("array3\t\t %p\n", array3); 70 | // printf("&array3\t\t %p\n", &array3); 71 | // 
printf("&array3[0]\t %p\n\n", &array3[0]); 72 | 73 | // // Acceso al elemento de un array: 74 | // array2[2] = 10; 75 | // printf("Le damos un valor a array[2] y accedemos a él:\n"); 76 | // printf("array[2] = %d\n", array2[2]); 77 | // printf("2[array] = %d\n\n", 2[array2]); 78 | // ====================================== 79 | 80 | // Es necesario liberar las memorias que fueron reservadas con calloc o malloc (heap) 81 | free(array2); 82 | free(array3); 83 | // No es necesario al array1 porque este no reserva memoria en el heap 84 | 85 | // La diferencia entre el primer arreglo y los dos siguientes es que: 86 | // - El primer arreglo vive en el Stack, que es básicamente la memoria asignada por mi 87 | // sistema operativo para mi programa. Es muy útil, aunque tiene algunas limitaciones. 88 | // - Los otros dos viven en el Heap, el que es memoria extra pedida por mi programa. 89 | // Este último espacio de memoria tiene la ventaja de que siempre que se le pida, va a crecer, y, si se 90 | // libera algún bloque ocupado, entonces se reduce. Luego podemos tener control de cuánta memoria 91 | // le pedimos. 
92 | 93 | return 0; 94 | } -------------------------------------------------------------------------------- /AY05/4MPI_mat_vec.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Correr código (en consola): 3 | - Compilar: mpic++ 4MPI_mat_vec.cpp -std=c++11 4 | - Correr (con 2 procesos): mpirun -np 2 ./a.out 5 | Se puede cambiar el número de procesos con el que se corre 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | using namespace std; 14 | 15 | 16 | int** matrix_generator(int filas, int columnas, int world_rank) { 17 | int** matrix = (int**) calloc(filas, sizeof(int*)); 18 | 19 | for (int i = 0; i < filas; i++) { 20 | matrix[i] = (int*) calloc(columnas, sizeof(int)); 21 | } 22 | 23 | for (int i = 0; i < filas; i++) { 24 | for (int j = 0; j < columnas; j++) { 25 | matrix[i][j] = i+j+world_rank; 26 | } 27 | } 28 | return matrix; 29 | } 30 | 31 | void free_matrix(int** matrix, int filas) { 32 | for (int i = 0; i < filas; i++) { 33 | free(matrix[i]); 34 | } 35 | free(matrix); 36 | } 37 | 38 | void print_matrix(int** matrix, int filas, int columnas) { 39 | printf("\n"); 40 | for (int i = 0; i < filas; i++) { 41 | for (int j = 0; j < columnas; j++) { 42 | printf("%i ", matrix[i][j]); 43 | } 44 | printf("\n"); 45 | } 46 | } 47 | 48 | void print_vector(int* vector, int n) { 49 | printf("\n"); 50 | for (int i = 0; i < n; i++) { 51 | printf("%i ", vector[i]); 52 | } 53 | printf("\n"); 54 | } 55 | 56 | 57 | int main() { 58 | MPI_Init(NULL,NULL); 59 | int world_size, world_rank; 60 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 61 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 62 | 63 | if (world_rank == 0) { 64 | printf("Cantidad de procesos: %i\n\n", world_size); 65 | } 66 | 67 | int firstIndex, localColumnas, n, err; 68 | 69 | n = 8; 70 | 71 | localColumnas = n / world_size; 72 | firstIndex = world_rank*localColumnas; 73 | 74 | if (world_rank == world_size-1) { 75 | localColumnas += n % 
world_size; 76 | } 77 | 78 | printf("Rank %i, local columnas: %i, first index %i \n", world_rank, localColumnas, firstIndex); 79 | 80 | int localVec[localColumnas]; 81 | 82 | for (int i=0; i 9 | using namespace std; 10 | #include 11 | 12 | int main() { 13 | 14 | // Iniciamos el entorno de MPI 15 | MPI_Init(NULL,NULL); 16 | 17 | // Obtenemos el número de procesos y el rango de cada uno: 18 | int world_size, world_rank; 19 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 20 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 21 | // Estos datos son importantes porque los utilizaremos para dividir bien el trabajo y manejar las comunicaciones. 22 | 23 | // Vamos a hacer una suma de 10 (sumlength) números 24 | int firstIndex, localSize; 25 | int sumlength = 10; 26 | 27 | // Obtenemos cuántos números sumará cada proceso y desde qué posición parte: 28 | localSize = sumlength / world_size; 29 | firstIndex = world_rank * (sumlength/world_size); 30 | 31 | // Notar que la división puede no ser exacta, por lo que hay que sumarle el resto al último proceso 32 | // (Puede ser otro proceso el encargado del resto, pero elegimos al último por comodidad). 
33 | if (world_rank == world_size-1){ 34 | localSize += sumlength % world_size; 35 | } 36 | 37 | // Imprimimos la información que tenemos hasta ahora: 38 | cout << "Rank: " << world_rank << ", first index: " << firstIndex << ", local size: " << localSize << endl; 39 | 40 | // Cada proceso crea su vector local que tendrá los valores a sumar: 41 | int localVector[localSize]; 42 | for (int n=0; n 9 | #include 10 | #include 11 | #include 12 | #include 13 | using namespace std; 14 | 15 | 16 | void print_vector(int* vector, int n, int rank, const char* text) { 17 | printf("\nRank %i, %s:\n", rank, text); 18 | for (int i = 0; i < n; i++) { 19 | printf("%i ", vector[i]); 20 | } 21 | printf("\n"); 22 | } 23 | 24 | /* 25 | A veces podemos querer utilizar alguna operación colectiva para facilitar el envío y recepción de datos, sin embargo, 26 | puede darse el caso que todos los procesos envían una cantidad distinta de elementos, es decir, no todos los trabajadores 27 | tienen los mismos tamaños locales. Para esto, existen operaciones colectivas variables que permiten enviar/recibir 28 | cantidades distintas de datos por cada proceso. Y, para utilizarlas, basta con otorgar un array de tamaños y otro de 29 | offsets (posiciones), los que representan la información de cada trabajador. 
30 | Página con todas las operaciones colectivas de MPI: https://learn.microsoft.com/en-us/message-passing-interface/mpi-collective-functions 31 | A continuación, presentamos un ejemplo de Allgatherv: 32 | */ 33 | 34 | int main() { 35 | MPI_Init(NULL,NULL); 36 | int world_size, world_rank; 37 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 38 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 39 | 40 | if (world_rank == 0) { 41 | printf("Cantidad de procesos: %i\n\n", world_size); 42 | } 43 | 44 | int first_index, local_size, err; 45 | 46 | // Vamos a hacer un ejemplo donde cada proceso tiene un vector de tamaño igual a su rango +1 47 | // Definamos las características de cada uno: 48 | local_size = world_rank + 1; 49 | first_index = 0; 50 | for (int i = 0; i < world_rank; i++) 51 | { 52 | first_index += i + 1; 53 | } 54 | printf("\nRank %i, local size: %i, first index %i \n", world_rank, local_size, first_index); 55 | 56 | // Ahora, creamos el vector local de cada proceso: 57 | int *local_vec = new int[local_size]; 58 | 59 | // Rellenamos el vector: 60 | for (int i = 0; i < local_size; i++) 61 | { 62 | local_vec[i] = i + first_index; 63 | } 64 | const char* text = "local vector"; 65 | print_vector(local_vec, local_size, world_rank, text); 66 | 67 | // Hasta el momento, cada proceso tiene su propio vector. 68 | // Y queremos reunir todas las partes en todos los procesos. 69 | // Utilicemos operaciones colectivas para realizar esto de manera directa! 
70 | 71 | // Calculamos el tamaño total del vector completo: 72 | int total_size = 0; 73 | for (int i = 1; i < world_size + 1; i++) 74 | { 75 | total_size += i; 76 | } 77 | 78 | // Necesitamos crear arrays que guarden la cantidad de elementos a recibir de cada proceso y los offsets correspondientes 79 | int recvcounts[world_size]; 80 | int offsets[world_size]; 81 | for (int i = 0; i < world_size; i++) 82 | { 83 | recvcounts[i] = i+1; 84 | offsets[i] = i; 85 | if (i > 0) 86 | { 87 | offsets[i] += offsets[i-1]; 88 | } 89 | } 90 | if (world_rank==0) { 91 | const char* text2 = "recvcounts vector"; 92 | print_vector(recvcounts, world_size, world_rank, text2); 93 | const char* text3 = "offsets vector"; 94 | print_vector(offsets, world_size, world_rank, text3); 95 | } 96 | 97 | 98 | // Creamos el vector completo 99 | int *full_vec = new int[total_size]; 100 | 101 | // Y utilizamos la operación colectiva AllGather pero variable: 102 | // (ya que no todos los procesos envían la misma cantida de elementos) 103 | err = MPI_Allgatherv(local_vec, local_size, MPI_INT, full_vec, recvcounts, offsets, MPI_INT, MPI_COMM_WORLD); 104 | // int MPI_Allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, 105 | // void *recvbuf, const int *recvcounts, const int *displs, 106 | // MPI_Datatype recvtype, MPI_Comm comm) 107 | 108 | const char* text4 = "full vector"; 109 | print_vector(full_vec, total_size, world_rank, text4); 110 | 111 | // No olvidemos de liberar la memoria reservada en el heap 112 | delete[] local_vec; 113 | delete[] full_vec; 114 | 115 | 116 | MPI_Finalize(); 117 | } -------------------------------------------------------------------------------- /AY05/README.md: -------------------------------------------------------------------------------- 1 | ## Instalación MPI en WSL 2 | 3 | Para instalar MPI en WSL, pueden usar los siguientes comandos en consola: 4 | 5 | sudo apt-get update 6 | sudo apt install openmpi-bin libopenmpi-dev 7 | 8 | # Clúster 9 | 10 | 
Nuestros códigos los correremos en el clúster de ingeniería. Para conectarse debemos usar SSH, es decir, escribir en consola: 11 | 12 | ```ssh username@cluster.ing.uc.cl``` 13 | 14 | o antes también se usaba: (todavía me sigue funcionando) 15 | 16 | ```ssh username@mazinger.ing.puc.cl``` 17 | 18 | Siendo ```username``` el nombre de usuario de su correo UC. Luego de escribir lo anterior, les preguntará una contraseña (se las enviaré a sus mails). 19 | 20 | Una vez dentro, se pueden usar los comandos típicos que usamos en consola: ```ls```, ```cd```, ```rm```, ```mkdir```, ```htop``` (para ver los procesos corriendo en el servidor), entre otros. 21 | 22 | Si quieren cambiar su contraseña del servidor, pueden usar el comando ```passwd```. 23 | 24 | ## Subir y recibir archivos 25 | 26 | - Opción 1: ```scp -r local_dir username@cluster.ing.uc.cl:server_dir``` 27 | 28 | La línea anterior se corre en consola y el -r sirve para subir una carpeta (si se quita, se sube el archivo indicado en el path). Por ejemplo: tengo mi carpeta ```AY05``` y quiero subirla a mi carpeta del servidor, entonces, ubicado en el path de mi carpeta en la consola, escribo el comando ```scp -r AY05/ alberto.almuna@cluster.ing.uc.cl:~```, les preguntará por su contraseña. Este comando también se puede utilizar para enviar archivos desde el clúster a nuestro computador y solo basta con intercambiar el destino y origen (Ej: ```scp alberto.almuna@cluster.ing.uc.cl:~/log.out ./AY05```) 29 | 30 | - Opción 2: crear un repositorio de Github y clonarlo en su computador y en el servidor, para luego transferir archivos con push y pull. 31 | 32 | ## Correr código 33 | 34 | Para correr nuestros códigos podemos hacer lo mismo que en nuestro computador o pedirle al servidor que lo agregue a su cola de trabajo (cuando queramos correr algo grande). 
35 | 36 | La forma de compilar y correr nuestro código con MPI es la siguiente: 37 | 38 | - Antes de poder compilar con ```mpic++``` en el clúster, es necesario correr el comando ```module load mpi/openmpi-x86_64``` en la consola del servidor (cada vez que nos conectemos es necesario volver a correr el comando anterior). Esto carga el módulo asociado a MPI, lo que significa que ahora podremos compilar y correr nuestros scripts de la misma manera que en nuestro computador personal. En caso de no correr la línea anterior luego de hacer login e intentar compilar, nos aparecerá un error de que no se encuentra el comando ```mpic++```. 39 | - ```mpic++ code.cpp -std=c++11``` -> compila el código y lo guarda en a.out 40 | - ```mpirun ./a.out``` -> corre el ejecutable a.out 41 | - ```mpirun -np 2 ./a.out``` -> corre el ejecutable a.out con 2 procesos 42 | 43 | Y para agregarlo a la cola de trabajo (con el job.sh creado): 44 | 45 | - ```sbatch job.sh``` -> agrega el trabajo job.sh a la cola de trabajo. Ojo que también es necesario correr el comando ```module load mpi/openmpi-x86_64``` antes de mandar el trabajo a la cola. 46 | 47 | En nuestro caso, una vez que termine el job, creará un archivo ```log.out``` con el resultado obtenido. Si queremos leerlo rápidamente, podemos usar el comando ```cat log.out```. 48 | 49 | Para obtener información de la cola podemos usar ```squeue```, ```top``` o ```htop```. 50 | 51 | Para más datos sobre el clúster de Ing: https://deg.ing.uc.cl/informatica/cluster/ 52 | 53 | ## Correr Python en el clúster 54 | 55 | En la tarea es necesario también correr un código de Python para generar la matriz pedida y, si queremos hacer un ejemplo bien grande, es buena idea correr este script también en el clúster. 
En dicho caso, puede ocurrir que no estén todas las librerías de Python necesarias directamente instaladas en el servidor, por lo que, para instalarlas, recomiendo seguir los pasos descritos en la documentación del clúster de ingeniería (https://dt.ing.uc.cl/recursos/cluster/ en la pestaña de "Software" y en la sección de Python). Acá se menciona que cada usuario debe usar Anaconda para administrar los paquetes extras y se indican las instrucciones para la instalación. 56 | 57 | ## Video Clúster 58 | 59 | Hace un par de años hice un video que revisa todo lo importante relacionado al clúster (quizás ya toca hacerle una actualización, pero se los dejo disponible por si les sirve): 60 | 61 | https://youtu.be/LqeU8yo_b-w 62 | 63 | ### IMPORTANTE: Cuando hice este video no era necesario correr el comando de ```module load mpi/openmpi-x86_64``` antes de querer compilar códigos de MPI, sin embargo, si ahora intentan ejecutar ```mpic++``` sin correr la línea anterior, el servidor les tirará un error porque no será capaz de encontrar el comando. 64 | 65 | Otro detalle es que uso el ssh con "mazinger.ing.puc.cl", pero hace poco se cambió a "cluster.ing.uc.cl". A mi todavía me funciona el anterior, pero igual les recomiendo utilizar la dirección más nueva. 66 | 67 | -------------------------------------------------------------------------------- /AY08/Numba/6_Numba_caching.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "770a296f", 6 | "metadata": {}, 7 | "source": [ 8 | "# Caching with Numba\n", 9 | "\n", 10 | " - Elwin van 't Wout\n", 11 | " - Pontificia Universidad Católica de Chile\n", 12 | " - IMT3870\n", 13 | " - 28-8-2023\n", 14 | "\n", 15 | "Cache a function optimised by Numba." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "id": "217c54e1", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import time\n", 26 | "import numpy as np\n", 27 | "from numba import jit" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "id": "6192058c", 33 | "metadata": {}, 34 | "source": [ 35 | "Numba uses JIT to compile functions when they are called for the first time. The compiled function will be immediately used for subsequent calls. This means that after restarting the kernel or shutting down the Jupyter notebook, the compiled function will be *lost*. Numba can store the compiled function on disk, called *caching* (not to be confused with the cache memory)." 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "id": "2f57b2ac", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "@jit(nopython=True, cache=False)\n", 46 | "def sum_vector_no_cache(a):\n", 47 | " s = np.sum(a)\n", 48 | " return s\n", 49 | "\n", 50 | "@jit(nopython=True, cache=True)\n", 51 | "def sum_vector_cached(a):\n", 52 | " s = np.sum(a)\n", 53 | " return s" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "id": "eb370ff0", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "n = int(1e7)\n", 64 | "vec = np.arange(n)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "c63e69dc", 70 | "metadata": {}, 71 | "source": [ 72 | "The first call is always slow because the code needs to be compiled." 
73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "id": "d171e3dc", 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "The 1st call to the Numba function took: 0.5009028911590576 seconds.\n", 86 | "The 2nd call to the Numba function took: 0.012819528579711914 seconds.\n", 87 | "The 3rd call to the Numba function took: 0.001155853271484375 seconds.\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "time_1 = time.time()\n", 93 | "sum_vector_no_cache(vec)\n", 94 | "time_2 = time.time()\n", 95 | "sum_vector_no_cache(vec)\n", 96 | "time_3 = time.time()\n", 97 | "sum_vector_no_cache(vec)\n", 98 | "time_4 = time.time()\n", 99 | "print(\"The 1st call to the Numba function took:\",time_2-time_1,\"seconds.\")\n", 100 | "print(\"The 2nd call to the Numba function took:\",time_3-time_2,\"seconds.\")\n", 101 | "print(\"The 3rd call to the Numba function took:\",time_4-time_3,\"seconds.\")" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 5, 107 | "id": "2b2876b2", 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "name": "stdout", 112 | "output_type": "stream", 113 | "text": [ 114 | "The 1st call to the Numba function took: 0.03046727180480957 seconds.\n", 115 | "The 2nd call to the Numba function took: 0.0 seconds.\n", 116 | "The 3rd call to the Numba function took: 0.0 seconds.\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "time_1 = time.time()\n", 122 | "sum_vector_cached(vec)\n", 123 | "time_2 = time.time()\n", 124 | "sum_vector_cached(vec)\n", 125 | "time_3 = time.time()\n", 126 | "sum_vector_cached(vec)\n", 127 | "time_4 = time.time()\n", 128 | "print(\"The 1st call to the Numba function took:\",time_2-time_1,\"seconds.\")\n", 129 | "print(\"The 2nd call to the Numba function took:\",time_3-time_2,\"seconds.\")\n", 130 | "print(\"The 3rd call to the Numba function took:\",time_4-time_3,\"seconds.\")" 131 | ] 132 | }, 133 | { 134 | 
"cell_type": "markdown", 135 | "id": "2f755ede", 136 | "metadata": {}, 137 | "source": [ 138 | "The very first time this Notebook is used, Numba needs to perform the optimisation and compilation, which takes time. Next time, the first call will be fast as well. It is slightly slower than the second and third call since the compiled code needs to be read from disk, but much quicker than without caching." 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "id": "5b86341a", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [] 148 | } 149 | ], 150 | "metadata": { 151 | "kernelspec": { 152 | "display_name": "Python 3", 153 | "language": "python", 154 | "name": "python3" 155 | }, 156 | "language_info": { 157 | "codemirror_mode": { 158 | "name": "ipython", 159 | "version": 3 160 | }, 161 | "file_extension": ".py", 162 | "mimetype": "text/x-python", 163 | "name": "python", 164 | "nbconvert_exporter": "python", 165 | "pygments_lexer": "ipython3", 166 | "version": "3.11.2" 167 | } 168 | }, 169 | "nbformat": 4, 170 | "nbformat_minor": 5 171 | } 172 | -------------------------------------------------------------------------------- /AY08/Numba/4_Numba_race_condition.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "68659313", 6 | "metadata": {}, 7 | "source": [ 8 | "# Race conditions in parallel Numba\n", 9 | "\n", 10 | " - Elwin van 't Wout\n", 11 | " - Pontificia Universidad Católica de Chile\n", 12 | " - IMT3870\n", 13 | " - 26-8-2024\n", 14 | "\n", 15 | "This tutorial shows a race condition in a parallel for-loop that leads to code that is not thread safe." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "id": "8cfe49b2", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from numba import njit, prange\n", 26 | "import numpy as np" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "9a315fde", 32 | "metadata": {}, 33 | "source": [ 34 | "The following code sums the elements of a vector." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "id": "ecdd83c9", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "@njit(parallel=True)\n", 45 | "def sum_vector(x):\n", 46 | " length = x.shape[0]\n", 47 | " s = 0\n", 48 | " for i in prange(length):\n", 49 | " s += x[i]\n", 50 | " return s " 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "id": "04b98951", 56 | "metadata": {}, 57 | "source": [ 58 | "The expected sum of a vector with elements $0,1,2,\\dots,n-1$ is $(n-1)n/2$." 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "id": "fc4d5095", 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "The vector with 100000 elements sums to 4999950000\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "n = int(1e5)\n", 77 | "vec = np.arange(n)\n", 78 | "sum_exact = int((n-1)*n/2)\n", 79 | "print(\"The vector with\",n,\"elements sums to\",sum_exact)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "id": "38a3e9ad", 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "The sum calculated by Numba is: 4999950000\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "sum_numba = sum_vector(vec)\n", 98 | "print(\"The sum calculated by Numba is:\",sum_numba)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "id": "33307306", 104 | "metadata": {}, 105 | "source": [ 106 | "Now, let us create an array with four elements, and calculate the sum of the 
input vector for each element of the output array. With Python broadcasting, you can add each element of the input array to the entire output array." 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 5, 112 | "id": "85447bdd", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "@njit(parallel=True)\n", 117 | "def sum_vector_in_array_race_condition(x):\n", 118 | " length = x.shape[0]\n", 119 | " s = np.zeros(4, dtype=np.int_)\n", 120 | " for i in prange(length):\n", 121 | " s[:] += x[i]\n", 122 | " return s" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 6, 128 | "id": "974abac2", 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "The sum calculated by Numba is: [1927748978 2055798753 1979814415 2010307473]\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "sum_race = sum_vector_in_array_race_condition(vec)\n", 141 | "print(\"The sum calculated by Numba is:\",sum_race)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "id": "797b1e6f", 147 | "metadata": {}, 148 | "source": [ 149 | "The four elements of the output array should all be the sum of the input vector, but this is not the case. The code is not thread safe! Each element is different because there is a race condition." 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "id": "ab24b8f0", 155 | "metadata": {}, 156 | "source": [ 157 | "The following adaptation solves the race condition because it changes the data types such that Numba understands the race condition and parallelises the code correctly. Specifically, you need to create a slice reference outside the loop." 
158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 7, 163 | "id": "9eb742e3", 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "@njit(parallel=True)\n", 168 | "def sum_vector_in_array_safe(x):\n", 169 | " length = x.shape[0]\n", 170 | " y = np.zeros(4, dtype=np.int_)\n", 171 | " s = y[:]\n", 172 | " for i in prange(length):\n", 173 | " s += x[i]\n", 174 | " return s" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 8, 180 | "id": "d86836cc", 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "The sum calculated by Numba is: [704982704 704982704 704982704 704982704]\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "sum_safe = sum_vector_in_array_safe(vec)\n", 193 | "print(\"The sum calculated by Numba is:\",sum_safe)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "id": "df608a2a", 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 3", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.11.2" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 5 226 | } 227 | -------------------------------------------------------------------------------- /AY06/3_OMP_matvec.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Comandos útiles: 3 | Compilar código: g++ -o name_output 3_OMP_matvec.cpp -fopenmp 4 | Correr código: ./name_output 5 | */ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | // Función que genera un vector de 
largo n con valores entre -rango y +rango 13 | float* vector_generator(int n, int range) 14 | { 15 | float* vector = (float*) calloc(n, sizeof(float)); 16 | 17 | for (int i=0; i( end - start ); 175 | 176 | // Vemos cuánto se tardó en calcular el resultado 177 | printf("Tiempo MATVEC normal: %.9f seconds\n", execution.count() * 1e-9); 178 | 179 | // Repitamos el cálculo anterior pero ahora con OpenMP 180 | start = std::chrono::high_resolution_clock::now(); 181 | float* result2 = mat_vec_par(mat, vec, n); 182 | end = std::chrono::high_resolution_clock::now(); 183 | execution = std::chrono::duration_cast( end - start ); 184 | 185 | // Vemos cuánto se tardó en calcular el resultado 186 | printf("Tiempo MATVEC paralelo: %.9f seconds\n", execution.count() * 1e-9); 187 | 188 | // No olvidemos liberar la memoria :D 189 | free(vec); 190 | free_matrix(mat, n); 191 | free(result); 192 | free(result2); 193 | 194 | break; 195 | } 196 | 197 | default: 198 | printf("default"); 199 | break; 200 | } 201 | 202 | return 0; 203 | } -------------------------------------------------------------------------------- /AY08/Numba/1_PythonDecorators.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "711ca4c9", 6 | "metadata": {}, 7 | "source": [ 8 | "# Python decorators\n", 9 | "\n", 10 | " - Elwin van 't Wout\n", 11 | " - Pontificia Universidad Católica de Chile\n", 12 | " - IMT3870\n", 13 | " - 26-8-2024\n", 14 | " \n", 15 | "This tutorial shows the functionality of Python *decorators*. A *decorator* is a programming construction that adapts functions." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "98df67de", 21 | "metadata": {}, 22 | "source": [ 23 | "A Python *function* can take Python objects as input and output. An often used construction is taking a number, or array of numbers, as input of a function, and another number, or array of numbers, as output. 
Following is an example." 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "id": "1f86e3de", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "def my_square(x):\n", 34 | " return x**2" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "id": "299ead5b", 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "4" 47 | ] 48 | }, 49 | "execution_count": 2, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "my_square(2)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "id": "9d5ec8a5", 61 | "metadata": {}, 62 | "source": [ 63 | "Python functions are objects themselves and can, therefore, be used as input and output of another Python function. The following example takes an arbitrary function, performs additional timing statistics, and returns this new function." 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "id": "44ad541a", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "import time\n", 74 | "\n", 75 | "def timer(fun):\n", 76 | " def function_execution(*args):\n", 77 | " print(\"Start execution of function\", fun.__name__, \"at\", time.asctime())\n", 78 | " start = time.perf_counter()\n", 79 | " output_value = fun(*args)\n", 80 | " finish = time.perf_counter()\n", 81 | " print(\"Finished execution in\", finish - start, \"seconds\")\n", 82 | " return output_value\n", 83 | " return function_execution" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 4, 89 | "id": "f23edf05", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "my_timed_square = timer(my_square)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 5, 99 | "id": "1af60fff", 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "Start execution of function my_square at Fri Oct 
18 10:10:52 2024\n", 107 | "Finished execution in 1.500000053056283e-06 seconds\n" 108 | ] 109 | }, 110 | { 111 | "data": { 112 | "text/plain": [ 113 | "4" 114 | ] 115 | }, 116 | "execution_count": 5, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "my_timed_square(2)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "8be2eda8", 128 | "metadata": {}, 129 | "source": [ 130 | "The idea of decorators is to simplify this process. Above, we needed to create a separate function `my_timed_square` to use the timer for the square operation. However, we might want to use the timing capabilities for other functions as well, like for calculating the cube of a number. The timing functionality can be reused for any function with a *decorator*." 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 6, 136 | "id": "7679f533", 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "@timer\n", 141 | "def my_cube(x):\n", 142 | " return x**3" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 7, 148 | "id": "aa7bc033", 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "Start execution of function my_cube at Fri Oct 18 10:11:06 2024\n", 156 | "Finished execution in 1.6999999843392288e-06 seconds\n" 157 | ] 158 | }, 159 | { 160 | "data": { 161 | "text/plain": [ 162 | "8" 163 | ] 164 | }, 165 | "execution_count": 7, 166 | "metadata": {}, 167 | "output_type": "execute_result" 168 | } 169 | ], 170 | "source": [ 171 | "my_cube(2)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "id": "12e956ba", 177 | "metadata": {}, 178 | "source": [ 179 | "Notice that we can call the cube function immediately, without creating an additional function." 
180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "id": "a888af5f", 185 | "metadata": {}, 186 | "source": [ 187 | "Notice that the decorator only takes the function on the next line, not all functions in a cell." 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 8, 193 | "id": "2bd25948", 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "@timer\n", 198 | "def my_fourth_power(x):\n", 199 | " return x**4\n", 200 | "\n", 201 | "def my_fifth_power(x):\n", 202 | " return x**5" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 9, 208 | "id": "49590e09", 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "Start execution of function my_fourth_power at Fri Oct 18 10:11:19 2024\n", 216 | "Finished execution in 1.2999998943996616e-06 seconds\n" 217 | ] 218 | }, 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "16" 223 | ] 224 | }, 225 | "execution_count": 9, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "my_fourth_power(2)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 10, 237 | "id": "69a98e85", 238 | "metadata": {}, 239 | "outputs": [ 240 | { 241 | "data": { 242 | "text/plain": [ 243 | "32" 244 | ] 245 | }, 246 | "execution_count": 10, 247 | "metadata": {}, 248 | "output_type": "execute_result" 249 | } 250 | ], 251 | "source": [ 252 | "my_fifth_power(2)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "id": "e0d816a9", 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [] 262 | } 263 | ], 264 | "metadata": { 265 | "kernelspec": { 266 | "display_name": "Python 3", 267 | "language": "python", 268 | "name": "python3" 269 | }, 270 | "language_info": { 271 | "codemirror_mode": { 272 | "name": "ipython", 273 | "version": 3 274 | }, 275 | "file_extension": ".py", 276 | "mimetype": 
"text/x-python", 277 | "name": "python", 278 | "nbconvert_exporter": "python", 279 | "pygments_lexer": "ipython3", 280 | "version": "3.11.2" 281 | } 282 | }, 283 | "nbformat": 4, 284 | "nbformat_minor": 5 285 | } 286 | -------------------------------------------------------------------------------- /AY01/Tutorial_joblib_2_reuse.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "id": "a7056a3f", 7 | "metadata": {}, 8 | "source": [ 9 | "# Ayudantía 1 - Notebook 2\n", 10 | "### Profesor: Elwin van 't Wout\n", 11 | "### Ayudante: Alberto Almuna Morales (alberto.almuna@uc.cl)" 12 | ] 13 | }, 14 | { 15 | "attachments": {}, 16 | "cell_type": "markdown", 17 | "id": "a032fff2", 18 | "metadata": {}, 19 | "source": [ 20 | "The library ```joblib``` provides functionality for parallel computing. In this notebook, let us use a parallel pool of workers for different tasks." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "id": "34a5963a", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import numpy as np" 31 | ] 32 | }, 33 | { 34 | "attachments": {}, 35 | "cell_type": "markdown", 36 | "id": "834bf49b", 37 | "metadata": {}, 38 | "source": [ 39 | "The `Parallel` class of `joblib` creates a pool of workers to which tasks can be assigned. This pool of workers can be reused for different sets of tasks." 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "id": "5225b2c9", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "from joblib import Parallel, delayed" 50 | ] 51 | }, 52 | { 53 | "attachments": {}, 54 | "cell_type": "markdown", 55 | "id": "46e35cb6", 56 | "metadata": {}, 57 | "source": [ 58 | "Let us create a pool with the maximum number of workers available on our machine. 
Specifying the number of jobs as minus one means the maximum number of workers that can automatically be found on the machine." 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "id": "bbd0465c", 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "Number of cores found by joblib: 8\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "from joblib import cpu_count\n", 77 | "print(\"Number of cores found by joblib:\", cpu_count())" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 4, 83 | "id": "3db86182", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "parallel_pool = Parallel(n_jobs=-1)" 88 | ] 89 | }, 90 | { 91 | "attachments": {}, 92 | "cell_type": "markdown", 93 | "id": "03c1b471", 94 | "metadata": {}, 95 | "source": [ 96 | "Let us create two different functions we like to perform: taking the square and the square root. For the square root, we can use the `Numpy` function, but for the square we create our own function." 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "id": "df3e02e7", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "def my_square(n):\n", 107 | " return n**2\n", 108 | "parallel_square = delayed(my_square)\n", 109 | "parallel_root = delayed(np.sqrt)" 110 | ] 111 | }, 112 | { 113 | "attachments": {}, 114 | "cell_type": "markdown", 115 | "id": "6fd79d41", 116 | "metadata": {}, 117 | "source": [ 118 | "Creating the tasks requires specifying the input variables. For the square, let us use a uniform sample for values between zero and one. For the square root, we'd like to use the previous output and check if the result is the input again. Hence, we first need to perform the square operations." 
119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 8, 124 | "id": "d011898d", 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "Input values are: [0. 0.11111111 0.22222222 0.33333333 0.44444444 0.55555556\n", 132 | " 0.66666667 0.77777778 0.88888889 1. ]\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "input_values = np.linspace(0,1,10)\n", 138 | "print(\"Input values are:\", input_values)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 9, 144 | "id": "a756c03c", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "parallel_tasks_square = [parallel_square(i) for i in input_values]" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 10, 154 | "id": "25f7702c", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "parallel_results_square = parallel_pool(parallel_tasks_square)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 11, 164 | "id": "e7a16225", 165 | "metadata": {}, 166 | "outputs": [ 167 | { 168 | "name": "stdout", 169 | "output_type": "stream", 170 | "text": [ 171 | "The squares of input values are: [0.0, 0.012345679012345678, 0.04938271604938271, 0.1111111111111111, 0.19753086419753085, 0.308641975308642, 0.4444444444444444, 0.6049382716049381, 0.7901234567901234, 1.0]\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "print(\"The squares of input values are:\", parallel_results_square)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 12, 182 | "id": "ed4f1782", 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "parallel_tasks_root = [parallel_root(i) for i in parallel_results_square]" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 13, 192 | "id": "61cdb6a0", 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "parallel_results_root = 
parallel_pool(parallel_tasks_root)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 14, 202 | "id": "58506510", 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "name": "stdout", 207 | "output_type": "stream", 208 | "text": [ 209 | "The square roots of the squares of the input values are: [0.0, 0.1111111111111111, 0.2222222222222222, 0.3333333333333333, 0.4444444444444444, 0.5555555555555556, 0.6666666666666666, 0.7777777777777777, 0.8888888888888888, 1.0]\n" 210 | ] 211 | } 212 | ], 213 | "source": [ 214 | "print(\"The square roots of the squares of the input values are:\", parallel_results_root)" 215 | ] 216 | }, 217 | { 218 | "attachments": {}, 219 | "cell_type": "markdown", 220 | "id": "ddaf4ddf", 221 | "metadata": {}, 222 | "source": [ 223 | "Notice that the square root of the square of the input values are indeed the input variables, perhaps with a small rounding error. Also, the same pool of workers was used twice: the tasks needed to be defined again but the same worker pool can be used many times. Reusing the same worker pool tends to be quicker since it can be initialized once and applied to different tasks." 
224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "id": "e41c04ac", 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [] 233 | } 234 | ], 235 | "metadata": { 236 | "kernelspec": { 237 | "display_name": "Python 3.10.4 64-bit", 238 | "language": "python", 239 | "name": "python3" 240 | }, 241 | "language_info": { 242 | "codemirror_mode": { 243 | "name": "ipython", 244 | "version": 3 245 | }, 246 | "file_extension": ".py", 247 | "mimetype": "text/x-python", 248 | "name": "python", 249 | "nbconvert_exporter": "python", 250 | "pygments_lexer": "ipython3", 251 | "version": "3.11.2" 252 | }, 253 | "vscode": { 254 | "interpreter": { 255 | "hash": "7600a12950a547366bb7a6732117e300ffd26224351912980486e1126c5d0f9a" 256 | } 257 | } 258 | }, 259 | "nbformat": 4, 260 | "nbformat_minor": 5 261 | } 262 | -------------------------------------------------------------------------------- /AY08/Numba/2_Numba_vector_addition.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "770a296f", 6 | "metadata": {}, 7 | "source": [ 8 | "# Basic usage of Numba\n", 9 | "\n", 10 | " - Elwin van 't Wout\n", 11 | " - Pontificia Universidad Católica de Chile\n", 12 | " - IMT3870\n", 13 | " - 26-8-2024\n", 14 | "\n", 15 | "Sum the values of a vector and compare the timing between Python, Numpy, and Numba." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "id": "217c54e1", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np\n", 26 | "from numba import jit" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "id": "14629e10", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "def sum_vector_python(a):\n", 37 | " s = 0\n", 38 | " for i in range(a.size):\n", 39 | " s += a[i]\n", 40 | " return s " 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "id": "2f57b2ac", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "def sum_vector_numpy(a):\n", 51 | " s = np.sum(a)\n", 52 | " return s " 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "id": "f366e150", 58 | "metadata": {}, 59 | "source": [ 60 | "For Numba, we can use exactly the same function as before but with the Numba decorator added.\n", 61 | "\n", 62 | "Use the option ```nopython=True``` to use the performance optimisation of Numba. Alternatively, one can use ```@njit```.\n", 63 | "\n", 64 | "Remark: earlier versions of Numba had the option of using ```nopython=False```, which basically ran the Python code as is, without any optimization. However, this option is deprecated, and more recent versions do not support this functionality anymore. Depending on the version, you may receive a warning or Numba just ignores the option." 
65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "id": "183a1d41", 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "@jit(nopython=True)\n", 75 | "def sum_vector_numba_nopython(a):\n", 76 | " s = 0\n", 77 | " for i in range(a.size):\n", 78 | " s += a[i]\n", 79 | " return s" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 5, 85 | "id": "d8a6f955-5a3f-4283-b222-66d19c76b91c", 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stderr", 90 | "output_type": "stream", 91 | "text": [ 92 | "C:\\Users\\alber\\AppData\\Local\\Temp\\ipykernel_11680\\3605788378.py:1: NumbaDeprecationWarning: \u001b[1mThe keyword argument 'nopython=False' was supplied. From Numba 0.59.0 the default is being changed to True and use of 'nopython=False' will raise a warning as the argument will have no effect. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n", 93 | " @jit(nopython=False)\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "@jit(nopython=False)\n", 99 | "def sum_vector_numba_python(a):\n", 100 | " s = 0\n", 101 | " for i in range(a.size):\n", 102 | " s += a[i]\n", 103 | " return s" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "id": "f2644c21", 109 | "metadata": {}, 110 | "source": [ 111 | "Let us create a vector with elements $0,1,2,\\dots,n-1$ and calculate the sum." 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 6, 117 | "id": "6d76abcf", 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "n = int(1e7)\n", 122 | "vec = np.arange(n)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "c63e69dc", 128 | "metadata": {}, 129 | "source": [ 130 | "Be careful with the timing of the Numba function. The first call is always slow because the code needs to be compiled." 
131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 11, 136 | "id": "d171e3dc", 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "The 1st call to the Numba function took: 0.0028438568115234375 seconds.\n", 144 | "The 2nd call to the Numba function took: 0.0 seconds.\n", 145 | "The 3rd call to the Numba function took: 0.009637832641601562 seconds.\n" 146 | ] 147 | } 148 | ], 149 | "source": [ 150 | "import time\n", 151 | "time_1 = time.time()\n", 152 | "sum_vector_numba_nopython(vec)\n", 153 | "time_2 = time.time()\n", 154 | "sum_vector_numba_nopython(vec)\n", 155 | "time_3 = time.time()\n", 156 | "sum_vector_numba_nopython(vec)\n", 157 | "time_4 = time.time()\n", 158 | "print(\"The 1st call to the Numba function took:\",time_2-time_1,\"seconds.\")\n", 159 | "print(\"The 2nd call to the Numba function took:\",time_3-time_2,\"seconds.\")\n", 160 | "print(\"The 3rd call to the Numba function took:\",time_4-time_3,\"seconds.\")" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 12, 166 | "id": "d5f596ee-9fba-44df-b3ce-d31c017181d8", 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "The 1st call to the Numba function took: 0.0029931068420410156 seconds.\n", 174 | "The 2nd call to the Numba function took: 0.005274295806884766 seconds.\n", 175 | "The 3rd call to the Numba function took: 0.00500035285949707 seconds.\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "import time\n", 181 | "time_1 = time.time()\n", 182 | "sum_vector_numba_python(vec)\n", 183 | "time_2 = time.time()\n", 184 | "sum_vector_numba_python(vec)\n", 185 | "time_3 = time.time()\n", 186 | "sum_vector_numba_python(vec)\n", 187 | "time_4 = time.time()\n", 188 | "print(\"The 1st call to the Numba function took:\",time_2-time_1,\"seconds.\")\n", 189 | "print(\"The 2nd call to the Numba function 
took:\",time_3-time_2,\"seconds.\")\n", 190 | "print(\"The 3rd call to the Numba function took:\",time_4-time_3,\"seconds.\")" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 13, 196 | "id": "c6814cce", 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "name": "stderr", 201 | "output_type": "stream", 202 | "text": [ 203 | "C:\\Users\\alber\\AppData\\Local\\Temp\\ipykernel_11680\\1815385635.py:4: RuntimeWarning: overflow encountered in scalar add\n", 204 | " s += a[i]\n" 205 | ] 206 | }, 207 | { 208 | "name": "stdout", 209 | "output_type": "stream", 210 | "text": [ 211 | "822 ms ± 32.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "%%timeit\n", 217 | "sum_vector_python(vec)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 14, 223 | "id": "728db4c7", 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "name": "stdout", 228 | "output_type": "stream", 229 | "text": [ 230 | "3.71 ms ± 154 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" 231 | ] 232 | } 233 | ], 234 | "source": [ 235 | "%%timeit\n", 236 | "sum_vector_numpy(vec)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 15, 242 | "id": "192b5184", 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "2.73 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" 250 | ] 251 | } 252 | ], 253 | "source": [ 254 | "%%timeit\n", 255 | "sum_vector_numba_nopython(vec)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 16, 261 | "id": "a2bb42c8-e541-411b-be97-e6e251881f99", 262 | "metadata": {}, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "2.68 ms ± 112 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)\n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "%%timeit\n", 274 | "sum_vector_numba_python(vec)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "id": "8c5670b1", 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "id": "aeff57f0", 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [] 292 | } 293 | ], 294 | "metadata": { 295 | "kernelspec": { 296 | "display_name": "Python 3", 297 | "language": "python", 298 | "name": "python3" 299 | }, 300 | "language_info": { 301 | "codemirror_mode": { 302 | "name": "ipython", 303 | "version": 3 304 | }, 305 | "file_extension": ".py", 306 | "mimetype": "text/x-python", 307 | "name": "python", 308 | "nbconvert_exporter": "python", 309 | "pygments_lexer": "ipython3", 310 | "version": "3.11.2" 311 | } 312 | }, 313 | "nbformat": 4, 314 | "nbformat_minor": 5 315 | } 316 | -------------------------------------------------------------------------------- /AY08/Numba/3_Numba_vector_addition_parallel.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "770a296f", 6 | "metadata": {}, 7 | "source": [ 8 | "# Parallel functionality of Numba\n", 9 | "\n", 10 | " - Elwin van 't Wout\n", 11 | " - Pontificia Universidad Católica de Chile\n", 12 | " - IMT3870\n", 13 | " - 26-8-2024\n", 14 | "\n", 15 | "Sum the values of a vector and compare the timing between parallelised versions." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "id": "217c54e1", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np\n", 26 | "from numba import njit, prange" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "id": "14629e10", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "def sum_vector_python(a):\n", 37 | " s = 0\n", 38 | " for i in range(a.size):\n", 39 | " s += a[i]\n", 40 | " return s " 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "id": "2f57b2ac", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "def sum_vector_numpy(a):\n", 51 | " s = np.sum(a)\n", 52 | " return s " 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "id": "f366e150", 58 | "metadata": {}, 59 | "source": [ 60 | "We can use Numba to optimize the Python code through its JIT capabilities. Moreover, Numba can automatically parallelise code through the multi-threading paradigm. As a serial baseline for comparison, first set the option ```parallel=False```." 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "id": "183a1d41", 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "@njit(parallel=False)\n", 71 | "def sum_vector_numba_serial(a):\n", 72 | " s = 0\n", 73 | " for i in range(a.size):\n", 74 | " s += a[i]\n", 75 | " return s " 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "id": "c5f85486", 81 | "metadata": {}, 82 | "source": [ 83 | "Adding the parallel option to the Numba decorator makes Numba search for parts of the code that can be parallelised. Add the option ```parallel=True``` for automatic parallelisation. In earlier versions, this will only work when ```nopython=True```."
84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "id": "b31b18fc", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "@njit(parallel=True)\n", 94 | "def sum_vector_numba_parallel(a):\n", 95 | " s = 0\n", 96 | " for i in range(a.size):\n", 97 | " s += a[i]\n", 98 | " return s " 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "id": "ad1e849b", 104 | "metadata": {}, 105 | "source": [ 106 | "Instead of letting Numba search for parallelisation opportunities, you can also explicitly state that a for loop needs to be parallelised. Use the function ```prange()``` instead of the standard ```range()``` in the for loop. In this case, Numba automatically detects that the variable ```s``` for the sum is a shared variable and solves issues with race conditions." 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 6, 112 | "id": "3b69d84b", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "@njit(parallel=True)\n", 117 | "def sum_vector_numba_prange(a):\n", 118 | " s = 0\n", 119 | " for i in prange(a.size):\n", 120 | " s += a[i]\n", 121 | " return s " 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "id": "f2644c21", 127 | "metadata": {}, 128 | "source": [ 129 | "Let us create a vector with elements $0,1,2,\\dots,n-1$ and calculate the sum." 
130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 7, 135 | "id": "6d76abcf", 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "n = int(1e7)\n", 140 | "vec = np.arange(n)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "id": "3a4a4710", 146 | "metadata": {}, 147 | "source": [ 148 | "Before performing the timings, call the Numba functions once, so that they are compiled" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 8, 154 | "id": "0188ef52", 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "Sum of vector with serial Numba: 49999995000000\n", 162 | "Sum of vector with parallel Numba: 49999995000000\n" 163 | ] 164 | }, 165 | { 166 | "name": "stderr", 167 | "output_type": "stream", 168 | "text": [ 169 | "c:\\Users\\alber\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\numba\\core\\typed_passes.py:334: NumbaPerformanceWarning: \u001b[1m\n", 170 | "The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.\n", 171 | "\n", 172 | "To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.\n", 173 | "\u001b[1m\n", 174 | "File \"..\\..\\..\\..\\..\\..\\..\\AppData\\Local\\Temp\\ipykernel_13076\\1770870890.py\", line 1:\u001b[0m\n", 175 | "\u001b[1m\u001b[0m\n", 176 | "\u001b[0m\n", 177 | " warnings.warn(errors.NumbaPerformanceWarning(msg,\n" 178 | ] 179 | }, 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "Sum of vector with prange Numba: 49999995000000\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "print(\"Sum of vector with serial Numba:\", sum_vector_numba_serial(vec))\n", 190 | "print(\"Sum of vector with parallel Numba:\", sum_vector_numba_parallel(vec))\n", 191 | "print(\"Sum of vector with prange Numba:\", 
sum_vector_numba_prange(vec))" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "id": "678670a6", 197 | "metadata": {}, 198 | "source": [ 199 | "Numba may give warnings when it cannot perform the requested optimisation of the code." 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 9, 205 | "id": "c6814cce", 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "name": "stderr", 210 | "output_type": "stream", 211 | "text": [ 212 | "C:\\Users\\alber\\AppData\\Local\\Temp\\ipykernel_13076\\1815385635.py:4: RuntimeWarning: overflow encountered in scalar add\n", 213 | " s += a[i]\n" 214 | ] 215 | }, 216 | { 217 | "name": "stdout", 218 | "output_type": "stream", 219 | "text": [ 220 | "843 ms ± 35.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "%%timeit\n", 226 | "sum_vector_python(vec)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 10, 232 | "id": "728db4c7", 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "name": "stdout", 237 | "output_type": "stream", 238 | "text": [ 239 | "3.64 ms ± 101 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" 240 | ] 241 | } 242 | ], 243 | "source": [ 244 | "%%timeit\n", 245 | "sum_vector_numpy(vec)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 11, 251 | "id": "192b5184", 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "2.6 ms ± 73.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" 259 | ] 260 | } 261 | ], 262 | "source": [ 263 | "%%timeit\n", 264 | "sum_vector_numba_serial(vec)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 12, 270 | "id": "a636b5ab", 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "name": "stdout", 275 | "output_type": "stream", 276 | "text": [ 277 | "2.59 ms ± 124 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "%%timeit\n", 283 | "sum_vector_numba_parallel(vec)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 13, 289 | "id": "842cbee4", 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "name": "stdout", 294 | "output_type": "stream", 295 | "text": [ 296 | "760 µs ± 41.3 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" 297 | ] 298 | } 299 | ], 300 | "source": [ 301 | "%%timeit\n", 302 | "sum_vector_numba_prange(vec)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "id": "4d2f75c6", 308 | "metadata": {}, 309 | "source": [ 310 | "The number of threads used by Numba is stored in global variables." 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 14, 316 | "id": "5e71e3e5", 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "name": "stdout", 321 | "output_type": "stream", 322 | "text": [ 323 | "The number of available CPUs detected by Numba is: 8\n", 324 | "The number of threads used by Numba is: 8\n" 325 | ] 326 | } 327 | ], 328 | "source": [ 329 | "from numba import config\n", 330 | "print(\"The number of available CPUs detected by Numba is:\", config.NUMBA_DEFAULT_NUM_THREADS)\n", 331 | "print(\"The number of threads used by Numba is:\", config.NUMBA_NUM_THREADS)" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "id": "b78e5700", 337 | "metadata": {}, 338 | "source": [ 339 | "The number of threads used by Numba can be changed manually." 
340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 15, 345 | "id": "9c501c38", 346 | "metadata": {}, 347 | "outputs": [ 348 | { 349 | "name": "stdout", 350 | "output_type": "stream", 351 | "text": [ 352 | "The current number of threads used by Numba is: 2\n" 353 | ] 354 | } 355 | ], 356 | "source": [ 357 | "from numba import set_num_threads, get_num_threads\n", 358 | "set_num_threads(2)\n", 359 | "print(\"The current number of threads used by Numba is:\", get_num_threads())" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "id": "9340dcab", 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [] 369 | } 370 | ], 371 | "metadata": { 372 | "kernelspec": { 373 | "display_name": "Python 3", 374 | "language": "python", 375 | "name": "python3" 376 | }, 377 | "language_info": { 378 | "codemirror_mode": { 379 | "name": "ipython", 380 | "version": 3 381 | }, 382 | "file_extension": ".py", 383 | "mimetype": "text/x-python", 384 | "name": "python", 385 | "nbconvert_exporter": "python", 386 | "pygments_lexer": "ipython3", 387 | "version": "3.11.2" 388 | } 389 | }, 390 | "nbformat": 4, 391 | "nbformat_minor": 5 392 | } 393 | -------------------------------------------------------------------------------- /AY01/Tutorial_joblib_1_basics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "id": "c10cb256", 7 | "metadata": {}, 8 | "source": [ 9 | "# Ayudantía 1 - Notebook 1\n", 10 | "### Profesor: Elwin van 't Wout\n", 11 | "### Ayudante: Alberto Almuna Morales (alberto.almuna@uc.cl)" 12 | ] 13 | }, 14 | { 15 | "attachments": {}, 16 | "cell_type": "markdown", 17 | "id": "a032fff2", 18 | "metadata": {}, 19 | "source": [ 20 | "The library ```joblib``` provides functionality for parallel computing. In this notebook, let us look into the basics of the library." 
21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "id": "34a5963a", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import numpy as np" 31 | ] 32 | }, 33 | { 34 | "attachments": {}, 35 | "cell_type": "markdown", 36 | "id": "834bf49b", 37 | "metadata": {}, 38 | "source": [ 39 | "The ```joblib``` library has the class ```Parallel``` which provides the basic structure for parallel computing. The class provides the functionality to create a pool of workers that can perform tasks in parallel. let us create such object." 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "id": "5225b2c9", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "from joblib import Parallel" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 4, 55 | "id": "a7d9cf75", 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "Parallel(n_jobs=1)\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "parallel_pool = Parallel()\n", 68 | "print(parallel_pool)" 69 | ] 70 | }, 71 | { 72 | "attachments": {}, 73 | "cell_type": "markdown", 74 | "id": "1df39af9", 75 | "metadata": {}, 76 | "source": [ 77 | "By default, the object is initialized with only a single job. This means that no parallelization will be performed, because only one worker was created. Let us specify the number of jobs explicitly upon creating the worker pool." 
78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 5, 83 | "id": "89d90ff4", 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "Parallel(n_jobs=2)\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "parallel_pool = Parallel(n_jobs=2)\n", 96 | "print(parallel_pool)" 97 | ] 98 | }, 99 | { 100 | "attachments": {}, 101 | "cell_type": "markdown", 102 | "id": "92f1bf4f", 103 | "metadata": {}, 104 | "source": [ 105 | "The number of workers can also be retrieved through the attribute ```n_jobs```." 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 6, 111 | "id": "7e8d8b69", 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "The number of workers is: 2\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "print(\"The number of workers is:\", parallel_pool.n_jobs)" 124 | ] 125 | }, 126 | { 127 | "attachments": {}, 128 | "cell_type": "markdown", 129 | "id": "dcb606d8", 130 | "metadata": {}, 131 | "source": [ 132 | "Having created a class that can create different workers, let us specify the tasks to be performed. The tasks can be specified by the decorator `delayed` from the `joblib` library. A *decorator* is a Python function that takes one function and returns another function." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 7, 138 | "id": "4818f706", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "from joblib import delayed" 143 | ] 144 | }, 145 | { 146 | "attachments": {}, 147 | "cell_type": "markdown", 148 | "id": "9f908a53", 149 | "metadata": {}, 150 | "source": [ 151 | "Here, we will calculate the square root of different values." 
152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 8, 157 | "id": "53a866e0", 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "parallel_sqrt = delayed(np.sqrt)" 162 | ] 163 | }, 164 | { 165 | "attachments": {}, 166 | "cell_type": "markdown", 167 | "id": "ae567efb", 168 | "metadata": {}, 169 | "source": [ 170 | "The function `parallel_sqrt` is now a function which can be interpreted by `joblib` as a parallel variant of the function `sqrt` of `Numpy`. It can be interpreted as a function that can assign the `Numpy` square-root function to the different workers in a parallel pool." 171 | ] 172 | }, 173 | { 174 | "attachments": {}, 175 | "cell_type": "markdown", 176 | "id": "96569b47", 177 | "metadata": {}, 178 | "source": [ 179 | "Before assigning the function to the workers, we need to specify the input variables for which the function needs to be called. Notice that we need to specify all tasks we like to perform but we do not have to specify which tasks needs to be assigned to which workers. This task assignment will be performed automatically by `joblib`." 
180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 9, 185 | "id": "89160a91", 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "parallel_tasks = [parallel_sqrt(i) for i in range(10)]" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 10, 195 | "id": "73f44597", 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/plain": [ 201 | "[(, (0,), {}),\n", 202 | " (, (1,), {}),\n", 203 | " (, (2,), {}),\n", 204 | " (, (3,), {}),\n", 205 | " (, (4,), {}),\n", 206 | " (, (5,), {}),\n", 207 | " (, (6,), {}),\n", 208 | " (, (7,), {}),\n", 209 | " (, (8,), {}),\n", 210 | " (, (9,), {})]" 211 | ] 212 | }, 213 | "execution_count": 10, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "parallel_tasks" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 11, 225 | "id": "83316ab1", 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "array = [i for i in range(10)]\n", 230 | "parallel_tasks_2 = map(parallel_sqrt, array)" 231 | ] 232 | }, 233 | { 234 | "attachments": {}, 235 | "cell_type": "markdown", 236 | "id": "af54dc33", 237 | "metadata": {}, 238 | "source": [ 239 | "With the list of all tasks created, we can ask the parallel pool of workers to perform all tasks in parallel." 
240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 12, 245 | "id": "3e401c94", 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "parallel_results = parallel_pool(parallel_tasks)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 13, 255 | "id": "c91a16ec", 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "parallel_results_2 = parallel_pool(parallel_tasks_2)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 14, 265 | "id": "8d961dcc", 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "name": "stdout", 270 | "output_type": "stream", 271 | "text": [ 272 | "[0.0, 1.0, 1.4142135623730951, 1.7320508075688772, 2.0, 2.23606797749979, 2.449489742783178, 2.6457513110645907, 2.8284271247461903, 3.0]\n" 273 | ] 274 | } 275 | ], 276 | "source": [ 277 | "print(parallel_results)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 15, 283 | "id": "5b14e197", 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "[0.0, 1.0, 1.4142135623730951, 1.7320508075688772, 2.0, 2.23606797749979, 2.449489742783178, 2.6457513110645907, 2.8284271247461903, 3.0]\n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "print(parallel_results_2)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 16, 301 | "id": "3be38762", 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "data": { 306 | "text/plain": [ 307 | "True" 308 | ] 309 | }, 310 | "execution_count": 16, 311 | "metadata": {}, 312 | "output_type": "execute_result" 313 | } 314 | ], 315 | "source": [ 316 | "parallel_results == parallel_results_2" 317 | ] 318 | }, 319 | { 320 | "attachments": {}, 321 | "cell_type": "markdown", 322 | "id": "a9889775", 323 | "metadata": {}, 324 | "source": [ 325 | "The output is indeed the square root of all input values." 
326 | ] 327 | }, 328 | { 329 | "attachments": {}, 330 | "cell_type": "markdown", 331 | "id": "938015bd", 332 | "metadata": {}, 333 | "source": [ 334 | "### Example of function with multiple arguments:" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 17, 340 | "id": "291c5270", 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "[5, 12, 21, 32, 45, 60, 77, 96, 117, 140]\n" 348 | ] 349 | } 350 | ], 351 | "source": [ 352 | "def my_task(n, m):\n", 353 | " return n*m\n", 354 | "\n", 355 | "n = [i for i in range(1, 11)]\n", 356 | "m = [i for i in range(5, 16)]\n", 357 | "\n", 358 | "tasks = [delayed(my_task)(i, j) for i, j in zip(n,m)]\n", 359 | "\n", 360 | "with Parallel(n_jobs=4) as parallel_pool:\n", 361 | " parallel_results = parallel_pool(tasks)\n", 362 | " print(parallel_results)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 18, 368 | "id": "03a830b3", 369 | "metadata": {}, 370 | "outputs": [ 371 | { 372 | "name": "stdout", 373 | "output_type": "stream", 374 | "text": [ 375 | "[5, 12, 21, 32, 45, 60, 77, 96, 117, 140]\n" 376 | ] 377 | } 378 | ], 379 | "source": [ 380 | "tasks_2 = map(delayed(my_task), n, m)\n", 381 | "\n", 382 | "with Parallel(n_jobs=4) as parallel_pool:\n", 383 | " parallel_results = parallel_pool(tasks_2)\n", 384 | " print(parallel_results)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "id": "de107ce8", 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [] 394 | } 395 | ], 396 | "metadata": { 397 | "kernelspec": { 398 | "display_name": "Python 3.10.4 64-bit", 399 | "language": "python", 400 | "name": "python3" 401 | }, 402 | "language_info": { 403 | "codemirror_mode": { 404 | "name": "ipython", 405 | "version": 3 406 | }, 407 | "file_extension": ".py", 408 | "mimetype": "text/x-python", 409 | "name": "python", 410 | "nbconvert_exporter": "python", 411 | 
"pygments_lexer": "ipython3", 412 | "version": "3.11.2" 413 | }, 414 | "vscode": { 415 | "interpreter": { 416 | "hash": "7600a12950a547366bb7a6732117e300ffd26224351912980486e1126c5d0f9a" 417 | } 418 | } 419 | }, 420 | "nbformat": 4, 421 | "nbformat_minor": 5 422 | } 423 | -------------------------------------------------------------------------------- /AY08/Numba/5_Numba_data_types.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "68659313", 6 | "metadata": {}, 7 | "source": [ 8 | "# Data types in Numba\n", 9 | "\n", 10 | " - Elwin van 't Wout\n", 11 | " - Pontificia Universidad Católica de Chile\n", 12 | " - IMT3870\n", 13 | " - 26-8-2024\n", 14 | "\n", 15 | "This tutorial shows the sensitivity of Numba to data types." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "id": "8cfe49b2", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "from numba import njit\n", 26 | "import numpy as np" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "b3286f6e-2276-4de2-85df-84cc1d119a8f", 32 | "metadata": {}, 33 | "source": [ 34 | "Let us create a function that adds a given matrix to the input variable. Since the matrix is of integer type, we specify this explicitly as well." 
35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 13, 40 | "id": "fee72232-ec5b-4f01-9013-338d11840211", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "@njit\n", 45 | "def add_to_matrix(x):\n", 46 | " my_matrix = np.array([[11, 12, 13], [21, 22, 23]])\n", 47 | " sum = my_matrix + x\n", 48 | " return sum" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 14, 54 | "id": "1963d10a-e27b-434e-ab7e-3a1496f7c094", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "@njit\n", 59 | "def add_to_matrix_int(x):\n", 60 | " my_matrix = np.array([[11, 12, 13], [21, 22, 23]], dtype=int)\n", 61 | " sum = my_matrix + x\n", 62 | " return sum" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 15, 68 | "id": "db67d1f5-c2ef-4b0c-b772-4bee5768915e", 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "@njit\n", 73 | "def add_to_matrix_npint32(x):\n", 74 | " my_matrix = np.array([[11, 12, 13], [21, 22, 23]], dtype=np.int32)\n", 75 | " sum = my_matrix + x\n", 76 | " return sum" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 16, 82 | "id": "cf33edf1-b6d5-49df-b33b-e3e34e111605", 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "@njit\n", 87 | "def add_to_matrix_npint64(x):\n", 88 | " my_matrix = np.array([[11, 12, 13], [21, 22, 23]], dtype=np.int64)\n", 89 | " sum = my_matrix + x\n", 90 | " return sum" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "2de4757f-72bf-49d3-86e9-37a72546b302", 96 | "metadata": {}, 97 | "source": [ 98 | "Let's apply the functionality to a matrix with the same size and data type." 
99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 22, 104 | "id": "c409779c-f071-482d-a91e-88dc4f2ef1f8", 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "Matrix:\n", 112 | " [[1 2 3]\n", 113 | " [4 5 6]]\n", 114 | "Data type: int32\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "matrix_int_2_x_3 = np.array([[1, 2, 3], [4, 5, 6]], dtype=int)\n", 120 | "print(\"Matrix:\\n\", matrix_int_2_x_3)\n", 121 | "print(\"Data type:\", matrix_int_2_x_3.dtype)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 23, 127 | "id": "635f56d2-56fe-41c8-bb99-78e748c41ce7", 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "[[12 14 16]\n", 135 | " [25 27 29]]\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "result_int_2_x_3 = add_to_matrix(matrix_int_2_x_3)\n", 141 | "print(result_int_2_x_3)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 24, 147 | "id": "dddee2ac-2f07-4105-99a0-ddc55c87d7da", 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "ename": "TypingError", 152 | "evalue": "Failed in nopython mode pipeline (step: nopython frontend)\n\u001b[1m\u001b[1m\u001b[1mNo implementation of function Function() found for signature:\n \n >>> array(list(list(int64)), dtype=Function())\n \nThere are 2 candidate implementations:\n\u001b[1m - Of which 2 did not match due to:\n Overload in function 'impl_np_array': File: numba\\np\\arrayobj.py: Line 5242.\n With argument(s): '(list(list(int64)), dtype=Function())':\u001b[0m\n\u001b[1m Rejected as the implementation raised a specific error:\n TypingError: \u001b[1mThe argument \"dtype\" must be a data-type if it is provided\u001b[0m\u001b[0m\n raised from 
c:\\Users\\alber\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\numba\\np\\arrayobj.py:5250\n\u001b[0m\n\u001b[0m\u001b[1mDuring: resolving callee type: Function()\u001b[0m\n\u001b[0m\u001b[1mDuring: typing of call at C:\\Users\\alber\\AppData\\Local\\Temp\\ipykernel_5328\\785250876.py (3)\n\u001b[0m\n\u001b[1m\nFile \"..\\..\\..\\..\\..\\..\\..\\AppData\\Local\\Temp\\ipykernel_5328\\785250876.py\", line 3:\u001b[0m\n\u001b[1m\u001b[0m\n", 153 | "output_type": "error", 154 | "traceback": [ 155 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 156 | "\u001b[1;31mTypingError\u001b[0m Traceback (most recent call last)", 157 | "Cell \u001b[1;32mIn[24], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m result_int_2_x_3 \u001b[38;5;241m=\u001b[39m \u001b[43madd_to_matrix_int\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmatrix_int_2_x_3\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m(result_int_2_x_3)\n", 158 | "File \u001b[1;32mc:\\Users\\alber\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\numba\\core\\dispatcher.py:468\u001b[0m, in \u001b[0;36m_DispatcherBase._compile_for_args\u001b[1;34m(self, *args, **kws)\u001b[0m\n\u001b[0;32m 464\u001b[0m msg \u001b[38;5;241m=\u001b[39m (\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mstr\u001b[39m(e)\u001b[38;5;241m.\u001b[39mrstrip()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mThis error may have been caused \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 465\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mby the following 
argument(s):\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00margs_str\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 466\u001b[0m e\u001b[38;5;241m.\u001b[39mpatch_message(msg)\n\u001b[1;32m--> 468\u001b[0m \u001b[43merror_rewrite\u001b[49m\u001b[43m(\u001b[49m\u001b[43me\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtyping\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 469\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m errors\u001b[38;5;241m.\u001b[39mUnsupportedError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 470\u001b[0m \u001b[38;5;66;03m# Something unsupported is present in the user code, add help info\u001b[39;00m\n\u001b[0;32m 471\u001b[0m error_rewrite(e, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124munsupported_error\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", 159 | "File \u001b[1;32mc:\\Users\\alber\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\numba\\core\\dispatcher.py:409\u001b[0m, in \u001b[0;36m_DispatcherBase._compile_for_args..error_rewrite\u001b[1;34m(e, issue_type)\u001b[0m\n\u001b[0;32m 407\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[0;32m 408\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 409\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\u001b[38;5;241m.\u001b[39mwith_traceback(\u001b[38;5;28;01mNone\u001b[39;00m)\n", 160 | "\u001b[1;31mTypingError\u001b[0m: Failed in nopython mode pipeline (step: nopython frontend)\n\u001b[1m\u001b[1m\u001b[1mNo implementation of function Function() found for signature:\n \n >>> array(list(list(int64)), dtype=Function())\n \nThere are 2 candidate implementations:\n\u001b[1m - Of which 2 did not match due to:\n Overload in function 'impl_np_array': File: numba\\np\\arrayobj.py: Line 5242.\n With argument(s): '(list(list(int64)), dtype=Function())':\u001b[0m\n\u001b[1m Rejected 
as the implementation raised a specific error:\n TypingError: \u001b[1mThe argument \"dtype\" must be a data-type if it is provided\u001b[0m\u001b[0m\n raised from c:\\Users\\alber\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\numba\\np\\arrayobj.py:5250\n\u001b[0m\n\u001b[0m\u001b[1mDuring: resolving callee type: Function()\u001b[0m\n\u001b[0m\u001b[1mDuring: typing of call at C:\\Users\\alber\\AppData\\Local\\Temp\\ipykernel_5328\\785250876.py (3)\n\u001b[0m\n\u001b[1m\nFile \"..\\..\\..\\..\\..\\..\\..\\AppData\\Local\\Temp\\ipykernel_5328\\785250876.py\", line 3:\u001b[0m\n\u001b[1m\u001b[0m\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "result_int_2_x_3 = add_to_matrix_int(matrix_int_2_x_3)\n", 166 | "print(result_int_2_x_3)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 25, 172 | "id": "ddc209e9-878a-4215-81de-c52b042b8a10", 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "name": "stdout", 177 | "output_type": "stream", 178 | "text": [ 179 | "[[12 14 16]\n", 180 | " [25 27 29]]\n" 181 | ] 182 | } 183 | ], 184 | "source": [ 185 | "result_int_2_x_3 = add_to_matrix_npint32(matrix_int_2_x_3)\n", 186 | "print(result_int_2_x_3)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 26, 192 | "id": "1b8dbc2c-86df-40a6-894f-73bcdb8a47e4", 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "[[12 14 16]\n", 200 | " [25 27 29]]\n" 201 | ] 202 | } 203 | ], 204 | "source": [ 205 | "result_int_2_x_3 = add_to_matrix_npint64(matrix_int_2_x_3)\n", 206 | "print(result_int_2_x_3)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "id": "c45f83ed-dc30-4a26-b91b-f5df7ab0c2a8", 212 | "metadata": {}, 213 | "source": [ 214 | "The result is indeed as expected, the sum of the two matrices. However, the second implementation raises an error. Reading the error message suggests a problem with the data type. 
This is strange, because the matrix to add was defined with ```dtype=int``` as in the Numba function. Still it raises an error since ```int``` is a data type managed by Python and ```np.int64``` a data type managed by NumPy.\n", 215 | "\n", 216 | "Regardless of this specific example, it is recommended to check data types when using Numba. It is a common error to have incompatibilities in data type since Numba infers the data type from the function and optimises the code accordingly." 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "id": "df608a2a", 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [] 226 | } 227 | ], 228 | "metadata": { 229 | "kernelspec": { 230 | "display_name": "Python 3", 231 | "language": "python", 232 | "name": "python3" 233 | }, 234 | "language_info": { 235 | "codemirror_mode": { 236 | "name": "ipython", 237 | "version": 3 238 | }, 239 | "file_extension": ".py", 240 | "mimetype": "text/x-python", 241 | "name": "python", 242 | "nbconvert_exporter": "python", 243 | "pygments_lexer": "ipython3", 244 | "version": "3.11.2" 245 | } 246 | }, 247 | "nbformat": 4, 248 | "nbformat_minor": 5 249 | } 250 | -------------------------------------------------------------------------------- /AY02/Tutorial_joblib_5_shared_variables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "8807fd1d", 6 | "metadata": {}, 7 | "source": [ 8 | "# Multiprocessing tutorial 5\n", 9 | "\n", 10 | " - Author: Elwin van 't Wout\n", 11 | " - Affiliation: Pontificia Universidad Católica de Chile\n", 12 | " - Course: IMT3870\n", 13 | " - Date: 12-8-2024" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "801f0f24", 19 | "metadata": {}, 20 | "source": [ 21 | "The library `joblib` provides functionality for parallel computing. In this notebook, let us look into shared variables." 
22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "9dc4770f", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "from joblib import Parallel, delayed" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "id": "0483e6cc", 37 | "metadata": {}, 38 | "source": [ 39 | "In the *multiprocessing* model, each process has its own data space with private variables. Hence, no variables can be shared between different tasks. The `joblib` library does allow for the use of global variables in each process, but they should not be changed by the different processes." 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "85184253", 45 | "metadata": {}, 46 | "source": [ 47 | "## Reading a global variable" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "id": "b8ee2978", 53 | "metadata": {}, 54 | "source": [ 55 | "Let us create a global variable with a specific value and a function that reads it." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "2749eda5", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "MY_GLOBAL_VAR = 1.4" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "12a4452e", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "def return_global_var():\n", 76 | " return MY_GLOBAL_VAR" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "id": "a0fae770", 82 | "metadata": {}, 83 | "source": [ 84 | "Let us perform this task with multiple processes. That is, each process returns the global variable." 
85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "id": "5d7b95dd", 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "n_workers = 2" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "id": "adfb598c", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "tasks = [delayed(return_global_var)() for i in range(n_workers)]" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "5ca1a1e9", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "with Parallel(n_jobs=n_workers, batch_size=1, verbose=10, backend='loky') as parallel_pool:\n", 115 | " parallel_results = parallel_pool(tasks)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "id": "16550bd2", 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "print(parallel_results)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "id": "4c515f3e", 131 | "metadata": {}, 132 | "source": [ 133 | "Even though each process has an independent dataspace, the variables created earlier in the notebook can also be used. However, this does not mean that the variable is actually shared in the sense that both processes can access the same memory where the variable is stored. The `joblib` library made a copy of the global variable in each process. Hence, it cannot be changed by the individual processes." 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "d9e1bd9a", 139 | "metadata": {}, 140 | "source": [ 141 | "## Writing into a global variable" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "id": "b3555c7f", 147 | "metadata": {}, 148 | "source": [ 149 | "Let us try to overwrite a global variable with different values in each process." 
150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": "67926769", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "def change_global_var(n):\n", 160 | " MY_GLOBAL_VAR = n\n", 161 | " return MY_GLOBAL_VAR" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "id": "1c81c7f1", 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "tasks = [delayed(change_global_var)(i) for i in range(n_workers)]" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "id": "47d546c2", 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "with Parallel(n_jobs=n_workers, batch_size=1, verbose=10, backend='loky') as parallel_pool:\n", 182 | " parallel_results = parallel_pool(tasks)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "id": "6d047817", 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "print(parallel_results)\n", 193 | "print(MY_GLOBAL_VAR)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "id": "9a717543", 199 | "metadata": {}, 200 | "source": [ 201 | "The above results show that in each process, a local variable named `MY_GLOBAL_VAR` was created and returned to the main process. The global variable with the same name `MY_GLOBAL_VAR` was left unchanged. Notice that this is the expected behaviour of any Python function, not just for `joblib`." 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "id": "209e2d79", 207 | "metadata": {}, 208 | "source": [ 209 | "## Changing a global variable" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "id": "882fb8e4", 215 | "metadata": {}, 216 | "source": [ 217 | "Let us try to add a value to the global variable."
218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "id": "0244968a", 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "def add_to_global_var(n):\n", 228 | " MY_GLOBAL_VAR += n\n", 229 | " return MY_GLOBAL_VAR" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "id": "607919c7", 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "tasks = [delayed(add_to_global_var)(i) for i in range(n_workers)]" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "id": "0eb30038", 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "with Parallel(n_jobs=n_workers, batch_size=1, verbose=10, backend='loky') as parallel_pool:\n", 250 | " parallel_results = parallel_pool(tasks)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "id": "82f48a16", 256 | "metadata": {}, 257 | "source": [ 258 | "The `joblib` library throws an error. Here, the function tries to read and then write into the same variable `MY_GLOBAL_VAR`. The previous examples showed that either reading or writing is possible, but adding to a global variable fails. The reason is that the assignment inside the function makes Python treat `MY_GLOBAL_VAR` as a local variable, so reading it before it has been assigned raises an `UnboundLocalError`." 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "id": "8f7fcbfd", 264 | "metadata": {}, 265 | "source": [ 266 | "## Reading databases" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "id": "52092b7b", 272 | "metadata": {}, 273 | "source": [ 274 | "In data science, it is common to have a dataset that needs to be used by all processes. However, each process has its own data space. There are different ways to handle this situation. The easiest approach is to handle the data set as a global variable. This is sufficient if the processes only need to read the dataset but not adapt it."
275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "id": "1493f80d", 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "import pandas as pd\n", 285 | "import numpy as np" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "id": "f0467d68", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "my_global_df = pd.DataFrame(data=np.arange(100), columns=[\"my_data\"])" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "id": "41152db8", 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "my_global_df" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "id": "da67b62e", 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "def sum_data(start, end):\n", 316 | " my_local_data = my_global_df[\"my_data\"][start:end]\n", 317 | " return np.sum(my_local_data)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "id": "0d5d2231", 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "chunk_size = my_global_df.shape[0] // n_workers\n", 328 | "tasks = [delayed(sum_data)(i*chunk_size, (i+1)*chunk_size) for i in range(n_workers)]" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "id": "2db9e066", 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "with Parallel(n_jobs=n_workers, batch_size=1, verbose=10, backend='loky') as parallel_pool:\n", 339 | " parallel_results = parallel_pool(tasks)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "id": "ee2650d4", 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "print(parallel_results)" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "id": "accd3a23", 355 | "metadata": {}, 356 | "source": [ 357 | "The first process indeed summed all elements in the first 
half of the database, and the second process summed the second half. Although this works, both processes have a copy of the entire database, which is inefficient." 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "id": "3e97c44a", 363 | "metadata": {}, 364 | "source": [ 365 | "One way of distributing a database over different processes is by reading the necessary parts of the database in each process. For example, one worker reads the first half and the other worker the second half of a database from disk." 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "id": "bc2b718c", 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "my_global_df.to_excel(\"my_database.xlsx\", index=False)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "id": "3d7f4053", 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "def read_and_sum_data(start, end):\n", 386 | " my_local_df = pd.read_excel(\"my_database.xlsx\", header=0, skiprows=range(1,start+1), nrows=end-start)\n", 387 | " return my_local_df.shape, np.sum(my_local_df[\"my_data\"])" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "id": "0edad3bb", 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "chunk_size = my_global_df.shape[0] // n_workers\n", 398 | "tasks = [delayed(read_and_sum_data)(i*chunk_size, (i+1)*chunk_size) for i in range(n_workers)]" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "id": "8265eaeb", 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "with Parallel(n_jobs=n_workers, batch_size=1, verbose=10, backend='loky') as parallel_pool:\n", 409 | " parallel_results = parallel_pool(tasks)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "id": "62f3257c", 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | 
"print(parallel_results)" 420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "id": "6e813b4a", 425 | "metadata": {}, 426 | "source": [ 427 | "The result shows that the local data frames are half the size of the Excel file. Furthermore, the summations are correct." 428 | ] 429 | } 430 | ], 431 | "metadata": { 432 | "kernelspec": { 433 | "display_name": "Python 3 (ipykernel)", 434 | "language": "python", 435 | "name": "python3" 436 | }, 437 | "language_info": { 438 | "codemirror_mode": { 439 | "name": "ipython", 440 | "version": 3 441 | }, 442 | "file_extension": ".py", 443 | "mimetype": "text/x-python", 444 | "name": "python", 445 | "nbconvert_exporter": "python", 446 | "pygments_lexer": "ipython3", 447 | "version": "3.12.5" 448 | } 449 | }, 450 | "nbformat": 4, 451 | "nbformat_minor": 5 452 | } 453 | -------------------------------------------------------------------------------- /AY01/Comparaciones.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": { 7 | "id": "Fv1dVo4_y2rm" 8 | }, 9 | "source": [ 10 | "# Comparaciones de rendimiento\n", 11 | "\n", 12 | "Calculemos raíces cuadradas de varios números de distintas maneras:" 13 | ] 14 | }, 15 | { 16 | "attachments": {}, 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "Rugw0fMby__Z" 20 | }, 21 | "source": [ 22 | "### Python normal:" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": { 29 | "id": "RqDWRK7xyw0u" 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "from math import sqrt\n", 34 | "import time as tm" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "n = 10000000" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": { 50 | "colab": { 51 | "base_uri": "https://localhost:8080/" 52 | }, 53 | 
"id": "sh6FUabezDiR", 54 | "outputId": "f672cf37-2834-49d1-d586-a625f58b7ebe" 55 | }, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "Comenzando a calcular...\n", 62 | "Tiempo total: 0.9421744346618652s\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "print(\"Comenzando a calcular...\")\n", 68 | "start = tm.time()\n", 69 | "normal_results = [sqrt(i) for i in range(n)]\n", 70 | "end = tm.time()\n", 71 | "print(f\"Tiempo total: {end - start}s\")" 72 | ] 73 | }, 74 | { 75 | "attachments": {}, 76 | "cell_type": "markdown", 77 | "metadata": { 78 | "id": "xPOV-TYHzH_o" 79 | }, 80 | "source": [ 81 | "## Numpy:" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 4, 87 | "metadata": { 88 | "id": "x1JnDuDozG6d" 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "import numpy as np\n", 93 | "import time as tm" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 5, 99 | "metadata": { 100 | "colab": { 101 | "base_uri": "https://localhost:8080/" 102 | }, 103 | "id": "a2iwAUwLzOHB", 104 | "outputId": "763cac3a-3bcd-40a8-bb63-ffcaa6cae7a7" 105 | }, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "Comenzando a calcular...\n", 112 | "Tiempo total: 0.04117774963378906s\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "print(\"Comenzando a calcular...\")\n", 118 | "start = tm.time()\n", 119 | "data = np.arange(n)\n", 120 | "numpy_results = np.sqrt(data) # Ojo que le damos todo el array de datos directamente a la función\n", 121 | "# En general, al trabajar con Numpy, es muy buena idea hacer todo de manera vectorial/matricial.\n", 122 | "# Está muy optimizado para esto!\n", 123 | "end = tm.time()\n", 124 | "print(f\"Tiempo total: {end - start}s\")" 125 | ] 126 | }, 127 | { 128 | "attachments": {}, 129 | "cell_type": "markdown", 130 | "metadata": { 131 | "id": "KizYAhQ2zR9g" 132 | }, 133 | "source": [ 134 | "## Joblib" 135 | ] 136 | }, 137 
| { 138 | "cell_type": "code", 139 | "execution_count": 6, 140 | "metadata": { 141 | "id": "x4jm5YJpzQUD" 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "import numpy as np\n", 146 | "from math import sqrt\n", 147 | "from joblib import Parallel\n", 148 | "from joblib import delayed\n", 149 | "import time as tm" 150 | ] 151 | }, 152 | { 153 | "attachments": {}, 154 | "cell_type": "markdown", 155 | "metadata": { 156 | "id": "2dg3MYcI1GLw" 157 | }, 158 | "source": [ 159 | "Dos trabajadores:" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 7, 165 | "metadata": { 166 | "colab": { 167 | "base_uri": "https://localhost:8080/" 168 | }, 169 | "id": "l_JLrCVa09X5", 170 | "outputId": "11f283eb-aabd-4745-925f-d1083edcbeeb" 171 | }, 172 | "outputs": [ 173 | { 174 | "name": "stdout", 175 | "output_type": "stream", 176 | "text": [ 177 | "Comenzando a calcular...\n", 178 | "Tiempo total: 18.816686868667603s\n" 179 | ] 180 | } 181 | ], 182 | "source": [ 183 | "print(\"Comenzando a calcular...\")\n", 184 | "start = tm.time()\n", 185 | "parallel_pool = Parallel(n_jobs=2)\n", 186 | "parallel_sqrt = delayed(sqrt)\n", 187 | "parallel_tasks = [parallel_sqrt(i) for i in range(n)]\n", 188 | "parallel_results = parallel_pool(parallel_tasks)\n", 189 | "end = tm.time()\n", 190 | "print(f\"Tiempo total: {end - start}s\")" 191 | ] 192 | }, 193 | { 194 | "attachments": {}, 195 | "cell_type": "markdown", 196 | "metadata": { 197 | "id": "xkxI7K8P1IZF" 198 | }, 199 | "source": [ 200 | "Cuatro trabajadores:" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 8, 206 | "metadata": { 207 | "colab": { 208 | "base_uri": "https://localhost:8080/" 209 | }, 210 | "id": "dKiyfzwCzaFZ", 211 | "outputId": "0fc8e872-2786-4ffc-b145-9a924dc276ea" 212 | }, 213 | "outputs": [ 214 | { 215 | "name": "stdout", 216 | "output_type": "stream", 217 | "text": [ 218 | "Comenzando a calcular...\n", 219 | "Tiempo total: 16.809730291366577s\n" 220 | ] 221 | } 222 | 
], 223 | "source": [ 224 | "print(\"Comenzando a calcular...\")\n", 225 | "start = tm.time()\n", 226 | "parallel_pool = Parallel(n_jobs=4)\n", 227 | "parallel_sqrt = delayed(sqrt)\n", 228 | "parallel_tasks = [parallel_sqrt(i) for i in range(n)]\n", 229 | "parallel_results = parallel_pool(parallel_tasks)\n", 230 | "end = tm.time()\n", 231 | "print(f\"Tiempo total: {end - start}s\")" 232 | ] 233 | }, 234 | { 235 | "attachments": {}, 236 | "cell_type": "markdown", 237 | "metadata": { 238 | "id": "4zfFot501KeG" 239 | }, 240 | "source": [ 241 | "¿Qué ocurre si usamos la función raíz de numpy?" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 9, 247 | "metadata": { 248 | "colab": { 249 | "base_uri": "https://localhost:8080/" 250 | }, 251 | "id": "hIy5wAJKzXN2", 252 | "outputId": "bbd1530c-5bf4-4492-a561-e3330ff57aa6" 253 | }, 254 | "outputs": [ 255 | { 256 | "name": "stdout", 257 | "output_type": "stream", 258 | "text": [ 259 | "Comenzando a calcular...\n", 260 | "Tiempo total: 34.296361446380615s\n" 261 | ] 262 | } 263 | ], 264 | "source": [ 265 | "print(\"Comenzando a calcular...\")\n", 266 | "start = tm.time()\n", 267 | "parallel_pool = Parallel(n_jobs=2)\n", 268 | "parallel_sqrt = delayed(np.sqrt) # Notar la diferencia\n", 269 | "parallel_tasks = [parallel_sqrt(i) for i in range(n)]\n", 270 | "parallel_results = parallel_pool(parallel_tasks)\n", 271 | "end = tm.time()\n", 272 | "print(f\"Tiempo total: {end - start}s\")" 273 | ] 274 | }, 275 | { 276 | "attachments": {}, 277 | "cell_type": "markdown", 278 | "metadata": { 279 | "id": "CB7eJ_6b9Diw" 280 | }, 281 | "source": [ 282 | "Finalmente con batch_size fijo:" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 10, 288 | "metadata": { 289 | "colab": { 290 | "base_uri": "https://localhost:8080/" 291 | }, 292 | "id": "fGEG0I-VzhYS", 293 | "outputId": "3bab9c95-cd4a-4fe7-d9be-5c54447c6b86" 294 | }, 295 | "outputs": [ 296 | { 297 | "name": "stdout", 298 | 
"output_type": "stream", 299 | "text": [ 300 | "Comenzando a calcular...\n", 301 | "Tiempo total: 15.666585206985474s\n" 302 | ] 303 | } 304 | ], 305 | "source": [ 306 | "print(\"Comenzando a calcular...\")\n", 307 | "start = tm.time()\n", 308 | "parallel_pool = Parallel(n_jobs=2, batch_size=100000)\n", 309 | "parallel_sqrt = delayed(sqrt)\n", 310 | "parallel_tasks = [parallel_sqrt(i) for i in range(n)]\n", 311 | "parallel_results = parallel_pool(parallel_tasks)\n", 312 | "end = tm.time()\n", 313 | "print(f\"Tiempo total: {end - start}s\")" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 11, 319 | "metadata": { 320 | "colab": { 321 | "base_uri": "https://localhost:8080/" 322 | }, 323 | "id": "PdiInRDx9IFA", 324 | "outputId": "1a5efdd8-eaf9-43ba-b3f6-7aef7ea4fd56" 325 | }, 326 | "outputs": [ 327 | { 328 | "name": "stdout", 329 | "output_type": "stream", 330 | "text": [ 331 | "Comenzando a calcular...\n", 332 | "Tiempo total: 19.51024317741394s\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "print(\"Comenzando a calcular...\")\n", 338 | "start = tm.time()\n", 339 | "parallel_pool = Parallel(n_jobs=2, batch_size=500000)\n", 340 | "parallel_sqrt = delayed(sqrt)\n", 341 | "parallel_tasks = [parallel_sqrt(i) for i in range(n)]\n", 342 | "parallel_results = parallel_pool(parallel_tasks)\n", 343 | "end = tm.time()\n", 344 | "print(f\"Tiempo total: {end - start}s\")" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 12, 350 | "metadata": { 351 | "id": "i8dx3VA49wBZ" 352 | }, 353 | "outputs": [ 354 | { 355 | "name": "stdout", 356 | "output_type": "stream", 357 | "text": [ 358 | "Comenzando a calcular...\n", 359 | "Tiempo total: 20.722821712493896s\n" 360 | ] 361 | } 362 | ], 363 | "source": [ 364 | "print(\"Comenzando a calcular...\")\n", 365 | "start = tm.time()\n", 366 | "parallel_pool = Parallel(n_jobs=4, batch_size=int(n/4))\n", 367 | "parallel_sqrt = delayed(sqrt)\n", 368 | "parallel_tasks = [parallel_sqrt(i) 
for i in range(n)]\n", 369 | "parallel_results = parallel_pool(parallel_tasks)\n", 370 | "end = tm.time()\n", 371 | "print(f\"Tiempo total: {end - start}s\")" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "## Comparación rendimiento de appendear valores a una lista vs un array" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 13, 384 | "metadata": {}, 385 | "outputs": [ 386 | { 387 | "name": "stdout", 388 | "output_type": "stream", 389 | "text": [ 390 | "0.6783316135406494\n" 391 | ] 392 | } 393 | ], 394 | "source": [ 395 | "# Numpy array vs listas\n", 396 | "\n", 397 | "lista = []\n", 398 | "t1 = tm.time()\n", 399 | "for i in range(n):\n", 400 | " lista.append(i)\n", 401 | "t2 = tm.time()\n", 402 | "print(t2-t1)" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 15, 408 | "metadata": {}, 409 | "outputs": [ 410 | { 411 | "ename": "KeyboardInterrupt", 412 | "evalue": "", 413 | "output_type": "error", 414 | "traceback": [ 415 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 416 | "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 417 | "Cell \u001b[1;32mIn[15], line 4\u001b[0m\n\u001b[0;32m 2\u001b[0m t1 \u001b[38;5;241m=\u001b[39m tm\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(n):\n\u001b[1;32m----> 4\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mappend\u001b[49m\u001b[43m(\u001b[49m\u001b[43marray\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 5\u001b[0m t2 \u001b[38;5;241m=\u001b[39m tm\u001b[38;5;241m.\u001b[39mtime()\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28mprint\u001b[39m(t2\u001b[38;5;241m-\u001b[39mt1)\n", 418 | "File 
\u001b[1;32m<__array_function__ internals>:200\u001b[0m, in \u001b[0;36mappend\u001b[1;34m(*args, **kwargs)\u001b[0m\n", 419 | "File \u001b[1;32mc:\\Users\\alber\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\numpy\\lib\\function_base.py:5499\u001b[0m, in \u001b[0;36mappend\u001b[1;34m(arr, values, axis)\u001b[0m\n\u001b[0;32m 5497\u001b[0m values \u001b[38;5;241m=\u001b[39m ravel(values)\n\u001b[0;32m 5498\u001b[0m axis \u001b[38;5;241m=\u001b[39m arr\u001b[38;5;241m.\u001b[39mndim\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m\n\u001b[1;32m-> 5499\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mconcatenate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m)\u001b[49m\n", 420 | "File \u001b[1;32m<__array_function__ internals>:200\u001b[0m, in \u001b[0;36mconcatenate\u001b[1;34m(*args, **kwargs)\u001b[0m\n", 421 | "\u001b[1;31mKeyboardInterrupt\u001b[0m: " 422 | ] 423 | } 424 | ], 425 | "source": [ 426 | "array = np.array([])\n", 427 | "t1 = tm.time()\n", 428 | "for i in range(n):\n", 429 | " array = np.append(array, i)\n", 430 | "t2 = tm.time()\n", 431 | "print(t2-t1)\n", 432 | "\n", 433 | "# Lo detuve antes de que terminara porque ya era mucho, pero llegó a más de 42 minutos sin terminar." 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "En resumen:\n", 441 | "- Si necesitan ir agregando valores -> listas\n", 442 | "- Para realizar operaciones matriciales y de vectores -> NUMPY\n", 443 | " \n", 444 | "** Es común agregar valores a una lista y luego transformarla a un array para realizar operaciones con ella." 
445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "metadata": {}, 450 | "source": [] 451 | } 452 | ], 453 | "metadata": { 454 | "colab": { 455 | "provenance": [] 456 | }, 457 | "kernelspec": { 458 | "display_name": "Python 3", 459 | "name": "python3" 460 | }, 461 | "language_info": { 462 | "codemirror_mode": { 463 | "name": "ipython", 464 | "version": 3 465 | }, 466 | "file_extension": ".py", 467 | "mimetype": "text/x-python", 468 | "name": "python", 469 | "nbconvert_exporter": "python", 470 | "pygments_lexer": "ipython3", 471 | "version": "3.11.2" 472 | } 473 | }, 474 | "nbformat": 4, 475 | "nbformat_minor": 0 476 | } 477 | -------------------------------------------------------------------------------- /AY06/DataLocality.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "1e4d5e01", 6 | "metadata": {}, 7 | "source": [ 8 | "# Data locality\n", 9 | "\n", 10 | " - Author: Elwin van 't Wout\n", 11 | " - Affiliation: Pontificia Universidad Católica de Chile\n", 12 | " - Date: 18-8-2023\n", 13 | "\n", 14 | "Test the efficiency of Python with different memory access." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "id": "366bb889", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "f68d0297", 30 | "metadata": {}, 31 | "source": [ 32 | "## Loop strides\n", 33 | "\n", 34 | "Test the influence of the stride of the loop on the efficiency. First, create a large NumPy array with random numbers. 
Then, perform two different sums, with the same number of operators.\n", 35 | "\n", 36 | "A loop with ```range(0,N,1)``` has elements 0, 1, 2, 3, ..., N-1.\n", 37 | "\n", 38 | "A loop with ```range(0,N*s,s)``` has elements 0, s, 2s, 3s, ..., (N-1)s.\n" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "id": "77f64223", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "N = 1000000\n", 49 | "stride = 187" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "id": "59c4b340-1b0c-45bd-ad9c-8bfef7c547ba", 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "CPU times: user 8.67 ms, sys: 9.03 ms, total: 17.7 ms\n", 63 | "Wall time: 14.9 ms\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "%%time\n", 69 | "a = np.random.rand(N)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "id": "00134d7f", 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "109 ms ± 6.64 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "%%timeit\n", 88 | "\n", 89 | "sum1 = 0.0\n", 90 | "for n in range(0,N,1):\n", 91 | " sum1 += a[n]\n" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 5, 97 | "id": "fb9654ce-7477-476a-98a7-caf7a74db724", 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "CPU times: user 1.09 s, sys: 1.04 s, total: 2.13 s\n", 105 | "Wall time: 2.13 s\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "%%time\n", 111 | "b = np.random.rand(N*stride)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 6, 117 | "id": "15e86634", 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "149 ms ± 4.37 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "%%timeit\n", 130 | "\n", 131 | "sum2 = 0.0\n", 132 | "for n in range(0,N*stride,stride):\n", 133 | " sum2 += b[n]\n" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "ee5dd40c-eafd-4d71-ae92-9beaa206cfe4", 139 | "metadata": {}, 140 | "source": [ 141 | "This experiment shows two algorithms, each with the same number of the same operations: $N$ summations. However, the timing is different. This can only be due to different memory access." 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "id": "52d00beb", 147 | "metadata": {}, 148 | "source": [ 149 | "The efficiency of a NumPy array is different than for a Python list because both store the data in diferent formats." 
150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 7, 155 | "id": "2e1f06de", 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "name": "stdout", 160 | "output_type": "stream", 161 | "text": [ 162 | "CPU times: user 49.8 s, sys: 2.87 s, total: 52.7 s\n", 163 | "Wall time: 52.8 s\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "%%time\n", 169 | "c = [np.random.rand() for n in range(N*stride)]" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 8, 175 | "id": "999c41b5", 176 | "metadata": {}, 177 | "outputs": [ 178 | { 179 | "name": "stdout", 180 | "output_type": "stream", 181 | "text": [ 182 | "158 ms ± 9.64 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 183 | ] 184 | } 185 | ], 186 | "source": [ 187 | "%%timeit\n", 188 | "\n", 189 | "sum3 = 0.0\n", 190 | "for n in range(0,N*stride,stride):\n", 191 | " sum3 += c[n]\n" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "id": "a8efc106-5e43-41fd-8a06-0f235c84d8c4", 197 | "metadata": {}, 198 | "source": [ 199 | "In general, it is almost always more efficient to use Numpy functionality." 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 9, 205 | "id": "18400e83-d422-4c48-9af4-5aa2e6864e8f", 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "337 µs ± 15.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "%%timeit\n", 218 | "\n", 219 | "sum4 = np.sum(a)\n" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 10, 225 | "id": "58ca5191-232a-4780-ad1d-1c1ecd15269e", 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "9.11 ms ± 1.97 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "%%timeit\n", 238 | "\n", 239 | "sum5 = np.sum(b[::stride])\n" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 11, 245 | "id": "a0f83a6c", 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "name": "stdout", 250 | "output_type": "stream", 251 | "text": [ 252 | "153 ms ± 8.51 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 253 | ] 254 | } 255 | ], 256 | "source": [ 257 | "%%timeit\n", 258 | "\n", 259 | "sum6 = np.sum(c[::stride])\n" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "id": "c8ea3c7a", 265 | "metadata": {}, 266 | "source": [ 267 | "## Summing the elements of a multi-dimensional arrays\n", 268 | "\n", 269 | "The elements of a multidimensional arrays are stored in memory as a one-dimensional ordering. Hence, the order of accessing the elements has an impact on the timing. Let us create a 3-dimensional tensor and sum all elements. Again, the different implementations all require exactly the same number of the same operators ($N^3$ summations) but the memory access is different." 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 12, 275 | "id": "54ea1926-8578-420a-8200-b427f16b6802", 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "name": "stdout", 280 | "output_type": "stream", 281 | "text": [ 282 | "Created a random array of shape (250, 250, 250)\n" 283 | ] 284 | } 285 | ], 286 | "source": [ 287 | "N = 250\n", 288 | "\n", 289 | "a = np.random.rand(N,N,N)\n", 290 | "b = np.random.rand(N,N,N)\n", 291 | "c = np.random.rand(N,N,N)\n", 292 | "\n", 293 | "print(\"Created a random array of shape\",a.shape)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 13, 299 | "id": "d71730a7-fda0-470f-8b9f-9727ad8f6408", 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "2.39 s ± 292 ms per loop (mean ± std. 
dev. of 7 runs, 1 loop each)\n" 307 | ] 308 | } 309 | ], 310 | "source": [ 311 | "%%timeit\n", 312 | "\n", 313 | "sum1 = 0.0\n", 314 | "for i in range(N):\n", 315 | " for j in range(N):\n", 316 | " for k in range(N):\n", 317 | " sum1 += a[i,j,k]\n" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 14, 323 | "id": "bce7e19c-979a-4245-9955-b287af5251d5", 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "name": "stdout", 328 | "output_type": "stream", 329 | "text": [ 330 | "3.44 s ± 322 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 331 | ] 332 | } 333 | ], 334 | "source": [ 335 | "%%timeit\n", 336 | "\n", 337 | "sum2 = 0.0\n", 338 | "for k in range(N):\n", 339 | " for j in range(N):\n", 340 | " for i in range(N):\n", 341 | " sum2 += b[i,j,k]\n" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "id": "3aaf6986", 347 | "metadata": {}, 348 | "source": [ 349 | "Instead of using a loop, it is more efficient to use NumPy functions that are based on optimised algorithms and implementations." 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 15, 355 | "id": "aa3d3875-8633-4745-9ed9-46047af87f6f", 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "name": "stdout", 360 | "output_type": "stream", 361 | "text": [ 362 | "8.93 ms ± 108 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" 363 | ] 364 | } 365 | ], 366 | "source": [ 367 | "%%timeit\n", 368 | "\n", 369 | "sum3 = np.sum(c)\n" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "id": "b630997a", 375 | "metadata": {}, 376 | "source": [ 377 | "## Python broadcasting\n", 378 | "\n", 379 | "NumPy has optimised implementations for many algorithms. For example, summing a constant value to all elements in an array can be done with ```a+=1```. Even though the variable ```a``` has size ```(n,)``` and ```1``` is a scalar, the algorithm works. 
NumPy uses *broadcasting* which means that (if possible) the summation is performed for all elements." 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 16, 385 | "id": "40352069", 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "N = 1000000" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 17, 395 | "id": "a9b75dc2", 396 | "metadata": {}, 397 | "outputs": [ 398 | { 399 | "name": "stdout", 400 | "output_type": "stream", 401 | "text": [ 402 | "153 ms ± 2.39 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 403 | ] 404 | } 405 | ], 406 | "source": [ 407 | "%%timeit\n", 408 | "\n", 409 | "a = np.zeros(N)\n", 410 | "for n in range(N):\n", 411 | " a[n] += 1\n" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 18, 417 | "id": "ab78ac69", 418 | "metadata": {}, 419 | "outputs": [ 420 | { 421 | "name": "stdout", 422 | "output_type": "stream", 423 | "text": [ 424 | "837 µs ± 52.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" 425 | ] 426 | } 427 | ], 428 | "source": [ 429 | "%%timeit\n", 430 | "\n", 431 | "b = np.zeros(N)\n", 432 | "b += 1\n" 433 | ] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "id": "62814320", 438 | "metadata": {}, 439 | "source": [ 440 | "## Matrix-matrix multiplication\n", 441 | "\n", 442 | "Perform a matrix-matrix multiplication with different types of memory access." 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 19, 448 | "id": "51ce5ced", 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "n = 500\n", 453 | "\n", 454 | "A = np.random.rand(n,n)\n", 455 | "B = np.random.rand(n,n)" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 20, 461 | "id": "c9550d0d", 462 | "metadata": {}, 463 | "outputs": [ 464 | { 465 | "name": "stdout", 466 | "output_type": "stream", 467 | "text": [ 468 | "355 ms ± 22.4 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" 469 | ] 470 | } 471 | ], 472 | "source": [ 473 | "%%timeit\n", 474 | "\n", 475 | "C1 = np.zeros((n,n))\n", 476 | "for i in range(n):\n", 477 | " for j in range(n):\n", 478 | " C1[i,j] = np.dot(A[i,:],B[:,j])\n" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 21, 484 | "id": "f4e3eca4", 485 | "metadata": {}, 486 | "outputs": [ 487 | { 488 | "name": "stdout", 489 | "output_type": "stream", 490 | "text": [ 491 | "727 ms ± 28.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 492 | ] 493 | } 494 | ], 495 | "source": [ 496 | "%%timeit\n", 497 | "\n", 498 | "C2 = np.zeros((n,n))\n", 499 | "for j in range(n):\n", 500 | " for k in range(n):\n", 501 | " C2[:,j] += A[:,k]*B[k,j]\n" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 22, 507 | "id": "6e47fe63", 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "name": "stdout", 512 | "output_type": "stream", 513 | "text": [ 514 | "539 ms ± 16.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 515 | ] 516 | } 517 | ], 518 | "source": [ 519 | "%%timeit\n", 520 | "\n", 521 | "C3 = np.zeros((n,n))\n", 522 | "for i in range(n):\n", 523 | " for k in range(n):\n", 524 | " C3[i,:] += A[i,k]*B[k,:]\n" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 23, 530 | "id": "1c9b77fa", 531 | "metadata": {}, 532 | "outputs": [ 533 | { 534 | "name": "stdout", 535 | "output_type": "stream", 536 | "text": [ 537 | "3.28 ms ± 668 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" 538 | ] 539 | } 540 | ], 541 | "source": [ 542 | "%%timeit\n", 543 | "\n", 544 | "C4 = A @ B\n" 545 | ] 546 | }, 547 | { 548 | "cell_type": "markdown", 549 | "id": "cf893695", 550 | "metadata": {}, 551 | "source": [ 552 | "Consider a larger matrix size." 
553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": 24, 558 | "id": "5b40b11a", 559 | "metadata": {}, 560 | "outputs": [ 561 | { 562 | "name": "stdout", 563 | "output_type": "stream", 564 | "text": [ 565 | "Finished the loops with order (i,j,*) in 15.621845722198486 seconds.\n", 566 | "Finished the loops with order (j,k,*) in 58.30747675895691 seconds.\n", 567 | "Finished the loops with order (i,k,*) in 15.23149061203003 seconds.\n", 568 | "Finished the Numpy algorithm in 0.11636805534362793 seconds.\n" 569 | ] 570 | } 571 | ], 572 | "source": [ 573 | "import time\n", 574 | "\n", 575 | "n = 2000\n", 576 | "A = np.random.rand(n,n)\n", 577 | "B = np.random.rand(n,n)\n", 578 | "\n", 579 | "\n", 580 | "C1 = np.zeros((n,n))\n", 581 | "time_start = time.time()\n", 582 | "for i in range(n):\n", 583 | " for j in range(n):\n", 584 | " C1[i,j] = np.dot(A[i,:],B[:,j])\n", 585 | "time_end = time.time()\n", 586 | "print(\"Finished the loops with order (i,j,*) in\",time_end-time_start,\"seconds.\")\n", 587 | "\n", 588 | "C2 = np.zeros((n,n))\n", 589 | "time_start = time.time()\n", 590 | "for j in range(n):\n", 591 | " for k in range(n):\n", 592 | " C2[:,j] += A[:,k]*B[k,j]\n", 593 | "time_end = time.time()\n", 594 | "print(\"Finished the loops with order (j,k,*) in\",time_end-time_start,\"seconds.\")\n", 595 | "\n", 596 | "C3 = np.zeros((n,n))\n", 597 | "time_start = time.time()\n", 598 | "for i in range(n):\n", 599 | " for k in range(n):\n", 600 | " C3[i,:] += A[i,k]*B[k,:]\n", 601 | "time_end = time.time()\n", 602 | "print(\"Finished the loops with order (i,k,*) in\",time_end-time_start,\"seconds.\")\n", 603 | "\n", 604 | "C4 = np.zeros((n,n))\n", 605 | "time_start = time.time()\n", 606 | "C4 = A @ B\n", 607 | "time_end = time.time()\n", 608 | "print(\"Finished the Numpy algorithm in\",time_end-time_start,\"seconds.\")\n" 609 | ] 610 | }, 611 | { 612 | "cell_type": "markdown", 613 | "id": "f3d0579c-e1b8-4cdc-89db-d018c4c9127b", 614 | "metadata": { 615 
| "editable": true, 616 | "slideshow": { 617 | "slide_type": "" 618 | }, 619 | "tags": [] 620 | }, 621 | "source": [ 622 | "## Apending data\n", 623 | "\n", 624 | "Appending data to a Python list is efficient, but Numpy appends arrays by copying the data." 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 25, 630 | "id": "c4a5c1cd-c4cc-4174-8277-d03bcac1f093", 631 | "metadata": {}, 632 | "outputs": [], 633 | "source": [ 634 | "n = 100000" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 26, 640 | "id": "fe12194e-0116-4f0a-bfc2-96321644ba67", 641 | "metadata": {}, 642 | "outputs": [ 643 | { 644 | "name": "stdout", 645 | "output_type": "stream", 646 | "text": [ 647 | "3.57 ms ± 178 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" 648 | ] 649 | } 650 | ], 651 | "source": [ 652 | "%%timeit\n", 653 | "\n", 654 | "my_list = []\n", 655 | "\n", 656 | "for i in range(n):\n", 657 | " my_list.append(i)" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": 27, 663 | "id": "d88801e1-4a1a-48e6-b775-68ce1e0998e1", 664 | "metadata": {}, 665 | "outputs": [ 666 | { 667 | "name": "stdout", 668 | "output_type": "stream", 669 | "text": [ 670 | "1.66 s ± 22.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 671 | ] 672 | } 673 | ], 674 | "source": [ 675 | "%%timeit\n", 676 | "\n", 677 | "my_array1 = np.array([], dtype=int)\n", 678 | "\n", 679 | "for i in range(n):\n", 680 | " my_array1 = np.append(my_array1, i)" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": 28, 686 | "id": "4f8c9515-c166-4c68-aea0-84d8dd935671", 687 | "metadata": {}, 688 | "outputs": [ 689 | { 690 | "name": "stdout", 691 | "output_type": "stream", 692 | "text": [ 693 | "5.76 ms ± 187 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)\n" 694 | ] 695 | } 696 | ], 697 | "source": [ 698 | "%%timeit\n", 699 | "\n", 700 | "my_array2 = np.empty(n, dtype=int)\n", 701 | "\n", 702 | "for i in range(n):\n", 703 | " my_array2[i] = i" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": null, 709 | "id": "2b7f549d-49b4-4272-9801-96339d2b63ef", 710 | "metadata": {}, 711 | "outputs": [], 712 | "source": [] 713 | } 714 | ], 715 | "metadata": { 716 | "kernelspec": { 717 | "display_name": "Python 3 (ipykernel)", 718 | "language": "python", 719 | "name": "python3" 720 | }, 721 | "language_info": { 722 | "codemirror_mode": { 723 | "name": "ipython", 724 | "version": 3 725 | }, 726 | "file_extension": ".py", 727 | "mimetype": "text/x-python", 728 | "name": "python", 729 | "nbconvert_exporter": "python", 730 | "pygments_lexer": "ipython3", 731 | "version": "3.10.6" 732 | } 733 | }, 734 | "nbformat": 4, 735 | "nbformat_minor": 5 736 | } 737 | -------------------------------------------------------------------------------- /AY09/06_reduction_padding.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Calculating a vector sum in PyOpenCL\n", 7 | "\n", 8 | "Elwin van 't Wout\n", 9 | "\n", 10 | "PUC Chile\n", 11 | "\n", 12 | "25-9-2024" 13 | ], 14 | "metadata": { 15 | "id": "TG2oWDkjHT8D" 16 | } 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "KXKZENUruGaq" 22 | }, 23 | "source": [ 24 | "Calculate the sum of a vector with OpenCL.\n", 25 | "\n", 26 | "First, we need to configure the virtual machine and install PyOpenCL." 
27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 1, 32 | "metadata": { 33 | "colab": { 34 | "base_uri": "https://localhost:8080/" 35 | }, 36 | "id": "8lHkgpGAuBEn", 37 | "outputId": "715036bd-3fb0-416f-94a0-ddec0a123434" 38 | }, 39 | "outputs": [ 40 | { 41 | "output_type": "stream", 42 | "name": "stdout", 43 | "text": [ 44 | "\u001b[33m\r0% [Working]\u001b[0m\r \rHit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease\n", 45 | "Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 InRelease\n", 46 | "Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease\n", 47 | "Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease\n", 48 | "Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease\n", 49 | "Hit:6 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease\n", 50 | "Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease\n", 51 | "Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease\n", 52 | "Ign:9 https://r2u.stat.illinois.edu/ubuntu jammy InRelease\n", 53 | "Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease\n", 54 | "Hit:11 https://r2u.stat.illinois.edu/ubuntu jammy Release\n", 55 | "Reading package lists... Done\n", 56 | "Building dependency tree... Done\n", 57 | "Reading state information... Done\n", 58 | "54 packages can be upgraded. Run 'apt list --upgradable' to see them.\n", 59 | "\u001b[1;33mW: \u001b[0mSkipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)\u001b[0m\n", 60 | "Reading package lists... Done\n", 61 | "Building dependency tree... Done\n", 62 | "Reading state information... 
Done\n", 63 | "nvidia-cuda-toolkit is already the newest version (11.5.1-1ubuntu1).\n", 64 | "0 upgraded, 0 newly installed, 0 to remove and 54 not upgraded.\n", 65 | "Requirement already satisfied: pyopencl in /usr/local/lib/python3.10/dist-packages (2024.2.7)\n", 66 | "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from pyopencl) (1.26.4)\n", 67 | "Requirement already satisfied: platformdirs>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from pyopencl) (4.3.6)\n", 68 | "Requirement already satisfied: pytools>=2024.1.5 in /usr/local/lib/python3.10/dist-packages (from pyopencl) (2024.1.14)\n", 69 | "Requirement already satisfied: typing-extensions>=4 in /usr/local/lib/python3.10/dist-packages (from pytools>=2024.1.5->pyopencl) (4.12.2)\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "!sudo apt update\n", 75 | "!sudo apt install nvidia-cuda-toolkit -y\n", 76 | "!pip install pyopencl" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 2, 82 | "metadata": { 83 | "id": "lgP6978wudGd", 84 | "colab": { 85 | "base_uri": "https://localhost:8080/" 86 | }, 87 | "outputId": "52b475ee-2f4b-4f20-c30e-f433d2b657eb" 88 | }, 89 | "outputs": [ 90 | { 91 | "output_type": "stream", 92 | "name": "stderr", 93 | "text": [ 94 | "/usr/local/lib/python3.10/dist-packages/pytools/persistent_dict.py:63: RecommendedHashNotFoundWarning: Unable to import recommended hash 'siphash24.siphash13', falling back to 'hashlib.sha256'. 
Run 'python3 -m pip install siphash24' to install the recommended hash.\n", 95 | " warn(\"Unable to import recommended hash 'siphash24.siphash13', \"\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "import numpy as np\n", 101 | "import pyopencl as cl\n", 102 | "import pyopencl.array as cl_array" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 3, 108 | "metadata": { 109 | "id": "JCIaRG2KufBQ" 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "ctx = cl.create_some_context()\n", 114 | "queue = cl.CommandQueue(ctx)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 4, 120 | "metadata": { 121 | "colab": { 122 | "base_uri": "https://localhost:8080/" 123 | }, 124 | "id": "i4jmqmb-j_B7", 125 | "outputId": "b63bcb4e-1f54-449b-9c83-c7c4c5972049" 126 | }, 127 | "outputs": [ 128 | { 129 | "output_type": "stream", 130 | "name": "stdout", 131 | "text": [ 132 | "Platform name: NVIDIA CUDA\n", 133 | "Device name: Tesla T4\n", 134 | "Maximum work group size: 1024\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "platform = cl.get_platforms()[0]\n", 140 | "device = ctx.devices[0]\n", 141 | "print(\"Platform name:\", platform.name)\n", 142 | "print(\"Device name:\", device.name)\n", 143 | "print(\"Maximum work group size:\", device.max_work_group_size)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": { 149 | "id": "_pv020cIuoWa" 150 | }, 151 | "source": [ 152 | "In this tutorial, we like to calculate the sum of all elements in a vector of arbitrary size. In general, the size of the vector may not be a multiple of the desired workgroup size. In that case, the algorithm needs to be adapted to facilitate arbitrary workgroup and vector sizes. One option is called 'padding'.\n", 153 | "\n", 154 | "Let us first create the kernel to calculate the sum of an integer array. See tutorial 5 for an explanation on the kernel." 
155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 5, 160 | "metadata": { 161 | "id": "UtjvaSG-urrG" 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "kernel = \"\"\"\n", 166 | "__kernel void sum1(__global const long int *vec,\n", 167 | " __global long int *partial_sums)\n", 168 | "{\n", 169 | " int group_size = get_local_size(0);\n", 170 | " int local_id = get_local_id(0);\n", 171 | " int group_id = get_group_id(0);\n", 172 | " int global_id = get_global_id(0);\n", 173 | "\n", 174 | " if (local_id == 0){\n", 175 | " long int sum = 0;\n", 176 | " for(int i = 0; i < group_size; i++){\n", 177 | " sum += vec[global_id + i];\n", 178 | " }\n", 179 | " partial_sums[group_id] = sum;\n", 180 | " }\n", 181 | "}\n", 182 | "__kernel void sum2(__global long int *vec,\n", 183 | " __global long int *partial_sums)\n", 184 | "{\n", 185 | " int group_size = get_local_size(0);\n", 186 | " int local_id = get_local_id(0);\n", 187 | " int group_id = get_group_id(0);\n", 188 | " int global_id = get_global_id(0);\n", 189 | "\n", 190 | " int step = 2;\n", 191 | " while (step <= group_size) {\n", 192 | " if (local_id % step == 0) {\n", 193 | " vec[global_id] += vec[global_id + step / 2];\n", 194 | " }\n", 195 | " barrier(CLK_GLOBAL_MEM_FENCE);\n", 196 | " step *= 2;\n", 197 | " }\n", 198 | " if (local_id == 0){\n", 199 | " partial_sums[group_id] = vec[global_id];\n", 200 | " }\n", 201 | "}\n", 202 | "\"\"\"" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 6, 208 | "metadata": { 209 | "id": "RKRY4_iuwy32" 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "prg = cl.Program(ctx, kernel).build()" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "source": [ 219 | "The idea of 'padding' is to add dummy elements to the vector that will not change the final result. 
For example, if we'd like to calculate the sum of a vector, one can add an arbitrary number of elements with value zero, without changing the final result.\n", 220 | "\n", 221 | "Let us assume that we have a vector of dimension $d$ and a workgroup of size $s$. PyOpenCL needs a domain with workgroups of equal size. However, $d$ may not be a multiple of $s$. Hence, we create another vector with size $n$ such that $n \\geq d$ and $n \\mod d = 0$, that is, $n$ is a multiple of $d$. The following function provides an efficient routine to calculate the next multiple." 222 | ], 223 | "metadata": { 224 | "id": "W5PmWsU-CYXg" 225 | } 226 | }, 227 | { 228 | "cell_type": "code", 229 | "source": [ 230 | "def next_multiple(val, mul):\n", 231 | " \"\"\"Return the smallest value which is larger or equal to 'val' and a multiple of 'mul'.\"\"\"\n", 232 | " return -(-val // mul) * mul" 233 | ], 234 | "metadata": { 235 | "id": "ska0rPVSDd4o" 236 | }, 237 | "execution_count": 7, 238 | "outputs": [] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "source": [ 243 | "For example, if we have a vector of size 100 and like to use workgroup sizes of 32, we need 4 groups to cover the dimension. Notice that $4 \\cdot 32 = 128$ is the next multiple." 244 | ], 245 | "metadata": { 246 | "id": "e1jxH9G1DtJ2" 247 | } 248 | }, 249 | { 250 | "cell_type": "code", 251 | "source": [ 252 | "print(\"The next multiple of 32 larger or equal to 100 is: \", next_multiple(100, 32))" 253 | ], 254 | "metadata": { 255 | "colab": { 256 | "base_uri": "https://localhost:8080/" 257 | }, 258 | "id": "mfoOdfRsD8pa", 259 | "outputId": "addc1ad9-fb42-4c58-90f8-6013a17276cf" 260 | }, 261 | "execution_count": 8, 262 | "outputs": [ 263 | { 264 | "output_type": "stream", 265 | "name": "stdout", 266 | "text": [ 267 | "The next multiple of 32 larger or equal to 100 is: 128\n" 268 | ] 269 | } 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "source": [ 275 | "Let us choose a vector size and a workgroup size." 
276 | ], 277 | "metadata": { 278 | "id": "0m4tuU-bDjsM" 279 | } 280 | }, 281 | { 282 | "cell_type": "code", 283 | "source": [ 284 | "vector_size = 10000\n", 285 | "workgroup_size = 32" 286 | ], 287 | "metadata": { 288 | "id": "ml3YzUw5EO6z" 289 | }, 290 | "execution_count": 9, 291 | "outputs": [] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "source": [ 296 | "Let us calculate the next multiple of the workgroup size larger or equal to the vector dimension. This will be the size of our thread domain." 297 | ], 298 | "metadata": { 299 | "id": "eiriH6agEc7V" 300 | } 301 | }, 302 | { 303 | "cell_type": "code", 304 | "source": [ 305 | "global_size = next_multiple(vector_size, workgroup_size)\n", 306 | "n_workgroups = global_size // workgroup_size\n", 307 | "print(\"The global size of the domain is:\", global_size)\n", 308 | "print(\"The number of workgroups is:\", n_workgroups)" 309 | ], 310 | "metadata": { 311 | "colab": { 312 | "base_uri": "https://localhost:8080/" 313 | }, 314 | "id": "a7SE3GteEk2Z", 315 | "outputId": "533fd4a9-7efc-47c8-8bc0-b75303c4139d" 316 | }, 317 | "execution_count": 10, 318 | "outputs": [ 319 | { 320 | "output_type": "stream", 321 | "name": "stdout", 322 | "text": [ 323 | "The global size of the domain is: 10016\n", 324 | "The number of workgroups is: 313\n" 325 | ] 326 | } 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "source": [ 332 | "Let us create a vector with $n$ values from zero to $n-1$ for which we'd like to calculate the sum of its elements." 
333 | ], 334 | "metadata": { 335 | "id": "YX1T_i1XFMf4" 336 | } 337 | }, 338 | { 339 | "cell_type": "code", 340 | "source": [ 341 | "my_vector = np.arange(vector_size, dtype=np.int64)" 342 | ], 343 | "metadata": { 344 | "id": "KKgRPIcBFVp1" 345 | }, 346 | "execution_count": 11, 347 | "outputs": [] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "source": [ 352 | "The essential step of the 'padding' approach is to create another vector which will be given to the PyOpenCL kernel. That is, we need to add additional zero elements to fill the vector up until reaching the desired size. Remember that appending zero elements will not change the objective: calculating the sum of the vector." 353 | ], 354 | "metadata": { 355 | "id": "WgpRE1MbFhNe" 356 | } 357 | }, 358 | { 359 | "cell_type": "code", 360 | "source": [ 361 | "padding = np.zeros(global_size - vector_size, dtype=np.int64)\n", 362 | "np_vector = np.concatenate((my_vector, padding))\n", 363 | "print(\"The size of the padded vector:\", np_vector.size)\n", 364 | "print(\"The elements of the last workgroup:\", np_vector[-workgroup_size:])" 365 | ], 366 | "metadata": { 367 | "colab": { 368 | "base_uri": "https://localhost:8080/" 369 | }, 370 | "id": "C_X7AJ6JF0rP", 371 | "outputId": "6cb308d0-1e84-42ce-af9c-8b19a4e5c987" 372 | }, 373 | "execution_count": 12, 374 | "outputs": [ 375 | { 376 | "output_type": "stream", 377 | "name": "stdout", 378 | "text": [ 379 | "The size of the padded vector: 10016\n", 380 | "The elements of the last workgroup: [9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997\n", 381 | " 9998 9999 0 0 0 0 0 0 0 0 0 0 0 0\n", 382 | " 0 0 0 0]\n" 383 | ] 384 | } 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "source": [ 390 | "print(\"Sum of the original vector:\", np.sum(my_vector))\n", 391 | "print(\"Sum of the padded vector:\", np.sum(np_vector))" 392 | ], 393 | "metadata": { 394 | "colab": { 395 | "base_uri": "https://localhost:8080/" 396 | }, 397 | "id": 
"BcB2pNQHJJB8", 398 | "outputId": "db5f089c-8386-49c3-a3be-d989ec85faa4" 399 | }, 400 | "execution_count": 13, 401 | "outputs": [ 402 | { 403 | "output_type": "stream", 404 | "name": "stdout", 405 | "text": [ 406 | "Sum of the original vector: 49995000\n", 407 | "Sum of the padded vector: 49995000\n" 408 | ] 409 | } 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "source": [ 415 | "We can indeed see that the new vector has zero elements in the final workgroup. These are the padded values. Now, we are ready to launch the kernel for the padded vector. Notice that we need to provide the global size of the domain, not the dimension of the vector to the program." 416 | ], 417 | "metadata": { 418 | "id": "M5ZOmxBbGay0" 419 | } 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 14, 424 | "metadata": { 425 | "id": "C-ACLmhhyDbA" 426 | }, 427 | "outputs": [], 428 | "source": [ 429 | "cl_vector = cl_array.to_device(queue, np_vector)\n", 430 | "\n", 431 | "cl_partial_sums1 = cl_array.empty(queue, n_workgroups, dtype=np.int64)\n", 432 | "cl_partial_sums2 = cl_array.empty(queue, n_workgroups, dtype=np.int64)" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 15, 438 | "metadata": { 439 | "id": "8Tl44a3rK9pK" 440 | }, 441 | "outputs": [], 442 | "source": [ 443 | "event = prg.sum1(queue,\n", 444 | " (global_size,),\n", 445 | " (workgroup_size,),\n", 446 | " cl_vector.data,\n", 447 | " cl_partial_sums1.data\n", 448 | " )" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 16, 454 | "metadata": { 455 | "id": "zOgdjPGQw05a" 456 | }, 457 | "outputs": [], 458 | "source": [ 459 | "event = prg.sum2(queue,\n", 460 | " (global_size,),\n", 461 | " (workgroup_size,),\n", 462 | " cl_vector.data,\n", 463 | " cl_partial_sums2.data\n", 464 | " )" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 17, 470 | "metadata": { 471 | "id": "fZz_oO8L3QRN" 472 | }, 473 | "outputs": [], 474 | 
"source": [ 475 | "np_partial_sums1 = cl_partial_sums1.get()\n", 476 | "vector_sum1 = np.sum(np_partial_sums1)\n", 477 | "np_partial_sums2 = cl_partial_sums2.get()\n", 478 | "vector_sum2 = np.sum(np_partial_sums2)" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 18, 484 | "metadata": { 485 | "colab": { 486 | "base_uri": "https://localhost:8080/" 487 | }, 488 | "id": "Lh1Y_6560TxT", 489 | "outputId": "4eb90b66-a1b0-491a-844b-02521c4a8d7d" 490 | }, 491 | "outputs": [ 492 | { 493 | "output_type": "stream", 494 | "name": "stdout", 495 | "text": [ 496 | "The sum calculated by OpenCL: 49995000\n", 497 | "The sum calculated by OpenCL: 49995000\n", 498 | "The exact value of the sum: 49995000\n" 499 | ] 500 | } 501 | ], 502 | "source": [ 503 | "print(\"The sum calculated by OpenCL:\", vector_sum1)\n", 504 | "print(\"The sum calculated by OpenCL:\", vector_sum2)\n", 505 | "print(\"The exact value of the sum: \", vector_size*(vector_size-1)//2)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "markdown", 510 | "source": [ 511 | "The exact value of summing values ranging from 0 to $n-1$ is explicitly known: $n(n-1)/2$. The results implemented with OpenCL are correct, indeed." 512 | ], 513 | "metadata": { 514 | "id": "ZPYQyfdBIABF" 515 | } 516 | } 517 | ], 518 | "metadata": { 519 | "accelerator": "GPU", 520 | "colab": { 521 | "provenance": [] 522 | }, 523 | "kernelspec": { 524 | "display_name": "Python 3 (ipykernel)", 525 | "language": "python", 526 | "name": "python3" 527 | }, 528 | "language_info": { 529 | "codemirror_mode": { 530 | "name": "ipython", 531 | "version": 3 532 | }, 533 | "file_extension": ".py", 534 | "mimetype": "text/x-python", 535 | "name": "python", 536 | "nbconvert_exporter": "python", 537 | "pygments_lexer": "ipython3", 538 | "version": "3.10.5" 539 | } 540 | }, 541 | "nbformat": 4, 542 | "nbformat_minor": 0 543 | } --------------------------------------------------------------------------------