├── CODE ├── Makefile ├── coarray.F90 ├── do-concurrent.F90 ├── mpi-abi │ ├── Makefile │ ├── check.F90 │ ├── check.c │ ├── handle.F90 │ └── handle.c ├── numerot.F90 ├── openmp.F90 └── sequential.F90 ├── Career_Trajectory.md ├── ChatGPT.md ├── Dealing_with_imperfect_Fortran_compilers.md ├── Dealing_with_imperfect_Fortran_compilers_2.md ├── Dell_Inspiron5000_Linux.md ├── Empathy.md ├── Finnish_Nonalcoholic_Beer.md ├── Finnish_is_cool.md ├── Finnish_language.md ├── Fortrans_Missing_Parallelism.md ├── GitHub_Universe.md ├── HPC_Numerical_Precision.md ├── In_Defense_Of_Fortran.md ├── Learning_Programming_Languages.md ├── MPI_Needs_ABI.md ├── MPI_Needs_ABI_Part_2.md ├── MPI_Needs_ABI_Part_3.md ├── MPI_Needs_ABI_Part_4.md ├── MPI_Needs_ABI_Part_5.md ├── MPI_Needs_ABI_Part_6.md ├── MPI_Needs_ABI_Part_7.md ├── MPI_RMA_HOWTO.md ├── Mentoring_Interns.md ├── README.md ├── Reading_list.md ├── Summer_of_2011.md ├── Turing_Pi_v1.md ├── Why_Finland.md └── Winning_by_laziness.md /CODE/Makefile: -------------------------------------------------------------------------------- 1 | FC = gfortran 2 | FFLAGS = -Wall -O2 -fopenmp -fcoarray=single 3 | 4 | all: sequential.x coarray.x do-concurrent.x openmp.x 5 | 6 | numerot.o: numerot.F90 7 | $(FC) $(FFLAGS) -c $< -o $@ 8 | 9 | %.x: %.F90 numerot.o 10 | $(FC) $(FFLAGS) numerot.o $< -o $@ 11 | 12 | clean: 13 | -rm -f *.x 14 | -rm -f *.o 15 | -rm -f *.mod 16 | -------------------------------------------------------------------------------- /CODE/coarray.F90: -------------------------------------------------------------------------------- 1 | program main 2 | use numerot 3 | implicit none 4 | real :: A(100), B(100), C(100) 5 | real :: R 6 | 7 | A = 1 8 | B = 1 9 | C = 1 10 | 11 | if (num_images().ne.3) STOP 12 | 13 | if (this_image().eq.1) R = yksi(A) 14 | if (this_image().eq.2) R = kaksi(A) 15 | if (this_image().eq.3) R = kolme(A) 16 | 17 | SYNC ALL() 18 | 19 | call co_sum(R) 20 | if (this_image().eq.1) print*,R 21 | end program main 22 | 
-------------------------------------------------------------------------------- /CODE/do-concurrent.F90: -------------------------------------------------------------------------------- 1 | program main 2 | use numerot 3 | implicit none 4 | real :: A(100), B(100), C(100) 5 | real :: RA, RB, RC 6 | integer :: k 7 | 8 | A = 1 9 | B = 1 10 | C = 1 11 | 12 | do concurrent (k=1:3) 13 | 14 | if (k.eq.1) RA = yksi(A) 15 | if (k.eq.2) RB = kaksi(B) 16 | if (k.eq.3) RC = kolme(C) 17 | 18 | end do 19 | 20 | print*,RA+RB+RC 21 | end program main 22 | -------------------------------------------------------------------------------- /CODE/mpi-abi/Makefile: -------------------------------------------------------------------------------- 1 | FC = gfortran 2 | FCFLAGS = -g -std=f2008 -fall-intrinsics -Wall -Wextra -Werror 3 | 4 | CC = gcc 5 | CFLAGS = -g -std=c11 -Wall -Wextra -Werror 6 | 7 | all: handle_f.x handle_c.x check_f.x check_c.x 8 | 9 | %_f.x: %.F90 10 | $(FC) $(FCFLAGS) $< -o $@ 11 | 12 | %_c.x: %.c 13 | $(CC) $(CFLAGS) $< -o $@ 14 | 15 | clean: 16 | -rm -f *.x *.o *.mod 17 | -rm -fr *.dSYM 18 | -------------------------------------------------------------------------------- /CODE/mpi-abi/check.F90: -------------------------------------------------------------------------------- 1 | module m 2 | use iso_c_binding, only: c_intptr_t 3 | type, bind(C) :: handle 4 | integer(kind=c_intptr_t) :: val 5 | end type handle 6 | 7 | end module m 8 | 9 | program p 10 | use m 11 | implicit none 12 | type(handle) :: h 13 | print*,'LOC: ',LOC(h),LOC(h%val) 14 | end program p 15 | -------------------------------------------------------------------------------- /CODE/mpi-abi/check.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | typedef struct { 5 | intptr_t val; 6 | } handle1; 7 | 8 | typedef struct { 9 | intptr_t val; 10 | } handle2; 11 | 12 | void foo(handle1 h1, handle2 * h2) 13 | { 14 | h2->val = h1.val; 15 | } 16 
| 17 | int main(void) 18 | { 19 | handle1 h1; 20 | handle2 h2; 21 | foo(h1,&h2); 22 | // THIS SHOULD FAIL 23 | foo(h2,&h1); 24 | return 0; 25 | } 26 | -------------------------------------------------------------------------------- /CODE/mpi-abi/handle.F90: -------------------------------------------------------------------------------- 1 | module m 2 | use iso_c_binding, only: c_intptr_t 3 | type, bind(C) :: handle 4 | integer(kind=c_intptr_t) :: val 5 | end type handle 6 | 7 | end module m 8 | 9 | program p 10 | use m 11 | implicit none 12 | type(handle) :: h 13 | print*,'LOC: ',LOC(h),LOC(h%val) 14 | end program p 15 | -------------------------------------------------------------------------------- /CODE/mpi-abi/handle.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | typedef struct { 5 | intptr_t val; 6 | } handle; 7 | 8 | int main(void) 9 | { 10 | handle h; 11 | printf("&h=%p &h[0]=%p\n",&h,&(h.val)); 12 | return 0; 13 | } 14 | -------------------------------------------------------------------------------- /CODE/numerot.F90: -------------------------------------------------------------------------------- 1 | module numerot 2 | contains 3 | pure real function yksi(X) 4 | implicit none 5 | real, intent(in) :: X(100) 6 | !real, intent(out) :: R 7 | yksi = norm2(X) 8 | end function yksi 9 | pure real function kaksi(X) 10 | implicit none 11 | real, intent(in) :: X(100) 12 | kaksi = 2*norm2(X) 13 | end function kaksi 14 | pure real function kolme(X) 15 | implicit none 16 | real, intent(in) :: X(100) 17 | kolme = 3*norm2(X) 18 | end function kolme 19 | end module numerot 20 | -------------------------------------------------------------------------------- /CODE/openmp.F90: -------------------------------------------------------------------------------- 1 | program main 2 | use numerot 3 | implicit none 4 | real :: A(100), B(100), C(100) 5 | real :: RA, RB, RC 6 | 7 | A = 1 8 | B = 1 9 | C = 1 10 | 
11 | !$omp parallel 12 | !$omp master 13 | 14 | !$omp task 15 | RA = yksi(A) 16 | !$omp end task 17 | 18 | !$omp task 19 | RB = kaksi(B) 20 | !$omp end task 21 | 22 | !$omp task 23 | RC = kolme(C) 24 | !$omp end task 25 | 26 | !$omp end master 27 | !$omp end parallel 28 | 29 | print*,RA+RB+RC 30 | end program main 31 | -------------------------------------------------------------------------------- /CODE/sequential.F90: -------------------------------------------------------------------------------- 1 | program main 2 | use numerot 3 | implicit none 4 | real :: A(100), B(100), C(100) 5 | real :: RA, RB, RC 6 | 7 | A = 1 8 | B = 1 9 | C = 1 10 | 11 | RA = yksi(A) 12 | RB = kaksi(B) 13 | RC = kolme(C) 14 | 15 | print*,RA+RB+RC 16 | end program main 17 | -------------------------------------------------------------------------------- /Career_Trajectory.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | I started out in organic chemistry and ended up making supercomputers at Intel and NVIDIA. 4 | 5 | Since people often ask me about how this happened, or to speak about career paths in general, I figure I should write it down. 6 | 7 | # The beginning 8 | 9 | I was fascinated by quantum mechanics from a young age, probably because it was elusive, not because I had a particular need for it. I remember reading books on heavy-element chemistry and string theory in high school, in those books written by popularizers that hide all the math. This was most useful in chemistry class, mostly because my boredom often led to mischief, and a chemistry lab is a very bad place to conduct mischief (my high school was old and had actual uranium in storage that nobody knew how to dispose of without shutting down the school). 10 | 11 | My career plan at that time was to become a lawyer, because I was under the foolish impression that this was a good way to get rich arguing with people. 
However, I did really well in chemistry and math classes, so I kept taking them, because they were often an easy way to get my grades up. However, it became clear in college that my math skills were more suited for a chemistry major than a math major, which aligned with my disinterest in not learning German, French and/or Russian (as was prescribed for PhDs in math). 12 | 13 | # University of Washington 14 | 15 | I became a chemistry major because of a University of Washington Professor named [Nic Epiotis](https://www.legacy.com/us/obituaries/seattletimes/name/nicholas-epiotis-obituary?pid=175420056). Epiotis was a character: he told off-topic stories during lectures and disparaged organic chemistry traditions like "actually reading the textbook" and "memorizing things". I recall one day he held up a 1000-page [organic chemistry textbook](https://www.wiley.com/en-us/Organic+Chemistry%2C+12th+Edition-p-9781118875766), pinched off about 20 pages and declared, "if you read and understand this section, you can get an A in this class - the rest of the book doesn't matter." As an inherently lazy person, this was my kind of class. I feel in love with the quantum mechanical aspects of organic chemistry, which explain, among other things, why carrots are orange (see e.g. [this page](https://www.maplesoft.com/support/help/Maple/view.aspx?path=QuantumChemistry/ParticleInBox-Dyes) to understand the details). 16 | 17 | Later in my undergraduate years, I was drawn into computational chemistry research thanks to Professors Wes Borden and Bill Reinhardt, along with Eric Brown, who was a rather charismatic T.A. and my research mentor in Wes' group. Eric filled me with all sorts of ideas, essentially all of which have turned out to be useful. Eric discouraged me from taking any classes in computer science because, "why do you want to learn Java?" and encouraged me to go deep into the theoretical and computational aspects of chemistry, which I did in grad school. 
At the time, I was proficient in running Gaussian, and dabbled in Mathematica and Matlab, but I couldn't have compiled a C or Fortran program if my life depended on it. I really wanted to undestand how Gaussian produced results and what sort of math was behind B3LYP and MP2, for example. 18 | 19 | # University of Chicago 20 | 21 | I went to the University of Chicago (UC) for grad school because (1) the quality of the faculty, (2) its reputation as an intensely academic place, and (3) the cost-of-living was a lot lower than at Berkeley, which matters when you're going to make $20K/year. The first half of grad school was devoted to intensely theoretical topics with rudimentary computation, primarily done in Matlab. I attended (sometimes for credit) a lot of classes outside the chemistry department to broaden my horizons, including my first proper computing courses, a course on random number generators, and a few weeks of intro to economics with Steve Levitt (the Freakonomics guy). UC was as erudite as I had been promised and I had a wonderful social life full of profoundly nerds and geeks. I also met my wife, who was a different type of nerd than I was. 22 | 23 | In my first year of grad school, I applied to fellowships, including the Department of Energy (DOE) Computational Science Graduate Fellowship (henceforth "CSGF"). Applying for this fellowship requires non-trivial essays about computational science, which prompted me to think a lot about big computers and software in ways that I hadn't before. Unfortunately, I did not get the fellowship the first time I applied. Fortunately, in those days, it was possible to apply as a second-year grad student, which I did successfully. At the time, I was only a Matlab programmer (and a bad one at that), but I had dreams of running semidefinite programming algorithms on Blue Gene/L, which I assume appealed to the selection committee more than my grades. 
24 | 25 | # DOE-CSGF and PNNL 26 | 27 | One of the novel features of the CSGF is that it requires the student to do a "practicum" (internship) at a DOE lab. I focused on Argonne National Lab (Argonne) and Pacific Northwest National Lab (PNNL), for both technical and geographic reasons. I ended up at PNNL working on NWChem because they didn't care that I had no relevant programming experience. As some of the details of my practicum are found in [Deixis](https://www.krellinst.org/doecsgf/docs/deixis/deixis2007.pdf), I won't repeat them here. 28 | 29 | Working at PNNL in the summer of 2006 changed my life and is the reason I've been able to have a career in computing (I elaborate on this [here](https://www.youtube.com/watch?v=2wTZz-6eaSQ&ab_channel=KrellInstitute)). Tim Carlson patiently taught me how to administer a Linux system, Jochen Autschbach and Bert de Jong got me started with Fortran (e.g. "start typing at column seven"), Dunyou Wang taught me about version control and regression testing, and Karol Kowalski taught me quantum many-body theory and putting everything together in NWChem. Karol became my research supervisor and mentor -- we've written about a dozen papers together by now, which was an abnormally good outcome for a summer internship. 30 | 31 | I worked on NWChem full-time for my last three years of grad school and my [dissertation](https://figshare.com/articles/thesis/Coupled_cluster_response_theory_parallel_algorithms_and_novel_applications/967859) is based on that work. During that time, I became a lot better at programming, although still primary in [old-school Fortran](https://stevelionel.com/drfortran/2020/05/16/doctor-fortran-in-military-strength/), and I learned about running jobs on supercomputers, first on MPP2, a 1000-node supercomputer made of Itanium2 processors, the Quadrics Elan2 interconnect, and properly provisioned local and global filesystems, and but also on Argonne Blue Gene systems. 
32 | 33 | One of the things that changed for me due to working on NWChem was how I prioritized research, (software) development, and support. Where once I believed that the scientific food chain had academics at the top, and everyone else was merely a failure who couldn't get a faculty position somewhere, I came to appreciate that national labs are at least as a good a place to do science, and that publishing papers is relatively low impact compared to building and supporting tools like NWChem that allow others to do science. It is not typical for grad students to do software support, but I found a great deal of satisfaction in this, and it forced me to learn things about NWChem that I never would have learned otherwise. I also started learning how to write documentation and practice empathy, which have proven to be far more useful in my professional life than solving quantum many-body equations in parallel. 34 | 35 | In the same way that I wanted to understand the guts of Gaussian and B3LYP as an undergrad, working on NWChem inspired to wonder about how low-level system programming and supercomputer interconnects worked. While I could write massively parallel code in NWChem, this had more to do with the structured approach to parallelism built into the application and the Global Arrays runtime system rather than any skill of mine. Robert Harrison quoted So Hirata, who created the Tensor Contraction Engine (TCE) on which my work was built, as saying, "my code is working in parallel and I don't know why." I had a similar experience as So, although I decided I cared more about "why?" questions in parallel computing than chemistry itself. 36 | 37 | # Argonne 38 | 39 | Thanks for CSGF, I had a number of connections at DOE labs to help find a job (postdoc). While Argonne had once declared me unsuitable for their purposes because I didn't know how to program when I was a prospective intern, they were my top choice for a postdoc for geographic reasons. 
I wanted to go to Argonne because they had a really good computer science division and I wanted to become proficient at both C and MPI. They were also standing up rather novel supercomputers in the Argonne Leadership Computing Facility (ALCF) and needed someone with computational chemistry skills who could make those codes run on the Blue Gene/P architecture. 40 | 41 | Shortly after the new computing building was finished, Vinod Tipparaju visited Argonne and I ended up in a conference room with him and Pavan Balaji. I knew Vinod from PNNL days and he was one of the lead developers of Global Arrays. Pavan was (and is) an MPI extraordinaire. I didn't know Pavan very well back then, so it was serendipitous that Vinod made this introduction. We spent an hour or two analyzing common patterns in scalable HPC applications, which set into motion a number of research projects that eventually led to [ARMCI-MPI](https://github.com/pmodels/armci-mpi), among other things, and was the beginning of a long and very fruitful collaboration between Pavan and me. Working with Pavan and the rest of the MPICH team turned me into a moderately functioning computer scientist, and is the primary reason I'm capable of doing anything useful for Intel. 42 | 43 | Because this blog post must be finite, I'm not going to describe all of the wonderful people at Argonne who taught me things, but one essential person in all of this was my postdoc supervisor, Ray Bair, who, like me, did a PhD in computational chemistry and wandered into HPC. In stark contrast to academic postdocs, Ray did not give me a project, but rather mentored me on how to collaborate with people across the lab and taught me how to be successful in both my ALCF support "day job" and my research "side job". Ironically, I have never co-authored a paper with Ray, but his support made possible some of my most cited papers with Larry Curtiss, Eugene DePrince and Pavan. 
44 | 45 | In the second half of my tenure at Argonne, I became involved in some of the more facility-oriented activities of ALCF, including the acceptance of the Blue Gene/Q system and the specification of DOE's pre-exascale systems as part of the [CORAL-1 program](https://science.osti.gov/-/media/ascr/ascac/pdf/meetings/20140331/CORAL_Update_for_ASCAC_March_31_2014_V31.pdf). This allowed me to be acquainted with a number of HPC vendors and the technical and nontechnical aspects of designing and buying a very expensive supercomputer years in advance of its existence. I learned about technical topics like "hardware-software co-design" and nontechnical topics like "using a statement-of-work to hold your vendor accountable." 46 | 47 | The other thing that happened to me in my last two years in DOE was a string of rejections to proposals I submitted, many of which were unrelated to merit but rather due to me violating unwritten political critera such as "we must fund the oldest white men first", "you can't do _that_ research at _that_ lab", and "it doesn't matter how good your proposal is if the program office wants to fund another branch of science." Furthermore, even when I was successful, I was kicked off of projects for being a chemist, even when my contributions were computer science. Finally, my efforts to secure an Early Career grant were impeded because, as someone at a DOE user facility rather than a research division, "I was not a problem in need of solving" (i.e. I was covered by block funding and the lab didn't _need_ me to get any grants). 48 | 49 | At the end of this long series of rejections from the program office, I concluded that they were never going to allow me to have a research career and that I might as well focus on the facility work I was doing, which included many things that I enjoyed. 
In hindsight, this was a blessing, because it freed me from any long-term obligations to DOE and allowed me and of the burden of thinking about the politically driven shenanigans associated with research funding. It also set the stage for my departure to industry, which never would have happened if I had been co-PI of an exascale co-design center or SciDAC project, as I had wanted so badly. 50 | 51 | # Intel 52 | 53 | In early 2014, Tim Mattson wrote me with an invitation to join his group at Intel Labs. We had a very long email conversation related to my requirements, which included (1) never using Windows, (2) open-sourcing as much code as possible, and (3) continuing to have no formal working hours or location. The most important factor in all of this was relocation to Portland, Oregon, which was one of the very few places on earth that would motivate me to leave Argonne. There was a time when I thought I'd spend my entire career in DOE, but between the abuse of government by Ted Cruz and the total capitulation of Steve Chu to penny-wise, pound-foolish budget obsession, I was more than happy to take my chances with the private sector. 54 | 55 | When I interviewed with the Intel Parallel Computing Lab, they didn't provide much in terms of guidance about what I'd be doing. At one point, I asked whether they wanted me to focus on computational chemistry, parallel programming models, or something else. The answer was something along the lines of, "all of that sounds good to us." I've always been fortunate to have a great deal of freedom in what I do and how I do it, so this was appreciated, although it forced me to take a small leap of faith as to whether or not they meant it. Fortunately, they did mean it, and in my 6+ years at Intel, I have worked under and 80-20 rule, wherein I control about 80% of my time, and somebody else controls about 20% of my time. 56 | 57 | I'll write more about my time at Intel in another post. 
Once I do that, I also need to write about working at NVIDIA. 58 | 59 | (c) Copyright Jeff Hammond, 2023. No reuse permitted except by permission from the author. 60 | -------------------------------------------------------------------------------- /ChatGPT.md: -------------------------------------------------------------------------------- 1 | TODO: write a blog post about ChatGPT generates terrible code, so that people stop freaking out. 2 | 3 | ## Matrix Transpose 4 | 5 | Q: Please write a code that transposes a matrix of arbitrary size using CUDA Fortran 6 | 7 | A: Here is an example of a Fortran program that transposes a matrix of arbitrary size using CUDA: 8 | 9 | ```fortran 10 | program matrix_transpose 11 | use cudafor 12 | implicit none 13 | 14 | ! Declare variables 15 | integer, parameter :: m = 1000, n = 1000 16 | real, dimension(m,n) :: a, b 17 | integer :: i, j 18 | 19 | ! Initialize input matrix 20 | do i = 1, m 21 | do j = 1, n 22 | a(i,j) = i * j 23 | end do 24 | end do 25 | 26 | ! Transfer input matrix to device 27 | a = a !$cuf 28 | 29 | ! Launch kernel to transpose matrix on device 30 | call transpose_matrix(m, n, a, b) 31 | 32 | ! Transfer result back to host 33 | b = b !$cuf 34 | 35 | ! Print transposed matrix 36 | do i = 1, m 37 | do j = 1, n 38 | write(*,*) b(i,j) 39 | end do 40 | end do 41 | 42 | end program matrix_transpose 43 | 44 | ! Kernel to transpose matrix on device 45 | attributes(global) subroutine transpose_matrix(m, n, a, b) 46 | implicit none 47 | 48 | ! Declare variables 49 | integer, value :: m, n 50 | real, dimension(m,n) :: a, b 51 | integer :: i, j 52 | 53 | ! Get thread and block indices 54 | i = blockIdx%x 55 | j = threadIdx%x 56 | 57 | ! 
Compute transposed matrix element 58 | b(i,j) = a(j,i) 59 | 60 | end subroutine transpose_matrix 61 | ``` 62 | This program defines a matrix_transpose program that initializes an input matrix on the host, transfers it to the device, launches a kernel to transpose the matrix on the device, and then transfers the result back to the host to be printed to the screen. The transpose_matrix kernel uses the thread and block indices to compute the transposed matrix element at the corresponding position in the output matrix. The size of the matrix can be changed by modifying the m and n parameters. Note that this is just one possible way to implement matrix transposition using CUDA Fortran, and there are many other ways to achieve the same result. 63 | 64 | 65 | ## Matrix Multiplication with OpenMP 66 | 67 | Q: Please write a Fortran code that does matrix multiplication using OpenMP target offload 68 | A: Here is an example of a Fortran program that performs matrix multiplication using OpenMP target offload: 69 | 70 | ```fortran 71 | program matrix_multiply 72 | use omp_lib 73 | implicit none 74 | 75 | ! Declare variables 76 | integer, parameter :: m = 1000, n = 1000, p = 1000 77 | real, dimension(m,n) :: a, b, c 78 | integer :: i, j, k 79 | 80 | ! Initialize input matrices 81 | do i = 1, m 82 | do j = 1, n 83 | a(i,j) = i * j 84 | end do 85 | end do 86 | do i = 1, n 87 | do j = 1, p 88 | b(i,j) = i + j 89 | end do 90 | end do 91 | 92 | ! Perform matrix multiplication on device 93 | !$omp target map(tofrom: a, b, c) 94 | !$omp parallel do default(none) shared(m,n,p,a,b,c) private(i,j,k) 95 | do i = 1, m 96 | do j = 1, p 97 | c(i,j) = 0 98 | do k = 1, n 99 | c(i,j) = c(i,j) + a(i,k) * b(k,j) 100 | end do 101 | end do 102 | end do 103 | !$omp end parallel do 104 | 105 | ! 
Print result matrix 106 | do i = 1, m 107 | do j = 1, p 108 | write(*,*) c(i,j) 109 | end do 110 | end do 111 | 112 | end program matrix_multiply 113 | ``` 114 | This program defines a matrix_multiply program that initializes two input matrices on the host, then uses OpenMP target offload to perform matrix multiplication on the device. The omp parallel do construct is used to distribute the computation of the matrix multiplication across multiple threads on the device. The result matrix is then printed to the screen on the host. The size of the matrices can be changed by modifying the m, n, and p parameters. Note that this is just one possible way to implement matrix multiplication using OpenMP target offload, and there are many other ways to achieve the same result. 115 | 116 | ## Matrix Multiplication 2 117 | 118 | Q: Please write a matrix multiplication code in Fortran that uses cache and register blocking to achieve a reasonable performance 119 | A: Here is an example of a Fortran program that performs matrix multiplication using cache and register blocking to optimize performance: 120 | 121 | ```fortran 122 | program matrix_multiply 123 | implicit none 124 | 125 | ! Declare variables 126 | integer, parameter :: m = 1000, n = 1000, p = 1000 127 | real, dimension(m,n) :: a, b, c 128 | integer :: i, j, k, ii, jj, kk, block_size 129 | real, dimension(:), allocatable :: a_block, b_block 130 | 131 | ! Set block size 132 | block_size = 32 133 | 134 | ! Allocate memory for block arrays 135 | allocate(a_block(block_size*block_size)) 136 | allocate(b_block(block_size*block_size)) 137 | 138 | ! Initialize input matrices 139 | do i = 1, m 140 | do j = 1, n 141 | a(i,j) = i * j 142 | end do 143 | end do 144 | do i = 1, n 145 | do j = 1, p 146 | b(i,j) = i + j 147 | end do 148 | end do 149 | 150 | ! Perform matrix multiplication with blocking 151 | do ii = 1, m, block_size 152 | do jj = 1, p, block_size 153 | do kk = 1, n, block_size 154 | 155 | ! 
Copy blocks of matrix A and B to registers 156 | do i = 1, block_size 157 | do j = 1, block_size 158 | a_block(i + (j-1)*block_size) = a(ii+i-1, kk+j-1) 159 | b_block(i + (j-1)*block_size) = b(kk+i-1, jj+j-1) 160 | end do 161 | end do 162 | 163 | ! Perform matrix multiplication on block 164 | do i = 1, block_size 165 | do j = 1, block_size 166 | c(ii+i-1, jj+j-1) = 0 167 | do k = 1, block_size 168 | c(ii+i-1, jj+j-1) = c(ii+i-1, jj+j-1) + a_block(i + (k-1)*block_size) * b_block(k + (j-1)*block_size) 169 | end do 170 | end do 171 | end do 172 | 173 | end do 174 | end do 175 | end do 176 | 177 | ! Print result matrix 178 | do i = 1, m 179 | do j = 1, p 180 | write(*,*) c(i,j) 181 | end do 182 | end do 183 | 184 | end program matrix_multiply 185 | ``` 186 | This program defines a matrix_multiply program that initializes two input matrices on the host, then uses cache and register blocking to perform matrix multiplication in an optimized way. The algorithm divides the matrices into blocks of a specified size and performs the multiplication on each block, storing the blocks in registers to reduce access to main memory. This can improve performance by reducing the number of memory accesses and increasing the amount of data that can be kept in cache. The size of the matrices and the block size can be changed by modifying the corresponding parameters. Note that this is just one possible way to implement matrix multiplication using cache and register blocking, and there are many other ways to achieve the same result. 187 | -------------------------------------------------------------------------------- /Dealing_with_imperfect_Fortran_compilers.md: -------------------------------------------------------------------------------- 1 | # Dealing with Imperfect Fortran Compilers, Part 1 2 | 3 | TL;DR You can build your Fortran application with two different compilers to get 4 | the best of both worlds when it comes to coarrays and GPU parallelism, for example. 
5 | This is not the easiest thing to do, but it's better than the bucket of tears 6 | you're living with right now. 7 | 8 | This post describes the motivation. [Part 2](https://github.com/jeffhammond/blog/blob/main/Dealing_with_imperfect_Fortran_compilers_2.md) has the technical details. 9 | 10 | ## The Current State of Fortran Compilers 11 | 12 | Most programmers are familiar with imperfect software. 13 | There are few, if any, nontrivial bug-free programs. 14 | However, users of Fortran are likely more familiar with this topic than the average programmer. 15 | It was not too long ago that there was no free compiler that 16 | supported the latest Fortran standard, or even a decent fraction of it. 17 | For example, when I started programming in Fortran in 2006, 18 | the free compilers that existed were g77 and g95, neither 19 | of which resembled a real Fortran 2003 compiler. 20 | Obviously, better Fortran compilers have always been available, 21 | but not necessarily at the right price. 22 | For example, Cray Fortran is an excellent compiler, and the compiler 23 | is free with the purchase of a Cray system, but the minimum 24 | purchasable unit of Cray hardware has historically been a 25 | rack, which costs around a million dollars. 26 | Reportedly, TITECH bought a single Cray XK7 system 27 | so they could use Cray's Fortran OpenACC compiler on 28 | one of the TSUBAME systems. 29 | 30 | Regardless of the frustrating history of Fortran compilers, 31 | the situation in 2022 is a lot better. 32 | GCC Fortran covers a large portion of the latest Fortran standard (2018), 33 | Intel's Fortran compiler and NVIDIA's NVHPC (nee PGI) 34 | Fortran compilers are freely available (without purchase of hardware), 35 | and the LLVM Fortran effort has made significant progress. 
36 | The first-generation LLVM Fortran, Flang, was based on PGI's Fortran 37 | compiler and is the basis for Fortran products from AMD and ARM, 38 | although neither can claim to support the majority of Fortran 2008. 39 | The new LLVM Fortran project, F18, is expected to support all the latest 40 | standard features when it reaches production quality. 41 | 42 | One area where Fortran compiler support is quite poor is the distributed 43 | memory model known as coarrays. 44 | Cray has a great implementation of coarrays but it's tied to their 45 | high-performance networking hardware. 46 | Intel and GCC Fortran both support coarrays, but one of these 47 | compilers is very good at distributed-memory performance and 48 | the other is very good at shared-memory performance 49 | (which is which is left as an exercise for the reader) 50 | so HPC users at NCAR, for example, are forced to choose 51 | which half of the HPC performance spectrum matters to them. 52 | Right now, none of AMD, ARM, LLVM, or NVIDIA support coarrays 53 | at all, although one hopes that future progress in LLVM F18 54 | will include coarrays and percolate into vendor derivatives. 55 | 56 | Fortran programmers fall into a few different categories: 57 | 58 | 1. Luddites who haven't left the 20th century, 59 | don't know that the Fortran language has changed since the mid-1980s [1], 60 | and whose code compiles everywhere this side of a punchcard reader. 61 | 2. Pragmatists, who write to the widely supported common subset of Fortran language support, 62 | which can be approximated by Fortran 2003, and definitely does not include coarrays. 63 | These folks also don't care about coarrays because MPI is better and 64 | has been universally available since before the `gfortran` project began. 65 | 3. Purists, who insist that, if WG5 can imagine it, then it should be usable, 66 | at least within a few years of the ISO ink drying. 
67 | Such users either have very good business relationships with a compiler vendor or are willing to compromise 68 | on at least one of performance and portability. 69 | 70 | In a CPU-only HPC universe, particularly one dominated by x86, most programmers 71 | could live relatively comfortably within one of these categories. 72 | However, we haven't lived in a CPU-only HPC universe since at least 2012, 73 | when ORNL's Titan Cray XK7 ushered in the beginning of the GPU era of HPC. 74 | Furthermore, x86 domination in HPC steadily eroded as numerous ARM-based 75 | alternatives have emerged, ranging from the exotic Fujitsu A64fx processor 76 | to awesome-yet-boring cloud-oriented offerings from Ampere and AWS. 77 | 78 | Now, our purists are far less happy than before and likely have at least one of the following grievances: 79 | 80 | 1. My system does not support coarrays properly or at all. 81 | 2. My system does not support OpenACC, CUDA Fortran, or StdPar (i.e. `DO CONCURRENT` on GPUs). 82 | 3. My system does not support a feature-complete OpenMP 5 GPU compiler. 83 | 4. My system does not support Fortran 2018 features not related to parallelism. 84 | 5. The only good Fortran compiler on my system is not mainstream and not supported by important HPC software. 85 | 6. The only good Fortran compiler I can use has no well-defined support model. 86 | 87 | Even the pragmatists are starting to get impatient and would like more of the post-2003 features 88 | than are universally available. 89 | 90 | Fortunately, there is a solution to these problems, but it requires a bit of software gymnastics. 91 | On the other hand, if you can actually use most of the features in Fortran 2003+, you're more than 92 | smart enough to deal with the back handspring I'm going to describe next. 
93 | 94 | ## A Brief Digression about Application-Binary Interfaces (ABIs) 95 | 96 | None of the aforementioned problems exist in the C world because 97 | - with a small number of reasonable caveats - 98 | C compilers are interoperable, and there's no issue mixing objects 99 | from GCC, Clang and a vendor C compiler based on EDG. 100 | This is because C supports ABI stability on a given platform 101 | and very few users want to mix C standard libraries, which 102 | is the one thing one cannot do. 103 | Similarly, in C++, one can mix GCC and Clang or an EDG-based vendor compiler 104 | as long as they use the same STL. 105 | 106 | Unfortunately, Fortran offers nothing in the way of ABI stability. 107 | Each Fortran compiler can have its own convention for passing 108 | `CHARACTER*(*)` strings and arrays, with the latter often including 109 | a non-standard descriptor format, which may not be well-documented. 110 | Finally, I/O statements and all the intrinsics are based on 111 | a compiler-specific runtime library, which is tightly bound 112 | to the aforementioned calling conventions. 113 | 114 | However, starting in Fortran 2003, there has been standardized 115 | interoperability between Fortran and C, and this feature set 116 | became almost magical in Fortran 2018, with the introduction 117 | of `CFI_cdesc_t` and other features. 118 | 119 | Extended C-Fortran interoperability (CFI) in Fortran 2018 is the magic 120 | that is going to allow us to break free from the limitations 121 | of a single imperfect compiler, to realize the features provided 122 | by TWO imperfect compilers, so long as there are clean boundaries 123 | between the Fortran code called by each. 124 | 125 | ## The Luddites Might be Right 126 | 127 | Amusingly, the luddites who stopped reading already have been able 128 | to rely on the almost-ABI stability of legacy Fortran 129 | (which is erroneously called "Fortran 77" by many, 130 | but is more accurately Fortran ~85). 
131 | Because `integer A(*)` behaves like C99's `int a[restrict]`, 132 | and the infrequent use of proper strings in Fortran, 133 | Fortran libraries like the BLAS and LAPACK are mostly compiler-agnostic. 134 | For example, one does not need to handle the string length in 135 | `DGEMM` because exactly 1 character is read per argument. 136 | There is an issue with complex number return values, 137 | but I'm going to ignore that one. Not today, Satan. 138 | 139 | So basically, if you are willing to write Fortran with REO Speedwagon 140 | playing in the background, and your idea of a complicated datatype 141 | is `DOUBLE PRECISION A(LDA,*)` then you don't need to know what 142 | comes next. 143 | 144 | ## Getting to the Point 145 | 146 | You've waited long enough. 147 | The trick we are going to use to make all of our Fortran dreams come true 148 | is to split Fortran applications into pieces that can be compiled with 149 | different Fortran compilers, and to connect them using CFI features. 150 | 151 | Pictorially, this can be described as follows [2]: 152 | 153 | ![72lgcj](https://user-images.githubusercontent.com/406118/204720665-04588b4e-36d1-40d5-bf61-32e9928bc94a.jpg) 154 | 155 | The overall effect of this is as if we turn 1 application into 156 | 1 application and N libraries, where the libraries have C linkage. 157 | All of this is possible in a strictly standard-compliant way 158 | as of Fortran 2018, although it is a bit tricky to implement, 159 | and there are some limitations. 160 | 161 | One of the more obvious applications of this technique is to build 162 | a program that uses coarrays for distributed-memory parallelism 163 | and `DO CONCURRENT` with GPU parallelism. 164 | Currently, this is not possible on any interesting platform, 165 | since the NVIDIA HPC Fortran compiler does not support coarrays 166 | and neither GCC or Cray has GPU support for `DO CONCURRENT` 167 | right now. 
Intel Fortran supports coarrays and recently released 168 | GPU support for `DO CONCURRENT`, but Intel has not shipped a 169 | proper HPC GPU, hence the "interesting platform" caveat. 170 | 171 | A less exciting application of this method is to build libraries 172 | that are implemented using modern Fortran that are usable from 173 | any language that supports C calling conventions / linkage. 174 | For example, one could rewrite the BLAS and LAPACK without 175 | disrupting user experience for those accustomed to the legacy 176 | Fortran APIs, or even ship an implementation of CBLAS that 177 | contains zero C code. 178 | 179 | The technical details of this are described in a [follow-up post](https://github.com/jeffhammond/blog/blob/main/Dealing_with_imperfect_Fortran_compilers_2.md). 180 | 181 | ## References 182 | 183 | 1. [Doctor Fortran in "Military Strength"](https://stevelionel.com/drfortran/2020/05/16/doctor-fortran-in-military-strength/) 184 | 2. [Aquarium Leaking Slap Meme Generator](https://imgflip.com/memegenerator/194158970/Aquarium-Leaking-Slap) 185 | 3. Intel recently released a compiler that has `DO CONCURRENT` GPU support but it has a few issues that need to be addressed. 186 | 187 | ## Disclaimer and license 188 | 189 | The opinions expressed in this post are exclusively the author's 190 | and not those of his current and past co-workers, co-authors, friends or family members. 191 | 192 | (c) Copyright Jeff Hammond, 2022. No reuse permitted except by permission from the author. 193 | -------------------------------------------------------------------------------- /Dealing_with_imperfect_Fortran_compilers_2.md: -------------------------------------------------------------------------------- 1 | # Dealing with Imperfect Fortran Compilers, Part 2 2 | 3 | TL;DR You can build your Fortran application with two different compilers to get 4 | the best of both worlds when it comes to coarrays and GPU parallelism, for example. 
5 | This is not the easiest thing to do, but it's better than the bucket of tears 6 | you're living with right now. 7 | 8 | ## The Goal 9 | 10 | What we want to build here is a bridge between two Fortran compilers. 11 | The most general bridge is the standard C-Fortran interoperability (CFI) 12 | feature set found in Fortran 2018, but we will also describe 13 | a bridge that uses the implementation-specific array descriptor 14 | of the NVIDIA (nee PGI) Fortran compiler, because that was the original 15 | motivation for this project. 16 | 17 | ## Fortran Type-Checking 18 | 19 | Fortran is a strongly typed language, although legacy Fortran 20 | compilers had no good way to enforce this, so users regularly 21 | abused the fact that Fortran compilers (almost?) always pass 22 | arguments by reference. 23 | This has allowed functions like `MPI_Bcast` to work for any 24 | Fortran buffer input, since the underlying implementation only 25 | cares about the number of bytes that need to fly around the 26 | machine. 27 | Starting in Fortran 90, compilers used modules and the 28 | interfaces contained therein to check types, which was 29 | a problem for MPI [4] although it was mitigated with the 30 | use of non-standard directives 31 | (usually containing `IGNORE_TKR`, which means "ignore Type, Kind and Rank"). 32 | 33 | In part because of MPI, Fortran added a way to do type-agnostic 34 | arguments, which is sort of like C's `void *`, but it 35 | imposes more rules of what users can do with it. 36 | 37 | ## How CFI Works 38 | 39 | In a Fortran program, one can declare a dummy argument that is 40 | assumed-type (`TYPE(*)`) and assumed-rank (`DIMENSION(..)`). 41 | Within a Fortran program, one can decode such an argument 42 | using `SELECT TYPE` and `SELECT RANK`. 43 | You can look up how those work but they are not important here. 
44 | When such arguments are used in the context of CFI, 45 | the C function sees a special argument of the type 46 | `CFI_cdesc_t` that contains all of the information required 47 | to reconstitute the Fortran array details. 48 | 49 | For example, if I pass a 1D array of double precision elements, 50 | which technically should use `real(c_double)` but I can cheat 51 | if I'm sure that a C `double` and a Fortran `double precision` 52 | are the same thing, 53 | my C code will use the following members of `CFI_cdesc_t`: 54 | 55 | ```c 56 | void * base_addr = 57 | size_t elem_len = sizeof(double) = 8 58 | CFI_rank_t rank = 1 59 | CFI_type_t type = CFI_type_double 60 | CFI_dim_t dim[1] = { .. } 61 | ``` 62 | The last listed memory, `dim`, contains the size information 63 | for each array dimension. 64 | For contiguous array arguments, it's easy to understand this, 65 | while for non-contiguous array arguments, one has to be a bit 66 | more careful. 67 | 68 | Please see Ref. [1,2,3] for details. 69 | This blog post is not meant to be a complete tutorial on CFI. 70 | 71 | ## Duct Tape, Part 1 72 | 73 | Unfortunately, CFI is only a standard API, and the ABI is not specified. 74 | Implementations are permitted to use different integer types 75 | for the various members, e.g. `CFI_rank_t`, and can choose 76 | their own order of the struct members, with the exception of 77 | `base_addr`, `elem_len`, and `version`, which must come first. 78 | This means that one has to compile C code for each Fortran 79 | compiler using the correct `ISO_Fortran_binding.h` header file. 80 | 81 | There is a simple but annoying solution to this. 82 | If I define my own implementation of `CFI_cdesc_t` 83 | (with a different name, of course) 84 | then I can convert from one compiler's ABI to anothers as follows. 85 | Note that the code below was written directly into this 86 | blog and has never been compiled or tested. 
87 | Implementing a correctly functioning version of this 88 | is left as an exercise to the reader (for now). 89 | 90 | ```c 91 | #include 92 | 93 | // the symbols need to be disambiguity 94 | #define MANGLE(x) FORTRAN_COMPILER_NAME ## _ ## x 95 | 96 | // DT = Duct Tape 97 | // use int64_t under the assumption that it is big enough for any implementation ABI. 98 | // this assumption should be verified in production code. 99 | typedef DT_cdesc_t { 100 | void * base_addr; 101 | size_t elem_len; 102 | int64_t rank; 103 | int64_t type; 104 | ... 105 | } 106 | 107 | // define a DT_type_t enum etc. 108 | 109 | // convert a CFI descriptor to a DT one 110 | void MANGLE(CFI_to_DT) 111 | (const CFI_cdesc_t * i, DT_cdesc_t * o) 112 | { 113 | o->base_addr = i->base_addr; 114 | o->elem_len = i->elem_len; 115 | o->rank = i->rank; 116 | 117 | // because the values of CFI_type_t can be different between implementations, 118 | // we need to have our set of DT_type_t values and translate them in both directions. 119 | o->type = translate_type_t_CFI_to_DT(i->type); 120 | 121 | // more stuff... 122 | } 123 | 124 | // convert a DFT descriptor to a CFI one 125 | void MANGLE(DT_to_CFI) 126 | (const DT_cdesc_t * i, CFI_cdesc_t * o) 127 | { 128 | o->base_addr = i->base_addr; 129 | o->elem_len = i->elem_len; 130 | o->rank = i->rank; 131 | 132 | // because the values of CFI_type_t can be different between implementations, 133 | // we need to have our set of DT_type_t values and translate them in both directions. 134 | o->type = translate_type_t_DT_to_CFI(i->type); 135 | 136 | // more stuff... 137 | } 138 | ``` 139 | 140 | This file needs to be compiled for every Fortran compiler. 141 | 142 | Then I can write another C function that uses e.g. 143 | `Intel_Fortran_CFI_to_DT` and `Cray_Fortran_DT_to_CFI` 144 | to send an array from an Intel Fortran subroutine 145 | to a Cray Fortran subroutine. 
146 | 147 | ## Duct Tape, Part 2 148 | 149 | Unfortunately, not all Fortran compilers support Fortran 2018 CFI right now. 150 | Fortunately, it's actually easier to implement the equivalent of the above 151 | duct tape in this case, because there is no ABI conflict between CFI 152 | descriptors and non-standard ones. 153 | 154 | Now I'm going to describe how to call NVIDIA Fortran from another Fortran 155 | compiler that supports Fortran 2018, particularly both CFI and coarrays. 156 | The motivation is that there are people out there who want to use 157 | GPU `DO CONCURRENT` support in the NVIDIA Fortran compiler along with 158 | existing coarray applications. 159 | 160 | The following code is taken from https://github.com/jeffhammond/Cthulhu, 161 | which has been compiled and works correctly in limited testing. 162 | We use the [Parallel Research Kernels](https://github.com/ParRes/Kernels) 163 | implementations of `nstream` (like STREAM triad) to illustrate how this works. 164 | The goal is to take [nstream-coarray.F90](https://github.com/ParRes/Kernels/blob/default/FORTRAN/nstream-coarray.F90) 165 | and allow the `do concurrent` part to use the NVIDIA Fortran compiler 166 | with GPU support enabled. 167 | The relevant portion of the code is shown below. 168 | 169 | ```fortran 170 | ... 171 | real(kind=REAL64), allocatable :: A(:)[:] 172 | real(kind=REAL64), allocatable :: B(:)[:] 173 | real(kind=REAL64), allocatable :: C(:)[:] 174 | real(kind=REAL64) :: scalar 175 | ... 176 | do concurrent (i=1:length) 177 | A(i) = A(i) + B(i) + scalar * C(i) 178 | enddo 179 | ... 180 | ``` 181 | 182 | The first step is to outline the `do concurrent` part 183 | and make it into a subroutine call. 184 | ```fortran 185 | !do concurrent (i=1:length) 186 | ! 
A(i) = A(i) + B(i) + scalar * C(i) 187 | !enddo 188 | call nstream_colon_trampoline(length,scalar,A,B,C) 189 | ``` 190 | _Aside: I use `colon` in the name to refer to `(:)`, in contrast 191 | to other dummy argument syntax, `(N)` or `(*)`, that I tried, 192 | not because of any connection to the gastrointestinal system._ 193 | 194 | The subroutine has the following interface defined in a module. 195 | ```fortran 196 | interface 197 | subroutine nstream_colon_trampoline(length,scalar,A,B,C) bind(C) 198 | use, intrinsic :: iso_fortran_env 199 | integer(kind=INT64), value :: length 200 | real(kind=REAL64), value :: scalar 201 | real(kind=REAL64), dimension(:) :: A,B,C 202 | end subroutine nstream_colon_trampoline 203 | end interface 204 | ``` 205 | Having the interface is important if the F90 array descriptor is sufficiently 206 | incompatible with a CFI descriptor so as to not work. 207 | It is possible that some Fortran compilers need to see the 208 | `type(*), dimension(..)` to generate a proper `CFI_cdesc_t`. 209 | 210 | The above is implemented in C and looks like this: 211 | ```c 212 | void nstream_colon_trampoline(int64_t length, double scalar, 213 | CFI_cdesc_t * dA, CFI_cdesc_t * dB, CFI_cdesc_t * dC) 214 | { 215 | double * restrict A = dA->base_addr; 216 | double * restrict B = dB->base_addr; 217 | double * restrict C = dC->base_addr; 218 | F90_Desc_la pA={0}, pB={0}, pC={0}; 219 | cfi_to_pgi_desc(dA,&pA); 220 | cfi_to_pgi_desc(dB,&pB); 221 | cfi_to_pgi_desc(dC,&pC); 222 | nstream_colon(length, scalar, A, B, C, &pA, &pB, &pC); 223 | } 224 | ``` 225 | Here we see the descriptor conversion from CFI to the PGI->NVIDIA descriptor (`F90_Desc_la`), 226 | which is defined in `nvhpc_cuda_runtime.h` that ships with the 227 | [NVHPC SDK](https://developer.nvidia.com/nvidia-hpc-sdk-downloads). 228 | We copied the relevant parts, which can be seen in 229 | [pgif90.h](https://github.com/jeffhammond/Cthulhu/blob/main/pgif90.h). 
230 | 231 | The descriptor conversion is done in `cfi_to_pgi_desc`, which can be found in 232 | [trampoline.h](https://github.com/jeffhammond/Cthulhu/blob/main/trampoline.h). 233 | A real implementation of the type id conversion alluded to above is shown there as well. 234 | 235 | The C code calls `nstream_colon`, which is a Fortran subroutine compiled with 236 | the NVIDIA compiler, shown below. 237 | ```fortran 238 | subroutine nstream_colon(length,scalar,A,B,C) bind(C) 239 | use, intrinsic :: iso_fortran_env 240 | integer(kind=INT64), value :: length 241 | real(kind=REAL64), value :: scalar 242 | real(kind=REAL64), dimension(:) :: A,B,C 243 | integer(kind=INT64) :: i 244 | do concurrent (i=1:length) 245 | A(i) = A(i) + B(i) + scalar * C(i) 246 | enddo 247 | end subroutine nstream_colon 248 | ``` 249 | 250 | To glue the C code to the NVIDIA Fortran code, 251 | I needed to know that NVIDIA Fortran passes the buffer address in the expected place 252 | and appends the array descriptors at the end, similar to how Fortran strings are passed. 253 | 254 | ## Summary 255 | 256 | What we just did was write a Fortran program that calls CFI-compatible interface 257 | (using Fortran 2018 features) 258 | to a C function that converts one Fortran compiler array descriptor to another 259 | Fortran compiler array descriptor that calls a C-compatible Fortran subroutine 260 | (using only Fortran 2003 features). 261 | 262 | Neither of the Fortran compilers know each other exist as each is talking to C code. 263 | The Fortran coarray code is calling a C function, with a known-compatible interface. 264 | The Fortran `do concurrent` code is called by a C function that passes it the necessary metadata. 265 | All of this is within the scope of why CFI was designed, 266 | although it's not clear if WG5 (the Fortran standards committee) foresaw the 267 | perverse use case show here. 
268 | (Most likely, WG5 imagined that one would never need to do this because all 269 | Fortran compilers are perfect implementations of the standard. 😉) 270 | 271 | The real hero is, of course, the all-powerful C language, which can communicate with 272 | any other programming languages thanks to its lack of support for the type of 273 | expressive language features that would get in the way. 274 | However, since you are here because you love Fortran enough to try to use the union 275 | of features found in two different compilers, the other hero here is CFI, 276 | which allows Fortran to masquerade as C at the binary object level, thereby 277 | enabling Fortran programmers to create libraries as if they were written in C, 278 | and thus can be called from any other language, including Fortran. 279 | 280 | ## References 281 | 282 | 1. Michael Metcalf, John Reid, Malcolm Cohen. [Modern Fortran Explained: Incorporating Fortran 2018](https://academic.oup.com/book/26799) 283 | 2. Intel Fortran: [C Structures, Typedefs, and Macros for Interoperability](https://www.intel.com/content/www/us/en/develop/documentation/fortran-compiler-oneapi-dev-guide-and-reference/top/compiler-reference/mixed-language-programming/standard-tools-for-interoperability/c-structures-typedefs-macros-for-interoperability.html#c-structures-typedefs-macros-for-interoperability) 284 | 3. GCC Fortran: [Interoperability with C](https://gcc.gnu.org/onlinedocs/gfortran/Interoperability-with-C.html) 285 | 4. C. E. Rasmussen, J. M. Squyres. [A Case for New MPI Fortran Bindings](https://www.open-mpi.org/papers/euro-pvmmpi-2005-fortran/euro-pvm-mpi-2005-fortran.pdf). 286 | 287 | ## Disclaimer and license 288 | 289 | The opinions expressed in this post are exclusively the author's 290 | and not those of his current and past co-workers, co-authors, friends or family members. 291 | 292 | (c) Copyright Jeff Hammond, 2022. No reuse permitted except by permission from the author. 
293 | -------------------------------------------------------------------------------- /Dell_Inspiron5000_Linux.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | This post describes my experience getting Linux installed on a [Dell Inspiron 5000 laptop](https://www.dell.com/en-us/member/shop/dell-laptops/new-inspiron-14-5000-laptop/spd/inspiron-14-5402-laptop/nn5402ejobh) with the latest (as of 2020) Intel processor, known as Tiger Lake. 4 | 5 | **TL;DR** Turn of BitLocker, install Ubuntu 20.10 (not 20.04!), and repartition the drive to dual-boot Linux alongside Windows 10. Everything works nicely. 6 | 7 | # Disclaimer 8 | 9 | I was not compensated by anyone to write this, but I work for Intel and thus you should not treat this as an objective, third-party review. In any case, my focus here will be explaining what I did to install Linux and how things are working so far, not to compare it to any competitive products. My primary basis for comparison will older Dell laptops and other computers on which I run Linux. 10 | 11 | # What's in the box? 12 | 13 | The box contains a laptop, the power cable and the trivial paperwork. That's it. There was minimal plastic (sleeves around each item) and the cardboard appears to be recycled. 14 | 15 | Here are some hardware details if you care: 16 | ```sh 17 | $ sudo lshw 18 | tigerlake 19 | description: Notebook 20 | product: Inspiron 5402 (0A01) 21 | vendor: Dell Inc. 22 | serial: * 23 | width: 64 bits 24 | capabilities: smbios-3.2.0 dmi-3.2.0 smp vsyscall32 25 | configuration: boot=normal chassis=notebook family=Inspiron sku=0A01 26 | *-core 27 | description: Motherboard 28 | product: 0MF3C8 29 | vendor: Dell Inc. 30 | physical id: 0 31 | version: A00 32 | serial: * 33 | *-firmware 34 | description: BIOS 35 | vendor: Dell Inc. 
36 | physical id: 0 37 | version: 1.1.5 38 | date: 09/22/2020 39 | size: 1MiB 40 | capacity: 32MiB 41 | capabilities: pci pnp upgrade shadowing cdboot bootselect edd int5printscreen int9keyboard int14serial int17printer acpi usb smartbattery biosbootspecification netboot uefi 42 | *-cpu 43 | description: CPU 44 | product: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz 45 | vendor: Intel Corp. 46 | physical id: 400 47 | bus info: cpu@0 48 | version: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz 49 | slot: CPU 1 50 | size: 1274MHz 51 | capacity: 4700MHz 52 | width: 64 bits 53 | clock: 100MHz 54 | capabilities: lm fpu fpu_exception wp vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp x86-64 constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l2 invpcid_single cdp_l2 ssbd ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb intel_pt avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves split_lock_detect dtherm ida arat pln pts hwp hwp_notify hwp_act_window hwp_epp hwp_pkg_req avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid movdiri movdir64b fsrm avx512_vp2intersect md_clear flush_l1d arch_capabilities cpufreq 55 | configuration: cores=4 enabledcores=4 threads=8 56 | ``` 57 | 58 | # Booting for the first time 59 | 60 | When you power on the system, it enters into the Windows installer. I do not like Windows and the installer insisted that I provide personal information to proceed. 
You can find more about this elsewhere, so I won't elaborate. 61 | 62 | # Installing WSL 63 | 64 | Before I went all-in with Linux, I decided to see how WSL works. It's pretty nice, although switching from regular Windows to the Insider Preview updates meant that I had to install the Preview version of Windows Terminal, which seems like an unnecessary user inconvenience. 65 | 66 | The biggest issue with WSL (both 1 and 2) is that I cannot get GPU compute support. This is an area of active interest for many, including Microsoft, all the major GPU vendors, and countless users. I expect it will be fixed some time in 2021, but this is just an uninformed guess. 67 | 68 | The other thing I disliked about Windows and WSL is how many times I had to reboot the computer. On Linux, the only time I reboot is when I update the OS kernel itself. Nothing else requires a reboot. On Windows, you have to reboot to change just about anything. I'm sure somebody thinks there is a good reason for this, but it's annoying and one of many reasons why I cannot take Windows seriously. 69 | 70 | # Installing Linux 71 | 72 | My first few attempts to boot to a USB drive and run Linux live were unsuccessful and I made my computer very angry. The repeated reboots, including a few forced reboots, triggered system checks and something that looked like a rescue process. Fortunately, I did not render my machine unusable in the process. 73 | 74 | Eventually, I figured out how to boot from a USB drive. I didn't capture the details properly but you should plug in the USB drive, go into the BIOS settings, and place the USB boot drive above the others. 75 | 76 | The first time I tried to run Linux, I was using the Ubuntu 20.04 ISO on a USB. This image does not know about WiFi6, which means that I couldn't do anything with it, because the laptop doesn't have an Ethernet jack and I do not have a USB-to-Ethernet dongle. 
77 | 78 | It appears that ArchLinux has the latest kernel in an ISO but I'm less familiar with Arch so I tried Ubuntu 20.10 instead. Fortunately, Ubuntu 20.10 has the updates requires to recognize the wireless chip (and all the other hardware I'm aware of). 79 | 80 | When I booted into Ubuntu 20.10 in live mode, I could see that all the hardware was recognized, including the wireless and the MicroSD port. The touchpad and screen worked perfectly. At this point, I decided to make the Linux install persistent. 81 | 82 | Because I wasn't completely sure that Linux can do all the BIOS/firmware updates directly, I decided to dual-boot. I don't expect to need more than ~250 GB for each OS image, so this won't be an issue even if I never use Windows 10 again. 83 | 84 | Ubuntu is really smart and understands the Windows 10 filesystem, and can reorganize the drive to add a Linux (ext4) partition. However, it can only do this if you turn off BitLocker. I suppose I can turn on BitLocker again now that Linux is installed, but as this laptop is unlikely to leave my office for a long time, I'm not that worried about physical security. 85 | 86 | After booting into Windows to disable Bitlocker (takes about 5 minutes or less), I proceeded with the Ubuntu installer, which was extremely fast (less than 5 minutes). I opted for the minimal configuration, but added the third-party proprietary drivers in case any of the hardware needs that. 87 | 88 | # Running Linux 89 | 90 | After the necessary reboot, I saw the Ubuntu boot menu, which defaults to Ubuntu, but also gives me the option to go into the Windows boot flow. I didn't bother to test the Windows boot again because I don't really care if it works, but if something doesn't work, I'll add a note about that here. 91 | 92 | Because I am a luddite, I installed all the things I wanted with Apt in Terminal. 
The apps I install everywhere include the Chrome browser, the latest versions of GCC and LLVM, and associated development tools like Git, Vim, Valgrind and CMake (yes, I hate CMake but it's a necessary evil). 93 | 94 | # Comparison to past experiences 95 | 96 | I've been a Dell Linux laptop user for many years, since 2005 or so. In the past, I would install OpenSUSE because it seemed to have the best driver support and wouldn't even think about the fact that Windows was there, although I recall the horrors of ndiswrapper in cases where my wireless chip didn't permit native Linux drivers. Things are different now, mostly in good ways, although BIOS security features and SecureBoot mean that nuking Windows requires slightly more work. As noted already, I am keeping Windows 10 around in a dual-boot configuration just in case I need it to update the firmware or something like that. 97 | 98 | # Installing oneAPI 99 | 100 | Because the motivation for getting this machine was to evaluate the developer experience of oneAPI on a Tiger Lake laptop, I installed oneAPI and other Intel GPU software. Fortunately, both support Apt and the process is trivial. Since the details are documented elsewhere, I will merely link to those: 101 | 102 | * https://dgpu-docs.intel.com/installation-guides/ubuntu/ubuntu-focal.html 103 | * https://software.intel.com/content/www/us/en/develop/articles/installing-intel-oneapi-toolkits-via-apt.html 104 | 105 | Aside: the above works just fine in WSL as well, but the GPU isn't exposed because the driver support isn't there yet. 106 | 107 | # Performance 108 | 109 | ## Practical 110 | 111 | The other computers on my desk right now are a Macbook Pro circa 2018-2019 and a Hades Canyon NUC. The Tiger Lake laptop feels noticeably faster than the NUC even though the NUC has a higher power envelope and frequency ceiling. I'm not sure whether this is real or not, and if it's real, how it correlates with the processor, memory, or SSD capability. 
112 | 113 | I can't really compare the Mac laptop because it is burdened with corporate IT bloatware like Microsoft Outlook that I unfortunately have to run all the time. I've mostly given up on building software natively on MacOS because Apple refuses to let me program the GPU in a sensible manner (their OpenCL is decent for a 1.x implementation, but that's not saying much). 114 | 115 | The other thing I like about this laptop is that it cost around $800. My Macbook Pro cost around $3000 and while it has twice the SSD and more memory (16 vs 12), it's not faster than the cheaper one, and certainly not anywhere near four times faster. I can get a similarly provisioned Mac laptop with the new M1 processor in it for *only* twice the cost of my Dell. While people on Twitter tell me that the M1 is seventy bazillion times better than every other processor ever made, I am very happy with the performance and the battery life on the Dell, particularly for the price. Also, my name is not Jon Masters and I do not love ARM processors *that* much. 116 | 117 | ## Raw 118 | 119 | `clpeak` is a nice way to measure the peak memory and compute capability of CPU and GPU devices using an equivalent methodology. 120 | 121 | I ran these tests with the Linux governor set for performance ([details](https://askubuntu.com/questions/604720/setting-to-high-performance)). 
122 | 123 | ```sh 124 | jrhammon@tigerlake:~/clpeak$ echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor 125 | performance 126 | ``` 127 | 128 | ```sh 129 | jrhammon@tigerlake:~/clpeak$ ./clpeak | tee clpeak.log 130 | 131 | Platform: Intel(R) OpenCL 132 | Device: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz 133 | Driver version : 2020.11.10.0.05_160000 (Linux x64) 134 | Compute units : 8 135 | Clock frequency : 2800 MHz 136 | 137 | Global memory bandwidth (GBPS) 138 | float : 32.09 139 | float2 : 31.57 140 | float4 : 34.14 141 | float8 : 28.76 142 | float16 : 21.88 143 | 144 | Single-precision compute (GFLOPS) 145 | float : 218.19 146 | float2 : 409.03 147 | float4 : 407.21 148 | float8 : 400.21 149 | float16 : 393.09 150 | 151 | No half precision support! Skipped 152 | 153 | Double-precision compute (GFLOPS) 154 | double : 211.05 155 | double2 : 203.61 156 | double4 : 201.58 157 | double8 : 198.51 158 | double16 : 86.28 159 | 160 | Integer compute (GIOPS) 161 | int : 75.64 162 | int2 : 134.78 163 | int4 : 172.68 164 | int8 : 88.42 165 | int16 : 86.30 166 | 167 | Integer compute Fast 24bit (GIOPS) 168 | int : 58.13 169 | int2 : 85.43 170 | int4 : 90.68 171 | int8 : 89.62 172 | int16 : 85.92 173 | 174 | Transfer bandwidth (GBPS) 175 | enqueueWriteBuffer : 14.70 176 | enqueueReadBuffer : 14.85 177 | enqueueWriteBuffer non-blocking : 14.68 178 | enqueueReadBuffer non-blocking : 14.82 179 | enqueueMapBuffer(for read) : 59322.75 180 | memcpy from mapped ptr : 14.69 181 | enqueueUnmap(after write) : 52377.65 182 | memcpy to mapped ptr : 14.51 183 | 184 | Kernel launch latency : 1.97 us 185 | 186 | Platform: Intel(R) OpenCL HD Graphics 187 | Device: Intel(R) Graphics Gen12LP [0x9a49] 188 | Driver version : 20.46.18421 (Linux x64) 189 | Compute units : 96 190 | Clock frequency : 1300 MHz 191 | 192 | Global memory bandwidth (GBPS) 193 | float : 32.51 194 | float2 : 24.16 195 | float4 : 31.49 196 | float8 : 32.43 197 | float16 : 40.02 198 | 
199 | Single-precision compute (GFLOPS) 200 | float : 1413.85 201 | float2 : 1410.00 202 | float4 : 860.81 203 | float8 : 899.68 204 | float16 : 753.69 205 | 206 | Half-precision compute (GFLOPS) 207 | half : 2327.97 208 | half2 : 2304.86 209 | half4 : 2329.34 210 | half8 : 1427.09 211 | half16 : 1612.55 212 | 213 | No double precision support! Skipped 214 | 215 | Integer compute (GIOPS) 216 | int : 329.21 217 | int2 : 238.15 218 | int4 : 225.79 219 | int8 : 308.19 220 | int16 : 260.67 221 | 222 | Integer compute Fast 24bit (GIOPS) 223 | int : 326.83 224 | int2 : 235.12 225 | int4 : 252.99 226 | int8 : 252.23 227 | int16 : 256.53 228 | 229 | Transfer bandwidth (GBPS) 230 | enqueueWriteBuffer : 12.41 231 | enqueueReadBuffer : 12.48 232 | enqueueWriteBuffer non-blocking : 10.37 233 | enqueueReadBuffer non-blocking : 10.18 234 | enqueueMapBuffer(for read) : 4294959.00 235 | memcpy from mapped ptr : 12.43 236 | enqueueUnmap(after write) : inf 237 | memcpy to mapped ptr : 12.43 238 | 239 | Kernel launch latency : 26.90 us 240 | 241 | ``` 242 | 243 | # Hardware details 244 | 245 | The wireless hardware that Ubuntu 20.04 doesn't support is: 246 | ``` 247 | $ sudo lshw -C network 248 | *-network 249 | description: Wireless interface 250 | product: Wi-Fi 6 AX201 251 | vendor: Intel Corporation 252 | physical id: 14.3 253 | logical name: wlp0s20f3 254 | version: 20 255 | serial: * 256 | width: 64 bits 257 | clock: 33MHz 258 | capabilities: pm msi pciexpress msix bus_master cap_list ethernet physical wireless 259 | configuration: broadcast=yes driver=iwlwifi driverversion=5.8.0-29-generic firmware=55.d9698065.0 ip=* latency=0 link=yes multicast=yes wireless=IEEE 802.11 260 | ``` 261 | 262 | (c) Copyright Jeff Hammond, 2020. No reuse permitted except by permission from the author. 
263 | -------------------------------------------------------------------------------- /Empathy.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | Empathy is an essential non-technical skill. If you lack it, you might be able to be successful by some metrics, but your impact will be limited and your colleagues (and friends) will constantly need to compensate for your inadequacy in this respect. Fortunately, empathy is something that can be taught and learned, and if it is determined that you are lacking in empathy, you should focus a large amount of your personal and professional development energies towards acquiring more of it. 4 | 5 | I will write more on this later. 6 | 7 | 8 | 9 | (c) Copyright Jeff Hammond, 2020. No reuse permitted except by permission from the author. 10 | -------------------------------------------------------------------------------- /Finnish_Nonalcoholic_Beer.md: -------------------------------------------------------------------------------- 1 | # Jeff rates Finnish non-alcoholic beer 2 | 3 | I do not consume ethanol, but I like beer. 4 | These are my notes on this topic, from the perspective of a resident of Finland. 5 | 6 | ### [KOFF Crisp Vaalea Lager](https://www.carlsberggroup.com/products/koff/koff-crisp-vaalea-lager) 7 | 8 | I like this one quite a bit. There is enough hops flavor and the flavor palatte seems well-balanced. 9 | The quality-control of this beer is imperfect - I can detect variation between batches. 10 | Some cans taste a bit metallic, while others are a bit skunkier than others. 11 | 12 | ### [More Brewing Lager](https://www.hartwall.fi/yritys/uutiset/2021/hartwallin-uusi-olutbrandi-more-brewing-on-alkoholittomana-syntynyt/) 13 | 14 | This one is pretty good, but not as good as Crisp. 15 | There is a lot of hops flavor in this one, but it seems like the beer is trying too hard to do this. 
16 | There are a lot of American alcoholic beers that try too hard to taste like hops, and if you like those beers, you will like this one. 17 | 18 | ### [Karjala](https://www.hartwall.fi/juomat/oluet/karjala/karjala-00/) 19 | 20 | I do not like this one at all. 21 | I do not understand what the creators were going for, but this is not a good beer. 22 | There is not even a bad American beer -- e.g. Milwaukee's Best Ice -- that I can remember being worse. 23 | 24 | ### [Tsingtao Alcohol Free Lager](https://www.alko.fi/en/products/708554/Tsingtao-Alcohol-Free-Lager/) 25 | 26 | This one was pretty good. 27 | I can't remember Tsingtao well enough to know if it is an authentic reproduction of the flavor, but it tastes like a real beer. 28 | 29 | ### [Kukko Pils](https://laitilan.com/en/tuote/kukko-pils-alcohol-free/) 30 | 31 | _Note: this one - like the others from Kukko - is 0.3%, not strictly 0.0%, so if a tiny amount of alcohol is a problem for you, avoid this one._ 32 | 33 | This one is okay. I don't like Pilsner that much, but I think it's a reasonable approximation to a real pilsner. 34 | -------------------------------------------------------------------------------- /Finnish_is_cool.md: -------------------------------------------------------------------------------- 1 | # Thoughts on the Finnish language 2 | 3 | I have been learning Finnish for approximately 600 days now. These are my thoughts on the language. 4 | I am somewhere in the vicinity of A1 proficiency, so my linguistic analysis should not be given much weight. 
5 | 6 | ## Notable differences versus English and other European languages 7 | 8 | The two most obvious differences with languages like English and German are 9 | the lack of articles and the lack of gender: 10 | * She is a Finn = Hän on Suomalainen 11 | * He is a Finn = Hän on Suomalainen 12 | 13 | This saves both words and time relative to say, German, which pointlessly assigns a gender to pencils and clocks: 14 | * the hotel, the pencil, the clock = das Hotel, der Bleistift, die Uhr 15 | * the hotel, the pencil, the clock = hotelli, kynä, kello. 16 | 17 | Not gendering third-person pronouns is both practical and inclusive. 18 | 19 | The distinction between "a" and "the" is positional: 20 | You have _the_ ticket = Lippu on sinulla 21 | I have _a_ ticket = Minulla on lippu 22 | 23 | ## Pronounciation 24 | 25 | Finnish is strictly phonetic. 26 | My friend Mikko says that a spelling bee in Finland would be a hearing test. 27 | The good news is that, if one can hear words correctly, one can write them. 28 | The bad news is that one has to listen very carefully, because every sound matters. 29 | One can turn a meeting into a murder by not making an "a" sound long enough ("tapaan" versus "tapan"). 30 | Fortunately, the crime rate in Finland is quite low and the likelihood of confusion here is unlikely. 31 | 32 | ## "no niin" 33 | 34 | This is the linguistic equivalent of "duct tape" in Finland. You can use it pretty much anywhere. 35 | Ismo explains this [here](https://youtu.be/9EWMlCusxjQ). 36 | The literal translation is "well so" but has more inflection-driven versatility than either 37 | "well" and "so" in English, although some of the usage patterns overlap. 38 | From what I've inferred, "no niin" is a standard way to answer the telephone with familiars. 
39 | 40 | -------------------------------------------------------------------------------- /Finnish_language.md: -------------------------------------------------------------------------------- 1 | # The Finnish language 2 | 3 | I've lived in Finland for approximately one year. 4 | These are my thoughts on learning Finnish, based on my experience so far. 5 | 6 | TL;DR You will not need to know any Finnish to survive in the Helsinki area, but it helps to know a few things. 7 | 8 | # Basics 9 | 10 | The two most important words in Finnish are kiitos (thank you) and anteeksi (excuse me). 11 | If you can infer what people are saying from context, 12 | you can get through a number of interactions with strangers using only "kiitos", 13 | for example, grocery shopping. 14 | 15 | When I want to work hard to communicate to a Finn that I want to speak English with them, 16 | I say "voimmeko puhua englantia?" (can we speak English?). If I am lazy, I just say, "sorry", 17 | and they switch to English most of the time. 18 | 19 | # Conversing with Finns 20 | 21 | The average Finnish person speaks English very well, although they will always 22 | underestimate their ability to do so (Finns are excessively modest). 23 | In most cases, unless it is an explicit goal to use Finnish, 24 | it is more effective to just switch to English. 25 | Switching back and forth between languages is difficult for most people, and will 26 | often confuse the listener more than speaking one language they only partially understand. 27 | 28 | If a Finn does not understand a word you use, it can be useful to replace it with a literal 29 | description of what it is, since many Finnish words are created this way 30 | ([examples](https://becomingfinnishsite.wordpress.com/2018/08/13/funny-finnish-words-with-literal-translations/)). 
31 | 32 | # Hearing and Pronouncing Words 33 | 34 | Finnish pronunciation is very straightforward - the language is perfectly phonetic - but 35 | it's obviously not identical to any other language. 36 | Pronunciation is more important in Finnish than in English, particularly since 37 | words that sound very similar can have very different meanings 38 | (e.g. [tapan versus tapaan](https://www.finnishpod101.com/blog/2020/12/11/common-finnish-mistakes/)). 39 | 40 | Unlike English, where one can be extremely casual about pronunciation and still be understood 41 | by most proficient speakers, one should try to pronounce words correctly in Finnish 42 | even if one does not understand the meaning. 43 | For example, one should learn to pronounce one's address properly, for the purposes of providing 44 | it to taxi drivers. 45 | 46 | Note that the sound of one letter and two letters is meaningful, for both vowels and consonants, 47 | as demonstrated above with tapan/tapaan. 48 | You also need to pronounce double-t and double-k properly, 49 | since these are part of two different syllables. 50 | For example, bank=pankki, which is pronounced pank-ki. 51 | All of the school materials for kids learning Finnish hyphenate the words so they learn the 52 | correct pronunciation. 53 | 54 | # Spoken versus Written Finnish 55 | 56 | The spoken language ("puhekieli") is usually a shorthand version of the 57 | formal language ("kirjakieli"). The former is geographically dependent and 58 | varies between, say, Helsinki and Pori. 59 | If you speak kirjakieli, everyone will know you are a novice speaker, but 60 | at least they will understand you. I don't see any reason to try to figure out 61 | puhekieli until you are at least a year into learning Finnish and live here. 62 | I have not devoted any time to it, but my family tells me things. 
63 | 64 | If you are going to learn puhekieli to listen to others, the only parts I've managed 65 | to learn so far are that mä=minä and sä=sinä (shorthand for first and second person 66 | singular pronouns) and that numbers get shortened from, say, yksi to yks, which 67 | is good because the written form of 99 is yhdeksänkymmentäyhdeksän, which makes for a 68 | rather tedious singing of "99 bottles of beer on the wall"... 69 | 70 | # Swearing and Rude Phrases 71 | 72 | Just say "perkele" (Devil). It's the universal swear word when things go wrong. 73 | The harder you roll your Rs, the more you mean it. 74 | 75 | There is no reason to learn how to say rude things to people in Finland, because 76 | everyone is helpful. If you want to be rude to someone, say 77 | "äitisi oli hamsteri ja isäsi haisi seljanmarjoilta." 78 | 79 | Most references to "saunan takana" (behind the sauna) are death threats, although 80 | it is unlikely the usage will be serious enough to warrant action by the listener. 81 | I'm only familiar with its use in jest or when Finns are being abusive to one another on the internet 82 | (since Facebook doesn't understand this colloquialism, it is not recognized as a Terms of Service violation). 83 | 84 | # DuoLingo 85 | 86 | Lots of people on the internet like to criticize DuoLingo. 87 | I have found it very useful, and it is certainly better than nothing. 88 | Unfortunately, the Finnish course is rather short relative to German, for example, 89 | and it does not cover critical topics like the days of the week or months of the year. 90 | On the other hand, it is good for practicing a bunch of basic concepts, such as 91 | ordering food at a restaurant and describing the elements of nature. 92 | 93 | I finished the DuoLingo Finnish course and can understand about a third of the words 94 | that I hear spoken in public, and be able to hear most of the others well enough that 95 | I can look them up in a dictionary. 
96 | 97 | Given that finding Finnish language education outside of Finland is quite difficult, 98 | if you are planning to come to Finland for a nontrivial period of time, I recommend that 99 | you devote some time to DuoLingo Finnish. 100 | 101 | # Google Translate 102 | 103 | Google Translate for Finnish is far from perfect. 104 | It knows very little about homonyms, and will falsely lead you to believe that 105 | you are eating varnish-flavored quark, because "lakka" means both varnish and cloudberry. 106 | You will also have to figure out from context whether something is a cucumber or a throat 107 | (both are "kurkku"). 108 | 109 | On the other hand, it can translate from images, which is incredibly useful when one 110 | receives a form letter or a parking ticket. I frequently use my phone to translate 111 | street signs (not while driving, of course) and other public postings. 112 | 113 | # Online Resources 114 | 115 | These are online resources I find useful: 116 | * https://uusikielemme.fi/ 117 | 118 | There are many other good resources, particularly from universities. 119 | If you are serious about learning Finnish, you'll search for these online and find them easily enough. 120 | -------------------------------------------------------------------------------- /Fortrans_Missing_Parallelism.md: -------------------------------------------------------------------------------- 1 | # Fortran's Missing Parallelism 2 | 3 | ## Summary 4 | 5 | Starting in Fortran 2008, Fortran supports two forms of parallelism: 6 | 1. `DO CONCURRENT`, which supports loop-level data parallelism. 7 | 2. coarrays, which is a form of [PGAS](https://en.wikipedia.org/wiki/Partitioned_global_address_space). 8 | 9 | This document will describe a third form of parallelism and argue that it should be supported by the Fortran language. 10 | The third form of parallelism is shared-memory task parallelism, which supports a range of use cases not easily covered by 1 and 2. 
11 | 12 | ## Background Reading 13 | 14 | The reader may wish to consult the following for additional context on this topic: 15 | * _Patterns for Parallel Programming_ by Timothy G. Mattson, Beverly Sanders and Berna Massingill 16 | * _Task Parallelism By Example_ from the Chapel Project ([Slides](https://chapel-lang.org/tutorials/SC14/SC14-4-Chapel-TaskPar.pdf)) 17 | * _OpenMP Tasking Explained_ by Ruud van der Pas ([Slides](https://openmp.org/wp-content/uploads/sc13.tasking.ruud.pdf)) 18 | * _OpenMP Tasking_ by Christian Terboven and Michael Klemm ([Slides](https://www.openmp.org/wp-content/uploads/sc15-openmp-CT-MK-tasking.pdf)) 19 | * _The Problem with Threads_ by Edward A. Lee ([Paper](https://www2.eecs.berkeley.edu/Pubs/TechRpts/2006/EECS-2006-1.pdf)) 20 | 21 | ## Motivating Example 22 | 23 | Consider the following Fortran program: 24 | ```fortran 25 | module numerot 26 | contains 27 | pure real function yksi(X) 28 | implicit none 29 | real, intent(in) :: X(100) 30 | !real, intent(out) :: R 31 | yksi = norm2(X) 32 | end function yksi 33 | pure real function kaksi(X) 34 | implicit none 35 | real, intent(in) :: X(100) 36 | kaksi = 2*norm2(X) 37 | end function kaksi 38 | pure real function kolme(X) 39 | implicit none 40 | real, intent(in) :: X(100) 41 | kolme = 3*norm2(X) 42 | end function kolme 43 | end module numerot 44 | ``` 45 | 46 | ```fortran 47 | program main 48 | use numerot 49 | implicit none 50 | real :: A(100), B(100), C(100) 51 | real :: RA, RB, RC 52 | 53 | A = 1 54 | B = 1 55 | C = 1 56 | 57 | RA = yksi(A) 58 | RB = kaksi(B) 59 | RC = kolme(C) 60 | 61 | print*,RA+RB+RC 62 | end program main 63 | ``` 64 | 65 | Assuming that `yksi`, `kaksi`, `kolme` share no state, then 66 | all three functions can execute concurrently. 67 | 68 | How would we implement this in Fortran 2018? 69 | 70 | One way is to use coarrays and assign each function to a different image. 
71 | ```fortran 72 | program main 73 | use numerot 74 | implicit none 75 | real :: A(100), B(100), C(100) 76 | real :: R 77 | 78 | A = 1 79 | B = 1 80 | C = 1 81 | 82 | if (num_images().ne.3) STOP 83 | 84 | if (this_image().eq.1) R = yksi(A) 85 | if (this_image().eq.2) R = kaksi(B) 86 | if (this_image().eq.3) R = kolme(C) 87 | 88 | SYNC ALL 89 | 90 | call co_sum(R) 91 | if (this_image().eq.1) print*,R 92 | end program main 93 | ``` 94 | While this works, this approach has many shortcomings. 95 | First, there is no way to share data directly between images - data must be explicitly copied using coarray operations. 96 | Second, images exist throughout the lifetime of the program (unless they fail) and thus the amount of parallelism 97 | is restricted to what is specified at runtime. 98 | Third, if there are many functions that can execute concurrently, 99 | many more than the number of images (which are likely to be processor cores or similar), 100 | then either the system will be oversubscribed or the user needs to implement scheduling by hand. 101 | Dynamic load-balancing is nontrivial and should not be delegated to application programmers in most cases. 102 | 103 | Another way to implement this program is to use `DO CONCURRENT`: 104 | ```fortran 105 | program main 106 | use numerot 107 | implicit none 108 | real :: A(100), B(100), C(100) 109 | real :: RA, RB, RC 110 | integer :: k 111 | 112 | A = 1 113 | B = 1 114 | C = 1 115 | 116 | do concurrent (k=1:3) 117 | 118 | if (k.eq.1) RA = yksi(A) 119 | if (k.eq.2) RB = kaksi(B) 120 | if (k.eq.3) RC = kolme(C) 121 | 122 | end do 123 | 124 | print*,RA+RB+RC 125 | end program main 126 | ``` 127 | This could work if the external functions are declared `PURE`, 128 | but `DO CONCURRENT` provides no means for dynamic load-balancing. 129 | The bigger problem is that Fortran implementations cannot agree on what form of parallelism 130 | `DO CONCURRENT` uses. Some implementations will use threads while others will use vector lanes. 
131 | The latter is going to be useless for most purposes. 132 | Finally, the above is ugly and tedious - no one wants to write code like that 133 | to execute independent tasks. 134 | 135 | ## The OpenMP/OpenACC Solution 136 | 137 | There is a proven solution for Fortran task parallelism in OpenMP (4.0 or later) or OpenACC: 138 | 139 | ```fortran 140 | program main 141 | use numerot 142 | implicit none 143 | real :: A(100), B(100), C(100) 144 | real :: RA, RB, RC 145 | 146 | A = 1 147 | B = 1 148 | C = 1 149 | 150 | !$omp parallel 151 | !$omp master 152 | 153 | !$omp task 154 | RA = yksi(A) 155 | !$omp end task 156 | 157 | !$omp task 158 | RB = kaksi(B) 159 | !$omp end task 160 | 161 | !$omp task 162 | RC = kolme(C) 163 | !$omp end task 164 | 165 | !$omp end master 166 | !$omp end parallel 167 | 168 | print*,RA+RB+RC 169 | end program main 170 | ``` 171 | 172 | ```fortran 173 | program main 174 | use numerot 175 | implicit none 176 | real :: A(100), B(100), C(100) 177 | real :: RA, RB, RC 178 | 179 | A = 1 180 | B = 1 181 | C = 1 182 | 183 | !$acc async 184 | RA = yksi(A) 185 | !$acc end async 186 | 187 | !$acc async 188 | RB = kaksi(B) 189 | !$acc end async 190 | 191 | !$acc async 192 | RC = kolme(C) 193 | !$acc end async 194 | 195 | !$acc async wait 196 | 197 | print*,RA+RB+RC 198 | end program main 199 | ``` 200 | 201 | These programs will execute regardless of the available hardware parallelism, including sequentially. 202 | OpenMP tasking is more powerful in some use cases than OpenACC, by allowing the user to create 203 | dependencies between tasks, which forces the runtime to do more work when scheduling. 204 | This feature - tasks with dependencies - is not proposed for Fortran. 205 | 206 | ## The Proposal for Fortran 207 | 208 | Because OpenMP independent tasks is implemented in essentially all of the Fortran 2008 compilers, 209 | it is reasonable to assume that the design is portable. 
210 | The goal here is to design a language feature for Fortran that is consistent with 211 | its existing semantics and syntax. 212 | 213 | We consider the `BLOCK` construct to be an appropriate starting point, because it 214 | defines a scope, and scoping data is an essential part of defining task parallelism. 215 | Because we need more than just data scoping, we use the keyword `task_block` to 216 | tell the implementation that execution concurrency is both permitted and desirable. 217 | ```fortran 218 | program main 219 | use numerot 220 | implicit none 221 | real :: A(100), B(100), C(100) 222 | real :: RA, RB, RC 223 | 224 | A = 1 225 | B = 1 226 | C = 1 227 | 228 | task_block 229 | RA = yksi(A) 230 | end task_block 231 | 232 | task_block 233 | RB = kaksi(B) 234 | end task_block 235 | 236 | task_block 237 | RC = kolme(C) 238 | end task_block 239 | 240 | task_sync all 241 | 242 | print*,RA+RB+RC 243 | end program main 244 | ``` 245 | 246 | ## Non-trivial data issues 247 | 248 | Obviously, very few programs can exploit concurrency where all data is strictly private. 249 | In `DO CONCURRENT`, locality specifiers are used to inform the implementation about 250 | whether data is shared, etc. 251 | (See [this](https://developer.nvidia.com/blog/accelerating-fortran-do-concurrent-with-gpus-and-the-nvidia-hpc-sdk/) 252 | or [this](https://software.intel.com/content/www/us/en/develop/documentation/fortran-compiler-oneapi-dev-guide-and-reference/top/language-reference/a-to-z-reference/c-to-d/do-concurrent.html) for details.) 253 | 254 | Below we modify our program as if each function used a private scratch buffer. 255 | This is not the best way to allocate X, since X could be defined inside of the 256 | `task_block` scope or inside of the external function, but this is just an illustration of the syntax. 257 | We also add T, which could be a read-only lookup table, for example. 
258 | ```fortran 259 | program main 260 | use numerot 261 | implicit none 262 | real :: A(100), B(100), C(100) 263 | real :: RA, RB, RC 264 | real :: X(10) 265 | real :: T(1000) 266 | 267 | A = 1 268 | B = 1 269 | C = 1 270 | 271 | task_block local(X) shared(T) 272 | RA = yksi(A,X) 273 | end task_block 274 | 275 | task_block local(X) shared(T) 276 | RB = kaksi(B,X) 277 | end task_block 278 | 279 | task_block local(X) shared(T) 280 | RC = kolme(C,X) 281 | end task_block 282 | 283 | task_sync all 284 | 285 | print*,RA+RB+RC 286 | end program main 287 | ``` 288 | 289 | Much like `DO CONCURRENT`, we should be able to write a fully explicit version using `default(none)`. 290 | ```fortran 291 | program main 292 | use numerot 293 | implicit none 294 | real :: A(100), B(100), C(100) 295 | real :: RA, RB, RC 296 | real :: X(10) 297 | real :: T(1000) 298 | 299 | A = 1 300 | B = 1 301 | C = 1 302 | 303 | task_block local_init(A) shared(RA) local(X) shared(T) 304 | RA = yksi(A,X) 305 | end task_block 306 | 307 | task_block local_init(B) shared(RB) local(X) shared(T) 308 | RB = kaksi(B,X) 309 | end task_block 310 | 311 | task_block local_init(C) shared(RC) shared(T) 312 | RC = kolme(C,X) 313 | end task_block 314 | 315 | task_sync all 316 | 317 | print*,RA+RB+RC 318 | end program main 319 | ``` 320 | It might make sense to have a new locality specifier, `local_final` but since there might have 321 | been a reason why that was not added for `DO CONCURRENT`, we use the `shared` specifier to the 322 | result of this function. 323 | 324 | ## Dependencies 325 | 326 | Many applications where task parallelism will be used have dependencies between tasks. 327 | For example, in our program, we can add a fourth function `nalja` that depends on 328 | `yksi` and `kaksi`. 
329 | ```fortran 330 | program main 331 | use iso_fortran_env, only : task_depend_kind 332 | use numerot 333 | implicit none 334 | real :: A(100), B(100), C(100) 335 | real :: RA, RB, RC, RD 336 | real :: X(10) 337 | real :: T(1000) 338 | type(task_depend_kind) :: DEP 339 | 340 | A = 1 341 | B = 1 342 | C = 1 343 | 344 | task_block depends_to(DEP) 345 | RA = yksi(A) 346 | end task_block 347 | 348 | task_block depends_to(DEP) 349 | RB = kaksi(B) 350 | end task_block 351 | 352 | task_block 353 | RC = kolme(C) 354 | end task_block 355 | 356 | task_block depend_from(DEP) 357 | RD = nalja(RA,RB) 358 | end task_block 359 | 360 | task_sync all 361 | 362 | print*,RC+RD 363 | end program main 364 | ``` 365 | This syntax may not be ideal but it expresses the concept. 366 | In OpenMP, dependencies are expressed in the form of memory locations. 367 | Because this might be harder to implement in some scenarios, 368 | we propose an explicit opaque type that the implementation can use. 369 | 370 | ## Known Shortcomings 371 | 372 | Fortran lacks a memory model in the way that Java, C11 and C++11 do. 373 | We do not take a position on whether that is a good or bad thing, but 374 | instead attempt to make the fewest changes required to address 375 | hazards of concurrent data access by tasks. 376 | 377 | One obvious solution for tasks is to reuse the coarray atomic operations, 378 | although this may not be acceptable to the committee. 379 | However, requiring that tasks use atomic operations to access data 380 | that may be modified by another task is a straightforward solution to these hazards. 381 | Unfortunately, the overhead of coarray atomics may be higher than acceptable 382 | for shared-memory uses, in which case a new syntax is required. 
383 | 384 | ## Acknowledgements 385 | 386 | Thanks to the following people, who read this proposal or related material and may have provided feedback: 387 | - Ondrej Certik 388 | - Jeff Larkin 389 | -------------------------------------------------------------------------------- /GitHub_Universe.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | This blog post accompanies my GitHub Universe video presentation on oneAPI (see [Embrace the accelerated, cross-architecture era - upstream and downstream](https://universe.intel.com/img/vid.mp4/) on Intel's [Universe home page](https://universe.intel.com/#OpenSource)) if you happen to come here first). I am going to continue to add stuff over the next week so if you find the current state incomplete, you might find it improves on its own. Alternatively, feel free to contact me to ask for the content you want to see. 4 | 5 | * email: it's on my GitHub [home page](https://jeffhammond.github.io/) 6 | * tweet: [science_dot](https://twitter.com/science_dot) 7 | * issue: create a GitHub issue against this repo to ask a question. 8 | 9 | # Direct links 10 | 11 | * [Data Parallel C++ Tutorial](https://github.com/jeffhammond/dpcpp-tutorial) 12 | * [Parallel Research Kernels](https://github.com/ParRes/Kernels) 13 | * [Stencil Demo](https://github.com/jeffhammond/stencil-demo) 14 | * [Intel DPC++ Compiler](https://github.com/intel/llvm/) 15 | * [oneAPI GitHub Project](https://github.com/oneapi-src/) 16 | * [oneAPI CI Examples](https://github.com/oneapi-src/oneapi-ci) 17 | * [Jeff's blog about getting oneAPI working on a Tiger Lake laptop]( 18 | https://github.com/jeffhammond/blog/blob/main/Dell_Inspiron5000_Linux.md) 19 | 20 | # Details 21 | 22 | ## Compiling DPC++ 23 | 24 | Download DPC++ from GitHub here: https://github.com/intel/llvm/. 
The most common way to download is likely the following: 25 | ```sh 26 | git clone https://github.com/intel/llvm.git dpcpp 27 | ``` 28 | 29 | ### Intel Processors 30 | 31 | You do not need to do this, but you are certainly free to compile DPC++ from source on Intel platforms. If you do not want to compile DPC++, you can just install via Linux package managers as described on [Installing Intel® oneAPI Toolkits via Linux* Package Managers](https://software.intel.com/content/www/us/en/develop/articles/oneapi-repo-instructions.html). 32 | 33 | The build for Intel processors is trivial: 34 | ```sh 35 | python ./buildbot/configure.py 36 | python ./buildbot/compile.py [-jN] 37 | ``` 38 | 39 | ### CUDA Processors 40 | 41 | The build of DPC++ for CUDA (PTX back-end) is straightforward. You should use CUDA 10.1, 11.0 or 11.1. I recall that 11.2 is not yet supported. Version 10.0 is not supported but mostly works (see below for additional comments). 42 | ```sh 43 | python ./buildbot/configure.py [--cuda] 44 | python ./buildbot/compile.py [-jN] 45 | ``` 46 | I have tested DPC++ for CUDA on P100, V100 and A100. It is possible to have problems due to various CUDA configuration issues on Linux. If you experience such issues, report them on the [DPC++ GitHub project](https://github.com/intel/llvm/). 47 | 48 | ### ARM Processors 49 | 50 | I ported DPC++ to ARM in September ([PR 2333](https://github.com/intel/llvm/pull/2333)) but unfortunately, there has been a regression in the build system that I have not yet been able to fix, so please use my branch [agx-works](https://github.com/jeffhammond/intel-llvm/tree/agx-works) for now. 51 | 52 | The ARM build is straightforward using the buildbot scripts: 53 | ```sh 54 | python ./buildbot/configure.py --arm [--cuda] 55 | python ./buildbot/compile.py [-j1] 56 | ``` 57 | 58 | If you build on an ARM+CUDA platform like Xavier AGX, you should add the `--cuda` option. 
Note that the current AGX distribution of CUDA is version 10.0, which is technically unsupported (10.1 is) and likely causes an issue with memory deallocation in some programs. I am optimistic that the upcoming refresh of the AGX software distribution will address this. 59 | 60 | If you are building a Raspberry Pi, you need to disable parallelism (`-j1`) because the memory on a Pi is insufficient to do parallel builds of LLVM. If you do not limit build parallelism, your Pi will almost become unresponsive and require power cycling. 61 | 62 | ## Tutorials and Demos 63 | 64 | TODO 65 | 66 | # Questions and Answers 67 | 68 | I'll add answers to any questions I receive. If you ask a question in a public forum, I'll cite that, otherwise I will not attribute your question unless you specifically request it. 69 | 70 | # License 71 | 72 | (c) Copyright Jeff Hammond, 2020. CC BY 4.0 license. See https://creativecommons.org/licenses/by/4.0/ for details. 73 | -------------------------------------------------------------------------------- /HPC_Numerical_Precision.md: -------------------------------------------------------------------------------- 1 | I should write this down once-and-for-all. 2 | 3 | [Double Precision Is Not Needed for Many-Body Calculations: New Conventional Wisdom](https://chemrxiv.org/engage/chemrxiv/article-details/60c73dd2567dfe7b4dec36dd) 4 | 5 | [Leveraging the bfloat16 Artificial Intelligence Datatype For Higher-Precision Computations](https://arxiv.org/abs/1904.06376) 6 | 7 | [Double-precision FPUs in High-Performance Computing: an Embarrassment of Riches?](https://arxiv.org/abs/1810.09330) 8 | -------------------------------------------------------------------------------- /In_Defense_Of_Fortran.md: -------------------------------------------------------------------------------- 1 | # In Defense of Fortran 2 | 3 | Fortran is a programming language that people love to hate. 
4 | Hating programming languages isn't rare, and most programming languages have haters, 5 | but it seems particularly popular for serious people in the software community to 6 | criticize Fortran, often in ways that are logically or factually flawed. 7 | 8 | The fact is, I don't love Fortran, certainly not like some people love Fortran. 9 | There are people who write everything in Fortran, and believe that it's the best 10 | programming language for just about everything. 11 | While I used Fortran during grad school and have worked on NWChem - 12 | a mostly pre-modern Fortran chemistry application - my entire career, 13 | I am more proficient in C and it is the default language I use when writing new code. 14 | I've also written a nontrivial amount of C++ and Python, and have spent enough 15 | time writing other languages to appreciate the diversity of design choices that exist. 16 | Thus, I think I'm qualified to write objectively about why Fortran is a good language, 17 | and certainly why some of the criticisms of it are bogus. 18 | 19 | One of the common fallacies in programming language evaluation is to argue that 20 | because one language does better at a subset of features, it is therefore an 21 | objectively better language overall. 22 | The second fallacy is to ignore the experience and goals of the programmer using 23 | the language. 24 | Most criticisms of Fortran rely heavily on at least one of these. 25 | 26 | Now let's look at some of the good and bad features of Fortran. 27 | 28 | ## Fortran doesn't have a standard library 29 | 30 | *Fortran aficionados will of course shout at their computer that I have 31 | ignored https://github.com/fortran-lang/stdlib. I have not. 32 | An open-source project isn't the same as an ISO standard, and WG5 has not 33 | ratified anything resembling the C++ STL. 
34 | Furthermore, that project is only fully supported by recent versions of 35 | two compilers, which means it's standard neither in theory nor in practice.* 36 | 37 | C++ is well-known for its standard library of containers and algorithms, 38 | and many useful primitives. 39 | On the other hand, Fortran has very few built-in algorithms, and the only 40 | data structure in Fortran is an array. 41 | This isn't too different from C, except that Fortran's strong typing and 42 | more restrictive pointer semantics make it harder to implement a 43 | linked-list than in C. 44 | 45 | Standard libraries are great, and if the C++ STL is a great match for your 46 | application or library, then you should probably use C++. 47 | This does not mean that Fortran is an inferior programming language. 48 | A large number of applications have no need for a linked-list, a dequeue, 49 | or a hash map, and when Fortran applications need something like this, 50 | they either implement it directly or call out to another language. 51 | In NWChem, we implement distributed block sparse arrays with a map, 52 | where the map data structure is a 2xN dimensional array of integers. 53 | It works and, more importantly was a lot simpler than rewriting the 54 | application in C++. 55 | 56 | On the other hand, the Fortran intrinsics that are standard are quite 57 | useful to Fortran applications. Fortran has `MATMUL`, `TRANSPOSE`, 58 | `DOT_PRODUCT`, and many other common operations for arrays, which are 59 | useful to the applications for which Fortran is used. 60 | C++20 doesn't even have proper multidimensional arrays, and it appears 61 | that the equivalent of `MATMUL` may appear in C++26. 62 | Does this mean that C++ is useless for linear algebra? 63 | Of course not, because just like Fortran programmers, C++ programmers 64 | are capable of using more than one-liner solutions. 
65 | 66 | In conclusion, if you are looking for a programming language ecosystem 67 | with lots of turnkey library routines, C++ and Python are great options. 68 | Fortran and C are not, and that's okay. 69 | 70 | ## Fortran compilers are imperfect 71 | 72 | I've used Fortran compilers from GCC, Intel, PGI->NVIDIA, Cray, IBM, 73 | Fujitsu and Pathscale over the years, and none of them are perfect. 74 | In particular, some of them are inadequate for modern Fortran code. 75 | (Modern Fortran here means Fortran 2008 and later - I'm not aware of 76 | any actively developed compiler that doesn't support Fortran 2003.) 77 | Some people have tried to claim that this means that we have to abandon 78 | Fortran for C++, which has the most amazing compiler support ever, 79 | as long as the only C++ compilers we look at are the latest releases 80 | of GCC and Clang. 81 | 82 | I find it especially rich that anyone argues that C++ is better than 83 | Fortran because the compilers are so great. 84 | They seem to forget what using C++ was like before Cray, IBM and Intel 85 | killed off their C++ front-ends in favor of Clang, which happened 86 | because all the C++ programmers were so obsessed with using the latest 87 | language features that their code only compiled with GCC and later Clang. 88 | I don't use Windows so I don't really know about MSVC, but as best I 89 | can tell, there are only two usable C++ compilers for Linux, and 90 | one of them is propelled ever forward by the collective might of 91 | companies with a market capitalization in excess of three trillion dollars, 92 | who contribute at least $100M a year in employee effort to the project. 93 | Yet, when investment of $10M to make Fortran compilers better is proposed, 94 | Fortran users get uncomfortable, and nobody seems to want to spend a 95 | dime on GCC Fortran. 96 | 97 | Compilers aren't free, and good compilers are expensive. 
98 | While it's true that the HPC community gets a free lunch from 99 | Big Tech when it comes to modern C++ support, they still have 100 | to pay for OpenMP and GPU support, the maintenance costs of their 101 | parallel C++ frameworks, and an army of people to debug the 102 | template instantiation error of the day. 103 | 104 | The Classic Flang project demonstrated that a multi-vendor open-source 105 | collaboration around Fortran is possible, just like with Clang, 106 | and while the LLVM (new) Flang project isn't finished yet, they 107 | deserve patience in the same way that Clang deserved patience when 108 | it was not yet competitive with GCC. 109 | 110 | ## Fortran is hard to teach and hard to learn 111 | 112 | This one is especially rich coming from the C++ community, a language 113 | that requires having an entire book written about [move semantics](https://www.cppmove.com/). 114 | However, everybody seems to agree that Python is easy to learn, 115 | so let's compare Numpy - the lingua franca and machine learning - and Fortran. 116 | 117 | ### 2D Stencil 118 | 119 | Below are excerpts from the [Parallel Research Kernels](https://github.com/ParRes/Kernels) (PRK) 120 | implementation of the 2D stencil, which is a common homework program in computational science. 121 | The major differences in the code shown are: 122 | 1. Fortran defaults to base-1 array indexing, whereas Python is 0-based like C. 123 | Fortran supports 0-based indexing if somebody has a hard time with the default. 124 | 2. Fortran requires `end do`, whereas Python figures this out from indentation. 125 | 126 | Other than those two trivial differences, the syntax is the same. 127 | Numpy supports the same expressive array syntax that Fortran 90 had. 128 | We can also look at PRK nstream, transpose and dgemm to see that they are also 129 | 1:1 using array expressions and math intrinsics. 
130 | 131 | ```python 132 | b = n-r 133 | for s in range(-r, r+1): 134 | for t in range(-r, r+1): 135 | B[r:b,r:b] += W[r+t,r+s] * A[r+t:b+t,r+s:b+s] 136 | ``` 137 | 138 | ```fortran 139 | b = n-r 140 | do j=-r,r 141 | do i=-r,r 142 | B(r+1:b,r+1:b) = B(r+1:b,r+1:b) + W(i,j) * A(r+i+1:b+i,r+j+1:b+j) 143 | enddo 144 | enddo 145 | ``` 146 | 147 | Of course, the big difference is in the code not shown. 148 | Fortran is strongly typed and requires everything to be declared 149 | (because we are not psychopaths and use `implicit none` everywhere), 150 | whereas Python infers types from the first usage of a variable. 151 | However, when using Numpy, it's often prudent to be somewhat explicit, 152 | e.g. `X = numpy.zeros(n,dtype=float)`, so it's unlikely that 153 | `real(kind=REAL64), allocatable :: A(:,:)` is going to be a showstopper 154 | when learning Fortran. 155 | 156 | While Fortran might require slightly more work than Numpy, they 157 | are similarly expressive when it comes to the mathematical code that matters, 158 | so it's hard to argue that Fortran can't be learned, while thousands of 159 | data scientists are learning Numpy every year. 160 | 161 | ## Memory Safety 162 | 163 | Rust is a relatively new programming language that is obsessed with safety 164 | and correctness, but manages to preserve performance in the process. 165 | The Rust community has written quite a bit about memory safety relative 166 | to C and C++, neither of which are memory safe. One can write a careful 167 | subset of C++ to avoid the flaming chainsaw juggling that is C memory 168 | management, but C++ compilers aren't going to prevent programmers from 169 | doing horrible things the way Rust does. 170 | 171 | I haven't seen anybody talk about Fortran being memory safe. It's certainly 172 | not trying to be memory safe in the way that Rust is, but it is educational 173 | to try and write a memory leak in Fortran, because it's not easy. 
174 | 175 | ## Optional and named arguments 176 | 177 | This is a case where Fortran is just nice, and should be appreciated for it. 178 | Nobody is claiming that C++ is better than Fortran because 179 | of named arguments, because C++ doesn't have named arguments. 180 | 181 | ## Object-Oriented Fortran 182 | 183 | 184 | 185 | 186 | 187 | 188 | -------------------------------------------------------------------------------- /Learning_Programming_Languages.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | Talking to computers is different than talking to humans for a bunch of reasons, but I think there are some similarities. I recently watched a video by professional language teachers about gaining language proficiency and apply those ideas to my own experiences with programming languages. 4 | 5 | # Context 6 | 7 | Professor [Jeffery Brown](https://www.linkedin.com/in/jeffery-brown-a14b8430/) has a [video](https://youtu.be/illApgaLgGA) on how to gain language proficiency. He puts a lot of emphasis on the difference between the words "acquire" and "learn", which I dislike. What he's really saying is, natural/organic/immersive language education is superior to formal, grammar-oriented language education. I agree with this, but I don't know why we can't call it "learning naturally". 8 | 9 | Some of the key ideas in Professor Brown's video are: 10 | 11 | 1. Babies learn to communicate verbally without knowing how to read and with total disregard for language rules and grammar. 12 | 2. Comprehensible language input is the key to learning how language works. 13 | 3. When beginning to learn a language, focus on listening. Listen to someone who is fluent speak. You should look for a "language parent" that serves a similar role to the parent of a baby learning a language. 14 | 4. Only study grammar when you are fluent in the language. 
15 | 16 | Professor Brown espouses the [natural approach](https://en.wikipedia.org/wiki/Natural_approach) of Krashen and Terrell, which is widely used today. I recognize the methods from my German classes in high school, which would have been a lot more effective if I had not been utterly lazy. 17 | 18 | # Learning Fortran 19 | 20 | My first experience becoming proficient in a programming language when I was an intern at PNNL in 2006 ([details](https://github.com/jeffhammond/blog/blob/main/Career_Trajectory.md#doe-csgf-and-pnnl)), when I started working on NWChem, which is written primarily in old-school Fortran (I will define this later). At the time, I had never written any Fortran and I was unable to modify existing programs, which represents 0% fluency. I was familiar with Fortran-style loops from Matlab programming, but Matlab is much simpler than Fortran for a bunch of reasons. 21 | 22 | It has been a little over 14 years, but I'm pretty sure the first thing I had to do as an intern was modify https://github.com/nwchemgit/nwchem/blob/master/src/tce/tce_energy.F, which is an 11K-line subroutine. Back then, it was probably twice as long, but I refactored it for my own sanity after a year or two of experience. The first thing I remember was trying to compile my modifications and getting an unhelpful error (compiler error messages weren't great in those days). I asked the only other person in the room at the time, and [he](https://scholar.google.com/citations?user=1w1T9HYAAAAJ&hl=en) said "you need to indent six spaces. It seemed arbitrary to me, but it worked. So the first thing I learned about Fortran was: indent six spaces. 23 | 24 | During that summer and in the years to follow, I wrote thousands of lines of Fortran. What I have never done in all my years as a programmer are (1) take a course on Fortran programming or (2) engage in any serious study of Fortran grammar. 
While I own at least three books on Fortran, none of them have been even the slightest bit useful to me. I will admit that I have referenced Fortran documentation on the internet from time to time, especially regarding formatted I/O, but the primary methods I used to become proficient in Fortran are: 25 | 26 | 1. Reading code. 27 | 2. Writing code and seeing if it (a) compiles and (b) does the thing I want it to do. 28 | 29 | I recognized that these behaviors are very similar to the language acquisition noted above. While I was recompiling NWChem with my latest modifications -- a multi-hour process until I understood the build system and header files better -- I would read the rest of the code, and a lot of other code, too. **The first key point here is that reading code is hugely important but not something I see practiced much.** Programmers love to point out everything that is wrong with code that already exists and don't seem to see the value in learning from imperfect code. Babies learn to speak the language used at home even when their parent(s) do not speak properly. NWChem is full of all kinds of Fortran but it wasn't hard to figure out the difference between good and bad style from its inherent comprehensibility. 30 | 31 | The second thing I learned to do is stop looking up answers and just answer questions experimentally. You know those Twitter polls asking "without testing it, is this code (a) correct, (b) undefined behavior, (c) blah blah, (d) show me results"? I hate them with the burning passion of [VY Canis Majoris](https://en.wikipedia.org/wiki/VY_Canis_Majoris). I guess those polls are for ISO language lawyers and compiler developers, but they are useless to me. I care about what works, and if something gives me the right answer and passes strict compiler checking and sanitizers, that's all I need to know. Over the years, I've kept a [programmer diary](https://github.com/jeffhammond/HPCInfo) of random tests I've written to see what works. 
Many of those tests involve things that aren't covered by one of those ISO-blessed APIs anyways, and when it comes to what vendors ship on exotic supercomputers, the only thing that matters is what produces the correct results (vendor documentation is rarely perfect). 32 | 33 | It turns out that this is how babies learn to speak. My kids didn't read a book to know how to ask for food. Toddlers emit semi-random sounds until they get what they want, and over time, they get better at it. They get a banana and I get `$?==0`. 34 | 35 | # Learning C 36 | 37 | Later in grad school, I tried to learn C. It turns out this is rather hard coming from Fortran, because Fortran passes everything by reference, and C passes everything by value, I spent a lot of time during my first year with C wondering why I could modify `a` with `void foo(int)`. But eventually I learned, not because I read the Kernighan and Ritchie book or [ISO/IEC 9899](http://www.iso-9899.info/wiki/The_Standard) but because I read thousands of lines of C code that I was able to determine was some form of good. For example, [PSI](https://psicode.org/) isn't perfect -- it was created by a bunch of [3+ star programmers](https://wiki.c2.com/?ThreeStarProgrammer) -- but it was a lot better than the C I knew how to write at the time, and working C is better than C that either doesn't work or doesn't exist. 38 | 39 | I've subsequently become a pretty good C programmer, by reading C code written by people who know what they are doing, and by writing lots of C and seeing how it works. I eventually read K&R, but I read it as a history book, not as a technical manual. C is a rather simple language and modern compilers are so good at issuing warnings when code is non-conforming that it's pretty easy to learn "toddler-style". 40 | 41 | # Learning Java 42 | 43 | At one point, I tried to learn Java. 
I read a book for about a day and decided that it was close enough to C++ that I could just guess-and-check using Eclipse, which was a fantastic IDE for Java. So I'd write C++ in Eclipse and it would tell me what I was doing wrong. I wrote a number of simple but interesting programs in Java this way. 44 | 45 | # Appendix 46 | 47 | This is unnecessary detail but if you are the type of person who reads my blog posts, particularly ones about computer language acquisition, you might be the type of person who cares about unnecessary details. 48 | 49 | ## What does "old-school Fortran" mean? 50 | 51 | TL;DR old-school Fortran means: 52 | 53 | 1. Fixed-source form with 72 columns. 54 | 2. Extensive use of common blocks. 55 | 3. No use of modules, interfaces, polymorphism, user-defined types and other features introduced with Fortran 90/95. 56 | 4. No use of Fortran dynamic memory allocation. Memory management is done with C and passed using sketchy methods. 57 | 5. No use of any Fortran feature that is not implemented in every relevant compiler. 58 | 59 | Steve Lionel, aka "Dr. Fortran" wrote a [blog post](https://stevelionel.com/drfortran/2020/05/16/doctor-fortran-in-military-strength/) that elaborates on a comment I made about the non-existence of actual Fortran 77 codes, which is something I picked up from Jeff Squyres in the MPI Forum during our many discussions of the MPI Fortran bindings (interfaces). 60 | 61 | In the case of NWChem, the aversion to Fortran memory management is not just because that feature was added in Fortran 90. The distributed memory programming model of NWChem, Global Arrays, relies heavily upon interprocess shared memory and one-sided communication, both of which require special memory allocation procedures that are done in C. To make these efficient, NWChem allocates a large slab at program start and suballocates from that using a stack allocator (explicit push+pop). 
This enforces a programmer discipline and makes memory leaks less likely (failure to pop in reverse order of push generates a runtime error). 62 | 63 | (c) Copyright Jeff Hammond, 2020. No reuse permitted except by permission from the author. 64 | -------------------------------------------------------------------------------- /MPI_Needs_ABI.md: -------------------------------------------------------------------------------- 1 | # It's past time for MPI to have a standard ABI 2 | 3 | ## Introduction 4 | 5 | [MPI](https://www.mpi-forum.org/) has always been an API standard. 6 | Implementations are not constrained in how they define opaque types (e.g. `MPI_Comm`), 7 | which means they compile into different binary representations. 8 | This is fine for users who only use one implementation, or are content to recompile their software for each of these. 9 | Many users, including those building both traditional C/C++/Fortran libraries and new languages that use MPI via the C ABI, 10 | are tired of the duplication of effort required because MPI lacks a standard ABI. 11 | 12 | Definitions: 13 | - API = Application Programming Interface, i.e. the signature `MPI_Barrier(MPI_Comm)`. 14 | - ABI = Application Binary Interface, i.e. the binary representation of `MPI_Comm` in memory (e.g. `int` versus a pointer). 15 | 16 | ## Motivating example 17 | 18 | The MPI Forum has often espoused the view that MPI is designed for building libraries. 19 | Many of the APIs in the MPI standard are specifically designed for this purpose, and don't have much use in other contexts, 20 | since attaching state to MPI communications, for example, isn't necessary if the code that calls MPI is monolithic; 21 | in that case, such state can be managed directly by the application. 
22 | 23 | Let's consider a trivial library that implements a dot product using MPI: 24 | ```c 25 | // Please ignore the lack of support for vectors longer than `MPI_INT` or 26 | // error handling - this code is not intended to be used in production. 27 | double parallel_ddot(MPI_Comm comm, const double * x, const double * y, int length) 28 | { 29 | double z = 0.0; 30 | for (int i=0; i libmkl_blacs_intelmpi_ilp64.so.1 73 | lrwxrwxrwx 1 root 31 Mar 25 04:50 /opt/intel/oneapi/mkl/2021.2.0/lib/intel64/libmkl_blacs_intelmpi_lp64.so -> libmkl_blacs_intelmpi_lp64.so.1 74 | lrwxrwxrwx 1 root 31 Mar 25 04:50 /opt/intel/oneapi/mkl/2021.2.0/lib/intel64/libmkl_blacs_openmpi_ilp64.so -> libmkl_blacs_openmpi_ilp64.so.1 75 | lrwxrwxrwx 1 root 30 Mar 25 04:50 /opt/intel/oneapi/mkl/2021.2.0/lib/intel64/libmkl_blacs_openmpi_lp64.so -> libmkl_blacs_openmpi_lp64.so.1 76 | lrwxrwxrwx 1 root 30 Mar 25 04:50 /opt/intel/oneapi/mkl/2021.2.0/lib/intel64/libmkl_blacs_sgimpt_ilp64.so -> libmkl_blacs_sgimpt_ilp64.so.1 77 | lrwxrwxrwx 1 root 29 Mar 25 04:50 /opt/intel/oneapi/mkl/2021.2.0/lib/intel64/libmkl_blacs_sgimpt_lp64.so -> libmkl_blacs_sgimpt_lp64.so.1 78 | ``` 79 | There is one BLACS build for each of MPICH, Open-MPI and SGI MPT, plus one build for each flavor of Fortran ABI. 80 | The Fortran ABI issue is similar but not one we are going to solve in the MPI Forum. 81 | Plus, the Fortran standard experts will explain that this issue is the result of improper use of Fortran 82 | compilers and can be avoided just by using features that already exist in the Fortran standard. 83 | 84 | ## The language use case 85 | 86 | C/C++ and Fortran applications aren't the only consumers of MPI. 87 | Because of MPI's rich capability for multiprocessing, and standard nature, many developers 88 | would like to use MPI from [Python](https://www.python.org/), 89 | [Julia](https://julialang.org/), [Rust](https://www.rust-lang.org/), etc. 90 | 91 | How does one do this? 
Because MPI implementations are all written in C, any language 92 | can call MPI via its own mechanism for calling C ABI symbols, which they all have 93 | due to needing to interact with the Linux operating system, etc. 94 | However, unlike e.g. `malloc`, which has a constant ABI on Linux, these languages 95 | need to know the binary representation of all of the MPI types to call those symbols. 96 | 97 | What this means is that the effort to build and test these MPI wrappers is O(N). 98 | 99 | We see this clearly in the Rust MPI project, [rsmpi](https://github.com/rsmpi/rsmpi), 100 | which reports testing against three different implementations, plus untested user experiences 101 | with a fourth: 102 | > rsmpi is currently tested with these implementations: 103 | > 104 | > * OpenMPI 3.0.4, 3.1.4, 4.0.1 105 | > * MPICH 3.3, 3.2.1 106 | > * MS-MPI (Windows) 10.0.0 107 | > 108 | > Users have also had success with these MPI implementations, but they are not tested in CI: 109 | > 110 | > * Spectrum MPI 10.3.0.1 111 | 112 | They furthermore allude to the O(N) effort here: 113 | 114 | > Since the MPI standard leaves some details of the C API unspecified (e.g. whether to implement certain constants and even functions using preprocessor macros or native C constructs, the details of most types, ...) rsmpi takes a two step approach to generating functional low-level bindings. 115 | > 116 | > First, it uses a thin static library written in C (see rsmpi.h and rsmpi.c) that tries to capture the underspecified identifiers and re-exports them with a fixed C API. This library is built from build.rs using the gcc crate. 117 | > 118 | > Second, to generate FFI definitions tailored to each MPI implementation, rsmpi uses rust-bindgen which needs libclang. See the bindgen project page for more information. 119 | > 120 | > Furthermore, rsmpi uses the libffi crate which installs the native libffi which depends on certain build tools. See the libffi project page for more information. 
121 | 122 | The [libffi](https://en.wikipedia.org/wiki/Libffi) project is used by many projects to call C libraries, so we can expect this pain to reappear over and over. 123 | 124 | We see the same duplication of testing effort in [mpi4py](https://github.com/mpi4py/mpi4py/). 125 | The project's [Azure pipelines](https://github.com/mpi4py/mpi4py/blob/master/.azure/pipelines.yml) show 126 | tests for each of four different versions of Python on Linux, MacOS and Windows, where Linux and MacOS 127 | testing is doubled for MPICH and Open-MPI. 128 | It is possible to argue that projects should test against multiple implementations even if there is only 129 | one MPI ABI, but it's not obvious that this testing should be exhaustive in the way it is today, 130 | or that the hunt for implementation-specific bugs needs to be done in automated CI/CD environments 131 | running in shared-memory instances in the cloud. 132 | 133 | ## How do we solve this problem? 134 | 135 | The first step in solving any problem is to admit that there is a problem. 136 | This is nontrivial in this case, because many in the MPI Forum, especially the implementers, 137 | believe that implementation freedom w.r.t. ABI is a feature, not a defect. 138 | Implementations will often argue that their ABI is the best design, which obviously creates 139 | some irreconcilable differences with other implementations, plus at least the MPICH ABI camp 140 | will argue that, even if their ABI isn't perfect, it's stability is an essential feature of 141 | the MPI ecosystem, and the cost of changing it is too great. 142 | 143 | As the argument goes on, there will be arguments about how compile-time constants allow 144 | lower latency than link-time constants, because of the potential for one cache miss or 145 | one branch prediction on the critical path. 146 | If these performance arguments are valid, we should be able to see the impact experimentally. 
147 | Hemal Shah, Moshe Voloshin, and Devesh Sharma measured MPI latency of MVAPICH2 versus Open-MPI 148 | and presented at [MUG20](http://mug.mvapich.cse.ohio-state.edu/mug/20/). 149 | 150 | mv-vs-ompi 151 | 152 | If we attribute the entire difference between the two libraries to the ABI choice, 153 | then it is a very small effect, on the order of 100 nanoseconds, out of the 2500+ nanoseconds 154 | required to send a small message. 155 | See the [full presentation](http://mug.mvapich.cse.ohio-state.edu/static/media/mug/presentations/20/sharma-mug-20.pdf) for details. 156 | 157 | Perhaps that 100 nanoseconds is due to a cache miss when `MPI_Send` in 158 | Open-MPI dereferences `struct ompi_communicator_t *MPI_Comm`, but it could also be a cache miss 159 | in the guts due to how these libraries represent state that isn't user-visible, 160 | or perhaps it is just the aggregate cost of a few dozen instructions and handful of branches 161 | that MVAPICH2 lacks versus Open-MPI. 162 | The question is whether the MPI user community cares more about these 100 nanoseconds 163 | versus the hours and days it takes humans to build the MPI software ecosystem twice, 164 | three times, or more, because of the lack of a standard ABI. 165 | 166 | Once the MPI Forum accepts that this is a problem, we can have all the 167 | arguments about what the right standard ABI is, but we cannot use any of those 168 | arguments to get in the way of deciding that there is, in fact, a problem to solve. 169 | It is the MPI Forum's responsibility to act on behalf of its users, in order 170 | to remain relevant. 171 | 172 | ## Relevant prior art / best practice 173 | 174 | A very useful principle of the MPI Forum is that it does not standardize research, but practice. 
175 | In the case of ABI compatibility, the HPC user community at [CEA](http://www-hpc.cea.fr/) has found the ABI 176 | issue to be sufficiently obnoxious that they built [wi4mpi](https://github.com/cea-hpc/wi4mpi) 177 | specifically to solve this problem. CEA has been shipping this software since 2016, although 178 | the original license prevented its widespread use until now. 179 | 180 | A similar effort was undertaken at the Perimeter Institute for Theoretical Physics, 181 | leading to [MPItrampoline](https://github.com/eschnett/MPItrampoline), which defines 182 | its own ABI that can be redirected to many different MPI implementations, thereby 183 | solving the O(N) problem for applications. 184 | 185 | It is also known that there is a patent on one particular method of interoperating different 186 | MPI ABIs that prevents its use by the open-source community, but which demonstrates that 187 | at least one company recognized the value of solving this problem important enough to patent it. 188 | Sadly, the patent holder only managed to prevent others from solving the problem in open-source -- 189 | they haven't actually bothered to implement the solution in a commercial product. 190 | 191 | ## Conclusion 192 | 193 | Every rational person in high-performance computing will admit that people are the most valuable 194 | component in our ecosystem. 195 | Furthermore, because MPI is a successful standard, there are hundreds of times more people 196 | using MPI and building software against it than there are implementing it. 197 | It is past time for the MPI Forum to prioritize the needs of its user community over the 198 | needs of its implementers, or the dubious claims of performance overhead due to pointer chasing. 199 | 200 | We need to pay the price of breaking all the ABIs one more time, in order to free our users from the pain 201 | of O(N) build and test of all the code that depends on MPI. 
202 | 203 | ![MPI-ABI](https://user-images.githubusercontent.com/406118/127174807-1cd9676a-eb8b-40d6-8da3-b154121e8182.jpg) 204 | 205 | Modern processors are incredibly good at pointer chasing, branch prediction, etc. 206 | We need to let those processors do their jobs and stop prematurely optimizing for something 207 | that isn't even a proven bottleneck on processors built in the past decade. 208 | 209 | Furthermore, we need MPI implementers to get over their petty design arguments about whose 210 | ABI is superior to the other's and put users first. 211 | 212 | I added [MPI ABI Technical Details](MPI_Needs_ABI_Part_2.md) for people who are convinced this is 213 | a great idea and want to start thinking about what a good ABI might look like. 214 | 215 | ## Acknowledgements 216 | 217 | Thanks to Gonzalo Brito and Jim Dinan for an inspiring discussion on Slack. 218 | 219 | ## Disclaimer and license 220 | 221 | The opinions expressed in this post are exclusively the author's and not those of his 222 | current and past co-workers, co-authors, friends or family members. 223 | 224 | (c) Copyright Jeff Hammond, 2021. No reuse permitted except by permission from the author. 225 | -------------------------------------------------------------------------------- /MPI_Needs_ABI_Part_2.md: -------------------------------------------------------------------------------- 1 | # MPI ABI Technical Details 2 | 3 | This is a follow-up to [It's past time for MPI to have a standard ABI](MPI_Needs_ABI.md), 4 | which includes more technical details about how MPI ABIs work. 5 | 6 | As noted in the first article, the first step in solving a problem is admitting that there 7 | is one, so one should not look at this post unless one has already committed to solving 8 | the problem. 9 | 10 | ## Overview of MPI ABIs 11 | 12 | There are multiple aspects to an MPI ABI. Here are a few: 13 | 14 | - The `MPI_Status` object. 
This is an object with transparent members, and MPI requires it to have specific fields. 15 | - Opaque objects, including `MPI_Comm`, `MPI_Datatype`, `MPI_Win`, etc. 16 | As these are opaque, implementations can define them to be anything. 17 | 18 | [MPI-4](https://www.mpi-forum.org/docs/mpi-4.0/mpi40-report.pdf) 19 | imposes the following constraints on opaque objects: 20 | 21 | > All named constants, with the exceptions noted below for Fortran, can be used in 22 | > initialization expressions or assignments, but not necessarily in array declarations or as 23 | > labels in C `switch` or Fortran `select`/`case` statements. This implies named constants 24 | > to be link-time but not necessarily compile-time constants. The named constants listed 25 | > below are required to be compile-time constants in both C and Fortran. These constants 26 | > do not change values during execution. Opaque objects accessed by constant handles are 27 | > defined and do not change value between MPI initialization (MPI_INIT) and MPI completion 28 | > (MPI_FINALIZE). The handles themselves are constants and can be also used in initialization 29 | > expressions or assignments 30 | 31 | We will see below that MPICH has elected to provide compile-time constants, even though 32 | they are not required. This allows the implementation source code to do some things efficiently, 33 | although portable applications cannot rely on this behavior. 34 | 35 | ## The `MPI_Status` object 36 | 37 | Let's look at three different implementations of the `MPI_Status` object: 38 | 39 | ### New MPICH 40 | 41 | This is the status object after [this commit](https://github.com/pmodels/mpich/commit/4b516e886aa3aa51379e0c3806c911c9333c2cc3), 42 | which made MPICH consistent with Intel MPI, in order to establish the [MPICH ABI initiative](https://www.mpich.org/abi/). 43 | This meant that applications and libraries compiled against Intel MPI could be run using many implementations. 
44 | 45 | ```c 46 | typedef struct MPI_Status { 47 | int count_lo; 48 | int count_hi_and_cancelled; 49 | int MPI_SOURCE; 50 | int MPI_TAG; 51 | int MPI_ERROR; 52 | } MPI_Status; 53 | ``` 54 | 55 | ### Old MPICH 56 | 57 | Prior to being consistent with Intel MPI, MPICH had the following status object. 58 | 59 | ```c 60 | // dnl EXTRA_STATUS_DECL - Any extra declarations that the device 61 | // dnl needs added to the definition of MPI_Status. 62 | ... 63 | typedef struct MPI_Status { 64 | int MPI_SOURCE; 65 | int MPI_TAG; 66 | int MPI_ERROR; 67 | MPI_Count count; 68 | int cancelled; 69 | int abi_slush_fund[2]; 70 | @EXTRA_STATUS_DECL@ 71 | } MPI_Status; 72 | ``` 73 | 74 | ### Open-MPI 75 | 76 | This is from Open-MPI as of [65bb9e6](https://github.com/open-mpi/ompi/blob/65bb9e6b4cffd1cafa23f73b2faf7817c5323ab8/ompi/include/mpi.h.in). 77 | I have not attempted to track the history of the Open-MPI status object. 78 | 79 | ```c 80 | typedef struct ompi_status_public_t MPI_Status; 81 | ... 82 | struct ompi_status_public_t { 83 | /* These fields are publicly defined in the MPI specification. 84 | User applications may freely read from these fields. */ 85 | int MPI_SOURCE; 86 | int MPI_TAG; 87 | int MPI_ERROR; 88 | /* The following two fields are internal to the Open MPI 89 | implementation and should not be accessed by MPI applications. 90 | They are subject to change at any time. These are not the 91 | droids you're looking for. */ 92 | int _cancelled; 93 | size_t _ucount; 94 | }; 95 | typedef struct ompi_status_public_t ompi_status_public_t; 96 | ``` 97 | 98 | The wi4mpi ABI for the status object is the same as Open-MPI's: 99 | ```c 100 | struct CCC_mpi_status_struct { 101 | /* These fields are publicly defined in the MPI specification. 102 | User applications may freely read from these fields. 
*/ 103 | int MPI_SOURCE; 104 | int MPI_TAG; 105 | int MPI_ERROR; 106 | /* The following two fields are internal to the Open MPI 107 | implementation and should not be accessed by MPI applications. 108 | They are subject to change at any time. These are not the 109 | droids you're looking for. */ 110 | int _cancelled; 111 | size_t _ucount; 112 | }; 113 | typedef struct CCC_mpi_status_struct MPI_Status; 114 | ``` 115 | 116 | ### Analysis 117 | 118 | We see here that all variants have the required fields, `MPI_SOURCE`, `MPI_TAG` and `MPI_ERROR`, 119 | and the old MPICH ABI matched the Open-MPI ABI in having both a dedicated `int` field for the cancelled 120 | state plus a count field that supports at least 63b values. 121 | 122 | Apparently, the Intel MPI team decided to save 32 bits of space in their status object and distribute 123 | 63 bits of count and 1 bit of cancelled boolean across two `int` fields, plus they eliminated the ABI 124 | slush fund that would have allowed MPICH to adapt to future changes in the MPI standard that would 125 | have required new fields in the status object. 126 | 127 | There isn't anything wrong with the Intel MPI ABI (aka new MPICH ABI). 128 | Testing the cancelled field involves testing a single bit rather than a 32b field, 129 | but since very few MPI programs cancel receives (and cancelling sends has been deprecated), 130 | the relative costs of these does not matter at all. 131 | The needs of the request object seem to be relatively stable over time, and in hindsight it seems 132 | like the ABI slush might have been unnecessarily conservative. 133 | 134 | In any case, it seems like either the new MPICH or Open-MPI ABI would be fine for standardization. 135 | Some will argue that Open-MPI wastes 31 bits, but perhaps those bits can be used for other things 136 | in some implementations. As this state isn't user-visible it doesn't matter how implementations use 137 | it, as long as they use it consistently. 
138 | 139 | If I was going to standardize an ABI for the status object, I'd put the public fields first and use 140 | 24 bytes total, which is sufficient for what both of the major ABIs do right now. 141 | I'm not aware of any architectural advantage of the 20 bytes Intel MPI uses. 142 | One could be conservative and round up to 32 bytes, which has some architectural advantages, 143 | since many modern CPUs have 256-bit data paths. 144 | ```c 145 | typedef struct MPI_Status { 146 | int MPI_SOURCE; 147 | int MPI_TAG; 148 | int MPI_ERROR; 149 | int extra[3]; 150 | } MPI_Status; 151 | ``` 152 | 153 | ## MPI datatypes 154 | 155 | MPI datatypes are opaque objects, which means implementations can represent them however they want. 156 | Here we see different philosophies in MPICH and Open-MPI. 157 | 158 | ### MPICH 159 | 160 | MPICH's [mpi.h](https://github.com/pmodels/mpich/blob/main/src/include/mpi.h.in) contains the following: 161 | ```c 162 | typedef int MPI_Datatype; 163 | #define MPI_CHAR ((MPI_Datatype)0x4c000101) 164 | #define MPI_SIGNED_CHAR ((MPI_Datatype)0x4c000118) 165 | #define MPI_UNSIGNED_CHAR ((MPI_Datatype)0x4c000102) 166 | #define MPI_BYTE ((MPI_Datatype)0x4c00010d) 167 | #define MPI_WCHAR ((MPI_Datatype)0x4c00040e) 168 | #define MPI_SHORT ((MPI_Datatype)0x4c000203) 169 | #define MPI_UNSIGNED_SHORT ((MPI_Datatype)0x4c000204) 170 | #define MPI_INT ((MPI_Datatype)0x4c000405) 171 | #define MPI_UNSIGNED ((MPI_Datatype)0x4c000406) 172 | #define MPI_LONG ((MPI_Datatype)0x4c000807) 173 | #define MPI_UNSIGNED_LONG ((MPI_Datatype)0x4c000808) 174 | #define MPI_FLOAT ((MPI_Datatype)0x4c00040a) 175 | #define MPI_DOUBLE ((MPI_Datatype)0x4c00080b) 176 | #define MPI_LONG_DOUBLE ((MPI_Datatype)0x4c00080c) 177 | #define MPI_LONG_LONG_INT ((MPI_Datatype)0x4c000809) 178 | ``` 179 | These values are obviously special, but how? 
180 | One feature is that they encode the size of built-in datatypes 181 | such that these can be queried trivially with this macro: 182 | ```c 183 | #define MPIR_Datatype_get_basic_size(a) (((a)&0x0000ff00)>>8) 184 | ``` 185 | There are a bunch of other macros that take advantage of the 186 | hidden structure of the `MPI_Datatype` handle that the reader 187 | can study in [mpir_datatype.h](https://github.com/pmodels/mpich/blob/main/src/include/mpir_datatype.h) 188 | 189 | ### Open-MPI 190 | 191 | Open-MPI's [mpi.h](https://github.com/open-mpi/ompi/blob/master/ompi/include/mpi.h.in) 192 | defines the datatype handle to be a pointer, which means that built-in datatypes 193 | cannot be compile-time constants, although they are link-time constants, which ends 194 | up being similarly efficient with modern toolchains, for most purposes. 195 | ```c 196 | typedef struct ompi_datatype_t *MPI_Datatype; 197 | ... 198 | /* C datatypes */ 199 | #define MPI_DATATYPE_NULL OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_datatype_null) 200 | #define MPI_BYTE OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_byte) 201 | #define MPI_PACKED OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_packed) 202 | #define MPI_CHAR OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_char) 203 | #define MPI_SHORT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_short) 204 | #define MPI_INT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_int) 205 | #define MPI_LONG OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_long) 206 | #define MPI_FLOAT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_float) 207 | #define MPI_DOUBLE OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_double) 208 | ... 
209 | ``` 210 | 211 | In contrast to MPICH, Open-MPI has to lookup the size of the datatype 212 | inside of a [352-byte `struct`](https://github.com/open-mpi/ompi/blob/master/opal/datatype/opal_datatype.h#L145), 213 | which is not a concerning overhead 214 | since the type of MPI code that will notice such an overhead is going 215 | to pass the same datatype over and over, in which case, the CPU is going 216 | to cache and correctly branch-predict the lookup and associated usage 217 | every time. 218 | ``` 219 | static inline int32_t opal_datatype_type_size(const opal_datatype_t *pData, size_t *size) 220 | { 221 | *size = pData->size; 222 | return 0; 223 | } 224 | ``` 225 | 226 | ### wi4mpi 227 | 228 | wi4mpi defines all the opaque handles to be `size_t`, which ensures they are at 229 | least as big as MPICH's `int` handles and Open-MPI's pointer handles, 230 | although I don't know if this is the reason. 231 | ``` 232 | typedef size_t MPI_Comm; 233 | typedef size_t MPI_Datatype; 234 | typedef size_t MPI_Errhandler; 235 | typedef size_t MPI_File; 236 | typedef size_t MPI_Group; 237 | typedef size_t MPI_Info; 238 | typedef size_t MPI_Op; 239 | typedef size_t MPI_Request; 240 | typedef size_t MPI_Message; 241 | typedef size_t MPI_Win; 242 | ``` 243 | 244 | wi4mpi defines the built-in datatypes to be sequential integers, 245 | which means they are not attempting to encode useful information 246 | the way MPICH's do, although they are compile-time constants, 247 | unlike Open-MPI's. 248 | I do not know if compile-time constancy is important in wi4mpi. 249 | ```c 250 | /* C datatypes */ 251 | #define MPI_DATATYPE_NULL 0 252 | #define MPI_BYTE 1 253 | #define MPI_PACKED 2 254 | #define MPI_CHAR 3 255 | #define MPI_SHORT 4 256 | #define MPI_INT 5 257 | #define MPI_LONG 6 258 | #define MPI_FLOAT 7 259 | #define MPI_DOUBLE 8 260 | ``` 261 | 262 | ### Analysis 263 | 264 | There are advantages to both approaches. 
MPICH optimizes for the common case of built-in types, 265 | and does a lookup for others, while Open-MPI always does a pointer lookup, but then has what 266 | it needs in both cases. 267 | 268 | The other advantage of the MPICH approach is with Fortran. In Fortran, handles are `INTEGER`, 269 | or handles are a type with a single member that is an `INTEGER`. MPICH conversions between 270 | C and Fortran are trivial (ignoring the case where Fortran `INTEGER` is larger than C `int`, 271 | which is a terrible idea anyways). Open-MPI has to maintain a lookup table to go between 272 | C and Fortran. 273 | 274 | The easy solution here is to use `intptr_t` for handles and change the Fortran 2008 handle 275 | definition to use `intptr_t` for `MPI_VAL`. This allows for trivial conversions between 276 | C and Fortran 2008, for MPICH to continue to use magic values for built-ins, and for Open-MPI 277 | to use pointers. Open-MPI will still need a lookup table for the older Fortran interfaces, 278 | but one of these should be [deprecated](https://github.com/mpi-forum/mpi-issues/issues/561) 279 | in MPI-5 anyways. 280 | 281 | Update: we cannot just make `MPI_VAL` a different type from `INTEGER` because the standard 282 | says that `MPI_F08` handles must be interoperable with the raw handles in the legacy API. 283 | We could rely on truncation if the values of handles were always less than `INT_MAX`, but 284 | this is dangerous and won't happen. 285 | 286 | ## Disclaimer and license 287 | 288 | The opinions expressed in this post are exclusively the author's and not those of his current and past co-workers, co-authors, friends or family members. 289 | 290 | (c) Copyright Jeff Hammond, 2021. No reuse permitted except by permission from the author. 
291 | -------------------------------------------------------------------------------- /MPI_Needs_ABI_Part_3.md: -------------------------------------------------------------------------------- 1 | # MPI ABI Part 3: Launchers 2 | 3 | This is a follow-up to [It's past time for MPI to have a standard ABI](MPI_Needs_ABI.md) 4 | and [MPI ABI Technical Details](MPI_Needs_ABI_Part_2.md). 5 | 6 | ## What's a launcher? 7 | 8 | A launcher is how one causes MPI programs to start running. 9 | It is one of the least specified aspects of MPI, for various reasons. 10 | In MPI 4.1 11.5 "Portable MPI Process Startup", a syntax for a launcher 11 | named `mpiexec` is suggested, and how one uses it for SPMD and MPMD cases. 12 | However, even though `mpiexec` is standardized, many users use the command 13 | `mpirun` or some other platform- or implementation-specific launcher. 14 | For example, on machines with [Slurm](https://slurm.schedmd.com/documentation.html), 15 | `srun` is often a recommended command. 16 | 17 | Regardless of what command is actually used to launch an MPI parallel job, the 18 | interesting parts from an implementation compatibility perspective are unseen. 19 | Using a variety of system commands, the launcher needs to broadcast the binary 20 | and its inputs to every compute node, and execute that binary on every node at least 21 | once. Once all these programs are running, they need to figure out that they need 22 | to connect to each other, no later than in `MPI_Init`. Standard output needs 23 | to be handled properly, which might involve forwarding to the node on which the job 24 | was launched. And finally, while no one wants their MPI programs to fail, if 25 | they do, the launcher needs to clean up the mess and make sure there are no 26 | zombie processes or file handles to clog up the system for subsequent users. 
27 | 28 | One way to do this, which is roughly how MPICH Hydra does things, is 29 | to spawn a proxy on every node that will manage everything within its node. 30 | In this respect, it is a parent that takes its MPI program children outside 31 | and has to clean up after them when they make a mess. For example, if you 32 | SIGKILL an MPI process, you probably want the whole ensemble to go down, 33 | rather than have N-1 processes running along forever until they deadlock. 34 | 35 | ## How to not standardize launchers 36 | 37 | Last time I looked, there are more launchers than there are MPI ABIs, 38 | so standardizing a launcher is at least as hard as getting everyone to agree 39 | on an ABI. However, it may not be necessary, so let's try this: 40 | 41 | In the case of Slurm or other launchers associated not with MPI 42 | implementations but with resource managers, there is no problem. 43 | If the MPI library is compiled with Slurm support, then it knows how 44 | to wire-up inside of `MPI_Init` based on environment variables that Slurm 45 | defines. Both Open-MPI and MPICH support all of the major schedulers. 46 | 47 | For users who expect to use `mpirun` or `mpiexec`, a hack is to 48 | figure out what launcher the program expects and then invoke it. 49 | In this design, `mpiexec` can be a shell script that calls `strings` 50 | or some other introspection method on the binary and figures out if 51 | it's MPICH or Open-MPI or Intel MPI or MVAPICH2, and then calls 52 | the implementation specific `mpiexec`. This is not an elegant 53 | method but it probably works for a lot of users, and isn't any worse 54 | than the mess we have right now. 55 | 56 | ## Disclaimer and license 57 | 58 | The opinions expressed in this post are exclusively the author's and not those of his current and past co-workers, co-authors, friends or family members. 59 | 60 | (c) Copyright Jeff Hammond, 2021. No reuse permitted except by permission from the author. 
61 | -------------------------------------------------------------------------------- /MPI_Needs_ABI_Part_4.md: -------------------------------------------------------------------------------- 1 | # MPI ABI Design Ideas and Challenges 2 | 3 | This is not intended to be useful to others right now, but is a note-taking space for me... 4 | 5 | 6 | ## Context 7 | 8 | This is important: 9 | 10 | > All named constants, with the exceptions noted below for Fortran, can be used in initialization expressions or assignments, 11 | > but not necessarily in array declarations or as labels in C switch or Fortran select/case statements. 12 | > This implies named constants to be link-time but not necessarily compile-time constants. 13 | > The named constants listed below are required to be compile-time constants in both C and Fortran. 14 | > These constants do not change values during execution. Opaque objects accessed by constant handles are defined 15 | > and do not change value between MPI initialization (MPI_INIT) and MPI completion (MPI_FINALIZE). 16 | > The handles themselves are constants and can be also used in initialization expressions or assignments. 17 | 18 | ## Ideas 19 | 20 | MPICH handles are `int`. Open-MPI handles are pointers. 21 | We could standardize handles to be `intptr_t` so that both designs are valid, but we can do better. 22 | One issue with both approaches is the lack of type checking. 23 | For example, [this bug](https://github.com/ParRes/Kernels/commit/ee5e5fb09019bd78325d9680cd93f52858812aa4) 24 | existed for years because the developers only tested with MPICH-based implementations. 25 | More generally, C compilers have no way to distinguish between different `typedef`-to-`int` handles, 26 | and thus in calls where it is possible to transpose handles, compilers may struggle to detect these mistakes 27 | with some implementations (e.g. MPICH). 
28 | 29 | We can learn from the Fortran 2008 design here, and make handles a C `struct`, which contains 30 | a single value, `intptr_t`. This allows C compilers to check handles for type-correctness, 31 | but adds no overhead, because there is no overhead to accessing the first element 32 | of a struct. 33 | 34 | Furthermore, since the exact same type can be defined in Fortran 2003, we can eliminate 35 | handle conversion functions altogether. 36 | Handle conversions will remain required for `use mpi` (`mpif.h` should be deleted in MPI-5) 37 | but that's a necessary evil for legacy Fortran users. 38 | 39 | Today, handle conversion overhead is nontrivial in operations like `MPI_Waitall`, because 40 | a temporary vector must be allocated (unless the implementation "cheats" in some way). 41 | The proposed ABI definition of handles will eliminate this. 42 | 43 | ### C handles 44 | 45 | This is how a handle should be defined: 46 | ```c 47 | typedef union { 48 | intptr_t val; 49 | // below is implementation defined, and can be anything 50 | // as long as it does not change the layout implied by intptr_t 51 | #ifdef IMPLEMENTATION_HAS_POINTER_HANDLES 52 | void * our_pointer; 53 | #endif 54 | #ifdef IMPLEMENTATION_HAS_INT_HANDLES 55 | int our_integer; 56 | #endif 57 | } MPI_Handle; 58 | ``` 59 | The name of the member of the `union` does not matter, because users should not access them. 60 | There is not a lot of value in obfuscating the contents, and some of the methods for doing 61 | that make type checking impossible. 62 | Having type checking for well-behaved users is far more important than trying to prevent 63 | users who want to violate the standard from writing illegal code. 
64 | 65 | ### Fortran handles 66 | 67 | We should change this: 68 | ```fortran 69 | type, bind(C) :: MPI_Handle 70 | integer :: MPI_VAL 71 | end type MPI_Handle 72 | ``` 73 | to this 74 | ```fortran 75 | type, bind(C) :: MPI_Handle 76 | integer(kind=c_intptr_t) :: MPI_VAL 77 | end type MPI_Handle 78 | ``` 79 | at which point all of the C-Fortran handle interoperability stuff becomes irrelevant. 80 | 81 | Right now, Fortran handle conversions are trivial with MPICH but not trivial with Open-MPI. 82 | No implemenation will have overhead with the MPI-5 ABI. 83 | 84 | Alternatively, if we don't change the Fortran ABI, having the C ABI makes it easy to write 85 | a standalone Fortran 2008 module, which can have a better Fortran ABI. 86 | 87 | ## Challenges 88 | 89 | ## `MPI_BSEND_OVERHEAD` 90 | 91 | > The MPI constant MPI_BSEND_OVERHEAD provides an upper bound on the additional space consumed by the entry 92 | > (e.g., for pointers or envelope information). 93 | 94 | This is implementation-specific. We need to agree on an upper-bound so that it can be standardized. 95 | 96 | ## Other compile-time constants 97 | 98 | Right now, all we say about thread levels is: 99 | > These values are monotonic; i.e., MPI_THREAD_SINGLE < MPI_THREAD_FUNNELED < MPI_THREAD_SERIALIZED < MPI_THREAD_MULTIPLE. 100 | MPICH defines them in a very logical way. There is no reason not to standardize this, or something similar. 101 | ``` 102 | MPI_THREAD_SINGLE = 0 103 | MPI_THREAD_FUNNELED = 1 104 | MPI_THREAD_SERIALIZED = 2 105 | MPI_THREAD_MULTIPLE = 3 106 | ``` 107 | 108 | There are no rules for how these can be defined, but again, we have to pick something to standardize. 109 | ``` 110 | MPI_IDENT = 0 111 | MPI_CONGRUENT = 1 112 | MPI_SIMILAR = 2 113 | MPI_UNEQUAL = 3 114 | ``` 115 | 116 | ## String-related constants 117 | 118 | We need to decide on an upper-bound for these, which are currently implementation-specific. 
119 | ``` 120 | MPI_MAX_PROCESSOR_NAME 121 | MPI_MAX_LIBRARY_VERSION_STRING 122 | MPI_MAX_ERROR_STRING 123 | MPI_MAX_DATAREP_STRING 124 | MPI_MAX_INFO_KEY 125 | MPI_MAX_INFO_VAL 126 | MPI_MAX_OBJECT_NAME 127 | MPI_MAX_PORT_NAME 128 | ``` 129 | 130 | ## Other constants 131 | 132 | These are also compile-time constants: 133 | ``` 134 | MPI_VERSION 135 | MPI_SUBVERSION 136 | MPI_F_STATUS_SIZE (C only) 137 | MPI_STATUS_SIZE (Fortran only) 138 | MPI_ADDRESS_KIND (Fortran only) 139 | MPI_COUNT_KIND (Fortran only) 140 | MPI_INTEGER_KIND (Fortran only) 141 | MPI_OFFSET_KIND (Fortran only) 142 | ``` 143 | 144 | `MPI_VERSION` and `MPI_SUBVERSION` remain specified based on the library features, at compile-time. 145 | Users can use `MPI_Get_version` to verify consistency with run-time support. 146 | 147 | `MPI_F_STATUS_SIZE` and `MPI_STATUS_SIZE` are fixed as soon as the ABI of `MPI_Status` is defined. 148 | 149 | `MPI_*_KIND` follow from standardization of the associated C types. 150 | 151 | ## Fortran compiler support 152 | 153 | These depend on the Fortran compiler, and how the library deals with `CFI_cdesc_t`. 154 | These should be deprecated and replaced with run-time queries, if possible, although 155 | some applications may need to be able to rely on them at compile-time. 156 | ``` 157 | MPI_SUBARRAYS_SUPPORTED (Fortran only) 158 | MPI_ASYNC_PROTECTS_NONBLOCKING (Fortran only) 159 | ``` 160 | These features are associated with Fortran 2018 support, and should be widely supported 161 | by the time we are going to vote on an ABI anyways. 162 | It made sense to make them optional in 2012, but by 2024, they should be required. 163 | 164 | ## Disclaimer and license 165 | 166 | The opinions expressed in this post are exclusively the author's and not those of his 167 | current and past co-workers, co-authors, friends or family members. 168 | 169 | (c) Copyright Jeff Hammond, 2021. No reuse permitted except by permission from the author. 
170 | -------------------------------------------------------------------------------- /MPI_Needs_ABI_Part_5.md: -------------------------------------------------------------------------------- 1 | # Will we ever need 128-bit offsets? 2 | 3 | One of the challenges that has come up in the course of designing the MPI standard ABI is how to define the 4 | MPI integer types. For reference, these are: 5 | - `MPI_Aint` (address integer, but sometimes gets used in other ways) 6 | - `MPI_Offset` (file offset integer) 7 | - `MPI_Count` (large count integer, but also used to hold both of the former) 8 | - `MPI_Fint` (Fortran integer, i.e. the C type equivalent to Fortran's default `INTEGER`) 9 | 10 | It is straightforward to see that `MPI_Aint` should be C `intptr_t`, because this is the 11 | only C type guaranteed to be interconvertible with a pointer (and is signed, since we 12 | can't use unsigned types because those don't exist in Fortran). 13 | 14 | Right now, on both 32- and 64-bit platforms (meaning those with 32- or 64-bit addressing, i.e. pointers) 15 | filesystems are usually 64-bits. The [LFS](https://en.wikipedia.org/wiki/Large-file_support) initiative 16 | was created to allow larger than 2 GiB files on 32-bit platforms. 17 | 18 | There have been efforts to define 128-bit filesystems, but this post is going to explain why MPI 19 | does not need to have 128-bit offsets even if the underlying filesystem uses 128-bit offsets. 20 | MPI file offsets are for a single file, so it is not a question of how large the underlying 21 | filesystem is, but the size and extent of a single MPI file that matters here. 22 | 23 | The limit of a 64-bit offset, or rather a 63-bit offset if we are dealing with signed integers, 24 | is 2^63 = 8 * (1024)^6 = >8 billion gigabytes. Let's look at what it would take to exceed this 25 | limit, in terms of money, time and energy. 
26 | 27 | # Money 28 | 29 | According to https://diskprices.com/, a reasonable quality disk drive costs approximately $10/TB, 30 | so if one bought a filesystem to store exactly one file that requires more than a 63-bit offset, 31 | it's going to cost at least $90M, and that is making very aggressive assumptions about how much 32 | extra gear is required to connect more than a million multi-terabyte drives together such that 33 | they actually work as a filesystem that will hold such a file. 34 | 35 | Obviously, as with everything in computing, prices go down over time, until physical limits are 36 | reached. Based on https://ourworldindata.org/grapher/historical-cost-of-computer-memory-and-storage, 37 | there was a very nice exponential decrease in storage cost between 1990 and 2010, but since then the 38 | exponent has changed and prices have decreased only ~3x in the past 10 years. 39 | Let's assume that trend continues, which means storage costs will go down 10x every 20 years. 40 | That means some time in the 2040s, one might be able to build a filesystem for a single >8 EiB file 41 | that costs around $10M. This is more than most HPC systems cost today... 42 | 43 | The cost problem is much worse once we observe that the $10/TB pricing is for the _cheapest_ 44 | large-capacity storage media available, not for the fast media required to satisfy the time 45 | requirements established below. High-end storage media is at least 3x more expensive than 46 | 7200 RPM drives. That 3x means we need to add another decade in order to have our exafile 47 | cost anywhere near $10M. 48 | 49 | # Time 50 | 51 | Right now, the mean time to interrupt (MTTI) on a large supercomputer is less than a day 52 | and not expected to improve any time soon. Let's assume that we need to write our >8 EiB file 53 | in less than a day. That's already pretty unreasonable, but it gives a way to establish some 54 | conservative bounds. 
What sort of I/O bandwidth is required to write such a file in a day? 55 | One day is 86,400 seconds. At an I/O speed of 1 TB/s, we can write 86.4 PB/day. 56 | We would need an aggregate I/O speed of 100 TB/s to write the 8.64 EB/day required to populate 57 | the gigantic file in question. 58 | 59 | One current I/O standard is PCIe 5.0, which supports 64 GB/s (unidirectional). 60 | PCIe 6.0 is [expected](https://www.theverge.com/2022/1/12/22879732/pcie-6-0-final-specification-bandwidth-speeds) 61 | to support twice that, i.e. 128 GB/s, with x16. It seems we need around 1000 I/O devices 62 | to drive this filesystem, but we will probably use a lot more than 1000 drives 63 | to store the exafile, so the speed limit is the storage media, not the I/O into it. 64 | 65 | Right now, state-of-the-art storage media supports approximately 10 GB/s of write 66 | bandwidth. That bandwidth goes down as an SSD gets full, as it will when writing 67 | the exafile, but let's ignore that. Right now, the best SSDs max out PCIE with only x4, 68 | and it's likely that will only change for the better. If PCIe doubles a few more times 69 | in the next decade, which is optimistic, then we might expect to be able to write 70 | at 256 GB/s to the most expensive SSDs in the future. 71 | This means that we might be able to write the exafile in less than a day, 72 | assuming we can build a filesystem with thousands of drives and the interconnect 73 | required to move the bytes where they need to go. 74 | 75 | # Energy 76 | 77 | Writing to storage costs approximately 1 nanojoule/bit according to 78 | [this](http://large.stanford.edu/courses/2018/ph240/jiang2/). 79 | Our exafile requires writing 2^66 bits, or approximately 74 gigajoules. 80 | A watt is defined to be a joule per second. So we expend 74 gigajoules 81 | in 86400 seconds, which is 854 kW. That's a lot of power but not 82 | unreasonable for data center operations, where large systems routinely 83 | require many megawatts. 
84 | 85 | It doesn't look like power is the limit to storing the exafile. 86 | 87 | # Conclusion 88 | 89 | I am not an expert at these things, but it certainly seems like we have 90 | approximately 30 years before anybody is going to be able to afford 91 | to write a file that requires `MPI_Offset` to be more than 63 bits. 92 | 93 | Right now, OLCF's [Orion filesystem](https://community.hpe.com/t5/servers-systems-the-right/meet-the-world-s-largest-and-fastest-parallel-file-system/ba-p/7155645) 94 | has a capacity of 690 PB and an aggregate bandwidth of 10 TB/s. 95 | The most aggressive use case for this filesystem is: 96 | 97 | > Each of simulations consisted of 2 trillion particles and more than 1,000 steps. 98 | > The data generated by ONE simulation could total a whopping 200 PB, 99 | 100 | which one can reasonably assume requires approximately 1000 files of 101 | 200 TB each, which is consistent with our assumptions. 102 | In theory, users could insist on using exactly one file, in which 103 | case they are within 40x of the 63-bit limit, but 40x more data 104 | depends on at least 40x more simulation, which is likely three 105 | generations of supercomputers away, or more than a decade away. 106 | 107 | # Disclaimer and license 108 | 109 | The opinions expressed in this post are exclusively the author's and not those of his current and past co-workers, co-authors, friends or family members. 110 | 111 | (c) Copyright Jeff Hammond, 2023. No reuse permitted except by permission from the author. 112 | -------------------------------------------------------------------------------- /MPI_Needs_ABI_Part_6.md: -------------------------------------------------------------------------------- 1 | # MPI Fortran ABI Challenges and Solutions 2 | 3 | This article focuses on the Fortran aspects of the MPI ABI. 
4 | The first thing to note is that the Fortran language and compiler ecosystem 5 | does not allow for a standard ABI in the general sense, because Fortran 6 | modules and calling conventions are not standard and known to differ -- 7 | often significantly -- between implementations. 8 | When we discuss the MPI Fortran ABI, we are only referring to the parts 9 | of the MPI C ABI that interact with Fortran. 10 | 11 | # Handle conversion and `MPI_Fint` 12 | 13 | The most obvious interaction between Fortran and the MPI C ABI is `MPI_Fint` 14 | and functions that use it. Sadly, it is allowed for Fortran compilers to 15 | change the size of the type `INTEGER` using compiler flags (e.g., `-i8`), 16 | hence the `MPI_Fint` type in C code has to know what Fortran compiler flags 17 | were used. Thus, there is no way to make this type definition part of the 18 | MPI C ABI, and therefore all of the `MPI__{f2c,c2f}` functions are 19 | ill-defined. 20 | 21 | One solution to the `MPI_Fint` problem would be to define it to C `int` 22 | and disallow MPI Fortran support from using an `INTEGER` that is not 23 | equivalent. Instead, we will add new functions that are nearly identical 24 | to f2c/c2f that do not depend on the Fortran compiler. These functions 25 | are `MPI__{toint,fromint}`. 26 | As long as Fortran `INTEGER` is not smaller than C `int`, which is true 27 | in all reasonable environments, these functions can be used to implement 28 | f2c/c2f in the MPI Fortran API. 29 | 30 | Why bother with these functions at all? Are they strictly necessary? 31 | Once we have the MPI C ABI, the handle types will be known to be C pointers, 32 | which could be stored in Fortran via CFI (C-Fortran interoperability). 33 | Unfortunately, all of the MPI Fortran API assumes handles are `INTEGER`, 34 | or a type that contains an `INTEGER`. 
35 | 36 | In an implementation of the MPI Fortran API, 37 | such as [Vapaa](https://github.com/jeffhammond/vapaa), 38 | it is necessary to convert handles from Fortran to and from C quickly. 39 | For predefined handles, the MPI ABI makes this trivial, since all the 40 | constant values are small and one can cast with truncation. 41 | For user handles, the forward conversion (from Fortran to C) is often 42 | on the critical path, which can be implemented using an array of handles. 43 | However, one of the most performance critical parts of MPI will involve 44 | the back conversion from C to Fortran of requests. It is possible to 45 | implement this mapping in Vapaa but not easy to do efficiently. 46 | It is expected that implementations can provide a more efficient implementation. 47 | 48 | Because the C status object (`MPI_Status`) is now fully specified, no 49 | new conversion functions are required. 50 | 51 | # Fortran types and their MPI datatypes 52 | 53 | When we call a reduction from Fortran with, e.g., `MPI_REAL`, 54 | a user-defined callback will get a datatype argument. 55 | If the C implementation of MPI doesn't know what `MPI_REAL` is, 56 | it's going to detect this as an invalid datatype. 57 | We can't just work around this by translating Fortran types 58 | to C types and passing `MPI_FLOAT`, because this means that 59 | datatype logic inside of user callbacks written in Fortran will 60 | not work. 61 | Thus, the MPI C implementation needs to know that `MPI_REAL` is 62 | valid and to preserve it throughout the program. 63 | At the same time, it also needs to know how to implement built-in 64 | reductions and other features correctly. 65 | 66 | The solution to this problem is a function to inform MPI of the 67 | C equivalents of all Fortran types. This way, the MPI library can 68 | implement `(MPI_REAL,MPI_SUM)` reductions with `MPI_FLOAT` and get 69 | native performance. 
Otherwise, a library like Vapaa would have to 70 | implement all the built-in reductions manually, which is not optimal. 71 | 72 | The other issue here is that MPI may need to implement logical reductions 73 | like `MPI_LAND`, `MPI_LOR` and `MPI_LXOR` in C. This requires it to know 74 | how Fortran `LOGICAL` works. For historical reasons, going back to the 75 | VAX platform, Fortran `LOGICAL` may not behave like C. It may, for example, 76 | use the sign bit to represent booleans, rather than 0 and non-zero. 77 | Even if 0 is `.FALSE.`, `.TRUE.` could be 1 or `0xFFFFFFFF`. 78 | 79 | As before, we need a function to tell MPI what the literal values of 80 | Fortran `.TRUE.` and `.FALSE.` are. 81 | 82 | To illustrate the previous case: 83 | 84 | Fortran Compiler|Flags|`.FALSE.`|`.TRUE.` 85 | ---|---|---|--- 86 | GCC | | 0 | 1 87 | IFX | | 0 | -1 (`0xFFFFFFFF`) 88 | IFX | `-fpscomp logicals` | 0 | 1 89 | NVHPC | | 0 | -1 (`0xFFFFFFFF`) 90 | LLVM 14 (`flang-new`) | | 0 | -1 (`0xFFFFFFFF`) 91 | LLVM 20 (`flang-new`) | | 0 | 1 92 | Cray | | 0 | 1 93 | 94 | I did not probe when in the LLVM version history the values of logical changed to match C. 95 | 96 | Here's more fun data: 97 | 98 | Fortran Compiler|Flags| 0 | 1 | -1 | 2 99 | ---|---|---|---|---|--- 100 | GCC | | F & ! T | T & ! F | T & ! T | T & ! T 101 | IFX | | F & ! T | T & ! F | T & ! F | F & ! T 102 | IFX | `-fpscomp logicals` | F & ! T | T & ! F | T & ! F | T & ! F 103 | NVHPC | | F & ! T | T & ! F | T & ! F | F & ! T 104 | LLVM 14 (`flang-new`) | | F & ! T | T & ! F | T & ! F | F & ! T 105 | 106 | I didn't run this test for LLVM 20 or Cray. 107 | 108 | # Sentinels 109 | 110 | There was a request to provide addresses in C for all of 111 | the Fortran sentinels, not just `MPI_F(08)_STATUS(ES)_IGNORE`. 112 | We declined to solve this, because it's easy to implement directly 113 | in user code in the rare cases (profiling tools) where it is needed.
114 | 115 | See https://github.com/jeffhammond/vapaa/blob/main/source/detect_sentinels.c 116 | and https://github.com/jeffhammond/vapaa/blob/main/source/detect_sentinels_c.F90. 117 | 118 | # Callbacks 119 | 120 | This one is more difficult, and is not part of the current proposal for the standard. 121 | There are multiple use cases, not just Fortran, for extended callbacks that have 122 | extra state associated therewith, the way `MPI_Grequest_start` does. 123 | This allows the language interface to attach language-specific information 124 | about types or error-handling to the callback, so that it can be implemented 125 | more efficiently or in a more idiomatic way. 126 | 127 | A new reduction callback was proposed to address this, but there was too much debate 128 | about its semantics to get it into the first version of the ABI. 129 | Specifically, should the user state be mutable or not, and if so, how 130 | is it protected from concurrent access (race conditions)? 131 | 132 | This doesn't mean the problem cannot be solved. It merely requires callback 133 | trampolines, as are implemented in 134 | [Mukautuva](https://github.com/jeffhammond/mukautuva) and 135 | [MPITrampoline](https://github.com/eschnett/MPItrampoline), 136 | at some added cost. 137 | However, since user-defined operations and callbacks are rarely 138 | on the critical path, this situation is tolerable. 139 | 140 | We intend to fix the callback situation in a future revision of MPI. 141 | 142 | # Module ABIs 143 | 144 | The internal structure of a Fortran module appears to leak into the symbol names. 145 | This means that a design like MPICH's 146 | ```fortran 147 | MODULE MPI 148 | USE MPI_CONSTANTS 149 | USE MPI_SIZEOFS 150 | USE MPI_BASE 151 | USE PMPI_BASE 152 | END MODULE MPI 153 | ``` 154 | may not be compatible with another implementation's module, if it uses different 155 | names internally.
156 | 157 | It is not yet proven that this is a problem, but if it is, then the only way 158 | to get an MPI Fortran module ABI is to specify the internal structure. 159 | Ironically, the terrible `mpif.h` doesn't have this problem, because it doesn't 160 | use modules at all. 161 | -------------------------------------------------------------------------------- /MPI_Needs_ABI_Part_7.md: -------------------------------------------------------------------------------- 1 | # MPI ABI Status Report 2 | 3 | As of December 2024, the MPI ABI proposals, one for C support 4 | and one for partial Fortran support, have passed the 5 | [first vote](https://www.mpi-forum.org/meetings/2024/12/votes) 6 | by the MPI Forum. 7 | 8 | There will be a second vote followed by a vote to approve the entire 9 | release of the standard containing the ABI, but it is unlikely 10 | that future votes will fail, given the lack of substantial objections 11 | to the current content of the proposal. 12 | 13 | As noted in the previous blog 14 | ([part 6](https://github.com/jeffhammond/blog/blob/main/MPI_Needs_ABI_Part_6.md)) 15 | there is still work left to be done to solve the Fortran problem. 16 | In order to get these parts right, we need to wait until the first phase 17 | of the ABI is done and then implement standalone Fortran bindings on 18 | top of it, as [Vapaa](https://github.com/jeffhammond/vapaa/) is doing. 19 | So far, Vapaa is focused only on `MPI_F08` support, but we will also 20 | prototype the legacy API to verify that nothing is lost if the 21 | MPI Fortran API is implemented separately from the MPI C API. 
22 | -------------------------------------------------------------------------------- /MPI_RMA_HOWTO.md: 1 | 2 | -------------------------------------------------------------------------------- /Mentoring_Interns.md: 1 | # Summary 2 | 3 | I've supervised more than twenty interns, all of whom were successful as interns and are -- as best I can tell -- successful in their careers. This post summarizes my approach to mentoring, which boils down to (1) hire people then find projects that suit them, not the other way around, and (2) do whatever it takes to make your interns successful, whether that means leaving them alone or sitting with them an hour or two a day. 4 | 5 | # Hiring 6 | 7 | I have hired 100% of the students who wanted to work for me or who were recommended to me, along with a number of others who I recruited because they had skills and interests that were aligned with my research activities. As I have never declined to hire an applicant, even an unofficial one, I have no idea how to do intern selection. What I can offer is my experience that everyone who wants to contribute can do so, given the right support, and it's up to the mentor to provide this. If you are not up to the challenge of making your interns successful no matter what, maybe you aren't ready to be a mentor. 8 | 9 | # Selecting a project 10 | 11 | It helps to have a rough idea of something you and your intern will do together (and various administrative functions often require it), but the details should be late binding and you should always be open to changing directions if circumstances require. I'll give an example of this later (it's in [Summer of 2011](Summer_of_2011.md)).
12 | 13 | # Defining success 14 | 15 | I applied ["Failure is not an option"](https://en.wikipedia.org/wiki/Failure_Is_Not_an_Option) to every one of my interns. I had a wonderful experience as an intern at PNNL (see [this](https://github.com/jeffhammond/blog/blob/main/Career_Trajectory.md) for details) and a lot of people were incredibly generous with their time to make that possible, and I chose to hold myself to a similar standard as a mentor. 16 | 17 | The grading criteria I used for interns were as follows: 18 | 19 | 1) Minimum success criteria. Whatever we need to accomplish for the internship to not be a complete waste of time. It needs to be something that is entirely within the skills the intern has already and can be achieved with straightforward effort, possibly requiring significant investment from the mentor. Nobody wants to be a failure, and succeeding at something straightforward gives people the confidence to take on bigger challenges. 20 | 21 | 2) Very good results. This is a pretty standard success criterion, such as publishing a peer-reviewed manuscript to which the intern contributed significantly. It should be substantial enough that everyone will recognize the intern's contribution. 22 | 23 | 3) Intergalactic science god. Yes, I actually used those words in some cases. I found that it was useful to provide an aspirational target for my interns that would keep them going if they managed to hit level 2 relatively quickly. More importantly, I wanted them to know that I had a lot of confidence in their abilities and felt that there was a chance to accomplish something so profound that it would make them famous in the scientific community. It's important to have big dreams, even when you're an intern. 24 | 25 | I didn't enumerate these criteria to every intern, but I had them in mind when I was designing their projects.
I'm not sure how other mentors do it, but I like to think that it's somewhat novel to think about 1 and 3, and work really hard to make sure that every intern gets to 1 as quickly as possible, and hopefully to 2. 26 | 27 | And if you are wondering, yes, some of my students hit level 3. The work some of them did, often with very little help from me, changed their field of science and they are deservedly famous for it. At one point in my career, I spent a lot of time working on tensor-related things, but I found that some of my former interns are so much better at this topic than I am that I stopped working on it and found new areas to which I can contribute. 28 | 29 | (c) Copyright Jeff Hammond, 2020. No reuse permitted except by permission from the author. 30 | -------------------------------------------------------------------------------- /README.md: 1 | # About this project 2 | 3 | This is my blog. I did not feel like learning a new system and I like GitHub a lot, so I figured I could just add my writing here and people could figure out how to read it if they want. 4 | 5 | ## Permissions 6 | 7 | I assert copyright on all of my original content here, unless noted otherwise. I don't see why anyone would want to reuse my writing as their own, but if you want to publish something I've created elsewhere, please request permission via a GitHub issue. 8 | 9 | Blog posts that are documenting technical information will probably use a different license. Please check the license at the bottom of every post to be sure about the license. 10 | 11 | # About me 12 | 13 | I am a nerd. I have a life outside of work that revolves around my family that I do not discuss in detail online. 14 | 15 | ## Professional biography 16 | 17 | Jeff Hammond is a Principal Engineer at NVIDIA where he works on HPC software for GPUs and ARM CPUs.
His research interests include parallel programming models and system architecture. Previously, Jeff worked at Intel and the Argonne Leadership Computing Facility where he worked on a range of projects, including MPI-3, oneAPI, Blue Gene and Xeon Phi. Jeff received his PhD in Chemistry from the University of Chicago for work on NWChem. 18 | -------------------------------------------------------------------------------- /Reading_list.md: -------------------------------------------------------------------------------- 1 | These are books I've read or am reading and which I have enjoyed. 2 | 3 | # Currently Reading 4 | 5 | [Focus – The ASML way](https://focus-dewereldvanasml.nl/) by Marc Hijink. 6 | 7 | Putin's People by Catherine Belton. 8 | 9 | A Line in the Sand by James Barr. 10 | 11 | The Key to Nuclear Restraint by Thomas Jonter. 12 | 13 | Dark Sun: The Making of the Hydrogen Bomb, by Richard Rhodes. 14 | 15 | # Planning to Read 16 | 17 | A Concise History of Finland by Kirby. 18 | 19 | Legacy of Ashes by Tim Weiner. 20 | 21 | Lost Kingdom by Serghii Plokhy. 22 | 23 | The Gates of Europe by Serghii Plokhy. 24 | 25 | The Ascent of Money by Niall Ferguson. 26 | 27 | The Battle of Bretton Woods by Steil. 28 | 29 | The Origins of Totalitarianism by Hannah Arendt. 30 | 31 | Sunburst and Luminary by Don Eyles. 32 | 33 | # Have Read 34 | 35 | The Time Machine by H. G. Wells. 36 | 37 | Command and Control by Eric Schlosser. 38 | 39 | [The Demon-Haunted World: Science as a Candle in the Dark](https://en.m.wikipedia.org/wiki/The_Demon-Haunted_World) by Carl Sagan. 40 | 41 | [The Illustrated Man](https://en.wikipedia.org/wiki/The_Illustrated_Man) by Ray Bradbury. 42 | 43 | Chip War by Chris Miller. Outstanding book on the current and past state of semiconductors. 44 | 45 | The Right Stuff by Tom Wolfe. 46 | 47 | The NVIDIA Way by Tae Kim. 48 | 49 | [The Strangest Man: The Hidden Life of Paul Dirac, Quantum Genius](https://en.wikipedia.org/wiki/The_Strangest_Man). 
50 | 51 | [What If?: Serious Scientific Answers to Absurd Hypothetical Questions](https://en.wikipedia.org/wiki/What_If%3F_(book)). 52 | 53 | The Last Empire by Serghii Plokhy. 54 | 55 | [American Prometheus](https://en.wikipedia.org/wiki/American_Prometheus). 56 | 57 | [The Making of the Atomic Bomb by Richard Rhodes](https://en.wikipedia.org/wiki/The_Making_of_the_Atomic_Bomb). 58 | 59 | How Not to Network A Nation by Benjamin Peters. 60 | I stopped reading this one because it was too slow. The topic could be covered with far fewer words. 61 | 62 | Never Split the Difference by Chris Voss. 63 | 64 | [The Great Influenza](https://www.penguinrandomhouse.com/books/288950/the-great-influenza-by-john-m-barry/) by John M. Barry. 65 | I enjoyed this book a great deal. The first third is about the development of modern medicine in the USA, not influenza specifically. 66 | 67 | Abstracting Away the Machine by Lorenzo. 68 | 69 | [Modern Fortran Explained: Incorporating Fortran 2018](https://academic.oup.com/book/26799) by Michael Metcalf, John Reid, Malcolm Cohen. 70 | This is an excellent book on the topic. If you are serious Fortran programmer, you should own it. 71 | 72 | [Concurrent and Real-Time Programming in Ada](https://doi.org/10.1017/CBO9780511611230) by Alan Burns and Andy Wellings. 73 | I did not want to read this book and didn't enjoy it, partly because of the writing and partly because of the subject matter. 74 | 75 | A History of Finland by Henrik Meinander. 76 | 77 | Finland's Relations with the Soviet Union, 1944-84 by R Allison. 78 | 79 | The Soul of a New Machine by Tracy Kidder. (I guess I didn't finish this one) 80 | 81 | Inside the Message Passing Interface by Supalov. 82 | 83 | [Cybernetics](https://en.wikipedia.org/wiki/Cybernetics:_Or_Control_and_Communication_in_the_Animal_and_the_Machine) by Norbert Wiener. 84 | 85 | [One Giant Leap](https://www.simonandschuster.com/books/One-Giant-Leap/Charles-Fishman/9781501106309) by Charles Fishman. 
86 | This is a great sociopolitical perspective on the Apollo Program. 87 | 88 | [The Supermen](https://www.wiley.com/en-ie/The+Supermen:+The+Story+of+Seymour+Cray+and+the+Technical+Wizards+Behind+the+Supercomputer-p-9780471048855) by Charles J. Murray. 89 | 90 | Ogilvy on Advertising by David Ogilvy. 91 | 92 | The Decline and Fall of IBM by Robert X. Cringely. 93 | 94 | Only the Paranoid Survive by Andy Grove. 95 | 96 | The Innovator's Dilemma by Clayton Christensen. 97 | 98 | Far from the Tree by Andrew Solomon. 99 | 100 | The Information: A History, A Theory, A Flood by James Gleick. 101 | 102 | [The Media Lab](https://www.media.mit.edu/publications/the-media-lab-inventing-the-future-book/) by Stewart Brand. 103 | I love this book, because it's from 1986 and I can see how well they predicted things, 104 | like high-speed internet at home and mobile computing devices. 105 | 106 | [Electrifying America](https://mitpress.mit.edu/9780262140485/) by David Nye. 107 | This one is probably too dry for most people, but I enjoyed it. 108 | I stopped reading 80-90% of the way through, because it was pretty clear that the story was 109 | going to end with most Americans having electricity 😊 110 | 111 | Working on the Bomb: An Oral History of WWII Hanford. By S. L. Sanger. 112 | This was a gift when I left PNNL. It's a great take on the Manhattan Project from a different perspective 113 | than one gets from the Los Alamos centered works. 114 | 115 | [The Puzzle Palace](https://en.wikipedia.org/wiki/The_Puzzle_Palace) by James Bamford. 116 | 117 | [The Nobel Prize](https://www.simonandschuster.com/books/The-Nobel-Prize/Burton-Feldman/9781611457247) by Burton Feldman. 118 | 119 | [Fermat's Enigma](https://en.wikipedia.org/wiki/Fermat%27s_Last_Theorem_(book)) by Simon Singh. 120 | 121 | [Blank Spots on the Map](https://www.penguinrandomhouse.com/books/302310/blank-spots-on-the-map-by-trevor-paglen/) by Trevor Paglen.
122 | 123 | [Expert C Programming: Deep C Secrets](https://www.oreilly.com/library/view/expert-c-programming/0131774298/) by P. van der Linden. 124 | It has been a while since I read this book, but I enjoyed it years ago. 125 | 126 | Who Got Einstein's Office by Regis. 127 | 128 | Genius by James Gleick. 129 | 130 | Fermat's Enigma by Simon Singh. 131 | 132 | How Would You Move Mount Fuji by William Poundstone. 133 | -------------------------------------------------------------------------------- /Summer_of_2011.md: 1 | This was originally part of [Mentoring Interns](Mentoring_Interns.md) and some of the CSGF references are explained in [Career Trajectory](Career_Trajectory.md), although I will eventually write a post dedicated to CSGF. 2 | 3 | # The Summer of 2011 4 | 5 | I've been asked about this before, so I might as well write it down here. In the summer of 2010, I had started talking to Devin Matthews and Edgar Solomonik about doing their DOE-CSGF practica (i.e. internships) with me at Argonne. At the time, Edgar was a student of Jim Demmel's at Berkeley, working on efficient algorithms for matrix computations. Devin was a chemistry graduate student at Texas with John Stanton, working on efficient formulations of coupled-cluster theory and their applications to spectroscopy. 6 | 7 | Edgar and I came up with a plan to do something related to dynamic load-balancing in quantum chemistry, which was closer to his undergraduate research with Sanjay Kale in the Charm++ group. The idea was to add better task scheduling to [MPQC](https://mpqc.org/), which was a C++ code that did some of the same things as NWChem, but was a lot simpler to get working on Blue Gene/P.
8 | 9 | Devin's project was going to be something that would expose him to more modern programming methods, since his work at Texas involved [CFOUR](http://www.cfour.de/), which is the closest thing to a genuine Fortran 77 code that I have ever seen. Given Devin's experience with CFOUR, I figured we'd learn Fortran 95 together, and do something interesting along the way. 10 | 11 | As it turned out, neither of these project ideas worked out. After a week or two, Edgar found that he couldn't get past a bug in the IBM C++ compiler for Blue Gene/P, and given the rate at which those got fixed (2-3 months), we'd lose the entire summer waiting to merely compile the code. Around the same time, Devin decided that Fortran 95 dynamic memory management was just too stupid and he was not going to continue with modern Fortran. (I think Edgar put Devin onto C++ -- Devin is now one of the most talented C++ programmers I know, although he's still fluent in Fortran 77 for CFOUR purposes). 12 | 13 | The other thing that was happening around that time -- this was May -- was that my wife was about to give birth to our first child and I was planning to be absent from the office for a while. Knowing that I was not going to be available to help as much as I normally would and the absence of any promising leads on either of their planned activities, I decided that the prudent thing was to come up with something new that they could do together while I was away. 14 | 15 | The problem that I had been chewing on for a few years was how to do distributed tensor contractions efficiently, particularly on Blue Gene systems. This meant using MPI collectives, not one-sided communication (while Blue Gene/P and /Q were really good at one-sided communication, they were utterly magical when it came to MPI collectives). This project was closer to both of their dissertation projects and would not have been approved by the CSGF program stewards as a planned activity, but our plans were not going well.
16 | 17 | We met in a conference room around the first week of June and I sketched out the problem statement. We agreed it was a worthwhile thing to try to solve, and it was ideally suited for the two of them, since Edgar was a master of distributed linear algebra and Devin was a master of tensors in the context of coupled-cluster theory. I left the lab shortly thereafter and didn't come back for about a month. I was accessible via email and chat, but I don't recall much communication from them. 18 | 19 | By the time I got back to the lab in July, they had solved the problem. They hadn't implemented everything yet, but the ideas were all there. Edgar created the [Cyclops Tensor Framework](https://solomon2.web.engr.illinois.edu/ctf/) (CTF) and Devin wrote what would become [AQUARIUS](https://github.com/devinamatthews/aquarius). Devin wrote CCSD and CCSDT using CTF, and along the way created a prototype of a Cholesky-decomposition based SCF code. The CCSD and CCSDT codes based on CTF were faster than the NWChem TCE, which was at the time the highest performance massively parallel implementation of those methods. Furthermore, CTF was based entirely on simple MPI primitives that were highly optimized on every supercomputer, which ensured portable parallel performance without any special effort, which has never been true of NWChem because of its reliance on one-sided communication. 20 | 21 | Once CTF was published, it became the new standard for other researchers to beat. Some have improved on CTF by improving the design, but the core idea remains: tensor contractions, even ones involving highly symmetric tensors, can and should be done using communication-optimal matrix algorithms combined with collective tensor transposes.
Furthermore, CTF showed that no code generation is required and that all of the operations of coupled-cluster theory, even higher-order methods like CCSDTQ, can be expressed in simple notation that translates efficiently to a small number of back-end functions. 22 | 23 | # Details 24 | 25 | If you want to know more about this project, please read the following: 26 | * [An Overview of Cyclops Tensor Framework](https://solomonik.cs.illinois.edu/talks/molssi-monterey-may-2017.pdf) - Edgar's overview slides. 27 | * [A preliminary analysis of Cyclops Tensor Framework](https://www2.eecs.berkeley.edu/Pubs/TechRpts/2012/EECS-2012-29.html) - The first paper. 28 | * [Cyclops Tensor Framework: reducing communication and eliminating load imbalance in massively parallel contractions 29 | ](https://www2.eecs.berkeley.edu/Pubs/TechRpts/2012/EECS-2012-210.html) - The second paper. 30 | * [A massively parallel tensor contraction framework for coupled-cluster computations](https://www2.eecs.berkeley.edu/Pubs/TechRpts/2014/EECS-2014-143.html) - The third paper. 31 | 32 | [CTF on GitHub](https://github.com/cyclops-community/ctf) has references to additional applications and publications. 33 | 34 | (c) Copyright Jeff Hammond, 2020. No reuse permitted except by permission from the author. 35 | -------------------------------------------------------------------------------- /Turing_Pi_v1.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | I got a Turing Pi v1 ([docs](https://docs.turingpi.com/)) for fun. 4 | 5 | # Acquisition 6 | 7 | The Turing Pi system that you buy is just the board. 8 | You need to buy compute modules and the power supply elsewhere. 9 | I bought six Compute Module 3+ 8GB from [PiShop.us](https://www.pishop.us/product/raspberry-pi-compute-module-3-8gb/) 10 | and the recommended LEDMO power supply from [Amazon](https://www.amazon.com/gp/product/B01461MOGQ/). 
11 | 12 | The total cost of this setup is just shy of $400, not including any additional storage you might need. 13 | Obviously, it helps to have an HDMI cable, monitor, USB keyboard and mouse, and a USB to MicroUSB cable, 14 | but most people who would buy a Turing Pi have those laying around. 15 | 16 | # Setup 17 | 18 | I watched the YouTube videos linked on the Turing Pi website to get an idea of what to do. 19 | The documentation isn't perfect but I managed to do it on the first attempt, and I'm not very good at this sort of thing. 20 | 21 | I downloaded the Raspberry Pi 3 64-bit image of [Ubuntu](https://ubuntu.com/download/raspberry-pi) 22 | because I use 64-bit Ubuntu 20 almost everywhere else. 23 | The Turing Pi people recommend some other distro. 24 | If you want to follow their documentation exactly, use that instead. 25 | 26 | Following the directions on their website, install `usbboot` on some other Linux (or Windows, but who uses that?) system. 27 | You'll use this to boot the compute modules for flashing the OS image onto the eMMC storage. 28 | You will need to do this for every module, which is a bit tedious, but presumably doesn't happen very often. 29 | 30 | First, move the jumper next to the MicroUSB port to `flash` instead of `boot`. Connect the machine where you downloaded the Pi image to the Turing Pi board via USB-to-MicroUSB cable. 31 | 32 | For each compute module, do this: 33 | 34 | * Run `sudo ./rpiboot` in a terminal to mount the eMMC storage. 35 | * Write the image to it. This takes about 10 minutes. 36 | 37 | I used Ubuntu's `Disks` GUI app but one can of course use `dd` instead. 38 | 39 | [This](https://synyx.de/blog/turing-pi-the-ultimate-cluster-board-for-raspis/) appears to be another good description 40 | of the setup process, with more automation. You might find that more useful than mine. 41 | 42 | # Getting Started 43 | 44 | I was dumb and initially only imaged one compute module. 
45 | What I learned from using the first image is that the Pi 3+ can barely run Gnome, 46 | e.g. I saw multiple hangs and even when I went back to the pure terminal, the graphics was sluggish. 47 | This isn't too surprising, but given that I have a pretty good time with Ubuntu 20 and Gnome 48 | on my Raspberry Pi 4, 49 | I subsequently uninstalled Gnome and will only use these things via SSH from a more powerful system. 50 | 51 | After imaging all of the modules, I see all of the node IP addresses in my router and with `nmap`. 52 | I used `pdsh` to make Apt changes symmetrically. 53 | 54 | # Running HPC Workloads 55 | 56 | ## MPI 57 | 58 | Open-MPI is much more reliable at launching processes on the Turing Pi. 59 | The following just works. 60 | ``` 61 | $ mpicc.openmpi -g -Os hello.c -o hello.x && pdsh -R exec -w turing[1-5] ssh -l ubuntu %h scp turing0:/tmp/hello.x /tmp/hello.x && /usr/bin/mpirun.openmpi --host turing0:4,turing1:4,turing2:4,turing3:4,turing4:4,turing5:4 /tmp/hello.x 62 | ``` 63 | 64 | The following does not work. 65 | ``` 66 | $ mpicc.mpich -g -Os hello.c -o hello.x && pdsh -R exec -w turing[1-5] ssh -l ubuntu %h scp turing0:/tmp/hello.x /tmp/hello.x && /usr/bin/mpirun.mpich --host turing0:4,turing1:4,turing2:4,turing3:4,turing4:4,turing5:4 /tmp/hello.x 67 | ``` 68 | 69 | Eventually, I had to change `~/.ssh/config` to use the key by default and just use the raw IP addresses. 70 | I don't know for sure, but it seems like a DNS issue (https://isitdns.com/). 
71 | ``` 72 | $ mpicc.mpich -g -Os hello.c -o hello.x && pdsh -R exec -w turing[1-5] ssh -l ubuntu %h scp turing0:/tmp/hello.x /tmp/hello.x && /usr/bin/mpirun.mpich -launcher ssh --host 192.168.1.23:4,192.168.1.24:4,192.168.1.25:4,192.168.1.26:4,192.168.1.27:4,192.168.1.28:4 /tmp/hello.x 73 | ``` 74 | 75 | ### MPI `hello.c` (in case you need it) 76 | ```c 77 | #include 78 | #include 79 | 80 | int main(int argc, char** argv) 81 | { 82 | MPI_Init(&argc, &argv); 83 | 84 | int np; 85 | MPI_Comm_size(MPI_COMM_WORLD, &np); 86 | 87 | int me; 88 | MPI_Comm_rank(MPI_COMM_WORLD, &me); 89 | 90 | int name_len; //unused 91 | char name[MPI_MAX_PROCESSOR_NAME]; 92 | MPI_Get_processor_name(name, &name_len); 93 | printf("Hello from processor %s, rank %d out of %d processors\n", name, me, np); 94 | 95 | MPI_Finalize(); 96 | 97 | return 0; 98 | } 99 | ``` 100 | 101 | ## NWChem 102 | 103 | ### Install 104 | 105 | NWChem is part of the Debian/Ubuntu package manager: 106 | ``` 107 | apt install nwchem 108 | ``` 109 | While the binary is not built optimally, it's good enough to start. 110 | Building NWChem is not trivial either in human time or execution time of the build -- using a pre-built binary saves a lot of time, although this is eventually offset by increased execution time of the application itself. 111 | 112 | ### Launch 113 | 114 | After doing all the necessary SSH things, including passwordless SSH keys and `~/.ssh/knownhosts`, one can use Open-MPI 4.0.3 (from Apt) to run jobs. Test your MPI installing by running `hostname`. 115 | 116 | ``` 117 | $ /usr/bin/mpirun.openmpi --host turing0:4,turing1:4,turing2:4,turing3:4,turing4:4,turing5:4 /usr/bin/nwchem w9_b3lyp_6-31G_energy.nw 118 | ``` 119 | 120 | ## More NWChem 121 | 122 | This is WIP. I am still debugging the NWChem build... 
123 | ``` 124 | $ /usr/bin/mpirun.mpich -launcher ssh --host 192.168.1.23:4,192.168.1.24:4,192.168.1.25:4,192.168.1.26:4,192.168.1.27:4,192.168.1.28:4 /tmp/nwchem w9_b3lyp_6-31G_energy.nw 125 | ``` 126 | -------------------------------------------------------------------------------- /Why_Finland.md: -------------------------------------------------------------------------------- 1 | # Why Finland? 2 | 3 | This is a question I get asked quite a bit, and it is overdue for me to write about it. 4 | This will be a long post, and probably have quite a few iterations before it is finished. 5 | 6 | I will answer the questions of why I decided to leave the USA and why I did not choose 7 | to live in other countries in separate posts, so if you are looking for that here, 8 | you will be disappointed. 9 | 10 | # Finland is a great place for children 11 | 12 | The juvenile education system is highly regarded. 13 | I will not try to restate what is well-covered 14 | [elsewhere](https://www.weforum.org/agenda/2018/09/10-reasons-why-finlands-education-system-is-the-best-in-the-world). 15 | 16 | Finns do not pay for education in Finland. Ever. 17 | [Only students from outside the EU, EEA and Switzerland pay tuition](https://www.study.eu/article/tuition-fees-in-finland). 18 | This is one of many reasons why it is reasonable to pay higher taxes. 19 | Rather than save up hundreds of thousands of dollars in order to give it to a private university 20 | with a multi-billion dollar endowment in the USA, or a nominally "public" institution 21 | with slightly lower fees (between $20K and $40K a year at present), I can pay my taxes, 22 | apply for citizenship, and send my kids to top universities for less than the cost of a bus pass. 23 | 24 | Even those who pay tuition pay far less than they would in the USA. 
25 | The maximum tuition rate in Finland is Helsinki University, at 26 | €18K/year, which is not too much more than a state resident pays for 27 | [UMass](https://www.massachusetts.edu/education/tuition-fees) 28 | and about half of what an out-of-state domestic student pays at 29 | [UCLA](https://admission.ucla.edu/tuition-aid/tuition-fees). 30 | 31 | # Finland does not treat people like garbage 32 | 33 | Finland also considers homelessness unacceptable and has policies to eliminate it 34 | (https://www.theguardian.com/cities/2019/jun/03/its-a-miracle-helsinkis-radical-solution-to-homelessness). 35 | This is in stark contrast to the USA, which has a 36 | [homeless population](https://en.wikipedia.org/wiki/Homelessness_in_the_United_States) 37 | on par with Helsinki or Portland, Oregon. 38 | 39 | Police in Finland [rarely draw their guns](https://yle.fi/uutiset/osasto/news/police_in_finland_shoot_their_firearms_in_the_line_of_duty_about_ten_times_each_year/10183865) 40 | and frequently [kill no one for an entire year](https://en.wikipedia.org/wiki/List_of_killings_by_law_enforcement_officers_by_country#2010s). 41 | 42 | The Finnish prison system actually attempts to [rehabilitate](https://yle.fi/uutiset/osasto/news/open_prisons_in_finland_are_like_a_holiday_camp__but_they_seem_to_work/11214953) people. 43 | 44 | 45 | -------------------------------------------------------------------------------- /Winning_by_laziness.md: -------------------------------------------------------------------------------- 1 | # Winning via Laziness 2 | 3 | This post describes my life philosophy of never doing things myself that 4 | can be done better by others. 5 | 6 | # Background on Tensors 7 | 8 | As a quantum chemist focused on coupled-cluster theory in graduate school, 9 | I found myself looking at a lot of tensor contractions. 10 | For the uninitiated, tensor contractions are a class of linear algebra operation 11 | that look like matrix multiplication, but with more indices. 
12 | Here is an example of a tensor contraction:
13 | ```
14 | for all i,j,k,a,b,c:
15 | R(i,j,a,b) = T(i,k,a,c) * V(j,k,b,c)
16 | ```
17 | This example might be found in CCSD (coupled-cluster singles and doubles).
18 | 
19 | This can be contrasted with the simpler but similar case of matrix-matrix multiplication:
20 | ```
21 | for all i,j,k:
22 | R(i,j) = T(i,k) * V(j,k)
23 | ```
24 | The well-known libraries for the latter are called the BLAS
25 | (Basic Linear Algebra Subroutines)
26 | and the specific procedure is `_GEMM`, where `_` is a letter
27 | associated with the numerical type used.
28 | 
29 | In the BLAS `_GEMM`, there are 4 possible index permutations
30 | supported, two for each input matrix, depending on whether one
31 | contracts over the row or column indices.
32 | The permutation on the output matrix can be handled implicitly
33 | by swapping the input matrices, because `(AB)^T=B^T A^T`.
34 | 
35 | In the first example, there are many more possibilities.
36 | One can access each tensor in 24 different ways, ranging from
37 | 1234 to 4321. There are thus 24x24x24=13824 distinct implementations
38 | of that single tensor contraction, which is one of many possibilities.
39 | 
40 | There is one obvious simplification possible here, which is to
41 | not try to optimize all the contractions directly, but to first
42 | rearrange the 4D tensors into cases that are handled directly
43 | by the BLAS.
44 | The simplest approach is to rearrange all 24 cases into a single
45 | canonical one, in which case, implementing all 13824 cases boils
46 | down to the application of 23 (24 minus the identity permutation)
47 | permutations to each of the 24 cases of 4D tensors,
48 | and one type of BLAS call.
49 | 
50 | However, as has been shown previously
51 | (cite dissertation and Paul's papers),
52 | tensor permutations are expensive, and may be the bottleneck if
53 | used excessively.
54 | It is therefore prudent to both optimize permutations and to call
55 | the least expensive ones.
56 | One way to reduce the need for expensive permutations is to observe
57 | that the BLAS can perform the canonical matrix transpose permutation
58 | internally, at negligible cost.
59 | Thus, one should be able to use only 11 permutations, and do the
60 | `(12)<->(34)` part of any permutation inside of the BLAS.
61 | There are additional reductions possible, by breaking up contractions
62 | into multiple BLAS calls, including matrix-vector products, not
63 | just matrix-matrix products.
64 | Edoardo Di Napoli and coworkers have studied this.
65 | 
66 | There is another way for one to optimize these operations,
67 | which is to convince other people to do it.
68 | 
69 | # Outsourcing hard problems
70 | 
71 | Many quantum chemists over the years have tried to be smart
72 | and solve hard computational problems with tensor contractions.
73 | I won't name all of them.
74 | I played around with optimizing tensor permutations,
75 | which led to a modest success that made it into my dissertation,
76 | but I knew that there were much better implementations possible,
77 | and, more importantly, that I did not know how to produce them.
78 | 
79 | Supercomputing 2007 (?) was in Austin, Texas, which is home
80 | to the research group of Robert van de Geijn.
81 | Robert's group knows a bit about dense linear algebra.
82 | I met Robert during Supercomputing, and asked him about tensors.
83 | He invited me to his office later in the week, and I spent a
84 | day with Robert and Maggie, talking about everything that was
85 | right and wrong about linear algebra software.
86 | We did not solve any tensor contraction problems that day.
87 | However, I did manage to convince Robert that I had mildly interesting
88 | computational problems to solve.
89 | 90 | I will not go into detail but the long-term result of that discussion, 91 | and many others that 92 | 93 | 94 | --------------------------------------------------------------------------------