├── CODE ├── Makefile ├── coarray.F90 ├── do-concurrent.F90 ├── mpi-abi │ ├── Makefile │ ├── check.F90 │ ├── check.c │ ├── handle.F90 │ └── handle.c ├── numerot.F90 ├── openmp.F90 └── sequential.F90 ├── Career_Trajectory.md ├── ChatGPT.md ├── Dealing_with_imperfect_Fortran_compilers.md ├── Dealing_with_imperfect_Fortran_compilers_2.md ├── Dell_Inspiron5000_Linux.md ├── Empathy.md ├── Finnish_Nonalcoholic_Beer.md ├── Finnish_is_cool.md ├── Finnish_language.md ├── Fortrans_Missing_Parallelism.md ├── GitHub_Universe.md ├── HPC_Numerical_Precision.md ├── In_Defense_Of_Fortran.md ├── Learning_Programming_Languages.md ├── MPI_Needs_ABI.md ├── MPI_Needs_ABI_Part_2.md ├── MPI_Needs_ABI_Part_3.md ├── MPI_Needs_ABI_Part_4.md ├── MPI_Needs_ABI_Part_5.md ├── MPI_Needs_ABI_Part_6.md ├── MPI_Needs_ABI_Part_7.md ├── MPI_RMA_HOWTO.md ├── Mentoring_Interns.md ├── README.md ├── Reading_list.md ├── Summer_of_2011.md ├── Turing_Pi_v1.md ├── Why_Finland.md └── Winning_by_laziness.md /CODE/Makefile: -------------------------------------------------------------------------------- 1 | FC = gfortran 2 | FFLAGS = -Wall -O2 -fopenmp -fcoarray=single 3 | 4 | all: sequential.x coarray.x do-concurrent.x openmp.x 5 | 6 | numerot.o: numerot.F90 7 | $(FC) $(FFLAGS) -c $< -o $@ 8 | 9 | %.x: %.F90 numerot.o 10 | $(FC) $(FFLAGS) numerot.o $< -o $@ 11 | 12 | clean: 13 | -rm -f *.x 14 | -rm -f *.o 15 | -rm -f *.mod 16 | -------------------------------------------------------------------------------- /CODE/coarray.F90: -------------------------------------------------------------------------------- 1 | program main 2 | use numerot 3 | implicit none 4 | real :: A(100), B(100), C(100) 5 | real :: R 6 | 7 | A = 1 8 | B = 1 9 | C = 1 10 | 11 | if (num_images().ne.3) STOP 12 | 13 | if (this_image().eq.1) R = yksi(A) 14 | if (this_image().eq.2) R = kaksi(A) 15 | if (this_image().eq.3) R = kolme(A) 16 | 17 | SYNC ALL() 18 | 19 | call co_sum(R) 20 | if (this_image().eq.1) print*,R 21 | end program main 22 | 
-------------------------------------------------------------------------------- /CODE/do-concurrent.F90: -------------------------------------------------------------------------------- 1 | program main 2 | use numerot 3 | implicit none 4 | real :: A(100), B(100), C(100) 5 | real :: RA, RB, RC 6 | integer :: k 7 | 8 | A = 1 9 | B = 1 10 | C = 1 11 | 12 | do concurrent (k=1:3) 13 | 14 | if (k.eq.1) RA = yksi(A) 15 | if (k.eq.2) RB = kaksi(B) 16 | if (k.eq.3) RC = kolme(C) 17 | 18 | end do 19 | 20 | print*,RA+RB+RC 21 | end program main 22 | -------------------------------------------------------------------------------- /CODE/mpi-abi/Makefile: -------------------------------------------------------------------------------- 1 | FC = gfortran 2 | FCFLAGS = -g -std=f2008 -fall-intrinsics -Wall -Wextra -Werror 3 | 4 | CC = gcc 5 | CFLAGS = -g -std=c11 -Wall -Wextra -Werror 6 | 7 | all: handle_f.x handle_c.x check_f.x check_c.x 8 | 9 | %_f.x: %.F90 10 | $(FC) $(FCFLAGS) $< -o $@ 11 | 12 | %_c.x: %.c 13 | $(CC) $(CFLAGS) $< -o $@ 14 | 15 | clean: 16 | -rm -f *.x *.o *.mod 17 | -rm -fr *.dSYM 18 | -------------------------------------------------------------------------------- /CODE/mpi-abi/check.F90: -------------------------------------------------------------------------------- 1 | module m 2 | use iso_c_binding, only: c_intptr_t 3 | type, bind(C) :: handle 4 | integer(kind=c_intptr_t) :: val 5 | end type handle 6 | 7 | end module m 8 | 9 | program p 10 | use m 11 | implicit none 12 | type(handle) :: h 13 | print*,'LOC: ',LOC(h),LOC(h%val) 14 | end program p 15 | -------------------------------------------------------------------------------- /CODE/mpi-abi/check.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | typedef struct { 5 | intptr_t val; 6 | } handle1; 7 | 8 | typedef struct { 9 | intptr_t val; 10 | } handle2; 11 | 12 | void foo(handle1 h1, handle2 * h2) 13 | { 14 | h2->val = h1.val; 15 | } 16 
| 17 | int main(void) 18 | { 19 | handle1 h1; 20 | handle2 h2; 21 | foo(h1,&h2); 22 | // THIS SHOULD FAIL 23 | foo(h2,&h1); 24 | return 0; 25 | } 26 | -------------------------------------------------------------------------------- /CODE/mpi-abi/handle.F90: -------------------------------------------------------------------------------- 1 | module m 2 | use iso_c_binding, only: c_intptr_t 3 | type, bind(C) :: handle 4 | integer(kind=c_intptr_t) :: val 5 | end type handle 6 | 7 | end module m 8 | 9 | program p 10 | use m 11 | implicit none 12 | type(handle) :: h 13 | print*,'LOC: ',LOC(h),LOC(h%val) 14 | end program p 15 | -------------------------------------------------------------------------------- /CODE/mpi-abi/handle.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | typedef struct { 5 | intptr_t val; 6 | } handle; 7 | 8 | int main(void) 9 | { 10 | handle h; 11 | printf("&h=%p &h[0]=%p\n",&h,&(h.val)); 12 | return 0; 13 | } 14 | -------------------------------------------------------------------------------- /CODE/numerot.F90: -------------------------------------------------------------------------------- 1 | module numerot 2 | contains 3 | pure real function yksi(X) 4 | implicit none 5 | real, intent(in) :: X(100) 6 | !real, intent(out) :: R 7 | yksi = norm2(X) 8 | end function yksi 9 | pure real function kaksi(X) 10 | implicit none 11 | real, intent(in) :: X(100) 12 | kaksi = 2*norm2(X) 13 | end function kaksi 14 | pure real function kolme(X) 15 | implicit none 16 | real, intent(in) :: X(100) 17 | kolme = 3*norm2(X) 18 | end function kolme 19 | end module numerot 20 | -------------------------------------------------------------------------------- /CODE/openmp.F90: -------------------------------------------------------------------------------- 1 | program main 2 | use numerot 3 | implicit none 4 | real :: A(100), B(100), C(100) 5 | real :: RA, RB, RC 6 | 7 | A = 1 8 | B = 1 9 | C = 1 10 | 
11 | !$omp parallel 12 | !$omp master 13 | 14 | !$omp task 15 | RA = yksi(A) 16 | !$omp end task 17 | 18 | !$omp task 19 | RB = kaksi(B) 20 | !$omp end task 21 | 22 | !$omp task 23 | RC = kolme(C) 24 | !$omp end task 25 | 26 | !$omp end master 27 | !$omp end parallel 28 | 29 | print*,RA+RB+RC 30 | end program main 31 | -------------------------------------------------------------------------------- /CODE/sequential.F90: -------------------------------------------------------------------------------- 1 | program main 2 | use numerot 3 | implicit none 4 | real :: A(100), B(100), C(100) 5 | real :: RA, RB, RC 6 | 7 | A = 1 8 | B = 1 9 | C = 1 10 | 11 | RA = yksi(A) 12 | RB = kaksi(B) 13 | RC = kolme(C) 14 | 15 | print*,RA+RB+RC 16 | end program main 17 | -------------------------------------------------------------------------------- /Career_Trajectory.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | I started out in organic chemistry and ended up making supercomputers at Intel and NVIDIA. 4 | 5 | Since people often ask me about how this happened, or to speak about career paths in general, I figure I should write it down. 6 | 7 | # The beginning 8 | 9 | I was fascinated by quantum mechanics from a young age, probably because it was elusive, not because I had a particular need for it. I remember reading books on heavy-element chemistry and string theory in high school, in those books written by popularizers that hide all the math. This was most useful in chemistry class, mostly because my boredom often led to mischief, and a chemistry lab is a very bad place to conduct mischief (my high school was old and had actual uranium in storage that nobody knew how to dispose of without shutting down the school). 10 | 11 | My career plan at that time was to become a lawyer, because I was under the foolish impression that this was a good way to get rich arguing with people. 
However, I did really well in chemistry and math classes, so I kept taking them, because they were often an easy way to get my grades up. However, it became clear in college that my math skills were more suited for a chemistry major than a math major, which aligned with my disinterest in not learning German, French and/or Russian (as was prescribed for PhDs in math). 12 | 13 | # University of Washington 14 | 15 | I became a chemistry major because of a University of Washington Professor named [Nic Epiotis](https://www.legacy.com/us/obituaries/seattletimes/name/nicholas-epiotis-obituary?pid=175420056). Epiotis was a character: he told off-topic stories during lectures and disparaged organic chemistry traditions like "actually reading the textbook" and "memorizing things". I recall one day he held up a 1000-page [organic chemistry textbook](https://www.wiley.com/en-us/Organic+Chemistry%2C+12th+Edition-p-9781118875766), pinched off about 20 pages and declared, "if you read and understand this section, you can get an A in this class - the rest of the book doesn't matter." As an inherently lazy person, this was my kind of class. I feel in love with the quantum mechanical aspects of organic chemistry, which explain, among other things, why carrots are orange (see e.g. [this page](https://www.maplesoft.com/support/help/Maple/view.aspx?path=QuantumChemistry/ParticleInBox-Dyes) to understand the details). 16 | 17 | Later in my undergraduate years, I was drawn into computational chemistry research thanks to Professors Wes Borden and Bill Reinhardt, along with Eric Brown, who was a rather charismatic T.A. and my research mentor in Wes' group. Eric filled me with all sorts of ideas, essentially all of which have turned out to be useful. Eric discouraged me from taking any classes in computer science because, "why do you want to learn Java?" and encouraged me to go deep into the theoretical and computational aspects of chemistry, which I did in grad school. 
At the time, I was proficient in running Gaussian, and dabbled in Mathematica and Matlab, but I couldn't have compiled a C or Fortran program if my life depended on it. I really wanted to undestand how Gaussian produced results and what sort of math was behind B3LYP and MP2, for example. 18 | 19 | # University of Chicago 20 | 21 | I went to the University of Chicago (UC) for grad school because (1) the quality of the faculty, (2) its reputation as an intensely academic place, and (3) the cost-of-living was a lot lower than at Berkeley, which matters when you're going to make $20K/year. The first half of grad school was devoted to intensely theoretical topics with rudimentary computation, primarily done in Matlab. I attended (sometimes for credit) a lot of classes outside the chemistry department to broaden my horizons, including my first proper computing courses, a course on random number generators, and a few weeks of intro to economics with Steve Levitt (the Freakonomics guy). UC was as erudite as I had been promised and I had a wonderful social life full of profoundly nerds and geeks. I also met my wife, who was a different type of nerd than I was. 22 | 23 | In my first year of grad school, I applied to fellowships, including the Department of Energy (DOE) Computational Science Graduate Fellowship (henceforth "CSGF"). Applying for this fellowship requires non-trivial essays about computational science, which prompted me to think a lot about big computers and software in ways that I hadn't before. Unfortunately, I did not get the fellowship the first time I applied. Fortunately, in those days, it was possible to apply as a second-year grad student, which I did successfully. At the time, I was only a Matlab programmer (and a bad one at that), but I had dreams of running semidefinite programming algorithms on Blue Gene/L, which I assume appealed to the selection committee more than my grades. 
24 | 25 | # DOE-CSGF and PNNL 26 | 27 | One of the novel features of the CSGF is that it requires the student to do a "practicum" (internship) at a DOE lab. I focused on Argonne National Lab (Argonne) and Pacific Northwest National Lab (PNNL), for both technical and geographic reasons. I ended up at PNNL working on NWChem because they didn't care that I had no relevant programming experience. As some of the details of my practicum are found in [Deixis](https://www.krellinst.org/doecsgf/docs/deixis/deixis2007.pdf), I won't repeat them here. 28 | 29 | Working at PNNL in the summer of 2006 changed my life and is the reason I've been able to have a career in computing (I elaborate on this [here](https://www.youtube.com/watch?v=2wTZz-6eaSQ&ab_channel=KrellInstitute)). Tim Carlson patiently taught me how to administer a Linux system, Jochen Autschbach and Bert de Jong got me started with Fortran (e.g. "start typing at column seven"), Dunyou Wang taught me about version control and regression testing, and Karol Kowalski taught me quantum many-body theory and putting everything together in NWChem. Karol became my research supervisor and mentor -- we've written about a dozen papers together by now, which was an abnormally good outcome for a summer internship. 30 | 31 | I worked on NWChem full-time for my last three years of grad school and my [dissertation](https://figshare.com/articles/thesis/Coupled_cluster_response_theory_parallel_algorithms_and_novel_applications/967859) is based on that work. During that time, I became a lot better at programming, although still primary in [old-school Fortran](https://stevelionel.com/drfortran/2020/05/16/doctor-fortran-in-military-strength/), and I learned about running jobs on supercomputers, first on MPP2, a 1000-node supercomputer made of Itanium2 processors, the Quadrics Elan2 interconnect, and properly provisioned local and global filesystems, and but also on Argonne Blue Gene systems. 
32 | 33 | One of the things that changed for me due to working on NWChem was how I prioritized research, (software) development, and support. Where once I believed that the scientific food chain had academics at the top, and everyone else was merely a failure who couldn't get a faculty position somewhere, I came to appreciate that national labs are at least as a good a place to do science, and that publishing papers is relatively low impact compared to building and supporting tools like NWChem that allow others to do science. It is not typical for grad students to do software support, but I found a great deal of satisfaction in this, and it forced me to learn things about NWChem that I never would have learned otherwise. I also started learning how to write documentation and practice empathy, which have proven to be far more useful in my professional life than solving quantum many-body equations in parallel. 34 | 35 | In the same way that I wanted to understand the guts of Gaussian and B3LYP as an undergrad, working on NWChem inspired to wonder about how low-level system programming and supercomputer interconnects worked. While I could write massively parallel code in NWChem, this had more to do with the structured approach to parallelism built into the application and the Global Arrays runtime system rather than any skill of mine. Robert Harrison quoted So Hirata, who created the Tensor Contraction Engine (TCE) on which my work was built, as saying, "my code is working in parallel and I don't know why." I had a similar experience as So, although I decided I cared more about "why?" questions in parallel computing than chemistry itself. 36 | 37 | # Argonne 38 | 39 | Thanks for CSGF, I had a number of connections at DOE labs to help find a job (postdoc). While Argonne had once declared me unsuitable for their purposes because I didn't know how to program when I was a prospective intern, they were my top choice for a postdoc for geographic reasons. 
I wanted to go to Argonne because they had a really good computer science division and I wanted to become proficient at both C and MPI. They were also standing up rather novel supercomputers in the Argonne Leadership Computing Facility (ALCF) and needed someone with computational chemistry skills who could make those codes run on the Blue Gene/P architecture. 40 | 41 | Shortly after the new computing building was finished, Vinod Tipparaju visited Argonne and I ended up in a conference room with him and Pavan Balaji. I knew Vinod from PNNL days and he was one of the lead developers of Global Arrays. Pavan was (and is) an MPI extraordinaire. I didn't know Pavan very well back then, so it was serendipitous that Vinod made this introduction. We spent an hour or two analyzing common patterns in scalable HPC applications, which set into motion a number of research projects that eventually led to [ARMCI-MPI](https://github.com/pmodels/armci-mpi), among other things, and was the beginning of a long and very fruitful collaboration between Pavan and me. Working with Pavan and the rest of the MPICH team turned me into a moderately functioning computer scientist, and is the primary reason I'm capable of doing anything useful for Intel. 42 | 43 | Because this blog post must be finite, I'm not going to describe all of the wonderful people at Argonne who taught me things, but one essential person in all of this was my postdoc supervisor, Ray Bair, who, like me, did a PhD in computational chemistry and wandered into HPC. In stark contrast to academic postdocs, Ray did not give me a project, but rather mentored me on how to collaborate with people across the lab and taught me how to be successful in both my ALCF support "day job" and my research "side job". Ironically, I have never co-authored a paper with Ray, but his support made possible some of my most cited papers with Larry Curtiss, Eugene DePrince and Pavan. 
44 | 45 | In the second half of my tenure at Argonne, I became involved in some of the more facility-oriented activities of ALCF, including the acceptance of the Blue Gene/Q system and the specification of DOE's pre-exascale systems as part of the [CORAL-1 program](https://science.osti.gov/-/media/ascr/ascac/pdf/meetings/20140331/CORAL_Update_for_ASCAC_March_31_2014_V31.pdf). This allowed me to be acquainted with a number of HPC vendors and the technical and nontechnical aspects of designing and buying a very expensive supercomputer years in advance of its existence. I learned about technical topics like "hardware-software co-design" and nontechnical topics like "using a statement-of-work to hold your vendor accountable." 46 | 47 | The other thing that happened to me in my last two years in DOE was a string of rejections to proposals I submitted, many of which were unrelated to merit but rather due to me violating unwritten political critera such as "we must fund the oldest white men first", "you can't do _that_ research at _that_ lab", and "it doesn't matter how good your proposal is if the program office wants to fund another branch of science." Furthermore, even when I was successful, I was kicked off of projects for being a chemist, even when my contributions were computer science. Finally, my efforts to secure an Early Career grant were impeded because, as someone at a DOE user facility rather than a research division, "I was not a problem in need of solving" (i.e. I was covered by block funding and the lab didn't _need_ me to get any grants). 48 | 49 | At the end of this long series of rejections from the program office, I concluded that they were never going to allow me to have a research career and that I might as well focus on the facility work I was doing, which included many things that I enjoyed. 
In hindsight, this was a blessing, because it freed me from any long-term obligations to DOE and allowed me and of the burden of thinking about the politically driven shenanigans associated with research funding. It also set the stage for my departure to industry, which never would have happened if I had been co-PI of an exascale co-design center or SciDAC project, as I had wanted so badly. 50 | 51 | # Intel 52 | 53 | In early 2014, Tim Mattson wrote me with an invitation to join his group at Intel Labs. We had a very long email conversation related to my requirements, which included (1) never using Windows, (2) open-sourcing as much code as possible, and (3) continuing to have no formal working hours or location. The most important factor in all of this was relocation to Portland, Oregon, which was one of the very few places on earth that would motivate me to leave Argonne. There was a time when I thought I'd spend my entire career in DOE, but between the abuse of government by Ted Cruz and the total capitulation of Steve Chu to penny-wise, pound-foolish budget obsession, I was more than happy to take my chances with the private sector. 54 | 55 | When I interviewed with the Intel Parallel Computing Lab, they didn't provide much in terms of guidance about what I'd be doing. At one point, I asked whether they wanted me to focus on computational chemistry, parallel programming models, or something else. The answer was something along the lines of, "all of that sounds good to us." I've always been fortunate to have a great deal of freedom in what I do and how I do it, so this was appreciated, although it forced me to take a small leap of faith as to whether or not they meant it. Fortunately, they did mean it, and in my 6+ years at Intel, I have worked under and 80-20 rule, wherein I control about 80% of my time, and somebody else controls about 20% of my time. 56 | 57 | I'll write more about my time at Intel in another post. 
Once I do that, I also need to write about working at NVIDIA. 58 | 59 | (c) Copyright Jeff Hammond, 2023. No reuse permitted except by permission from the author. 60 | -------------------------------------------------------------------------------- /ChatGPT.md: -------------------------------------------------------------------------------- 1 | TODO: write a blog post about ChatGPT generates terrible code, so that people stop freaking out. 2 | 3 | ## Matrix Transpose 4 | 5 | Q: Please write a code that transposes a matrix of arbitrary size using CUDA Fortran 6 | 7 | A: Here is an example of a Fortran program that transposes a matrix of arbitrary size using CUDA: 8 | 9 | ```fortran 10 | program matrix_transpose 11 | use cudafor 12 | implicit none 13 | 14 | ! Declare variables 15 | integer, parameter :: m = 1000, n = 1000 16 | real, dimension(m,n) :: a, b 17 | integer :: i, j 18 | 19 | ! Initialize input matrix 20 | do i = 1, m 21 | do j = 1, n 22 | a(i,j) = i * j 23 | end do 24 | end do 25 | 26 | ! Transfer input matrix to device 27 | a = a !$cuf 28 | 29 | ! Launch kernel to transpose matrix on device 30 | call transpose_matrix(m, n, a, b) 31 | 32 | ! Transfer result back to host 33 | b = b !$cuf 34 | 35 | ! Print transposed matrix 36 | do i = 1, m 37 | do j = 1, n 38 | write(*,*) b(i,j) 39 | end do 40 | end do 41 | 42 | end program matrix_transpose 43 | 44 | ! Kernel to transpose matrix on device 45 | attributes(global) subroutine transpose_matrix(m, n, a, b) 46 | implicit none 47 | 48 | ! Declare variables 49 | integer, value :: m, n 50 | real, dimension(m,n) :: a, b 51 | integer :: i, j 52 | 53 | ! Get thread and block indices 54 | i = blockIdx%x 55 | j = threadIdx%x 56 | 57 | ! 
Compute transposed matrix element 58 | b(i,j) = a(j,i) 59 | 60 | end subroutine transpose_matrix 61 | ``` 62 | This program defines a matrix_transpose program that initializes an input matrix on the host, transfers it to the device, launches a kernel to transpose the matrix on the device, and then transfers the result back to the host to be printed to the screen. The transpose_matrix kernel uses the thread and block indices to compute the transposed matrix element at the corresponding position in the output matrix. The size of the matrix can be changed by modifying the m and n parameters. Note that this is just one possible way to implement matrix transposition using CUDA Fortran, and there are many other ways to achieve the same result. 63 | 64 | 65 | ## Matrix Multiplication with OpenMP 66 | 67 | Q: Please write a Fortran code that does matrix multiplication using OpenMP target offload 68 | A: Here is an example of a Fortran program that performs matrix multiplication using OpenMP target offload: 69 | 70 | ```fortran 71 | program matrix_multiply 72 | use omp_lib 73 | implicit none 74 | 75 | ! Declare variables 76 | integer, parameter :: m = 1000, n = 1000, p = 1000 77 | real, dimension(m,n) :: a, b, c 78 | integer :: i, j, k 79 | 80 | ! Initialize input matrices 81 | do i = 1, m 82 | do j = 1, n 83 | a(i,j) = i * j 84 | end do 85 | end do 86 | do i = 1, n 87 | do j = 1, p 88 | b(i,j) = i + j 89 | end do 90 | end do 91 | 92 | ! Perform matrix multiplication on device 93 | !$omp target map(tofrom: a, b, c) 94 | !$omp parallel do default(none) shared(m,n,p,a,b,c) private(i,j,k) 95 | do i = 1, m 96 | do j = 1, p 97 | c(i,j) = 0 98 | do k = 1, n 99 | c(i,j) = c(i,j) + a(i,k) * b(k,j) 100 | end do 101 | end do 102 | end do 103 | !$omp end parallel do 104 | 105 | ! 
Print result matrix 106 | do i = 1, m 107 | do j = 1, p 108 | write(*,*) c(i,j) 109 | end do 110 | end do 111 | 112 | end program matrix_multiply 113 | ``` 114 | This program defines a matrix_multiply program that initializes two input matrices on the host, then uses OpenMP target offload to perform matrix multiplication on the device. The omp parallel do construct is used to distribute the computation of the matrix multiplication across multiple threads on the device. The result matrix is then printed to the screen on the host. The size of the matrices can be changed by modifying the m, n, and p parameters. Note that this is just one possible way to implement matrix multiplication using OpenMP target offload, and there are many other ways to achieve the same result. 115 | 116 | ## Matrix Multiplication 2 117 | 118 | Q: Please write a matrix multiplication code in Fortran that uses cache and register blocking to achieve a reasonable performance 119 | A: Here is an example of a Fortran program that performs matrix multiplication using cache and register blocking to optimize performance: 120 | 121 | ```fortran 122 | program matrix_multiply 123 | implicit none 124 | 125 | ! Declare variables 126 | integer, parameter :: m = 1000, n = 1000, p = 1000 127 | real, dimension(m,n) :: a, b, c 128 | integer :: i, j, k, ii, jj, kk, block_size 129 | real, dimension(:), allocatable :: a_block, b_block 130 | 131 | ! Set block size 132 | block_size = 32 133 | 134 | ! Allocate memory for block arrays 135 | allocate(a_block(block_size*block_size)) 136 | allocate(b_block(block_size*block_size)) 137 | 138 | ! Initialize input matrices 139 | do i = 1, m 140 | do j = 1, n 141 | a(i,j) = i * j 142 | end do 143 | end do 144 | do i = 1, n 145 | do j = 1, p 146 | b(i,j) = i + j 147 | end do 148 | end do 149 | 150 | ! Perform matrix multiplication with blocking 151 | do ii = 1, m, block_size 152 | do jj = 1, p, block_size 153 | do kk = 1, n, block_size 154 | 155 | ! 
Copy blocks of matrix A and B to registers 156 | do i = 1, block_size 157 | do j = 1, block_size 158 | a_block(i + (j-1)*block_size) = a(ii+i-1, kk+j-1) 159 | b_block(i + (j-1)*block_size) = b(kk+i-1, jj+j-1) 160 | end do 161 | end do 162 | 163 | ! Perform matrix multiplication on block 164 | do i = 1, block_size 165 | do j = 1, block_size 166 | c(ii+i-1, jj+j-1) = 0 167 | do k = 1, block_size 168 | c(ii+i-1, jj+j-1) = c(ii+i-1, jj+j-1) + a_block(i + (k-1)*block_size) * b_block(k + (j-1)*block_size) 169 | end do 170 | end do 171 | end do 172 | 173 | end do 174 | end do 175 | end do 176 | 177 | ! Print result matrix 178 | do i = 1, m 179 | do j = 1, p 180 | write(*,*) c(i,j) 181 | end do 182 | end do 183 | 184 | end program matrix_multiply 185 | ``` 186 | This program defines a matrix_multiply program that initializes two input matrices on the host, then uses cache and register blocking to perform matrix multiplication in an optimized way. The algorithm divides the matrices into blocks of a specified size and performs the multiplication on each block, storing the blocks in registers to reduce access to main memory. This can improve performance by reducing the number of memory accesses and increasing the amount of data that can be kept in cache. The size of the matrices and the block size can be changed by modifying the corresponding parameters. Note that this is just one possible way to implement matrix multiplication using cache and register blocking, and there are many other ways to achieve the same result. 187 | -------------------------------------------------------------------------------- /Dealing_with_imperfect_Fortran_compilers.md: -------------------------------------------------------------------------------- 1 | # Dealing with Imperfect Fortran Compilers, Part 1 2 | 3 | TL;DR You can build your Fortran application with two different compilers to get 4 | the best of both worlds when it comes to coarrays and GPU parallelism, for example. 
5 | This is not the easiest thing to do, but it's better than the bucket of tears 6 | you're living with right now. 7 | 8 | This post describes the motivation. [Part 2](https://github.com/jeffhammond/blog/blob/main/Dealing_with_imperfect_Fortran_compilers_2.md) has the technical details. 9 | 10 | ## The Current State of Fortran Compilers 11 | 12 | Most programmers are familiar with imperfect software. 13 | There are few, if any, nontrivial bug-free programs. 14 | However, users of Fortran are likely more familiar with this topic than the average programmer. 15 | It was not too long ago that there was no free compiler that 16 | supported the latest Fortran standard, or even a decent fraction of it. 17 | For example, when I started programming in Fortran in 2006, 18 | the free compilers that existed were g77 and g95, neither 19 | of which resembled a real Fortran 2003 compiler. 20 | Obviously, better Fortran compilers have always been available, 21 | but not necessarily at the right price. 22 | For example, Cray Fortran is an excellent compiler, and the compiler 23 | is free with the purchase of a Cray system, but the minimum 24 | purchasable unit of Cray hardware has historically been a 25 | rack, which costs around a million dollars. 26 | Reportedly, TITECH bought a single Cray XK7 system 27 | so they could use Cray's Fortran OpenACC compiler on 28 | one of the TSUBAME systems. 29 | 30 | Regardless of the frustrating history of Fortran compilers, 31 | the situation in 2022 is a lot better. 32 | GCC Fortran covers a large portion of the latest Fortran standard (2018), 33 | Intel's Fortran compiler and NVIDIA's NVHPC (nee PGI) 34 | Fortran compilers are freely available (without purchase of hardware), 35 | and the LLVM Fortran effort has made significant progress. 
36 | The first-generation LLVM Fortran, Flang, was based on PGI's Fortran 37 | compiler and is the basis for Fortran products from AMD and ARM, 38 | although neither can claim to support the majority of Fortran 2008. 39 | The new LLVM Fortran project, F18, is expected to support all the latest 40 | standard features when it reaches production quality. 41 | 42 | One area where Fortran compiler support is quite poor is the distributed 43 | memory model known as coarrays. 44 | Cray has a great implementation of coarrays but it's tied to their 45 | high-performance networking hardware. 46 | Intel and GCC Fortran both support coarrays, but one of these 47 | compilers is very good at distributed-memory performance and 48 | the other is very good at shared-memory performance 49 | (which is which is left as an exercise for the reader) 50 | so HPC users at NCAR, for example, are forced to choose 51 | which half of the HPC performance spectrum matters to them. 52 | Right now, none of AMD, ARM, LLVM, or NVIDIA support coarrays 53 | at all, although one hopes that future progress in LLVM F18 54 | will include coarrays and percolate into vendor derivatives. 55 | 56 | Fortran programmers fall into a few different categories: 57 | 58 | 1. Luddites who haven't left the 20th century, 59 | don't know that the Fortran language has changed since the mid-1980s [1], 60 | and whose code compiles everywhere this side of a punchcard reader. 61 | 2. Pragmatists, who write to the widely supported common subset of Fortran language support, 62 | which can be approximated by Fortran 2003, and definitely does not include coarrays. 63 | These folks also don't care about coarrays because MPI is better and 64 | has been universally available since before the `gfortran` project began. 65 | 3. Purists, who insist that, if WG5 can imagine it, then it should be usable, 66 | at least within a few years of the ISO ink drying. 
67 | Such users either have very good business relationships with a compiler vendor or are willing to compromise 68 | on at least one of performance and portability. 69 | 70 | In a CPU-only HPC universe, particularly one dominated by x86, most programmers 71 | could live relatively comfortably within one of these categories. 72 | However, we haven't lived in a CPU-only HPC universe since at least 2012, 73 | when ORNL's Titan Cray XK7 ushered in the beginning of the GPU era of HPC. 74 | Furthermore, x86 domination in HPC steadily eroded as numerous ARM-based 75 | alternatives have emerged, ranging from the exotic Fujitsu A64fx processor 76 | to awesome-yet-boring cloud-oriented offerings from Ampere and AWS. 77 | 78 | Now, our purists are far less happy than before and likely have at least one of the following grievances: 79 | 80 | 1. My system does not support coarrays properly or at all. 81 | 2. My system does not support OpenACC, CUDA Fortran, or StdPar (i.e. `DO CONCURRENT` on GPUs). 82 | 3. My system does not support a feature-complete OpenMP 5 GPU compiler. 83 | 4. My system does not support Fortran 2018 features not related to parallelism. 84 | 5. The only good Fortran compiler on my system is not mainstream and not supported by important HPC software. 85 | 6. The only good Fortran compiler I can use has no well-defined support model. 86 | 87 | Even the pragmatists are starting to get impatient and would like more of the post-2003 features 88 | than are universally available. 89 | 90 | Fortunately, there is a solution to these problems, but it requires a bit of software gymnastics. 91 | On the other hand, if you can actually use most of the features in Fortran 2003+, you're more than 92 | smart enough to deal with the back handspring I'm going to describe next. 
93 | 94 | ## A Brief Digression about Application-Binary Interfaces (ABIs) 95 | 96 | None of the aforementioned problems exist in the C world because 97 | - with a small number of reasonable caveats - 98 | C compilers are interoperable, and there's no issue mixing objects 99 | from GCC, Clang and a vendor C compiler based on EDG. 100 | This is because C supports ABI stability on a given platform 101 | and very few users want to mix C standard libraries, which 102 | is the one thing one cannot do. 103 | Similarly, in C++, one can mix GCC and Clang or an EDG-based vendor compiler 104 | as long as they use the same STL. 105 | 106 | Unfortunately, Fortran offers nothing in the way of ABI stability. 107 | Each Fortran compiler can have its own convention for passing 108 | `CHARACTER*(*)` strings and arrays, with the latter often including 109 | a non-standard descriptor format, which may not be well-documented. 110 | Finally, I/O statements and all the intrinsics are based on 111 | a compiler-specific runtime library, which is tightly bound 112 | to the aforementioned calling conventions. 113 | 114 | However, starting in Fortran 2003, there has been standardized 115 | interoperability between Fortran and C, and this feature set 116 | became almost magical in Fortran 2018, with the introduction 117 | of `CFI_cdesc_t` and other features. 118 | 119 | Extended C-Fortran interoperability (CFI) in Fortran 2018 is the magic 120 | that is going to allow us to break free from the limitations 121 | of a single imperfect compiler, to realize the features provided 122 | by TWO imperfect compilers, so long as there are clean boundaries 123 | between the Fortran code called by each. 124 | 125 | ## The Luddites Might be Right 126 | 127 | Amusingly, the luddites who stopped reading already have been able 128 | to rely on the almost-ABI stability of legacy Fortran 129 | (which is erroneously called "Fortran 77" by many, 130 | but is more accurately Fortran ~85). 
131 | Because `integer A(*)` behaves like C99's `int a[restrict]`, 132 | and the infrequent use of proper strings in Fortran, 133 | Fortran libraries like the BLAS and LAPACK are mostly compiler-agnostic. 134 | For example, one does not need to handle the string length in 135 | `DGEMM` because exactly 1 character is read per argument. 136 | There is an issue with complex number return values, 137 | but I'm going to ignore that one. Not today, Satan. 138 | 139 | So basically, if you are willing to write Fortran with REO Speedwagon 140 | playing in the background, and your idea of a complicated datatype 141 | is `DOUBLE PRECISION A(LDA,*)` then you don't need to know what 142 | comes next. 143 | 144 | ## Getting to the Point 145 | 146 | You've waited long enough. 147 | The trick we are going to use to make all of our Fortran dreams come true 148 | is to split Fortran applications into pieces that can be compiled with 149 | different Fortran compilers, and to connect them using CFI features. 150 | 151 | Pictorially, this can be described as follows [2]: 152 | 153 | ![72lgcj](https://user-images.githubusercontent.com/406118/204720665-04588b4e-36d1-40d5-bf61-32e9928bc94a.jpg) 154 | 155 | The overall effect of this is as if we turn 1 application into 156 | 1 application and N libraries, where the libraries have C linkage. 157 | All of this is possible in a strictly standard-compliant way 158 | as of Fortran 2018, although it is a bit tricky to implement, 159 | and there are some limitations. 160 | 161 | One of the more obvious applications of this technique is to build 162 | a program that uses coarrays for distributed-memory parallelism 163 | and `DO CONCURRENT` with GPU parallelism. 164 | Currently, this is not possible on any interesting platform, 165 | since the NVIDIA HPC Fortran compiler does not support coarrays 166 | and neither GCC or Cray has GPU support for `DO CONCURRENT` 167 | right now. 
Intel Fortran supports coarrays and recently released 168 | GPU support for `DO CONCURRENT`, but Intel has not shipped a 169 | proper HPC GPU, hence the "interesting platform" caveat. 170 | 171 | A less exciting application of this method is to build libraries 172 | that are implemented using modern Fortran that are usable from 173 | any language that supports C calling conventions / linkage. 174 | For example, one could rewrite the BLAS and LAPACK without 175 | disrupting user experience for those accustomed to the legacy 176 | Fortran APIs, or even ship an implementation of CBLAS that 177 | contains zero C code. 178 | 179 | The technical details of this are described in a [follow-up post](https://github.com/jeffhammond/blog/blob/main/Dealing_with_imperfect_Fortran_compilers_2.md). 180 | 181 | ## References 182 | 183 | 1. [Doctor Fortran in "Military Strength"](https://stevelionel.com/drfortran/2020/05/16/doctor-fortran-in-military-strength/) 184 | 2. [Aquarium Leaking Slap Meme Generator](https://imgflip.com/memegenerator/194158970/Aquarium-Leaking-Slap) 185 | 3. Intel recently released a compiler that has `DO CONCURRENT` GPU support but it has a few issues that need to be addressed. 186 | 187 | ## Disclaimer and license 188 | 189 | The opinions expressed in this post are exclusively the author's 190 | and not those of his current and past co-workers, co-authors, friends or family members. 191 | 192 | (c) Copyright Jeff Hammond, 2022. No reuse permitted except by permission from the author. 193 | -------------------------------------------------------------------------------- /Dealing_with_imperfect_Fortran_compilers_2.md: -------------------------------------------------------------------------------- 1 | # Dealing with Imperfect Fortran Compilers, Part 2 2 | 3 | TL;DR You can build your Fortran application with two different compilers to get 4 | the best of both worlds when it comes to coarrays and GPU parallelism, for example. 
5 | This is not the easiest thing to do, but it's better than the bucket of tears 6 | you're living with right now. 7 | 8 | ## The Goal 9 | 10 | What we want to build here is a bridge between two Fortran compilers. 11 | The most general bridge is the standard C-Fortran interoperability (CFI) 12 | feature set found in Fortran 2018, but we will also describe 13 | a bridge that uses the implementation-specific array descriptor 14 | of the NVIDIA (nee PGI) Fortran compiler, because that was the original 15 | motivation for this project. 16 | 17 | ## Fortran Type-Checking 18 | 19 | Fortran is a strongly typed language, although legacy Fortran 20 | compilers had no good way to enforce this, so users regularly 21 | abused the fact that Fortran compilers (almost?) always pass 22 | arguments by reference. 23 | This has allowed functions like `MPI_Bcast` to work for any 24 | Fortran buffer input, since the underlying implementation only 25 | cares about the number of bytes that need to fly around the 26 | machine. 27 | Starting in Fortran 90, compilers used modules and the 28 | interfaces contained therein to check types, which was 29 | a problem for MPI [4] although it was mitigated with the 30 | use of non-standard directives 31 | (usually containing `IGNORE_TKR`, which means "ignore Type, Kind and Rank"). 32 | 33 | In part because of MPI, Fortran added a way to do type-agnostic 34 | arguments, which is sort of like C's `void *`, but it 35 | imposes more rules of what users can do with it. 36 | 37 | ## How CFI Works 38 | 39 | In a Fortran program, one can declare a dummy argument that is 40 | assumed-type (`TYPE(*)`) and assumed-rank (`DIMENSION(..)`). 41 | Within a Fortran program, one can decode such an argument 42 | using `SELECT TYPE` and `SELECT RANK`. 43 | You can look up how those work but they are not important here. 
44 | When such arguments are used in the context of CFI, 45 | the C function sees a special argument of the type 46 | `CFI_cdesc_t` that contains all of the information required 47 | to reconstitute the Fortran array details. 48 | 49 | For example, if I pass a 1D array of double precision elements, 50 | which technically should use `real(c_double)` but I can cheat 51 | if I'm sure that a C `double` and a Fortran `double precision` 52 | are the same thing, 53 | my C code will use the following members of `CFI_cdesc_t`: 54 | 55 | ```c 56 | void * base_addr = 57 | size_t elem_len = sizeof(double) = 8 58 | CFI_rank_t rank = 1 59 | CFI_type_t type = CFI_type_double 60 | CFI_dim_t dim[1] = { .. } 61 | ``` 62 | The last listed memory, `dim`, contains the size information 63 | for each array dimension. 64 | For contiguous array arguments, it's easy to understand this, 65 | while for non-contiguous array arguments, one has to be a bit 66 | more careful. 67 | 68 | Please see Ref. [1,2,3] for details. 69 | This blog post is not meant to be a complete tutorial on CFI. 70 | 71 | ## Duct Tape, Part 1 72 | 73 | Unfortunately, CFI is only a standard API, and the ABI is not specified. 74 | Implementations are permitted to use different integer types 75 | for the various members, e.g. `CFI_rank_t`, and can choose 76 | their own order of the struct members, with the exception of 77 | `base_addr`, `elem_len`, and `version`, which must come first. 78 | This means that one has to compile C code for each Fortran 79 | compiler using the correct `ISO_Fortran_binding.h` header file. 80 | 81 | There is a simple but annoying solution to this. 82 | If I define my own implementation of `CFI_cdesc_t` 83 | (with a different name, of course) 84 | then I can convert from one compiler's ABI to anothers as follows. 85 | Note that the code below was written directly into this 86 | blog and has never been compiled or tested. 
87 | Implementing a correctly functioning version of this 88 | is left as an exercise to the reader (for now). 89 | 90 | ```c 91 | #include 92 | 93 | // the symbols need to be disambiguity 94 | #define MANGLE(x) FORTRAN_COMPILER_NAME ## _ ## x 95 | 96 | // DT = Duct Tape 97 | // use int64_t under the assumption that it is big enough for any implementation ABI. 98 | // this assumption should be verified in production code. 99 | typedef DT_cdesc_t { 100 | void * base_addr; 101 | size_t elem_len; 102 | int64_t rank; 103 | int64_t type; 104 | ... 105 | } 106 | 107 | // define a DT_type_t enum etc. 108 | 109 | // convert a CFI descriptor to a DT one 110 | void MANGLE(CFI_to_DT) 111 | (const CFI_cdesc_t * i, DT_cdesc_t * o) 112 | { 113 | o->base_addr = i->base_addr; 114 | o->elem_len = i->elem_len; 115 | o->rank = i->rank; 116 | 117 | // because the values of CFI_type_t can be different between implementations, 118 | // we need to have our set of DT_type_t values and translate them in both directions. 119 | o->type = translate_type_t_CFI_to_DT(i->type); 120 | 121 | // more stuff... 122 | } 123 | 124 | // convert a DFT descriptor to a CFI one 125 | void MANGLE(DT_to_CFI) 126 | (const DT_cdesc_t * i, CFI_cdesc_t * o) 127 | { 128 | o->base_addr = i->base_addr; 129 | o->elem_len = i->elem_len; 130 | o->rank = i->rank; 131 | 132 | // because the values of CFI_type_t can be different between implementations, 133 | // we need to have our set of DT_type_t values and translate them in both directions. 134 | o->type = translate_type_t_DT_to_CFI(i->type); 135 | 136 | // more stuff... 137 | } 138 | ``` 139 | 140 | This file needs to be compiled for every Fortran compiler. 141 | 142 | Then I can write another C function that uses e.g. 143 | `Intel_Fortran_CFI_to_DT` and `Cray_Fortran_DT_to_CFI` 144 | to send an array from an Intel Fortran subroutine 145 | to a Cray Fortran subroutine. 
146 | 147 | ## Duct Tape, Part 2 148 | 149 | Unfortunately, not all Fortran compilers support Fortran 2018 CFI right now. 150 | Fortunately, it's actually easier to implement the equivalent of the above 151 | duct tape in this case, because there is no ABI conflict between CFI 152 | descriptors and non-standard ones. 153 | 154 | Now I'm going to describe how to call NVIDIA Fortran from another Fortran 155 | compiler that supports Fortran 2018, particularly both CFI and coarrays. 156 | The motivation is that there are people out there who want to use 157 | GPU `DO CONCURRENT` support in the NVIDIA Fortran compiler along with 158 | existing coarray applications. 159 | 160 | The following code is taken from https://github.com/jeffhammond/Cthulhu, 161 | which has been compiled and works correctly in limited testing. 162 | We use the [Parallel Research Kernels](https://github.com/ParRes/Kernels) 163 | implementations of `nstream` (like STREAM triad) to illustrate how this works. 164 | The goal is to take [nstream-coarray.F90](https://github.com/ParRes/Kernels/blob/default/FORTRAN/nstream-coarray.F90) 165 | and allow the `do concurrent` part to use the NVIDIA Fortran compiler 166 | with GPU support enabled. 167 | The relevant portion of the code is shown below. 168 | 169 | ```fortran 170 | ... 171 | real(kind=REAL64), allocatable :: A(:)[:] 172 | real(kind=REAL64), allocatable :: B(:)[:] 173 | real(kind=REAL64), allocatable :: C(:)[:] 174 | real(kind=REAL64) :: scalar 175 | ... 176 | do concurrent (i=1:length) 177 | A(i) = A(i) + B(i) + scalar * C(i) 178 | enddo 179 | ... 180 | ``` 181 | 182 | The first step is to outline the `do concurrent` part 183 | and make it into a subroutine call. 184 | ```fortran 185 | !do concurrent (i=1:length) 186 | ! 
A(i) = A(i) + B(i) + scalar * C(i) 187 | !enddo 188 | call nstream_colon_trampoline(length,scalar,A,B,C) 189 | ``` 190 | _Aside: I use `colon` in the name to refer to `(:)`, in contrast 191 | to other dummy argument syntax, `(N)` or `(*)`, that I tried, 192 | not because of any connection to the gastrointestinal system._ 193 | 194 | The subroutine has the following interface defined in a module. 195 | ```fortran 196 | interface 197 | subroutine nstream_colon_trampoline(length,scalar,A,B,C) bind(C) 198 | use, intrinsic :: iso_fortran_env 199 | integer(kind=INT64), value :: length 200 | real(kind=REAL64), value :: scalar 201 | real(kind=REAL64), dimension(:) :: A,B,C 202 | end subroutine nstream_colon_trampoline 203 | end interface 204 | ``` 205 | Having the interface is important if the F90 array descriptor is sufficiently 206 | incompatible with a CFI descriptor so as to not work. 207 | It is possible that some Fortran compilers need to see the 208 | `type(*), dimension(..)` to generate a proper `CFI_cdesc_t`. 209 | 210 | The above is implemented in C and looks like this: 211 | ```c 212 | void nstream_colon_trampoline(int64_t length, double scalar, 213 | CFI_cdesc_t * dA, CFI_cdesc_t * dB, CFI_cdesc_t * dC) 214 | { 215 | double * restrict A = dA->base_addr; 216 | double * restrict B = dB->base_addr; 217 | double * restrict C = dC->base_addr; 218 | F90_Desc_la pA={0}, pB={0}, pC={0}; 219 | cfi_to_pgi_desc(dA,&pA); 220 | cfi_to_pgi_desc(dB,&pB); 221 | cfi_to_pgi_desc(dC,&pC); 222 | nstream_colon(length, scalar, A, B, C, &pA, &pB, &pC); 223 | } 224 | ``` 225 | Here we see the descriptor conversion from CFI to the PGI->NVIDIA descriptor (`F90_Desc_la`), 226 | which is defined in `nvhpc_cuda_runtime.h` that ships with the 227 | [NVHPC SDK](https://developer.nvidia.com/nvidia-hpc-sdk-downloads). 228 | We copied the relevant parts, which can be seen in 229 | [pgif90.h](https://github.com/jeffhammond/Cthulhu/blob/main/pgif90.h). 
230 | 231 | The descriptor conversion is done in `cfi_to_pgi_desc`, which can be found in 232 | [trampoline.h](https://github.com/jeffhammond/Cthulhu/blob/main/trampoline.h). 233 | A real implementation of the type id conversion alluded to above is shown there as well. 234 | 235 | The C code calls `nstream_colon`, which is a Fortran subroutine compiled with 236 | the NVIDIA compiler, shown below. 237 | ```fortran 238 | subroutine nstream_colon(length,scalar,A,B,C) bind(C) 239 | use, intrinsic :: iso_fortran_env 240 | integer(kind=INT64), value :: length 241 | real(kind=REAL64), value :: scalar 242 | real(kind=REAL64), dimension(:) :: A,B,C 243 | integer(kind=INT64) :: i 244 | do concurrent (i=1:length) 245 | A(i) = A(i) + B(i) + scalar * C(i) 246 | enddo 247 | end subroutine nstream_colon 248 | ``` 249 | 250 | To glue the C code to the NVIDIA Fortran code, 251 | I needed to know that NVIDIA Fortran passes the buffer address in the expected place 252 | and appends the array descriptors at the end, similar to how Fortran strings are passed. 253 | 254 | ## Summary 255 | 256 | What we just did was write a Fortran program that calls CFI-compatible interface 257 | (using Fortran 2018 features) 258 | to a C function that converts one Fortran compiler array descriptor to another 259 | Fortran compiler array descriptor that calls a C-compatible Fortran subroutine 260 | (using only Fortran 2003 features). 261 | 262 | Neither of the Fortran compilers know each other exist as each is talking to C code. 263 | The Fortran coarray code is calling a C function, with a known-compatible interface. 264 | The Fortran `do concurrent` code is called by a C function that passes it the necessary metadata. 265 | All of this is within the scope of why CFI was designed, 266 | although it's not clear if WG5 (the Fortran standards committee) foresaw the 267 | perverse use case show here. 
268 | (Most likely, WG5 imagined that one would never need to do this because all 269 | Fortran compilers are perfect implementations of the standard. 😉) 270 | 271 | The real hero is, of course, the all-powerful C language, which can communicate with 272 | any other programming languages thanks to its lack of support for the type of 273 | expressive language features that would get in the way. 274 | However, since you are here because you love Fortran enough to try to use the union 275 | of features found in two different compilers, the other hero here is CFI, 276 | which allows Fortran to masquerade as C at the binary object level, thereby 277 | enabling Fortran programmers to create libraries as if they were written in C, 278 | and thus can be called from any other language, including Fortran. 279 | 280 | ## References 281 | 282 | 1. Michael Metcalf, John Reid, Malcolm Cohen. [Modern Fortran Explained: Incorporating Fortran 2018](https://academic.oup.com/book/26799) 283 | 2. Intel Fortran: [C Structures, Typedefs, and Macros for Interoperability](https://www.intel.com/content/www/us/en/develop/documentation/fortran-compiler-oneapi-dev-guide-and-reference/top/compiler-reference/mixed-language-programming/standard-tools-for-interoperability/c-structures-typedefs-macros-for-interoperability.html#c-structures-typedefs-macros-for-interoperability) 284 | 3. GCC Fortran: [Interoperability with C](https://gcc.gnu.org/onlinedocs/gfortran/Interoperability-with-C.html) 285 | 4. C. E. Rasmussen, J. M. Squyres. [A Case for New MPI Fortran Bindings](https://www.open-mpi.org/papers/euro-pvmmpi-2005-fortran/euro-pvm-mpi-2005-fortran.pdf). 286 | 287 | ## Disclaimer and license 288 | 289 | The opinions expressed in this post are exclusively the author's 290 | and not those of his current and past co-workers, co-authors, friends or family members. 291 | 292 | (c) Copyright Jeff Hammond, 2022. No reuse permitted except by permission from the author. 
293 | -------------------------------------------------------------------------------- /Dell_Inspiron5000_Linux.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | This post describes my experience getting Linux installed on a [Dell Inspiron 5000 laptop](https://www.dell.com/en-us/member/shop/dell-laptops/new-inspiron-14-5000-laptop/spd/inspiron-14-5402-laptop/nn5402ejobh) with the latest (as of 2020) Intel processor, known as Tiger Lake. 4 | 5 | **TL;DR** Turn of BitLocker, install Ubuntu 20.10 (not 20.04!), and repartition the drive to dual-boot Linux alongside Windows 10. Everything works nicely. 6 | 7 | # Disclaimer 8 | 9 | I was not compensated by anyone to write this, but I work for Intel and thus you should not treat this as an objective, third-party review. In any case, my focus here will be explaining what I did to install Linux and how things are working so far, not to compare it to any competitive products. My primary basis for comparison will older Dell laptops and other computers on which I run Linux. 10 | 11 | # What's in the box? 12 | 13 | The box contains a laptop, the power cable and the trivial paperwork. That's it. There was minimal plastic (sleeves around each item) and the cardboard appears to be recycled. 14 | 15 | Here are some hardware details if you care: 16 | ```sh 17 | $ sudo lshw 18 | tigerlake 19 | description: Notebook 20 | product: Inspiron 5402 (0A01) 21 | vendor: Dell Inc. 22 | serial: * 23 | width: 64 bits 24 | capabilities: smbios-3.2.0 dmi-3.2.0 smp vsyscall32 25 | configuration: boot=normal chassis=notebook family=Inspiron sku=0A01 26 | *-core 27 | description: Motherboard 28 | product: 0MF3C8 29 | vendor: Dell Inc. 30 | physical id: 0 31 | version: A00 32 | serial: * 33 | *-firmware 34 | description: BIOS 35 | vendor: Dell Inc. 
36 | physical id: 0 37 | version: 1.1.5 38 | date: 09/22/2020 39 | size: 1MiB 40 | capacity: 32MiB 41 | capabilities: pci pnp upgrade shadowing cdboot bootselect edd int5printscreen int9keyboard int14serial int17printer acpi usb smartbattery biosbootspecification netboot uefi 42 | *-cpu 43 | description: CPU 44 | product: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz 45 | vendor: Intel Corp. 46 | physical id: 400 47 | bus info: cpu@0 48 | version: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz 49 | slot: CPU 1 50 | size: 1274MHz 51 | capacity: 4700MHz 52 | width: 64 bits 53 | clock: 100MHz 54 | capabilities: lm fpu fpu_exception wp vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp x86-64 constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l2 invpcid_single cdp_l2 ssbd ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb intel_pt avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves split_lock_detect dtherm ida arat pln pts hwp hwp_notify hwp_act_window hwp_epp hwp_pkg_req avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid movdiri movdir64b fsrm avx512_vp2intersect md_clear flush_l1d arch_capabilities cpufreq 55 | configuration: cores=4 enabledcores=4 threads=8 56 | ``` 57 | 58 | # Booting for the first time 59 | 60 | When you power on the system, it enters into the Windows installer. I do not like Windows and the installer insisted that I provide personal information to proceed. 
You can find more about this elsewhere, so I won't elaborate. 61 | 62 | # Installing WSL 63 | 64 | Before I went all-in with Linux, I decided to see how WSL works. It's pretty nice, although switching from regular Windows to the Insider Preview updates meant that I had to install the Preview version of Windows Terminal, which seems like an unnecessary user inconvenience. 65 | 66 | The biggest issue with WSL (both 1 and 2) is that I cannot get GPU compute support. This is an area of active interest for many, including Microsoft, all the major GPU vendors, and countless users. I expect it will be fixed some time in 2021, but this is just an uninformed guess. 67 | 68 | The other thing I disliked about Windows and WSL is how many times I had to reboot the computer. On Linux, the only time I reboot is when I update the OS kernel itself. Nothing else requires a reboot. On Windows, you have to reboot to change just about anything. I'm sure somebody thinks there is a good reason for this, but it's annoying and one of many reasons why I cannot take Windows seriously. 69 | 70 | # Installing Linux 71 | 72 | My first few attempts to boot to a USB drive and run Linux live were unsuccessful and I made my computer very angry. The repeated reboots, including a few forced reboots, triggered system checks and something that looked like a rescue process. Fortunately, I did not render my machine unusable in the process. 73 | 74 | Eventually, I figured out how to boot from a USB drive. I didn't capture the details properly but you should plug in the USB drive, go into the BIOS settings, and place the USB boot drive above the others. 75 | 76 | The first time I tried to run Linux, I was using the Ubuntu 20.04 ISO on a USB. This image does not know about WiFi6, which means that I couldn't do anything with it, because the laptop doesn't have an Ethernet jack and I do not have a USB-to-Ethernet dongle. 
77 | 78 | It appears that ArchLinux has the latest kernel in an ISO but I'm less familiar with Arch so I tried Ubuntu 20.10 instead. Fortunately, Ubuntu 20.10 has the updates requires to recognize the wireless chip (and all the other hardware I'm aware of). 79 | 80 | When I booted into Ubuntu 20.10 in live mode, I could see that all the hardware was recognized, including the wireless and the MicroSD port. The touchpad and screen worked perfectly. At this point, I decided to make the Linux install persistent. 81 | 82 | Because I wasn't completely sure that Linux can do all the BIOS/firmware updates directly, I decided to dual-boot. I don't expect to need more than ~250 GB for each OS image, so this won't be an issue even if I never use Windows 10 again. 83 | 84 | Ubuntu is really smart and understands the Windows 10 filesystem, and can reorganize the drive to add a Linux (ext4) partition. However, it can only do this if you turn off BitLocker. I suppose I can turn on BitLocker again now that Linux is installed, but as this laptop is unlikely to leave my office for a long time, I'm not that worried about physical security. 85 | 86 | After booting into Windows to disable Bitlocker (takes about 5 minutes or less), I proceeded with the Ubuntu installer, which was extremely fast (less than 5 minutes). I opted for the minimal configuration, but added the third-party proprietary drivers in case any of the hardware needs that. 87 | 88 | # Running Linux 89 | 90 | After the necessary reboot, I saw the Ubuntu boot menu, which defaults to Ubuntu, but also gives me the option to go into the Windows boot flow. I didn't bother to test the Windows boot again because I don't really care if it works, but if something doesn't work, I'll add a note about that here. 91 | 92 | Because I am a luddite, I installed all the things I wanted with Apt in Terminal. 
The apps I install everywhere include the Chrome browser, the latest versions of GCC and LLVM, and associated development tools like Git, Vim, Valgrind and CMake (yes, I hate CMake but it's a necessary evil). 93 | 94 | # Comparison to past experiences 95 | 96 | I've been a Dell Linux laptop user for many years, since 2005 or so. In the past, I would install OpenSUSE because it seemed to have the best driver support and wouldn't even think about the fact that Windows was there, although I recall the horrors of ndiswrapper in cases where my wireless chip didn't permit native Linux drivers. Things are different now, mostly in good ways, although BIOS security features and SecureBoot mean that nuking Windows requires slightly more work. As noted already, I am keeping Windows 10 around in a dual-boot configuration just in case I need it to update the firmware or something like that. 97 | 98 | # Installing oneAPI 99 | 100 | Because the motivation for getting this machine was to evaluate the developer experience of oneAPI on a Tiger Lake laptop, I installed oneAPI and other Intel GPU software. Fortunately, both support Apt and the process is trivial. Since the details are documented elsewhere, I will merely link to those: 101 | 102 | * https://dgpu-docs.intel.com/installation-guides/ubuntu/ubuntu-focal.html 103 | * https://software.intel.com/content/www/us/en/develop/articles/installing-intel-oneapi-toolkits-via-apt.html 104 | 105 | Aside: the above works just fine in WSL as well, but the GPU isn't exposed because the driver support isn't there yet. 106 | 107 | # Performance 108 | 109 | ## Practical 110 | 111 | The other computers on my desk right now are a Macbook Pro circa 2018-2019 and a Hades Canyon NUC. The Tiger Lake laptop feels noticeably faster than the NUC even though the NUC has a higher power envelope and frequency ceiling. I'm not sure whether this is real or not, and if it's real, how it correlates with the processor, memory, or SSD capability. 
112 | 113 | I can't really compare the Mac laptop because it is burdened with corporate IT bloatware like Microsoft Outlook that I unfortunately have to run all the time. I've mostly given up on building software natively on MacOS because Apple refuses to let me program the GPU in a sensible manner (their OpenCL is decent for a 1.x implementation, but that's not saying much). 114 | 115 | The other thing I like about this laptop is that it cost around $800. My Macbook Pro cost around $3000 and while it has twice the SSD and more memory (16 vs 12), it's not faster than the cheaper one, and certainly not anywhere near four times faster. I can get a similarly provisioned Mac laptop with the new M1 processor in it for *only* twice the cost of my Dell. While people on Twitter tell me that the M1 is seventy bazillion times better than every other processor ever made, I am very happy with the performance and the battery life on the Dell, particularly for the price. Also, my name is not Jon Masters and I do not love ARM processors *that* much. 116 | 117 | ## Raw 118 | 119 | `clpeak` is a nice way to measure the peak memory and compute capability of CPU and GPU devices using an equivalent methodology. 120 | 121 | I ran these tests with the Linux governor set for performance ([details](https://askubuntu.com/questions/604720/setting-to-high-performance)). 
122 | 123 | ```sh 124 | jrhammon@tigerlake:~/clpeak$ echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor 125 | performance 126 | ``` 127 | 128 | ```sh 129 | jrhammon@tigerlake:~/clpeak$ ./clpeak | tee clpeak.log 130 | 131 | Platform: Intel(R) OpenCL 132 | Device: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz 133 | Driver version : 2020.11.10.0.05_160000 (Linux x64) 134 | Compute units : 8 135 | Clock frequency : 2800 MHz 136 | 137 | Global memory bandwidth (GBPS) 138 | float : 32.09 139 | float2 : 31.57 140 | float4 : 34.14 141 | float8 : 28.76 142 | float16 : 21.88 143 | 144 | Single-precision compute (GFLOPS) 145 | float : 218.19 146 | float2 : 409.03 147 | float4 : 407.21 148 | float8 : 400.21 149 | float16 : 393.09 150 | 151 | No half precision support! Skipped 152 | 153 | Double-precision compute (GFLOPS) 154 | double : 211.05 155 | double2 : 203.61 156 | double4 : 201.58 157 | double8 : 198.51 158 | double16 : 86.28 159 | 160 | Integer compute (GIOPS) 161 | int : 75.64 162 | int2 : 134.78 163 | int4 : 172.68 164 | int8 : 88.42 165 | int16 : 86.30 166 | 167 | Integer compute Fast 24bit (GIOPS) 168 | int : 58.13 169 | int2 : 85.43 170 | int4 : 90.68 171 | int8 : 89.62 172 | int16 : 85.92 173 | 174 | Transfer bandwidth (GBPS) 175 | enqueueWriteBuffer : 14.70 176 | enqueueReadBuffer : 14.85 177 | enqueueWriteBuffer non-blocking : 14.68 178 | enqueueReadBuffer non-blocking : 14.82 179 | enqueueMapBuffer(for read) : 59322.75 180 | memcpy from mapped ptr : 14.69 181 | enqueueUnmap(after write) : 52377.65 182 | memcpy to mapped ptr : 14.51 183 | 184 | Kernel launch latency : 1.97 us 185 | 186 | Platform: Intel(R) OpenCL HD Graphics 187 | Device: Intel(R) Graphics Gen12LP [0x9a49] 188 | Driver version : 20.46.18421 (Linux x64) 189 | Compute units : 96 190 | Clock frequency : 1300 MHz 191 | 192 | Global memory bandwidth (GBPS) 193 | float : 32.51 194 | float2 : 24.16 195 | float4 : 31.49 196 | float8 : 32.43 197 | float16 : 40.02 198 | 
199 | Single-precision compute (GFLOPS) 200 | float : 1413.85 201 | float2 : 1410.00 202 | float4 : 860.81 203 | float8 : 899.68 204 | float16 : 753.69 205 | 206 | Half-precision compute (GFLOPS) 207 | half : 2327.97 208 | half2 : 2304.86 209 | half4 : 2329.34 210 | half8 : 1427.09 211 | half16 : 1612.55 212 | 213 | No double precision support! Skipped 214 | 215 | Integer compute (GIOPS) 216 | int : 329.21 217 | int2 : 238.15 218 | int4 : 225.79 219 | int8 : 308.19 220 | int16 : 260.67 221 | 222 | Integer compute Fast 24bit (GIOPS) 223 | int : 326.83 224 | int2 : 235.12 225 | int4 : 252.99 226 | int8 : 252.23 227 | int16 : 256.53 228 | 229 | Transfer bandwidth (GBPS) 230 | enqueueWriteBuffer : 12.41 231 | enqueueReadBuffer : 12.48 232 | enqueueWriteBuffer non-blocking : 10.37 233 | enqueueReadBuffer non-blocking : 10.18 234 | enqueueMapBuffer(for read) : 4294959.00 235 | memcpy from mapped ptr : 12.43 236 | enqueueUnmap(after write) : inf 237 | memcpy to mapped ptr : 12.43 238 | 239 | Kernel launch latency : 26.90 us 240 | 241 | ``` 242 | 243 | # Hardware details 244 | 245 | The wireless hardware that Ubuntu 20.04 doesn't support is: 246 | ``` 247 | $ sudo lshw -C network 248 | *-network 249 | description: Wireless interface 250 | product: Wi-Fi 6 AX201 251 | vendor: Intel Corporation 252 | physical id: 14.3 253 | logical name: wlp0s20f3 254 | version: 20 255 | serial: * 256 | width: 64 bits 257 | clock: 33MHz 258 | capabilities: pm msi pciexpress msix bus_master cap_list ethernet physical wireless 259 | configuration: broadcast=yes driver=iwlwifi driverversion=5.8.0-29-generic firmware=55.d9698065.0 ip=* latency=0 link=yes multicast=yes wireless=IEEE 802.11 260 | ``` 261 | 262 | (c) Copyright Jeff Hammond, 2020. No reuse permitted except by permission from the author. 
263 | -------------------------------------------------------------------------------- /Empathy.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | Empathy is an essential non-technical skill. If you lack it, you might be able to be successful by some metrics, but your impact will be limited and your colleagues (and friends) will constantly need to compensate for your inadequacy in this respect. Fortunately, empathy is something that can be taught and learned, and if it is determined that you are lacking in empathy, you should focus a large amount of your personal and professional development energies towards acquiring more of it. 4 | 5 | I will write more on this later. 6 | 7 | 8 | 9 | (c) Copyright Jeff Hammond, 2020. No reuse permitted except by permission from the author. 10 | -------------------------------------------------------------------------------- /Finnish_Nonalcoholic_Beer.md: -------------------------------------------------------------------------------- 1 | # Jeff rates Finnish non-alcoholic beer 2 | 3 | I do not consume ethanol, but I like beer. 4 | These are my notes on this topic, from the perspective of a resident of Finland. 5 | 6 | ### [KOFF Crisp Vaalea Lager](https://www.carlsberggroup.com/products/koff/koff-crisp-vaalea-lager) 7 | 8 | I like this one quite a bit. There is enough hops flavor and the flavor palatte seems well-balanced. 9 | The quality-control of this beer is imperfect - I can detect variation between batches. 10 | Some cans taste a bit metallic, while others are a bit skunkier than others. 11 | 12 | ### [More Brewing Lager](https://www.hartwall.fi/yritys/uutiset/2021/hartwallin-uusi-olutbrandi-more-brewing-on-alkoholittomana-syntynyt/) 13 | 14 | This one is pretty good, but not as good as Crisp. 15 | There is a lot of hops flavor in this one, but it seems like the beer is trying too hard to do this. 
16 | There are a lot of American alcoholic beers that try too hard to taste like hops, and if you like those beers, you will like this one. 17 | 18 | ### [Karjala](https://www.hartwall.fi/juomat/oluet/karjala/karjala-00/) 19 | 20 | I do not like this one at all. 21 | I do not understand what the creators were going for, but this is not a good beer. 22 | There is not even a bad American beer -- e.g. Milwaukee's Best Ice -- that I can remember being worse. 23 | 24 | ### [Tsingtao Alcohol Free Lager](https://www.alko.fi/en/products/708554/Tsingtao-Alcohol-Free-Lager/) 25 | 26 | This one was pretty good. 27 | I can't remember Tsingtao well enough to know if it is an authentic reproduction of the flavor, but it tastes like a real beer. 28 | 29 | ### [Kukko Pils](https://laitilan.com/en/tuote/kukko-pils-alcohol-free/) 30 | 31 | _Note: this one - like the others from Kukko - is 0.3%, not strictly 0.0%, so if a tiny amount of alcohol is a problem for you, avoid this one._ 32 | 33 | This one is okay. I don't like Pilsner that much, but I think it's a reasonable approximation to a real pilsner. 34 | -------------------------------------------------------------------------------- /Finnish_is_cool.md: -------------------------------------------------------------------------------- 1 | # Thoughts on the Finnish language 2 | 3 | I have been learning Finnish for approximately 600 days now. These are my thoughts on the language. 4 | I am somewhere in the vicinity of A1 proficiency, so my linguistic analysis should not be given much weight. 
5 | 6 | ## Notable differences versus English and other European languages 7 | 8 | The two most obvious differences with languages like English and German are 9 | the lack of articles and the lack of gender: 10 | * She is a Finn = Hän on Suomalainen 11 | * He is a Finn = Hän on Suomalainen 12 | 13 | This saves both words and time relative to say, German, which pointlessly assigns a gender to pencils and clocks: 14 | * the hotel, the pencil, the clock = das Hotel, der Bleistift, die Uhr 15 | * the hotel, the pencil, the clock = hotelli, kynä, kello. 16 | 17 | Not gendering third-person pronouns is both practical and inclusive. 18 | 19 | The distinction between "a" and "the" is positional: 20 | You have _the_ ticket = Lippu on sinulla 21 | I have _a_ ticket = Minulla on lippu 22 | 23 | ## Pronounciation 24 | 25 | Finnish is strictly phonetic. 26 | My friend Mikko says that a spelling bee in Finland would be a hearing test. 27 | The good news is that, if one can hear words correctly, one can write them. 28 | The bad news is that one has to listen very carefully, because every sound matters. 29 | One can turn a meeting into a murder by not making an "a" sound long enough ("tapaan" versus "tapan"). 30 | Fortunately, the crime rate in Finland is quite low and the likelihood of confusion here is unlikely. 31 | 32 | ## "no niin" 33 | 34 | This is the linguistic equivalent of "duct tape" in Finland. You can use it pretty much anywhere. 35 | Ismo explains this [here](https://youtu.be/9EWMlCusxjQ). 36 | The literal translation is "well so" but has more inflection-driven versatility than either 37 | "well" and "so" in English, although some of the usage patterns overlap. 38 | From what I've inferred, "no niin" is a standard way to answer the telephone with familiars. 
39 | 40 | -------------------------------------------------------------------------------- /Finnish_language.md: -------------------------------------------------------------------------------- 1 | # The Finnish language 2 | 3 | I've lived in Finland for approximately one year. 4 | These are my thoughts on learning Finnish, based on my experience so far. 5 | 6 | TL;DR You will not need to know any Finnish to survive in the Helsinki area, but it helps to know a few things. 7 | 8 | # Basics 9 | 10 | The two most important words in Finnish are kiitos (thank you) and anteeksi (excuse me). 11 | If you can infer what people are saying from context, 12 | you can get through a number of interactions with strangers using only "kiitos", 13 | for example, grocery shopping. 14 | 15 | When I want to work hard to communicate to a Finn that I want to speak English with them, 16 | I say "voimmeko puhua englantia?" (can we speak English?). If I am lazy, I just say, "sorry", 17 | and they switch to English most of the time. 18 | 19 | # Conversing with Finns 20 | 21 | The average Finnish person speaks English very well, although they will always 22 | underestimate their ability to do so (Finns are excessively modest). 23 | In most cases, unless it is an explicit goal to use Finnish, 24 | it is more effective to just switch to English. 25 | Switching back and forth between languages is difficult for most people, and will 26 | often confuse the listener more than speaking one language they only partially understand. 27 | 28 | If a Finn does not understand a word you use, it can be useful to replace it with a literal 29 | description of what it is, since many Finnish words are created this way 30 | ([examples](https://becomingfinnishsite.wordpress.com/2018/08/13/funny-finnish-words-with-literal-translations/)). 
31 | 32 | # Hearing and Pronouncing Words 33 | 34 | Finnish pronunciation is very straightforward - the language is perfectly phonetic - but 35 | it's obviously not identical to any other language. 36 | Pronunciation is more important in Finnish than in English, particularly since 37 | words that sound very similar can have very different meanings 38 | (e.g. [tapan versus tapaan](https://www.finnishpod101.com/blog/2020/12/11/common-finnish-mistakes/)). 39 | 40 | Unlike English, where one can be extremely casual about pronunciation and still be understood 41 | by most proficient speakers, one should try to pronounce words correctly in Finnish 42 | even if one does not understand the meaning. 43 | For example, one should learn to pronounce one's address properly, for the purposes of providing 44 | it to taxi drivers. 45 | 46 | Note that the sound of one letter and two letters is meaningful, for both vowels and consonants, 47 | as demonstrated above with tapan/tapaan. 48 | You also need to pronounce double-t and double-k properly, 49 | since these are part of two different syllables. 50 | For example, bank=pankki, which is pronounced pank-ki. 51 | All of the school materials for kids learning Finnish hyphenate the words so they learn the 52 | correct pronunciation. 53 | 54 | # Spoken versus Written Finnish 55 | 56 | The spoken language ("puhekieli") is usually a shorthand version of the 57 | formal language ("kirjakieli"). The former is geographically dependent and 58 | varies between, say, Helsinki and Pori. 59 | If you speak kirjakieli, everyone will know you are a novice speaker, but 60 | at least they will understand you. I don't see any reason to try to figure out 61 | puhekieli until you are at least a year into learning Finnish and live here. 62 | I have not devoted any time to it, but my family tells me things. 
63 | 64 | If you are going to learn puhekieli to listen to others, the only parts I've managed 65 | to learn so far are that mä=minä and sä=sinä (shorthand for first and second person 66 | singular pronouns) and that numbers get shortened from, say, yksi to yks, which 67 | is good because the written form of 99 is yhdeksänkymmentäyhdeksän, which makes for a 68 | rather tedious singing of "99 bottles of beer on the wall"... 69 | 70 | # Swearing and Rude Phrases 71 | 72 | Just say "perkele" (Devil). It's the universal swear word when things go wrong. 73 | The harder you roll your Rs, the more you mean it. 74 | 75 | There is no reason to learn how to say rude things to people in Finland, because 76 | everyone is helpful. If you want to be rude to someone, say 77 | "äitisi oli hamsteri ja isäsi haisi seljanmarjoilta." 78 | 79 | Most references to "saunan takana" (behind the sauna) are death threats, although 80 | it is unlikely the usage will be serious enough to warrant action by the listener. 81 | I'm only familiar with its use in jest or when Finns are being abusive to one another on the internet 82 | (since Facebook doesn't understand this colloquialism, it is not recognized as a Terms of Service violation). 83 | 84 | # DuoLingo 85 | 86 | Lots of people on the internet like to criticize DuoLingo. 87 | I have found it very useful, and it is certainly better than nothing. 88 | Unfortunately, the Finnish course is rather short relative to German, for example, 89 | and it does not cover critical topics like the days of the week or months of the year. 90 | On the other hand, it is good for practicing a bunch of basic concepts, such as 91 | ordering food at a restaurant and describing the elements of nature. 92 | 93 | I finished the DuoLingo Finnish course and can understand about a third of the words 94 | that I hear spoken in public, and be able to hear most of the others well enough that 95 | I can look them up in a dictionary. 
96 | 97 | Given that finding Finnish language education outside of Finland is quite difficult, 98 | if you are planning to come to Finland for a nontrivial period of time, I recommend that 99 | you devote some time to DuoLingo Finnish. 100 | 101 | # Google Translate 102 | 103 | Google Translate for Finnish is far from perfect. 104 | It knows very little about homonyms, and will falsely lead you to believe that 105 | you are eating varnish-flavored quark, because "lakka" means both varnish and cloudberry. 106 | You will also have to figure out from context whether something is a cucumber or a throat 107 | (both are "kurkku"). 108 | 109 | On the other hand, it can translate from images, which is incredibly useful when one 110 | receives a form letter or a parking ticket. I frequently use my phone to translate 111 | street signs (not while driving, of course) and other public postings. 112 | 113 | # Online Resources 114 | 115 | These are online resources I find useful: 116 | * https://uusikielemme.fi/ 117 | 118 | There are many other good resources, particularly from universities. 119 | If you are serious about learning Finnish, you'll search for these online and find them easily enough. 120 | -------------------------------------------------------------------------------- /Fortrans_Missing_Parallelism.md: -------------------------------------------------------------------------------- 1 | # Fortran's Missing Parallelism 2 | 3 | ## Summary 4 | 5 | Starting in Fortran 2008, Fortran supports two forms of parallelism: 6 | 1. `DO CONCURRENT`, which supports loop-level data parallelism. 7 | 2. coarrays, which is a form of [PGAS](https://en.wikipedia.org/wiki/Partitioned_global_address_space). 8 | 9 | This document will describe a third form of parallelism and argue that it should be supported by the Fortran language. 10 | The third form of parallelism is shared-memory task parallelism, which supports a range of use cases not easily covered by 1 and 2. 
11 | 12 | ## Background Reading 13 | 14 | The reader may wish to consult the following for additional context on this topic: 15 | * _Patterns for Parallel Programming_ by Timothy G. Mattson, Beverly Sanders and Berna Massingill 16 | * _Task Parallelism By Example_ from the Chapel Project ([Slides](https://chapel-lang.org/tutorials/SC14/SC14-4-Chapel-TaskPar.pdf)) 17 | * _OpenMP Tasking Explained_ by Ruud van der Pas ([Slides](https://openmp.org/wp-content/uploads/sc13.tasking.ruud.pdf)) 18 | * _OpenMP Tasking_ by Christian Terboven and Michael Klemm ([Slides](https://www.openmp.org/wp-content/uploads/sc15-openmp-CT-MK-tasking.pdf)) 19 | * _The Problem with Threads_ by Edward A. Lee ([Paper](https://www2.eecs.berkeley.edu/Pubs/TechRpts/2006/EECS-2006-1.pdf)) 20 | 21 | ## Motivating Example 22 | 23 | Consider the following Fortran program: 24 | ```fortran 25 | module numerot 26 | contains 27 | pure real function yksi(X) 28 | implicit none 29 | real, intent(in) :: X(100) 30 | !real, intent(out) :: R 31 | yksi = norm2(X) 32 | end function yksi 33 | pure real function kaksi(X) 34 | implicit none 35 | real, intent(in) :: X(100) 36 | kaksi = 2*norm2(X) 37 | end function kaksi 38 | pure real function kolme(X) 39 | implicit none 40 | real, intent(in) :: X(100) 41 | kolme = 3*norm2(X) 42 | end function kolme 43 | end module numerot 44 | ``` 45 | 46 | ```fortran 47 | program main 48 | use numerot 49 | implicit none 50 | real :: A(100), B(100), C(100) 51 | real :: RA, RB, RC 52 | 53 | A = 1 54 | B = 1 55 | C = 1 56 | 57 | RA = yksi(A) 58 | RB = kaksi(B) 59 | RC = kolme(C) 60 | 61 | print*,RA+RB+RC 62 | end program main 63 | ``` 64 | 65 | Assuming that `yksi`, `kaksi`, `kolme` share no state, then 66 | all three functions can execute concurrently. 67 | 68 | How would we implement this in Fortran 2018? 69 | 70 | One way is to use coarrays and assign each function to a different image. 
71 | ```fortran 72 | program main 73 | use numerot 74 | implicit none 75 | real :: A(100), B(100), C(100) 76 | real :: R 77 | 78 | A = 1 79 | B = 1 80 | C = 1 81 | 82 | if (num_images().ne.3) STOP 83 | 84 | if (this_image().eq.1) R = yksi(A) 85 | if (this_image().eq.2) R = kaksi(B) 86 | if (this_image().eq.3) R = kolme(C) 87 | 88 | SYNC ALL 89 | 90 | call co_sum(R) 91 | if (this_image().eq.1) print*,R 92 | end program main 93 | ``` 94 | While this works, this approach has many shortcomings. 95 | First, there is no way to share data directly between images - data must be explicitly copied using coarray operations. 96 | Second, images exist throughout the lifetime of the program (unless they fail) and thus the amount of parallelism 97 | is restricted to what is specified at runtime. 98 | Third, if there are many functions that can execute concurrently, 99 | many more than the number of images (which are likely to be processor cores or similar), 100 | then either the system will be oversubscribed or the user needs to implement scheduling by hand. 101 | Dynamic load-balancing is nontrivial and should not be delegated to application programmers in most cases. 102 | 103 | Another way to implement this program is to use `DO CONCURRENT`: 104 | ```fortran 105 | program main 106 | use numerot 107 | implicit none 108 | real :: A(100), B(100), C(100) 109 | real :: RA, RB, RC 110 | integer :: k 111 | 112 | A = 1 113 | B = 1 114 | C = 1 115 | 116 | do concurrent (k=1:3) 117 | 118 | if (k.eq.1) RA = yksi(A) 119 | if (k.eq.2) RB = kaksi(B) 120 | if (k.eq.3) RC = kolme(C) 121 | 122 | end do 123 | 124 | print*,RA+RB+RC 125 | end program main 126 | ``` 127 | This could work if the external functions are declared `PURE`, 128 | but `DO CONCURRENT` provides no means for dynamic load-balancing. 129 | The bigger problem is that Fortran implementations cannot agree on what form of parallelism 130 | `DO CONCURRENT` uses. Some implementations will use threads while others will use vector lanes. 
131 | The latter is going to be useless for most purposes. 132 | Finally, the above is ugly and tedious - no one wants to write code like that 133 | to execute independent tasks. 134 | 135 | ## The OpenMP/OpenACC Solution 136 | 137 | There is a proven solution for Fortran task parallelism in OpenMP (4.0 or later) or OpenACC: 138 | 139 | ```fortran 140 | program main 141 | use numerot 142 | implicit none 143 | real :: A(100), B(100), C(100) 144 | real :: RA, RB, RC 145 | 146 | A = 1 147 | B = 1 148 | C = 1 149 | 150 | !$omp parallel 151 | !$omp master 152 | 153 | !$omp task 154 | RA = yksi(A) 155 | !$omp end task 156 | 157 | !$omp task 158 | RB = kaksi(B) 159 | !$omp end task 160 | 161 | !$omp task 162 | RC = kolme(C) 163 | !$omp end task 164 | 165 | !$omp end master 166 | !$omp end parallel 167 | 168 | print*,RA+RB+RC 169 | end program main 170 | ``` 171 | 172 | ```fortran 173 | program main 174 | use numerot 175 | implicit none 176 | real :: A(100), B(100), C(100) 177 | real :: RA, RB, RC 178 | 179 | A = 1 180 | B = 1 181 | C = 1 182 | 183 | !$acc async 184 | RA = yksi(A) 185 | !$acc end async 186 | 187 | !$acc async 188 | RB = kaksi(B) 189 | !$acc end async 190 | 191 | !$acc async 192 | RC = kolme(C) 193 | !$acc end async 194 | 195 | !$acc async wait 196 | 197 | print*,RA+RB+RC 198 | end program main 199 | ``` 200 | 201 | These programs will execute regardless of the available hardware parallelism, including sequentially. 202 | OpenMP tasking is more powerful in some use cases than OpenACC, by allowing the user to create 203 | dependencies between tasks, which forces the runtime to do more work when scheduling. 204 | This feature - tasks with dependencies - is not proposed for Fortran. 205 | 206 | ## The Proposal for Fortran 207 | 208 | Because OpenMP independent tasks is implemented in essentially all of the Fortran 2008 compilers, 209 | it is reasonable to assume that the design is portable. 
210 | The goal here is to design a language feature for Fortran that is consistent with 211 | its existing semantics and syntax. 212 | 213 | We consider the `BLOCK` construct to be an appropriate starting point, because it 214 | defines a scope, and scoping data is an essential part of defining task parallelism. 215 | Because we need more than just data scoping, we use the keyword `task_block` to 216 | tell the implementation that execution concurrency is both permitted and desirable. 217 | ```fortran 218 | program main 219 | use numerot 220 | implicit none 221 | real :: A(100), B(100), C(100) 222 | real :: RA, RB, RC 223 | 224 | A = 1 225 | B = 1 226 | C = 1 227 | 228 | task_block 229 | RA = yksi(A) 230 | end task_block 231 | 232 | task_block 233 | RB = kaksi(B) 234 | end task_block 235 | 236 | task_block 237 | RC = kolme(C) 238 | end task_block 239 | 240 | task_sync all 241 | 242 | print*,RA+RB+RC 243 | end program main 244 | ``` 245 | 246 | ## Non-trivial data issues 247 | 248 | Obviously, very few programs can exploit concurrency where all data is strictly private. 249 | In `DO CONCURRENT`, locality specifiers are used to inform the implementation about 250 | whether data is shared, etc. 251 | (See [this](https://developer.nvidia.com/blog/accelerating-fortran-do-concurrent-with-gpus-and-the-nvidia-hpc-sdk/) 252 | or [this](https://software.intel.com/content/www/us/en/develop/documentation/fortran-compiler-oneapi-dev-guide-and-reference/top/language-reference/a-to-z-reference/c-to-d/do-concurrent.html) for details.) 253 | 254 | Below we modify our program as if each function used a private scratch buffer. 255 | This is not the best way to allocate X, since X could be defined inside of the 256 | `task_block` scope or inside of the external function, but this is just an illustration of the syntax. 257 | We also add T, which could be a read-only lookup table, for example. 
258 | ```fortran 259 | program main 260 | use numerot 261 | implicit none 262 | real :: A(100), B(100), C(100) 263 | real :: RA, RB, RC 264 | real :: X(10) 265 | real :: T(1000) 266 | 267 | A = 1 268 | B = 1 269 | C = 1 270 | 271 | task_block local(X) shared(T) 272 | RA = yksi(A,X) 273 | end task_block 274 | 275 | task_block local(X) shared(T) 276 | RB = kaksi(B,X) 277 | end task_block 278 | 279 | task_block local(X) shared(T) 280 | RC = kolme(C,X) 281 | end task_block 282 | 283 | task_sync all 284 | 285 | print*,RA+RB+RC 286 | end program main 287 | ``` 288 | 289 | Much like `DO CONCURRENT`, we should be able to write a fully explicit version using `default(none)`. 290 | ```fortran 291 | program main 292 | use numerot 293 | implicit none 294 | real :: A(100), B(100), C(100) 295 | real :: RA, RB, RC 296 | real :: X(10) 297 | real :: T(1000) 298 | 299 | A = 1 300 | B = 1 301 | C = 1 302 | 303 | task_block local_init(A) shared(RA) local(X) shared(T) 304 | RA = yksi(A,X) 305 | end task_block 306 | 307 | task_block local_init(B) shared(RB) local(X) shared(T) 308 | RB = kaksi(B,X) 309 | end task_block 310 | 311 | task_block local_init(C) shared(RC) shared(T) 312 | RC = kolme(C,X) 313 | end task_block 314 | 315 | task_sync all 316 | 317 | print*,RA+RB+RC 318 | end program main 319 | ``` 320 | It might make sense to have a new locality specifier, `local_final` but since there might have 321 | been a reason why that was not added for `DO CONCURRENT`, we use the `shared` specifier to the 322 | result of this function. 323 | 324 | ## Dependencies 325 | 326 | Many applications where task parallelism will be used have dependencies between tasks. 327 | For example, in our program, we can add a fourth function `nalja` that depends on 328 | `yksi` and `kaksi`. 
329 | ```fortran 330 | program main 331 | use iso_fortran_env, only : task_depend_kind 332 | use numerot 333 | implicit none 334 | real :: A(100), B(100), C(100) 335 | real :: RA, RB, RC, RD 336 | real :: X(10) 337 | real :: T(1000) 338 | type(task_depend_kind) :: DEP 339 | 340 | A = 1 341 | B = 1 342 | C = 1 343 | 344 | task_block depends_to(DEP) 345 | RA = yksi(A) 346 | end task_block 347 | 348 | task_block depends_to(DEP) 349 | RB = kaksi(B) 350 | end task_block 351 | 352 | task_block 353 | RC = kolme(C) 354 | end task_block 355 | 356 | task_block depend_from(DEP) 357 | RD = nalja(RA,RB) 358 | end task_block 359 | 360 | task_sync all 361 | 362 | print*,RC+RD 363 | end program main 364 | ``` 365 | This syntax may not be ideal but it expresses the concept. 366 | In OpenMP, dependencies are expressed in the form of memory locations. 367 | Because this might be harder to implement in some scenarios, 368 | we propose an explicit opaque type that the implementation can use. 369 | 370 | ## Known Shortcomings 371 | 372 | Fortran lacks a memory model in the way that Java, C11 and C++11 do. 373 | We do not take a position on whether that is a good or bad thing, but 374 | instead attempt to make the fewest changes required to address 375 | hazards of concurrent data access by tasks. 376 | 377 | One obvious solution for tasks is to reuse the coarray atomic operations, 378 | although this may not be acceptable to the committee. 379 | However, requiring that tasks use atomic operations to access data 380 | that may be modified by another task is a straightforward solution to these hazards. 381 | Unfortunately, the overhead of coarray atomics may be higher than acceptable 382 | for shared-memory uses, in which case a new syntax is required. 
383 | 384 | ## Acknowledgements 385 | 386 | Thanks to the following people, who read this proposal or related material and may have provided feedback: 387 | - Ondrej Certik 388 | - Jeff Larkin 389 | -------------------------------------------------------------------------------- /GitHub_Universe.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | This blog post accompanies my GitHub Universe video presentation on oneAPI (see [Embrace the accelerated, cross-architecture era - upstream and downstream](https://universe.intel.com/img/vid.mp4/) on Intel's [Universe home page](https://universe.intel.com/#OpenSource)) if you happen to come here first). I am going to continue to add stuff over the next week so if you find the current state incomplete, you might find it improves on its own. Alternatively, feel free to contact me to ask for the content you want to see. 4 | 5 | * email: it's on my GitHub [home page](https://jeffhammond.github.io/) 6 | * tweet: [science_dot](https://twitter.com/science_dot) 7 | * issue: create a GitHub issue against this repo to ask a question. 8 | 9 | # Direct links 10 | 11 | * [Data Parallel C++ Tutorial](https://github.com/jeffhammond/dpcpp-tutorial) 12 | * [Parallel Research Kernels](https://github.com/ParRes/Kernels) 13 | * [Stencil Demo](https://github.com/jeffhammond/stencil-demo) 14 | * [Intel DPC++ Compiler](https://github.com/intel/llvm/) 15 | * [oneAPI GitHub Project](https://github.com/oneapi-src/) 16 | * [oneAPI CI Examples](https://github.com/oneapi-src/oneapi-ci) 17 | * [Jeff's blog about getting oneAPI working on a Tiger Lake laptop]( 18 | https://github.com/jeffhammond/blog/blob/main/Dell_Inspiron5000_Linux.md) 19 | 20 | # Details 21 | 22 | ## Compiling DPC++ 23 | 24 | Download DPC++ from GitHub here: https://github.com/intel/llvm/. 
The most common way to download is likely the following: 25 | ```sh 26 | git clone https://github.com/intel/llvm.git dpcpp 27 | ``` 28 | 29 | ### Intel Processors 30 | 31 | You do not need to do this, but you are certainly free to compile DPC++ from source on Intel platforms. If you do not want to compile DPC++, you can just install via Linux package managers as described on [Installing Intel® oneAPI Toolkits via Linux* Package Managers](https://software.intel.com/content/www/us/en/develop/articles/oneapi-repo-instructions.html). 32 | 33 | The build for Intel processors is trivial: 34 | ```sh 35 | python ./buildbot/configure.py 36 | python ./buildbot/compile.py [-jN] 37 | ``` 38 | 39 | ### CUDA Processors 40 | 41 | The build of DPC++ for CUDA (PTX back-end) is straightforward. You should use CUDA 10.1, 11.0 or 11.1. I recall that 11.2 is not yet supported. Version 10.0 is not supported but mostly works (see below for additional comments). 42 | ```sh 43 | python ./buildbot/configure.py [--cuda] 44 | python ./buildbot/compile.py [-jN] 45 | ``` 46 | I have tested DPC++ for CUDA on P100, V100 and A100. It is possible to have problems due to various CUDA configuration issues on Linux. If you experience such issues, report them on the [DPC++ GitHub project](https://github.com/intel/llvm/). 47 | 48 | ### ARM Processors 49 | 50 | I ported DPC++ to ARM in September ([PR 2333](https://github.com/intel/llvm/pull/2333)) but unfortunately, there has been a regression in the build system that I have not yet been able to fix, so please use my branch [agx-works](https://github.com/jeffhammond/intel-llvm/tree/agx-works) for now. 51 | 52 | The ARM build is straightforward using the buildbot scripts: 53 | ```sh 54 | python ./buildbot/configure.py --arm [--cuda] 55 | python ./buildbot/compile.py [-j1] 56 | ``` 57 | 58 | If you build on an ARM+CUDA platform like Xavier AGX, you should add the `--cuda` option. 
Note that the current AGX distribution of CUDA is version 10.0, which is technically unsupported (10.1 is) and likely causes an issue with memory deallocation in some programs. I am optimistic that the upcoming refresh of the AGX software distribution will address this. 59 | 60 | If you are building a Raspberry Pi, you need to disable parallelism (`-j1`) because the memory on a Pi is insufficient to do parallel builds of LLVM. If you do not limit build parallelism, your Pi will almost become unresponsive and require power cycling. 61 | 62 | ## Tutorials and Demos 63 | 64 | TODO 65 | 66 | # Questions and Answers 67 | 68 | I'll add answers to any questions I receive. If you ask a question in a public forum, I'll cite that, otherwise I will not attribute your question unless you specifically request it. 69 | 70 | # License 71 | 72 | (c) Copyright Jeff Hammond, 2020. CC BY 4.0 license. See https://creativecommons.org/licenses/by/4.0/ for details. 73 | -------------------------------------------------------------------------------- /HPC_Numerical_Precision.md: -------------------------------------------------------------------------------- 1 | I should write this down once-and-for-all. 2 | 3 | [Double Precision Is Not Needed for Many-Body Calculations: New Conventional Wisdom](https://chemrxiv.org/engage/chemrxiv/article-details/60c73dd2567dfe7b4dec36dd) 4 | 5 | [Leveraging the bfloat16 Artificial Intelligence Datatype For Higher-Precision Computations](https://arxiv.org/abs/1904.06376) 6 | 7 | [Double-precision FPUs in High-Performance Computing: an Embarrassment of Riches?](https://arxiv.org/abs/1810.09330) 8 | -------------------------------------------------------------------------------- /In_Defense_Of_Fortran.md: -------------------------------------------------------------------------------- 1 | # In Defense of Fortran 2 | 3 | Fortran is a programming language that people love to hate. 
4 | Hating programming languages isn't rare, and most programming languages have haters, 5 | but it seems particularly popular for serious people in the software community to 6 | criticize Fortran, often in ways that are logically or factually flawed. 7 | 8 | The fact is, I don't love Fortran, certainly not like some people love Fortran. 9 | There are people who write everything in Fortran, and believe that it's the best 10 | programming language for just about everything. 11 | While I used Fortran during grad school and have worked on NWChem - 12 | a mostly pre-modern Fortran chemistry application - my entire career, 13 | I am more proficient in C and it is the default language I use when writing new code. 14 | I've also written a nontrivial amount of C++ and Python, and have spent enough 15 | time writing other languages to appreciate the diversity of design choices that exist. 16 | Thus, I think I'm qualified to write objectively about why Fortran is a good language, 17 | and certainly why some of the criticisms of it are bogus. 18 | 19 | One of the common fallacies in programming language evaluation is to argue that 20 | because one language does better at a subset of features, it is therefore an 21 | objectively better language overall. 22 | The second fallacy is to ignore the experience and goals of the programmer using 23 | the language. 24 | Most criticisms of Fortran rely heavily on at least one of these. 25 | 26 | Now let's look at some of the good and bad features of Fortran. 27 | 28 | ## Fortran doesn't have a standard library 29 | 30 | *Fortran aficionados will of course shout at their computer that I have 31 | ignored https://github.com/fortran-lang/stdlib. I have not. 32 | An open-source project isn't the same as an ISO standard, and WG5 has not 33 | ratified anything resembling the C++ STL. 
34 | Furthermore, that project is only fully supported by recent versions of 35 | two compilers, which means it's standard neither in theory nor in practice.* 36 | 37 | C++ is well-known for its standard library of containers and algorithms, 38 | and many useful primitives. 39 | On the other hand, Fortran has very few built-in algorithms, and the only 40 | data structure in Fortran is an array. 41 | This isn't too different from C, except that Fortran's strong typing and 42 | more restrictive pointer semantics make it harder to implement a 43 | linked-list than in C. 44 | 45 | Standard libraries are great, and if the C++ STL is a great match for your 46 | application or library, then you should probably use C++. 47 | This does not mean that Fortran is an inferior programming language. 48 | A large number of applications have no need for a linked-list, a dequeue, 49 | or a hash map, and when Fortran applications need something like this, 50 | they either implement it directly or call out to another language. 51 | In NWChem, we implement distributed block sparse arrays with a map, 52 | where the map data structure is a 2xN dimensional array of integers. 53 | It works and, more importantly was a lot simpler than rewriting the 54 | application in C++. 55 | 56 | On the other hand, the Fortran intrinsics that are standard are quite 57 | useful to Fortran applications. Fortran has `MATMUL`, `TRANSPOSE`, 58 | `DOT_PRODUCT`, and many other common operations for arrays, which are 59 | useful to the applications for which Fortran is used. 60 | C++20 doesn't even have proper multidimensional arrays, and it appears 61 | that the equivalent of `MATMUL` may appear in C++26. 62 | Does this mean that C++ is useless for linear algebra? 63 | Of course not, because just like Fortran programmers, C++ programmers 64 | are capable of using more than one-liner solutions. 
65 | 66 | In conclusion, if you are looking for a programming language ecosystem 67 | with lots of turnkey library routines, C++ and Python are great options. 68 | Fortran and C are not, and that's okay. 69 | 70 | ## Fortran compilers are imperfect 71 | 72 | I've used Fortran compilers from GCC, Intel, PGI->NVIDIA, Cray, IBM, 73 | Fujitsu and Pathscale over the years, and none of them are perfect. 74 | In particular, some of them are inadequate for modern Fortran code. 75 | (Modern Fortran here means Fortran 2008 and later - I'm not aware of 76 | any actively developed compiler that doesn't support Fortran 2003.) 77 | Some people have tried to claim that this means that we have to abandon 78 | Fortran for C++, which has the most amazing compiler support ever, 79 | as long as the only C++ compilers we look at are the latest releases 80 | of GCC and Clang. 81 | 82 | I find it especially rich that anyone argues that C++ is better than 83 | Fortran because the compilers are so great. 84 | They seem to forget what using C++ was like before Cray, IBM and Intel 85 | killed off their C++ front-ends in favor of Clang, which happened 86 | because all the C++ programmers were so obsessed with using the latest 87 | language features that their code only compiled with GCC and later Clang. 88 | I don't use Windows so I don't really know about MSVC, but as best I 89 | can tell, there are only two usable C++ compilers for Linux, and 90 | one of them is propelled ever forward by the collective might of 91 | companies with a market capitalization in excess of three trillion dollars, 92 | who contribute at least $100M a year in employee effort to the project. 93 | Yet, when investment of $10M to make Fortran compilers better is proposed, 94 | Fortran users get uncomfortable, and nobody seems to want to spend a 95 | dime on GCC Fortran. 96 | 97 | Compilers aren't free, and good compilers are expensive. 
98 | While it's true that the HPC community gets a free lunch from 99 | Big Tech when it comes to modern C++ support, they still have 100 | to pay for OpenMP and GPU support, the maintenance costs of their 101 | parallel C++ frameworks, and an army of people to debug the 102 | template instantiation error of the day. 103 | 104 | The Classic Flang project demonstrated that a multi-vendor open-source 105 | collaboration around Fortran is possible, just like with Clang, 106 | and while the LLVM (new) Flang project isn't finished yet, they 107 | deserve patience in the same way that Clang deserved patience when 108 | it was not yet competitive with GCC. 109 | 110 | ## Fortran is hard to teach and hard to learn 111 | 112 | This one is especially rich coming from the C++ community, a language 113 | that requires having an entire book written about [move semantics](https://www.cppmove.com/). 114 | However, everybody seems to agree that Python is easy to learn, 115 | so let's compare Numpy - the lingua franca and machine learning - and Fortran. 116 | 117 | ### 2D Stencil 118 | 119 | Below are excerpts from the [Parallel Research Kernels](https://github.com/ParRes/Kernels) (PRK) 120 | implementation of the 2D stencil, which is a common homework program in computational science. 121 | The major differences in the code shown are: 122 | 1. Fortran defaults to base-1 array indexing, whereas Python is 0-based like C. 123 | Fortran supports 0-based indexing if somebody has a hard time with the default. 124 | 2. Fortran requires `end do`, whereas Python figures this out from indentation. 125 | 126 | Other than those two trivial differences, the syntax is the same. 127 | Numpy supports the same expressive array syntax that Fortran 90 had. 128 | We can also look at PRK nstream, transpose and dgemm to see that they are also 129 | 1:1 using array expressions and math intrinsics. 
130 | 131 | ```python 132 | b = n-r 133 | for s in range(-r, r+1): 134 | for t in range(-r, r+1): 135 | B[r:b,r:b] += W[r+t,r+s] * A[r+t:b+t,r+s:b+s] 136 | ``` 137 | 138 | ```fortran 139 | b = n-r 140 | do j=-r,r 141 | do i=-r,r 142 | B(r+1:b,r+1:b) = B(r+1:b,r+1:b) + W(i,j) * A(r+i+1:b+i,r+j+1:b+j) 143 | enddo 144 | enddo 145 | ``` 146 | 147 | Of course, the big difference is in the code not shown. 148 | Fortran is strongly typed and requires everything to be declared 149 | (because we are not psychopaths and use `implicit none` everywhere), 150 | whereas Python infers types from the first usage of a variable. 151 | However, when using Numpy, it's often prudent to be somewhat explicit, 152 | e.g. `X = numpy.zeros(n,dtype=float)`, so it's unlikely that 153 | `real(kind=REAL64), allocatable :: A(:,:)` is going to be a showstopper 154 | when learning Fortran. 155 | 156 | While Fortran might require slightly more work than Numpy, they 157 | are similarly expressive when it comes to the mathematical code that matters, 158 | so it's hard to argue that Fortran can't be learned, while thousands of 159 | data scientists are learning Numpy every year. 160 | 161 | ## Memory Safety 162 | 163 | Rust is a relatively new programming language that is obsessed with safety 164 | and correctness, but manages to preserve performance in the process. 165 | The Rust community has written quite a bit about memory safety relative 166 | to C and C++, neither of which are memory safe. One can write a careful 167 | subset of C++ to avoid the flaming chainsaw juggling that is C memory 168 | management, but C++ compilers aren't going to prevent programmers from 169 | doing horrible things the way Rust does. 170 | 171 | I haven't seen anybody talk about Fortran being memory safe. It's certainly 172 | not trying to be memory safe in the way that Rust is, but it is educational 173 | to try and write a memory leak in Fortran, because it's not easy. 
174 | 175 | ## Optional and named arguments 176 | 177 | This is a case where Fortran is just nice, and should be appreciated for it. 178 | Nobody is claiming that C++ is better than Fortran because 179 | of named arguments, because C++ doesn't have named arguments. 180 | 181 | ## Object-Oriented Fortran 182 | 183 | 184 | 185 | 186 | 187 | 188 | -------------------------------------------------------------------------------- /Learning_Programming_Languages.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | Talking to computers is different than talking to humans for a bunch of reasons, but I think there are some similarities. I recently watched a video by professional language teachers about gaining language proficiency and apply those ideas to my own experiences with programming languages. 4 | 5 | # Context 6 | 7 | Professor [Jeffery Brown](https://www.linkedin.com/in/jeffery-brown-a14b8430/) has a [video](https://youtu.be/illApgaLgGA) on how to gain language proficiency. He puts a lot of emphasis on the difference between the words "acquire" and "learn", which I dislike. What he's really saying is, natural/organic/immersive language education is superior to formal, grammar-oriented language education. I agree with this, but I don't know why we can't call it "learning naturally". 8 | 9 | Some of the key ideas in Professor Brown's video are: 10 | 11 | 1. Babies learn to communicate verbally without knowing how to read and with total disregard for language rules and grammar. 12 | 2. Comprehensible language input is the key to learning how language works. 13 | 3. When beginning to learn a language, focus on listening. Listen to someone who is fluent speak. You should look for a "language parent" that serves a similar role to the parent of a baby learning a language. 14 | 4. Only study grammar when you are fluent in the language. 
15 | 16 | Professor Brown espouses the [natural approach](https://en.wikipedia.org/wiki/Natural_approach) of Krashen and Terrell, which is widely used today. I recognize the methods from my German classes in high school, which would have been a lot more effective if I had not been utterly lazy. 17 | 18 | # Learning Fortran 19 | 20 | My first experience becoming proficient in a programming language when I was an intern at PNNL in 2006 ([details](https://github.com/jeffhammond/blog/blob/main/Career_Trajectory.md#doe-csgf-and-pnnl)), when I started working on NWChem, which is written primarily in old-school Fortran (I will define this later). At the time, I had never written any Fortran and I was unable to modify existing programs, which represents 0% fluency. I was familiar with Fortran-style loops from Matlab programming, but Matlab is much simpler than Fortran for a bunch of reasons. 21 | 22 | It has been a little over 14 years, but I'm pretty sure the first thing I had to do as an intern was modify https://github.com/nwchemgit/nwchem/blob/master/src/tce/tce_energy.F, which is an 11K-line subroutine. Back then, it was probably twice as long, but I refactored it for my own sanity after a year or two of experience. The first thing I remember was trying to compile my modifications and getting an unhelpful error (compiler error messages weren't great in those days). I asked the only other person in the room at the time, and [he](https://scholar.google.com/citations?user=1w1T9HYAAAAJ&hl=en) said "you need to indent six spaces. It seemed arbitrary to me, but it worked. So the first thing I learned about Fortran was: indent six spaces. 23 | 24 | During that summer and in the years to follow, I wrote thousands of lines of Fortran. What I have never done in all my years as a programmer are (1) take a course on Fortran programming or (2) engage in any serious study of Fortran grammar. 
While I own at least three books on Fortran, none of them have been even the slightest bit useful to me. I will admit that I have referenced Fortran documentation on the internet from time to time, especially regarding formatted I/O, but the primary methods I used to become proficient in Fortran are: 25 | 26 | 1. Reading code. 27 | 2. Writing code and seeing if it (a) compiles and (b) does the thing I want it to do. 28 | 29 | I recognized that these behaviors are very similar to the language acquisition noted above. While I was recompiling NWChem with my latest modifications -- a multi-hour process until I understood the build system and header files better -- I would read the rest of the code, and a lot of other code, too. **The first key point here is that reading code is hugely important but not something I see practiced much.** Programmers love to point out everything that is wrong with code that already exists and don't seem to see the value in learning from imperfect code. Babies learn to speak the language used at home even when their parent(s) do not speak properly. NWChem is full of all kinds of Fortran but it wasn't hard to figure out the difference between good and bad style from its inherent comprehensibility. 30 | 31 | The second thing I learned to do is stop looking up answers and just answer questions experimentally. You know those Twitter polls asking "without testing it, is this code (a) correct, (b) undefined behavior, (c) blah blah, (d) show me results"? I hate them with the burning passion of [VY Canis Majoris](https://en.wikipedia.org/wiki/VY_Canis_Majoris). I guess those polls are for ISO language lawyers and compiler developers, but they are useless to me. I care about what works, and if something gives me the right answer and passes strict compiler checking and sanitizers, that's all I need to know. Over the years, I've kept a [programmer diary](https://github.com/jeffhammond/HPCInfo) of random tests I've written to see what works. 
Many of those tests involve things that aren't covered by one of those ISO-blessed APIs anyways, and when it comes to what vendors ship on exotic supercomputers, the only thing that matters is what produces the correct results (vendor documentation is rarely perfect). 32 | 33 | It turns out that this is how babies learn to speak. My kids didn't read a book to know how to ask for food. Toddlers emit semi-random sounds until they get what they want, and over time, they get better at it. They get a banana and I get `$?==0`. 34 | 35 | # Learning C 36 | 37 | Later in grad school, I tried to learn C. It turns out this is rather hard coming from Fortran, because Fortran passes everything by reference, and C passes everything by value, I spent a lot of time during my first year with C wondering why I could modify `a` with `void foo(int)`. But eventually I learned, not because I read the Kernighan and Ritchie book or [ISO/IEC 9899](http://www.iso-9899.info/wiki/The_Standard) but because I read thousands of lines of C code that I was able to determine was some form of good. For example, [PSI](https://psicode.org/) isn't perfect -- it was created by a bunch of [3+ star programmers](https://wiki.c2.com/?ThreeStarProgrammer) -- but it was a lot better than the C I knew how to write at the time, and working C is better than C that either doesn't work or doesn't exist. 38 | 39 | I've subsequently become a pretty good C programmer, by reading C code written by people who know what they are doing, and by writing lots of C and seeing how it works. I eventually read K&R, but I read it as a history book, not as a technical manual. C is a rather simple language and modern compilers are so good at issuing warnings when code is non-conforming that it's pretty easy to learn "toddler-style". 40 | 41 | # Learning Java 42 | 43 | At one point, I tried to learn Java. 
I read a book for about a day and decided that it was close enough to C++ that I could just guess-and-check using Eclipse, which was a fantastic IDE for Java. So I'd write C++ in Eclipse and it would tell me what I was doing wrong. I wrote a number of simple but interesting programs in Java this way. 44 | 45 | # Appendix 46 | 47 | This is unnecessary detail but if you are the type of person who reads my blog posts, particularly ones about computer language acquisition, you might be the type of person who cares about unnecessary details. 48 | 49 | ## What does "old-school Fortran" mean? 50 | 51 | TL;DR old-school Fortran means: 52 | 53 | 1. Fixed-source form with 72 columns. 54 | 2. Extensive use of common blocks. 55 | 3. No use of modules, interfaces, polymorphism, user-defined types and other features introduced with Fortran 90/95. 56 | 4. No use of Fortran dynamic memory allocation. Memory management is done with C and passed using sketchy methods. 57 | 5. No use of any Fortran feature that is not implemented in every relevant compiler. 58 | 59 | Steve Lionel, aka "Dr. Fortran" wrote a [blog post](https://stevelionel.com/drfortran/2020/05/16/doctor-fortran-in-military-strength/) that elaborates on a comment I made about the non-existence of actual Fortran 77 codes, which is something I picked up from Jeff Squyres in the MPI Forum during our many discussions of the MPI Fortran bindings (interfaces). 60 | 61 | In the case of NWChem, the aversion to Fortran memory management is not just because that feature was added in Fortran 90. The distributed memory programming model of NWChem, Global Arrays, relies heavily upon interprocess shared memory and one-sided communication, both of which require special memory allocation procedures that are done in C. To make these efficient, NWChem allocates a large slab at program start and suballocates from that using a stack allocator (explicit push+pop). 
This enforces a programmer discipline and makes memory leaks less likely (failure to pop in reverse order of push generates a runtime error). 62 | 63 | (c) Copyright Jeff Hammond, 2020. No reuse permitted except by permission from the author. 64 | -------------------------------------------------------------------------------- /MPI_Needs_ABI.md: -------------------------------------------------------------------------------- 1 | # It's past time for MPI to have a standard ABI 2 | 3 | ## Introduction 4 | 5 | [MPI](https://www.mpi-forum.org/) has always been an API standard. 6 | Implementations are not constrained in how they define opaque types (e.g. `MPI_Comm`), 7 | which means they compile into different binary representations. 8 | This is fine for users who only use one implementation, or are content to recompile their software for each of these. 9 | Many users, including those building both traditional C/C++/Fortran libraries and new languages that use MPI via the C ABI, 10 | are tired of the duplication of effort required because MPI lacks a standard ABI. 11 | 12 | Definitions: 13 | - API = Application Programming Interface, i.e. the signature `MPI_Barrier(MPI_Comm)`. 14 | - ABI = Application Binary Interface, i.e. the binary representation of `MPI_Comm` in memory (e.g. `int` versus a pointer). 15 | 16 | ## Motivating example 17 | 18 | The MPI Forum has often espoused the view that MPI is designed for building libraries. 19 | Many of the APIs in the MPI standard are specifically designed for this purpose, and don't have much use in other contexts, 20 | since attaching state to MPI communications, for example, isn't necessary if the code that calls MPI is monolithic; 21 | in that case, such state can be managed directly by the application. 
22 | 23 | Let's consider a trivial library that implements a dot product using MPI: 24 | ```c 25 | // Please ignore the lack of support for vectors longer than `MPI_INT` or 26 | // error handling - this code is not intended to be used in production. 27 | double parallel_ddot(MPI_Comm comm, const double * x, const double * y, int length) 28 | { 29 | double z = 0.0; 30 | for (int i=0; i libmkl_blacs_intelmpi_ilp64.so.1 73 | lrwxrwxrwx 1 root 31 Mar 25 04:50 /opt/intel/oneapi/mkl/2021.2.0/lib/intel64/libmkl_blacs_intelmpi_lp64.so -> libmkl_blacs_intelmpi_lp64.so.1 74 | lrwxrwxrwx 1 root 31 Mar 25 04:50 /opt/intel/oneapi/mkl/2021.2.0/lib/intel64/libmkl_blacs_openmpi_ilp64.so -> libmkl_blacs_openmpi_ilp64.so.1 75 | lrwxrwxrwx 1 root 30 Mar 25 04:50 /opt/intel/oneapi/mkl/2021.2.0/lib/intel64/libmkl_blacs_openmpi_lp64.so -> libmkl_blacs_openmpi_lp64.so.1 76 | lrwxrwxrwx 1 root 30 Mar 25 04:50 /opt/intel/oneapi/mkl/2021.2.0/lib/intel64/libmkl_blacs_sgimpt_ilp64.so -> libmkl_blacs_sgimpt_ilp64.so.1 77 | lrwxrwxrwx 1 root 29 Mar 25 04:50 /opt/intel/oneapi/mkl/2021.2.0/lib/intel64/libmkl_blacs_sgimpt_lp64.so -> libmkl_blacs_sgimpt_lp64.so.1 78 | ``` 79 | There is one BLACS build for each of MPICH, Open-MPI and SGI MPT, plus one build for each flavor of Fortran ABI. 80 | The Fortran ABI issue is similar but not one we are going to solve in the MPI Forum. 81 | Plus, the Fortran standard experts will explain that this issue is the result of improper use of Fortran 82 | compilers and can be avoided just by using features that already exist in the Fortran standard. 83 | 84 | ## The language use case 85 | 86 | C/C++ and Fortran applications aren't the only consumers of MPI. 87 | Because of MPI's rich capability for multiprocessing, and standard nature, many developers 88 | would like to use MPI from [Python](https://www.python.org/), 89 | [Julia](https://julialang.org/), [Rust](https://www.rust-lang.org/), etc. 90 | 91 | How does one do this? 
Because MPI implementations are all written in C, any language 92 | can call MPI via its own mechanism for calling C ABI symbols, which they all have 93 | due to needing to interact with the Linux operating system, etc. 94 | However, unlike e.g. `malloc`, which has a constant ABI on Linux, these languages 95 | need to know the binary representation of all of the MPI types to call those symbols. 96 | 97 | What this means is that the effort to build and test these MPI wrappers is O(N). 98 | 99 | We see this clearly in the Rust MPI project, [rsmpi](https://github.com/rsmpi/rsmpi), 100 | which reports testing against three different implementations, plus untested user experiences 101 | with a fourth: 102 | > rsmpi is currently tested with these implementations: 103 | > 104 | > * OpenMPI 3.0.4, 3.1.4, 4.0.1 105 | > * MPICH 3.3, 3.2.1 106 | > * MS-MPI (Windows) 10.0.0 107 | > 108 | > Users have also had success with these MPI implementations, but they are not tested in CI: 109 | > 110 | > * Spectrum MPI 10.3.0.1 111 | 112 | They furthermore allude to the O(N) effort here: 113 | 114 | > Since the MPI standard leaves some details of the C API unspecified (e.g. whether to implement certain constants and even functions using preprocessor macros or native C constructs, the details of most types, ...) rsmpi takes a two step approach to generating functional low-level bindings. 115 | > 116 | > First, it uses a thin static library written in C (see rsmpi.h and rsmpi.c) that tries to capture the underspecified identifiers and re-exports them with a fixed C API. This library is built from build.rs using the gcc crate. 117 | > 118 | > Second, to generate FFI definitions tailored to each MPI implementation, rsmpi uses rust-bindgen which needs libclang. See the bindgen project page for more information. 119 | > 120 | > Furthermore, rsmpi uses the libffi crate which installs the native libffi which depends on certain build tools. See the libffi project page for more information. 
121 | 122 | The [libffi](https://en.wikipedia.org/wiki/Libffi) project is used by many projects to call C libraries, so we can expect this pain to reappear over and over. 123 | 124 | We see the same duplication of testing effort in [mpi4py](https://github.com/mpi4py/mpi4py/). 125 | The project's [Azure pipelines](https://github.com/mpi4py/mpi4py/blob/master/.azure/pipelines.yml) show 126 | tests for each of four different versions of Python on Linux, MacOS and Windows, where Linux and MacOS 127 | testing is doubled for MPICH and Open-MPI. 128 | It is possible to argue that projects should test against multiple implementations even if there is only 129 | one MPI ABI, but it's not obvious that this testing should be exhaustive in the way it is today, 130 | or that the hunt for implementation-specific bugs needs to be done in automated CI/CD environments 131 | running in shared-memory instances in the cloud. 132 | 133 | ## How do we solve this problem? 134 | 135 | The first step in solving any problem is to admit that there is a problem. 136 | This is nontrivial in this case, because many in the MPI Forum, especially the implementers, 137 | believe that implementation freedom w.r.t. ABI is a feature, not a defect. 138 | Implementations will often argue that their ABI is the best design, which obviously creates 139 | some irreconcilable differences with other implementations, plus at least the MPICH ABI camp 140 | will argue that, even if their ABI isn't perfect, it's stability is an essential feature of 141 | the MPI ecosystem, and the cost of changing it is too great. 142 | 143 | As the argument goes on, there will be arguments about how compile-time constants allow 144 | lower latency than link-time constants, because of the potential for one cache miss or 145 | one branch prediction on the critical path. 146 | If these performance arguments are valid, we should be able to see the impact experimentally. 
147 | Hemal Shah, Moshe Voloshin, and Devesh Sharma measured MPI latency of MVAPICH2 versus Open-MPI 148 | and presented at [MUG20](http://mug.mvapich.cse.ohio-state.edu/mug/20/). 149 | 150 | mv-vs-ompi 151 | 152 | If we attribute the entire difference between the two libraries to the ABI choice, 153 | then it is a very small effect, on the order of 100 nanoseconds, out of the 2500+ nanoseconds 154 | required to send a small message. 155 | See the [full presentation](http://mug.mvapich.cse.ohio-state.edu/static/media/mug/presentations/20/sharma-mug-20.pdf) for details. 156 | 157 | Perhaps that 100 nanoseconds is due to a cache miss when `MPI_Send` in 158 | Open-MPI dereferences `struct ompi_communicator_t *MPI_Comm`, but it could also be a cache miss 159 | in the guts due to how these libraries represent state that isn't user-visible, 160 | or perhaps it is just the aggregate cost of a few dozen instructions and handful of branches 161 | that MVAPICH2 lacks versus Open-MPI. 162 | The question is whether the MPI user community cares more about these 100 nanoseconds 163 | versus the hours and days it takes humans to build the MPI software ecosystem twice, 164 | three times, or more, because of the lack of a standard ABI. 165 | 166 | Once the MPI Forum accepts that this is a problem, we can have all the 167 | arguments about what the right standard ABI is, but we cannot use any of those 168 | arguments to get in the way of deciding that there is, in fact, a problem to solve. 169 | It is the MPI Forum's responsibility to act on behalf of its users, in order 170 | to remain relevant. 171 | 172 | ## Relevant prior art / best practice 173 | 174 | A very useful principle of the MPI Forum is that it does not standardize research, but practice. 
175 | In the case of ABI compatibility, the HPC user community at [CEA](http://www-hpc.cea.fr/) has found the ABI 176 | issue to be sufficiently obnoxious that they built [wi4mpi](https://github.com/cea-hpc/wi4mpi) 177 | specifically to solve this problem. CEA has been shipping this software since 2016, although 178 | the original license prevented its widespread use until now. 179 | 180 | A similar effort was undertaken at the Perimeter Institute for Theoretical Physics, 181 | leading to [MPItrampoline](https://github.com/eschnett/MPItrampoline), which defines 182 | its own ABI that can be redirected to many different MPI implementations, thereby 183 | solving the O(N) problem for applications. 184 | 185 | It is also known that there is a patent on one particular method of interoperating different 186 | MPI ABIs that prevents its use by the open-source community, but which demonstrates that 187 | at least one company recognized the value of solving this problem important enough to patent it. 188 | Sadly, the patent holder only managed to prevent others from solving the problem in open-source -- 189 | they haven't actually bothered to implement the solution in a commercial product. 190 | 191 | ## Conclusion 192 | 193 | Every rational person in high-performance computing will admit that people are the most valuable 194 | component in our ecosystem. 195 | Furthermore, because MPI is a successful standard, there are hundreds of times more people 196 | using MPI and building software against it than there are implementing it. 197 | It is past time for the MPI Forum to prioritize the needs of its user community over the 198 | needs of its implementers, or the dubious claims of performance overhead due to pointer chasing. 199 | 200 | We need to pay the price of breaking all the ABIs one more time, in order to free our users from the pain 201 | of O(N) build and test of all the code that depends on MPI. 
202 | 203 | ![MPI-ABI](https://user-images.githubusercontent.com/406118/127174807-1cd9676a-eb8b-40d6-8da3-b154121e8182.jpg) 204 | 205 | Modern processors are incredibly good at pointer chasing, branch prediction, etc. 206 | We need to let those processors do their jobs and stop prematurely optimizing for something 207 | that isn't even a proven bottleneck on processors built in the past decade. 208 | 209 | Furthermore, we need MPI implementers to get over their petty design arguments about whose 210 | ABI is superior to the other's and put users first. 211 | 212 | I added [MPI ABI Technical Details](MPI_Needs_ABI_Part_2.md) for people who are convinced this is 213 | a great idea and want to start thinking about what a good ABI might look like. 214 | 215 | ## Acknowledgements 216 | 217 | Thanks to Gonzalo Brito and Jim Dinan for an inspiring discussion on Slack. 218 | 219 | ## Disclaimer and license 220 | 221 | The opinions expressed in this post are exclusively the author's and not those of his 222 | current and past co-workers, co-authors, friends or family members. 223 | 224 | (c) Copyright Jeff Hammond, 2021. No reuse permitted except by permission from the author. 225 | -------------------------------------------------------------------------------- /MPI_Needs_ABI_Part_2.md: -------------------------------------------------------------------------------- 1 | # MPI ABI Technical Details 2 | 3 | This is a follow-up to [It's past time for MPI to have a standard ABI](MPI_Needs_ABI.md), 4 | which includes more technical details about how MPI ABIs work. 5 | 6 | As noted in the first article, the first step in solving a problem is admitting that there 7 | is one, so one should not look at this post unless one has already committed to solving 8 | the problem. 9 | 10 | ## Overview of MPI ABIs 11 | 12 | There are multiple aspects to an MPI ABI. Here are a few: 13 | 14 | - The `MPI_Status` object. 
This is an object with transparent members, and MPI requires it to have specific fields. 15 | - Opaque objects, including `MPI_Comm`, `MPI_Datatype`, `MPI_Win`, etc. 16 | As these are opaque, implementations can define them to be anything. 17 | 18 | [MPI-4](https://www.mpi-forum.org/docs/mpi-4.0/mpi40-report.pdf) 19 | imposes the following constraints on opaque objects: 20 | 21 | > All named constants, with the exceptions noted below for Fortran, can be used in 22 | > initialization expressions or assignments, but not necessarily in array declarations or as 23 | > labels in C `switch` or Fortran `select`/`case` statements. This implies named constants 24 | > to be link-time but not necessarily compile-time constants. The named constants listed 25 | > below are required to be compile-time constants in both C and Fortran. These constants 26 | > do not change values during execution. Opaque objects accessed by constant handles are 27 | > defined and do not change value between MPI initialization (MPI_INIT) and MPI completion 28 | > (MPI_FINALIZE). The handles themselves are constants and can be also used in initialization 29 | > expressions or assignments 30 | 31 | We will see below that MPICH has elected to provide compile-time constants, even though 32 | they are not required. This allows the implementation source code to do some things efficiently, 33 | although portable applications cannot rely on this behavior. 34 | 35 | ## The `MPI_Status` object 36 | 37 | Let's look at three different implementations of the `MPI_Status` object: 38 | 39 | ### New MPICH 40 | 41 | This is the status object after [this commit](https://github.com/pmodels/mpich/commit/4b516e886aa3aa51379e0c3806c911c9333c2cc3), 42 | which made MPICH consistent with Intel MPI, in order to establish the [MPICH ABI initiative](https://www.mpich.org/abi/). 43 | This meant that applications and libraries compiled against Intel MPI could be run using many implementations. 
44 | 45 | ```c 46 | typedef struct MPI_Status { 47 | int count_lo; 48 | int count_hi_and_cancelled; 49 | int MPI_SOURCE; 50 | int MPI_TAG; 51 | int MPI_ERROR; 52 | } MPI_Status; 53 | ``` 54 | 55 | ### Old MPICH 56 | 57 | Prior to being consistent with Intel MPI, MPICH had the following status object. 58 | 59 | ```c 60 | // dnl EXTRA_STATUS_DECL - Any extra declarations that the device 61 | // dnl needs added to the definition of MPI_Status. 62 | ... 63 | typedef struct MPI_Status { 64 | int MPI_SOURCE; 65 | int MPI_TAG; 66 | int MPI_ERROR; 67 | MPI_Count count; 68 | int cancelled; 69 | int abi_slush_fund[2]; 70 | @EXTRA_STATUS_DECL@ 71 | } MPI_Status; 72 | ``` 73 | 74 | ### Open-MPI 75 | 76 | This is from Open-MPI as of [65bb9e6](https://github.com/open-mpi/ompi/blob/65bb9e6b4cffd1cafa23f73b2faf7817c5323ab8/ompi/include/mpi.h.in). 77 | I have not attempted to track the history of the Open-MPI status object. 78 | 79 | ```c 80 | typedef struct ompi_status_public_t MPI_Status; 81 | ... 82 | struct ompi_status_public_t { 83 | /* These fields are publicly defined in the MPI specification. 84 | User applications may freely read from these fields. */ 85 | int MPI_SOURCE; 86 | int MPI_TAG; 87 | int MPI_ERROR; 88 | /* The following two fields are internal to the Open MPI 89 | implementation and should not be accessed by MPI applications. 90 | They are subject to change at any time. These are not the 91 | droids you're looking for. */ 92 | int _cancelled; 93 | size_t _ucount; 94 | }; 95 | typedef struct ompi_status_public_t ompi_status_public_t; 96 | ``` 97 | 98 | The wi4mpi ABI for the status object is the same as Open-MPI's: 99 | ```c 100 | struct CCC_mpi_status_struct { 101 | /* These fields are publicly defined in the MPI specification. 102 | User applications may freely read from these fields. 
*/ 103 | int MPI_SOURCE; 104 | int MPI_TAG; 105 | int MPI_ERROR; 106 | /* The following two fields are internal to the Open MPI 107 | implementation and should not be accessed by MPI applications. 108 | They are subject to change at any time. These are not the 109 | droids you're looking for. */ 110 | int _cancelled; 111 | size_t _ucount; 112 | }; 113 | typedef struct CCC_mpi_status_struct MPI_Status; 114 | ``` 115 | 116 | ### Analysis 117 | 118 | We see here that all variants have the required fields, `MPI_SOURCE`, `MPI_TAG` and `MPI_ERROR`, 119 | and the old MPICH ABI matched the Open-MPI ABI in having both a dedicated `int` field for the cancelled 120 | state plus a count field that supports at least 63b values. 121 | 122 | Apparently, the Intel MPI team decided to save 32 bits of space in their status object and distribute 123 | 63 bits of count and 1 bit of cancelled boolean across two `int` fields, plus they eliminated the ABI 124 | slush fund that would have allowed MPICH to adapt to future changes in the MPI standard that would 125 | have required new fields in the status object. 126 | 127 | There isn't anything wrong with the Intel MPI ABI (aka new MPICH ABI). 128 | Testing the cancelled field involves testing a single bit rather than a 32b field, 129 | but since very few MPI programs cancel receives (and cancelling sends has been deprecated), 130 | the relative costs of these does not matter at all. 131 | The needs of the request object seem to be relatively stable over time, and in hindsight it seems 132 | like the ABI slush might have been unnecessarily conservative. 133 | 134 | In any case, it seems like either the new MPICH or Open-MPI ABI would be fine for standardization. 135 | Some will argue that Open-MPI wastes 31 bits, but perhaps those bits can be used for other things 136 | in some implementations. As this state isn't user-visible it doesn't matter how implementations use 137 | it, as long as they use it consistently. 
138 | 139 | If I was going to standardize an ABI for the status object, I'd put the public fields first and use 140 | 24 bytes total, which is sufficient for what both of the major ABIs do right now. 141 | I'm not aware of any architectural advantage of the 20 bytes Intel MPI uses. 142 | One could be conservative and round up to 32 bytes, which has some architectural advantages, 143 | since many modern CPUs have 256-bit data paths. 144 | ```c 145 | typedef struct MPI_Status { 146 | int MPI_SOURCE; 147 | int MPI_TAG; 148 | int MPI_ERROR; 149 | int extra[3]; 150 | } MPI_Status; 151 | ``` 152 | 153 | ## MPI datatypes 154 | 155 | MPI datatypes are opaque objects, which means implementations can represent them however they want. 156 | Here we see different philosophies in MPICH and Open-MPI. 157 | 158 | ### MPICH 159 | 160 | MPICH's [mpi.h](https://github.com/pmodels/mpich/blob/main/src/include/mpi.h.in) contains the following: 161 | ```c 162 | typedef int MPI_Datatype; 163 | #define MPI_CHAR ((MPI_Datatype)0x4c000101) 164 | #define MPI_SIGNED_CHAR ((MPI_Datatype)0x4c000118) 165 | #define MPI_UNSIGNED_CHAR ((MPI_Datatype)0x4c000102) 166 | #define MPI_BYTE ((MPI_Datatype)0x4c00010d) 167 | #define MPI_WCHAR ((MPI_Datatype)0x4c00040e) 168 | #define MPI_SHORT ((MPI_Datatype)0x4c000203) 169 | #define MPI_UNSIGNED_SHORT ((MPI_Datatype)0x4c000204) 170 | #define MPI_INT ((MPI_Datatype)0x4c000405) 171 | #define MPI_UNSIGNED ((MPI_Datatype)0x4c000406) 172 | #define MPI_LONG ((MPI_Datatype)0x4c000807) 173 | #define MPI_UNSIGNED_LONG ((MPI_Datatype)0x4c000808) 174 | #define MPI_FLOAT ((MPI_Datatype)0x4c00040a) 175 | #define MPI_DOUBLE ((MPI_Datatype)0x4c00080b) 176 | #define MPI_LONG_DOUBLE ((MPI_Datatype)0x4c00080c) 177 | #define MPI_LONG_LONG_INT ((MPI_Datatype)0x4c000809) 178 | ``` 179 | These values are obviously special, but how? 
180 | One feature is that they encode the size of built-in datatypes 181 | such that these can be queried trivially with this macro: 182 | ```c 183 | #define MPIR_Datatype_get_basic_size(a) (((a)&0x0000ff00)>>8) 184 | ``` 185 | There are a bunch of other macros that take advantage of the 186 | hidden structure of the `MPI_Datatype` handle that the reader 187 | can study in [mpir_datatype.h](https://github.com/pmodels/mpich/blob/main/src/include/mpir_datatype.h) 188 | 189 | ### Open-MPI 190 | 191 | Open-MPI's [mpi.h](https://github.com/open-mpi/ompi/blob/master/ompi/include/mpi.h.in) 192 | defines the datatype handle to be a pointer, which means that built-in datatypes 193 | cannot be compile-time constants, although they are link-time constants, which ends 194 | up being similarly efficient with modern toolchains, for most purposes. 195 | ```c 196 | typedef struct ompi_datatype_t *MPI_Datatype; 197 | ... 198 | /* C datatypes */ 199 | #define MPI_DATATYPE_NULL OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_datatype_null) 200 | #define MPI_BYTE OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_byte) 201 | #define MPI_PACKED OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_packed) 202 | #define MPI_CHAR OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_char) 203 | #define MPI_SHORT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_short) 204 | #define MPI_INT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_int) 205 | #define MPI_LONG OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_long) 206 | #define MPI_FLOAT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_float) 207 | #define MPI_DOUBLE OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_double) 208 | ... 
209 | ``` 210 | 211 | In contrast to MPICH, Open-MPI has to lookup the size of the datatype 212 | inside of a [352-byte `struct`](https://github.com/open-mpi/ompi/blob/master/opal/datatype/opal_datatype.h#L145), 213 | which is not a concerning overhead 214 | since the type of MPI code that will notice such an overhead is going 215 | to pass the same datatype over and over, in which case, the CPU is going 216 | to cache and correctly branch-predict the lookup and associated usage 217 | every time. 218 | ``` 219 | static inline int32_t opal_datatype_type_size(const opal_datatype_t *pData, size_t *size) 220 | { 221 | *size = pData->size; 222 | return 0; 223 | } 224 | ``` 225 | 226 | ### wi4mpi 227 | 228 | wi4mpi defines all the opaque handles to be `size_t`, which ensures they are at 229 | least as big as MPICH's `int` handles and Open-MPI's pointer handles, 230 | although I don't know if this is the reason. 231 | ``` 232 | typedef size_t MPI_Comm; 233 | typedef size_t MPI_Datatype; 234 | typedef size_t MPI_Errhandler; 235 | typedef size_t MPI_File; 236 | typedef size_t MPI_Group; 237 | typedef size_t MPI_Info; 238 | typedef size_t MPI_Op; 239 | typedef size_t MPI_Request; 240 | typedef size_t MPI_Message; 241 | typedef size_t MPI_Win; 242 | ``` 243 | 244 | wi4mpi defines the built-in datatypes to be sequential integers, 245 | which means they are not attempting to encode useful information 246 | the way MPICH's do, although they are compile-time constants, 247 | unlike Open-MPI's. 248 | I do not know if compile-time constancy is important in wi4mpi. 249 | ```c 250 | /* C datatypes */ 251 | #define MPI_DATATYPE_NULL 0 252 | #define MPI_BYTE 1 253 | #define MPI_PACKED 2 254 | #define MPI_CHAR 3 255 | #define MPI_SHORT 4 256 | #define MPI_INT 5 257 | #define MPI_LONG 6 258 | #define MPI_FLOAT 7 259 | #define MPI_DOUBLE 8 260 | ``` 261 | 262 | ### Analysis 263 | 264 | There are advantages to both approaches. 
MPICH optimizes for the common case of built-in types, 265 | and does a lookup for others, while Open-MPI always does a pointer lookup, but then has what 266 | it needs in both cases. 267 | 268 | The other advantage of the MPICH approach is with Fortran. In Fortran, handles are `INTEGER`, 269 | or handles are a type with a single member that is an `INTEGER`. MPICH conversions between 270 | C and Fortran are trivial (ignoring the case where Fortran `INTEGER` is larger than C `int`, 271 | which is a terrible idea anyways). Open-MPI has to maintain a lookup table to go between 272 | C and Fortran. 273 | 274 | The easy solution here is to use `intptr_t` for handles and change the Fortran 2008 handle 275 | definition to use `intptr_t` for `MPI_VAL`. This allows for trivial conversions between 276 | C and Fortran 2008, for MPICH to continue to use magic values for built-ins, and for Open-MPI 277 | to use pointers. Open-MPI will still need a lookup table for the older Fortran interfaces, 278 | but one of these should be [deprecated](https://github.com/mpi-forum/mpi-issues/issues/561) 279 | in MPI-5 anyways. 280 | 281 | Update: we cannot just make `MPI_VAL` a different type from `INTEGER` because the standard 282 | says that `MPI_F08` handles must be interoperable with the raw handles in the legacy API. 283 | We could rely on truncation if the values of handles were always less than `INT_MAX`, but 284 | this is dangerous and won't happen. 285 | 286 | ## Disclaimer and license 287 | 288 | The opinions expressed in this post are exclusively the author's and not those of his current and past co-workers, co-authors, friends or family members. 289 | 290 | (c) Copyright Jeff Hammond, 2021. No reuse permitted except by permission from the author. 
291 | -------------------------------------------------------------------------------- /MPI_Needs_ABI_Part_3.md: -------------------------------------------------------------------------------- 1 | # MPI ABI Part 3: Launchers 2 | 3 | This is a follow-up to [It's past time for MPI to have a standard ABI](MPI_Needs_ABI.md) 4 | and [MPI ABI Technical Details](MPI_Needs_ABI_Part_2.md). 5 | 6 | ## What's a launcher? 7 | 8 | A launcher is how one causes MPI programs to start running. 9 | It is one of the least specified aspects of MPI, for various reasons. 10 | In MPI 4.1 11.5 "Portable MPI Process Startup", a syntax for a launcher 11 | named `mpiexec` is suggested, and how one uses it for SPMD and MPMD cases. 12 | However, even though `mpiexec` is standardized, many users use the command 13 | `mpirun` or some other platform- or implementation-specific launcher. 14 | For example, on machines with [Slurm](https://slurm.schedmd.com/documentation.html), 15 | `srun` is often a recommended command. 16 | 17 | Regardless of what command is actually used to launch an MPI parallel job, the 18 | interesting parts from an implementation compatibility perspective are unseen. 19 | Using a variety of system commands, the launcher needs to broadcast the binary 20 | and its inputs to every compute node, and execute that binary on every node at least 21 | once. Once all these programs are running, they need to figure out that they need 22 | to connect to each other, no later than in `MPI_Init`. Standard output needs 23 | to be handled properly, which might involve forwarding to the node on which the job 24 | was launched. And finally, while no one wants their MPI programs to fail, if 25 | they do, the launcher needs to clean up the mess and make sure there are no 26 | zombie processes or file handles to clog up the system for subsequent users. 
27 | 28 | One way to do this, which is roughly how MPICH Hydra does things, is 29 | to spawn a proxy on every node that will manage everything within its node. 30 | In this respect, it is a parent that takes its MPI program children outside 31 | and has to clean up after them when they make a mess. For example, if you 32 | SIGKILL an MPI process, you probably want the whole ensemble to go down, 33 | rather than have N-1 processes running along forever until they deadlock. 34 | 35 | ## How to not standardize launchers 36 | 37 | Last time I looked, there are more launchers than there are MPI ABIs, 38 | so standardizing a launcher is at least as hard as getting everyone to agree 39 | on an ABI. However, it may not be necessary, so let's try this: 40 | 41 | In the case of Slurm or other launchers associated not with MPI 42 | implementations but with resource managers, there is no problem. 43 | If the MPI library is compiled with Slurm support, then it knows how 44 | to wire-up inside of `MPI_Init` based on environment variables that Slurm 45 | defines. Both Open-MPI and MPICH support all of the major schedulers. 46 | 47 | For users who expect to use `mpirun` or `mpiexec`, a hack is to 48 | figure out what launcher the program expects and then invoke it. 49 | In this design, `mpiexec` can be a shell script that calls `strings` 50 | or some other introspection method on the binary and figures out if 51 | it's MPICH or Open-MPI or Intel MPI or MVAPICH2, and then calls 52 | the implementation specific `mpiexec`. This is not an elegant 53 | method but it probably works for a lot of users, and isn't any worse 54 | than the mess we have right now. 55 | 56 | ## Disclaimer and license 57 | 58 | The opinions expressed in this post are exclusively the author's and not those of his current and past co-workers, co-authors, friends or family members. 59 | 60 | (c) Copyright Jeff Hammond, 2021. No reuse permitted except by permission from the author. 
61 | -------------------------------------------------------------------------------- /MPI_Needs_ABI_Part_4.md: -------------------------------------------------------------------------------- 1 | # MPI ABI Design Ideas and Challenges 2 | 3 | This is not intended to be useful to others right now, but is a note-taking space for me... 4 | 5 | 6 | ## Context 7 | 8 | This is important: 9 | 10 | > All named constants, with the exceptions noted below for Fortran, can be used in initialization expressions or assignments, 11 | > but not necessarily in array declarations or as labels in C switch or Fortran select/case statements. 12 | > This implies named constants to be link-time but not necessarily compile-time constants. 13 | > The named constants listed below are required to be compile-time constants in both C and Fortran. 14 | > These constants do not change values during execution. Opaque objects accessed by constant handles are defined 15 | > and do not change value between MPI initialization (MPI_INIT) and MPI completion (MPI_FINALIZE). 16 | > The handles themselves are constants and can be also used in initialization expressions or assignments. 17 | 18 | ## Ideas 19 | 20 | MPICH handles are `int`. Open-MPI handles are pointers. 21 | We could standardize handles to be `intptr_t` so that both designs are valid, but we can do better. 22 | One issue with both approaches is the lack of type checking. 23 | For example, [this bug](https://github.com/ParRes/Kernels/commit/ee5e5fb09019bd78325d9680cd93f52858812aa4) 24 | existed for years because the developers only tested with MPICH-based implementations. 25 | More generally, C compilers have no way to distinguish between different `typedef`-to-`int` handles, 26 | and thus in calls where it is possible to transpose handles, compilers may struggle to detect these mistakes 27 | with some implementations (e.g. MPICH). 
28 | 29 | We can learn from the Fortran 2008 design here, and make handles a C `struct`, which contains 30 | a single value, `intptr_t`. This allows C compilers to check handles for type-correctness, 31 | but adds no overhead, because there is no overhead to accessing the first element 32 | of a struct. 33 | 34 | Furthermore, since the exact same type can be defined in Fortran 2003, we can eliminate 35 | handle conversion functions altogether. 36 | Handle conversions will remain required for `use mpi` (`mpif.h` should be deleted in MPI-5) 37 | but that's a necessary evil for legacy Fortran users. 38 | 39 | Today, handle conversion overhead is nontrivial in operations like `MPI_Waitall`, because 40 | a temporary vector must be allocated (unless the implementation "cheats" in some way). 41 | The proposed ABI definition of handles will eliminate this. 42 | 43 | ### C handles 44 | 45 | This is how a handle should be defined: 46 | ```c 47 | typedef union { 48 | intptr_t val; 49 | // below is implementation defined, and can be anything 50 | // as long as it does not change the layout implied by intptr_t 51 | #ifdef IMPLEMENTATION_HAS_POINTER_HANDLES 52 | void * our_pointer; 53 | #endif 54 | #ifdef IMPLEMENTATION_HAS_INT_HANDLES 55 | int our_integer; 56 | #endif 57 | } MPI_Handle; 58 | ``` 59 | The name of the member of the `union` does not matter, because users should not access them. 60 | There is not a lot of value in obfuscating the contents, and some of the methods for doing 61 | that make type checking impossible. 62 | Having type checking for well-behaved users is far more important than trying to prevent 63 | users who want to violate the standard from writing illegal code. 
64 | 65 | ### Fortran handles 66 | 67 | We should change this: 68 | ```fortran 69 | type, bind(C) :: MPI_Handle 70 | integer :: MPI_VAL 71 | end type MPI_Handle 72 | ``` 73 | to this 74 | ```fortran 75 | type, bind(C) :: MPI_Handle 76 | integer(kind=c_intptr_t) :: MPI_VAL 77 | end type MPI_Handle 78 | ``` 79 | at which point all of the C-Fortran handle interoperability stuff becomes irrelevant. 80 | 81 | Right now, Fortran handle conversions are trivial with MPICH but not trivial with Open-MPI. 82 | No implemenation will have overhead with the MPI-5 ABI. 83 | 84 | Alternatively, if we don't change the Fortran ABI, having the C ABI makes it easy to write 85 | a standalone Fortran 2008 module, which can have a better Fortran ABI. 86 | 87 | ## Challenges 88 | 89 | ## `MPI_BSEND_OVERHEAD` 90 | 91 | > The MPI constant MPI_BSEND_OVERHEAD provides an upper bound on the additional space consumed by the entry 92 | > (e.g., for pointers or envelope information). 93 | 94 | This is implementation-specific. We need to agree on an upper-bound so that it can be standardized. 95 | 96 | ## Other compile-time constants 97 | 98 | Right now, all we say about thread levels is: 99 | > These values are monotonic; i.e., MPI_THREAD_SINGLE < MPI_THREAD_FUNNELED < MPI_THREAD_SERIALIZED < MPI_THREAD_MULTIPLE. 100 | MPICH defines them in a very logical way. There is no reason not to standardize this, or something similar. 101 | ``` 102 | MPI_THREAD_SINGLE = 0 103 | MPI_THREAD_FUNNELED = 1 104 | MPI_THREAD_SERIALIZED = 2 105 | MPI_THREAD_MULTIPLE = 3 106 | ``` 107 | 108 | There are no rules for how these can be defined, but again, we have to pick something to standardize. 109 | ``` 110 | MPI_IDENT = 0 111 | MPI_CONGRUENT = 1 112 | MPI_SIMILAR = 2 113 | MPI_UNEQUAL = 3 114 | ``` 115 | 116 | ## String-related constants 117 | 118 | We need to decide on an upper-bound for these, which are currently implementation-specific. 
119 | ``` 120 | MPI_MAX_PROCESSOR_NAME 121 | MPI_MAX_LIBRARY_VERSION_STRING 122 | MPI_MAX_ERROR_STRING 123 | MPI_MAX_DATAREP_STRING 124 | MPI_MAX_INFO_KEY 125 | MPI_MAX_INFO_VAL 126 | MPI_MAX_OBJECT_NAME 127 | MPI_MAX_PORT_NAME 128 | ``` 129 | 130 | ## Other constants 131 | 132 | These are also compile-time constants: 133 | ``` 134 | MPI_VERSION 135 | MPI_SUBVERSION 136 | MPI_F_STATUS_SIZE (C only) 137 | MPI_STATUS_SIZE (Fortran only) 138 | MPI_ADDRESS_KIND (Fortran only) 139 | MPI_COUNT_KIND (Fortran only) 140 | MPI_INTEGER_KIND (Fortran only) 141 | MPI_OFFSET_KIND (Fortran only) 142 | ``` 143 | 144 | `MPI_VERSION` and `MPI_SUBVERSION` remain specified based on the library features, at compile-time. 145 | Users can use `MPI_Get_version` to verify consistency with run-time support. 146 | 147 | `MPI_F_STATUS_SIZE` and `MPI_STATUS_SIZE` are fixed as soon as the ABI of `MPI_Status` is defined. 148 | 149 | `MPI_*_KIND` follow from standardization of the associated C types. 150 | 151 | ## Fortran compiler support 152 | 153 | These depend on the Fortran compiler, and how the library deals with `CFI_cdesc_t`. 154 | These should be deprecated and replaced with run-time queries, if possible, although 155 | some applications may need to be able to rely on them at compile-time. 156 | ``` 157 | MPI_SUBARRAYS_SUPPORTED (Fortran only) 158 | MPI_ASYNC_PROTECTS_NONBLOCKING (Fortran only) 159 | ``` 160 | These features are associated with Fortran 2018 support, and should be widely supported 161 | by the time we are going to vote on an ABI anyways. 162 | It made sense to make them optional in 2012, but by 2024, they should be required. 163 | 164 | ## Disclaimer and license 165 | 166 | The opinions expressed in this post are exclusively the author's and not those of his 167 | current and past co-workers, co-authors, friends or family members. 168 | 169 | (c) Copyright Jeff Hammond, 2021. No reuse permitted except by permission from the author. 
170 | -------------------------------------------------------------------------------- /MPI_Needs_ABI_Part_5.md: -------------------------------------------------------------------------------- 1 | # Will we ever need 128-bit offsets? 2 | 3 | One of the challenges that has come up in the course of designing the MPI standard ABI is how to define the 4 | MPI integer types. For reference, these are: 5 | - `MPI_Aint` (address integer, but sometimes gets used in other ways) 6 | - `MPI_Offset` (file offset integer) 7 | - `MPI_Count` (large count integer, but also used to hold both of the former) 8 | - `MPI_Fint` (Fortran integer, i.e. the C type equivalent to Fortran's default `INTEGER`) 9 | 10 | It is straightforward to see that `MPI_Aint` should be C `intptr_t`, because this is the 11 | only C type guaranteed to be interconvertible with a pointer (and is signed, since we 12 | can't use unsigned types because those don't exist in Fortran). 13 | 14 | Right now, on both 32- and 64-bit platforms (meaning those with 32- or 64-bit addressing, i.e. pointers) 15 | filesystems are usually 64-bits. The [LFS](https://en.wikipedia.org/wiki/Large-file_support) initiative 16 | was created to allow larger than 2 GiB files on 32-bit platforms. 17 | 18 | There have been efforts to define 128-bit filesystems, but this post is going to explain why MPI 19 | does not need to have 128-bit offsets even if the underlying filesystem uses 128-bit offsets. 20 | MPI file offsets are for a single file, so it is not a question of how large the underlying 21 | filesystem is, but the size and extent of a single MPI file that matters here. 22 | 23 | The limit of a 64-bit offset, or rather a 63-bit offset if we are dealing with signed integers, 24 | is 2^63 = 8 * (1024)^6 = >8 billion gigabytes. Let's look at what it would take to exceed this 25 | limit, in terms of money, time and energy. 
26 | 27 | # Money 28 | 29 | According to https://diskprices.com/, a reasonable quality disk drive costs approximately $10/TB, 30 | so if one bought a filesystem to store exactly one file that requires more than a 63-bit offset, 31 | it's going to cost at least $90M, and that is making very aggressive assumptions about how much 32 | extra gear is required to connect more than a million multi-terabyte drives together such that 33 | they actually work as a filesystem that will hold such a file. 34 | 35 | Obviously, as with everything in computing, prices go down over time, until physical limits are 36 | reached. Based on https://ourworldindata.org/grapher/historical-cost-of-computer-memory-and-storage, 37 | there was a very nice exponential decrease in storage cost between 1990 and 2010, but since then the 38 | exponent has changed and prices have decreased only ~3x in the past 10 years. 39 | Let's assume that trend continues, which means storage costs will go down 10x every 20 years. 40 | That means some time in the 2040s, one might be able to build a filesystem for a single >8 EiB file 41 | that costs around $10M. This is more than most HPC systems cost today... 42 | 43 | The cost problem is much worse once we observe that the $10/TB pricing is for the _cheapest_ 44 | large-capacity storage media available, not for the fast media required to satisfy the time 45 | requirements established below. High-end storage media is at least 3x more expensive than 46 | 7200 RPM drives. That 3x means we need to add another decade in order to have our exafile 47 | cost anywhere near $10M. 48 | 49 | # Time 50 | 51 | Right now, the mean time to interrupt (MTTI) on a large supercomputer is less than a day 52 | and not expected to improve any time soon. Let's assume that we need to write our >8 EiB file 53 | in less than a day. That's already pretty unreasonable, but it gives a way to establish some 54 | conservative bounds. 
What sort of I/O bandwidth is required to write such a file in a day? 55 | One day is 86,400 seconds. At an I/O speed of 1 TB/s, we can write 86.4 PB/day. 56 | We would need an aggregate I/O speed of 100 TB/s to write the 8.64 EB/day required to populate 57 | the gigantic file in question. 58 | 59 | One current I/O standard is PCIe 5.0, which supports 64 GB/s (unidirectional). 60 | PCIe 6.0 is [expected](https://www.theverge.com/2022/1/12/22879732/pcie-6-0-final-specification-bandwidth-speeds) 61 | to support twice that, i.e. 128 GB/s, with x16. It seems we need around 1000 I/O devices 62 | to drive this filesystem, but we will probably use a lot more than 1000 drives 63 | to store the exafile, so the speed limit is the storage media, not the I/O into it. 64 | 65 | Right now, state-of-the-art storage media supports approximately 10 GB/s of write 66 | bandwidth. That bandwidth goes down as an SSD gets full, as it will when writing 67 | the exafile, but let's ignore that. Right now, the best SSDs max out PCIE with only x4, 68 | and it's likely that will only change for the better. If PCIe doubles a few more times 69 | in the next decade, which is optimistic, then we might expect to be able to write 70 | at 256 GB/s to the most expensive SSDs in the future. 71 | This means that we might be able to write the exafile in less than a day, 72 | assuming we can build a filesystem with thousands of drives and the interconnect 73 | required to move the bytes where they need to go. 74 | 75 | # Energy 76 | 77 | Writing to storage costs approximately 1 nanojoule/bit according to 78 | [this](http://large.stanford.edu/courses/2018/ph240/jiang2/). 79 | Our exafile requires writing 2^66 bits, or approximately 74 gigajoules. 80 | A watt is defined to be a joule per second. So we expend 74 gigajoules 81 | in 86400 seconds, which is 854 kW. That's a lot of power but not 82 | unreasonable for data center operations, where large systems routinely 83 | require many megawatts. 
84 | 85 | It doesn't look like power is the limit to storing the exafile. 86 | 87 | # Conclusion 88 | 89 | I am not an expert at these things, but it certainly seems like we have 90 | approximately 30 years before anybody is going to be able to afford 91 | to write a file that requires `MPI_Offset` to be more than 63 bits. 92 | 93 | Right now, OLCF's [Orion filesystem](https://community.hpe.com/t5/servers-systems-the-right/meet-the-world-s-largest-and-fastest-parallel-file-system/ba-p/7155645) 94 | has a capacity of 690 PB and an aggregate bandwidth of 10 TB/s. 95 | The most aggressive use case for this filesystem is: 96 | 97 | > Each of simulations consisted of 2 trillion particles and more than 1,000 steps. 98 | > The data generated by ONE simulation could total a whopping 200 PB, 99 | 100 | which one can reasonably assume requires approximately 1000 files of 101 | 200 TB each, which is consistent with our assumptions. 102 | In theory, users could insist on using exactly one file, in which 103 | case they are within 40x of the 63-bit limit, but 40x more data 104 | depends on at least 40x more simulation, which is likely three 105 | generations of supercomputers away, or more than a decade away. 106 | 107 | # Disclaimer and license 108 | 109 | The opinions expressed in this post are exclusively the author's and not those of his current and past co-workers, co-authors, friends or family members. 110 | 111 | (c) Copyright Jeff Hammond, 2023. No reuse permitted except by permission from the author. 112 | -------------------------------------------------------------------------------- /MPI_Needs_ABI_Part_6.md: -------------------------------------------------------------------------------- 1 | # MPI Fortran ABI Challenges and Solutions 2 | 3 | This article focuses on the Fortran aspects of the MPI ABI. 
4 | The first thing to note is that the Fortran language and compiler ecosystem 5 | does not allow for a standard ABI in the general sense, because Fortran 6 | modules and calling conventions are not standard and known to differ -- 7 | often significantly -- between implementations. 8 | When we discuss the MPI Fortran ABI, we are only referring to the parts 9 | of the MPI C ABI that interact with Fortran. 10 | 11 | # Handle conversion and `MPI_Fint` 12 | 13 | The most obvious interaction between Fortran and the MPI C ABI is `MPI_Fint` 14 | and functions that use it. Sadly, it is allowed for Fortran compilers to 15 | change the size of the type `INTEGER` using compiler flags (e.g., `-i8`), 16 | hence the `MPI_Fint` type in C code has to know what Fortran compiler flags 17 | were used. Thus, there is no way to make this type definition part of the 18 | MPI C ABI, and therefore all of the `MPI__{f2c,c2f}` functions are 19 | ill-defined. 20 | 21 | One solution to the `MPI_Fint` problem would be to define it to C `int` 22 | and disallow MPI Fortran support from using an `INTEGER` that is not 23 | equivalent. Instead, we will add new functions that are nearly identical 24 | to f2c/c2f that do not depend on the Fortran compiler. These functions 25 | are `MPI__{toint,fromint}`. 26 | As long as Fortran `INTEGER` is not smaller than C `int`, which is true 27 | in all reasonable environments, these functions can be used to implement 28 | f2c/c2f in the MPI Fortran API. 29 | 30 | Why bother with these functions at all? Are they strictly necessary? 31 | Once we have the MPI C ABI, the handle types will be known to be C pointers, 32 | which could be stored in Fortran via CFI (C-Fortran interoperability). 33 | Unfortunately, all of the MPI Fortran API assumes handles are `INTEGER`, 34 | or a type that contains an `INTEGER`. 
35 | 36 | In an implementation of the MPI Fortran API, 37 | such as [Vapaa](https://github.com/jeffhammond/vapaa), 38 | it is necessary to convert handles from Fortran to and from C quickly. 39 | For predefined handles, the MPI ABI makes this trivial, since all the 40 | constant values are small and one can cast with truncation. 41 | For user handles, the forward conversion (from Fortran to C) is often 42 | on the critical path, which can be implemented using an array of handles. 43 | However, one of the most performance critical parts of MPI will involve 44 | the back conversion from C to Fortran of requests. It is possible to 45 | implement this mapping in Vapaa but not easy to do efficiently. 46 | It is expected that implementations can provide a more efficient implementation. 47 | 48 | Because the C status object (`MPI_Status`) is now fully specified, no 49 | new conversion functions are required. 50 | 51 | # Fortran types and their MPI datatypes 52 | 53 | When we call a reduction from Fortran with, e.g., `MPI_REAL`, 54 | a user-defined callback will get a datatype argument. 55 | If the C implementation of MPI doesn't know what `MPI_REAL` is, 56 | it's going to detect this as an invalid datatype. 57 | We can't just work around this by translating Fortran types 58 | to C types and passing `MPI_FLOAT`, because this means that 59 | datatype logic inside of user callbacks written in Fortran will 60 | not work. 61 | Thus, the MPI C implementation needs to know that `MPI_REAL` is 62 | valid and to preserve it throughout the program. 63 | At the same time, it also needs to know how to implement built-in 64 | reductions and other features correctly. 65 | 66 | The solution to this problem is a function to inform MPI of the 67 | C equivalents of all Fortran types. This way, the MPI library can 68 | implement `(MPI_REAL,MPI_SUM)` reductions with `MPI_FLOAT` and get 69 | native performance. 
Otherwise, a library like Vapaa would have to 70 | implement all the built-in reductions manually, which is not optimal. 71 | 72 | The other issue here is that MPI may need to implement logical reductions 73 | like `MPI_LAND`, `MPI_LOR` and `MPI_LXOR` in C. This requires it to know 74 | how Fortran `LOGICAL` works. For historical reasons, going back to the 75 | VAX platform, Fortran `LOGICAL` may not behave like C. It may, for example, 76 | use the sign bit to represent booleans, rather than 0 and non-zero. 77 | Even if 0 is `.FALSE.`, `.TRUE.` could be 1 or `0xFFFFFFFF`. 78 | 79 | As before, we need a function to tell MPI what the literal values of 80 | Fortran `.TRUE.` and `.FALSE.` are. 81 | 82 | To illustrate the previous case: 83 | 84 | Fortran Compiler|Flags|`.FALSE.`|`.TRUE.` 85 | ---|---|---|--- 86 | GCC | | 0 | 1 87 | IFX | | 0 | -1 (`0xFFFFFFFF`) 88 | IFX | `-fpscomp logicals` | 0 | 1 89 | NVHPC | | 0 | -1 (`0xFFFFFFFF`) 90 | LLVM 14 (`flang-new`) | | 0 | -1 (`0xFFFFFFFF`) 91 | LLVM 20 (`flang-new`) | | 0 | 1 92 | Cray | | 0 | 1 93 | 94 | I did not probe when in the LLVM version history the values of logical changed to match C. 95 | 96 | Here's more fun data: 97 | 98 | Fortran Compiler|Flags| 0 | 1 | -1 | 2 99 | ---|---|---|---|---|--- 100 | GCC | | F & ! T | T & ! F | T & ! T | T & ! T 101 | IFX | | F & ! T | T & ! F | T & ! F | F & ! T 102 | IFX | `-fpscomp logicals` | F & ! T | T & ! F | T & ! F | T & ! F 103 | NVHPC | | F & ! T | T & ! F | T & ! F | F & ! T 104 | LLVM 14 (`flang-new`) | | F & ! T | T & ! F | T & ! F | F & ! T 105 | 106 | I didn't run this test for LLVM 20 or Cray. 107 | 108 | # Sentinels 109 | 110 | There was a request to provide addresses in C for all of 111 | the Fortran sentinels, not just `MPI_F(08)_STATUS(ES)_IGNORE`. 112 | We declined to solve this, because it's easy to implement directly 113 | in user code in the rare cases (profiling tools) where it is needed.
114 | 115 | See https://github.com/jeffhammond/vapaa/blob/main/source/detect_sentinels.c 116 | and https://github.com/jeffhammond/vapaa/blob/main/source/detect_sentinels_c.F90. 117 | 118 | # Callbacks 119 | 120 | This one is more difficult, and is not part of the current proposal for the standard. 121 | There are multiple use cases, not just Fortran, for extended callbacks that have 122 | extra state associated therewith, the way `MPI_Grequest_start` does. 123 | This allows the language interface to attach language-specific information 124 | about types or error-handling to the callback, so that it can be implemented 125 | more efficiently or in a more idiomatic way. 126 | 127 | A new reduction callback was proposed to address this, but there was too much debate 128 | about its semantics to get it into the first version of the ABI. 129 | Specifically, should the user state be mutable or not, and if so, how 130 | is it protected from concurrent access (race conditions)? 131 | 132 | This doesn't mean the problem cannot be solved. It merely requires callback 133 | trampolines, as are implemented in 134 | [Mukautuva](https://github.com/jeffhammond/mukautuva) and 135 | [MPITrampoline](https://github.com/eschnett/MPItrampoline), 136 | at some added cost. 137 | However, since user-defined operations and callbacks are rarely 138 | on the critical path, this situation is tolerable. 139 | 140 | We intend to fix the callback situation in a future revision of MPI. 141 | 142 | # Module ABIs 143 | 144 | The internal structure of a Fortran module appears to leak into the symbol names. 145 | This means that a design like MPICH's 146 | ```fortran 147 | MODULE MPI 148 | USE MPI_CONSTANTS 149 | USE MPI_SIZEOFS 150 | USE MPI_BASE 151 | USE PMPI_BASE 152 | END MODULE MPI 153 | ``` 154 | may not be compatible with another implementation's module, if it uses different 155 | names internally.
156 | 157 | It is not yet proven that this is a problem, but if it is, then the only way 158 | to get an MPI Fortran module ABI is to specify the internal structure. 159 | Ironically, the terrible `mpif.h` doesn't have this problem, because it doesn't 160 | use modules at all. 161 | -------------------------------------------------------------------------------- /MPI_Needs_ABI_Part_7.md: -------------------------------------------------------------------------------- 1 | # MPI ABI Status Report 2 | 3 | As of December 2024, the MPI ABI proposals, one for C support 4 | and one for partial Fortran support, have passed the 5 | [first vote](https://www.mpi-forum.org/meetings/2024/12/votes) 6 | by the MPI Forum. 7 | 8 | There will be a second vote followed by a vote to approve the entire 9 | release of the standard containing the ABI, but it is unlikely 10 | that future votes will fail, given the lack of substantial objections 11 | to the current content of the proposal. 12 | 13 | As noted in the previous blog 14 | ([part 6](https://github.com/jeffhammond/blog/blob/main/MPI_Needs_ABI_Part_6.md)) 15 | there is still work left to be done to solve the Fortran problem. 16 | In order to get these parts right, we need to wait until the first phase 17 | of the ABI is done and then implement standalone Fortran bindings on 18 | top of it, as [Vapaa](https://github.com/jeffhammond/vapaa/) is doing. 19 | So far, Vapaa is focused only on `MPI_F08` support, but we will also 20 | prototype the legacy API to verify that nothing is lost if the 21 | MPI Fortran API is implemented separately from the MPI C API. 
22 | -------------------------------------------------------------------------------- /MPI_RMA_HOWTO.md: 1 | 2 | -------------------------------------------------------------------------------- /Mentoring_Interns.md: 1 | # Summary 2 | 3 | I've supervised more than twenty interns, all of whom were successful as interns and are -- as best I can tell -- successful in their careers. This post summarizes my approach to mentoring, which boils down to (1) hire people then find projects that suit them, not the other way around, and (2) do whatever it takes to make your interns successful, whether that means leaving them alone or sitting with them an hour or two a day. 4 | 5 | # Hiring 6 | 7 | I have hired 100% of the students who wanted to work for me or who were recommended to me, along with a number of others who I recruited because they had skills and interests that were aligned with my research activities. As I have never declined to hire an applicant, even an unofficial one, I have no idea how to do intern selection. What I can offer is my experience that everyone who wants to contribute can do so, given the right support, and it's up to the mentor to provide this. If you are not up to the challenge of making your interns successful no matter what, maybe you aren't ready to be a mentor. 8 | 9 | # Selecting a project 10 | 11 | It helps to have a rough idea of something you and your intern will do together (and various administrative functions often require it), but the details should be late binding and you should always be open to changing directions if circumstances require. I'll give an example of this later (it's in [Summer of 2011](Summer_of_2011.md)).
12 | 13 | # Defining success 14 | 15 | I applied ["Failure is not an option"](https://en.wikipedia.org/wiki/Failure_Is_Not_an_Option) to every one of my interns. I had a wonderful experience as an intern at PNNL (see [this](https://github.com/jeffhammond/blog/blob/main/Career_Trajectory.md) for details) and a lot of people were incredibly generous with their time to make that possible, and I chose to hold myself to a similar standard as a mentor. 16 | 17 | The grading criteria I used for interns were as follows: 18 | 19 | 1) Minimum success criteria. Whatever we need to accomplish for the internship to not be a complete waste of time. It needs to be something that is entirely within the skills the intern has already and can be achieved with straightforward effort, possibly requiring significant investment from the mentor. Nobody wants to be a failure, and succeeding at something straightforward gives people the confidence to take on bigger challenges. 20 | 21 | 2) Very good results. This is a pretty standard success criterion, such as publishing a peer-reviewed manuscript to which the intern contributed significantly. It should be substantial enough that everyone will recognize the intern's contribution. 22 | 23 | 3) Intergalactic science god. Yes, I actually used those words in some cases. I found that it was useful to provide an aspirational target for my interns that would keep them going if they managed to hit level 2 relatively quickly. More importantly, I wanted them to know that I had a lot of confidence in their abilities and felt that there was a chance to accomplish something so profound that it would make them famous in the scientific community. It's important to have big dreams, even when you're an intern. 24 | 25 | I didn't enumerate these criteria to every intern, but I had them in mind when I was designing their projects.
I'm not sure how other mentors do it, but I like to think that it's somewhat novel to think about 1 and 3, and work really hard to make sure that every intern gets to 1 as quickly as possible, and hopefully to 2. 26 | 27 | And if you are wondering, yes, some of my students hit level 3. The work some of them did, often with very little help from me, changed their field of science and they are deservedly famous for it. At one point in my career, I spent a lot of time working on tensor-related things, but I found that some of my former interns are so much better at this topic than I am that I stopped working on it and found new areas to which I can contribute. 28 | 29 | (c) Copyright Jeff Hammond, 2020. No reuse permitted except by permission from the author. 30 | -------------------------------------------------------------------------------- /README.md: 1 | # About this project 2 | 3 | This is my blog. I did not feel like learning a new system and I like GitHub a lot, so I figured I could just add my writing here and people could figure out how to read it if they want. 4 | 5 | ## Permissions 6 | 7 | I assert copyright on all of my original content here, unless noted otherwise. I don't see why anyone would want to reuse my writing as their own, but if you want to publish something I've created elsewhere, please request permission via a GitHub issue. 8 | 9 | Blog posts that are documenting technical information will probably use a different license. Please check the license at the bottom of every post to be sure about the license. 10 | 11 | # About me 12 | 13 | I am a nerd. I have a life outside of work that revolves around my family that I do not discuss in detail online. 14 | 15 | ## Professional biography 16 | 17 | Jeff Hammond is a Principal Engineer at NVIDIA where he works on HPC software for GPUs and ARM CPUs.
His research interests include parallel programming models and system architecture. Previously, Jeff worked at Intel and the Argonne Leadership Computing Facility where he worked on a range of projects, including MPI-3, oneAPI, Blue Gene and Xeon Phi. Jeff received his PhD in Chemistry from the University of Chicago for work on NWChem. 18 | -------------------------------------------------------------------------------- /Reading_list.md: -------------------------------------------------------------------------------- 1 | These are books I've read or am reading and which I have enjoyed. 2 | 3 | # Currently Reading 4 | 5 | [Focus – The ASML way](https://focus-dewereldvanasml.nl/) by Marc Hijink. 6 | 7 | Putin's People by Catherine Belton. 8 | 9 | A Line in the Sand by James Barr. 10 | 11 | The Key to Nuclear Restraint by Thomas Jonter. 12 | 13 | Dark Sun: The Making of the Hydrogen Bomb, by Richard Rhodes. 14 | 15 | # Planning to Read 16 | 17 | A Concise History of Finland by Kirby. 18 | 19 | Legacy of Ashes by Tim Weiner. 20 | 21 | Lost Kingdom by Serghii Plokhy. 22 | 23 | The Gates of Europe by Serghii Plokhy. 24 | 25 | The Ascent of Money by Niall Ferguson. 26 | 27 | The Battle of Bretton Woods by Steil. 28 | 29 | The Origins of Totalitarianism by Hannah Arendt. 30 | 31 | Sunburst and Luminary by Don Eyles. 32 | 33 | # Have Read 34 | 35 | The Time Machine by H. G. Wells. 36 | 37 | Command and Control by Eric Schlosser. 38 | 39 | [The Demon-Haunted World: Science as a Candle in the Dark](https://en.m.wikipedia.org/wiki/The_Demon-Haunted_World) by Carl Sagan. 40 | 41 | [The Illustrated Man](https://en.wikipedia.org/wiki/The_Illustrated_Man) by Ray Bradbury. 42 | 43 | Chip War by Chris Miller. Outstanding book on the current and past state of semiconductors. 44 | 45 | The Right Stuff by Tom Wolfe. 46 | 47 | The NVIDIA Way by Tae Kim. 48 | 49 | [The Strangest Man: The Hidden Life of Paul Dirac, Quantum Genius](https://en.wikipedia.org/wiki/The_Strangest_Man). 
50 | 51 | [What If?: Serious Scientific Answers to Absurd Hypothetical Questions](https://en.wikipedia.org/wiki/What_If%3F_(book)). 52 | 53 | The Last Empire by Serghii Plokhy. 54 | 55 | [American Prometheus](https://en.wikipedia.org/wiki/American_Prometheus). 56 | 57 | [The Making of the Atomic Bomb by Richard Rhodes](https://en.wikipedia.org/wiki/The_Making_of_the_Atomic_Bomb). 58 | 59 | How Not to Network A Nation by Benjamin Peters. 60 | I stopped reading this one because it was too slow. The topic could be covered with far fewer words. 61 | 62 | Never Split the Difference by Chris Voss. 63 | 64 | [The Great Influenza](https://www.penguinrandomhouse.com/books/288950/the-great-influenza-by-john-m-barry/) by John M. Barry. 65 | I enjoyed this book a great deal. The first third is about the development of modern medicine in the USA, not influenza specifically. 66 | 67 | Abstracting Away the Machine by Lorenzo. 68 | 69 | [Modern Fortran Explained: Incorporating Fortran 2018](https://academic.oup.com/book/26799) by Michael Metcalf, John Reid, Malcolm Cohen. 70 | This is an excellent book on the topic. If you are serious Fortran programmer, you should own it. 71 | 72 | [Concurrent and Real-Time Programming in Ada](https://doi.org/10.1017/CBO9780511611230) by Alan Burns and Andy Wellings. 73 | I did not want to read this book and didn't enjoy it, partly because of the writing and partly because of the subject matter. 74 | 75 | A History of Finland by Henrik Meinander. 76 | 77 | Finland's Relations with the Soviet Union, 1944-84 by R Allison. 78 | 79 | The Soul of a New Machine by Tracy Kidder. (I guess I didn't finish this one) 80 | 81 | Inside the Message Passing Interface by Supalov. 82 | 83 | [Cybernetics](https://en.wikipedia.org/wiki/Cybernetics:_Or_Control_and_Communication_in_the_Animal_and_the_Machine) by Norbert Wiener. 84 | 85 | [One Giant Leap](https://www.simonandschuster.com/books/One-Giant-Leap/Charles-Fishman/9781501106309) by Charles Fishman. 
86 | This is a great sociopolitical perspective on the Apollo Program. 87 | 88 | [The Supermen](https://www.wiley.com/en-ie/The+Supermen:+The+Story+of+Seymour+Cray+and+the+Technical+Wizards+Behind+the+Supercomputer-p-9780471048855) by Charles J. Murray. 89 | 90 | Ogilvy on Advertising by David Ogilvy. 91 | 92 | The Decline and Fall of IBM by Robert X. Cringely. 93 | 94 | Only the Paranoid Survive by Andy Grove. 95 | 96 | The Innovator's Dilemma by Clayton Christensen. 97 | 98 | Far from the Tree by Andrew Solomon. 99 | 100 | The Information: A History, A Theory, A Flood by James Gleick. 101 | 102 | [The Media Lab](https://www.media.mit.edu/publications/the-media-lab-inventing-the-future-book/) by Stewart Brand. 103 | I love this book, because it's from 1986 and I can see how well they predicted things, 104 | like high-speed internet at home and mobile computing devices. 105 | 106 | [Electrifying America](https://mitpress.mit.edu/9780262140485/) by David Nye. 107 | This one is probably too dry for most people, but I enjoyed it. 108 | I stopped reading 80-90% of the way through, because it was pretty clear that the story was 109 | going to end with most Americans having electricity 😊 110 | 111 | Working on the Bomb: An Oral History of WWII Hanford. By S. L. Sanger. 112 | This was a gift when I left PNNL. It's a great take on the Manhattan Project from a different perspective 113 | than one gets from the Los Alamos centered works. 114 | 115 | [The Puzzle Palace](https://en.wikipedia.org/wiki/The_Puzzle_Palace) by James Bamford. 116 | 117 | [The Nobel Prize](https://www.simonandschuster.com/books/The-Nobel-Prize/Burton-Feldman/9781611457247) by Burton Feldman. 118 | 119 | [Fermat's Enigma](https://en.wikipedia.org/wiki/Fermat%27s_Last_Theorem_(book)) by Simon Singh. 120 | 121 | [Blank Spots on the Map](https://www.penguinrandomhouse.com/books/302310/blank-spots-on-the-map-by-trevor-paglen/) by Trevor Paglen.
122 | 123 | [Expert C Programming: Deep C Secrets](https://www.oreilly.com/library/view/expert-c-programming/0131774298/) by P. van der Linden. 124 | It has been a while since I read this book, but I enjoyed it years ago. 125 | 126 | Who Got Einstein's Office by Regis. 127 | 128 | Genius by James Gleick. 129 | 130 | Fermat's Enigma by Simon Singh. 131 | 132 | How Would You Move Mount Fuji by William Poundstone. 133 | -------------------------------------------------------------------------------- /Summer_of_2011.md: 1 | This was originally part of [Mentoring Interns](Mentoring_Interns.md) and some of the CSGF references are explained in [Career Trajectory](Career_Trajectory.md), although I will eventually write a post dedicated to CSGF. 2 | 3 | # The Summer of 2011 4 | 5 | I've been asked about this before, so I might as well write it down here. In the summer of 2010, I had started talking to Devin Matthews and Edgar Solomonik about doing their DOE-CSGF practica (i.e. internships) with me at Argonne. At the time, Edgar was a student of Jim Demmel's at Berkeley, working on efficient algorithms for matrix computations. Devin was a chemistry graduate student at Texas with John Stanton, working on efficient formulations of coupled-cluster theory and their applications to spectroscopy. 6 | 7 | Edgar and I came up with a plan to do something related to dynamic load-balancing in quantum chemistry, which was closer to his undergraduate research with Sanjay Kale in the Charm++ group. The idea was to add better task scheduling to [MPQC](https://mpqc.org/), which was a C++ code that did some of the same things as NWChem, but was a lot simpler to get working on Blue Gene/P.
8 | 9 | Devin's project was going to be something that would expose him to more modern programming methods, since his work at Texas involved [CFOUR](http://www.cfour.de/), which is the closest thing to a genuine Fortran 77 code that I have ever seen. Given Devin's experience with CFOUR, I figured we'd learn Fortran 95 together, and do something interesting along the way. 10 | 11 | As it turned out, neither of these project ideas worked out. After a week or two, Edgar found that he couldn't get past a bug in the IBM C++ compiler for Blue Gene/P, and given the rate at which those got fixed (2-3 months), we'd lose the entire summer waiting to merely compile the code. Around the same time, Devin decided that Fortran 95 dynamic memory management was just too stupid and he was not going to continue with modern Fortran. (I think Edgar put Devin onto C++ -- Devin is now one of the most talented C++ programmers I know, although he's still fluent in Fortran 77 for CFOUR purposes). 12 | 13 | The other thing that was happening around that time -- this was May -- was that my wife was about to give birth to our first child and I was planning to be absent from the office for a while. Knowing that I was not going to be available to help as much as I normally would and the absence of any promising leads on either of their planned activities, I decided that the prudent thing was to come up with something new that they could do together while I was away. 14 | 15 | The problem that I had been chewing on for a few years was how to do distributed tensor contractions efficiently, particularly on Blue Gene systems. This meant using MPI collectives, not one-sided communication (while Blue Gene/P and /Q were really good at one-sided communication, they were utterly magical when it came to MPI collectives). This project was closer to both of their dissertation projects and would not have been approved by the CSGF program stewards as a planned activity, but our plans were not going well.
16 | 17 | We met in a conference room around the first week of June and I sketched out the problem statement. We agreed it was a worthwhile thing to try to solve, and it was ideally suited for the two of them, since Edgar was a master of distributed linear algebra and Devin was a master of tensors in the context of coupled-cluster theory. I left the lab shortly thereafter and didn't come back for about a month. I was accessible via email and chat, but I don't recall much communication from them. 18 | 19 | By the time I got back to the lab in July, they had solved the problem. They hadn't implemented everything yet, but the ideas were all there. Edgar created the [Cyclops Tensor Framework](https://solomon2.web.engr.illinois.edu/ctf/) (CTF) and Devin wrote what would become [AQUARIUS](https://github.com/devinamatthews/aquarius). Devin wrote CCSD and CCSDT using CTF, and along the way created a prototype of a Cholesky-decomposition based SCF code. The CCSD and CCSDT codes based on CTF were faster than the NWChem TCE, which was at the time the highest performance massively parallel implementation of those methods. Furthermore, CTF was based entirely on simple MPI primitives that were highly optimized on every supercomputer, which ensured portable parallel performance without any special effort, which has never been true of NWChem because of its reliance on one-sided communication. 20 | 21 | Once CTF was published, it became the new standard for other researchers to beat. Some have improved on CTF by improving the design, but the core idea remains: tensor contractions, even ones involving highly symmetric tensors, can and should be done using communication-optimal matrix algorithms combined with collective tensor transposes.
Furthermore, CTF showed that no code generation is required and that all of the operations of coupled-cluster theory, even higher-order methods like CCSDTQ, can be expressed in simple notation that translates efficiently to a small number of back-end functions. 22 | 23 | # Details 24 | 25 | If you want to know more about this project, please read the following: 26 | * [An Overview of Cyclops Tensor Framework](https://solomonik.cs.illinois.edu/talks/molssi-monterey-may-2017.pdf) - Edgar's overview slides. 27 | * [A preliminary analysis of Cyclops Tensor Framework](https://www2.eecs.berkeley.edu/Pubs/TechRpts/2012/EECS-2012-29.html) - The first paper. 28 | * [Cyclops Tensor Framework: reducing communication and eliminating load imbalance in massively parallel contractions 29 | ](https://www2.eecs.berkeley.edu/Pubs/TechRpts/2012/EECS-2012-210.html) - The second paper. 30 | * [A massively parallel tensor contraction framework for coupled-cluster computations](https://www2.eecs.berkeley.edu/Pubs/TechRpts/2014/EECS-2014-143.html) - The third paper. 31 | 32 | [CTF on GitHub](https://github.com/cyclops-community/ctf) has references to additional applications and publications. 33 | 34 | (c) Copyright Jeff Hammond, 2020. No reuse permitted except by permission from the author. 35 | -------------------------------------------------------------------------------- /Turing_Pi_v1.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | I got a Turing Pi v1 ([docs](https://docs.turingpi.com/)) for fun. 4 | 5 | # Acquisition 6 | 7 | The Turing Pi system that you buy is just the board. 8 | You need to buy compute modules and the power supply elsewhere. 9 | I bought six Compute Module 3+ 8GB from [PiShop.us](https://www.pishop.us/product/raspberry-pi-compute-module-3-8gb/) 10 | and the recommended LEDMO power supply from [Amazon](https://www.amazon.com/gp/product/B01461MOGQ/). 
11 | 12 | The total cost of this setup is just shy of $400, not including any additional storage you might need. 13 | Obviously, it helps to have an HDMI cable, monitor, USB keyboard and mouse, and a USB to MicroUSB cable, 14 | but most people who would buy a Turing Pi have those laying around. 15 | 16 | # Setup 17 | 18 | I watched the YouTube videos linked on the Turing Pi website to get an idea of what to do. 19 | The documentation isn't perfect but I managed to do it on the first attempt, and I'm not very good at this sort of thing. 20 | 21 | I downloaded the Raspberry Pi 3 64-bit image of [Ubuntu](https://ubuntu.com/download/raspberry-pi) 22 | because I use 64-bit Ubuntu 20 almost everywhere else. 23 | The Turing Pi people recommend some other distro. 24 | If you want to follow their documentation exactly, use that instead. 25 | 26 | Following the directions on their website, install `usbboot` on some other Linux (or Windows, but who uses that?) system. 27 | You'll use this to boot the compute modules for flashing the OS image onto the eMMC storage. 28 | You will need to do this for every module, which is a bit tedious, but presumably doesn't happen very often. 29 | 30 | First, move the jumper next to the MicroUSB port to `flash` instead of `boot`. Connect the machine where you downloaded the Pi image to the Turing Pi board via USB-to-MicroUSB cable. 31 | 32 | For each compute module, do this: 33 | 34 | * Run `sudo ./rpiboot` in a terminal to mount the eMMC storage. 35 | * Write the image to it. This takes about 10 minutes. 36 | 37 | I used Ubuntu's `Disks` GUI app but one can of course use `dd` instead. 38 | 39 | [This](https://synyx.de/blog/turing-pi-the-ultimate-cluster-board-for-raspis/) appears to be another good description 40 | of the setup process, with more automation. You might find that more useful than mine. 41 | 42 | # Getting Started 43 | 44 | I was dumb and initially only imaged one compute module. 
45 | What I learned from using the first image is that the Pi 3+ can barely run Gnome, 46 | e.g. I saw multiple hangs and even when I went back to the pure terminal, the graphics was sluggish. 47 | This isn't too surprising, but given that I have a pretty good time with Ubuntu 20 and Gnome 48 | on my Raspberry Pi 4, 49 | I subsequently uninstalled Gnome and will only use these things via SSH from a more powerful system. 50 | 51 | After imaging all of the modules, I see all of the node IP addresses in my router and with `nmap`. 52 | I used `pdsh` to make Apt changes symmetrically. 53 | 54 | # Running HPC Workloads 55 | 56 | ## MPI 57 | 58 | Open-MPI is much more reliable at launching processes on the Turing Pi. 59 | The following just works. 60 | ``` 61 | $ mpicc.openmpi -g -Os hello.c -o hello.x && pdsh -R exec -w turing[1-5] ssh -l ubuntu %h scp turing0:/tmp/hello.x /tmp/hello.x && /usr/bin/mpirun.openmpi --host turing0:4,turing1:4,turing2:4,turing3:4,turing4:4,turing5:4 /tmp/hello.x 62 | ``` 63 | 64 | The following does not work. 65 | ``` 66 | $ mpicc.mpich -g -Os hello.c -o hello.x && pdsh -R exec -w turing[1-5] ssh -l ubuntu %h scp turing0:/tmp/hello.x /tmp/hello.x && /usr/bin/mpirun.mpich --host turing0:4,turing1:4,turing2:4,turing3:4,turing4:4,turing5:4 /tmp/hello.x 67 | ``` 68 | 69 | Eventually, I had to change `~/.ssh/config` to use the key by default and just use the raw IP addresses. 70 | I don't know for sure, but it seems like a DNS issue (https://isitdns.com/). 
71 | ``` 72 | $ mpicc.mpich -g -Os hello.c -o hello.x && pdsh -R exec -w turing[1-5] ssh -l ubuntu %h scp turing0:/tmp/hello.x /tmp/hello.x && /usr/bin/mpirun.mpich -launcher ssh --host 192.168.1.23:4,192.168.1.24:4,192.168.1.25:4,192.168.1.26:4,192.168.1.27:4,192.168.1.28:4 /tmp/hello.x 73 | ``` 74 | 75 | ### MPI `hello.c` (in case you need it) 76 | ```c 77 | #include 78 | #include 79 | 80 | int main(int argc, char** argv) 81 | { 82 | MPI_Init(&argc, &argv); 83 | 84 | int np; 85 | MPI_Comm_size(MPI_COMM_WORLD, &np); 86 | 87 | int me; 88 | MPI_Comm_rank(MPI_COMM_WORLD, &me); 89 | 90 | int name_len; //unused 91 | char name[MPI_MAX_PROCESSOR_NAME]; 92 | MPI_Get_processor_name(name, &name_len); 93 | printf("Hello from processor %s, rank %d out of %d processors\n", name, me, np); 94 | 95 | MPI_Finalize(); 96 | 97 | return 0; 98 | } 99 | ``` 100 | 101 | ## NWChem 102 | 103 | ### Install 104 | 105 | NWChem is part of the Debian/Ubuntu package manager: 106 | ``` 107 | apt install nwchem 108 | ``` 109 | While the binary is not built optimally, it's good enough to start. 110 | Building NWChem is not trivial either in human time or execution time of the build -- using a pre-built binary saves a lot of time, although this is eventually offset by increased execution time of the application itself. 111 | 112 | ### Launch 113 | 114 | After doing all the necessary SSH things, including passwordless SSH keys and `~/.ssh/knownhosts`, one can use Open-MPI 4.0.3 (from Apt) to run jobs. Test your MPI installing by running `hostname`. 115 | 116 | ``` 117 | $ /usr/bin/mpirun.openmpi --host turing0:4,turing1:4,turing2:4,turing3:4,turing4:4,turing5:4 /usr/bin/nwchem w9_b3lyp_6-31G_energy.nw 118 | ``` 119 | 120 | ## More NWChem 121 | 122 | This is WIP. I am still debugging the NWChem build... 
123 | ``` 124 | $ /usr/bin/mpirun.mpich -launcher ssh --host 192.168.1.23:4,192.168.1.24:4,192.168.1.25:4,192.168.1.26:4,192.168.1.27:4,192.168.1.28:4 /tmp/nwchem w9_b3lyp_6-31G_energy.nw 125 | ``` 126 | -------------------------------------------------------------------------------- /Why_Finland.md: -------------------------------------------------------------------------------- 1 | # Why Finland? 2 | 3 | This is a question I get asked quite a bit, and it is overdue for me to write about it. 4 | This will be a long post, and probably have quite a few iterations before it is finished. 5 | 6 | I will answer the questions of why I decided to leave the USA and why I did not choose 7 | to live in other countries in separate posts, so if you are looking for that here, 8 | you will be disappointed. 9 | 10 | # Finland is a great place for children 11 | 12 | The juvenile education system is highly regarded. 13 | I will not try to restate what is well-covered 14 | [elsewhere](https://www.weforum.org/agenda/2018/09/10-reasons-why-finlands-education-system-is-the-best-in-the-world). 15 | 16 | Finns do not pay for education in Finland. Ever. 17 | [Only students from outside the EU, EEA and Switzerland pay tuition](https://www.study.eu/article/tuition-fees-in-finland). 18 | This is one of many reasons why it is reasonable to pay higher taxes. 19 | Rather than save up hundreds of thousands of dollars in order to give it to a private university 20 | with a multi-billion dollar endowment in the USA, or a nominally "public" institution 21 | with slightly lower fees (between $20K and $40K a year at present), I can pay my taxes, 22 | apply for citizenship, and send my kids to top universities for less than the cost of a bus pass. 23 | 24 | Even those who pay tuition pay far less than they would in the USA. 
25 | The maximum tuition rate in Finland is Helsinki University, at 26 | €18K/year, which is not too much more than a state resident pays for 27 | [UMass](https://www.massachusetts.edu/education/tuition-fees) 28 | and about half of what an out-of-state domestic student pays at 29 | [UCLA](https://admission.ucla.edu/tuition-aid/tuition-fees). 30 | 31 | # Finland does not treat people like garbage 32 | 33 | Finland also considers homelessness unacceptable and has policies to eliminate it 34 | (https://www.theguardian.com/cities/2019/jun/03/its-a-miracle-helsinkis-radical-solution-to-homelessness). 35 | This is in stark contrast to the USA, which has a 36 | [homeless population](https://en.wikipedia.org/wiki/Homelessness_in_the_United_States) 37 | on par with Helsinki or Portland, Oregon. 38 | 39 | Police in Finland [rarely draw their guns](https://yle.fi/uutiset/osasto/news/police_in_finland_shoot_their_firearms_in_the_line_of_duty_about_ten_times_each_year/10183865) 40 | and frequently [kill no one for an entire year](https://en.wikipedia.org/wiki/List_of_killings_by_law_enforcement_officers_by_country#2010s). 41 | 42 | The Finnish prison system actually attempts to [rehabilitate](https://yle.fi/uutiset/osasto/news/open_prisons_in_finland_are_like_a_holiday_camp__but_they_seem_to_work/11214953) people. 43 | 44 | 45 | -------------------------------------------------------------------------------- /Winning_by_laziness.md: -------------------------------------------------------------------------------- 1 | # Winning via Laziness 2 | 3 | This post describes my life philosophy of never doing things myself that 4 | can be done better by others. 5 | 6 | # Background on Tensors 7 | 8 | As a quantum chemist focused on coupled-cluster theory in graduate school, 9 | I found myself looking at a lot of tensor contractions. 10 | For the uninitiated, tensor contractions are a class of linear algebra operation 11 | that look like matrix multiplication, but with more indices. 
12 | Here is an example of a tensor contraction:
13 | ```
14 | for all i,j,k,a,b,c:
15 | R(i,j,a,b) = T(i,k,a,c) * V(j,k,b,c)
16 | ```
17 | This example might be found in CCSD (coupled-cluster singles and doubles).
18 | 
19 | This can be contrasted with the simpler but similar case of matrix-matrix multiplication:
20 | ```
21 | for all i,j,k:
22 | R(i,j) = T(i,k) * V(j,k)
23 | ```
24 | The well-known libraries for the latter are called the BLAS
25 | (Basic Linear Algebra Subroutines)
26 | and the specific procedure is `_GEMM`, where `_` is a letter
27 | associated with the numerical type used.
28 | 
29 | In the BLAS `_GEMM`, there are 4 possible index permutations
30 | supported, two for each input matrix, depending on whether one
31 | contracts over the row or column indices.
32 | The permutation on the output matrix can be handled implicitly
33 | by swapping the input matrices, because `(AB)^T=B^T A^T`.
34 | 
35 | In the first example, there are many more possibilities.
36 | One can access each tensor in 24 different ways, ranging from
37 | 1234 to 4321. There are thus 24x24x24=13824 distinct implementations
38 | of that single tensor contraction, which is one of many possibilities.
39 | 
40 | There is one obvious simplification possible here, which is to
41 | not try to optimize all the contractions directly, but to first
42 | rearrange the 4D tensors into cases that are handled directly
43 | by the BLAS.
44 | The simplest approach is to rearrange all 24 cases into a single
45 | canonical one, in which case, implementing all 13824 cases boils
46 | down to the application of 23 (24 minus the identity permutation)
47 | permutations to each of the 24 cases of 4D tensors,
48 | and one type of BLAS call.
49 | 
50 | However, as has been shown previously
51 | (cite dissertation and Paul's papers),
52 | tensor permutations are expensive, and may be the bottleneck if
53 | used excessively.
54 | It is therefore prudent to both optimize permutations and to call
55 | the least expensive ones.
56 | One way to reduce the need for expensive permutations is to observe
57 | that the BLAS can perform the canonical matrix transpose permutation
58 | internally, at negligible cost.
59 | Thus, one should be able to use only 11 permutations, and do the
60 | `(12)<->(34)` part of any permutation inside of the BLAS.
61 | There are additional reductions possible, by breaking up contractions
62 | into multiple BLAS calls, including matrix-vector products, not
63 | just matrix-matrix products.
64 | Edoardo Di Napoli and coworkers have studied this.
65 | 
66 | There is another way for one to optimize these operations,
67 | which is to convince other people to do it.
68 | 
69 | # Outsourcing hard problems
70 | 
71 | Many quantum chemists over the years have tried to be smart
72 | and solve hard computational problems with tensor contractions.
73 | I won't name all of them.
74 | I played around with optimizing tensor permutations,
75 | which led to a modest success that made it into my dissertation,
76 | but I knew that there were much better implementations possible,
77 | and, more importantly, that I did not know how to produce them.
78 | 
79 | Supercomputing 2007 (?) was in Austin, Texas, which is home
80 | to the research group of Robert van de Geijn.
81 | Robert's group knows a bit about dense linear algebra.
82 | I met Robert during Supercomputing, and asked him about tensors.
83 | He invited me to his office later in the week, and I spent a
84 | day with Robert and Maggie, talking about everything that was
85 | right and wrong about linear algebra software.
86 | We did not solve any tensor contraction problems that day.
87 | However, I did manage to convince Robert that I had mildly interesting
88 | computational problems to solve.
89 | 90 | I will not go into detail but the long-term result of that discussion, 91 | and many others that 92 | 93 | 94 | --------------------------------------------------------------------------------