OpenMP
Uses a fork-join model with parallel regions
Threads (running on CPU cores within one process) all have access to the same memory
#include <omp.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    omp_set_num_threads(4); // number of threads to fork in parallel regions

    // fork a parallel region
    #pragma omp parallel
    {
        int threadNumber = omp_get_thread_num();

        /* each forked thread will do this */
        if (threadNumber % 2 == 0) printf("hello world\n");
        else printf("bye World!\n");
    }

    return 0;
}
Timing and multithreaded for loops
double omp_start = omp_get_wtime();
double omp_end = omp_get_wtime();
double secsToRun = omp_end - omp_start;
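A minimal timed parallel-for sketch (the array size and the work inside the loop are placeholder choices):

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    int N = 1000000; // placeholder problem size
    double *v = (double *)malloc(N * sizeof(double));

    double omp_start = omp_get_wtime();

    // iterations of the loop are divided among the forked threads
    #pragma omp parallel for
    for (int n = 0; n < N; ++n) {
        v[n] = 2.0 * n;
    }

    double omp_end = omp_get_wtime();
    printf("v[N-1] = %f, seconds = %f\n", v[N - 1], omp_end - omp_start);

    free(v);
    return 0;
}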
Private and Shared Variables
/* specify which variables are shared between threads and which are private to each thread */
#pragma omp parallel default(none) private(n) shared(N,v)
/* combine the above for for loops */
#pragma omp parallel for default(none) private(n) shared(N,v)
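A runnable sketch of the pragma above, assuming v is an array of length N that the threads fill together:

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    int N = 8;
    double *v = (double *)malloc(N * sizeof(double));
    int n;

    // default(none) forces every variable to be listed explicitly:
    // n is private (each thread has its own copy), N and v are shared
    #pragma omp parallel for default(none) private(n) shared(N, v)
    for (n = 0; n < N; ++n) {
        v[n] = 10.0 * n;
    }

    for (n = 0; n < N; ++n) printf("v[%d] = %f\n", n, v[n]);
    free(v);
    return 0;
}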
Syncing threads
// slow approach: the critical section lets only one thread at a time execute the enclosed block
#pragma omp parallel for shared(s)
for (int n = 0; n < N; ++n) {
    #pragma omp critical
    {
        s = s + a[n];
    }
}
// this is faster using atomics: the single update below is performed by one thread at a time
#pragma omp parallel for shared(s)
for (int n = 0; n < N; ++n) {
    #pragma omp atomic
    s = s + a[n];
}
// reduction, best: each thread keeps a private partial sum that is combined at the end
#pragma omp parallel for reduction(+:s)
for (int n = 0; n < N; ++n) {
    s = s + a[n];
}
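Putting the reduction version together as a self-contained sketch (the array a and its contents are placeholders):

#include <omp.h>
#include <stdio.h>

int main(void)
{
    int N = 1000;
    double a[1000], s = 0.0;

    for (int n = 0; n < N; ++n) a[n] = 1.0;

    // each thread accumulates a private copy of s; the copies are summed at the end
    #pragma omp parallel for reduction(+:s)
    for (int n = 0; n < N; ++n) {
        s = s + a[n];
    }

    printf("s = %f (expected %d)\n", s, N);
    return 0;
}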
16 :: MPI
OpenMP vs. MPI
- OpenMP
  - uses a shared memory model
  - forks parallel regions, then rejoins
- MPI
  - each process has its own memory space, which allows us to use larger problem sizes
  - processes don't need to run on the same computer
  - processes pass messages to other processes
  - processes are organized by rank; each process knows only its own rank and shares information by passing messages
#include <mpi.h>
#include <stdio.h>
int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    /* this is run independently on however many processes we run the code with */
    // get the rank and the number of processes in the communicator
    int rank, n_procs;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);    // current rank
    MPI_Comm_size(MPI_COMM_WORLD, &n_procs); // number of processes

    printf("This is process: %d out of %d\n", rank, n_procs);

    // rank divergence: even ranks take a different path
    if (rank % 2 == 0) printf("blah\n");

    MPI_Finalize();
    return 0;
}
Point to point messaging
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank); // find rank of current process

    int N = 10;       // length of message to exchange
    int n, tag = 999; // message tag

    if (rank == 0) // sends a message to rank 1
    {
        int *outBuffer = (int *)calloc(N, sizeof(int));
        for (n = 0; n < N; ++n) outBuffer[n] = n; // fill the message

        int dest = 1;
        MPI_Send(outBuffer, N, MPI_INT, dest, tag, MPI_COMM_WORLD);
    }

    if (rank == 1) // receives the message from rank 0
    {
        MPI_Status status;
        int *inBuffer = (int *)calloc(N, sizeof(int));
        int source = 0;
        MPI_Recv(inBuffer, N, MPI_INT, source, tag, MPI_COMM_WORLD, &status);

        for (n = 0; n < N; ++n)
        {
            printf("received: inBuffer[%d]=%d\n", n, inBuffer[n]);
        }
    }

    MPI_Finalize();
    return 0;
}
17 :: MPI Point-to-Point Communication
Barriers
- Enforces synchronization. Causes a process to wait until all processes in a communicator hit the barrier
- Expensive
#include <mpi.h>
#include <stdio.h>
int main(int argc, char *argv[])
{
int n_procs, rank;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &n_procs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int i;
    for (i = 0; i < n_procs; ++i) {
        if (i == rank) printf("%d of %d processes\n", rank, n_procs);
        MPI_Barrier(MPI_COMM_WORLD); // all processes have to get here before moving on
    }

    MPI_Finalize();
    return 0;
}
Communication between ranks
err = MPI_Send(&data, n_data, MPI_FLOAT, dest_rank, tag, MPI_COMM_WORLD);
err = MPI_Recv(&data, n_data, MPI_FLOAT, src_rank, tag, MPI_COMM_WORLD, &status);
#include <mpi.h>
#include <stdio.h>
int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int tag = 0;
    int send_data = 7;
    int recv_data = -1;
    MPI_Status status;

    printf("Rank %d before: send = %d, recv = %d\n", rank, send_data, recv_data);

    if (rank == 0) MPI_Send(&send_data, 1, MPI_INT, 1, tag, MPI_COMM_WORLD);
    if (rank == 1) MPI_Recv(&recv_data, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);

    printf("Rank %d after: send = %d, recv = %d\n", rank, send_data, recv_data);

    MPI_Finalize();
    return 0;
}
18 :: Parallelization with p-p Communication
- Deadlock
  - multiple processes are each waiting for the others to synchronize
  - i.e. each blocks on a communication the other never reaches, so none can proceed
#include <mpi.h>
int main(int argc, char *argv[])
{
int rank;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // the barrier expects every rank to reach this line, but rank 0 won't, causing deadlock
    if (rank != 0) MPI_Barrier(MPI_COMM_WORLD);

    MPI_Finalize();
    return 0;
}
19 :: MPI Broadcast
Definitions
- Broadcast
  - One process sends a message to the others (MPI_Bcast)
- Sum Reduction
  - All processes collaborate to sum up a value (MPI_Reduce)
- Barrier
  - All processes must enter before any can leave (MPI_Barrier)
- All to all
  - All processes send a same-length message to all others (MPI_Alltoall)
Broadcast
- A process (root process) will send a message to all the other processes
- Internally, each process that already has the data forwards it on, rather than the root sending a copy to every rank itself
int msgRoot = 0; // must be the same root value on every rank involved in the broadcast
MPI_Bcast(msgOut, msgN, MPI_INT, msgRoot, MPI_COMM_WORLD);
#include <mpi.h>
int data = -1;
int myRoot = 0;
MPI_Bcast(&data, 1, MPI_INT, myRoot, MPI_COMM_WORLD);
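A runnable sketch of the broadcast above; the value 42 set on the root is arbitrary:

#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int data = -1;
    int myRoot = 0;
    if (rank == myRoot) data = 42; // only the root has the value before the broadcast

    MPI_Bcast(&data, 1, MPI_INT, myRoot, MPI_COMM_WORLD);

    printf("Rank %d has data = %d\n", rank, data); // every rank now prints 42

    MPI_Finalize();
    return 0;
}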
20 :: MPI Reduce
- Reduce operation
- Values are accumulated from all ranks and the reduced result is deposited at the root process
#include <mpi.h>
err = MPI_Reduce(&data, &reduced_data, n_data, MPI_FLOAT, MPI_SUM, root, MPI_COMM_WORLD);
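A runnable sketch of the call above with MPI_SUM, where each rank contributes its own rank number (an illustrative choice):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    int rank, n_procs;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &n_procs);

    float data = (float)rank;   // each rank contributes its own rank number
    float reduced_data = 0.0f;
    int root = 0;

    MPI_Reduce(&data, &reduced_data, 1, MPI_FLOAT, MPI_SUM, root, MPI_COMM_WORLD);

    // only the root holds the sum 0 + 1 + ... + (n_procs - 1)
    if (rank == root) printf("sum of ranks = %f\n", reduced_data);

    MPI_Finalize();
    return 0;
}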
List of Operations
- MPI_SUM
- MPI_PROD
- MPI_MIN
- MPI_MAX
- MPI_MINLOC :: location of minimum (rank and index)
- MPI_MAXLOC :: location of maximum (rank and index)
21 :: Overview of MPI Collective Communications
MPI_Scatter :: Similar to Bcast, a root sends data to all processes
- Chunks of an array are sent to different processes
- Spread N pieces of data from the root into N/P-size arrays across P processes
MPI_Scatter(send_data, send_count, send_datatype, recv_data, recv_count, recv_datatype, root, comm);
MPI_Gather :: inverse of scatter
MPI_Gather(send_data, send_count, send_datatype, recv_data, recv_count, recv_datatype, root, comm);
- MPI_Allgather
  - a gather followed by a broadcast, sends chunks of data to all ranks
MPI_Allreduce :: Reduce all, a blocking function (sets up a barrier)
- A reduce followed by a Bcast
err = MPI_Allreduce(&data, &reduced_data, n_data, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
MPI_Alltoall
- Takes the ith chunk of data from process j and sends it to process i, which stores it as its jth chunk of data
- Blocking operation
MPI_Alltoall(&sendbuf, send_count, sendtype, &recvbuf, recv_count, recvtype, comm);
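A runnable scatter/gather sketch; the chunk size of 2 and the multiply-by-10 work are arbitrary illustrative choices:

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    int rank, n_procs;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &n_procs);

    int chunk = 2; // entries per process
    int root = 0;
    int *send_data = NULL;

    if (rank == root) { // only the root needs the full array
        send_data = (int *)malloc(n_procs * chunk * sizeof(int));
        for (int n = 0; n < n_procs * chunk; ++n) send_data[n] = n;
    }

    int recv_data[2];
    // spread the n_procs*chunk entries from the root into chunk-size pieces on each rank
    MPI_Scatter(send_data, chunk, MPI_INT, recv_data, chunk, MPI_INT, root, MPI_COMM_WORLD);

    recv_data[0] *= 10; // each rank works on its own chunk
    recv_data[1] *= 10;

    // gather the modified chunks back onto the root, in rank order
    MPI_Gather(recv_data, chunk, MPI_INT, send_data, chunk, MPI_INT, root, MPI_COMM_WORLD);

    if (rank == root) {
        for (int n = 0; n < n_procs * chunk; ++n) printf("%d ", send_data[n]);
        printf("\n");
        free(send_data);
    }

    MPI_Finalize();
    return 0;
}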
22 :: Domain Decomposition and Ghost Regions
23 :: MPI Isend Irecv
- Isend and Irecv
  - nonblocking (unlike MPI_Send and MPI_Recv)
- Test the status with MPI_Test, or wait with MPI_Waitany or MPI_Waitall
- Multiple communications can be queued
MPI_Wait :: Blocks until a specific requested communication is complete
MPI_Waitall :: Blocks until all requested communications are complete
MPI_Waitany :: Blocks until one completes
MPI_Request request;
err = MPI_Isend(&data, numData, MPI_FLOAT, dest_rank, tag, MPI_COMM_WORLD, &request);
err = MPI_Irecv(&data, numData, MPI_FLOAT, source_rank, tag, MPI_COMM_WORLD, &request);

err = MPI_Waitall(n_requests, requests, statuses);
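A runnable sketch of the calls above: a nonblocking ring exchange where each rank posts the receive first, then the send, then waits on both (the ring pattern is only an illustration):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    int rank, n_procs;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &n_procs);

    // each rank sends to its right neighbour and receives from its left neighbour
    int right = (rank + 1) % n_procs;
    int left  = (rank - 1 + n_procs) % n_procs;

    int send_data = rank, recv_data = -1, tag = 0;

    MPI_Request requests[2];
    MPI_Status statuses[2];

    // queue the receive first, then the send (both return immediately)
    MPI_Irecv(&recv_data, 1, MPI_INT, left,  tag, MPI_COMM_WORLD, &requests[0]);
    MPI_Isend(&send_data, 1, MPI_INT, right, tag, MPI_COMM_WORLD, &requests[1]);

    // block until both nonblocking communications are complete
    MPI_Waitall(2, requests, statuses);

    printf("Rank %d received %d from rank %d\n", rank, recv_data, left);

    MPI_Finalize();
    return 0;
}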
Best practices
- Queue the receives first, sends second
- Initialize the requests array to MPI_REQUEST_NULL; MPI_Wait ignores null requests
- Use a distinct tag for each communication pair
24 :: Targeting Parallelism
Amdahl's law
- $T_P$
  - parallel run time
- $T_S$
  - sequential run time
- $P$
  - number of processes
- $S_P$
  - $\frac{T_S}{T_P}$, the parallel speedup
- Serial Fraction
  - The part of every calculation that doesn't benefit from more processors, $s \in [0,1]$
  - Time P processors take is: $$T_P=sT_1+\frac{1-s}{P}T_1$$
  - The parallel speedup: $$S_P=\frac{T_1}{T_P}=\frac{1}{s+\frac{1-s}{P}}$$
- $s=0$
  - Perfectly parallelizable, $S_P = P$
- $s=1$
  - Purely serial, $S_P = 1$
- $s=0.5$
  - Speedup approaches 2 as $P \to \infty$
- $s=0.1$
  - Max speedup you can get is 10
Ex. Speed up 10% of the run time by a factor of 20: $$S_P=\frac{1}{0.9 + \frac{0.1}{20}} \approx 1.1$$
Ex. Speed up 90% of the run time by 2x, but slow down the remaining 10% by 10x:
$$S_P = \frac{1}{0.1 \times 10 + \frac{0.9}{2}} \approx 0.69$$ (a net slowdown)
Gustafson's law
- For larger scale computing with many processors
- Work performed by P processors is:
  $$W_P = sW_0 + P(1-s)W_0$$
- The speedup is then: $$S_P = s + (1-s)P$$
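Ex. (illustrative values) $s=0.1$, $P=100$: $$S_P = 0.1 + 0.9 \times 100 = 90.1$$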
25 :: Binary IO
FILE *file = fopen(filename, "rb"); // rb - read binary
fseek(file, start_byte, reference_keyword); // reference keyword: SEEK_SET, SEEK_CUR, SEEK_END
int *data;
fread(data, sizeof(int), numEntries, file);  // read the data into the data array
fwrite(data, sizeof(int), numEntries, file); // write the data array to the file
fclose(file);
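A self-contained sketch that writes an int array and reads part of it back; the file name data.bin and the entry count are placeholders:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    int numEntries = 8;
    int *data = (int *)malloc(numEntries * sizeof(int));
    for (int n = 0; n < numEntries; ++n) data[n] = n * n;

    // write the array out in binary
    FILE *file = fopen("data.bin", "wb");
    fwrite(data, sizeof(int), numEntries, file);
    fclose(file);

    // read it back, skipping the first two entries with fseek
    file = fopen("data.bin", "rb");
    fseek(file, (long)(2 * sizeof(int)), SEEK_SET);
    int buf[8];
    size_t nRead = fread(buf, sizeof(int), numEntries - 2, file);
    fclose(file);

    printf("read %zu entries, first = %d\n", nRead, buf[0]);
    free(data);
    return 0;
}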
MPI Equiv
MPI_File file;
MPI_File_open(MPI_COMM_WORLD, filename, amode, MPI_INFO_NULL, &file); // amode: MPI_MODE_APPEND, MPI_MODE_CREATE, MPI_MODE_RDONLY, MPI_MODE_WRONLY
MPI_File_seek(file, offset, whence); // whence: MPI_SEEK_SET, MPI_SEEK_CUR, MPI_SEEK_END
MPI_File_read(file, data, count, datatype, &status);
MPI_File_write(file, data, count, datatype, &status);
MPI_File_close(&file);
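A runnable sketch of the MPI calls above, where each rank seeks to its own offset and writes its chunk into one shared file; the file name out.dat and the chunk length are placeholders:

#include <mpi.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int N = 4;
    int *chunk = (int *)malloc(N * sizeof(int));
    for (int n = 0; n < N; ++n) chunk[n] = rank * N + n;

    MPI_File file;
    MPI_Status status;
    MPI_File_open(MPI_COMM_WORLD, "out.dat",
                  MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &file);

    // each rank seeks to its own offset so the chunks don't overlap
    MPI_Offset offset = (MPI_Offset)rank * N * sizeof(int);
    MPI_File_seek(file, offset, MPI_SEEK_SET);
    MPI_File_write(file, chunk, N, MPI_INT, &status);

    MPI_File_close(&file);
    free(chunk);
    MPI_Finalize();
    return 0;
}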
27&28 :: Intro to GPUs
Single Instruction Multiple Data
The CPU and GPU have separate memory, so data must be copied between them explicitly
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// kernel: each thread handles one entry (assumes N is a multiple of the block size)
__global__ void axpy(float a, float *xVec, float *yVec)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    yVec[i] = a * xVec[i] + yVec[i];
}

int main(int argc, char *argv[])
{
    int N = atoi(argv[1]);
    float a = 0.5;

    float *x_host = (float *)malloc(N * sizeof(float));
    float *y_host = (float *)malloc(N * sizeof(float));

    // time the host-side setup with clock()
    // (divide by the number of repetitions if the timed work is repeated)
    clock_t startTime = clock();
    int i;
    for (i = 0; i < N; ++i)
    {
        x_host[i] = (float)i;
        y_host[i] = 1.0f;
    }
    clock_t endTime = clock();
    float cpuTime = (float)(endTime - startTime) / (float)CLOCKS_PER_SEC;
    printf("CPU setup time: %f s\n", cpuTime);

    // allocate GPU memory and copy the host arrays to the device
    float *x_device, *y_device;
    cudaMalloc((void **)&x_device, N * sizeof(float));
    cudaMalloc((void **)&y_device, N * sizeof(float));
    cudaMemcpy(x_device, x_host, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(y_device, y_host, N * sizeof(float), cudaMemcpyHostToDevice);

    // launch the kernel: one thread per entry
    int n_threads_per_block = 16;
    int n_blocks = N / n_threads_per_block;
    axpy<<<n_blocks, n_threads_per_block>>>(a, x_device, y_device);

    // copy the result back to the CPU
    cudaMemcpy(y_host, y_device, N * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(x_device);
    cudaFree(y_device);
    free(x_host);
    free(y_host);
    return 0;
}
OpenMP cheat sheet
- omp_get_num_threads() - get the number of threads in a parallel region
- omp_get_thread_num() - current thread number
- omp_set_num_threads(4) - only use 4 threads
- #pragma omp for - parallelize a for loop
- #pragma omp parallel private(n) shared(N) - parallel region with n local to each thread and N shared
- #pragma omp sections - open a region of sections (see the sketch below)
- #pragma omp section - one thread handles the enclosed code
- #pragma omp critical - within a shared block, ensures the enclosed code runs one thread at a time
- #pragma omp atomic - the single following update also happens one thread at a time
- #pragma omp parallel for reduction(+:s)
  - an addition reduction is performed on the variable s
- omp_get_wtime() - wall-clock time in seconds
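A small sketch of the sections pragmas above; the printed messages are placeholders:

#include <omp.h>
#include <stdio.h>

int main(void)
{
    // two independent tasks run concurrently, one per section
    #pragma omp parallel sections
    {
        #pragma omp section
        { printf("section A on thread %d\n", omp_get_thread_num()); }

        #pragma omp section
        { printf("section B on thread %d\n", omp_get_thread_num()); }
    }
    return 0;
}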