mirror of
https://github.com/adambard/learnxinyminutes-docs.git
synced 2024-11-22 04:04:29 +03:00
[OpenMP/en] Added documentation for OpenMP (#4645)
This commit is contained in:
parent
6a1234ad7a
commit
6754700132
346
openmp.html.markdown
Normal file
346
openmp.html.markdown
Normal file
@ -0,0 +1,346 @@
|
||||
---
|
||||
category: tool
|
||||
tool: OpenMP
|
||||
filename: learnopenMP.cpp
|
||||
contributors:
|
||||
- ["Cillian Smith", "https://github.com/smithc36-tcd"]
|
||||
---
|
||||
|
||||
**OpenMP** is a library used for parallel programming on shared-memory machines.
|
||||
OpenMP allows you to use simple high-level constructs for parallelism,
|
||||
while hiding the details, keeping it easy to use and quick to write.
|
||||
OpenMP is supported by C, C++, and Fortran.
|
||||
|
||||
## Structure
|
||||
|
||||
Generally an OpenMP program will use the following structure.
|
||||
|
||||
- **Master**: Start a Master thread, which will be used to set up the environment and
|
||||
initialize variables
|
||||
|
||||
- **Slave**: Slave threads are created for sections of code which are marked by a special
|
||||
directive, these are the threads which will run the parallel sections.
|
||||
|
||||
Each thread will have its own ID which can be obtained using the function
|
||||
`omp_get_thread_num()`, but more on that later.
|
||||
|
||||
```
|
||||
__________ Slave
|
||||
/__________ Slave
|
||||
/
|
||||
Master ------------- Master
|
||||
\___________ Slave
|
||||
\__________ Slave
|
||||
|
||||
```
|
||||
|
||||
## Compiling and running OpenMP
|
||||
|
||||
A simple "hello world" program can be parallelized using the `#pragma omp parallel` directive
|
||||
|
||||
```cpp
|
||||
#include <stdio.h>
|
||||
|
||||
int main() {
|
||||
#pragma omp parallel
|
||||
{
|
||||
printf("Hello, World!\n");
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
Compile it like this
|
||||
|
||||
```bash
|
||||
# The OpenMP flat depends on the compiler
|
||||
# intel : -openmp
|
||||
# gcc : -fopenmp
|
||||
# pgcc : -mp
|
||||
gcc -fopenmp hello.c -o Hello
|
||||
```
|
||||
|
||||
Running it should output
|
||||
|
||||
```
|
||||
Hello, World!
|
||||
...
|
||||
Hello, World!
|
||||
```
|
||||
|
||||
The exact number of "`Hello, Worlds`" depends on the number of cores of your machine,
|
||||
for example I got 12 my laptop.
|
||||
|
||||
## Threads and processes
|
||||
|
||||
You can change the default number of threads using `export OMP_NUM_THREADS=8`
|
||||
|
||||
Here are some useful library functions in the `omp.h` library
|
||||
|
||||
```cpp
|
||||
// Check the number of threads
|
||||
printf("Max Threads: %d\n", omp_get_max_threads());
|
||||
printf("Current number of threads: %d\n", omp_get_num_threads());
|
||||
printf("Current Thread ID: %d\n", omp_get_thread_num());
|
||||
|
||||
// Modify the number of threads
|
||||
omp_set_num_threads(int);
|
||||
|
||||
// Check if we are in a parallel region
|
||||
omp_in_parallel();
|
||||
|
||||
// Dynamically vary the number of threads
|
||||
omp_set_dynamic(int);
|
||||
omp_get_dynamic();
|
||||
|
||||
// Check the number of processors
|
||||
printf("Number of processors: %d\n", omp_num_procs());
|
||||
```
|
||||
|
||||
## Private and shared variables
|
||||
|
||||
```cpp
|
||||
// Variables in parallel sections can be either private or shared.
|
||||
|
||||
/* Private variables are private to each thread, as each thread has its own
|
||||
* private copy. These variables are not initialized or maintained outside
|
||||
* the thread.
|
||||
*/
|
||||
#pragma omp parallel private(x, y)
|
||||
|
||||
/* Shared variables are visible and accessible by all threads. By default,
|
||||
* all variables in the work sharing region are shared except the loop
|
||||
* iteration counter.
|
||||
*
|
||||
* Shared variables should be used with care as they can cause race conditions.
|
||||
*/
|
||||
#pragma omp parallel shared(a, b, c)
|
||||
|
||||
// They can be declared together as follows
|
||||
#pragma omp parallel private(x, y) shared(a,b,c)
|
||||
```
|
||||
|
||||
## Synchronization
|
||||
|
||||
OpenMP provides a number of directives to control the synchronization of threads
|
||||
|
||||
```cpp
|
||||
#pragma omp parallel {
|
||||
|
||||
/* `critical`: the enclosed code block will be executed by only one thread
|
||||
* at a time, and not simultaneously executed by multiple threads. It is
|
||||
* often used to protect shared data from race conditions.
|
||||
*/
|
||||
#pragma omp critical
|
||||
data += data + computed;
|
||||
|
||||
|
||||
/* `single`: used when a block of code needs to be run by only a single
|
||||
* thread in a parallel section. Good for managing control variables.
|
||||
*/
|
||||
#pragma omp single
|
||||
printf("Current number of threads: %d\n", omp_get_num_threads());
|
||||
|
||||
/* `atomic`: Ensures that a specific memory location is updated atomically
|
||||
* to avoid race conditions. */
|
||||
#pragma omp atomic
|
||||
counter += 1;
|
||||
|
||||
|
||||
/* `ordered`: the structured block is executed in the order in which
|
||||
* iterations would be executed in a sequential loop */
|
||||
#pragma omp for ordered
|
||||
for (int i = 0; i < N; ++i) {
|
||||
#pragma omp ordered
|
||||
process(data[i]);
|
||||
}
|
||||
|
||||
|
||||
/* `barrier`: Forces all threads to wait until all threads reach this point
|
||||
* before proceeding. */
|
||||
#pragma omp barrier
|
||||
|
||||
/* `nowait`: Allows threads to proceed with their next task without waiting
|
||||
* for other threads to complete the current one. */
|
||||
#pragma omp for nowait
|
||||
for (int i = 0; i < N; ++i) {
|
||||
process(data[i]);
|
||||
}
|
||||
|
||||
/* `reduction` : Combines the results of each thread's computation into a
|
||||
* single result. */
|
||||
#pragma omp parallel for reduction(+:sum)
|
||||
for (int i = 0; i < N; ++i) {
|
||||
sum += a[i] * b[i];
|
||||
}
|
||||
|
||||
}
|
||||
```
|
||||
|
||||
Example of the use of `barrier`
|
||||
|
||||
```c
|
||||
#include <omp.h>
|
||||
#include <stdio.h>
|
||||
|
||||
int main() {
|
||||
|
||||
// Current number of active threads
|
||||
printf("Num of threads is %d\n", omp_get_num_threads());
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
// Current thread ID
|
||||
printf("Thread ID: %d\n", omp_get_thread_num());
|
||||
|
||||
#pragma omp barrier <--- Wait here until other threads have returned
|
||||
if(omp_get_thread_num() == 0)
|
||||
{
|
||||
printf("\nNumber of active threads: %d\n", omp_get_num_threads());
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
## Parallelizing Loops
|
||||
|
||||
It is simple to parallelise loops using OpenMP. Using a work-sharing directive we can do the following
|
||||
|
||||
```c
|
||||
#pragma omp parallel
|
||||
{
|
||||
#pragma omp for
|
||||
// for loop to be parallelized
|
||||
for() ...
|
||||
}
|
||||
```
|
||||
|
||||
A loop must be easily parallelisable for OpenMP to unroll and facilitate the assignment amoungst threads.
|
||||
If there are any data dependancies between one iteration to the next, OpenMP can't parallelise it.
|
||||
|
||||
## Speed Comparison
|
||||
|
||||
The following is a C++ program which compares parallelised code with serial code
|
||||
|
||||
```cpp
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <ctime>
|
||||
#include <chrono>
|
||||
#include <omp.h>
|
||||
|
||||
int main() {
|
||||
const int num_elements = 100000000;
|
||||
|
||||
std::vector<double> a(num_elements, 1.0);
|
||||
std::vector<double> b(num_elements, 2.0);
|
||||
std::vector<double> c(num_elements, 0.0);
|
||||
|
||||
// Serial version
|
||||
auto start_time = std::chrono::high_resolution_clock::now();
|
||||
for (int i = 0; i < num_elements; i++) {
|
||||
c[i] = a[i] * b[i];
|
||||
}
|
||||
auto end_time = std::chrono::high_resolution_clock::now();
|
||||
auto duration_serial = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
|
||||
|
||||
// Parallel version with OpenMP
|
||||
start_time = std::chrono::high_resolution_clock::now();
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < num_elements; i++) {
|
||||
c[i] = a[i] * b[i];
|
||||
}
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
auto duration_parallel = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
|
||||
|
||||
std::cout << "Serial execution time: " << duration_serial << " ms" << std::endl;
|
||||
std::cout << "Parallel execution time: " << duration_parallel << " ms" << std::endl;
|
||||
std::cout << "Speedup: " << static_cast<double>(duration_serial) / duration_parallel << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
This results in
|
||||
|
||||
```
|
||||
Serial execution time: 488 ms
|
||||
Parallel execution time: 148 ms
|
||||
Speedup: 3.2973
|
||||
```
|
||||
|
||||
It should be noted that this example is fairly contrived and the actual speedup
|
||||
depends on implementation and it should also be noted that serial code may run
|
||||
faster than parallel code due to cache preformace.
|
||||
|
||||
## Example
|
||||
|
||||
The following example uses OpenMP to calculate the Mandlebrot set
|
||||
|
||||
```cpp
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <complex>
|
||||
#include <vector>
|
||||
#include <omp.h>
|
||||
|
||||
const int width = 2000;
|
||||
const int height = 2000;
|
||||
const int max_iterations = 1000;
|
||||
|
||||
int mandelbrot(const std::complex<double> &c) {
|
||||
std::complex<double> z = c;
|
||||
int n = 0;
|
||||
while (abs(z) <= 2 && n < max_iterations) {
|
||||
z = z * z + c;
|
||||
n++;
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
int main() {
|
||||
std::vector<std::vector<int>> values(height, std::vector<int>(width));
|
||||
|
||||
// Calculate the Mandelbrot set using OpenMP
|
||||
#pragma omp parallel for schedule(dynamic)
|
||||
for (int y = 0; y < height; y++) {
|
||||
for (int x = 0; x < width; x++) {
|
||||
double real = (x - width / 2.0) * 4.0 / width;
|
||||
double imag = (y - height / 2.0) * 4.0 / height;
|
||||
std::complex<double> c(real, imag);
|
||||
|
||||
values[y][x] = mandelbrot(c);
|
||||
}
|
||||
}
|
||||
|
||||
// Prepare the output image
|
||||
std::ofstream image("mandelbrot_set.ppm");
|
||||
image << "P3\n" << width << " " << height << " 255\n";
|
||||
|
||||
// Write the output image in serial
|
||||
for (int y = 0; y < height; y++) {
|
||||
for (int x = 0; x < width; x++) {
|
||||
int value = values[y][x];
|
||||
int r = (value % 8) * 32;
|
||||
int g = (value % 16) * 16;
|
||||
int b = (value % 32) * 8;
|
||||
|
||||
image << r << " " << g << " " << b << " ";
|
||||
}
|
||||
image << "\n";
|
||||
}
|
||||
|
||||
image.close();
|
||||
std::cout << "Mandelbrot set image generated as mandelbrot_set.ppm." << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
## Resources
|
||||
|
||||
- [Intro to parallel programming](https://tildesites.bowdoin.edu/~ltoma/teaching/cs3225-GIS/fall17/Lectures/openmp.html)
|
||||
- [Tutorials currated by OpenMP](https://www.openmp.org/resources/tutorials-articles/)
|
||||
- [OpenMP cheatsheet](https://www.openmp.org/wp-content/uploads/OpenMPRefCard-5-2-web.pdf)
|
Loading…
Reference in New Issue
Block a user