openMP多线程编程-阿里云开发者社区

OpenMP(Open Multi-Processing)

OpenMP缺点：

1：作为高层抽象，OpenMP并不适合需要复杂的线程间同步和互斥的场合；

2：另一个缺点是不能在非共享内存系统(如计算机集群)上使用。在这样的系统上，MPI使用较多。

关于openMP实现 临界区与互斥锁 可参考 reference3

windows系统下使用

==========================WINDOWS系统中使用==========================

基本使用：

在visual C++2010中使用OpenMP

1：将 Project 的Properties中C/C++里Language的OpenMP Support开启（参数为 /openmp）；

2：在编写使用OpenMP 的程序时，则需要先include OpenMP的头文件：omp.h；

3：在要并行化的for循环前面加上 #pragma omp parallel for

如下简单例子：

[cpp]view plain copy
     
   
//未使用OpenMP  
#include <stdio.h>  
#include <stdlib.h>  
  
// Serial baseline helper: spins through a short empty loop to burn a little
// time, then prints n followed by ", " on stdout.
void Test(int n) {
    int spins = 0;
    while (spins < 10000) {
        ++spins;  // no real work; the loop exists only to waste time
    }
    printf("%d, ", n);
}
  
int main(int argc,char* argv[])   
{  
    for(int i = 0; i < 16; ++i)   
    Test(i);  
    system("pause");   
}  

结果为：

0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11,12,13,14,15，

[cpp]view plain copy
     
   
//使用OpenMP  
#include <stdio.h>
#include <stdlib.h>  
#include <omp.h>  
  
// Wastes a little time in an empty countdown loop, then prints n followed by
// ", ". In this example it is called from an OpenMP parallel loop, so the
// order in which different values appear is not fixed.
void Test(int n) {
    for (int remaining = 10000; remaining > 0; --remaining) {
        // intentionally empty: just waste time
    }
    printf("%d, ", n);
}
  
int main(int argc,char* argv[])   
{  
#pragma omp parallel for  
    for(int i = 0; i < 16; ++i)   
    Test(i);  
    system("pause");   
}  

（我的笔记本为2核 4线程）

显示结果为：

0,12,4,8,1,13,5,9,2,14,6,10,3,15,7,11,

OpenMP将循环0-15拆分成0-3,4-7,8-11，12-15四个部分来执行。

当编译器发现#pragma omp parallel for后，自动将下面的for循环分成N份，(N为电脑CPU线程数)，然后把每份指派给一个线程去执行，而且多线程之间为并行执行。

关于获取CPU核数与线程ID

[cpp]view plain copy
     
   
#include <iostream>  
#include <omp.h>  
// Demonstrates omp_get_num_procs() and omp_get_thread_num(): the elements of
// a[] are added into one accumulator slot per thread, and the slots are then
// combined serially into the final sum.
int main(){
    int sum = 0;
    int a[10] = {1,2,3,4,5,6,7,8,9,10};
    // Number of processors available to the program. This counts logical
    // processors (hardware threads): the author's 2-core / 4-thread laptop
    // reported 4.
    int coreNum = omp_get_num_procs();
    // One accumulator slot per processor; "()" value-initializes every slot to 0.
    int* sumArray = new int[coreNum]();
    // num_threads(coreNum) caps the team size at the number of slots. Without
    // it the team size comes from the runtime default / OMP_NUM_THREADS and
    // may exceed coreNum, in which case omp_get_thread_num() would index past
    // the end of sumArray.
#pragma omp parallel for num_threads(coreNum)
    for (int i=0;i<10;i++)
    {
        int k = omp_get_thread_num(); // id of the calling thread within the team
        sumArray[k] = sumArray[k]+a[i];
    }
    // Combine the per-thread partial sums after the parallel region has ended.
    for (int i = 0;i<coreNum;i++)
        sum = sum + sumArray[i];
    delete[] sumArray; // release the buffer allocated with new[]
    std::cout<<"sum: "<<sum<<std::endl;
    return 0;
}

Ubuntu系统中使用

=================ubuntu系统中=====================================

Hands on FAQ:

*怎么在Linux上运行OpenMP程序？
> 只需要安装支持OpenMP的编译器即可，比如GCC 4.2以上版本（好像Fedora Core带的部分4.1版本也支持），或者ICC（我用的version 9.1是支持的，其他没试过）。

*怎么确定编译器是不是支持OpenMP？
> 看编译器安装路径下/include目录里有没有omp.h。

*怎么区分OpenMP程序？
> 程序中有没有以下内容：
> #include <omp.h>
> #pragma omp ...

*怎么编译OpenMP程序？
> gcc -fopenmp [sourcefile] -o [destination file]
> icc -openmp [sourcefile] -o [destination file]

*怎么运行OpenMP程序？
> 编译后得到的文件和普通可执行文件一样可以直接执行。

*怎么设置线程数？
> Method1：在程序中写入omp_set_num_threads(n);
> Method2：export OMP_NUM_THREADS=n;
> 两种方法各有用处，前者只对该程序有效，后者不用重新编译就可以修改线程数。

Example1:并行与串行时间差别

Sequential Version:

[cpp]view plain copy
     
   
#include<iostream>  
#include<sys/time.h>  
#include<unistd.h>  
  
using namespace std;  
  
// Times a fixed busy loop of 1,000,000,000 assignments with gettimeofday()
// and prints "<n> Time=<elapsed> ms" on stdout.
void test(int n)
{
    int a = 0;
    timeval began;
    timeval finished;
    gettimeofday(&began, NULL);
    for (int step = 0; step < 1000000000; ++step) {
        a = step + 1;  // dummy work; the value is never read
    }
    gettimeofday(&finished, NULL);
    // Elapsed wall-clock time in microseconds; converted to ms when printed.
    const double timeUsed = 1000000*(finished.tv_sec-began.tv_sec)+finished.tv_usec-began.tv_usec;
    std::cout << n << " Time=" << timeUsed/1000 << " ms" << std::endl;
}
int main()  
{  
    struct timeval tstart,tend;   
    double timeUsed;   
    gettimeofday(&tstart,NULL);  
    int j=0;   
    for(j=0;j<4;j++)   
    {  
        test(j);  
    }  
    gettimeofday(&tend,NULL);  
    timeUsed=1000000*(tend.tv_sec-tstart.tv_sec)+tend.tv_usec-tstart.tv_usec;  
    cout<<" Total Time="<<timeUsed/1000<<" ms"<<endl;   
    return 0;   
}  

Parallel Version:

[cpp]view plain copy
     
   
#include<iostream>  
#include<sys/time.h>  
#include<unistd.h>  
#include<omp.h>  
  
using namespace std;  
  
// Times a fixed busy loop of 1,000,000,000 assignments with gettimeofday()
// and prints "<n> Time=<elapsed> ms" on stdout.
//
// In this example the function is called concurrently from the threads of an
// OpenMP parallel loop, so the output statement is serialized (see below).
void test(int n)
{
    int a=0;
    struct timeval tstart,tend;
    double timeUsed;
    gettimeofday(&tstart,NULL);
    for(int i=0;i<1000000000;i++)
    {
        a=i+1; // dummy work; the value is never read
    }
    gettimeofday(&tend,NULL);
    // Elapsed wall-clock time in microseconds; converted to ms when printed.
    timeUsed=1000000*(tend.tv_sec-tstart.tv_sec)+tend.tv_usec-tstart.tv_usec;
    // The statement below is a chain of separate stream insertions. When
    // several threads run it at the same time, pieces of different result
    // lines can interleave, so only one thread at a time is allowed to print.
    // The timing above is taken before entering the critical section and is
    // therefore unaffected by any waiting here.
#pragma omp critical
    {
        std::cout<<n<<" Time="<<timeUsed/1000<<" ms"<<std::endl;
    }
}
int main()  
{  
    struct timeval tstart,tend;   
    double timeUsed;   
    gettimeofday(&tstart,NULL);  
    int j=0;   
#pragma omp parallel for  
    for(j=0;j<4;j++)   
    {  
        test(j);  
    }  
    gettimeofday(&tend,NULL);  
    timeUsed=1000000*(tend.tv_sec-tstart.tv_sec)+tend.tv_usec-tstart.tv_usec;  
    cout<<" Total Time="<<timeUsed/1000<<" ms"<<endl;   
    return 0;   
} 

Result:

Sequential version:

[cpp]view plain copy
     
   
0 Time=2064.69 ms  
1 Time=2061.11 ms  
2 Time=2076.32 ms  
3 Time=2077.93 ms  
 Total Time=8280.14 ms  

Parallel version:

[cpp]view plain copy
     
   
2 Time=2148.22 ms  
3 Time=2151.72 ms  
0 Time=2151.85 ms  
1 Time=2151.77 ms  
 Total Time=2158.81 ms  

------------------------------------------------------------------------------------------------------------------------------------------------------------

Example2:矩形拟合法计算Pi

Sequential Version:

[cpp]view plain copy
     
   
#include<iostream>  
#include<sys/time.h>  
#include<unistd.h>  
//#include <omp.h>  
  
using namespace std;  
  
// Sequential baseline: approximates pi with the midpoint rectangle rule
// applied to 4/(1+x^2) on [0,1], then prints the result, the number of steps
// and the elapsed time in milliseconds.
int main ()
{
    static long num_steps =1000000000;
    const double step = 1.0/(double) num_steps;  // width of one rectangle
    double sum = 0.0;
    timeval began;
    timeval finished;
    gettimeofday(&began,NULL);
    // Deliberately serial. The parallel variant of this program differs only
    // by an OpenMP "parallel for" directive with a reduction(+:sum) clause
    // placed on this loop.
    for (int i = 0; i < num_steps; ++i) {
        const double x = (i+0.5)*step;  // midpoint of the i-th rectangle
        sum += 4.0/(1.0+x*x);
    }
    const double pi = step * sum;
    gettimeofday(&finished,NULL);
    // Microseconds elapsed, converted to milliseconds.
    const double timeUsed = (1000000*(finished.tv_sec-began.tv_sec)+finished.tv_usec-began.tv_usec)/1000.0;
    std::cout<<"pi="<<pi<<"  ("<<num_steps<<" )   "<<timeUsed<<" ms"<<std::endl;
    return 0;
}

Parallel Version:

[cpp]view plain copy
     
   
#include<iostream>  
#include<sys/time.h>  
#include<unistd.h>  
#include <omp.h>  
  
using namespace std;  
  
// Parallel version: approximates pi with the midpoint rectangle rule applied
// to 4/(1+x^2) on [0,1], splitting the loop across OpenMP threads, then
// prints the result, the number of steps and the elapsed time in milliseconds.
int main ()
{
    static long num_steps = 1000000000;
    timeval began;
    timeval finished;
    int i;
    double x;
    double sum = 0.0;
    const double step = 1.0/(double) num_steps;  // width of one rectangle
    gettimeofday(&began,NULL);
    // This directive is the only addition compared with the sequential
    // version. reduction(+:sum) gives every thread its own partial sum and
    // adds the partial sums together at the end of the loop; private(x) gives
    // every thread its own copy of the scratch variable x.
#pragma omp parallel for reduction(+:sum) private(x)
    for (i=0;i < num_steps; i++)
    {
        x = (i+0.5)*step;  // midpoint of the i-th rectangle
        sum += 4.0/(1.0+x*x);
    }
    const double pi = step * sum;
    gettimeofday(&finished,NULL);
    // Microseconds elapsed, converted to milliseconds.
    const double elapsedMs = (1000000*(finished.tv_sec-began.tv_sec)+finished.tv_usec-began.tv_usec)/1000.0;
    std::cout<<"pi="<<pi<<"  ("<<num_steps<<" )   "<<elapsedMs<<" ms"<<std::endl;
    return 0;
}

运行结果为：

[cpp]view plain copy
     
   
von@von-pc:~/test$ ./parrPI2  
pi=3.14159  (1000000000 )   3729.68 ms  
von@von-pc:~/test$ ./seqPI2  
pi=3.14159  (1000000000 )   13433.1 ms