OpenMP并行FORTRAN嵌套循环问题
本帖最后由 Kieran 于 2023-4-25 20:33 编辑大家好,
我用OpenMP的COLLAPSE语句并行了FORTRAN的嵌套循环。编译也能成功,但在运行是,程序报错,信息如下。
Loading tbb version 2021.5.1
Loading compiler-rt version 2022.0.2
Loading oclfpga version 2022.0.2
Load "debugger" to debug DPC++ applications with the gdb-oneapi debugger.
Load "dpl" for additional DPC++ APIs: https://github.com/oneapi-src/oneDPL
Loading mkl version 2022.0.2
Loading mpi version 2021.5.1
===================================================================================
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
= RANK 0 PID 15005 RUNNING AT login
= KILLED BY SIGNAL: 11 (Segmentation fault)
===================================================================================
+Done ./run.sh
我的代码中并行的部分如下。
!$OMP PARALLEL DO PRIVATE(i,j,l,m,pf,th,ll,sr,rl,ls,rs,rg,ag,ipiv,work,lwork,info,fd,tr)&
!$OMP SHARED(po,km,kp,nm,nb,ei,ic,tv,lr,ol,ks,ra,k,ch,te) COLLAPSE(2)&
!$OMP REDUCTION(+:t1,t2,t3)
DO i = 1, nm, 1
DO j = 1, km(1)*km(2)*km(3), 1
DO l = 1, nb, 1
DO m = 1, po(1)*po(2)*po(3), 1
pf = EXP(ic*(kp(j,1)*(tv(1,m)*lr(1,1)+tv(2,m)*lr(2,1)+tv(3,m)*lr(3,1))+kp(j,2)*(tv(1,m)*lr(1,2)+tv(2,m)*lr(2,2)+tv(3,m)*lr(3,2))+kp(j,3)*(tv(1,m)*lr(1,3)+tv(2,m)*lr(2,3)+tv(3,m)*lr(3,3))))
th(:,l) = pf*((ei(i)+ic)*ol(:,l,m)-ks(:,l,m))
END DO
END DO
ll(1,:,:) = th(ra(1)-k:ra(2)-k,ra(1)-k*2:ra(2)-k*2)
ll(2,:,:) = th(ra(1)-k:ra(2)-k,ra(1)-k:ra(2)-k)
ll(3,:,:) = th(ra(1)-k*2:ra(2)-k*2,ra(1)-k:ra(2)-k)
ll(4,:,:) = th(ra(1)-k:ra(2)-k,ra(1):ra(2))
ll(5,:,:) = th(ra(1):ra(2),ra(1)-k:ra(2)-k)
sr = th(ra(1):ra(2),ra(1):ra(2))
rl(1,:,:) = th(ra(1)+k:ra(2)+k,ra(1)+k*2:ra(2)+k*2)
rl(2,:,:) = th(ra(1)+k:ra(2)+k,ra(1)+k:ra(2)+k)
rl(3,:,:) = th(ra(1)+k*2:ra(2)+k*2,ra(1)+k:ra(2)+k)
rl(4,:,:) = th(ra(1):ra(2),ra(1)+k:ra(2)+k)
rl(5,:,:) = th(ra(1)+k:ra(2)+k,ra(1):ra(2))
ls = MATMUL(MATMUL(ll(1,:,:),ll(2,:,:)),ll(3,:,:))
ll(2,:,:) = ll(2,:,:) - ls
rs = MATMUL(MATMUL(rl(1,:,:),rl(2,:,:)),rl(3,:,:))
rl(2,:,:) = rl(2,:,:) - rs
rg = DCMPLX(0.0d0, 0.0d0)
rg = sr-ls-rs
CALL ZGETRF(ra(2)-ra(1),ra(2)-ra(1),rg,l,ipiv,info)
CALL ZGETRI(ra(2)-ra(1),rg,ra(2)-ra(1),ipiv,work,lwork,info)
DO l = 1, k, 1
ag(l,:) = DCONJG(rg(:,l))
END DO
fd(1) = 1.0d0/(EXP((ei(i)-ch(1))/te)+1.0d0)
fd(2) = 1.0d0/(EXP((ei(i)-ch(3))/te)+1.0d0)
IF (fd(1) > fd(2)) THEN
fd(3) = fd(1)
fd(1) = fd(2)
fd(2) = fd(3)
fd(3) = fd(2)-fd(1)
ELSE IF (fd(1) == fd(2)) THEN
fd(3) = 1.0d0
ELSE
fd(3) = fd(2)-fd(1)
END IF
t1(i,j,:,:) = t1(i,j,:,:)+fd(3)*MATMUL(MATMUL(rg,ls+rs),ag)/2.0d0-fd(1)*IMAG(rg)
tr = MATMUL(MATMUL(MATMUL(ls,ag),rs),rg)
DO l = 1, k, 1
t2(i) = t2(i)+tr(l,l)
END DO
t2(i) = t2(i)*fd(3)
tr = rg - ag
DO l = 1, k, 1
t3(i) = t3(i)+tr(l,l)
END DO
END DO
END DO
!$OMP END PARALLEL DO
我有尝试使用很小的值赋给nm和km变量,但报错信息仍然每天。麻烦大家帮我看下,问题出在哪里,又要如何解决呢?
谢谢啦。
k为公有,但30行修改了K li913 发表于 2023-4-25 15:44
k为公有,但30行修改了K
谢谢你的回复和指点。我修改了代码,我把原来的k变量,直接用ra(2)-ra(1)来替代,编译也成功了,但运行代码的时候,还是报错。错误信息如下。
===================================================================================
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
= RANK 0 PID 18197 RUNNING AT login
= KILLED BY SIGNAL: 11 (Segmentation fault)
===================================================================================
是不是我的OMP中的REDUCTION或者COLLAPSE功能使用不当呢?还是其它原因呢?能麻烦你再给些建议,调整解决这个问题吗?
谢谢啦。 本帖最后由 Kieran 于 2023-4-26 01:57 编辑
li913 发表于 2023-4-25 15:44
k为公有,但30行修改了K
我又重新写了一个简短的完整的程序,这个程序与我上面问题里的程序结构相同。我把它放在下面了。我的目的是希望能把最外层的两个循环里的t1,t2和t3分别各自相加,然后得到最后加和后的t1,t2和t3。能否麻烦你帮我看下我的程序是否有问题吗?尤其是OpenMP中的COLLAPSE和REDUCTION语句是否使用正确呢?多谢啦。
PROGRAM TES
IMPLICIT NONE
INTEGER, PARAMETER :: dp = SELECTED_REAL_KIND(15,14)
INTEGER :: i, j, k, l, m
INTEGER :: na = 177
INTEGER :: nm = 100
INTEGER :: km(3)
INTEGER :: po(3)
REAL (KIND=dp), ALLOCATABLE :: kp(:,:)
COMPLEX (KIND=dp), ALLOCATABLE:: th(:,:)
COMPLEX (KIND=dp) :: ic = DCMPLX(0.0d0, 1.0d0)
COMPLEX (KIND=dp), ALLOCATABLE:: t1(:,:,:,:)
COMPLEX (KIND=dp), ALLOCATABLE:: t2(:)
COMPLEX (KIND=dp), ALLOCATABLE:: t3(:)
INTEGER :: omp_get_num_threads
INTEGER :: omp_get_thread_num
INTEGER :: num_thread
INTEGER :: id_thread
km(1) = 100
km(2) = 1
km(3) = 75
ALLOCATE (kp(km(1)*km(2)*km(3),3))
DO i = 1, km(1), 1
DO j = 1, km(2), 1
DO k = 1, km(3), 1
kp((i-1)*km(2)*km(3)+(j-1)*km(3)+k,1) = DBLE(i)
kp((i-1)*km(2)*km(3)+(j-1)*km(3)+k,2) = DBLE(j)
kp((i-1)*km(2)*km(3)+(j-1)*km(3)+k,3) = DBLE(k)
END DO
END DO
END DO
po(1) = 1
po(2) = 0
po(3) = 5
ALLOCATE (th(na,na))
ALLOCATE (t1(nm,km(1)*km(2)*km(3),na,na))
ALLOCATE (t2(nm))
ALLOCATE (t3(nm))
!$OMP PARALLEL DEFAULT(NONE) SHARED(num_thread)
num_thread = omp_get_num_threads()
!$OMP END PARALLEL
!$OMP PARALLEL DO PRIVATE(i,j,k,l,m,th)&
!$OMP SHARED(km,kp,nm,po,ic) COLLAPSE(2)&
!$OMP REDUCTION(+:t1,t2,t3)
DO i = 1, nm, 1
DO j = 1, km(1)*km(2)*km(3), 1
th = DCMPLX(0.0d0, 0.0d0)
DO k = 1, na, 1
DO l = 1, na, 1
DO m = 1, (po(1)*2+1)*(po(2)*2+1)*(po(3)*2+1), 1
th(l,k) = th(l,k) + kp(j,1)*k + kp(j,2)*ic - kp(j,3)*l
END DO
END DO
END DO
t1(i,j,:,:) = th*(i-j+ic)
DO k = 1, na, 1
t2(i) = t2(i) + th(k,k)*(i*ic-j)
END DO
DO k = na, 1, -1
t3(i) = t3(i) + th(k,na-k+1)*(i*ic+j)
END DO
END DO
END DO
!$OMP END PARALLEL DO
OPEN (UNIT=3, FILE='output.dat', STATUS='UNKNOWN')
DO i = 1, nm, 1
DO j = 1, km(1)*km(2)*km(3), 1
DO k = 1, na, 1
WRITE (UNIT=3, FMT=*) t1(i,j,k,:)
END DO
END DO
END DO
WRITE (UNIT=3, FMT=*)
DO i = 1, nm, 1
WRITE (UNIT=3, FMT=*) t2(i)
END DO
WRITE (UNIT=3, FMT=*)
DO i = 1, nm, 1
WRITE (UNIT=3, FMT=*) t3(i)
END DO
CLOSE (UNIT=3)
DEALLOCATE (kp)
DEALLOCATE (th)
DEALLOCATE (t1)
DEALLOCATE (t2)
DEALLOCATE (t3)
STOP
END PROGRAM TES
本帖最后由 Transpose 于 2023-4-26 11:23 编辑
t1(nm,km(1)*km(2)*km(3),na,na)
这个数组占的内存大概是180G,应该是内存不足了
100*100*1*75*177*177=23496750000
23496750000/1e9=23.5
23.5*8=188GB
Transpose 发表于 2023-4-26 11:17
这个数组占的内存大概是180G, ...
谢谢你的回复。
那请问针对于我的代码,你有什么好的并行方式,既不会占用太大的内存空间,同时也能把外层的nm变量循环和内层的km变量循环,这两个嵌套循环都并行了吗?
看你的网名,你也是做输运计算的吧?请问这类循环能量和k点双层嵌套循环,要如何并行会好一些呢?
盼复。 感觉只能修改算法,减少内存的占用了,这么大的数组直接定义,不太合理
另外注意数组的列优先顺序,使用ifort的时候,极易产生临时数组,这也会导致内存不足
t1(i,j,:,:) = th*(i-j+ic)
比如这一句,可能会产生临时数组
你的前一个程序,大量使用matmul,在ifort里面也是不提倡的,ifort的matmul优化不好,而且大量的产生临时数组 Transpose 发表于 2023-4-26 16:42
感觉只能修改算法,减少内存的占用了,这么大的数组直接定义,不太合理
另外注意数组的列优先顺序,使用ifo ...
谢谢你的建议,我有把大数组移到并行外面计算,然后只取大数组中要用到的较小的部分,放入并行中进行数组间的乘法运算。同时,我也自己手动计算了数组间的乘法,但运行程序时,还是报错。
我写了一个结构相同的,简短的数组相乘程序。能麻烦你再帮我看下哪里不合理吗?
谢谢。
这是我的数组相乘程序。
SUBROUTINE MULTIPLICATION_MATRIX(di,ou,i1,i2)
IMPLICIT NONE
INTEGER, PARAMETER :: dp = SELECTED_REAL_KIND(15,14) !Precision of the floating variable
INTEGER :: di !Dimension of the matrix
COMPLEX (KIND=dp), ALLOCATABLE:: ou(:,:) !Output matrix
COMPLEX (KIND=dp), ALLOCATABLE:: i1(:,:) !Input matrix
COMPLEX (KIND=dp), ALLOCATABLE:: i2(:,:) !Input matrix
COMPLEX (KIND=dp) :: i3
INTEGER :: i, j, k
ou = DCMPLX(0.0d0, 0.0d0)
DO i = 1, di, 1
DO j = 1, di, 1
i3 = DCMPLX(0.0d0, 0.0d0)
DO k = 1, di, 1
! ou(i,j) = ou(i,j)+i1(j,k)*i2(k,j)
i3 = i3+i1(j,k)*i2(k,j)
END DO
ou(i,j) = i3
END DO
END DO
RETURN
END SUBROUTINE MULTIPLICATION_MATRIX
PROGRAM CODE
IMPLICIT NONE
INTEGER, PARAMETER :: dp = SELECTED_REAL_KIND(15,14) !Precision of the floating variable
INTEGER :: di = 2 !Dimension of the matrix
COMPLEX (KIND=dp), ALLOCATABLE:: ou(:,:) !Output matrix
COMPLEX (KIND=dp), ALLOCATABLE:: i1(:,:) !Input matrix
COMPLEX (KIND=dp), ALLOCATABLE:: i2(:,:) !Input matrix
INTEGER :: i, j
ALLOCATE (ou(di,di))
ALLOCATE (i1(di,di))
ALLOCATE (i2(di,di))
DO i = 1, di, 1
DO j = 1, di, 1
i1(i,j) = DCMPLX(0.0d0, 1.0d0)*i-j
i2(i,j) = DCMPLX(1.0d0, 1.0d0)*j-i
END DO
END DO
CALL MULTIPLICATION_MATRIX(di,ou,i1,i2)
OPEN (UNIT=3, FILE='out.dat', STATUS='UNKNOWN')
DO i = 1, di, 1
WRITE (UNIT=3, FMT=*) ou(i,:)
END DO
DEALLOCATE (ou)
DEALLOCATE (i1)
DEALLOCATE (i2)
CLOSE (UNIT=3)
STOP
END PROGRAM CODE
这是编译并运行程序后的报错信息。
forrtl: severe (408): fort: (8): Attempt to fetch from allocatable variable I2 when it is not allocated
Image PC Routine Line Source
TE1 000000000040CA3FUnknown UnknownUnknown
TE1 00000000004057C8multiplication_ma 17TE1.f90
TE1 0000000000407369MAIN__ 45TE1.f90
TE1 0000000000404EE2Unknown UnknownUnknown
libc-2.17.so 00002B384CDDB555__libc_start_main UnknownUnknown
TE1 0000000000404DE9Unknown UnknownUnknown
我的数组在主程序中已经分配空间了,为什么还有错误信息,说没有分配空间呢?
这种数组传递方式需要写接口
http://fcode.cn/guide-103-1.html
页:
[1]