program example3
implicit none
*
* simple example to show how to generate a scalapack matrix
* contribution from Ed d'Azevedo, ORNL, 2005
*
integer BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
$ LLD_, MB_, M_, NB_, N_, RSRC_
parameter ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
$ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
$ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
*
integer descA(DLEN_)
integer descAtmp(DLEN_)
*
integer lwork
parameter(lwork=10*1000)
double precision work(lwork)
*
integer maxproc
parameter(maxproc=8*1024)
*
integer iabegin(0:maxproc)
integer ialast(0:maxproc)
integer ia_begin,ia_last,ia_size
integer local_ia, local_ja
*
integer krow
parameter(krow=10)
integer Atmpsize
parameter(Atmpsize=4*1000*1000)
*
integer Asize
parameter(Asize=32*1000*1000)
*
integer nout
parameter(nout=16)
*
double precision aij
double precision A(Asize)
double precision Atmp(Atmpsize)
*
integer m,n,mb,nb
integer info, ierr(1)
integer iam,nprocs
integer icontext, myprow,mypcol,nprow,npcol
integer rsrc,csrc,lld,Locp,Locq,Aneed
*
integer ia,ja,irprnt,icprnt
logical isroot, isok
*
integer prow,pcol
logical do_print
*
integer iastart,iaend
integer mm,nn
*
double precision t1,t2
double precision MPI_Wtime
external MPI_Wtime
*
integer numroc, indxg2p
external numroc, indxg2p
external descinit
*
mb = 50
nb = 50
m = 400
n = 400
*
* -----------------------
* setup blacs environment
* -----------------------
*
call blacs_pinfo( iam,nprocs)
*
do nprow=int( sqrt(real(nprocs)) )+1,1,-1
npcol = nprocs/nprow
if (nprow*npcol.eq.nprocs) goto 11
enddo
11 continue
*
call blacs_get(-1,0,icontext)
call blacs_gridinit( icontext, 'Col-major', nprow,npcol)
*
call blacs_gridinfo( icontext, nprow,npcol, myprow,mypcol)
isroot = (myprow.eq.0).and.(mypcol.eq.0)
*
if (isroot) then
write(*,*) 'nprow,npcol ', nprow,npcol
write(*,*) 'm,n ', m,n
write(*,*) 'mb,nb ', mb,nb
endif
*
* ---------------------------------------------------------
* compute local extent and allocate storage for local piece
* ---------------------------------------------------------
*
csrc = 0
rsrc = 0
*
Locq = numroc(n,nb,mypcol,csrc,npcol)
Locq = max(1,Locq)
Locp = numroc(m,mb,myprow,rsrc,nprow)
*
lld = max(Locp,1)
Aneed = lld*Locq
isok = (Aneed.le.Asize)
if (.not.isok) then
if (isroot) then
write(*,*) 'increase Asize to ',Aneed + 1
endif
goto 999
endif
*
* ----------------
* setup descriptor
* ----------------
*
call descinit(descA,m,n,mb,nb,rsrc,csrc,icontext,lld,info)
ierr(1) = info
call igsum2d( icontext, 'All', ' ',1,1,ierr,1,-1,-1)
isok = (info.eq.0)
if (.not.isok) then
if (isroot) then
write(*,*) 'descinit returns info = ',info
endif
goto 999
endif
*
* ---------------------------------
* setup descriptor and local matrix
* ---------------------------------
*
call descinit(descAtmp,krow,n,krow,n,myprow,mypcol,
& icontext,krow,info)
ierr(1) = info
call igsum2d( icontext, 'All', ' ',1,1,ierr,1,-1,-1)
isok = (info.eq.0)
if (.not.isok) then
if (isroot) then
write(*,*) 'descinit(Atmp) returns info = ',info
endif
goto 999
endif
*
isok = (krow*n.le.Atmpsize)
if (.not.isok) then
if (isroot) then
write(*,*) 'increase Atmpsize to ',krow*n+1
endif
goto 999
endif
*
* ------------------------------
* better method to setup matrix
* take advantage of the block 2D cyclic format
*
* (ia,ja) are global indices
* ------------------------------
*
call blacs_barrier( icontext, 'All')
t1 = MPI_Wtime()
*
* ----------------------------------------------
* Each processor generates several complete rows
* and then copies into global matrix
* ----------------------------------------------
*
isok = (nprow*npcol.le.maxproc)
if (.not.isok) then
if (isroot) then
write(*,*) 'increase maxproc to ',nprow*npcol+1
endif
goto 999
endif
*
do iastart=1,descA(M_),krow*nprow*npcol
*
iaend = min( descA(M_),iastart + krow*nprow*npcol-1)
*
* ----------------------------
* partition work to processors
* ----------------------------
*
ia_begin = iastart
do pcol=0,npcol-1
do prow=0,nprow-1
ia_last = min(iaend, ia_begin+krow-1)
iabegin( prow + pcol*nprow ) = ia_begin
ialast( prow + pcol*nprow ) = ia_last
*
ia_begin = ia_last + 1
enddo
enddo
*
* ------------------------------------------
* generate matrix entries on local processor
* ------------------------------------------
*
prow = myprow
pcol = mypcol
ia_begin = iabegin( prow + pcol*nprow )
ia_last = ialast( prow + pcol*nprow )
*
do ja=1,descA(N_)
do ia=ia_begin,ia_last
*
call generate_Aij( m,n, ia,ja, aij )
*
local_ia = (ia-ia_begin) + 1
local_ja = ja
Atmp( local_ia + (local_ja-1)*descAtmp(LLD_) ) = aij
enddo
enddo
*
* -----------------------------------------
* copy data to distributed block 2d format
* rely on PBLAS for communication
* -----------------------------------------
*
do pcol=0,npcol-1
do prow=0,nprow-1
ia_begin = iabegin( prow + pcol*nprow )
ia_last = ialast( prow + pcol*nprow )
ia_size = ia_last - ia_begin + 1
*
* -----------------
* adjust descriptor
* -----------------
*
descAtmp(RSRC_) = prow
descAtmp(CSRC_) = pcol
*
mm = ia_size
nn = descA(N_)
ia = ia_begin
ja = 1
call pdgecopy( mm,nn,
& Atmp,1,1,descAtmp,
& A,ia,ja,descA)
enddo
enddo
enddo
*
call blacs_barrier( icontext, 'All')
t2 = MPI_Wtime()
if (isroot) then
write(*,*) 'time to build matrix is ',t2-t1,' sec'
endif
*
* --------------------------------------------
* if matrix is not too big, print out content
* for debugging
* --------------------------------------------
*
do_print = ((m*n.le.200*1000).and.(lwork.ge.mb))
if (do_print) then
*
ia = 1
ja = 1
irprnt = 0
icprnt = 0
call pdlaprnt(m,n,A,ia,ja,descA,irprnt,icprnt,'A',nout,work)
*
endif
*
999 continue
*
* ---------------
* prepare to exit
* ---------------
*
call blacs_barrier(icontext, 'All')
call blacs_gridexit( icontext )
call blacs_exit(0)
stop
end
*
subroutine generate_Aij( m,n, ia,ja, aij )
implicit none
integer m,n, ia,ja
double precision aij
*
aij = dble(ia) + dble(ja-1)*dble(m)
return
end
*
subroutine pdgecopy( m,n,A,ia,ja,descA,C,ic,jc,descC )
implicit none
integer m,n, ia,ja,descA(*), ic,jc, descC(*)
double precision A(*), C(*)
*
double precision alpha, beta
*
* PxGEADD is new capability available in PBLAS Version 2
*
* PDGEADD adds a matrix to another
*
* sub( C ) := beta*sub( C ) + alpha*op( sub( A ) )
*
*
beta = 0.0d0
alpha = 1.0d0
call pdgeadd( 'NoTrans', m,n, alpha, A,ia,ja,descA,
& beta, C,ic,jc,descC )
return
end
*
subroutine pdgecopy0( m,n,A,ia,ja,descA,C,ic,jc,descC )
implicit none
integer BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
$ LLD_, MB_, M_, NB_, N_, RSRC_
parameter ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
$ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
$ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
*
integer m,n, ia,ja,descA(*), ic,jc, descC(*)
double precision A(*), C(*)
*
integer iia,jja, iic,jjc, ix,iy, i,j
*
* -----------------------------------------------
* if PBLAS version 2 is not available, use slower
* PBLAS PxCOPY instead
* -----------------------------------------------
*
if( m.ge.n) then
do j=1,n
iia = ia
jja = (ja-1) + j
iic = ic
jjc = (jc-1) + j
ix = 1
iy = 1
call pdcopy( m, A, iia,jja, descA, ix,
& C, iic,jjc, descC, iy )
enddo
else
do i=1,m
iia = (ia-1) + i
jja = ja
iic = (ic-1) + i
jjc = jc
ix = descA(M_)
iy = descC(M_)
call pdcopy( n, A, iia, jja, descA, ix,
& C, iic, jjc, descC, iy )
enddo
endif
return
end
*