program example3
      implicit none
*
*     simple example to show how to generate a scalapack matrix
*     contribution from Ed d'Azevedo, ORNL, 2005
*
      integer BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
     $        LLD_, MB_, M_, NB_, N_, RSRC_
      parameter ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
     $            CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
     $            RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
*
      integer descA(DLEN_)
      integer descAtmp(DLEN_)
*
      integer lwork
      parameter(lwork=10*1000)
      double precision work(lwork)
*
      integer maxproc
      parameter(maxproc=8*1024)
*
      integer iabegin(0:maxproc)
      integer ialast(0:maxproc)
      integer ia_begin,ia_last,ia_size
      integer local_ia, local_ja
*
      integer krow
      parameter(krow=10)
      integer Atmpsize
      parameter(Atmpsize=4*1000*1000)
*
      integer Asize
      parameter(Asize=32*1000*1000)
*
      integer nout
      parameter(nout=16)
*
      double precision aij
      double precision A(Asize)
      double precision Atmp(Atmpsize)
*
      integer m,n,mb,nb
      integer info, ierr(1)
      integer iam,nprocs
      integer icontext, myprow,mypcol,nprow,npcol
      integer rsrc,csrc,lld,Locp,Locq,Aneed
*
      integer ia,ja,irprnt,icprnt
      logical isroot, isok
*
      integer prow,pcol
      logical do_print
*
      integer iastart,iaend
      integer mm,nn
*
      double precision t1,t2
      double precision MPI_Wtime
      external MPI_Wtime
*
      integer numroc, indxg2p
      external numroc, indxg2p
      external descinit
*
      mb = 50
      nb = 50
      m = 400
      n = 400
*
*     -----------------------
*     setup blacs environment
*     -----------------------
*
      call blacs_pinfo( iam,nprocs)
*
      do nprow=int( sqrt(real(nprocs)) )+1,1,-1
         npcol = nprocs/nprow
         if (nprow*npcol.eq.nprocs) goto 11
      enddo
 11   continue
*
      call blacs_get(-1,0,icontext)
      call blacs_gridinit( icontext, 'Col-major', nprow,npcol)
*
      call blacs_gridinfo( icontext, nprow,npcol, myprow,mypcol)
      isroot = (myprow.eq.0).and.(mypcol.eq.0)
*
      if (isroot) then
         write(*,*) 'nprow,npcol ', nprow,npcol
         write(*,*) 'm,n ', m,n
         write(*,*) 'mb,nb ', mb,nb
      endif
*                 
*     ---------------------------------------------------------
*     compute local extent and allocate storage for local piece
*     ---------------------------------------------------------
*
      csrc = 0
      rsrc = 0
*
      Locq = numroc(n,nb,mypcol,csrc,npcol)
      Locq = max(1,Locq)
      Locp = numroc(m,mb,myprow,rsrc,nprow)
*
      lld = max(Locp,1)
      Aneed = lld*Locq
      isok = (Aneed.le.Asize)
      if (.not.isok) then
         if (isroot) then
            write(*,*) 'increase Asize to ',Aneed + 1
         endif
         goto 999
      endif
*
*     ----------------
*     setup descriptor
*     ----------------
*
      call descinit(descA,m,n,mb,nb,rsrc,csrc,icontext,lld,info)
      ierr(1) = info
      call igsum2d( icontext, 'All', ' ',1,1,ierr,1,-1,-1)
      isok = (info.eq.0)
      if (.not.isok) then
         if (isroot) then
            write(*,*) 'descinit returns info = ',info
         endif
         goto 999
      endif
*
*     ---------------------------------
*     setup descriptor and local matrix
*     ---------------------------------
*
      call descinit(descAtmp,krow,n,krow,n,myprow,mypcol,
     &                icontext,krow,info)
      ierr(1) = info
      call igsum2d( icontext, 'All', ' ',1,1,ierr,1,-1,-1)
      isok = (info.eq.0)
      if (.not.isok) then
         if (isroot) then
            write(*,*) 'descinit(Atmp) returns info = ',info
         endif
         goto 999
      endif
*
      isok = (krow*n.le.Atmpsize)
      if (.not.isok) then
         if (isroot) then
            write(*,*) 'increase Atmpsize to ',krow*n+1
         endif
         goto 999
      endif
*
*     ------------------------------
*     better method to setup matrix
*     take advantage of the block 2D cyclic format
*
*     (ia,ja) are global indices
*     ------------------------------
*
      call blacs_barrier( icontext, 'All')
      t1 = MPI_Wtime()
*
*     ----------------------------------------------
*     Each processor generates several complete rows
*     and then copies into global matrix
*     ----------------------------------------------
*
      isok = (nprow*npcol.le.maxproc)
      if (.not.isok) then
         if (isroot) then
            write(*,*) 'increase maxproc to ',nprow*npcol+1
         endif
         goto 999
      endif
*
      do iastart=1,descA(M_),krow*nprow*npcol
*
         iaend = min( descA(M_),iastart + krow*nprow*npcol-1)
*
*        ----------------------------
*        partition work to processors
*        ----------------------------
*
         ia_begin = iastart
         do pcol=0,npcol-1
         do prow=0,nprow-1
            ia_last = min(iaend, ia_begin+krow-1)
            iabegin( prow + pcol*nprow ) =  ia_begin
            ialast( prow + pcol*nprow ) = ia_last
*
            ia_begin = ia_last + 1
         enddo
         enddo
*
*        ------------------------------------------
*        generate matrix entries on local processor
*        ------------------------------------------
*
         prow = myprow
         pcol = mypcol
         ia_begin = iabegin( prow + pcol*nprow )
         ia_last = ialast( prow + pcol*nprow )
*
         do ja=1,descA(N_)
         do ia=ia_begin,ia_last
*
            call generate_Aij( m,n, ia,ja, aij )
*
            local_ia = (ia-ia_begin) + 1
            local_ja = ja
            Atmp( local_ia + (local_ja-1)*descAtmp(LLD_) ) = aij
         enddo
         enddo
*
*        -----------------------------------------
*        copy data to distributed block 2d  format
*        rely on PBLAS for communication
*        -----------------------------------------
*
         do pcol=0,npcol-1
         do prow=0,nprow-1
            ia_begin = iabegin( prow + pcol*nprow )
            ia_last = ialast( prow + pcol*nprow )
            ia_size = ia_last - ia_begin + 1
*
*           -----------------
*           adjust descriptor
*           -----------------
*
            descAtmp(RSRC_) = prow
            descAtmp(CSRC_) = pcol
*
            mm = ia_size
            nn = descA(N_)
            ia = ia_begin
            ja = 1
            call pdgecopy( mm,nn,
     &              Atmp,1,1,descAtmp,
     &              A,ia,ja,descA)

         enddo
         enddo


      enddo
*
      call blacs_barrier( icontext, 'All')
      t2 = MPI_Wtime()
      if (isroot) then
         write(*,*) 'time to build matrix is ',t2-t1,' sec'
      endif
*
*     --------------------------------------------
*     if matrix is not too big, print out content
*     for debugging
*     --------------------------------------------
*
      do_print = ((m*n.le.200*1000).and.(lwork.ge.mb))
      if (do_print) then
*
         ia = 1
         ja = 1
         irprnt = 0
         icprnt = 0
         call pdlaprnt(m,n,A,ia,ja,descA,irprnt,icprnt,'A',nout,work)
*
      endif
*
999   continue
*
*     ---------------
*     prepare to exit
*     ---------------
*
      call blacs_barrier(icontext, 'All')
      call blacs_gridexit( icontext )
      call blacs_exit(0)
      stop
      end
*
      subroutine generate_Aij( m,n, ia,ja, aij )
      implicit none
      integer m,n, ia,ja
      double precision aij
*
      aij = dble(ia) + dble(ja-1)*dble(m)
      return
      end
*
      subroutine pdgecopy( m,n,A,ia,ja,descA,C,ic,jc,descC )
      implicit none
      integer m,n,   ia,ja,descA(*), ic,jc, descC(*)
      double precision A(*), C(*)
*
      double precision alpha, beta
*
*  PxGEADD is new capability available in PBLAS Version 2
*
*  PDGEADD  adds a matrix to another
*
*     sub( C ) := beta*sub( C ) + alpha*op( sub( A ) )
*
*
      beta = 0.0d0
      alpha = 1.0d0
      call pdgeadd( 'NoTrans', m,n, alpha, A,ia,ja,descA,
     &                                beta,  C,ic,jc,descC )
      return
      end
*
      subroutine pdgecopy0( m,n,A,ia,ja,descA,C,ic,jc,descC )
      implicit none
      integer BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
     $        LLD_, MB_, M_, NB_, N_, RSRC_
      parameter ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
     $            CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
     $            RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
*
      integer m,n,   ia,ja,descA(*), ic,jc, descC(*)
      double precision A(*), C(*)
*
      integer iia,jja, iic,jjc, ix,iy,  i,j
*
*     -----------------------------------------------
*     if PBLAS version 2 is not available, use slower
*     PBLAS PxCOPY instead
*     -----------------------------------------------
*
      if( m.ge.n) then
         do j=1,n
            iia = ia
            jja = (ja-1) + j
            iic = ic
            jjc = (jc-1) + j
            ix = 1
            iy = 1
            call pdcopy( m, A, iia,jja, descA, ix,
     &                      C, iic,jjc, descC, iy )
         enddo
      else
         do i=1,m
            iia = (ia-1) + i
            jja = ja
            iic = (ic-1) + i
            jjc = jc
            ix = descA(M_)
            iy = descC(M_)
            call pdcopy( n, A, iia, jja, descA, ix,
     &                     C, iic, jjc, descC, iy )
         enddo
      endif
      return
      end
*