#include "blockvec_impl.h"
#include "dotproducts_impl.h"
#include "parpre_mat.h"
#include "parpre_vec.h"

static int events[5];
#define ORTHO_EVENT 0
#define COMM_EVENT 1
#define AXPY_EVENT 2
#define AYPX_EVENT 3
#define DOT_EVENT 4

#undef __FUNC__
#define __FUNC__ "ParPreBlockVecInit"
int ParPreBlockVecInit()
{
  int ierr;
  PetscFunctionBegin;
  ierr = PLogEventRegister
    (events+ORTHO_EVENT, "Block QR        ",PETSC_NULL); CHKERRQ(ierr);
  ierr = PLogEventRegister
    (events+COMM_EVENT,  "Block comm      ",PETSC_NULL); CHKERRQ(ierr);
  ierr = PLogEventRegister
    (events+AXPY_EVENT,  "Block AXPY      ",PETSC_NULL); CHKERRQ(ierr);
  ierr = PLogEventRegister
    (events+AYPX_EVENT,  "Block AYPX      ",PETSC_NULL); CHKERRQ(ierr);
  ierr = PLogEventRegister
    (events+DOT_EVENT,   "Block vecdot    ",PETSC_NULL); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/****************************************************************
 * True Block Vectors
 ****************************************************************/

/**************** Create and Destroy ****************/
#undef __FUNC__
#define __FUNC__ "BlockVecCreateMPI"
int BlockVecCreateMPI(MPI_Comm comm,int nv,int n,int N,BlockVec *bv)
{
  BlockVec b;
  int size,rank,ierr;

  PetscFunctionBegin;
  MPI_Comm_size(comm,&size);
  MPI_Comm_rank(comm,&rank);
  ierr = PetscSplitOwnership(comm,&n,&N); CHKERRQ(ierr);
  b = PetscNew(struct _p_BlockVec); CHKPTRQ(b);
  if (nv) {
    Scalar *data; int iv;
    data = (Scalar *) PetscMalloc((nv*n+1)*sizeof(Scalar)); CHKPTRQ(data);
    b->vecs = (Vec *) PetscMalloc(nv*sizeof(Vec)); CHKPTRQ(b->vecs);
    for (iv=0; iv<nv; iv++) {
      ierr = VecCreateMPIWithArray
	(comm,n,N,data+iv*n,&(b->vecs[iv])); CHKERRQ(ierr);
    }
  } else b->vecs = 0;
  b->n = nv;
  b->comm = comm;
  *bv = b;
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecDestroy"
int BlockVecDestroy(BlockVec v)
{
  Scalar *a;
  int i,ierr;
  PetscFunctionBegin;
  ierr = BlockVecGetArray(v,&a); CHKERRQ(ierr);
  for (i=0; i<v->n; i++) {
    ierr = VecDestroy(v->vecs[i]); CHKERRQ(ierr);
  }
  PetscFree(v->vecs); PetscFree(a); PetscFree(v);
  PetscFunctionReturn(0);
}

/**************** Access Functions ****************/
#undef __FUNC__
#define __FUNC__ "BlockVecGetArray"
int BlockVecGetArray(BlockVec v,Scalar **a)
{
  int ierr;
  PetscFunctionBegin;
  ierr = VecGetArray(v->vecs[0],a); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecRestoreArray"
int BlockVecRestoreArray(BlockVec v,Scalar **a)
{
  int ierr;
  PetscFunctionBegin;
  ierr = VecRestoreArray(v->vecs[0],a); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecGetVec"
int BlockVecGetVec(BlockVec v,int i,Vec *vv)
{
  PetscFunctionBegin;
  *vv = v->vecs[i];
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecRestoreVec"
int BlockVecRestoreVec(BlockVec v,int i,Vec *vv)
{
  PetscFunctionBegin;
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecGetLocalSize"
int BlockVecGetLocalSize(BlockVec v,int *i,int *j)
{
  int ierr;
  PetscFunctionBegin;
  ierr = VecGetLocalSize(v->vecs[0],i); CHKERRQ(ierr);
  *j = v->n;
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecGetOwnershipRange"
int BlockVecGetOwnershipRange(BlockVec v,int *first,int *last)
{
  int ierr;
  PetscFunctionBegin;
  ierr = VecGetOwnershipRange(v->vecs[0],first,last); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/**************** Operations ****************/
#if !defined(USE_PETSC_COMPLEX)
#define GEMM dgemm_
#define TRMM dtrmm_
#define EXTERN extern
#define CAST
#else
#define GEMM zgemm_
#define TRMM ztrmm_
#define EXTERN extern "C"
#define CAST (double *)
#endif

EXTERN void GEMM(char *,char *,int *,int *,int *,double *,
		 double *,int *,double *,int *,double *,
		 double *,int *);
EXTERN void TRMM(char *,char *,char *,char *, int *,int *,
		 double *,double *,int *, double *,int *);

#undef __FUNC__
#define __FUNC__ "BlockVecAXPY"
/*@
  BlockVecAXPY - Block vector AXPY with a small square scaling matrix

  Parameters:
+ a - square scaling matrix of size kxk
. u - the block vector of size Nxk to be scaled
. v - the output block vector

@*/
int BlockVecAXPY(Mat a,BlockVec u,BlockVec v)
{
  int is,js,ls,idum,ierr;
  Scalar *aa,*uu,*vv,one=1.;

  PetscFunctionBegin;
  PLogEventBegin(events[AXPY_EVENT],0,0,0,0);
  ierr = MatGetSize(a,&is,&js); CHKERRQ(ierr);
  ierr = MatGetArray(a,&aa); CHKERRQ(ierr);
  ierr = BlockVecGetArray(u,&uu); CHKERRQ(ierr);
  ierr = BlockVecGetArray(v,&vv); CHKERRQ(ierr);
  ierr = BlockVecGetLocalSize(u,&ls,&idum); CHKERRQ(ierr);
  if (idum!=is) SETERRQ(1,0,"Incompatible inner dimensions");
  GEMM("N","N", &ls,&js,&is,
       CAST &one,CAST uu,&ls, CAST aa,&is, CAST &one,CAST vv,&ls);
  ierr = MatRestoreArray(a,&aa); CHKERRQ(ierr);
  ierr = BlockVecRestoreArray(u,&uu); CHKERRQ(ierr);
  ierr = BlockVecRestoreArray(v,&vv); CHKERRQ(ierr);
  PLogFlops(2*ls*js*is);
  PLogEventEnd(events[AXPY_EVENT],0,0,0,0);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecAYPX"
/*@
  BlockVecAYPX - Block vector AYPX with a small square scaling matrix

  Parameters:
+ a - square scaling matrix of size kxk
. trans - "N" or "T" depending on whether the matrix a was to be transposed
. u - the block vector of size Nxk to be scaled
. v - the output block vector
. t - a temporary vector

@*/
int BlockVecAYPX(Mat a,char *trans,BlockVec u,BlockVec v,BlockVec t)
{
  int i,is,js,ls,idum,ierr;
  Scalar *uu,*vv,*tt;

  PetscFunctionBegin;
  PLogEventBegin(events[AYPX_EVENT],0,0,0,0);
  ierr = MatGetSize(a,&is,&js); CHKERRQ(ierr);
  ierr = BlockVecGetLocalSize(u,&ls,&idum); CHKERRQ(ierr);
  if (idum!=is) SETERRQ(1,0,"Incompatible inner dimensions");
  ierr = BlockVecScale(a,trans,v,t); CHKERRQ(ierr);
  ierr = BlockVecGetArray(u,&uu); CHKERRQ(ierr);
  ierr = BlockVecGetArray(v,&vv); CHKERRQ(ierr);
  ierr = BlockVecGetArray(t,&tt); CHKERRQ(ierr);
  if (idum!=is) SETERRQ(1,0,"Incompatible inner dimensions");
  for (i=0; i<ls*is; i++) vv[i] = tt[i]+uu[i];
  PLogFlops(ls*is);
  ierr = BlockVecRestoreArray(u,&uu); CHKERRQ(ierr);
  ierr = BlockVecRestoreArray(v,&vv); CHKERRQ(ierr);
  ierr = BlockVecRestoreArray(t,&tt); CHKERRQ(ierr);
  PLogEventEnd(events[AYPX_EVENT],0,0,0,0);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecAuYPX"
/*@
  BlockVecAuYPX - Block vector AYPX where the small square matrix matrix
  is symmetric, with upper triangle given only.

  Parameters:
+ a - square scaling matrix of size kxk
. trans - "N" or "T" depending on whether the matrix a was to be transposed
. u - the block vector of size Nxk to be scaled
- v - the output block vector

@*/
int BlockVecAuYPX(Mat a,char *trans,BlockVec u,BlockVec v)
{
  int i,is,js,ls,idum,ierr;
  Scalar *aa,*uu,*vv,one=1.;

  PetscFunctionBegin;
  PLogEventBegin(events[AYPX_EVENT],0,0,0,0);
  ierr = MatGetSize(a,&is,&js); CHKERRQ(ierr);
  ierr = BlockVecGetLocalSize(u,&ls,&idum); CHKERRQ(ierr);
  if (idum!=is) SETERRQ(1,0,"Incompatible inner dimensions");
  ierr = MatGetArray(a,&aa); CHKERRQ(ierr);
  ierr = BlockVecGetArray(u,&uu); CHKERRQ(ierr);
  ierr = BlockVecGetArray(v,&vv); CHKERRQ(ierr);
  TRMM("R","U",trans,"N", &ls,&js, CAST &one,CAST aa,&is,CAST vv,&ls);
  for (i=0; i<ls*is; i++) vv[i] = vv[i]+uu[i];
  /*ierr = BlockVecScale(a,trans,v,t); CHKERRQ(ierr);
    for (i=0; i<ls*is; i++) vv[i] = tt[i]+uu[i];*/
  PLogFlops(ls*is);
  ierr = MatRestoreArray(a,&aa); CHKERRQ(ierr);
  ierr = BlockVecRestoreArray(u,&uu); CHKERRQ(ierr);
  ierr = BlockVecRestoreArray(v,&vv); CHKERRQ(ierr);
  PLogEventEnd(events[AYPX_EVENT],0,0,0,0);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecScale"
/*@
  BlockVecScale - Scale a block vector by a small square matrix.

  Parameters:
+ a - square scaling matrix of size kxk
. trans - "N" or "T" depending on whether the matrix a was to be transposed
. u - the vector of size Nxk to be scaled
- v - the output vector

@*/
int BlockVecScale(Mat a,char *trans,BlockVec u,BlockVec v)
{
  int ls,is,js,idum,ierr;
  Scalar *aa,*vv,*uu,one=1.,zero=0.;

  PetscFunctionBegin;
  ierr = MatGetSize(a,&is,&js); CHKERRQ(ierr);
  ierr = MatGetArray(a,&aa); CHKERRQ(ierr);
  ierr = BlockVecGetArray(u,&uu); CHKERRQ(ierr);
  ierr = BlockVecGetArray(v,&vv); CHKERRQ(ierr);
  ierr = BlockVecGetLocalSize(u,&ls,&idum); CHKERRQ(ierr);
  if (idum!=is) SETERRQ(1,0,"Incompatible inner dimensions");
  GEMM("N",trans, &ls,&js,&is,
       CAST &one,CAST uu,&ls, CAST aa,&is, CAST &zero,CAST vv,&ls);
  ierr = MatRestoreArray(a,&aa); CHKERRQ(ierr);
  ierr = BlockVecRestoreArray(u,&uu); CHKERRQ(ierr);
  ierr = BlockVecRestoreArray(v,&vv); CHKERRQ(ierr);
  PLogFlops(2*ls*js*is);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecDotProduct"
int BlockVecDotProduct(BlockVec v1,BlockVec v2,DotProducts dp)
{
  int ierr;
  Scalar one=1., zero=0.;

  PetscFunctionBegin;

  if (dp->full) {
    Scalar *t=dp->tmpmat_array,*z=dp->mat_array;
    Scalar *x,*y;
    char *trans;
    int n,m;

    PLogEventBegin(events[DOT_EVENT],0,0,0,0);
    ierr = BlockVecGetLocalSize(v1,&m,&n); CHKERRQ(ierr);
    ierr = BlockVecGetArray(v1,&x); CHKERRQ(ierr);
    if (! (v1 == v2) ) {
      ierr = BlockVecGetArray(v2,&y); CHKERRQ(ierr);
    } else y = x;

    if (dp->complex) trans = "C"; else trans = "T";
    GEMM(trans,"N", &n,&n,&m,&one,CAST x,&m, CAST y,&m, &zero,CAST t,&n);
    PLogFlops(2*m*n*n);
    ierr = BlockVecRestoreArray(v1,&x); CHKERRQ(ierr);
    if (! (v1 == v2 ) ) {
      ierr = BlockVecRestoreArray(v2,&y); CHKERRQ(ierr);}
    PLogEventEnd(events[DOT_EVENT],0,0,0,0);

    PLogEventBegin(events[COMM_EVENT],0,0,0,0);
#if defined(USE_PETSC_COMPLEX)
    MPI_Allreduce((void*)t,(void*)z,n*n,MPI_DOUBLE_COMPLEX,
		  MPI_SUM,v1->comm); CHKERRQ(ierr);
#else
    MPI_Allreduce((void*)t,(void*)z,n*n,MPI_DOUBLE,
		  MPI_SUM,v1->comm); CHKERRQ(ierr);
#endif
  } else {
    int ivec;
    for (ivec=0; ivec<dp->dimension; ivec++) {
      Vec u,v;
      ierr = BlockVecGetVec(v1,ivec,&u); CHKERRQ(ierr);
      if (! (v1==v2) ) {
	ierr = BlockVecGetVec(v2,ivec,&v); CHKERRQ(ierr);
      } else v=u;
      ierr = DotProductsSet(dp,u,v); CHKERRQ(ierr);
      ierr = BlockVecRestoreVec(v1,ivec,&u); CHKERRQ(ierr);
      if(! (v1==v2) ) {
	ierr = BlockVecRestoreVec(v2,ivec,&v); CHKERRQ(ierr);}
    }
    for (ivec=0; ivec<dp->dimension; ivec++) {
      Scalar v;
      ierr = DotProductsGet(dp,&v); CHKERRQ(ierr);
      ierr = VecSetValues(dp->vec,1,&ivec,&v,INSERT_VALUES); CHKERRQ(ierr);
    }
    ierr = VecAssemblyBegin(dp->vec); CHKERRQ(ierr);
    ierr = VecAssemblyEnd(dp->vec); CHKERRQ(ierr);
  }
  ierr = DotProductsAssemblyBegin(dp); CHKERRQ(ierr);
  ierr = DotProductsAssemblyEnd(dp); CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecDot"
/*@
  BlockVecScale - Scale a block vector by a small square matrix.

  Parameters:
+ a - square scaling matrix of size kxk
. trans - "N" or "T" depending on whether the matrix a was to be transposed
. u - the vector to be scaled
- v - the output vector

@*/
int BlockVecDot(BlockVec v1,BlockVec v2,
		/* DotType type, */MatReuse use,Mat *u)
{
  DotProducts dp;
  Scalar *t;
  int n=v1->n,ierr;

  PetscFunctionBegin;
  ierr = DotProductsCreate(v1->comm,0,&dp); CHKERRQ(ierr);
  t = (Scalar *) PetscMalloc((n*n+1)*sizeof(Scalar)); CHKPTRQ(t);
  ierr = BlockVecDot_O("C",v1,v2, /*type, */use,u,t,dp); CHKERRQ(ierr);
  PetscFree(t);
  ierr = DotProductsDestroy(dp); CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
#undef __FUNC__
#define __FUNC__ "BlockVecTDot"
int BlockVecTDot(BlockVec v1,BlockVec v2,
		/* DotType type, */MatReuse use,Mat *u)
{
  DotProducts dp;
  Scalar *t;
  int n=v1->n,ierr;

  PetscFunctionBegin;
  ierr = DotProductsCreate(v1->comm,0,&dp); CHKERRQ(ierr);
  t = (Scalar *) PetscMalloc((n*n+1)*sizeof(Scalar)); CHKPTRQ(t);
  ierr = BlockVecDot_O("T",v1,v2, /*type, */use,u,t,dp); CHKERRQ(ierr);
  PetscFree(t);
  ierr = DotProductsDestroy(dp); CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecDot_O"
int BlockVecDot_O(char *trans,BlockVec v1,BlockVec v2,/* DotType type, */MatReuse use,Mat *u,
		  Scalar *t,DotProducts dp)
{
  Scalar *x,*y,*z;
  double one=1.,zero=0.;
  int n,m,ierr;

  PetscFunctionBegin;
  PLogEventBegin(events[DOT_EVENT],0,0,0,0);
  ierr = BlockVecGetLocalSize(v1,&m,&n); CHKERRQ(ierr);
  if (use==MAT_INITIAL_MATRIX) {
    ierr = MatCreateSeqDense(MPI_COMM_SELF,n,n,PETSC_NULL,u); CHKERRQ(ierr);
  } else if (use==MAT_REUSE_MATRIX) {
    ierr = MatZeroEntries(*u); CHKERRQ(ierr);
  } else SETERRQ(1,0,"Unknown MatReuse mode");
  ierr = BlockVecGetArray(v1,&x); CHKERRQ(ierr);
  if (! (v1 == v2) ) {
    ierr = BlockVecGetArray(v2,&y); CHKERRQ(ierr);
  } else y = x;
  GEMM(trans,"N", &n,&n,&m,&one,CAST x,&m, CAST y,&m, &zero,CAST t,&n);
  ierr = BlockVecRestoreArray(v1,&x); CHKERRQ(ierr);
  if (! (v1 == v2 ) ) {
    ierr = BlockVecRestoreArray(v2,&y); CHKERRQ(ierr);}
  PLogFlops(2*m*n*n);
  PLogEventEnd(events[DOT_EVENT],0,0,0,0);
  PLogEventBegin(events[COMM_EVENT],0,0,0,0);
  ierr = MatGetArray(*u,&z); CHKERRQ(ierr);
#if defined(USE_PETSC_COMPLEX)
  MPI_Allreduce((void*)t,(void*)z,n*n,MPI_DOUBLE_COMPLEX,
		MPI_SUM,v1->comm); CHKERRQ(ierr);
#else
  MPI_Allreduce((void*)t,(void*)z,n*n,MPI_DOUBLE,
		MPI_SUM,v1->comm); CHKERRQ(ierr);
#endif
  ierr = MatRestoreArray(*u,&z); CHKERRQ(ierr);
  PLogEventEnd(events[COMM_EVENT],0,0,0,0);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecOrthogonalise_O"
int BlockVecOrthogonalise_O
(int hermit,BlockVec v,MatReuse use,
 Mat *u,Mat u1,Mat u2,QRMethod meth,DotProducts dp)
{ 
  Vec *vecs = v->vecs;
  int m,n,cur,next,ierr;

  PetscFunctionBegin;
  PLogEventBegin(events[ORTHO_EVENT],0,0,0,0);
  ierr = BlockVecGetLocalSize(v,&m,&n); CHKERRQ(ierr);
  if (use==MAT_INITIAL_MATRIX) {
    ierr = MatCreateSeqDense(MPI_COMM_SELF,n,n,PETSC_NULL,u); CHKERRQ(ierr);
  } else if (use==MAT_REUSE_MATRIX) {
    ierr = MatZeroEntries(*u); CHKERRQ(ierr);
  } else SETERRQ(1,0,"Unknown MatReuse mode");

  if (meth==GRAM_SCHMIDT) {
    Scalar *tmp;
    tmp = (Scalar *) PetscMalloc(n*sizeof(Scalar)); CHKPTRQ(tmp);
    for (cur=0; cur<n; cur++) {
      Scalar norm;
      for (next=0; next<cur; next++) {
	if (hermit) {
	  ierr = DotProductsSet(dp,vecs[next],vecs[cur]); CHKERRQ(ierr);
	} else {
	  ierr = DotProductsSetT(dp,vecs[next],vecs[cur]); CHKERRQ(ierr);
	}
      }
      for (next=0; next<cur; next++) {
	Scalar prod;
	ierr = DotProductsGet(dp,&prod); CHKERRQ(ierr);
	ierr = MatSetValues
	  (*u,1,&next,1,&cur,&prod,INSERT_VALUES); CHKERRQ(ierr);
	tmp[next] = -prod;
      }
      for (next=0; next<cur; next++) {
	ierr = VecAXPY(tmp+next,vecs[next],vecs[cur]); CHKERRQ(ierr);
      }
      if (hermit) {
	ierr = VecDot(vecs[cur],vecs[cur],&norm); CHKERRQ(ierr);
      } else {
	ierr = VecTDot(vecs[cur],vecs[cur],&norm); CHKERRQ(ierr);
      }
      norm = sqrt(norm);
      ierr = MatSetValues(*u,1,&cur,1,&cur,&norm,INSERT_VALUES); CHKERRQ(ierr);
      norm = 1./norm;
      ierr = VecScale(&norm,vecs[cur]); CHKERRQ(ierr);
    }
    PetscFree(tmp);
  } else if (meth==MOD_GRAM_SCHMIDT) {
    if (!hermit) SETERRQ(1,0,"MGS not implemented for complex symm");
    for (cur=0; cur<n; cur++) {
      Scalar norm;
      ierr = VecDot(vecs[cur],vecs[cur],&norm); CHKERRQ(ierr);
      norm = sqrt(norm);
      ierr = MatSetValues(*u,1,&cur,1,&cur,&norm,INSERT_VALUES); CHKERRQ(ierr);
      norm = 1./norm;
      ierr = VecScale(&norm,vecs[cur]); CHKERRQ(ierr);
      for (next=cur+1; next<n; next++) {
	Scalar prod;
	ierr = VecDot(vecs[next],vecs[cur],&prod); CHKERRQ(ierr);
	ierr = MatSetValues(*u,1,&cur,1,&next,&prod,INSERT_VALUES); CHKERRQ(ierr);
	prod = -prod;
	ierr = VecAXPY(&prod,vecs[cur],vecs[next]); CHKERRQ(ierr);
      }
    }
  } else if (meth==GRAM_SCHMIDT_TWICE) {
    int dup1=0,dup2=0;
    if ((int)u1==PETSC_NULL) {
      dup1 = 1;
      ierr = MatDuplicate(*u,MAT_DO_NOT_COPY_VALUES,&u1); CHKERRQ(ierr);
    }
    if ((int)u2==PETSC_NULL) {
      dup2 = 1;
      ierr = MatDuplicate(*u,MAT_DO_NOT_COPY_VALUES,&u2); CHKERRQ(ierr);
    }
    if (hermit) {
      ierr = BlockVecOrthogonalise(v,use,&u1,GRAM_SCHMIDT); CHKERRQ(ierr);
      ierr = BlockVecOrthogonalise(v,use,&u2,GRAM_SCHMIDT); CHKERRQ(ierr);
    } else {
      ierr = BlockVecTOrthogonalise(v,use,&u1,GRAM_SCHMIDT); CHKERRQ(ierr);
      ierr = BlockVecTOrthogonalise(v,use,&u2,GRAM_SCHMIDT); CHKERRQ(ierr);
    }
    ierr = MatMatMultDense(u2,u1,*u); CHKERRQ(ierr);
    if (dup1) {ierr = MatDestroy(u1); CHKERRQ(ierr);}
    if (dup2) {ierr = MatDestroy(u2); CHKERRQ(ierr);}
  } else SETERRQ(1,0,"Unknown method");
  ierr = MatAssemblyBegin(*u,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*u,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  PLogEventEnd(events[ORTHO_EVENT],0,0,0,0);

  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecOrthogonalise"
int BlockVecOrthogonalise(BlockVec v,MatReuse use,Mat *u,QRMethod meth)
{
  int ierr;
  Mat u1=PETSC_NULL,u2=PETSC_NULL;
  DotProducts dp;
  PetscFunctionBegin;
  ierr = DotProductsCreate(v->comm,0,&dp); CHKERRQ(ierr);
  if (meth==GRAM_SCHMIDT_TWICE) {
      ierr = MatDuplicate(*u,MAT_DO_NOT_COPY_VALUES,&u1); CHKERRQ(ierr);
      ierr = MatDuplicate(*u,MAT_DO_NOT_COPY_VALUES,&u2); CHKERRQ(ierr);
  }
  ierr = BlockVecOrthogonalise_O(1,v,use,u,u1,u2,meth,dp); CHKERRQ(ierr);
  if (meth==GRAM_SCHMIDT_TWICE) {
    ierr = MatDestroy(u1); CHKERRQ(ierr);
    ierr = MatDestroy(u2); CHKERRQ(ierr);
  }
  ierr = DotProductsDestroy(dp); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecTOrthogonalise"
int BlockVecTOrthogonalise(BlockVec v,MatReuse use,Mat *u,QRMethod meth)
{
  int ierr;
  Mat u1=PETSC_NULL,u2=PETSC_NULL;
  DotProducts dp;
  PetscFunctionBegin;
  ierr = DotProductsCreate(v->comm,0,&dp); CHKERRQ(ierr);
  if (meth==GRAM_SCHMIDT_TWICE) {
      ierr = MatDuplicate(*u,MAT_DO_NOT_COPY_VALUES,&u1); CHKERRQ(ierr);
      ierr = MatDuplicate(*u,MAT_DO_NOT_COPY_VALUES,&u2); CHKERRQ(ierr);
  }
  ierr = BlockVecOrthogonalise_O(0,v,use,u,u1,u2,meth,dp); CHKERRQ(ierr);
  if (meth==GRAM_SCHMIDT_TWICE) {
    ierr = MatDestroy(u1); CHKERRQ(ierr);
    ierr = MatDestroy(u2); CHKERRQ(ierr);
  }
  ierr = DotProductsDestroy(dp); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecBiOrthogonalise_O"
int BlockVecBiOrthogonalise_O
(int hermit,BlockVec w,BlockVec v,MatReuse use,Mat *u,
 Mat u1,Mat u2,QRMethod meth,DotProducts dp)
{ 
  Vec *vecs = v->vecs, *wecs = w->vecs;
  int m,n,cur,next,ierr;

  PetscFunctionBegin;
  PLogEventBegin(events[ORTHO_EVENT],0,0,0,0);
  ierr = BlockVecGetLocalSize(v,&m,&n); CHKERRQ(ierr);
  {
    int mm,nn;
    ierr = BlockVecGetLocalSize(w,&mm,&nn); CHKERRQ(ierr);
    if (mm!=m || nn!=n) SETERRQ(1,0,"BlockVecs mismatch");
  }
  if (use==MAT_INITIAL_MATRIX) {
    ierr = MatCreateSeqDense(MPI_COMM_SELF,n,n,PETSC_NULL,u); CHKERRQ(ierr);
  } else if (use==MAT_REUSE_MATRIX) {
    ierr = MatZeroEntries(*u); CHKERRQ(ierr);
  } else SETERRQ(1,0,"Unknown MatReuse mode");

  if (meth==GRAM_SCHMIDT) {
    Scalar *tmp;
    tmp = (Scalar *) PetscMalloc(n*sizeof(Scalar)); CHKPTRQ(tmp);
    for (cur=0; cur<n; cur++) {
      Scalar norm;
      for (next=0; next<cur; next++) {
	if (hermit) {
	  ierr = DotProductsSet(dp,wecs[cur],vecs[next]); CHKERRQ(ierr);
	} else {
	  ierr = DotProductsSetT(dp,wecs[cur],vecs[next]); CHKERRQ(ierr);
	}
      }
      for (next=0; next<cur; next++) {
	Scalar prod;
	ierr = DotProductsGet(dp,&prod); CHKERRQ(ierr);
	ierr = MatSetValues
	  (*u,1,&next,1,&cur,&prod,INSERT_VALUES); CHKERRQ(ierr);
	tmp[next] = -prod;
      }
      for (next=0; next<cur; next++) {
	ierr = VecAXPY(tmp+next,vecs[next],vecs[cur]); CHKERRQ(ierr);
	ierr = VecAXPY(tmp+next,wecs[next],wecs[cur]); CHKERRQ(ierr);
      }
      if (hermit) {
	ierr = VecDot(wecs[cur],vecs[cur],&norm); CHKERRQ(ierr);
      } else {
	ierr = VecTDot(wecs[cur],vecs[cur],&norm); CHKERRQ(ierr);
      }
      norm = sqrt(norm);
      ierr = MatSetValues(*u,1,&cur,1,&cur,&norm,INSERT_VALUES); CHKERRQ(ierr);
      norm = 1./norm;
      ierr = VecScale(&norm,vecs[cur]); CHKERRQ(ierr);
      ierr = VecScale(&norm,wecs[cur]); CHKERRQ(ierr);
    }
    PetscFree(tmp);
  } else if (meth==GRAM_SCHMIDT_TWICE) {
    int dup1=0,dup2=0;
    if ((int)u1==PETSC_NULL) {
      dup1 = 1;
      ierr = MatDuplicate(*u,MAT_DO_NOT_COPY_VALUES,&u1); CHKERRQ(ierr);
    }
    if ((int)u2==PETSC_NULL) {
      dup2 = 1;
      ierr = MatDuplicate(*u,MAT_DO_NOT_COPY_VALUES,&u2); CHKERRQ(ierr);
    }
    if (hermit) {
      ierr = BlockVecBiOrthogonalise(w,v,use,&u1,GRAM_SCHMIDT); CHKERRQ(ierr);
      ierr = BlockVecBiOrthogonalise(w,v,use,&u2,GRAM_SCHMIDT); CHKERRQ(ierr);
    } else {
      ierr = BlockVecTBiOrthogonalise(w,v,use,&u1,GRAM_SCHMIDT); CHKERRQ(ierr);
      ierr = BlockVecTBiOrthogonalise(w,v,use,&u2,GRAM_SCHMIDT); CHKERRQ(ierr);
    }
    ierr = MatMatMultDense(u2,u1,*u); CHKERRQ(ierr);
    if (dup1) {ierr = MatDestroy(u1); CHKERRQ(ierr);}
    if (dup2) {ierr = MatDestroy(u2); CHKERRQ(ierr);}
  } else SETERRQ(1,0,"Unknown method");
  ierr = MatAssemblyBegin(*u,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*u,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  PLogEventEnd(events[ORTHO_EVENT],0,0,0,0);

  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecBiOrthogonalise"
int BlockVecBiOrthogonalise(BlockVec w,BlockVec v,
			    MatReuse use,Mat *u,QRMethod meth)
{
  int ierr;
  Mat u1=PETSC_NULL,u2=PETSC_NULL;
  DotProducts dp;
  PetscFunctionBegin;
  ierr = DotProductsCreate(v->comm,0,&dp); CHKERRQ(ierr);
  if (meth==GRAM_SCHMIDT_TWICE) {
      ierr = MatDuplicate(*u,MAT_DO_NOT_COPY_VALUES,&u1); CHKERRQ(ierr);
      ierr = MatDuplicate(*u,MAT_DO_NOT_COPY_VALUES,&u2); CHKERRQ(ierr);
  }
  ierr = BlockVecBiOrthogonalise_O(1,w,v,use,u,u1,u2,meth,dp); CHKERRQ(ierr);
  if (meth==GRAM_SCHMIDT_TWICE) {
    ierr = MatDestroy(u1); CHKERRQ(ierr);
    ierr = MatDestroy(u2); CHKERRQ(ierr);
  }
  ierr = DotProductsDestroy(dp); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecTBiOrthogonalise"
int BlockVecTBiOrthogonalise(BlockVec w,BlockVec v,
			    MatReuse use,Mat *u,QRMethod meth)
{
  int ierr;
  Mat u1,u2;
  DotProducts dp;
  PetscFunctionBegin;
  ierr = DotProductsCreate(v->comm,0,&dp); CHKERRQ(ierr);
  if (meth==GRAM_SCHMIDT_TWICE) {
      ierr = MatDuplicate(*u,MAT_DO_NOT_COPY_VALUES,&u1); CHKERRQ(ierr);
      ierr = MatDuplicate(*u,MAT_DO_NOT_COPY_VALUES,&u2); CHKERRQ(ierr);
  }
  ierr = BlockVecBiOrthogonalise_O(0,w,v,use,u,u1,u2,meth,dp); CHKERRQ(ierr);
  if (meth==GRAM_SCHMIDT_TWICE) {
    ierr = MatDestroy(u1); CHKERRQ(ierr);
    ierr = MatDestroy(u2); CHKERRQ(ierr);
  }
  ierr = DotProductsDestroy(dp); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecMatMult"
int BlockVecMatMult(Mat A,BlockVec bi,BlockVec bo)
{
  int iv,ierr;

  PetscFunctionBegin;
  if (bo->n < bi->n) SETERRQ(1,1,"Output vector too skinny");
  for (iv=0; iv<bi->n; iv++) {
    Vec vi,vo;
    ierr = BlockVecGetVec(bi,iv,&vi); CHKERRQ(ierr);
    ierr = BlockVecGetVec(bo,iv,&vo); CHKERRQ(ierr);
    ierr = MatMult(A,vi,vo); CHKERRQ(ierr);
    ierr = BlockVecRestoreVec(bi,iv,&vi); CHKERRQ(ierr);
    ierr = BlockVecRestoreVec(bo,iv,&vo); CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecNormalise"
int BlockVecNormalise(BlockVec v,MatReuse use,Mat *u)
{
  DotProducts dp;
  Vec *vecs = v->vecs;
  int n = v->n, i,ierr;

  PetscFunctionBegin;
  PLogEventBegin(events[ORTHO_EVENT],0,0,0,0);
  ierr = DotProductsCreate(v->comm,0,&dp); CHKERRQ(ierr);

  if (use==MAT_REUSE_MATRIX) {
    ierr = MatZeroEntries(*u); CHKERRQ(ierr);
  } else {
    ierr = MatCreateSeqDense(MPI_COMM_SELF,n,n,PETSC_NULL,u); CHKERRQ(ierr);
  }

  for (i=0; i<n; i++) {
    ierr = DotProductsSet(dp,vecs[i],vecs[i]); CHKERRQ(ierr);
  }
  for (i=0; i<n; i++) {
    Scalar prod;
    ierr = DotProductsGet(dp,&prod); CHKERRQ(ierr);
    prod = sqrt(PetscAbsScalar(prod));
    ierr = MatSetValues(*u,1,&i,1,&i,&prod,INSERT_VALUES); CHKERRQ(ierr);
    prod = 1./prod;
    ierr = VecScale(&prod,vecs[i]); CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(*u,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*u,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  
  ierr = DotProductsDestroy(dp); CHKERRQ(ierr);
  PLogEventEnd(events[ORTHO_EVENT],0,0,0,0);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecBiNormalise"
int BlockVecBiNormalise(BlockVec v,BlockVec w,MatReuse use,Mat *u)
{
  DotProducts dp;
  Vec *vecs = v->vecs,*wecs = w->vecs;
  int n = v->n, i,ierr;

  PetscFunctionBegin;
  PLogEventBegin(events[ORTHO_EVENT],0,0,0,0);
  ierr = DotProductsCreate(v->comm,0,&dp); CHKERRQ(ierr);

  if (use==MAT_REUSE_MATRIX) {
    ierr = MatZeroEntries(*u); CHKERRQ(ierr);
  } else {
    ierr = MatCreateSeqDense(MPI_COMM_SELF,n,n,PETSC_NULL,u); CHKERRQ(ierr);
  }

  for (i=0; i<n; i++) {
    ierr = DotProductsSet(dp,wecs[i],vecs[i]); CHKERRQ(ierr);
  }
  for (i=0; i<n; i++) {
    Scalar prod;
    ierr = DotProductsGet(dp,&prod); CHKERRQ(ierr);
    prod = sqrt(PetscAbsScalar(prod));
    ierr = MatSetValues(*u,1,&i,1,&i,&prod,INSERT_VALUES); CHKERRQ(ierr);
    prod = 1./prod;
    ierr = VecScale(&prod,vecs[i]); CHKERRQ(ierr);
    ierr = VecScale(&prod,wecs[i]); CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(*u,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*u,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  
  ierr = DotProductsDestroy(dp); CHKERRQ(ierr);
  PLogEventEnd(events[ORTHO_EVENT],0,0,0,0);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecPCApply"
int BlockVecPCApply(PC M,BlockVec bi,BlockVec bo)
{
  int iv,ierr;

  PetscFunctionBegin;
  for (iv=0; iv<bi->n; iv++) {
    Vec vi,vo;
    ierr = BlockVecGetVec(bi,iv,&vi); CHKERRQ(ierr);
    ierr = BlockVecGetVec(bo,iv,&vo); CHKERRQ(ierr);
    ierr = PCApply(M,vi,vo); CHKERRQ(ierr);
    ierr = BlockVecRestoreVec(bi,iv,&vi); CHKERRQ(ierr);
    ierr = BlockVecRestoreVec(bo,iv,&vo); CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVecView"
int BlockVecView(BlockVec b,Viewer vw)
{
  Vec v;
  int ierr,iv;
  PetscFunctionBegin;
  for (iv=0; iv<b->n; iv++) {
    ierr = BlockVecGetVec(b,iv,&v); CHKERRQ(ierr);
    PetscPrintf(b->comm,"BlockVec component %d:\n",iv);
    VecView(v,vw);
    ierr = BlockVecRestoreVec(b,iv,&v); CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/**************** Explicit data dependencies ****************/
#undef __FUNC__
#define __FUNC__ "BlockVecCopy"
int BlockVecCopy(BlockVec bi,BlockVec bo)
{
  Scalar *ai,*ao;
  int m,n,ierr;

  PetscFunctionBegin;
  ierr = BlockVecGetLocalSize(bi,&m,&n);
  ierr = BlockVecGetArray(bi,&ai); CHKERRQ(ierr);
  ierr = BlockVecGetArray(bo,&ao); CHKERRQ(ierr);
  PetscMemcpy(ao,ai,m*n*sizeof(Scalar));
  ierr = BlockVecRestoreArray(bi,&ai); CHKERRQ(ierr);
  ierr = BlockVecRestoreArray(bo,&ao); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "BlockVec2Vecs"
int BlockVec2Vecs(BlockVec vi,int n,Vec *vo)
{
  int ierr,iv;

  PetscFunctionBegin;
  if (n!=vi->n) SETERRQ(1,0,"Incompatible sizes");
  for (iv=0; iv<n; iv++) {
    Vec v;
    ierr = BlockVecGetVec(vi,iv,&v); CHKERRQ(ierr);
    ierr = VecCopy(v,vo[iv]); CHKERRQ(ierr);
    ierr = BlockVecRestoreVec(vi,iv,&v); CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNC__
#define __FUNC__ "Vecs2BlockVec"
int Vecs2BlockVec(Vec *vi,int n,BlockVec vo)
{
  int ierr,iv;

  PetscFunctionBegin;
  if (n!=vo->n) SETERRQ(1,0,"Incompatible sizes");
  for (iv=0; iv<n; iv++) {
    Vec v;
    ierr = BlockVecGetVec(vo,iv,&v); CHKERRQ(ierr);
    ierr = VecCopy(vi[iv],v); CHKERRQ(ierr);
    ierr = BlockVecRestoreVec(vo,iv,&v); CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
