  /* Query the ownership range and the local/global dimensions of A and B,
     then verify that the two matrices can be combined. */
  ierr = MatGetOwnershipRange(A,&A_start,&idum); CHKERRQ(ierr);
  ierr = MatGetLocalSize(A,&A_locali,&A_localj); CHKERRQ(ierr);
  {
    /* First global dimension of A and of B; these must agree.  The
       second global dimensions are stored in A_global and B_global. */
    int A_inner,B_inner;
    ierr = MatGetSize(A,&A_inner,&A_global); CHKERRQ(ierr);
    ierr = MatGetSize(B,&B_inner,&B_global); CHKERRQ(ierr);
    if (A_inner!=B_inner) {
      SETERRQ(1,0,"MatTMatMult_MPIAIJ: Global inner dimension mismatch");
    }
  }

  {
    /*
       Form two local products with MatTMatMult_AIJ (one for Aij->A, one
       for Aij->B), create the parallel result matrix res, add the rows of
       both products into it, and assemble it.

       Reads  : A_locali, A_localj, A_global, B_global, A_off, lsize, comm
       Writes : res (created and assembled here), ierr
       Every PETSc call is checked with CHKERRQ, which leaves the
       enclosing function on error.
    */
    Mat *sub,tmp1,tmp2;
    IS A_total,B_total,IS_A[2],IS_B[2];

    /* Index sets for the part of B to extract: A_locali rows starting at
       index 0, and all B_global columns.
       NOTE(review): the row set starts at 0 rather than at A_start.  If
       MatGetSubMatrices interprets these as global row numbers, this
       matches the locally owned rows of A only when A_start is 0 --
       confirm the intent for runs on more than one process. */
    ierr = ISCreateStride(comm,A_locali,0,1,&A_total); CHKERRQ(ierr);
    ierr = ISCreateStride(comm,B_global,0,1,&B_total); CHKERRQ(ierr);
    IS_A[0] = A_total; IS_B[0] = B_total;

    /* get the B block corresponding to our part of A; the index sets are
       destroyed as soon as the submatrix has been extracted */
    ierr = MatGetSubMatrices(B,1,IS_A,IS_B,MAT_INITIAL_MATRIX,&sub);
    CHKERRQ(ierr);
    ierr = ISDestroy(A_total); CHKERRQ(ierr);
    ierr = ISDestroy(B_total); CHKERRQ(ierr);

    /* multiply diag and off-diag block of A with relevant parts of B */
    ierr = MatTMatMult_AIJ(Aij->A,sub[0],&tmp1); CHKERRQ(ierr);
    {
      /* Sanity check: the first dimension of tmp1 has to equal A_global.
         The return code of MatGetSize is checked like every other PETSc
         call, so a failed query cannot leave the sizes undefined. */
      int tmp1_rows,tmp1_cols;
      ierr = MatGetSize(tmp1,&tmp1_rows,&tmp1_cols); CHKERRQ(ierr);
      if (tmp1_rows!=A_global) {
        printf("Ajsize=%d,(A->A^t x B[Alocali,Bglobalj])isize=%d\n",
               A_global,tmp1_rows);
        SETERRQ(1,0,"tmp1 weirdness\n");
      }
    }
    ierr = MatTMatMult_AIJ(Aij->B,sub[0],&tmp2); CHKERRQ(ierr);

    /* Create the result matrix; the layout depends on lsize. */
    if (lsize==PETSC_DECIDE) {
      /* no local size requested: global size A_global x B_global with
         a fixed preallocation of 5 for both parts of each row */
      ierr = MatCreateMPIAIJ
        (comm,PETSC_DECIDE,PETSC_DECIDE,A_global,B_global,5,0,5,0,&res);
      CHKERRQ(ierr);
    } else {
      /* an explicit local size is only accepted for square results */
      if (A_global!=B_global)
        SETERRQ(1,1,"Only specified lsize for square matrices");
      if (lsize!=A_localj) {
        /* requested local size differs from A_localj: report it and
           fall back to the fixed preallocation */
        printf("Ouch! <%d,%d>\n",lsize,A_localj);
        ierr = MatCreateMPIAIJ
          (comm,lsize,lsize,PETSC_DECIDE,PETSC_DECIDE,5,0,5,0,&res);
        CHKERRQ(ierr);
      } else {
        /* Preallocate from the actual row lengths: bandv accumulates,
           per result row, the number of entries that tmp1 and tmp2 will
           contribute to that row. */
        Vec bandv; int iRow;
        ierr = VecCreateMPI(comm,lsize,PETSC_DECIDE,&bandv); CHKERRQ(ierr);
        for (iRow=0; iRow<A_global; iRow++) {
          int Row=iRow,ncols; Scalar colsv;
          ierr = MatGetRow(tmp1,iRow,&ncols,PETSC_NULL,PETSC_NULL);
          CHKERRQ(ierr);
          colsv = (Scalar) ncols;
          ierr = VecSetValues(bandv,1,&Row,&colsv,ADD_VALUES); CHKERRQ(ierr);
          ierr = MatRestoreRow(tmp1,iRow,&ncols,PETSC_NULL,PETSC_NULL);
          CHKERRQ(ierr);
        }
        /* A_off is set outside this section; presumably the number of
           rows of tmp2 -- TODO confirm.  Row iRow of tmp2 is counted
           towards result row iRow+A_localj. */
        for (iRow=0; iRow<A_off; iRow++) {
          int Row=iRow+A_localj, ncols; Scalar colsv;
          ierr = MatGetRow(tmp2,iRow,&ncols,PETSC_NULL,PETSC_NULL);
          CHKERRQ(ierr);
          colsv = (Scalar) ncols;
          ierr = VecSetValues(bandv,1,&Row,&colsv,ADD_VALUES); CHKERRQ(ierr);
          ierr = MatRestoreRow(tmp2,iRow,&ncols,PETSC_NULL,PETSC_NULL);
          CHKERRQ(ierr);
        }
        ierr = VecAssemblyBegin(bandv); CHKERRQ(ierr);
        ierr = VecAssemblyEnd(bandv); CHKERRQ(ierr);
        {
          /* Convert the local part of bandv to an int array and hand it
             to MatCreateMPIAIJ as the per-row count for both the
             diagonal and the off-diagonal preallocation. */
          Scalar *band; int *bandw,vsize,i;
          ierr = VecGetLocalSize(bandv,&vsize); CHKERRQ(ierr);
          /* vsize+1 so that the request is never for zero bytes */
          bandw = (int*) PetscMalloc((vsize+1)*sizeof(int)); CHKPTRQ(bandw);
          ierr = VecGetArray(bandv,&band); CHKERRQ(ierr);
          for (i=0; i<vsize; i++) {
            bandw[i] = (int)PetscAbsScalar(band[i]);
          }
          ierr = VecRestoreArray(bandv,&band); CHKERRQ(ierr);
          ierr = VecDestroy(bandv); CHKERRQ(ierr);
          ierr = MatCreateMPIAIJ
            (comm,lsize,lsize,PETSC_DECIDE,PETSC_DECIDE,
             0,bandw,0,bandw,&res); CHKERRQ(ierr);
          PetscFree(bandw);
        }
      }
    }

    {
      /* Add the rows of both local products into res.  Row iRow of tmp1
         goes to result row iRow, row iRow of tmp2 to result row
         iRow+A_localj; ADD_VALUES sums entries that coincide. */
      int iRow;
      for (iRow=0; iRow<A_global; iRow++) {
        int Row=iRow, ncols,*cols; Scalar *vals;
        ierr = MatGetRow(tmp1,iRow,&ncols,&cols,&vals); CHKERRQ(ierr);
        ierr = MatSetValues(res,1,&Row,ncols,cols,vals,ADD_VALUES);
        CHKERRQ(ierr);
        ierr = MatRestoreRow(tmp1,iRow,&ncols,&cols,&vals); CHKERRQ(ierr);
      }
      for (iRow=0; iRow<A_off; iRow++) {
        int Row=iRow+A_localj, ncols,*cols; Scalar *vals;
        ierr = MatGetRow(tmp2,iRow,&ncols,&cols,&vals); CHKERRQ(ierr);
        ierr = MatSetValues(res,1,&Row,ncols,cols,vals,ADD_VALUES);
        CHKERRQ(ierr);
        ierr = MatRestoreRow(tmp2,iRow,&ncols,&cols,&vals); CHKERRQ(ierr);
      }
    }

    /* Release the temporaries between the begin and end of assembly. */
    ierr = MatAssemblyBegin(res,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
    ierr = MatDestroy(sub[0]); CHKERRQ(ierr);
    PetscFree(sub);
    ierr = MatDestroy(tmp1); CHKERRQ(ierr);
    ierr = MatDestroy(tmp2); CHKERRQ(ierr);
    ierr = MatAssemblyEnd(res,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  }

  *C = res;

