// File: tms.cc -- Implements class tms
// Author: Suvrit Sra
// Date: 15, nov 2003

/*************************************************************************
 THE ORIGINAL SVDPAKC COPYRIGHT
                           (c) Copyright 1993
                        University of Tennessee
                          All Rights Reserved                          
 *************************************************************************/

#include <cmath>
#include "tms.h"

using namespace ssvd;
double tms::epslon(double x)
{
  /* Estimate unit roundoff in quantities of size x 
     
     This function should work properly on all systems
     satisfying the following two assumptions,
     1. The base used in representiong floating point
        numbers is not a power of three.
     2. The quantity a in statement 10 is represented to 
        the accuracy used in floating point variables
        that are stored in memory.
     The statement number 10 and the while are intended 
     to force optimizing compilers to generate code
     satisfying assumption 2.
     Under these assumptions, it should be true that,
         A is not exactly equal to four-thirds,
         B has a zero for its last bit or digit,
         C is not exactly equal to one,
         EPS measures the separation of 1.0 from
             the next larger floating point number.
     This routine is based on Eispack. The developers of
     Eispack would appreciate being informed about any
     systems where these assumptions do not hold. */

  double  a,b,c,eps;

  a = 4.0/3.0;
  do {
     b = a - ONE; /* statement 10 */
     c = b + b + b;
     eps = fabs(c -ONE);
  }
  while (eps == ZERO);  
  eps = eps*fabs(x);
  return(eps);
}


void tms::disk(long n, double *sig, double *rad, long *csize, long *clus)
{
  /* monitor separation of gershgorin disks */

  double radi,radipi,tmp;
  long   cptr,i,k,flag,upper;

  /* assume ordering: sig[0] <= sig[1] <= ...<= sig[n-1] for sig 
     array in tsvd1.
     rad := radii for approximate e-values  

     csize array indicates location and size of clusters:
     csize[i]=k  (k != 0) gives cluster of size k with first disk
     centered at sig[i]. */

  upper = n-1;
  for(i=0; i<upper; i++) {
     radi = rad[i];
     radipi = rad[i+1];
     tmp = (radi+radipi) - (sig[i+1] - sig[i]);
     if(tmp<=0.0) clus[i] = 1;
     else  clus[i] = 0;
  }
  /* clustering vector filled, now locate clusters and their size */

  for(i=0; i<n; i++) csize[i] = 1;

  for(i=0; i<upper; i++) {
     cptr = i;
     flag = 1;
     k = i-1;
     while((flag) && (k>=0)) {
        if(csize[k])  flag = 0;
        else  cptr=k;
        k -= 1;
     }
   if(!clus[i]) {
     if(csize[i]) csize[i] += 1;
     else csize[cptr-1] += 1;
     csize[i+1] -= 1;
   }
  }
return;
}


void tms::isol(long i, double resid, double tol, long *init, double *ireso,
          double *creso, long s)
{
  /* monitor residual reduction for isolated singuar value
     approximations.
     Assume ordering : sig[0]<=sig[1]<=...<=sig[s-1] for sig 
     array in tsvd1. 
     Resid : 2-norm of B-eigenpair residual   */

  if((ireso[i]<ZERO) || (creso[i]>=ZERO)) {

      if(creso[i]<ZERO) ireso[i]=resid;
      else {
           ireso[i]=resid;
           creso[i]=-ONE;
      }
    }
  else 
       if(resid<=tol*ireso[i]) init[i] = TRUE;

  return;
}

void tms::clus(long i, long size, double *resid, double tol, long *init,
          double *ireso, double *creso, double *tmp, long s)
{
  /* Monitor residual reduction for clustered singular value 
     approxmations.
     Assume ordering : sig[0] <= sig[1] <=...<= sig[n-1] for 
     sig array in tsvd1.

     resid = 2-norm of B-eigenpair residuals. 
     i     = first disk in cluster.
     size  = number of disks in cluster. */


  double error;
  long   k,upper;

  for(k=0; k<size; k++) tmp[k] = resid[k]*resid[k];
  error = sqrt(dasum(size, tmp, 1));
  if(creso[i] < ZERO) { 
    if(ireso[i] < ZERO) creso[i] = error;
    else {
         creso[i] = error;
         ireso[i] = -ONE;
    }
  }
  else
     if(error <= (tol*creso[i])) {
       upper = i+size;
       for(k=i; k<upper; k++) init[k] = TRUE;
     }
  return;
}

void tms::cgts(long n, long left, double *w, double **v, long *cgiter,
          double sig, double sigold, double sigmax, double shift,
          long *kount, double eps)
{
  /* cg for independent systems in trace min. optimization step.
   (shift incorporated)
   v stores current orthog basis for r[y]. */

  double denom, bound, a, a0, b, error, rnorm, rnorm0, vnorm;
  long   maxi, i, j, k, ii;

  maxi = n;
  *kount = 0;
  if(sig!=shift) {
    bound = (sig-shift)/(sigmax-shift);
    bound=bound*bound;
  }
  else {
       bound = (sigold-shift)/(sigmax-shift);
       bound=bound*bound;
  }
  /* w0=w by definition , get first redidual residual via orthog.
     projector */

  myopb(n, w, z, shift);
  *kount += 1;
  dgemv2(TRANSP, n, left, ONE, v, 0, 0, z, ZERO, r);
  dgemv2(NTRANSP, n, left, ONE, v, 0, 0, r, ZERO, v2);
  for(i=0; i<n; i++) r[i] = z[i]-v2[i];
  rnorm0 = sqrt(ddot(n,r,1,r,1));
  for(i=0; i<n; i++) pp[i] = r[i];

  /* main iteration loop 30*/

  for( ii=0; ii<maxi; ii++) {

     myopb(n, pp, v2, shift);
     *kount += 1;
     denom = ddot(n,pp,1,v2,1);
     if(denom<=ZERO){
       return;
     }
     a = ddot(n, r, 1, r, 1) / denom;
     if(ii==0) a0 = a;
     daxpy(n, -a, pp, 1, w, 1);
     for(j=0; j<n; j++) r2[j] = r[j];
     dgemv2(TRANSP, n, left, ONE, v, 0, 0, v2, ZERO, z);
     dgemv2(NTRANSP, n, left, ONE, v, 0, 0, z, ZERO, z2);
     for(i=0; i<n; i++) v2[i] -= z2[i];
     daxpy(n, -a, v2, 1, r2, 1);
     rnorm = sqrt(ddot(n,r2,1,r2,1));
     for(k=0; k<n; k++) v2[k] = pp[k];
     dscal(n, a, v2, 1);
     vnorm = sqrt(ddot(n,v2,1,v2,1));

  /* early termination code: */

     error = fabs(a*rnorm*rnorm/(a0*rnorm0*rnorm0));
     if((error<=bound) || (vnorm <= eps))  {
       *cgiter += ii+1;
       return;
       }
     else if (ii==maxi-1) {
          *cgiter += maxi;
           printf("cgts failed to converge in  %ld  iterations\n",maxi);
           return;
          }

     for(j=0; j<n; j++) v2[j] = r2[j];
     b=ddot(n, r2, 1, r2, 1) / ddot(n, r, 1, r, 1);
     daxpy(n, b, pp, 1, v2, 1);
     for(j=0; j<n; j++) pp[j] = v2[j];
     for(j=0; j<n; j++) r[j] = r2[j];
  }
return;
}



void tms::cgt(long n, long left, double *w, double **v, long *cgiter, 
         double sig, double sigmax, long *kount, double eps)
{
  /* cg for independent systems in trace min. optimization step.
     v stores current orthog basis for r[y]. */

  double denom, bound, a, a0, b, error, rnorm, rnorm0, vnorm;
  long   maxi, i, j, k, ii;

  maxi = n;
  *kount = 0;
  bound = sig/sigmax ;
  bound = bound*bound;

  /* w0=w by definition , get first residual via orthog, projector */

  myopb(n, w, z, ZERO);
  *kount += 1;
  dgemv2(TRANSP, n, left, ONE, v, 0, 0, z, ZERO, r);
  dgemv2(NTRANSP, n, left, ONE, v, 0, 0, r, ZERO, v2);
  for(i=0; i<n; i++) r[i] = z[i]-v2[i];
  rnorm0 = sqrt(ddot(n,r,1,r,1));
  for(i=0; i<n; i++) pp[i] = r[i];

  /* main iteration loop */

  for( ii=0; ii<maxi; ii++) {

     myopb(n, pp, v2, ZERO);
     *kount += 1;
     denom = ddot(n, pp, 1, v2, 1);
     if(denom <= ZERO) {
       return;
     }
     a = ddot(n, r, 1, r, 1)/denom;
     if(ii==0) a0 = a;
     daxpy(n, -a, pp, 1, w, 1);
     for(j=0; j<n; j++) r2[j] = r[j];
     dgemv2(TRANSP, n, left, ONE, v, 0, 0, v2, ZERO, z);
     dgemv2(NTRANSP, n, left, ONE, v, 0, 0, z, ZERO, z2);
     for(i=0; i<n; i++) v2[i] -= z2[i];
     daxpy(n, -a, v2, 1, r2, 1);
     rnorm = sqrt(ddot(n, r2, 1,r2,1));
     for(k=0; k<n; k++) v2[k] = pp[k];
     dscal(n, a, v2, 1);
     vnorm = sqrt(ddot(n,v2,1,v2,1));

    /* early termination code: */

     error = fabs(a*rnorm*rnorm/(a0*rnorm0*rnorm0));
     if((error <= bound) || (vnorm <= eps))  {
       *cgiter += ii+1;
       return;
     }
     else if(ii==maxi-1) {
           *cgiter += maxi;
           printf("cgt failed to converge in  %ld   iterations\n",maxi);
           return;
          }
   
     for(j=0; j<n; j++) v2[j] = r2[j];
     b=ddot(n, r2, 1, r2, 1)/ddot(n, r, 1, r, 1);
     daxpy(n, b, pp, 1, v2, 1);
     for(j=0; j<n; j++) pp[j]=v2[j];
     for(j=0; j<n; j++) r[j]=r2[j];
  }
  return;
}


void tms::porth(long p, long f, long n, double **x, double *tmp,
           double *tmp1, double *tmp2, double *tmp3,
           double *tmp4, double e, long degree, double alpha)
/* 
     C translation of blas2  Gram-Schmidt orthogonalization procedure 
     for polynomial accelerated trace minimization (job=2)
     the n by p matrix z stored in columns f+1 to f+p of
     array x is reorthogonalized w.r.t. the first f columns of
     array x.  the resulting matrix z contains the desired P-orthogonal
     columns, where P is a polynomial in the matrix b from tsvd2.
     (based on orthog from Rutishauser)  */

{
long fpp,k,km1,j;

double s,t;

  if(!p) return;
  fpp = f+p;

  for(k=f; k<fpp; k++) {
     km1 = k-1;
L10:
    t = 0.0;
    if(km1<0) goto L25;
    pmul(n,x[k],tmp1,tmp2,tmp3,tmp4,e,degree,alpha);

  if(km1>0) {
    dgemv2(TRANSP,n,km1+1,1.0,x,0,0,tmp1,0.0,tmp);
    t += ddot(km1+1,tmp,1,tmp,1);
  }
  else {
     tmp[0] = ddot(n,x[0],1,tmp1,1);
     t += tmp[0]*tmp[0];
  }

  if(km1>0) dgemv2(NTRANSP,n,km1+1,-1.0,x,0,0,tmp,1.0,x[k]);
  else daxpy(n,-tmp[0],x[0],1,x[k],1);
  if(p==0) goto L50;
L25:
  pmul(n,x[k],tmp1,tmp2,tmp3,tmp4,e,degree,alpha);
  s = ddot(n,x[k],1,tmp1,1);
  t += s;
  if(s>t/100.0) goto L40;
  goto L10;
L40:
  s = sqrt(s);
  if(s!=0.0) s = 1.0/s;
  dscal(n,s,x[k],1);
L50:
  j=0;
  }
   return;
}


void tms::dtrsm(char side, char uplo, long transa, char diag, long m,
           long n, double alpha, double **a, double **b)

/*  dtrsm  solves one of the matrix equations
 *
 *     op( a )*x = alpha*b,   or   x*op( a ) = alpha*b,
 *
 *  where alpha is a scalar, x and b are m by n matrices, a is a unit, or
 *  non-unit,  upper or lower triangular matrix  and  op( a )  is one  of
 *
 *     op( a ) = a   or   op( a ) = a'.
 *
 *  the matrix x is overwritten on b.
 *
 *  parameters
 *  ==========
 *
 *  side   - character*1.
 *           on entry, side specifies whether op( a ) appears on the left
 *           or right of x as follows:
 *
 *              side = 'l' or 'l'   op( a )*x = alpha*b.
 *
 *              side = 'r' or 'r'   x*op( a ) = alpha*b.
 *
 *           unchanged on exit.
 *
 *  uplo   - character*1.
 *           on entry, uplo specifies whether the matrix a is an upper or
 *           lower triangular matrix as follows:
 *
 *              uplo = 'u' or 'u'   a is an upper triangular matrix.
 *
 *              uplo = 'l' or 'l'   a is a lower triangular matrix.
 *
 *           unchanged on exit.
 *
 *  transa - long.
 *           on entry, transa specifies the form of op( a ) to be used in
 *           the matrix multiplication as follows:
 *
 *              transa = 0    op( a ) = a.
 *
 *              transa = 1    op( a ) = a'.
 *
 *
 *           unchanged on exit.
 *
 *  diag   - character*1.
 *           on entry, diag specifies whether or not a is unit triangular
 *           as follows:
 *
 *              diag = 'u' or 'u'   a is assumed to be unit triangular.
 *
 *              diag = 'n' or 'n'   a is not assumed to be unit
 *                                  triangular.
 *           unchanged on exit.
 *
 *  m      - integer.
 *           on entry, m specifies the number of rows of b. m must be at
 *           least zero.
 *           unchanged on exit.
 *
 *  n      - integer.
 *           on entry, n specifies the number of columns of b.  n must be
 *           at least zero.
 *           unchanged on exit.
 *
 *  alpha  - double precision.
 *           on entry,  alpha specifies the scalar  alpha. when  alpha is
 *           zero then  a is not referenced and  b need not be set before
 *           entry.
 *           unchanged on exit.
 *
 *  a      - double precision array of dimension ( lda, k ), where k is m
 *           when  side = 'l' or 'l'  and is  n  when  side = 'r' or 'r'.
 *           before entry  with  uplo = 'u' or 'u',  the  leading  k by k
 *           upper triangular part of the array  a must contain the upper
 *           triangular matrix  and the strictly lower triangular part of
 *           a is not referenced.
 *           before entry  with  uplo = 'l' or 'l',  the  leading  k by k
 *           lower triangular part of the array  a must contain the lower
 *           triangular matrix  and the strictly upper triangular part of
 *           a is not referenced.
 *           note that when  diag = 'u' or 'u',  the diagonal elements of
 *           a  are not referenced either,  but are assumed to be  unity.
 *           unchanged on exit.
 *
 *  lda    - integer.
 *           on entry, lda specifies the first dimension of a as declared
 *           in the calling (sub) program.  when  side = 'l' or 'l'  then
 *           lda  must be at least  max( 1, m ),  when  side = 'r' or 'r'
 *           then lda must be at least max( 1, n ).
 *           unchanged on exit.
 *
 *  b      - double precision array of dimension ( ldb, n ).
 *           before entry,  the leading  m by n part of the array  b must
 *           contain  the  right-hand  side  matrix  b,  and  on exit  is
 *           overwritten by the solution matrix  x.
 *
 *  ldb    - integer.
 *           on entry, ldb specifies the first dimension of b as declared
 *           in  the  calling  (sub)  program.   ldb  must  be  at  least
 *           max( 1, m ).
 *           unchanged on exit.
 *
 *
 *  C translation of level 3 blas routine.
 *  -- written on 8-february-1989.
 *     jack dongarra, argonne national laboratory.
 *     iain duff, aere harwell.
 *     jeremy du croz, numerical algorithms group ltd.
 *     sven hammarling, numerical algorithms group ltd.   */
 
{
long i,j,k,lside,nounit,upper,nrowa;
double temp;

  if(side=='L') { 
    nrowa = m;
    lside=1;
  }
  else {
    nrowa = n;
    lside = 0;
  }
  if(diag=='N') nounit = 1; 
  else nounit = 0;
  if(uplo == 'U') upper = 1;
  else upper = 0;

  /*info = 0;
  if((!lside) && (!lsame(side,'R')) info =1; */
  
  
  if(alpha==ZERO) {
  for(j=0; j<n; j++)
     for(i=0; i<m; i++)
        b[j][i] = ZERO;
  return;
  }
if(lside) {
  if(!transa) {
    if(upper) {
      for(j=0; j<n; j++) {
         if(alpha!=ONE) for(i=0; i<m; i++) b[j][i] *= alpha;
         for(k=m-1; k>=0; k--) {
            if(b[j][k]!=ZERO) {
              if(nounit) b[j][k] /= a[k][k];
              for(i = 0; i<k; i++) b[j][i] -= b[j][k]*a[k][i];
            }
         }
      }
     }
    else {
     for(j=0; j<n; j++) {
        if(alpha!=ONE) for(i=0; i<m; i++) b[j][i] *= alpha;
        for(k=0; k<m; k++) {
             if(b[j][k]!=ZERO) {
               if(nounit) b[j][k] /= a[k][k];
               for(i=k+1; i<m; i++) b[j][i] -= b[j][k]*a[k][i];
             }
        }
     }
    }
 }
 else {
/* b = alpha*inv(a') *b */

    if(upper) {
      for(j=0; j<n; j++) {
         for(i=0; i<m; i++) {
            temp = alpha*b[j][i];
            for(k=0; k<i; k++) temp -= a[i][k]*b[j][k];
            if(nounit) temp /= a[i][i];
            b[j][i] = temp;
         }
      }
    }
    else {
       for(j=0; j<n; j++) {
          for(i=m-1; i>=0; i--) {
              temp = alpha*b[j][i];
              for(k=i+1; k<m; k++) temp -= a[i][k]*b[j][k];
              if(nounit) temp /= a[i][i];
              b[j][i] = temp;
          }
       }
    }
 }
}
else {
/* b = alpha*b*inv(a) */
     if(!transa) {
       if(upper) {
         for(j=0; j<n; j++) { 
            if(alpha!=ONE) for(i=0; i<m; i++) b[j][i] *= alpha;
            for(k=0; k<j; k++) {
               if(a[j][k]!=ZERO) {
                 for(i=0; i<m; i++) b[j][i] -= a[j][k]*b[k][i];
               }
            }
            if(nounit) {
              temp = ONE/a[j][j];
              for(i=0; i<m; i++) b[j][i] *= temp;
            }
         }
        }
      else {
        for(j=n-1; j>=0; j--) {
           if(alpha!=ONE) for(i=0; i<m; i++) b[j][i] *= alpha;
           for(k=j+1; k<n; k++) {
              if(a[j][k] !=ZERO) 
                for(i=0; i<m; i++) b[j][i] -= a[j][k]*b[k][i];
            }
           if(nounit) {
             temp = ONE/a[j][j];
             for(i=0; i<m; i++) b[j][i] *= temp;
           }
        }
      }
     }
   else {
/* form b = alpha*b*inv[a']. */
   if(upper) {
    for(k=n-1; k>=0; k--) {
       if(nounit) {
         temp = ONE/a[k][k];
         for(i=0; i<m; i++) b[k][i] *= temp;
       }
       for(j=0; j<k; j++) {
          if(a[k][j]!=ZERO) {
            temp = a[k][j];
            for(i=0; i<m; i++) b[j][i] -=temp*b[k][i];
          }
       }
       if(alpha !=ONE) for(i=0; i<m; i++) b[k][i] *= alpha;
    }
   } 
   else {
      for(k=0; k<n; k++) {
         if(nounit) {
           temp = ONE/a[k][k];
           for(i=0; i<m; i++) b[k][i] *= temp;
         }
         for(j=k+1; j<n; j++) {
            if(a[k][j]!=ZERO) {
              temp = a[k][j];
              for(i=0; i<m; i++) b[j][i] -= temp*b[k][i];
            }
         }
         if(alpha!=ONE) for(i=0; i<m; i++) b[k][i] *= alpha;
     }
   }
 }
}
}


void tms::pmul(long n, double *y, double *z, double *z2, double *z3,
          double *z4, double e, long degree, double alpha)

/* Compute the multiplication z=P(B)*y (and leave y untouched).
   P(B) is constructed from chebyshev polynomials. The input
   parameter degree specifies the polynomial degree to be used.
   Polynomials are constructed on interval (-e,e) as in Rutishauser's
   ritzit code                                                    */
{

double tmp2,alpha2,eps,tmp;
 
long i,kk;

  alpha2 = alpha*alpha;
  tmp2 = 1.0;
  eps = tmp2 + 1.0e-6;
  if(!degree) for(i=0; i<n; i++) z[i] = y[i];
  else if(degree==1) myopb(n,y,z,alpha2);
     else if(degree>1) {
          tmp2 = 2.0/e;
          tmp = 5.1e-1*tmp2;
          for(i=0; i<n; i++)
             z[i] = y[i];
          dscal(n,tmp,z,1);
          myopb(n,z,z3,alpha2);
          for(kk=1; kk<degree; kk++) {
             for(i=0; i<n; i++) {
                z4[i] = z3[i];
                z[i] = -z[i];
             }
             myopb(n,z3,z2,alpha2);
             daxpy(n,tmp2,z2,1,z,1);
             if(kk != degree-1) {
               for(i=0; i<n; i++) z3[i] = z[i];
               for(i=0; i<n; i++) z[i] = z4[i];
             }
         }
        }
  daxpy(n,eps,y,1,z,1);
  return;
}

void tms::myopb(long n, double *x, double *y, double shift)
{
/* multiplication of B = [(alpha*alpha-shift)*I - A'A] by a vector x , where

        where A is nrow by ncol (nrow>>ncol), and alpha an upper bound
        for the largest singular value of A, and 
        shift is the approximate singular value of A.
        (y stores product vector)                                          */

  long i,j,last;

  for(i=0; i<n; i++)
     y[i]=(alpha*alpha-shift)*x[i];

  for(i=0; i<nrow; i++) zz[i] = 0.0;

  for(i=0; i<ncol; i++) {
     last = pointr[i+1];
     for(j=pointr[i]; j<last; j++)
        zz[rowind[j]] += value[j]*x[i];
  }

  for(i=0; i<ncol; i++) {
     for(j=pointr[i]; j<pointr[i+1]; j++)
        y[i] -= value[j]*zz[rowind[j]];
  }
  return;
}
