/* File: AccurateIC.cpp
 *
 * Implements a routine for the computation of consistent initial values
 * for linear DAEs
 * 
 * C Michael Hanke 2023
 * Version: 2023-05-14
 */

/* 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.

*/

#include "DAE.hpp"
#include "AccurateIC.hpp"
#include "Chebyshev2.hpp"
#include "QuadratureRule.hpp"
#include "randutv/NoFLA_UTV_WY_blk_var2.h"
#include "Eigen/Dense"
#include "unsupported/Eigen/KroneckerProduct"
#include <forward_list>
#include <algorithm>
#include <iostream>
#include <cmath>
#include <cstdlib>
#ifdef USEMKL
#include <mkl.h>
#else
#include <lapacke.h>
#include <cblas.h>
#endif

// Fortran name mangling
#define DLAIC1 dlaic1_
#ifdef USE_RRQR
#define DGEQPX dgeqpx_
#define DGEQRF dgeqrf_

extern "C" {
    void DGEQPX(const lapack_int* job,
                const lapack_int* m,
                const lapack_int* n,
                const lapack_int* k,
                double* A,
                const lapack_int* lda,
                double* C,
                const lapack_int* ldc,
                lapack_int* jpvt,
                const double* ircond,
                double* orcond,
                lapack_int* rank,
                double svlues[4],
                double* work,
                const lapack_int* lwork,
                lapack_int* info);
}
#endif

#ifndef USEMKL
extern "C" {
    void DLAIC1(const lapack_int* job, const lapack_int* j, const double* x,
                const double* sest, const double* w, const double* gamma,
                double* sestpr, double* s, double* c);
}
#endif

/*
 * This code generates a matrix which can be used to pose correct initial
 * conditions for higher-index daes.
 */

using namespace LSCM;
using namespace Eigen;
using namespace std;

// Local auxillary functions

namespace {

// This is a wrapper for the random UTV decomposition method developed by P.G. Martinsson,
// G. Quintana-Orti, and N. Haevner, ACM TOMS 45(2019)1, 4:1-4:26. We use it to compute
// an ONB of the image of A and its orthogonal complement. A will be overwritten.

// The oversampling parameter is chosen slightly conservative. Maybe, it should be chosen
// by the user
void UTV(MatrixXd& A, MatrixXd& Q, LSCMint& r, const double sf, LSCMint pp = 5) {
    // technical parameters for UTV
    lapack_int nb = 32;     // Maybe, a better alternative is ilaenv with "DGEQP3"?
    lapack_int n_iter = 2;  // Good choice according to the paper
    if (pp < 0) {
        cerr << "UTV Warning: Oversampling less than 0. Set to 0." << endl;
        pp = 0;
    }
    
    lapack_int m = A.rows();
    lapack_int n = A.cols();
    if (m < n) {
        cerr << "UTV: Number of columns must not be greater than that of rows!" << endl;
        exit(1);
    }
    
    // Run main routine
    Q = MatrixXd(m,m);
    NoFLA_UTV_WY_blk_var2(m,n,A.data(),m,1,m,m,Q.data(),m,0,n,n,nullptr,n,nb,pp,n_iter);
    
    // At this point, the diagonal of A should contain good approximations to the
    // singular values of the original matrix A. However, they are not necessarily sorted.
    // However, most often they are. So they following algorithm has low complexity
    // and is much faster than, e.g., sorting.
    
    // Find the rank r and sort Q such that the first r columns form a basis of range(A)
    double smax = A.diagonal().array().maxCoeff();
    double threshold = sf*numeric_limits<double>::epsilon()*sqrt(n)*smax;
    r = 0;
    for (LSCMint i = 0; i < n; ++i)
        if (A(i,i) >= threshold) ++r;
    
    // Finally, sort the columns of Q.
    LSCMint iloc = 0;
    for (LSCMint i = 0; i < n; ++i)
        if (A(i,i) >= threshold) {
            if (i != iloc) {
                Q.col(i).swap(Q.col(iloc));
            }
            ++iloc;
            if (iloc == r) break;
        }
}

#ifdef USE_RRQR
// This is a wrapper for the rank-revealing QR-factorization by Christian M. Bischof
// and Gregorio Quintana-Orti, Algorithm 782 of TOMS. We use it to compute an ONB
// of the image of A and its orthogonal complement. A will be overwritten by the
// R factor.
//
// NOTE: We assume the matrix not to be the zero matrix! Neither that
//       A is an empty matrix! Must be secured in the caller!
void RRQR(MatrixXd& A, MatrixXd& Q, LSCMint& r, const double sf) {
    lapack_int m = A.rows();
    lapack_int n = A.cols();
    if (m < n) {
        cerr << "RRQR: Number of columns must not be greater than that of rows!" << endl;
        exit(1);
    }
    lapack_int job = 3;
    lapack_int k = m;
    double* Ap = A.data();
    Q = MatrixXd::Identity(m,m);
    double* Qp = Q.data();
    lapack_int* jpvt = new lapack_int[n];
    double ircond = sf*numeric_limits<double>::epsilon()*sqrt(n);
    double orcond;
    lapack_int rank;
    double svlues[4];
    
    // Optimize workspace
    double wt;
    lapack_int info;
    lapack_int minusone = -1;
    DGEQRF(&m,&n,Ap,&m,nullptr,&wt,&minusone,&info);
    if (info < 0) {
        cerr << "RRQR: Error in workspace query!?!" << endl;
        exit(1);
    }
    lapack_int nb = static_cast<lapack_int>(wt)/n;
    lapack_int lwork = 4*n+m;
    lapack_int lwork2 = 2*n+ nb*(nb+m);
    if (lwork2 > lwork) lwork = lwork2;
    double * work = new double[lwork];
    
    // RUN
    DGEQPX(&job,&m,&n,&k,Ap,&m,Qp,&m,jpvt,&ircond,&orcond,&rank,svlues,work,&lwork,&info);
    if (info < 0) {
        cerr << "RRQR: Error in call! info = " << info << endl;
        exit(1);
    }
    if (info > 0) {
        cerr << "RRQR: Method failed with info = " << info << endl;
        exit(1);
    }
    delete [] work;
    delete [] jpvt;
    
    r = rank;
}
#endif

// Here, we estimate the singular values of the upper triangular matrix A using
// the incremental condition estimator by Bischof as implemented in LAPACK.
// Unfortunately, there is no LAPACKE interface available. So we must stick to
// Fortran!
VectorXd sigmaest(const MatrixXd& A, const VectorXd& rho) {
    LSCMint n = A.cols();
    VectorXd smin(n);
    double *wmin, *wmax;
    wmin = new double[n];
    wmax = new double[n];
    // init
    double smax = abs(rho(0));
    smin(0) = smax;
    wmin[0] = 1.0;
    wmax[0] = 1.0;
    // GO!
    double smaxnew, s, c;
    LSCMint job;
    for (LSCMint j = 1; j < n; ++j) {
        job = 1;
        DLAIC1(&job,&j,wmax,&smax,A.col(j).data(),&rho(j),&smaxnew,&s,&c);
        cblas_dscal(j,s,wmax,1);
        wmax[j] = c;
        smax = smaxnew;
        job = 2;
        DLAIC1(&job,&j,wmin,&smin(j-1),A.col(j).data(),&rho(j),&smin(j),&s,&c);
        cblas_dscal(j,s,wmin,1);
        wmin[j] = c;
    }
    delete [] wmax;
    delete [] wmin;

    smin(0) = smax;
    return smin;
}

// Computes an ONB of the image of A and its orthogonal complement.
// We follow essentially the pseudocode given in Kielbasinski/Schwetlick
// but use pivoting strategies from Golub/Björck. LINPACK strategies are
// adopted.

// NOTE: We assume the matrix not to be the zero matrix! Neither that
//       A is an empty matrix! Must be secured in the caller!
// NOTE: There must be a strong gap between nonzero and zero
//       diagonal elements.
void ContinuousQR(MatrixXd& A, Matrix<LSCMint,Dynamic,1>& perm, MatrixXd& Q,
                        LSCMint& rank, double const sf, const bool init) {
    // The init argument is provided in order to allow for the QR
    // decomposition of a nearby matrix when the procedure was called
    // with init = true before. If init = false, the permutation perm
    // is used without being recomputed. Moreover, the rank is not allowed
    // to change which will be checked. The sign of perm(k) is used to
    // indicate if the sign of rho(k) should be changed for stabilitylapack_
    // reasons.
    //
    // NOTE: A will be overwritten with the R factor!
    
    // Note: The following implementation is far from being optimal
    //       since Eigen does always an index check even for element
    //       access! Moreover, there is a lot of constructors/destructors.
    LSCMint m = A.rows();
    LSCMint n = A.cols();
    LSCMint l = min(m-1,n);
    if (m < n) {
        cerr << "ContinuousQR: Number of columns must not be greater than that of rows!" << endl;
        exit(1);
    }
    if (!init) {
        if (perm.size() != n ||
            rank <= 0 || rank > n) {
            cerr << "ContinuousQR: Invalid call with init = false" << endl;
            exit(1);
        }
    }
    
    // Variales for pivoting
    VectorXd sold;
    VectorXd sact;
    if (init) {
        sold = VectorXd(n);
        for (LSCMint k = 0; k < n; ++k) sold(k) = A.col(k).norm();
        sact = sold;
        perm = VectorXi::LinSpaced(n,1,n);
    }
    
    // Diagonal elements of the R-factor
    VectorXd rho = VectorXd::Zero(n);
    
    // Orthogonalisation
    for (LSCMint k = 0; k < l; ++k) {
        LSCMint lam;
        if (init) {
            // Update column norms? Check correctness with Linpack!
            if (k > 0) {
                for (LSCMint j = k; j < n; ++j) {
                    double fact = A(k-1,j)/sact(j);
                    sact(j) *= sqrt(1.0-fact*fact);
                    if (sact(j) <= sqrt(numeric_limits<double>::epsilon())*sold(j)) {
                        sold(j) = A.col(j).tail(m-k).norm();
                        sact(j) = sold(j);
                    }
                }
            }

            // Pivoting
            double smax = 0.0;
            lam = k;
            for (LSCMint j = k; j < n; ++j) {
                if (smax < sact(j)) {
                    smax = sact(j);
                    lam = j;
                }
            }
            perm(k) = lam+1;
            if (smax == 0.0) break;  // Remainder is identical 0!
            if (k != lam) {
                double tmp = sold(k);
                sold(k) = sold(lam);
                sold(lam) = tmp;
                sact(lam) = sact(k);
                sact(k) = smax;
            }
        }
        else lam = abs(perm(k))-1;
        if (k != lam) {
            A.col(lam).swap(A.col(k));
            //VectorXd vtmp = A.col(k);
            //A.col(k) = A.col(lam);
            //A.col(lam) = vtmp;
        }

        // Apply Householder transformation
        // For security reasons, we do not use sact here!
        rho(k) = A.col(k).tail(m-k).norm();
        if (rho(k) == 0.0) continue;
        if (init) {
            if (A(k,k) > 0) {
                rho(k) = -rho(k);
                perm(k) = -perm(k);
            }
        }
        // The next statement assumes silently that possible changes of the
        // sign of A(k,k) in subsequent calls correspond to a small change in
        // the value of A(k,k) such that the algorithm remains stable.
        else 
            if (perm(k) < 0) rho(k) = -rho(k);
        A(k,k) = 1.0-A(k,k)/rho(k);
        A.col(k).tail(m-k-1) *= -1.0/rho(k);
        
        for (LSCMint j = k+1; j < n; ++j) {
            double beta = A.col(k).tail(m-k).transpose()*A.col(j).tail(m-k);
            A.col(j).tail(m-k) -= A.col(k).tail(m-k)*(beta/A(k,k));
        }
    }
    if (m == n) rho(n-1) = A(n-1,n-1);  // Is this correct?
    
    // Rank determination
    VectorXd sigma = sigmaest(A,rho);
    
    double threshold = sf*numeric_limits<double>::epsilon()*sqrt(n);
    if (init) {
        double piv = sigma(0);
        rank = n;
        for (LSCMint k = 0; k < n; ++k)
            if (sigma(k) <= threshold*piv) {
                rank = k;
                break;
            }
    }
    else {
        auto rhmax = sigma.head(rank).array().maxCoeff();
        auto rhmin = sigma.head(rank).array().minCoeff();
        if (rank < n) {
            auto rhminm = sigma.tail(n-rank).array().maxCoeff();
            if (rhminm > threshold*rhmax ||
                rhmax <= threshold*rhmin) {
                cerr << "ContinuousQR: Rank change!" << endl;
                exit(1);
            }
        }
        else {
            if (rhmax <= threshold*rhmin) {
                cerr << "ContinuousQR: Rank change!" << endl;
                exit(1);
            }
        }
    }
    
    // Prepare matrix Q
    Q.setIdentity(m,m);
    if (rank < l) l = rank;  // This is essential for continuous matrices!
    for (LSCMint k = l-1; k >= 0; --k) {
        if (A(k,k) != 0.0) {  // Should work in the present application!
            for (LSCMint j = k; j < m; ++j) {
                double beta = A.col(k).tail(m-k).transpose()*Q.col(j).tail(m-k);
                beta /= A(k,k);
                Q.col(j).tail(m-k) -= A.col(k).tail(m-k)*beta;
            }
        }
    }
}

// Preparation of the generalized Vandermondematrix
inline MatrixXd genV(const VectorXd& tau, LSCMint N) {
    auto x2 = tau*2.0;
    LSCMint M = tau.size();
    MatrixXd V(M,N+1);
    V.col(0) = VectorXd::Constant(M,1.0);
    if (N > 0) {
        V.col(1) = 0.5*x2;
        for (LSCMint k = 2; k <= N; ++k)
            V.col(k) = x2.cwiseProduct(V.col(k-1))-V.col(k-2);
    }
    return V;
}

// Preparation of the derivative of the generalized Vandermondematrix
inline MatrixXd gendV(const VectorXd& tau, LSCMint N) {
    auto x2 = tau*2.0;
    LSCMint M = tau.size();
    MatrixXd V(M,N+1);
    V.col(0) = VectorXd::Zero(M);
    if (N > 0) {
        V.col(1) = VectorXd::Constant(M,1.0);
        for (LSCMint k = 2; k <= N; ++k)
            V.col(k) = x2.cwiseProduct(V.col(k-1))-V.col(k-2);
    }
    for (LSCMint k = 2; k <= N; ++k) V.col(k) *= k;
    return V;
}


// Solve ||Aa|| --> min such that Ca = d
// TODO: A sparse version:
//  MatrixXd rlsq(const LSCMSparseMatrix& A, const LSCMSparseMatrix& C, const MatrixXd& d);
inline MatrixXd rlsq(const MatrixXd& A, const MatrixXd& C, const MatrixXd& d) {
    LSCMint m = C.rows();
    LSCMint dim = C.cols();
    LSCMint md = dim-m;
    LSCMint nv = d.cols();

    ColPivHouseholderQR<MatrixXd> qr(C);
    auto R = qr.matrixR();
    MatrixXd R1R = R.leftCols(m).triangularView<Upper>().solve(R.rightCols(md));
    MatrixXd R1d = R.leftCols(m).triangularView<Upper>().solve(qr.matrixQ().transpose()*d);
    auto Aperm = A*qr.colsPermutation();
    ColPivHouseholderQR<MatrixXd> aqr(Aperm.leftCols(m)*R1R-Aperm.rightCols(md));
    auto x2 = aqr.solve(Aperm.leftCols(m)*R1d);
    MatrixXd xtmp(dim,nv);
    xtmp.bottomRows(md) = x2;
    auto x1 = R1d-R1R*x2;
    xtmp.topRows(m) = x1;
    return qr.colsPermutation()*xtmp;
}

// NOTE: End of unnamed namespace
}

// The following algorithm is the main recursive procedure. For saving memory and
// speeding up the computation, it is transformed into an iterative version.
// This leads to an somehow unreadable implementation. A more readable version is
// also maintained, however, with a different interface. In particular, all arrays
// are handled in a Fortran77-like fashion (with the only exception C0,dC0).
MatrixXd AccurateIC::Cbasis(const vector<bool>& D, MatrixXd& E, MatrixXd& F,
                const VectorXd& scale) {
    // Determine size of Y (a basis of im E^\perp) and allocate
    Matrix<LSCMint,Dynamic,1> perm;  // For CI_CQR
    LSCMint m;
    LSCMint r = 0;
    
    // For result. Instead of immediate multiplication of the C_i matrices, this
    // approach saves some operations at the cost of saving all C_i.
    forward_list<MatrixXd> Cmatlist;
    MatrixXd C;
    
    // Check rank of E and initialize Y and Z
    m = D.size();
    MatrixXd Y;
    MatrixXd Z;
    // The following statement would be necessary for a general matrix function E(t).
    // However, we have the special situation E = D^T(A^T) with D = [I,0]. Hence, Y and Z
    // can be explicitly provided and depend only on D!
    //genYZ(scale,E,F,m,Y,Z,r,true);
    for (LSCMint i = 0; i < m; ++i)
        if (D[i]) ++r;
    {  // This block is intended to save memory.
        MatrixXd I1 = MatrixXd::Zero(m,r);
        MatrixXd I2 = MatrixXd::Zero(m,m-r);
        LSCMint i1 = 0;
        LSCMint i2 = 0;
        for (LSCMint i = 0; i < m; ++i) {
            if (D[i]) {
                I1(i,i1) = 1.0;
                ++i1;
            }
            else {
                I2(i,i2) = 1.0;
                ++i2;
            }
        }
        Y = MatrixXd(M,m*r);
        Z = MatrixXd(M,m*(m-r < r ? r : m-r));
        for (LSCMint i = 0; i < M; ++i) {
            Y.row(i).head(m*r) = I1.reshaped();
            Z.row(i).head(m*(m-r)) = I2.reshaped();
        }
    }

    while (m != r) {
        LSCMint mm = m*m;
        LSCMint mr = m*r;
        LSCMint mmr = m*(m-r);
        // Check Print

        // Prepare the next recursion step
        ++mu;
        // Need C0 and its derivative
        LSCMint rii;  // For CI_CQR
        MatrixXd C0(M,mr);  // Memory access can be optimized further (alloc in genYZ)
        
        switch (rankdet) {
            case CI_SVD:
                for (LSCMint i = 0; i < M; ++i) {
                    MatrixXd ZTmFi = Z.row(i).head(mmr).reshaped(m,m-r).transpose()*
                            F.row(i).head(mm).reshaped(m,m);
                    JacobiSVD<MatrixXd> svd;
                    svd.compute(ZTmFi,ComputeFullV);
                    svd.setThreshold(sf*numeric_limits<double>::epsilon()*m); // Ad hoc!
                    if (svd.rank() != m-r) {
                        cerr << "AccurateIC: Rank mismatch in F!" << endl;
                        exit(1);
                    }
                    C0.row(i) = svd.matrixV().rightCols(r).reshaped();
                }
                // This is essential since Z does not need to be smooth.
                Smoothing(C0,m,r);
            break;
            case CI_QR:
            {
                MatrixXd Q;
                MatrixXd ZTmFiT = (Z.row(iloc).head(mmr).reshaped(m,m-r).transpose()*
                            F.row(iloc).head(mm).reshaped(m,m)).transpose();
                ContinuousQR(ZTmFiT,perm,Q,rii,sf,true);
                C0.row(iloc) = Q.rightCols(r).reshaped();
                for (LSCMint i = 0; i < M; ++i) 
                    if (i != iloc) {
                        ZTmFiT = (Z.row(i).head(mmr).reshaped(m,m-r).transpose()*
                            F.row(i).head(mm).reshaped(m,m)).transpose();
                        ContinuousQR(ZTmFiT,perm,Q,rii,sf,false);
                        C0.row(i) = Q.rightCols(r).reshaped();
                    }
            }
            break;
            case CI_UTV:
            {
                MatrixXd Q;
                for (LSCMint i = 0; i < M; ++i) {
                    MatrixXd ZTmFiT = (Z.row(i).head(mmr).reshaped(m,m-r).transpose()*
                            F.row(i).head(mm).reshaped(m,m)).transpose();
                    UTV(ZTmFiT,Q,rii,sf);
                    if (rii != m-r) {
                        cerr << "AccurateIC: Rank mismatch in F!" << endl;
                        exit(1);
                    }
                    C0.row(i) = Q.rightCols(r).reshaped();
                }
                Smoothing(C0,m,r);
            }
            break;
#ifdef USE_RRQR
            case CI_RRQR:
            {
                MatrixXd Q;
                for (LSCMint i = 0; i < M; ++i) {
                    MatrixXd ZTmFiT = (Z.row(i).head(mmr).reshaped(m,m-r).transpose()*
                            F.row(i).head(mm).reshaped(m,m)).transpose();
                    RRQR(ZTmFiT,Q,rii,sf);
                    if (rii != m-r) {
                        cerr << "AccurateIC: Rank mismatch in F!" << endl;
                        exit(1);
                    }
                    C0.row(i) = Q.rightCols(r).reshaped();
                }
                Smoothing(C0,m,r);
            }
            break;
#endif
        }

        MatrixXd dC0 = dC0eval(C0);

        Cmatlist.push_front(C0.row(iloc).reshaped(m,r));

        // Recursion: Matrix update
        LSCMint rr = r*r;
        for (LSCMint i = 0; i < M; ++i) {
            F.row(i).head(rr) = (Y.row(i).head(mr).reshaped(m,r).transpose()*
                     (F.row(i).head(mm).reshaped(m,m)*C0.row(i).reshaped(m,r)+
                      E.row(i).head(mm).reshaped(m,m)*dC0.row(i).reshaped(m,r))).reshaped();
            E.row(i).head(rr) = (Y.row(i).head(mr).reshaped(m,r).transpose()*
                     E.row(i).head(mm).reshaped(m,m)*
                     C0.row(i).reshaped(m,r)).reshaped();
        }
        m = r;
        
        genYZ(scale,E,F,m,Y,Z,r,false);
    }
    
    // Generate basis
    if (mu >= N)
        cerr << "Warning AccurateIC: Interpolation degree seems too low." << endl;
    C = MatrixXd::Identity(m,m);
    for (auto i = Cmatlist.begin(); i != Cmatlist.end(); ++i)
        C  = *i*C;

    return C;
}

void AccurateIC::execComp()
{
    // Check input data
    if (H_ <= 0) {
        cerr << "AccurateIC: H is not positiv!" << endl;
        exit(1);
    }
    
    // Preparation of collocation nodes and related data
    switch (loc_) {
        case CI_CENTER:
            if (!((nodes->getProperties() & QuadratureRule::QR_SYMMETRIC) &&
                (nodes->getProperties() & QuadratureRule::QR_CENTERED))) {
                cerr << "AccurateIC: Location CI_CENTER requires QR_SYMMETRIC and QR_CENTERED" << endl;
            exit(1);
            }
            a = t_-H_*0.5;
            b = t_+H_*0.5;
            iloc = M/2;
            break;
        case CI_LEFT:
            if(!(nodes->getProperties() & QuadratureRule::QR_LEFT)) {
                cerr << "AccurateIC: Location CI_LEFT requires QR_LEFT" << endl;
                exit(1);
            }
            a = t_;
            b = t_+H_;
            iloc = 0;
            break;
        case CI_RIGHT:
            if(!(nodes->getProperties() & QuadratureRule::QR_RIGHT)) {
                cerr << "AccurateIC: Location CI_RIGHT requires QR_RIGHT" << endl;
                exit(1);
            }
            a = t_-H_;
            b = t_;
            iloc = M-1;
            break;
    }
    
    // Preparation of input arrays
    // In a matlab notation, we will use a three-dimensional array. It
    // will be mapped (column first) to an Eigen matrix: E(m,m,M) becomes
    // E(M,m*m).
    const VectorXd& tau = nodes->getnodes();
    LSCMint m = (dae_->getD()).size();
    LSCMint mm = m*m;
    MatrixXd E(M,mm);
    MatrixXd F(M,mm);
    VectorXd scale(M);  //  For checking for a zero matrix
    for (LSCMint i = 0; i < M; ++i) {
        double tj = 0.5*H_*(tau(i)+1.0)+a;
        auto A = dae_->A(tj);
        auto B = dae_->B(tj);
        E.row(i) = -(A.transpose().reshaped());
        F.row(i) = B.transpose().reshaped();
        scale(i) = 0.0;
        for (LSCMint j = 0; j < m; ++j) {
            double enrm = A.row(j).norm();
            if (scale(i) < enrm) scale(i) = enrm;
        }
    }
    
    // Need the derivative of E.
    MatrixXd dE = dC0eval(E);

    // GO!
    mu = 0;
    F += dE;
    MatrixXd C = Cbasis(dae_->getD(),E,F,scale);
    
    // DONE
    Ga = C.transpose()*dae_->A(t_);
}

void AccurateIC::initObj(LSCMint N_, shared_ptr<QuadratureRule> nodes_) {
    nodes = nodes_;
    N = N_;
    M = (nodes->getnodes()).size();
    if (M <= N) {
        cerr << "AccurateIC: Number of collocation points too small!" << endl;
        exit(1);
    }

    // initialize technical data
    V = genV(nodes->getnodes(),N);
    dV = gendV(nodes->getnodes(),N);
    if (M == N+1) Dmat = (nodes->getDifferentiationMatrix());
    else {
        ColPivHouseholderQR<MatrixXd> Vqr(V);
        Dmat = dV*Vqr.solve(MatrixXd::Identity(M,M));
        // Apply null sum trick
        for (LSCMint i = 0; i < Dmat.rows(); ++i) {
            double sum = 0.0;
            for (LSCMint j = 0; j < Dmat.cols(); ++j)
                if (i != j) sum += Dmat(i,j);
            Dmat(i,i) = -sum;
        }
    }
}

double AccurateIC::compOpening(const MatrixXd& Ga_) {
     if (Ga_.rows() == 0) {
        if (Ga.rows() == 0) return 0.0;
        return 1.0;
    }
    JacobiSVD<MatrixXd> svdGa, svdGa_;
    svdGa.compute(Ga,ComputeFullV);
    svdGa_.compute(Ga_,ComputeFullV);
    auto rankGa = svdGa.rank();
    auto rankGa_ = svdGa_.rank(); 
    if (rankGa != rankGa_) return 1.0;
    auto dim = Ga.cols();
    JacobiSVD<MatrixXd>
            svd(svdGa_.matrixV().leftCols(rankGa).transpose()*
            svdGa.matrixV().rightCols(dim-rankGa));
    return (svd.singularValues())(0);   
}

// Clenshaw algorithm for the computation of values for given Chebyshev
// expansions
MatrixXd AccurateIC::Clenshaw(const MatrixXd& coeff) {
    auto x2 = nodes->getnodes()*2.0;
    
    MatrixXd b2 = VectorXd::Constant(M,1.0)*coeff.row(N);
    if (N == 0) return b2;
    MatrixXd b1 = VectorXd::Constant(M,1.0)*coeff.row(N-1)+
                    x2.asDiagonal()*b2;
    if (N == 1) return b1;
    for (LSCMint k = N-2; k >=1; --k) {
        MatrixXd bk = VectorXd::Constant(M,1.0)*coeff.row(k)+
                x2.asDiagonal()*b1-b2;
        b2 = b1;
        b1 = bk;
    }
    return VectorXd::Constant(M,1.0)*coeff.row(0)+(0.5*x2).asDiagonal()*b1-b2;
}

// Clenshaw algorithm for the computation of values of derivatives for given
// Chebyshev expansions. Here we use the fact that T_n' = nU_{n-1}
// where T_n is the Chebyshev polynomial of the first kind while
// U_n is the Chebyshev polynomial of the second kind
MatrixXd AccurateIC::dClenshaw(const MatrixXd& coeff) {
    auto x2 = nodes->getnodes()*2.0;
    double rescale = 2.0/H_; // scaling the derivative from [-1,1] to [a,b]

    // The following algorithm uses U_{-1} = 0, U_0 = 1
    // In order to have a compact notation, we use matrix notation.
    // This might be optimized further (at the cost of readability)
    if (N == 0) return MatrixXd::Zero(M,coeff.cols());
    MatrixXd b2 = VectorXd::Constant(M,double(N)*rescale)*coeff.row(N);
    if (N == 1) return b2;
    MatrixXd b1 = VectorXd::Constant(M,double(N-1)*rescale)*coeff.row(N-1)+
                    x2.asDiagonal()*b2;
    if (N == 2) return b1;

    for (LSCMint k = N-2; k >=1; --k) {
        MatrixXd bk = VectorXd::Constant(M,double(k)*rescale)*coeff.row(k)+
                x2.asDiagonal()*b1-b2;
        b2 = b1;
        b1 = bk;
    }
    return b1;
}

// Compute the derivative of C0 with values at tau with a polynomial
// of degree M in a least squares sense. The polynomial is represented
// by Chebyshev polynomials of the first kind.
MatrixXd AccurateIC::dC0eval(const MatrixXd& C0)
{
    auto m2 = C0.cols();

    // Decide about the algorithm
    MatrixXd dC0 = MatrixXd::Zero(M,m2);
    for (LSCMint k = 0; k < M; ++k)
        for (LSCMint j = 0; j < M; ++j) {
            double fac = (2.0/H_)*Dmat(k,j);
            for (LSCMint i = 0; i < m2; ++i)
                dC0(k,i) += fac*(C0(j,i)-C0(k,i));
        }
    return dC0;
}

// Computation of smoothed bases by solving the differential equation
//     Uout' = P'*Uout, Uout(iloc) = U(iloc)
// Here, P = U*U^T  should be smooth.
void AccurateIC::Smoothing(MatrixXd& U, const LSCMint m, const LSCMint r) {
    LSCMint mr = m*r;
    LSCMint N1 = N+1;
    MatrixXd P(M,m*m);
    
    for (LSCMint i = 0; i < M; ++i) {
        auto Ui = U.row(i).head(mr).reshaped(m,r);
        P.row(i) = (Ui*Ui.transpose()).reshaped();
    }
    MatrixXd dP = dC0eval(P);
    
    // We solve the differential equation by a least-squares collocation
    // method
    // 1. Setup of linear least-squares problem
    double fac = 2.0/H_;
    MatrixXd A = MatrixXd::Zero((M-1)*m,N1*m);
    LSCMint ioff = 0;
    for (LSCMint i = 0; i < M; ++i) {
        if (i == iloc) continue;
        //MatrixXd Phik = kroneckerProduct(MatrixXd::Identity(m,m),V.row(i));
        MatrixXd Phik = MatrixXd::Zero(m,N1*m);
        LSCMint joff = 0;
        for (LSCMint j = 0; j < m; ++j) {
            Phik.block(j,joff,1,N1) = V.row(i);
            joff += N1;
        }
        //MatrixXd dPhik = kroneckerProduct(MatrixXd::Identity(m,m),fac*dV.row(i));
        MatrixXd dPhik = MatrixXd::Zero(m,N1*m);
        joff = 0;
        for (LSCMint j = 0; j < m; ++j) {
            dPhik.block(j,joff,1,N1) = fac*dV.row(i);
            joff += N1;
        }
        auto dPk = dP.row(i).reshaped(m,m);
        // The differential equation Uout' = (PP'-P'P)*Uout could also be implemented:
        //auto Pk = P.row(i).reshaped(m,m);
        //A.block(ioff,0,m,M*m) = dPhik-(Pk*dPk-dPk*Pk)*Phik;
        A.block(ioff,0,m,N1*m) = dPhik-dPk*Phik;
        ioff += m;
    }
    MatrixXd C = kroneckerProduct(MatrixXd::Identity(m,m),V.row(iloc));
    MatrixXd d = U.row(iloc).head(mr).reshaped(m,r);
    
    // 2. Solve ||Aa|| --> min such that Ca = d
    MatrixXd acoeff = rlsq(A,C,d);
    
    // 3. Evaluate the functions using the Clenshaw algorithm
    auto Uh = Clenshaw(acoeff.reshaped(N1,mr));
    for (LSCMint i = 0; i < M; ++i)
        U.row(i).head(mr) = Uh.row(i); // One copy too much! Can be saved by modifying Clenshaw.
}

void AccurateIC::genYZ(const VectorXd& scale, const MatrixXd& E,
                       const MatrixXd& F,
                       const LSCMint m,
                       MatrixXd& Y, MatrixXd& Z, LSCMint& r, bool init) {
    // Some init
    Matrix<LSCMint,Dynamic,1> perm;  // For CI_CQR
    LSCMint mm = m*m;
    r = -1;
        
    // Check for approximate zero matrix
    for (LSCMint i = 0; i < M; ++i) {
        auto Ei = E.row(i).head(mm).reshaped(m,m);
        double nrm = 0.0;
        for (LSCMint j = 0; j < m; ++j) {
            double tmp = Ei.col(j).norm();
            if (nrm < tmp) nrm = tmp;
        }
        if (nrm < sf*scale(i)*numeric_limits<double>::epsilon()) { // Dangerous!
            // E(i) is the zero matrix!
            LSCMint ri = 0;
            if (i == 0) {
                r = ri;
                if (init) {
                    Y = MatrixXd(M,0);
                    Z = MatrixXd(M,mm);
                }
            }
            else {
                if (ri != r) {
                    cerr << "AccurateIC: Rank change of E in interval!" << endl;
                    exit(1);
                }
            }
            Z.row(i).head(mm) = MatrixXd::Identity(m,m).reshaped();
        }
    }
    // If E is the zero function, r = 0 and Y, Z are set correctly. r > 0 cannot
    // happen.
    if (r == -1) {
        // Nonzero E.
        switch (rankdet) {
            case CI_SVD:
                for (LSCMint i = 0; i < M; ++i) {
                    auto Ei = E.row(i).head(mm).reshaped(m,m);
                    JacobiSVD<MatrixXd> svd;
                    svd.compute(Ei,ComputeFullU);
                    svd.setThreshold(sf*numeric_limits<double>::epsilon()*m); // Ad hoc!
                    LSCMint ri = svd.rank();
                    if (i == 0) {
                        r = ri;
                        if (init) {
                            Y = MatrixXd(M,m*r);
                            Z = MatrixXd(M,m*(m-r < r ? r : m-r));
                        }
                    }
                    else {
                        if (ri != r) {
                            cerr << "AccurateIC: Rank change of E in interval!" << endl;
                            exit(1);
                        }
                    }
                    auto U = svd.matrixU();
                    Y.row(i).head(m*r) = U.leftCols(r).reshaped();
                    Z.row(i).head(m*(m-r)) = U.rightCols(m-r).reshaped();
                }
                // There does not seem any need for Y and Z to be smooth!
                // However: No proof!
                Smoothing(Y,m,r);
                Smoothing(Z,m,m-r);
            break;
            case CI_QR:
            {
                MatrixXd Q;
                MatrixXd Ei = E.row(iloc).head(mm).reshaped(m,m);
                ContinuousQR(Ei,perm,Q,r,sf,true);
                if (init) {
                    Y = MatrixXd(M,m*r);
                    Z = MatrixXd(M,m*(m-r < r ? r : m-r));
                }
                Y.row(iloc).head(m*r) = Q.leftCols(r).reshaped();
                Z.row(iloc).head(m*(m-r)) = Q.rightCols(m-r).reshaped();
                for (LSCMint i = 0; i < M; ++i)
                    if (i != iloc) {
                        Ei = E.row(i).head(mm).reshaped(m,m);
                        ContinuousQR(Ei,perm,Q,r,sf,false);
                        Y.row(i).head(m*r) = Q.leftCols(r).reshaped();
                        Z.row(i).head(m*(m-r)) = Q.rightCols(m-r).reshaped();
                    }
            }
            break;
            case CI_UTV:
                for (LSCMint i = 0; i < M; ++i) {
                    MatrixXd Ei = E.row(i).head(mm).reshaped(m,m);
                    LSCMint ri;
                    MatrixXd Q;
                    UTV(Ei,Q,ri,sf);
                    if (i == 0) {
                        r = ri;
                        if (init) {
                            Y = MatrixXd(M,m*r);
                            Z = MatrixXd(M,m*(m-r < r ? r : m-r));
                        }
                    }
                    else {
                        if (ri != r) {
                            cerr << "AccurateIC: Rank change of E in interval!" << endl;
                            exit(1);
                        }
                    }
                    Y.row(i).head(m*r) = Q.leftCols(r).reshaped();
                    Z.row(i).head(m*(m-r)) = Q.rightCols(m-r).reshaped();
                }
                // There does not seem any need for Y and Z to be smooth!
                // However: No proof!
                Smoothing(Y,m,r);
                Smoothing(Z,m,m-r);
            break;
#ifdef USE_RRQR
            case CI_RRQR:
                for (LSCMint i = 0; i < M; ++i) {
                    MatrixXd Ei = E.row(i).head(mm).reshaped(m,m);
                    LSCMint ri;
                    MatrixXd Q;
                    RRQR(Ei,Q,ri,sf);
                    if (i == 0) {
                        r = ri;
                        if (init) {
                            Y = MatrixXd(M,m*r);
                            Z = MatrixXd(M,m*(m-r < r ? r : m-r));
                        }
                    }
                    else {
                        if (ri != r) {
                            cerr << "AccurateIC: Rank change of E in interval!" << endl;
                            exit(1);
                        }
                    }
                    Y.row(i).head(m*r) = Q.leftCols(r).reshaped();
                    Z.row(i).head(m*(m-r)) = Q.rightCols(m-r).reshaped();
                }
                // There does not seem any need for Y and Z to be smooth!
                // However: No proof!
                Smoothing(Y,m,r);
                Smoothing(Z,m,m-r);
            break;
#endif
        }
    }
}

