///
/// This file is part of Rheolef.
///
/// Copyright (C) 2000-2009 Pierre Saramito <Pierre.Saramito@imag.fr>
///
/// Rheolef is free software; you can redistribute it and/or modify
/// it under the terms of the GNU General Public License as published by
/// the Free Software Foundation; either version 2 of the License, or
/// (at your option) any later version.
///
/// Rheolef is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU General Public License for more details.
///
/// You should have received a copy of the GNU General Public License
/// along with Rheolef; if not, write to the Free Software
/// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
/// 
/// =========================================================================
// LDLt : factorization of symmetric matrix (sequential version)
// implementation: inspired by the Tim Davis "LDLT" library
//
// author: Pierre.Saramito@imag.fr
//
// date: 15 september 2014
//
#include "rheolef/ldlt_seq_builtin.h"

namespace rheolef {

// -------------------------------------------------------------
// ordering
// -------------------------------------------------------------
#ifndef  _RHEOLEF_HAVE_SUITESPARSE_AMD_H
// Natural (identity) ordering, used when SuiteSparse AMD is not available:
// fills perm with 0, 1, ..., perm.size()-1.
//   a    : the matrix to be factorized; it is not read by this variant.
//   perm : output permutation; its current size fixes the number of entries.
template<typename T>
inline
void ldlt_seq_builtin_ordering (
  const csr_rep<T,sequential>& a,
  std::valarray<size_t>&       perm)
{
  const size_t n_entry = perm.size();
  size_t index = 0;
  while (index < n_entry) {
    perm[index] = index;
    ++index;
  }
}
#else // _RHEOLEF_HAVE_SUITESPARSE_AMD_H
#include <suitesparse/amd.h>
// Generic fallback of the AMD ordering wrapper, selected for every index type
// that has no dedicated specialization.
// There is no AMD entry point to call for such a type, so the natural
// (identity) ordering is returned: leaving perm untouched would hand an
// invalid permutation to the caller.
//   n    : number of rows/columns of the matrix
//   ptr  : row pointer array (n+1 entries); not read by this fallback
//   idx  : column index array; not read by this fallback
//   perm : output array of n entries, filled with 0, 1, ..., n-1
template<typename Size>
inline
void
ordering_amd (
	Size        n,
	const Size* ptr,
	const Size* idx,
	Size*       perm)
{
  (void) ptr; // the sparsity pattern is irrelevant for the identity ordering
  (void) idx;
  for (Size i = 0; i < n; ++i)
    perm[i] = i;
}
// AMD ordering for `unsigned int` indices: forwards to amd_order, whose
// arguments are `int` based, so the index arrays are reinterpreted as `int`.
// A null control pointer is passed; the statistics written by AMD into the
// local info array are discarded.
// The returned status is only verified through assert.
template<>
inline
void
ordering_amd (
	unsigned int        n,
	const unsigned int* ptr,
	const unsigned int* idx,
	unsigned int*       perm)
{
  const int* amd_ptr  = reinterpret_cast<const int*>(ptr);
  const int* amd_idx  = reinterpret_cast<const int*>(idx);
  int*       amd_perm = reinterpret_cast<int*>(perm);
  double amd_stats [AMD_INFO];
  int amd_status = amd_order (n, amd_ptr, amd_idx, amd_perm, 0, amd_stats);
  assert (amd_status >= AMD_OK);
  (void) amd_status; // only read by the assert above
  // debugging helpers, left disabled:
  //amd_control (0);
  //amd_info (amd_stats);
}
// AMD ordering for `unsigned long` indices: forwards to amd_l_order, with the
// index arrays reinterpreted as `long`.
// A null control pointer is passed; the statistics written by AMD into the
// local info array are discarded.
// The returned status is only verified through assert.
template<>
inline
void
ordering_amd (
	unsigned long        n,
	const unsigned long* ptr,
	const unsigned long* idx,
	unsigned long*       perm)
{
  const long* amd_ptr  = reinterpret_cast<const long*>(ptr);
  const long* amd_idx  = reinterpret_cast<const long*>(idx);
  long*       amd_perm = reinterpret_cast<long*>(perm);
  double amd_stats [AMD_INFO];
  int amd_status = amd_l_order (n, amd_ptr, amd_idx, amd_perm, 0, amd_stats);
  assert (amd_status >= AMD_OK);
  (void) amd_status; // only read by the assert above
  // debugging helpers, left disabled:
  //amd_l_control (0);
  //amd_l_info (amd_stats);
}
template<typename T>
void
ldlt_seq_builtin_ordering (
  const csr_rep<T,sequential>&  a,
  std::valarray<size_t>&        perm)
{
    std::valarray<size_t> ptr (a.nrow()+1);
    std::valarray<size_t> idx (a.nnz());
    typename csr_rep<T,sequential>::const_iterator ia = a.begin();
    for (size_t i = 0, n = a.nrow(); i < n+1; ++i)
      ptr[i] = ia[i] - ia[0];
    size_t* iter_idx = &idx[0];
    for (typename csr_rep<T,sequential>::const_data_iterator p = ia[0], q = ia[a.nrow()]; p != q; ++p)
      *iter_idx++ = (*p).first;
    ordering_amd (perm.size(), &ptr[0], &idx[0], &perm[0]);
}
#endif // _RHEOLEF_HAVE_SUITESPARSE_AMD_H
// -------------------------------------------------------------
// numeric factorization
// -------------------------------------------------------------
template <typename T>
bool ldlt_seq_builtin_numeric (
  const csr_rep<T,sequential>& A,
  const std::valarray<size_t>& perm,
  const std::valarray<size_t>& iperm,
  const std::valarray<size_t>& parent,
  vec<T,sequential>&           D,
  csr_rep<T,sequential>&       L,
  std::valarray<size_t>&       L_nnz,
  std::valarray<size_t>&       mark,
  std::valarray<size_t>&       pattern,
  vec<T,sequential>&           Y)
{
  size_t top = 0;
  for (size_t k = 0, n = A.nrow(); k < n; k++) {
    // compute nonzero pattern of kth row of L, in topological order
    Y[k] = T(0);	    // Y(0:k) is now all zero
    top = n;		    // stack for pattern is empty
    mark[k] = k;	    // mark node k as visited
    L_nnz[k] = 0;	    // count of nonzeros in column k of L
    typename csr_rep<T,sequential>::const_iterator ia = A.begin();
    for (typename csr_rep<T,sequential>::const_data_iterator p = ia[perm[k]], q = ia[perm[k]+1]; p != q; ++p) {
      size_t i = iperm[(*p).first];     // get A(i,k)
      if (i <= k) {
        Y[i] += (*p).second;    // scatter a(i,k) into Y (sum duplicates)
	size_t len = 0;
	for (; mark[i] != k; i = parent[i]) {
	  pattern[len++] = i;   // L(k,i) is nonzero
	  mark[i] = k;	        // mark i as visited
	}
	while (len > 0) pattern[--top] = pattern[--len];
      }
    }
    // compute numerical values kth row of L (a sparse triangular solve)
    D[k] = Y[k];		    // get D(k,k) and clear Y(k)
    Y[k] = T(0);
    for (; top < n; top++) {
      size_t i = pattern[top];	    // pattern[top:n-1] is pattern of L(:,k) */
      T yi = Y[i];	            // get and clear Y(i) */
      Y[i] = 0.0;
      typename csr_rep<T,sequential>::iterator il = L.begin();
      typename csr_rep<T,sequential>::data_iterator p = il[i];
      for (typename csr_rep<T,sequential>::data_iterator q = il[i] + L_nnz[i]; p < q; ++p) {
	Y[(*p).first] -= (*p).second*yi;
      }
      T l_ki = yi/D[i];	    // the nonzero entry L(k,i)
      D[k] -= l_ki*yi;
      (*p).first  = k;	    // store L(k,i) in column form of L
      (*p).second = l_ki;
      L_nnz[i]++;	    // increment count of nonzeros in col i
    }
    if (D[k] == 0) {
      warning_macro ("ldlt: zero pivot founded at index="<<k<<" (n="<<n<<")");
      return false; // failure: D(k,k) is zero
    }
  }
  return true;
}
// -------------------------------------------------------------
// symbolic factorization
// -------------------------------------------------------------
#include <limits>
template <typename T>
void ldlt_seq_builtin_symbolic (
  const csr_rep<T,sequential>& A,
  const std::valarray<size_t>& perm,
  const std::valarray<size_t>& iperm,
  std::valarray<size_t>&       parent,
  std::valarray<size_t>&       L_nnz,
  std::valarray<size_t>&       mark)
{
  const size_t unset = std::numeric_limits<size_t>::max();
  for (size_t k = 0, n = A.nrow(); k < n; ++k) {
    // L(k,:) pattern: all nodes reachable in etree from nz in A(0:k-1,k)
    parent[k] = unset;	// parent of k is not yet known
    mark[k] = k;	// mark node k as visited
    L_nnz[k] = 0;	// count of nonzeros in column k of L
    typename csr_rep<T,sequential>::const_iterator ia = A.begin();
    for (typename csr_rep<T,sequential>::const_data_iterator p = ia[perm[k]], q = ia[perm[k]+1]; p != q; ++p) {
      // a(i,k) is nonzero
      size_t i = iperm[(*p).first];
      if (i >= k) continue;
      // follow path from i to root of etree, stop at marked node
      for (; mark[i] != k; i = parent[i]) {
	// find parent of i if not yet determined
        if (parent[i] == unset) parent[i] = k;
	L_nnz[i]++;	// L (k,i) is nonzero
	mark[i] = k;	// mark i as visited
      }
    }
  }
}
// -------------------------------------------------------------
// constructors
// -------------------------------------------------------------
// Default constructor: empty factors and permutation, status set to false
// and fp set to zero. No factorization is performed.
template <typename T>
ldlt_seq_builtin<T>::ldlt_seq_builtin()
  : L(),
    D(),
    perm(),
    status(false),
    fp(0)
{
}
// Construct from a matrix: starts from the same empty state as the default
// constructor, then factorizes a through update_values.
template <typename T>
ldlt_seq_builtin<T>::ldlt_seq_builtin (const csr_rep<T,sequential>& a)
  : L(),
    D(),
    perm(),
    status(false),
    fp(0)
{
  update_values (a);
}
// (Re)compute the LDLt factorization of the symmetric matrix a.
// Steps: fill-reducing permutation, symbolic pass (elimination tree and
// column counts of L), allocation of L, then numeric pass.
// On return:
//   perm   holds the permutation used by the factorization,
//   L, D   hold the factors,
//   fp     holds the operation count of the numeric pass of THIS call,
//   status holds the value returned by the numeric pass (false when a zero
//          pivot was met).
// An unsymmetric matrix is rejected through check_macro.
template <typename T>
void
ldlt_seq_builtin<T>::update_values (const csr_rep<T,sequential>& a) {
  check_macro(a.is_symmetric(), "ldlt: unsupported unsymmetric matrix");
  size_t n = a.nrow();
  // 0) permutation, and its inverse
  perm.resize(n);
  ldlt_seq_builtin_ordering (a, perm);
  std::valarray<size_t> iperm(n);
  for (size_t i = 0; i < n; ++i)
    iperm[perm[i]] = i;
  // 1) symbolic pass:
  std::valarray<size_t> parent(n), L_nnz(n), mark(n);
  ldlt_seq_builtin_symbolic (a, perm, iperm, parent, L_nnz, mark);
  // 1b) The floating-point operation count of the subsequent call to numeric pass
  // is the sum of (L_nnz[k])*(L_nnz[k] + 2) for k = 0 to n-1.
  // The counter is restarted from zero: this function may be called several
  // times on the same object and the count must not pile up across calls.
  size_t l_nnz = 0;
  fp = 0;
  for (size_t k = 0; k < n; k++) {
    l_nnz += L_nnz[k];
    fp    += L_nnz[k]*(L_nnz[k] + 2);
  }
  // 2) construct L.ptr index array from L_nnz column counts
  L.resize (n, n, l_nnz);
  typename csr_rep<T,sequential>::iterator il = L.begin(); // access to non-const iterator
  for (size_t k = 0; k < n; k++)
    il[k+1] = il[k] + L_nnz[k];
  // 3) numeric pass: L_nnz and mark are reused as work arrays
  D.resize(n);
  vec<T,sequential> Y(n);
  std::valarray<size_t> pattern(n);
  status = ldlt_seq_builtin_numeric (a, perm, iperm, parent, D, L, L_nnz, mark, pattern, Y);
}
// -------------------------------------------------------------
// solve
// -------------------------------------------------------------
// Solve the factorized system in place: on input b is the right-hand side, on
// output it holds the solution. Returns a reference to b.
// The stored factors are applied as: gather through perm, forward
// substitution with L, division by D, backward substitution with the
// transpose of L, scatter back through perm.
// NOTE(review): the `status` flag is not consulted here; presumably callers
// check it before solving -- confirm.
template <typename T>
template <typename M2>
vec<T,M2>&
ldlt_seq_builtin<T>::inplace_solve (vec<T,M2>& b) const {
  typedef typename csr_rep<T,sequential>::const_iterator      col_iterator;
  typedef typename csr_rep<T,sequential>::const_data_iterator entry_iterator;
  // gather: bp = b taken in the permuted order
  vec<T,M2> bp(b.size());
  for (size_t j = 0, n = b.size(); j < n; j++)
    bp[j] = b[perm[j]];
  col_iterator il = L.begin();
  // forward substitution, sweeping the columns of L
  for (size_t j = 0, n = bp.size(); j < n; ++j)
    for (entry_iterator p = il[j], q = il[j+1]; p != q; ++p)
      bp[(*p).first] -= (*p).second*bp[j];
  // diagonal scaling
  for (size_t j = 0, n = bp.size(); j < n; ++j)
    bp[j] /= D[j];
  // backward substitution; the counter stays unsigned and counts down from
  // size-1 to 0, so that no size_t value is squeezed into a signed type
  for (size_t j = bp.size(); j-- > 0; )
    for (entry_iterator p = il[j], q = il[j+1]; p != q; ++p)
      bp[j] -= (*p).second*bp[(*p).first];
  // scatter: back to the original numbering
  for (size_t j = 0, n = b.size(); j < n; j++)
    b[perm[j]] = bp[j];
  return b;
}
// Solve the factorized system for the right-hand side b, which is left
// untouched: a copy of b is solved in place and returned.
template <typename T>
template <typename M2>
vec<T,M2>
ldlt_seq_builtin<T>::solve (const vec<T,M2>& b) const {
  vec<T,M2> x (b);
  inplace_solve (x);
  return x;
}
// -------------------------------------------------------------
// io
// -------------------------------------------------------------
// Stream output: writes the factor L of c to s through L.put, and returns s.
// Only L is written by this operator.
template<typename T>
odiststream& operator<< (odiststream& s, const ldlt_seq_builtin<T>& c) {
  c.L.put (s);
  return s;
}
// ----------------------------------------------------------------------------
// instantiation in library
// ----------------------------------------------------------------------------
// Explicit instantiation of the class and of its stream output operator for
// the scalar type T.
#define _RHEOLEF_instanciation_class(T)					\
template class ldlt_seq_builtin<T>;					\
template odiststream& operator<< (odiststream& s, const ldlt_seq_builtin<T>& c);

// Explicit instantiation of the member template solve() for the scalar type T
// and the memory model M.
#define _RHEOLEF_instanciation_solve(T,M)				\
template vec<T,M> ldlt_seq_builtin<T>::solve (const vec<T,M>&) const;

// The distributed variant of solve() is instantiated only when MPI is enabled;
// otherwise this helper expands to nothing.
#ifdef _RHEOLEF_HAVE_MPI
#define _RHEOLEF_instanciation_solve_mpi(T)				\
	_RHEOLEF_instanciation_solve(T,distributed)
#else // _RHEOLEF_HAVE_MPI
#define _RHEOLEF_instanciation_solve_mpi(T)
#endif // _RHEOLEF_HAVE_MPI

// Everything that is instantiated for one scalar type T.
#define _RHEOLEF_instanciation(T)					\
	_RHEOLEF_instanciation_class(T)					\
	_RHEOLEF_instanciation_solve(T,sequential)			\
	_RHEOLEF_instanciation_solve_mpi(T)

_RHEOLEF_instanciation(Float)

#undef _RHEOLEF_instanciation_class
#undef _RHEOLEF_instanciation_solve
#undef _RHEOLEF_instanciation_solve_mpi
#undef _RHEOLEF_instanciation

} // namespace rheolef
