/*
 * Copyright Nick Thompson, John Maddock 2020
 * Use, modification and distribution are subject to the
 * Boost Software License, Version 1.0. (See accompanying file
 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 */

#ifndef BOOST_MATH_SPECIAL_DAUBECHIES_SCALING_HPP
#define BOOST_MATH_SPECIAL_DAUBECHIES_SCALING_HPP

#include <cstdint>
#include <cstring>
#include <cmath>
#include <vector>
#include <array>
#include <thread>
#include <future>
#include <iostream>
#include <memory>
#include <boost/math/special_functions/detail/daubechies_scaling_integer_grid.hpp>
#include <boost/math/filters/daubechies.hpp>
#include <boost/math/interpolators/detail/cubic_hermite_detail.hpp>
#include <boost/math/interpolators/detail/quintic_hermite_detail.hpp>
#include <boost/math/interpolators/detail/septic_hermite_detail.hpp>

#include <boost/math/tools/is_standalone.hpp>
#ifndef BOOST_MATH_STANDALONE
#  include <boost/config.hpp>
#  ifdef BOOST_MATH_NO_CXX17_IF_CONSTEXPR
#    error "The header <boost/special_functions/daubechies_scaling.hpp> can only be used in C++17 and later."
#  endif
#endif

namespace boost::math {

template<class Real, int p, int order>
std::vector<Real> daubechies_scaling_dyadic_grid(int64_t j_max)
{
    using std::isnan;
    using std::sqrt;
    auto c = boost::math::filters::daubechies_scaling_filter<Real, p>();
    Real scale = sqrt(static_cast<Real>(2))*(1 << order);
    for (auto & x : c)
    {
        x *= scale;
    }

    auto phik = detail::daubechies_scaling_integer_grid<Real, p, order>();

    // Maximum sensible j for 32 bit floats is j_max = 22:
    if constexpr (std::is_same_v<Real, float>)
    {
        if (j_max > 23)
        {
            throw std::logic_error("Requested dyadic grid more dense than number of representables on the interval.");
        }
    }
    std::vector<Real> v(2*p + (2*p-1)*((1<<j_max) -1), std::numeric_limits<Real>::quiet_NaN());
    v[0] = 0;
    v[v.size()-1] = 0;
    for (int64_t i = 0; i < static_cast<int64_t>(phik.size()); ++i) {
        v[i*(1uLL<<j_max)] = phik[i];
    }

    for (int64_t j = 1; j <= j_max; ++j)
    {
        int64_t k_max = v.size()/(int64_t(1) << (j_max-j));
        for (int64_t k = 1; k < k_max;  k += 2)
        {
            // Where this value will go:
            int64_t delivery_idx = k*(1uLL << (j_max-j));
            // This is a nice check, but we've tested this exhaustively, and it's an expensive check:
            //if (delivery_idx >= static_cast<int64_t>(v.size())) {
            //    std::cerr << "Delivery index out of range!\n";
            //    continue;
            //}
            Real term = 0;
            for (int64_t l = 0; l < static_cast<int64_t>(c.size()); ++l)
            {
                int64_t idx = k*(int64_t(1) << (j_max - j + 1)) - l*(int64_t(1) << j_max);
                if (idx < 0)
                {
                    break;
                }
                if (idx < static_cast<int64_t>(v.size()))
                {
                    term += c[l]*v[idx];
                }
            }
            // Again, another nice check:
            //if (!isnan(v[delivery_idx])) {
            //    std::cerr << "Delivery index already populated!, = " << v[delivery_idx] << "\n";
            //    std::cerr << "would overwrite with " << term << "\n";
            //}
            v[delivery_idx] = term;
        }
    }
    return v;
}

namespace detail {

template<class RandomAccessContainer>
class matched_holder {
public:
    using Real = typename RandomAccessContainer::value_type;

    matched_holder(RandomAccessContainer && y, RandomAccessContainer && dydx, int grid_refinements, Real x0) : x0_{x0}, y_{std::move(y)}, dy_{std::move(dydx)}
    {
        inv_h_ = (1 << grid_refinements);
        Real h = 1/inv_h_;
        for (auto & dy : dy_)
        {
            dy *= h;
        }
    }

    inline Real operator()(Real x) const
    {
        using std::floor;
        using std::sqrt;
        // This is the exact Holder exponent, but it's pessimistic almost everywhere!
        // It's only exactly right at dyadic rationals.
        //Real const alpha = 2 - log(1+sqrt(Real(3)))/log(Real(2));
        // We're gonna use alpha = 1/2, rather than 0.5500...
        Real s = (x-x0_)*inv_h_;
        Real ii = floor(s);
        auto i = static_cast<decltype(y_.size())>(ii);
        Real t = s - ii;
        Real dphi = dy_[i+1];
        Real diff = y_[i+1] - y_[i];
        return y_[i] + (2*dphi - diff)*t + 2*sqrt(t)*(diff-dphi);
    }

    int64_t bytes() const
    {
        return 2*y_.size()*sizeof(Real) + sizeof(*this);
    }

private:
    Real x0_;
    Real inv_h_;
    RandomAccessContainer y_;
    RandomAccessContainer dy_;
};

template<class RandomAccessContainer>
class matched_holder_aos {
public:
    using Point = typename RandomAccessContainer::value_type;
    using Real = typename Point::value_type;

    matched_holder_aos(RandomAccessContainer && data, int grid_refinements, Real x0) : x0_{x0}, data_{std::move(data)}
    {
        inv_h_ = Real(1uLL << grid_refinements);
        Real h = 1/inv_h_;
        for (auto & datum : data_)
        {
            datum[1] *= h;
        }
    }

    inline Real operator()(Real x) const
    {
        using std::floor;
        using std::sqrt;
        Real s = (x-x0_)*inv_h_;
        Real ii = floor(s);
        auto i = static_cast<decltype(data_.size())>(ii);
        Real t = s - ii;
        Real y0 = data_[i][0];
        Real y1 = data_[i+1][0];
        Real dphi = data_[i+1][1];
        Real diff = y1 - y0;
        return y0 + (2*dphi - diff)*t + 2*sqrt(t)*(diff-dphi);
    }

    int64_t bytes() const
    {
        return data_.size()*data_[0].size()*sizeof(Real) + sizeof(*this);
    }

private:
    Real x0_;
    Real inv_h_;
    RandomAccessContainer data_;
};


template<class RandomAccessContainer>
class linear_interpolation {
public:
    using Real = typename RandomAccessContainer::value_type;

    linear_interpolation(RandomAccessContainer && y, RandomAccessContainer && dydx, int grid_refinements) : y_{std::move(y)}, dydx_{std::move(dydx)}
    {
        s_ = (1 << grid_refinements);
    }

    inline Real operator()(Real x) const
    {
        using std::floor;
        Real y = x*s_;
        Real k = floor(y);

        int64_t kk = static_cast<int64_t>(k);
        Real t = y - k;
        return (1-t)*y_[kk] + t*y_[kk+1];
    }

    inline Real prime(Real x) const
    {
        using std::floor;

        Real y = x*s_;
        Real k = floor(y);

        int64_t kk = static_cast<int64_t>(k);
        Real t = y - k;
        return static_cast<Real>((Real(1)-t)*dydx_[kk] + t*dydx_[kk+1]);
    }

    int64_t bytes() const
    {
        return (1 + y_.size() + dydx_.size())*sizeof(Real) + sizeof(y_) + sizeof(dydx_);
    }

private:
    Real s_;
    RandomAccessContainer y_;
    RandomAccessContainer dydx_;
};

template<class RandomAccessContainer>
class linear_interpolation_aos {
public:
    using Point = typename RandomAccessContainer::value_type;
    using Real = typename Point::value_type;

    linear_interpolation_aos(RandomAccessContainer && data, int grid_refinements, Real x0) : x0_{x0}, data_{std::move(data)}
    {
        s_ = Real(1uLL << grid_refinements);
    }

    inline Real operator()(Real x) const
    {
        using std::floor;
        Real y = (x-x0_)*s_;
        Real k = floor(y);

        int64_t kk = static_cast<int64_t>(k);
        Real t = y - k;
        return (t != 0) ? (1-t)*data_[kk][0] + t*data_[kk+1][0] : data_[kk][0];
    }

    inline Real prime(Real x) const
    {
        using std::floor;
        Real y = (x-x0_)*s_;
        Real k = floor(y);

        int64_t kk = static_cast<int64_t>(k);
        Real t = y - k;
        return t != 0 ? (1-t)*data_[kk][1] + t*data_[kk+1][1] : data_[kk][1];
    }

    int64_t bytes() const
    {
        return sizeof(*this) + data_.size()*data_[0].size()*sizeof(Real);
    }

private:
    Real x0_;
    Real s_;
    RandomAccessContainer data_;
};


template <class T>
struct daubechies_eval_type
{
    using type = T;

    static const std::vector<T>& vector_cast(const std::vector<T>& v) { return v; }

};
template <>
struct daubechies_eval_type<float>
{
    using type = double;

    inline static std::vector<float> vector_cast(const std::vector<double>& v)
    {
        std::vector<float> result(v.size());
        for (unsigned i = 0; i < v.size(); ++i)
            result[i] = static_cast<float>(v[i]);
        return result;
    }
};
template <>
struct daubechies_eval_type<double>
{
    using type = long double;

    inline static std::vector<double> vector_cast(const std::vector<long double>& v)
    {
        std::vector<double> result(v.size());
        for (unsigned i = 0; i < v.size(); ++i)
            result[i] = static_cast<double>(v[i]);
        return result;
    }
};

struct null_interpolator
{
    template <class T>
    T operator()(const T&)
    {
        return 1;
    }
};

} // namespace detail

template<class Real, int p>
class daubechies_scaling {
    //
    // Some type manipulation so we know the type of the interpolator, and the vector type it requires:
    //
    using vector_type = std::vector<std::array<Real, p < 6 ? 2 : p < 10 ? 3 : 4>>;
    //
    // List our interpolators:
    //
    using interpolator_list = std::tuple<
        detail::null_interpolator, detail::matched_holder_aos<vector_type>, detail::linear_interpolation_aos<vector_type>, 
        interpolators::detail::cardinal_cubic_hermite_detail_aos<vector_type>, interpolators::detail::cardinal_quintic_hermite_detail_aos<vector_type>,
        interpolators::detail::cardinal_septic_hermite_detail_aos<vector_type> >;
    //
    // Select the one we need:
    //
    using interpolator_type = std::tuple_element_t<
        p == 1 ? 0 :
        p == 2 ? 1 :
        p == 3 ? 2 :
        p <= 5 ? 3 :
        p <= 9 ? 4 : 5, interpolator_list>;

public:
    daubechies_scaling(int grid_refinements = -1)
    {
        static_assert(p < 20, "Daubechies scaling functions are only implemented for p < 20.");
        static_assert(p > 0, "Daubechies scaling functions must have at least 1 vanishing moment.");
        if constexpr (p == 1)
        {
            return;
        }
        else {
            if (grid_refinements < 0)
            {
                if constexpr (std::is_same_v<Real, float>)
                {
                if (grid_refinements == -2)
                {
                    // Control absolute error:
                    //                          p= 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
                    constexpr std::array<int, 20> r{ -1, -1, 18, 19, 16, 11,  8,  7,  7,  7,  5,  5,  4,  4,  4,  4,  3,  3,  3,  3 };
                    grid_refinements = r[p];
                }
                else
                {
                    // Control relative error:
                    //                          p= 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
                    constexpr std::array<int, 20> r{ -1, -1, 21, 21, 21, 17, 16, 15, 14, 13, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11 };
                    grid_refinements = r[p];
                }
                }
                else if constexpr (std::is_same_v<Real, double>)
                {
                    //                          p= 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
                    constexpr std::array<int, 20> r{ -1, -1, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 18, 18, 18, 18 };
                    grid_refinements = r[p];
                }
                else
                {
                    grid_refinements = 21;
                }
            }

            // Compute the refined grid:
            // In fact for float precision I know the grid must be computed in double precision and then cast back down, or else parts of the support are systematically inaccurate.
            std::future<std::vector<Real>> t0 = std::async(std::launch::async, [&grid_refinements]() {
                // Computing in higher precision and downcasting is essential for 1ULP evaluation in float precision:
                auto v = daubechies_scaling_dyadic_grid<typename detail::daubechies_eval_type<Real>::type, p, 0>(grid_refinements);
                return detail::daubechies_eval_type<Real>::vector_cast(v);
                });
            // Compute the derivative of the refined grid:
            std::future<std::vector<Real>> t1 = std::async(std::launch::async, [&grid_refinements]() {
                auto v = daubechies_scaling_dyadic_grid<typename detail::daubechies_eval_type<Real>::type, p, 1>(grid_refinements);
                return detail::daubechies_eval_type<Real>::vector_cast(v);
                });

            // if necessary, compute the second and third derivative:
            std::vector<Real> d2ydx2;
            std::vector<Real> d3ydx3;
            if constexpr (p >= 6) {
                std::future<std::vector<Real>> t3 = std::async(std::launch::async, [&grid_refinements]() {
                auto v = daubechies_scaling_dyadic_grid<typename detail::daubechies_eval_type<Real>::type, p, 2>(grid_refinements);
                return detail::daubechies_eval_type<Real>::vector_cast(v);
                });

                if constexpr (p >= 10) {
                std::future<std::vector<Real>> t4 = std::async(std::launch::async, [&grid_refinements]() {
                    auto v = daubechies_scaling_dyadic_grid<typename detail::daubechies_eval_type<Real>::type, p, 3>(grid_refinements);
                    return detail::daubechies_eval_type<Real>::vector_cast(v);
                    });
                d3ydx3 = t4.get();
                }
                d2ydx2 = t3.get();
            }


            auto y = t0.get();
            auto dydx = t1.get();

            if constexpr (p >= 2)
            {
                vector_type data(y.size());
                for (size_t i = 0; i < y.size(); ++i)
                {
                    data[i][0] = y[i];
                    data[i][1] = dydx[i];
                    if constexpr (p >= 6)
                        data[i][2] = d2ydx2[i];
                    if constexpr (p >= 10)
                        data[i][3] = d3ydx3[i];
                }
                if constexpr (p <= 3)
                    m_interpolator = std::make_shared<interpolator_type>(std::move(data), grid_refinements, Real(0));
                else
                    m_interpolator = std::make_shared<interpolator_type>(std::move(data), Real(0), Real(1) / (1 << grid_refinements));
            }
            else
                m_interpolator = std::make_shared<detail::null_interpolator>();
        }
    }

    inline Real operator()(Real x) const
    {
        if (x <= 0 || x >= 2*p-1)
        {
            return 0;
        }
        return (*m_interpolator)(x);
    }

    inline Real prime(Real x) const
    {
        static_assert(p > 2, "The 3-vanishing moment Daubechies scaling function is the first which is continuously differentiable.");
        if (x <= Real(0) || x >= 2*p-1)
        {
            return 0;
        }
        return m_interpolator->prime(x);
    }

    inline Real double_prime(Real x) const
    {
        static_assert(p >= 6, "Second derivatives require at least 6 vanishing moments.");
        if (x <= 0 || x >= 2*p - 1)
        {
            return Real(0);
        }
        return m_interpolator->double_prime(x);
    }

    std::pair<Real, Real> support() const
    {
        return {Real(0), Real(2*p-1)};
    }

    int64_t bytes() const
    {
        return m_interpolator->bytes() + sizeof(m_interpolator);
    }

private:
   std::shared_ptr<interpolator_type> m_interpolator;
};

}
#endif