Program Listing for File LinAlg.hpp
↰ Return to documentation for file (nvcv_types/include/nvcv/cuda/math/LinAlg.hpp
)
/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef NVCV_CUDA_MATH_LINALG_HPP
#define NVCV_CUDA_MATH_LINALG_HPP
#include <nvcv/cuda/MathWrappers.hpp> // for cuda::max, etc.
#include <nvcv/cuda/TypeTraits.hpp> // for cuda::Require, etc.
#include <algorithm> // for std::swap, etc.
#include <cassert> // for assert, etc.
#include <cmath> // for std::pow, etc.
#include <cstdlib> // for std::size_t, etc.
#include <initializer_list> // for std::initializer_list, etc.
#include <ostream> // for std::ostream, etc.
#include <vector> // for std::vector, etc.
namespace nvcv::cuda::math {
template<class T, int N>
class Vector
{
public:
// Type of values in this vector.
using Type = T;
constexpr __host__ __device__ void load(const T *inVector)
{
#pragma unroll
for (int i = 0; i < N; ++i)
{
m_data[i] = inVector[i];
}
}
constexpr __host__ __device__ void load(std::initializer_list<T> l)
{
load(std::data(l));
}
constexpr __host__ __device__ void store(T *outVector) const
{
#pragma unroll
for (int i = 0; i < N; ++i)
{
outVector[i] = m_data[i];
}
}
constexpr __host__ __device__ int size() const
{
return N;
}
constexpr const __host__ __device__ T &operator[](int i) const
{
assert(i >= 0 && i < size());
return m_data[i];
}
constexpr __host__ __device__ T &operator[](int i)
{
assert(i >= 0 && i < size());
return m_data[i];
}
constexpr __host__ __device__ operator const T *() const
{
return &m_data[0];
}
constexpr __host__ __device__ operator T *()
{
return &m_data[0];
}
constexpr const __host__ __device__ T *cbegin() const
{
return &m_data[0];
}
constexpr __host__ __device__ T *begin()
{
return &m_data[0];
}
constexpr const __host__ __device__ T *cend() const
{
return &m_data[0] + size();
}
constexpr __host__ __device__ T *end()
{
return &m_data[0] + size();
}
std::vector<T> to_vector() const
{
return std::vector<T>(cbegin(), cend());
}
template<int R>
constexpr __host__ __device__ Vector<T, R> subv(int beg) const
{
Vector<T, R> v;
for (int i = beg; i < beg + R; ++i)
{
v[i - beg] = m_data[i];
}
return v;
}
// On-purpose public data to allow POD-class direct initialization.
T m_data[N];
};
template<class T, int M, int N = M>
class Matrix
{
public:
// Type of values in this matrix.
using Type = T;
constexpr __host__ __device__ void load(const T *inFlattenMatrix)
{
int idx = 0;
#pragma unroll
for (int i = 0; i < M; ++i)
{
#pragma unroll
for (int j = 0; j < N; ++j)
{
m_data[i][j] = inFlattenMatrix[idx++];
}
}
}
constexpr __host__ __device__ void load(std::initializer_list<T> l)
{
load(std::data(l));
}
constexpr __host__ __device__ void store(T *outFlattenMatrix) const
{
int idx = 0;
#pragma unroll
for (int i = 0; i < M; ++i)
{
#pragma unroll
for (int j = 0; j < N; ++j)
{
outFlattenMatrix[idx++] = m_data[i][j];
}
}
}
constexpr __host__ __device__ int rows() const
{
return M;
}
constexpr __host__ __device__ int cols() const
{
return N;
}
constexpr const __host__ __device__ Vector<T, N> &operator[](int i) const
{
assert(i >= 0 && i < rows());
return m_data[i];
}
constexpr __host__ __device__ Vector<T, N> &operator[](int i)
{
assert(i >= 0 && i < rows());
return m_data[i];
}
constexpr const __host__ __device__ T &operator[](int2 c) const
{
assert(c.y >= 0 && c.y < rows());
assert(c.x >= 0 && c.x < cols());
return m_data[c.y][c.x];
}
constexpr __host__ __device__ T &operator[](int2 c)
{
assert(c.y >= 0 && c.y < rows());
assert(c.x >= 0 && c.x < cols());
return m_data[c.y][c.x];
}
constexpr __host__ __device__ Vector<T, M> col(int j) const
{
Vector<T, M> c;
#pragma unroll
for (int i = 0; i < rows(); ++i)
{
c[i] = m_data[i][j];
}
return c;
}
constexpr __host__ __device__ void set_col(int j, T v)
{
#pragma unroll
for (int i = 0; i < rows(); ++i)
{
m_data[i][j] = v;
}
}
constexpr __host__ __device__ void set_col(int j, const Vector<T, M> &c)
{
#pragma unroll
for (int i = 0; i < rows(); ++i)
{
m_data[i][j] = c[i];
}
}
// @overload void set_col(int j, const T *c)
constexpr __host__ __device__ void set_col(int j, const T *c)
{
#pragma unroll
for (int i = 0; i < rows(); ++i)
{
m_data[i][j] = c[i];
}
}
constexpr __host__ __device__ Matrix<T, M - 1, N - 1> subm(int skip_i, int skip_j) const
{
Matrix<T, M - 1, N - 1> ret;
int ri = 0;
for (int i = 0; i < rows(); ++i)
{
if (i == skip_i)
{
continue;
}
int rj = 0;
for (int j = 0; j < cols(); ++j)
{
if (j == skip_j)
{
continue;
}
ret[ri][rj] = (*this)[i][j];
++rj;
}
++ri;
}
return ret;
}
// On-purpose public data to allow POD-class direct initialization.
Vector<T, N> m_data[M];
};
namespace detail {
template<class T>
constexpr __host__ __device__ void swap(T &a, T &b)
{
#ifdef __CUDA_ARCH__
T c = a;
a = b;
b = c;
#else
std::swap(a, b);
#endif
}
} // namespace detail
// Vector-based operations -----------------------------------------------------
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> &operator+=(Vector<T, N> &lhs, const Vector<T, N> &rhs)
{
#pragma unroll
for (int j = 0; j < N; ++j)
{
lhs[j] += rhs[j];
}
return lhs;
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> operator+(const Vector<T, N> &a, const Vector<T, N> &b)
{
Vector<T, N> r(a);
return r += b;
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> &operator+=(Vector<T, N> &lhs, T rhs)
{
#pragma unroll
for (int j = 0; j < N; ++j)
{
lhs[j] += rhs;
}
return lhs;
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> operator+(const Vector<T, N> &a, T b)
{
Vector<T, N> r(a);
return r += b;
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> operator+(T a, const Vector<T, N> &b)
{
Vector<T, N> r(b);
return r += a;
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> &operator-=(Vector<T, N> &lhs, const Vector<T, N> &rhs)
{
return lhs += -rhs;
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> operator-(const Vector<T, N> &a, const Vector<T, N> &b)
{
Vector<T, N> r(a);
return r -= b;
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> &operator-=(Vector<T, N> &lhs, T rhs)
{
return lhs += static_cast<T>(-rhs);
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> operator-(const Vector<T, N> &a, T b)
{
Vector<T, N> r(a);
return r -= b;
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> operator-(T a, const Vector<T, N> &b)
{
Vector<T, N> r(-b);
return r += a;
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> &operator*=(Vector<T, N> &lhs, const T &rhs)
{
#pragma unroll
for (int j = 0; j < N; ++j)
{
lhs[j] *= rhs;
}
return lhs;
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> operator*(const Vector<T, N> &a, const T &b)
{
Vector<T, N> r(a);
return r *= b;
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> operator*(const T &a, const Vector<T, N> &b)
{
return b * a;
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> &operator*=(Vector<T, N> &lhs, const Vector<T, N> &rhs)
{
#pragma unroll
for (int j = 0; j < N; ++j)
{
lhs[j] *= rhs[j];
}
return lhs;
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> operator*(const Vector<T, N> &a, const Vector<T, N> &b)
{
Vector<T, N> r(a);
return r *= b;
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> &operator/=(Vector<T, N> &lhs, const T &rhs)
{
#pragma unroll
for (int j = 0; j < N; ++j)
{
lhs[j] /= rhs;
}
return lhs;
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> operator/(const Vector<T, N> &a, const T &b)
{
Vector<T, N> r(a);
return r /= b;
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> operator/(T a, const Vector<T, N> &b)
{
Vector<T, N> r(b);
#pragma unroll
for (int j = 0; j < N; ++j)
{
r[j] = a / r[j];
}
return r;
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> &operator/=(Vector<T, N> &lhs, const Vector<T, N> &rhs)
{
#pragma unroll
for (int j = 0; j < N; ++j)
{
lhs[j] /= rhs[j];
}
return lhs;
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> operator/(const Vector<T, N> &a, const Vector<T, N> &b)
{
Vector<T, N> r(a);
return r /= b;
}
template<class T, int N>
std::ostream &operator<<(std::ostream &out, const Vector<T, N> &v)
{
out << '[';
for (int i = 0; i < N; ++i)
{
out << v[i];
if (i < N - 1)
{
out << ' ';
}
}
return out << ']';
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> operator-(const Vector<T, N> &v)
{
Vector<T, N> r;
for (int i = 0; i < N; ++i)
{
r[i] = -v[i];
}
return r;
}
template<class T, int N>
constexpr __host__ __device__ bool operator==(const Vector<T, N> &a, const Vector<T, N> &b)
{
for (int i = 0; i < N; ++i)
{
if (a[i] != b[i])
{
return false;
}
}
return true;
}
template<class T, int N>
constexpr __host__ __device__ bool operator==(const T &a, const Vector<T, N> &b)
{
for (int i = 0; i < N; ++i)
{
if (a != b[i])
{
return false;
}
}
return true;
}
template<class T, int N>
constexpr __host__ __device__ bool operator==(const Vector<T, N> &a, const T &b)
{
for (int i = 0; i < N; ++i)
{
if (a[i] != b)
{
return false;
}
}
return true;
}
template<class T, int N>
constexpr __host__ __device__ bool operator<(const Vector<T, N> &a, const Vector<T, N> &b)
{
for (int i = 0; i < N; ++i)
{
if (a[i] != b[i])
{
return a[i] < b[i];
}
}
return false;
}
// Matrix-based operations -----------------------------------------------------
template<class T, int M, int N>
constexpr __host__ __device__ Matrix<T, M, N> operator*(const Matrix<T, M, N> &m, T val)
{
Matrix<T, M, N> r(m);
return r *= val;
}
template<class T, int M, int N>
std::ostream &operator<<(std::ostream &out, const Matrix<T, M, N> &m)
{
out << '[';
for (int i = 0; i < m.rows(); ++i)
{
for (int j = 0; j < m.cols(); ++j)
{
out << m[i][j];
if (j < m.cols() - 1)
{
out << ' ';
}
}
if (i < m.rows() - 1)
{
out << "\n";
}
}
return out << ']';
}
template<class T, int M, int N, int P>
constexpr __host__ __device__ Matrix<T, M, P> operator*(const Matrix<T, M, N> &a, const Matrix<T, N, P> &b)
{
Matrix<T, M, P> r;
#pragma unroll
for (int i = 0; i < M; ++i)
{
#pragma unroll
for (int j = 0; j < P; ++j)
{
r[i][j] = a[i][0] * b[0][j];
#pragma unroll
for (int k = 1; k < N; ++k)
{
r[i][j] += a[i][k] * b[k][j];
}
}
}
return r;
}
template<class T, int M, int N>
constexpr __host__ __device__ Matrix<T, M, N> &operator*=(Matrix<T, M, N> &lhs, T rhs)
{
#pragma unroll
for (int i = 0; i < lhs.rows(); ++i)
{
#pragma unroll
for (int j = 0; j < lhs.cols(); ++j)
{
lhs[i][j] *= rhs;
}
}
return lhs;
}
template<class T, int M, int N>
constexpr __host__ __device__ Matrix<T, M, N> operator*(T val, const Matrix<T, M, N> &m)
{
return m * val;
}
template<class T, int M, int N>
constexpr __host__ __device__ Matrix<T, M, N> &operator+=(Matrix<T, M, N> &lhs, const Matrix<T, M, N> &rhs)
{
#pragma unroll
for (int i = 0; i < M; ++i)
{
#pragma unroll
for (int j = 0; j < N; ++j)
{
lhs[i][j] += rhs[i][j];
}
}
return lhs;
}
template<class T, int M, int N>
constexpr __host__ __device__ Matrix<T, M, N> operator+(const Matrix<T, M, N> &lhs, const Matrix<T, M, N> &rhs)
{
Matrix<T, M, N> r(lhs);
return r += rhs;
}
template<class T, int M, int N>
constexpr __host__ __device__ Matrix<T, M, N> &operator-=(Matrix<T, M, N> &lhs, const Matrix<T, M, N> &rhs)
{
#pragma unroll
for (int i = 0; i < M; ++i)
{
#pragma unroll
for (int j = 0; j < N; ++j)
{
lhs[i][j] -= rhs[i][j];
}
}
return lhs;
}
template<class T, int M, int N>
constexpr __host__ __device__ Matrix<T, M, N> operator-(const Matrix<T, M, N> &a, const Matrix<T, M, N> &b)
{
Matrix<T, M, N> r(a);
return r -= b;
}
template<class T, int M, int N>
constexpr __host__ __device__ Vector<T, N> operator*(const Vector<T, M> &v, const Matrix<T, M, N> &m)
{
Vector<T, N> r;
#pragma unroll
for (int j = 0; j < m.cols(); ++j)
{
r[j] = v[0] * m[0][j];
#pragma unroll
for (int i = 1; i < m.rows(); ++i)
{
r[j] += v[i] * m[i][j];
}
}
return r;
}
template<class T, int M, int N>
constexpr __host__ __device__ Vector<T, M> operator*(const Matrix<T, M, N> &m, const Vector<T, N> &v)
{
Vector<T, M> r;
#pragma unroll
for (int i = 0; i < m.rows(); ++i)
{
r[i] = m[i][0] * v[0];
#pragma unroll
for (int j = 1; j < m.cols(); ++j)
{
r[i] += m[i][j] * v[j];
}
}
return r;
}
template<class T, int M, int N, class = cuda::Require<(M == N && N > 1)>>
constexpr __host__ __device__ Matrix<T, M, N> operator*(const Matrix<T, M, 1> &m, const Vector<T, N> &v)
{
Matrix<T, M, N> r;
#pragma unroll
for (int i = 0; i < r.rows(); ++i)
{
#pragma unroll
for (int j = 0; j < r.cols(); ++j)
{
r[i][j] = m[i][0] * v[j];
}
}
return r;
}
template<class T, int M, int N>
constexpr __host__ __device__ Vector<T, N> &operator*=(Vector<T, M> &v, const Matrix<T, M, M> &m)
{
return v = v * m;
}
template<class T, int M, int N>
constexpr __host__ __device__ Matrix<T, M, N> &operator*=(Matrix<T, M, N> &lhs, const Matrix<T, N, N> &rhs)
{
return lhs = lhs * rhs;
}
template<class T, int M, int N>
constexpr __host__ __device__ Matrix<T, M, N> operator-(const Matrix<T, M, N> &m)
{
Matrix<T, M, N> r;
for (int i = 0; i < M; ++i)
{
for (int j = 0; j < N; ++j)
{
r[i][j] = -m[i][j];
}
}
return r;
}
template<class T, int M, int N>
constexpr __host__ __device__ bool operator==(const Matrix<T, M, N> &a, const Matrix<T, M, N> &b)
{
for (int i = 0; i < M; ++i)
{
for (int j = 0; j < N; ++j)
{
if (a[i] != b[i])
{
return false;
}
}
}
return true;
}
template<class T, int M, int N>
constexpr __host__ __device__ bool operator==(const T &a, const Matrix<T, M, N> &b)
{
for (int i = 0; i < M; ++i)
{
for (int j = 0; j < N; ++j)
{
if (a != b[i])
{
return false;
}
}
}
return true;
}
template<class T, int M, int N>
constexpr __host__ __device__ bool operator==(const Matrix<T, M, N> &a, const T &b)
{
for (int i = 0; i < M; ++i)
{
for (int j = 0; j < N; ++j)
{
if (a[i] != b)
{
return false;
}
}
}
return true;
}
template<class T, int M, int N>
constexpr __host__ __device__ bool operator<(const Matrix<T, M, N> &a, const Matrix<T, M, N> &b)
{
for (int i = 0; i < M; ++i)
{
for (int j = 0; j < N; ++j)
{
if (a[i] != b[i])
{
return a[i] < b[i];
}
}
}
// if we reach here, it means that a==b
return false;
}
template<typename T, int N, int M>
constexpr Matrix<T, N, M> as_matrix(const T (&values)[N][M])
{
Matrix<T, N, M> m;
#pragma unroll
for (int i = 0; i < N; i++)
{
#pragma unroll
for (int j = 0; j < M; j++)
{
m[i][j] = values[i][j];
}
}
return m;
}
// Special matrices ------------------------------------------------------------
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> zeros()
{
Vector<T, N> v = {};
#pragma unroll
for (int j = 0; j < N; ++j)
{
v[j] = T{0};
}
return v;
}
template<class T, int M, int N>
constexpr __host__ __device__ Matrix<T, M, N> zeros()
{
Matrix<T, M, N> m = {};
#pragma unroll
for (int i = 0; i < M; ++i)
{
#pragma unroll
for (int j = 0; j < N; ++j)
{
m[i][j] = T{0};
}
}
return m;
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> ones()
{
Vector<T, N> v = {};
#pragma unroll
for (int j = 0; j < N; ++j)
{
v[j] = T{1};
}
return v;
}
template<class T, int M, int N>
constexpr __host__ __device__ Matrix<T, M, N> ones()
{
Matrix<T, M, N> m = {};
#pragma unroll
for (int i = 0; i < M; ++i)
{
#pragma unroll
for (int j = 0; j < N; ++j)
{
m[i][j] = T{1};
}
}
return m;
}
template<class T, int M, int N>
constexpr __host__ __device__ Matrix<T, M, N> identity()
{
Matrix<T, M, N> m = {};
#pragma unroll
for (int i = 0; i < M; ++i)
{
#pragma unroll
for (int j = 0; j < N; ++j)
{
m[i][j] = i == j ? 1 : 0;
}
}
return m;
}
template<class T, int M>
constexpr __host__ __device__ Matrix<T, M, M> vander(const Vector<T, M> &v)
{
Matrix<T, M, M> m = {};
for (int i = 0; i < M; ++i)
{
for (int j = 0; j < M; ++j)
{
m[i][j] = cuda::pow(v[j], i);
}
}
return m;
}
template<class T, int R>
constexpr __host__ __device__ Matrix<T, R, R> compan(const Vector<T, R> &a)
{
Matrix<T, R, R> m;
for (int i = 0; i < R; ++i)
{
for (int j = 0; j < R; ++j)
{
if (j == R - 1)
{
m[i][j] = -a[i];
}
else
{
m[i][j] = j == i - 1 ? 1 : 0;
}
}
}
return m;
}
template<class T, int M>
constexpr __host__ __device__ Matrix<T, M, M> diag(const Vector<T, M> &v)
{
Matrix<T, M, M> m = {};
for (int i = 0; i < M; ++i)
{
for (int j = 0; j < M; ++j)
{
m[i][j] = i == j ? v[i] : 0;
}
}
return m;
}
// Basic operations ------------------------------------------------------------
template<class T, int N>
constexpr __host__ __device__ T dot(const Vector<T, N> &a, const Vector<T, N> &b)
{
T d = a[0] * b[0];
#pragma unroll
for (int j = 1; j < N; ++j)
{
d += a[j] * b[j];
}
return d;
}
template<class T, int N>
constexpr __host__ __device__ Vector<T, N> reverse(const Vector<T, N> &a)
{
Vector<T, N> r = {};
#pragma unroll
for (int j = 0; j < N; ++j)
{
r[j] = a[N - 1 - j];
}
return r;
}
// Transformations -------------------------------------------------------------
template<class T, int M>
constexpr __host__ __device__ Matrix<T, M, M> &transp_inplace(Matrix<T, M, M> &m)
{
for (int i = 0; i < M; ++i)
{
for (int j = i + 1; j < M; ++j)
{
detail::swap(m[i][j], m[j][i]);
}
}
return m;
}
template<class T, int M, int N>
constexpr __host__ __device__ Matrix<T, N, M> transp(const Matrix<T, M, N> &m)
{
Matrix<T, N, M> tm = {};
#pragma unroll
for (int i = 0; i < M; ++i)
{
#pragma unroll
for (int j = 0; j < N; ++j)
{
tm[j][i] = m[i][j];
}
}
return tm;
}
template<class T, int N>
constexpr __host__ __device__ Matrix<T, N, 1> transp(const Vector<T, N> &v)
{
Matrix<T, N, 1> tv = {};
tv.set_col(0, v);
return tv;
}
template<class T, int M, int N>
constexpr __host__ __device__ Matrix<T, M, N> flip_rows(const Matrix<T, M, N> &m)
{
Matrix<T, M, N> f = {};
#pragma unroll
for (int i = 0; i < M; ++i)
{
#pragma unroll
for (int j = 0; j < N; ++j)
{
f[i][j] = m[M - 1 - i][j];
}
}
return f;
}
template<class T, int M, int N>
constexpr __host__ __device__ Matrix<T, M, N> flip_cols(const Matrix<T, M, N> &m)
{
Matrix<T, M, N> f = {};
#pragma unroll
for (int i = 0; i < M; ++i)
{
#pragma unroll
for (int j = 0; j < N; ++j)
{
f[i][j] = m[i][N - 1 - j];
}
}
return f;
}
template<class T, int M, int N>
constexpr __host__ __device__ Matrix<T, M, N> flip(const Matrix<T, M, N> &m)
{
Matrix<T, M, N> f = {};
#pragma unroll
for (int i = 0; i < M; ++i)
{
#pragma unroll
for (int j = 0; j < N; ++j)
{
f[i][j] = m[M - 1 - i][N - 1 - j];
}
}
return f;
}
template<int R, typename T, int M, int N, class = cuda::Require<R <= M>>
constexpr __host__ __device__ Matrix<T, R, N> head(const Matrix<T, M, N> &m)
{
Matrix<T, R, N> h;
#pragma unroll
for (int i = 0; i < R; ++i)
{
#pragma unroll
for (int j = 0; j < N; ++j)
{
h[i][j] = m[i][j];
}
}
return h;
}
template<int R, typename T, int M, int N, class = cuda::Require<R <= M>>
constexpr __host__ __device__ Matrix<T, R, N> tail(const Matrix<T, M, N> &m)
{
Matrix<T, R, N> t;
#pragma unroll
for (int i = 0; i < R; ++i)
{
#pragma unroll
for (int j = 0; j < N; ++j)
{
t[i][j] = m[M - R + i][j];
}
}
return t;
}
// Advanced operations ---------------------------------------------------------
// Linear-time invariant (LTI) filtering is a fundamental operation in signal and image processing. Many
// applications use LTI filters that can be expressed as linear, constant-coefficient difference equations.
// Functions below implement a convolution pass, i.e. finite impulse response (FIR), and a causal/anticausal
// combination of recursive filter passes, i.e. infinite impulse response (IIR), both defined by a set of weights.
// Definitions: input single element x, block b, length N; filter weights w, order R; prologue p; epilogue e.
// Illustrative example of N=11 and R=3 showing a block b in between previous and next blocks.
// |----------------- b -----------------|
// <previous block> | [ e0 e1 e2 ] x x x x x [ p0 p1 p2 ] | <next block>
// FIR + IIR filtering with causal combination is called forward; with anticausal combination is called reverse.
// Forward (fwd): y = w[0] * x - w[1] * p2 - w[2] * p1 - w[3] * p0
// The y passed in is considered to be: y = w[0] * x
// Reverse (rev): z = w[0] * y - w[1] * e0 - w[2] * e1 - w[3] * e2
// The z passed in is considered to be: z = w[0] * y
// Forward pass in a single element, updating prologue accordingly and returning result
template<typename T, int R>
constexpr __host__ __device__ T fwd1(Vector<T, R> &p, T y, const Vector<T, R + 1> &w)
{
y = y - p[R - 1] * w[1];
#pragma unroll
for (int k = R - 1; k >= 1; --k)
{
y = y - p[R - 1 - k] * w[k + 1];
p[R - 1 - k] = p[R - 1 - k + 1];
}
p[R - 1] = y;
return y;
}
// Forward pass in a block of N elements, updating prologue accordingly and in-place
template<typename T, int N, int R>
constexpr __host__ __device__ void fwdN(Vector<T, R> &p, Vector<T, N> &b, const Vector<T, R + 1> &w)
{
#pragma unroll
for (int k = 0; k < N; ++k)
{
b[k] = fwd1(p, w[0] * b[k], w);
}
}
// Forward-transpose pass over rows of a block of MxN elements, updating prologue accordingly and in-place
template<typename T, int M, int N, int R>
constexpr __host__ void fwdT(Matrix<T, M, R> &p, Matrix<T, M, N> &b, const Vector<T, R + 1> &w)
{
#pragma unroll
for (int i = 0; i < M; ++i)
{
fwdN(p[i], b[i], w);
}
}
// Forward pass over columns of a block of MxN elements, returning result
template<typename T, int M, int N, int R>
constexpr __host__ Matrix<T, M, N> fwd(const Matrix<T, R, N> &p, const Matrix<T, M, N> &b, const Vector<T, R + 1> &w)
{
Matrix<T, M, N> bout;
#pragma unroll
for (int j = 0; j < N; ++j)
{
Vector<T, R> pT = p.col(j);
#pragma unroll
for (int i = 0; i < M; ++i)
{
bout[i][j] = fwd1(pT, b[i][j] * w[0], w);
}
}
return bout;
}
// Reverse pass in a single element, updating epilogue accordingly and returning result
template<class T, int R>
constexpr __host__ __device__ T rev1(T z, Vector<T, R> &e, const Vector<T, R + 1> &w)
{
z = z - e[0] * w[1];
#pragma unroll
for (int k = R - 1; k >= 1; --k)
{
z = z - e[k] * w[k + 1];
e[k] = e[k - 1];
}
e[0] = z;
return z;
}
// Reverse pass in a block of N elements, updating prologue accordingly and in-place
template<typename T, int N, int R>
constexpr __host__ __device__ void revN(Vector<T, N> &b, Vector<T, R> &e, const Vector<T, R + 1> &w)
{
#pragma unroll
for (int k = N - 1; k >= 0; --k)
{
b[k] = rev1(w[0] * b[k], e, w);
}
}
// Reverse-transpose pass over rows of a block of MxN elements, updating prologue accordingly and in-place
template<typename T, int M, int N, int R>
constexpr __host__ void revT(Matrix<T, M, N> &b, Matrix<T, M, R> &e, const Vector<T, R + 1> &w)
{
#pragma unroll
for (int i = 0; i < M; ++i)
{
revN(b[i], e[i], w);
}
}
// Reverse pass over columns of a block of MxN elements, returning result
template<typename T, int M, int N, int R>
constexpr __host__ Matrix<T, M, N> rev(const Matrix<T, M, N> &b, const Matrix<T, R, N> &e, const Vector<T, R + 1> &w)
{
Matrix<T, M, N> bout;
#pragma unroll
for (int j = 0; j < N; ++j)
{
Vector<T, R> eT = e.col(j);
#pragma unroll
for (int i = M - 1; i >= 0; --i)
{
bout[i][j] = rev1(b[i][j] * w[0], eT, w);
}
}
return bout;
}
// Determinant -----------------------------------------------------------------
template<class T>
constexpr __host__ __device__ T det(const Matrix<T, 0, 0> &m)
{
return T{1};
}
template<class T>
constexpr __host__ __device__ T det(const Matrix<T, 1, 1> &m)
{
return m[0][0];
}
template<class T>
constexpr __host__ __device__ T det(const Matrix<T, 2, 2> &m)
{
return m[0][0] * m[1][1] - m[0][1] * m[1][0];
}
template<class T>
constexpr __host__ __device__ T det(const Matrix<T, 3, 3> &m)
{
return m[0][0] * (m[1][1] * m[2][2] - m[1][2] * m[2][1]) + m[0][1] * (m[1][2] * m[2][0] - m[1][0] * m[2][2])
+ m[0][2] * (m[1][0] * m[2][1] - m[1][1] * m[2][0]);
}
template<class T, int M>
constexpr __host__ __device__ T det(const Matrix<T, M, M> &m)
{
T d = T{0};
#pragma unroll
for (int i = 0; i < M; ++i)
{
d += ((i % 2 == 0 ? 1 : -1) * m[0][i] * det(m.subm(0, i)));
}
return d;
}
// LU decomposition & solve ----------------------------------------------------
// Do LU decomposition of matrix m using auxiliary pivot vector p and working type F (defaults to float)
template<class F = float, class T, int N>
constexpr __host__ __device__ bool lu_inplace(Matrix<T, N, N> &m, Vector<int, N> &p)
{
Vector<F, N> v = {};
#pragma unroll
for (int i = 0; i < N; ++i)
{
F big = 0;
#pragma unroll
for (int j = 0; j < N; ++j)
{
big = cuda::max<F>(big, cuda::abs(m[i][j]));
}
if (big == 0)
{
return false;
}
v[i] = 1.0 / big;
}
#pragma unroll
for (int k = 0; k < N; ++k)
{
F big = 0;
int imax = k;
#pragma unroll
for (int i = k; i < N; ++i)
{
F aux = v[i] * cuda::abs(m[i][k]);
if (aux > big)
{
big = aux;
imax = i;
}
}
if (k != imax)
{
detail::swap(m[imax], m[k]);
v[imax] = v[k];
}
p[k] = imax;
if (m[k][k] == 0)
{
return false;
}
#pragma unroll
for (int i = k + 1; i < N; ++i)
{
T aux = m[i][k] /= m[k][k];
#pragma unroll
for (int j = k + 1; j < N; ++j)
{
m[i][j] -= aux * m[k][j];
}
}
}
return true;
}
// Solve in-place using given LU decomposition lu and pivot p, the result x is returned in b
template<class T, int N>
constexpr __host__ __device__ void solve_inplace(const Matrix<T, N, N> &lu, const Vector<int, N> &p, Vector<T, N> &b)
{
int ii = -1;
#pragma unroll
for (int i = 0; i < N; ++i)
{
int ip = p[i];
T sum = b[ip];
b[ip] = b[i];
if (ii >= 0)
{
#pragma unroll
for (int j = ii; j < i; ++j)
{
sum -= lu[i][j] * b[j];
}
}
else if (sum != 0)
{
ii = i;
}
b[i] = sum;
}
#pragma unroll
for (int i = N - 1; i >= 0; --i)
{
T sum = b[i];
#pragma unroll
for (int j = i + 1; j < N; ++j)
{
sum -= lu[i][j] * b[j];
}
b[i] = sum / lu[i][i];
}
}
// Solve in-place m * x = b, where x is returned in b
template<class T, int N>
constexpr __host__ __device__ bool solve_inplace(const Matrix<T, N, N> &m, Vector<T, N> &b)
{
Vector<int, N> p = {};
Matrix<T, N, N> LU = m;
if (!lu_inplace(LU, p))
{
return false;
}
solve_inplace(LU, p, b);
return true;
}
// Matrix Inverse --------------------------------------------------------------
namespace detail {
// In this detail, all inverse (and in-place) functions use determinant d of the input matrix m
template<class T>
constexpr __host__ __device__ void inv_inplace(Matrix<T, 1, 1> &m, const T &d)
{
m[0][0] = T{1} / d;
}
template<class T>
constexpr __host__ __device__ Matrix<T, 1, 1> inv(const Matrix<T, 1, 1> &m, const T &d)
{
Matrix<T, 1, 1> A;
inv_inplace(A, d);
return A;
}
template<class T>
constexpr __host__ __device__ void inv_inplace(Matrix<T, 2, 2> &m, const T &d)
{
detail::swap(m[0][0], m[1][1]);
m[0][0] /= d;
m[1][1] /= d;
m[0][1] = -m[0][1] / d;
m[1][0] = -m[1][0] / d;
}
template<class T>
constexpr __host__ __device__ Matrix<T, 2, 2> inv(const Matrix<T, 2, 2> &m, const T &d)
{
Matrix<T, 2, 2> A = m;
inv_inplace(A, d);
return A;
}
template<class T>
constexpr __host__ __device__ Matrix<T, 3, 3> inv(const Matrix<T, 3, 3> &m, const T &d)
{
Matrix<T, 3, 3> A;
A[0][0] = (m[1][1] * m[2][2] - m[1][2] * m[2][1]) / d;
A[0][1] = -(m[0][1] * m[2][2] - m[0][2] * m[2][1]) / d;
A[0][2] = (m[0][1] * m[1][2] - m[0][2] * m[1][1]) / d;
A[1][0] = -(m[1][0] * m[2][2] - m[1][2] * m[2][0]) / d;
A[1][1] = (m[0][0] * m[2][2] - m[0][2] * m[2][0]) / d;
A[1][2] = -(m[0][0] * m[1][2] - m[0][2] * m[1][0]) / d;
A[2][0] = (m[1][0] * m[2][1] - m[1][1] * m[2][0]) / d;
A[2][1] = -(m[0][0] * m[2][1] - m[0][1] * m[2][0]) / d;
A[2][2] = (m[0][0] * m[1][1] - m[0][1] * m[1][0]) / d;
return A;
}
template<class T>
constexpr __host__ __device__ void inv_inplace(Matrix<T, 3, 3> &m, const T &d)
{
m = inv(m, d);
}
} // namespace detail
// Do inverse of matrix m asserting its success
template<class T, int N, class = cuda::Require<(N < 4)>>
constexpr __host__ __device__ Matrix<T, N, N> inv(const Matrix<T, N, N> &m)
{
T d = det(m);
assert(d != 0);
return detail::inv(m, d);
}
// Do inverse in-place of matrix m returning true if succeeded (m has determinant)
template<class T, int N, class = cuda::Require<(N < 4)>>
constexpr __host__ __device__ bool inv_inplace(Matrix<T, N, N> &m)
{
T d = det(m);
if (d == 0)
{
return false;
}
detail::inv_inplace(m, d);
return true;
}
// Do inverse in-place of matrix m returning out using LU decomposition written to m
template<class T, int M>
constexpr __host__ __device__ void inv_lu_inplace(Matrix<T, M, M> &out, Matrix<T, M, M> &m)
{
Vector<int, M> p = {};
bool validResult = lu_inplace(m, p);
assert(validResult);
if (!validResult)
{
return;
}
out = identity<T, M, M>();
#pragma unroll
for (int i = 0; i < M; ++i)
{
solve_inplace(m, p, out[i]);
}
transp_inplace(out);
}
// Do inverse in-place of matrix m using LU decomposition
template<class T, int M>
constexpr __host__ __device__ void inv_lu_inplace(Matrix<T, M, M> &m)
{
Matrix<T, M, M> res;
inv_lu_inplace(res, m);
m = res;
}
// Do inverse using LU decomposition
template<class T, int M>
constexpr __host__ __device__ Matrix<T, M, M> inv_lu(Matrix<T, M, M> m)
{
inv_lu_inplace(m);
return m;
}
} // namespace nvcv::cuda::math
#endif // NVCV_CUDA_MATH_LINALG_HPP