1#ifndef TAKANE_HDF5_SPARSE_MATRIX_HPP
2#define TAKANE_HDF5_SPARSE_MATRIX_HPP
4#include "ritsuko/ritsuko.hpp"
5#include "ritsuko/hdf5/hdf5.hpp"
8#include "utils_array.hpp"
9#include "utils_json.hpp"
29namespace compressed_sparse_matrix {
36inline std::array<uint64_t, 2> validate_shape(
const H5::Group& handle,
const Options&)
try {
37 auto shandle = ritsuko::hdf5::open_dataset(handle,
"shape");
38 if (ritsuko::hdf5::exceeds_integer_limit(shandle, 64,
false)) {
39 throw std::runtime_error(
"expected the datatype to be a subset of a 64-bit unsigned integer");
42 size_t len = ritsuko::hdf5::get_1d_length(shandle,
false);
44 throw std::runtime_error(
"expected the dataset to be of length 2");
47 std::array<uint64_t, 2> output;
48 shandle.read(output.data(), H5::PredType::NATIVE_UINT64);
51}
catch (std::exception& e) {
52 throw std::runtime_error(
"failed to validate sparse matrix shape at '" + ritsuko::hdf5::get_name(handle) +
"/shape'; " + std::string(e.what()));
55inline size_t validate_data(
const H5::Group& handle,
const Options&)
try {
56 auto dhandle = ritsuko::hdf5::open_dataset(handle,
"data");
58 auto type = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle,
"type");
59 if (type ==
"integer") {
60 if (ritsuko::hdf5::exceeds_integer_limit(dhandle, 32,
true)) {
61 throw std::runtime_error(
"expected an integer 'data' to fit inside a 32-bit signed integer");
63 }
else if (type ==
"boolean") {
64 if (ritsuko::hdf5::exceeds_integer_limit(dhandle, 32,
true)) {
65 throw std::runtime_error(
"expected a boolean 'data' to fit inside a 32-bit signed integer");
67 }
else if (type ==
"number") {
68 if (ritsuko::hdf5::exceeds_float_limit(dhandle, 64)) {
69 throw std::runtime_error(
"expected a number 'data' to fit inside a 64-bit float");
72 throw std::runtime_error(
"unknown matrix type '" + type +
"'");
75 if (dhandle.attrExists(
"missing-value-placeholder")) {
76 auto attr = dhandle.openAttribute(
"missing-value-placeholder");
77 ritsuko::hdf5::check_missing_placeholder_attribute(dhandle, attr);
80 return ritsuko::hdf5::get_1d_length(dhandle,
false);
81}
catch (std::exception& e) {
82 throw std::runtime_error(
"failed to validate sparse matrix data at '" + ritsuko::hdf5::get_name(handle) +
"/data'; " + std::string(e.what()));
85inline std::vector<uint64_t> validate_indptrs(
const H5::Group& handle,
size_t primary_dim,
size_t num_nonzero)
try {
86 auto dhandle = ritsuko::hdf5::open_dataset(handle,
"indptr");
87 if (ritsuko::hdf5::exceeds_integer_limit(dhandle, 64,
false)) {
88 throw std::runtime_error(
"expected datatype to be a subset of a 64-bit unsigned integer");
91 size_t len = ritsuko::hdf5::get_1d_length(dhandle,
false);
92 if (len != primary_dim + 1) {
93 throw std::runtime_error(
"dataset should have length equal to the primary dimension extent plus 1");
96 std::vector<uint64_t> indptrs(len);
97 dhandle.read(indptrs.data(), H5::PredType::NATIVE_UINT64);
99 if (indptrs[0] != 0) {
100 throw std::runtime_error(
"first entry should be zero");
102 if (indptrs.back() != num_nonzero) {
103 throw std::runtime_error(
"last entry should equal the number of non-zero elements");
106 for (
size_t i = 1; i < len; ++i) {
107 if (indptrs[i] < indptrs[i-1]) {
108 throw std::runtime_error(
"pointers should be sorted in increasing order");
113}
catch (std::exception& e) {
114 throw std::runtime_error(
"failed to validate sparse matrix pointers at '" + ritsuko::hdf5::get_name(handle) +
"/indptr'; " + std::string(e.what()));
117inline void validate_indices(
const H5::Group& handle,
const std::vector<uint64_t>& indptrs, uint64_t secondary_dim,
const Options& options)
try {
118 auto dhandle = ritsuko::hdf5::open_dataset(handle,
"indices");
119 if (ritsuko::hdf5::exceeds_integer_limit(dhandle, 64,
false)) {
120 throw std::runtime_error(
"expected datatype to be a subset of a 64-bit unsigned integer");
123 auto len = ritsuko::hdf5::get_1d_length(dhandle.getSpace(),
false);
124 if (indptrs.back() != len) {
125 throw std::runtime_error(
"dataset length should be equal to the number of non-zero elements (expected " + std::to_string(indptrs.back()) +
", got " + std::to_string(len) +
")");
128 size_t which_ptr = 0;
129 uint64_t last_index = 0;
130 hsize_t limit = indptrs[0];
131 ritsuko::hdf5::Stream1dNumericDataset<uint64_t> stream(&dhandle, len, options.hdf5_buffer_size);
133 for (hsize_t i = 0; i < len; ++i, stream.next()) {
134 auto x = stream.get();
135 if (x >= secondary_dim) {
136 throw std::runtime_error(
"out-of-range index (" + std::to_string(x) +
")");
145 limit = indptrs[which_ptr];
146 }
while (i == limit);
147 }
else if (last_index >= x) {
148 throw std::runtime_error(
"indices should be strictly increasing");
154}
catch (std::exception& e) {
155 throw std::runtime_error(
"failed to validate sparse matrix indices at '" + ritsuko::hdf5::get_name(handle) +
"/indices'; " + std::string(e.what()));
169 const auto& vstring = internal_json::extract_version_for_type(metadata.
other,
"compressed_sparse_matrix");
170 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(),
true);
171 if (version.major != 1) {
172 throw std::runtime_error(
"unsupported version '" + vstring +
"'");
175 auto handle = ritsuko::hdf5::open_file(path /
"matrix.h5");
176 auto ghandle = ritsuko::hdf5::open_group(handle,
"compressed_sparse_matrix");
177 auto layout = ritsuko::hdf5::open_and_load_scalar_string_attribute(ghandle,
"layout");
179 if (layout ==
"CSC") {
181 }
else if (layout !=
"CSR") {
182 throw std::runtime_error(
"'layout' attribute must be one of 'CSC' or 'CSR'");
185 auto shape = internal::validate_shape(ghandle, options);
186 size_t num_nonzero = internal::validate_data(ghandle, options);
187 auto indptrs = internal::validate_indptrs(ghandle, shape[primary], num_nonzero);
188 internal::validate_indices(ghandle, indptrs, shape[1 - primary], options);
190 if (ghandle.exists(
"names")) {
191 std::vector<hsize_t> dims(shape.begin(), shape.end());
192 internal_array::check_dimnames(ghandle,
"names", dims, options);
203 auto handle = ritsuko::hdf5::open_file(path /
"matrix.h5");
204 auto ghandle = ritsuko::hdf5::open_group(handle,
"compressed_sparse_matrix");
205 auto shandle = ritsuko::hdf5::open_dataset(ghandle,
"shape");
206 std::array<uint64_t, 2> output;
207 shandle.read(output.data(), H5::PredType::NATIVE_UINT64);
208 return output.front();
218 auto handle = ritsuko::hdf5::open_file(path /
"matrix.h5");
219 auto ghandle = ritsuko::hdf5::open_group(handle,
"compressed_sparse_matrix");
220 auto shandle = ritsuko::hdf5::open_dataset(ghandle,
"shape");
221 std::array<uint64_t, 2> output;
222 shandle.read(output.data(), H5::PredType::NATIVE_UINT64);
223 return std::vector<size_t>(output.begin(), output.end());
std::vector< size_t > dimensions(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition compressed_sparse_matrix.hpp:217
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition compressed_sparse_matrix.hpp:202
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition compressed_sparse_matrix.hpp:168
takane validation functions.
Definition _derived_from.hpp:15
Validation options.
Definition utils_public.hpp:94