takane
Validators for ArtifactDB file formats
Loading...
Searching...
No Matches
compressed_sparse_matrix.hpp
Go to the documentation of this file.
1#ifndef TAKANE_HDF5_SPARSE_MATRIX_HPP
2#define TAKANE_HDF5_SPARSE_MATRIX_HPP
3
4#include "ritsuko/ritsuko.hpp"
5#include "ritsuko/hdf5/hdf5.hpp"
6
7#include "utils_public.hpp"
8#include "utils_array.hpp"
9#include "utils_json.hpp"
10
11#include <filesystem>
12#include <stdexcept>
13#include <string>
14#include <cstdint>
15#include <array>
16#include <vector>
17
23namespace takane {
24
29namespace compressed_sparse_matrix {
30
34namespace internal {
35
36inline std::array<uint64_t, 2> validate_shape(const H5::Group& handle, const Options&) try {
37 auto shandle = ritsuko::hdf5::open_dataset(handle, "shape");
38 if (ritsuko::hdf5::exceeds_integer_limit(shandle, 64, false)) {
39 throw std::runtime_error("expected the datatype to be a subset of a 64-bit unsigned integer");
40 }
41
42 size_t len = ritsuko::hdf5::get_1d_length(shandle, false);
43 if (len != 2) {
44 throw std::runtime_error("expected the dataset to be of length 2");
45 }
46
47 std::array<uint64_t, 2> output;
48 shandle.read(output.data(), H5::PredType::NATIVE_UINT64);
49 return output;
50
51} catch (std::exception& e) {
52 throw std::runtime_error("failed to validate sparse matrix shape at '" + ritsuko::hdf5::get_name(handle) + "/shape'; " + std::string(e.what()));
53}
54
55inline size_t validate_data(const H5::Group& handle, const Options&) try {
56 auto dhandle = ritsuko::hdf5::open_dataset(handle, "data");
57
58 auto type = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle, "type");
59 if (type == "integer") {
60 if (ritsuko::hdf5::exceeds_integer_limit(dhandle, 32, true)) {
61 throw std::runtime_error("expected an integer 'data' to fit inside a 32-bit signed integer");
62 }
63 } else if (type == "boolean") {
64 if (ritsuko::hdf5::exceeds_integer_limit(dhandle, 32, true)) {
65 throw std::runtime_error("expected a boolean 'data' to fit inside a 32-bit signed integer");
66 }
67 } else if (type == "number") {
68 if (ritsuko::hdf5::exceeds_float_limit(dhandle, 64)) {
69 throw std::runtime_error("expected a number 'data' to fit inside a 64-bit float");
70 }
71 } else {
72 throw std::runtime_error("unknown matrix type '" + type + "'");
73 }
74
75 if (dhandle.attrExists("missing-value-placeholder")) {
76 auto attr = dhandle.openAttribute("missing-value-placeholder");
77 ritsuko::hdf5::check_missing_placeholder_attribute(dhandle, attr);
78 }
79
80 return ritsuko::hdf5::get_1d_length(dhandle, false);
81} catch (std::exception& e) {
82 throw std::runtime_error("failed to validate sparse matrix data at '" + ritsuko::hdf5::get_name(handle) + "/data'; " + std::string(e.what()));
83}
84
85inline std::vector<uint64_t> validate_indptrs(const H5::Group& handle, size_t primary_dim, size_t num_nonzero) try {
86 auto dhandle = ritsuko::hdf5::open_dataset(handle, "indptr");
87 if (ritsuko::hdf5::exceeds_integer_limit(dhandle, 64, false)) {
88 throw std::runtime_error("expected datatype to be a subset of a 64-bit unsigned integer");
89 }
90
91 size_t len = ritsuko::hdf5::get_1d_length(dhandle, false);
92 if (len != primary_dim + 1) {
93 throw std::runtime_error("dataset should have length equal to the primary dimension extent plus 1");
94 }
95
96 std::vector<uint64_t> indptrs(len);
97 dhandle.read(indptrs.data(), H5::PredType::NATIVE_UINT64);
98
99 if (indptrs[0] != 0) {
100 throw std::runtime_error("first entry should be zero");
101 }
102 if (indptrs.back() != num_nonzero) {
103 throw std::runtime_error("last entry should equal the number of non-zero elements");
104 }
105
106 for (size_t i = 1; i < len; ++i) {
107 if (indptrs[i] < indptrs[i-1]) {
108 throw std::runtime_error("pointers should be sorted in increasing order");
109 }
110 }
111
112 return indptrs;
113} catch (std::exception& e) {
114 throw std::runtime_error("failed to validate sparse matrix pointers at '" + ritsuko::hdf5::get_name(handle) + "/indptr'; " + std::string(e.what()));
115}
116
117inline void validate_indices(const H5::Group& handle, const std::vector<uint64_t>& indptrs, uint64_t secondary_dim, const Options& options) try {
118 auto dhandle = ritsuko::hdf5::open_dataset(handle, "indices");
119 if (ritsuko::hdf5::exceeds_integer_limit(dhandle, 64, false)) {
120 throw std::runtime_error("expected datatype to be a subset of a 64-bit unsigned integer");
121 }
122
123 auto len = ritsuko::hdf5::get_1d_length(dhandle.getSpace(), false);
124 if (indptrs.back() != len) {
125 throw std::runtime_error("dataset length should be equal to the number of non-zero elements (expected " + std::to_string(indptrs.back()) + ", got " + std::to_string(len) + ")");
126 }
127
128 size_t which_ptr = 0;
129 uint64_t last_index = 0;
130 hsize_t limit = indptrs[0];
131 ritsuko::hdf5::Stream1dNumericDataset<uint64_t> stream(&dhandle, len, options.hdf5_buffer_size);
132
133 for (hsize_t i = 0; i < len; ++i, stream.next()) {
134 auto x = stream.get();
135 if (x >= secondary_dim) {
136 throw std::runtime_error("out-of-range index (" + std::to_string(x) + ")");
137 }
138
139 if (i == limit) {
140 // No need to check if there are more or fewer elements
141 // than expected, as we already know that indptr.back()
142 // is equal to the number of non-zero elements.
143 do {
144 ++which_ptr;
145 limit = indptrs[which_ptr];
146 } while (i == limit);
147 } else if (last_index >= x) {
148 throw std::runtime_error("indices should be strictly increasing");
149 }
150
151 last_index = x;
152 }
153
154} catch (std::exception& e) {
155 throw std::runtime_error("failed to validate sparse matrix indices at '" + ritsuko::hdf5::get_name(handle) + "/indices'; " + std::string(e.what()));
156}
157
158}
168inline void validate(const std::filesystem::path& path, const ObjectMetadata& metadata, Options& options) {
169 const auto& vstring = internal_json::extract_version_for_type(metadata.other, "compressed_sparse_matrix");
170 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true);
171 if (version.major != 1) {
172 throw std::runtime_error("unsupported version '" + vstring + "'");
173 }
174
175 auto handle = ritsuko::hdf5::open_file(path / "matrix.h5");
176 auto ghandle = ritsuko::hdf5::open_group(handle, "compressed_sparse_matrix");
177 auto layout = ritsuko::hdf5::open_and_load_scalar_string_attribute(ghandle, "layout");
178 size_t primary = 0;
179 if (layout == "CSC") {
180 primary = 1;
181 } else if (layout != "CSR") {
182 throw std::runtime_error("'layout' attribute must be one of 'CSC' or 'CSR'");
183 }
184
185 auto shape = internal::validate_shape(ghandle, options);
186 size_t num_nonzero = internal::validate_data(ghandle, options);
187 auto indptrs = internal::validate_indptrs(ghandle, shape[primary], num_nonzero);
188 internal::validate_indices(ghandle, indptrs, shape[1 - primary], options);
189
190 if (ghandle.exists("names")) {
191 std::vector<hsize_t> dims(shape.begin(), shape.end());
192 internal_array::check_dimnames(ghandle, "names", dims, options);
193 }
194}
195
202inline size_t height(const std::filesystem::path& path, [[maybe_unused]] const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
203 auto handle = ritsuko::hdf5::open_file(path / "matrix.h5");
204 auto ghandle = ritsuko::hdf5::open_group(handle, "compressed_sparse_matrix");
205 auto shandle = ritsuko::hdf5::open_dataset(ghandle, "shape");
206 std::array<uint64_t, 2> output;
207 shandle.read(output.data(), H5::PredType::NATIVE_UINT64);
208 return output.front();
209}
210
217inline std::vector<size_t> dimensions(const std::filesystem::path& path, [[maybe_unused]] const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
218 auto handle = ritsuko::hdf5::open_file(path / "matrix.h5");
219 auto ghandle = ritsuko::hdf5::open_group(handle, "compressed_sparse_matrix");
220 auto shandle = ritsuko::hdf5::open_dataset(ghandle, "shape");
221 std::array<uint64_t, 2> output;
222 shandle.read(output.data(), H5::PredType::NATIVE_UINT64);
223 return std::vector<size_t>(output.begin(), output.end());
224}
225
226}
227
228}
229
230#endif
std::vector< size_t > dimensions(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition compressed_sparse_matrix.hpp:217
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition compressed_sparse_matrix.hpp:202
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition compressed_sparse_matrix.hpp:168
takane validation functions.
Definition _derived_from.hpp:15
Object metadata, including the type and other fields.
Definition utils_public.hpp:26
std::unordered_map< std::string, std::shared_ptr< millijson::Base > > other
Definition utils_public.hpp:35
Validation options.
Definition utils_public.hpp:94
Exported utilities.