chihaya
Validating delayed array operations in HDF5
Loading...
Searching...
No Matches
sparse_matrix.hpp
Go to the documentation of this file.
1#ifndef CHIHAYA_SPARSE_MATRIX_HPP
2#define CHIHAYA_SPARSE_MATRIX_HPP
3
4#include "H5Cpp.h"
5#include "ritsuko/hdf5/hdf5.hpp"
6
7#include <vector>
8#include <cstdint>
9
10#include "utils_public.hpp"
11#include "utils_misc.hpp"
12#include "utils_type.hpp"
13#include "utils_dimnames.hpp"
14
20namespace chihaya {
21
26namespace sparse_matrix {
27
31namespace internal {
32
33template<typename Index_>
34void validate_indices(const H5::DataSet& ihandle, const std::vector<uint64_t>& indptrs, size_t primary, size_t secondary, bool csc) {
35 ritsuko::hdf5::Stream1dNumericDataset<Index_> stream(&ihandle, indptrs.back(), 1000000);
36
37 for (size_t p = 0; p < primary; ++p) {
38 auto start = indptrs[p];
39 auto end = indptrs[p + 1];
40 if (start > end) {
41 throw std::runtime_error("entries of 'indptr' must be sorted");
42 }
43
44 // Checking for sortedness and good things.
45 Index_ previous;
46 for (auto x = start; x < end; ++x, stream.next()) {
47 auto i = stream.get();
48 if (i < 0) {
49 throw std::runtime_error("entries of 'indices' should be non-negative");
50 }
51 if (x > start && i <= previous) {
52 throw std::runtime_error("'indices' should be strictly increasing within each " + (csc ? std::string("column") : std::string("row")));
53 }
54 if (static_cast<size_t>(i) >= secondary) {
55 throw std::runtime_error("entries of 'indices' should be less than the number of " + (csc ? std::string("row") : std::string("column")) + "s");
56 }
57 previous = i;
58 }
59 }
60}
61
62}
75inline ArrayDetails validate(const H5::Group& handle, const ritsuko::Version& version, [[maybe_unused]] Options& options) {
76 std::vector<uint64_t> dims(2);
77 ArrayType array_type;
78
79 {
80 auto shandle = ritsuko::hdf5::open_dataset(handle, "shape");
81 auto len = ritsuko::hdf5::get_1d_length(shandle, false);
82 if (len != 2) {
83 throw std::runtime_error("'shape' should have length 2");
84 }
85
86 if (version.lt(1, 1, 0)) {
87 if (shandle.getTypeClass() != H5T_INTEGER) {
88 throw std::runtime_error("'shape' should be integer");
89 }
90 std::vector<int> dims_tmp(2);
91 shandle.read(dims_tmp.data(), H5::PredType::NATIVE_INT);
92 if (dims_tmp[0] < 0 || dims_tmp[1] < 0) {
93 throw std::runtime_error("'shape' should contain non-negative values");
94 }
95 std::copy(dims_tmp.begin(), dims_tmp.end(), dims.begin());
96 } else {
97 if (ritsuko::hdf5::exceeds_integer_limit(shandle, 64, false)) {
98 throw std::runtime_error("'shape' should have a datatype that can fit into a 64-bit unsigned integer");
99 }
100 shandle.read(dims.data(), H5::PredType::NATIVE_UINT64);
101 }
102 }
103
104 size_t nnz;
105 {
106 auto dhandle = ritsuko::hdf5::open_dataset(handle, "data");
107
108 try {
109 nnz = ritsuko::hdf5::get_1d_length(dhandle, false);
110
111 if (version.lt(1, 1, 0)) {
112 array_type = internal_type::translate_type_0_0(dhandle.getTypeClass());
113 if (internal_type::is_boolean(dhandle)) {
114 array_type = BOOLEAN;
115 }
116 } else {
117 auto type = ritsuko::hdf5::open_and_load_scalar_string_attribute(dhandle, "type");
118 array_type = internal_type::translate_type_1_1(type);
119 internal_type::check_type_1_1(dhandle, array_type);
120 }
121
122 if (array_type != INTEGER && array_type != BOOLEAN && array_type != FLOAT) {
123 throw std::runtime_error("dataset should be integer, float or boolean");
124 }
125
126 internal_misc::validate_missing_placeholder(dhandle, version);
127 } catch (std::exception& e) {
128 throw std::runtime_error("failed to validate 'data'; " + std::string(e.what()));
129 }
130 }
131
132 if (!options.details_only) {
133 bool csc = true;
134 if (!version.lt(1, 1, 0)) {
135 auto bhandle = ritsuko::hdf5::open_dataset(handle, "by_column");
136 if (!ritsuko::hdf5::is_scalar(bhandle)) {
137 throw std::runtime_error("'by_column' should be a scalar");
138 }
139 if (ritsuko::hdf5::exceeds_integer_limit(bhandle, 8, true)) {
140 throw std::runtime_error("datatype of 'by_column' should fit into an 8-bit signed integer");
141 }
142 csc = (ritsuko::hdf5::load_scalar_numeric_dataset<int8_t>(bhandle) != 0);
143 }
144
145 {
146 auto ihandle = ritsuko::hdf5::open_dataset(handle, "indices");
147
148 if (version.lt(1, 1, 0)) {
149 if (ihandle.getTypeClass() != H5T_INTEGER) {
150 throw std::runtime_error("'indices' should be integer");
151 }
152 } else {
153 if (ritsuko::hdf5::exceeds_integer_limit(ihandle, 64, false)) {
154 throw std::runtime_error("datatype of 'indices' should fit into a 64-bit unsigned integer");
155 }
156 }
157
158 if (nnz != ritsuko::hdf5::get_1d_length(ihandle, false)) {
159 throw std::runtime_error("'indices' and 'data' should have the same length");
160 }
161
162 auto iphandle = ritsuko::hdf5::open_dataset(handle, "indptr");
163 if (version.lt(1, 1, 0)) {
164 if (iphandle.getTypeClass() != H5T_INTEGER) {
165 throw std::runtime_error("'indptr' should be integer");
166 }
167 } else {
168 if (ritsuko::hdf5::exceeds_integer_limit(iphandle, 64, false)) {
169 throw std::runtime_error("datatype of 'indptr' should fit into a 64-bit unsigned integer");
170 }
171 }
172
173 auto primary = (csc ? dims[1] : dims[0]);
174 auto secondary = (csc ? dims[0] : dims[1]);
175 if (ritsuko::hdf5::get_1d_length(iphandle, false) != static_cast<size_t>(primary + 1)) {
176 throw std::runtime_error("'indptr' should have length equal to the number of " + (csc ? std::string("columns") : std::string("rows")) + " plus 1");
177 }
178 std::vector<uint64_t> indptrs(primary + 1);
179 iphandle.read(indptrs.data(), H5::PredType::NATIVE_UINT64);
180 if (indptrs[0] != 0) {
181 throw std::runtime_error("first entry of 'indptr' should be 0 for a sparse matrix");
182 }
183 if (indptrs.back() != static_cast<uint64_t>(nnz)) {
184 throw std::runtime_error("last entry of 'indptr' should be equal to the length of 'data'");
185 }
186
187 if (version.lt(1, 1, 0)) {
188 internal::validate_indices<int>(ihandle, indptrs, primary, secondary, csc);
189 } else {
190 internal::validate_indices<uint64_t>(ihandle, indptrs, primary, secondary, csc);
191 }
192 }
193
194 // Validating dimnames.
195 if (handle.exists("dimnames")) {
196 internal_dimnames::validate(handle, dims, version);
197 }
198 }
199
200 return ArrayDetails(array_type, std::vector<size_t>(dims.begin(), dims.end()));
201}
202
203}
204
205}
206
207#endif
ArrayDetails validate(const H5::Group &handle, const ritsuko::Version &version, Options &options)
Definition: sparse_matrix.hpp:75
Namespace for all chihaya functions.
Definition: binary_arithmetic.hpp:22
ArrayType
Definition: utils_public.hpp:27
Details about an array.
Definition: utils_public.hpp:36
Validation options.
Definition: utils_public.hpp:66
Various public utilities.