takane
Validators for ArtifactDB file formats
Loading...
Searching...
No Matches
dense_array.hpp
Go to the documentation of this file.
1#ifndef TAKANE_DENSE_ARRAY_HPP
2#define TAKANE_DENSE_ARRAY_HPP
3
4#include "ritsuko/ritsuko.hpp"
5#include "ritsuko/hdf5/hdf5.hpp"
6#include "ritsuko/hdf5/vls/vls.hpp"
7
8#include "utils_public.hpp"
9#include "utils_array.hpp"
10
11#include <vector>
12#include <string>
13#include <stdexcept>
14#include <filesystem>
15#include <cstdint>
16
22namespace takane {
23
28namespace dense_array {
29
33namespace internal {
34
35inline bool is_transposed(const H5::Group& ghandle) {
36 if (!ghandle.attrExists("transposed")) {
37 return false;
38 }
39
40 auto attr = ghandle.openAttribute("transposed");
41 if (!ritsuko::hdf5::is_scalar(attr)) {
42 throw std::runtime_error("expected 'transposed' attribute to be a scalar");
43 }
44 if (ritsuko::hdf5::exceeds_integer_limit(attr, 32, true)) {
45 throw std::runtime_error("expected 'transposed' attribute to have a datatype that fits in a 32-bit signed integer");
46 }
47
48 return ritsuko::hdf5::load_scalar_numeric_attribute<int32_t>(attr) != 0;
49}
50
51inline void retrieve_dimension_extents(const H5::DataSet& dhandle, std::vector<hsize_t>& extents) {
52 auto dspace = dhandle.getSpace();
53 size_t ndims = dspace.getSimpleExtentNdims();
54 if (ndims == 0) {
55 throw std::runtime_error("expected '" + ritsuko::hdf5::get_name(dhandle) + "' array to have at least one dimension");
56 }
57 extents.resize(ndims);
58 dspace.getSimpleExtentDims(extents.data());
59}
60
61}
71inline void validate(const std::filesystem::path& path, const ObjectMetadata& metadata, Options& options) {
72 const std::string type_name = "dense_array"; // use a separate variable to avoid dangling reference warnings from GCC.
73 const auto& vstring = internal_json::extract_version_for_type(metadata.other, type_name);
74 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true);
75 if (version.major != 1) {
76 throw std::runtime_error("unsupported version '" + vstring + "'");
77 }
78
79 auto handle = ritsuko::hdf5::open_file(path / "array.h5");
80 auto ghandle = ritsuko::hdf5::open_group(handle, "dense_array");
81 internal::is_transposed(ghandle); // just a check, not used here.
82 auto type = ritsuko::hdf5::open_and_load_scalar_string_attribute(ghandle, "type");
83 std::vector<hsize_t> extents;
84
85 const char* missing_attr_name = "missing-value-placeholder";
86
87 if (type == "vls") {
88 if (version.lt(1, 1, 0)) {
89 throw std::runtime_error("unsupported type '" + type + "'");
90 }
91
92 auto phandle = ritsuko::hdf5::vls::open_pointers(ghandle, "pointers", 64, 64);
93 internal::retrieve_dimension_extents(phandle, extents);
94 auto hhandle = ritsuko::hdf5::vls::open_heap(ghandle, "heap");
95 auto hlen = ritsuko::hdf5::get_1d_length(hhandle.getSpace(), false);
96 ritsuko::hdf5::vls::validate_nd_array<uint64_t, uint64_t>(phandle, extents, hlen, options.hdf5_buffer_size);
97
98 if (phandle.attrExists(missing_attr_name)) {
99 auto attr = phandle.openAttribute(missing_attr_name);
100 ritsuko::hdf5::check_string_missing_placeholder_attribute(attr);
101 }
102
103 } else {
104 auto dhandle = ritsuko::hdf5::open_dataset(ghandle, "data");
105 internal::retrieve_dimension_extents(dhandle, extents);
106
107 if (type == "string") {
108 if (!ritsuko::hdf5::is_utf8_string(dhandle)) {
109 throw std::runtime_error("expected string array to have a datatype that can be represented by a UTF-8 encoded string");
110 }
111 ritsuko::hdf5::validate_nd_string_dataset(dhandle, extents, options.hdf5_buffer_size);
112
113 if (dhandle.attrExists(missing_attr_name)) {
114 auto attr = dhandle.openAttribute(missing_attr_name);
115 ritsuko::hdf5::check_string_missing_placeholder_attribute(attr);
116 }
117
118 } else {
119 if (type == "integer") {
120 if (ritsuko::hdf5::exceeds_integer_limit(dhandle, 32, true)) {
121 throw std::runtime_error("expected integer array to have a datatype that fits into a 32-bit signed integer");
122 }
123 } else if (type == "boolean") {
124 if (ritsuko::hdf5::exceeds_integer_limit(dhandle, 32, true)) {
125 throw std::runtime_error("expected boolean array to have a datatype that fits into a 32-bit signed integer");
126 }
127 } else if (type == "number") {
128 if (ritsuko::hdf5::exceeds_float_limit(dhandle, 64)) {
129 throw std::runtime_error("expected number array to have a datatype that fits into a 64-bit float");
130 }
131 } else {
132 throw std::runtime_error("unknown array type '" + type + "'");
133 }
134
135 if (dhandle.attrExists(missing_attr_name)) {
136 auto attr = dhandle.openAttribute(missing_attr_name);
137 ritsuko::hdf5::check_numeric_missing_placeholder_attribute(dhandle, attr);
138 }
139 }
140 }
141
142 if (ghandle.exists("names")) {
143 internal_array::check_dimnames(ghandle, "names", extents, options);
144 }
145}
146
153inline size_t height(const std::filesystem::path& path, [[maybe_unused]] const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
154 auto handle = ritsuko::hdf5::open_file(path / "array.h5");
155 auto ghandle = ritsuko::hdf5::open_group(handle, "dense_array");
156
157 auto dhandle = ritsuko::hdf5::open_dataset(ghandle, "data");
158 auto dspace = dhandle.getSpace();
159 size_t ndims = dspace.getSimpleExtentNdims();
160 std::vector<hsize_t> extents(ndims);
161 dspace.getSimpleExtentDims(extents.data());
162
163 if (internal::is_transposed(ghandle)) {
164 return extents.back();
165 } else {
166 return extents.front();
167 }
168}
169
176inline std::vector<size_t> dimensions(const std::filesystem::path& path, [[maybe_unused]] const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
177 auto handle = ritsuko::hdf5::open_file(path / "array.h5");
178 auto ghandle = ritsuko::hdf5::open_group(handle, "dense_array");
179 auto type = ritsuko::hdf5::open_and_load_scalar_string_attribute(ghandle, "type");
180 std::vector<hsize_t> extents;
181
182 if (type == "vls") {
183 auto phandle = ghandle.openDataSet("pointers");
184 internal::retrieve_dimension_extents(phandle, extents);
185 } else {
186 auto dhandle = ghandle.openDataSet("data");
187 internal::retrieve_dimension_extents(dhandle, extents);
188 }
189
190 if (internal::is_transposed(ghandle)) {
191 return std::vector<size_t>(extents.rbegin(), extents.rend());
192 } else {
193 return std::vector<size_t>(extents.begin(), extents.end());
194 }
195}
196
197}
198
199}
200
201#endif
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition dense_array.hpp:71
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition dense_array.hpp:153
std::vector< size_t > dimensions(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition dense_array.hpp:176
takane validation functions.
Definition _derived_from.hpp:15
Object metadata, including the type and other fields.
Definition utils_public.hpp:26
std::unordered_map< std::string, std::shared_ptr< millijson::Base > > other
Definition utils_public.hpp:35
Validation options.
Definition utils_public.hpp:94
hsize_t hdf5_buffer_size
Definition utils_public.hpp:103
Exported utilities.