takane
Validators for ArtifactDB file formats
Loading...
Searching...
No Matches
data_frame.hpp
Go to the documentation of this file.
1#ifndef TAKANE_HDF5_FRAME_HPP
2#define TAKANE_HDF5_FRAME_HPP
3
4#include "H5Cpp.h"
5#include "ritsuko/ritsuko.hpp"
6#include "ritsuko/hdf5/hdf5.hpp"
7#include "ritsuko/hdf5/vls/vls.hpp"
8
9#include <cstdint>
10#include <string>
11#include <stdexcept>
12#include <vector>
13#include <filesystem>
14#include <unordered_set>
15
16#include "utils_public.hpp"
17#include "utils_string.hpp"
18#include "utils_factor.hpp"
19#include "utils_other.hpp"
20#include "utils_json.hpp"
21
27namespace takane {
28
// Forward declarations of the generic takane entry points (defined in _validate.hpp and _height.hpp).
// data_frame::validate() below calls them on every column stored under 'other_columns'.
32void validate(const std::filesystem::path&, const ObjectMetadata&, Options& options);
33size_t height(const std::filesystem::path&, const ObjectMetadata&, Options& options);
38namespace data_frame {
39
43inline void validate_row_names(const H5::Group& handle, hsize_t num_rows, const Options& options) try {
44 if (handle.childObjType("row_names") != H5O_TYPE_DATASET) {
45 throw std::runtime_error("expected a 'row_names' dataset when row names are present");
46 }
47
48 auto rnhandle = handle.openDataSet("row_names");
49 if (!ritsuko::hdf5::is_utf8_string(rnhandle)) {
50 throw std::runtime_error("expected a datatype for 'row_names' that can be represented by a UTF-8 encoded string");
51 }
52
53 if (ritsuko::hdf5::get_1d_length(rnhandle.getSpace(), false) != num_rows) {
54 throw std::runtime_error("expected 'row_names' to have length equal to the number of rows");
55 }
56 ritsuko::hdf5::validate_1d_string_dataset(rnhandle, num_rows, options.hdf5_buffer_size);
57} catch (std::exception& e) {
58 throw std::runtime_error("failed to validate the row names for '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what()));
59}
60
61inline hsize_t validate_column_names(const H5::Group& ghandle, const Options& options) try {
62 auto cnhandle = ritsuko::hdf5::open_dataset(ghandle, "column_names");
63 if (!ritsuko::hdf5::is_utf8_string(cnhandle)) {
64 throw std::runtime_error("expected a datatype for 'column_names' that can be represented by a UTF-8 encoded string");
65 }
66
67 auto num_cols = ritsuko::hdf5::get_1d_length(cnhandle.getSpace(), false);
68
69 std::unordered_set<std::string> column_names;
70 ritsuko::hdf5::Stream1dStringDataset stream(&cnhandle, num_cols, options.hdf5_buffer_size);
71 for (size_t c = 0; c < num_cols; ++c, stream.next()) {
72 auto x = stream.steal();
73 if (x.empty()) {
74 throw std::runtime_error("column names should not be empty strings");
75 }
76 if (column_names.find(x) != column_names.end()) {
77 throw std::runtime_error("duplicated column name '" + x + "'");
78 }
79 column_names.insert(std::move(x));
80 }
81
82 return num_cols;
83
84} catch (std::exception& e) {
85 throw std::runtime_error("failed to validate the column names for '" + ritsuko::hdf5::get_name(ghandle) + "'; " + std::string(e.what()));
86}
87
88inline void validate_column(const H5::Group& dhandle, const std::string& dset_name, hsize_t num_rows, const ritsuko::Version& version, const Options& options) try {
89 const char* missing_attr_name = "missing-value-placeholder";
90
91 auto dtype = dhandle.childObjType(dset_name);
92 if (dtype == H5O_TYPE_GROUP) {
93 auto ghandle = dhandle.openGroup(dset_name);
94 auto type = ritsuko::hdf5::open_and_load_scalar_string_attribute(ghandle, "type");
95
96 if (type == "factor") {
97 internal_factor::check_ordered_attribute(ghandle);
98 auto num_levels = internal_factor::validate_factor_levels(ghandle, "levels", options.hdf5_buffer_size);
99 auto num_codes = internal_factor::validate_factor_codes(ghandle, "codes", num_levels, options.hdf5_buffer_size);
100 if (num_codes != num_rows) {
101 throw std::runtime_error("expected column to have length equal to the number of rows");
102 }
103
104 } else if (type == "vls") {
105 if (version.lt(1, 1, 0)) {
106 throw std::runtime_error("unsupported type '" + type + "'");
107 }
108
109 auto phandle = ritsuko::hdf5::vls::open_pointers(ghandle, "pointers", 64, 64);
110 auto vlen = ritsuko::hdf5::get_1d_length(phandle.getSpace(), false);
111 if (vlen != num_rows) {
112 throw std::runtime_error("expected column to have length equal to the number of rows");
113 }
114
115 auto hhandle = ritsuko::hdf5::vls::open_heap(ghandle, "heap");
116 auto hlen = ritsuko::hdf5::get_1d_length(hhandle.getSpace(), false);
117 ritsuko::hdf5::vls::validate_1d_array<uint64_t, uint64_t>(phandle, vlen, hlen, options.hdf5_buffer_size);
118
119 if (phandle.attrExists(missing_attr_name)) {
120 auto attr = phandle.openAttribute(missing_attr_name);
121 ritsuko::hdf5::check_string_missing_placeholder_attribute(attr);
122 }
123
124 } else {
125 throw std::runtime_error("unsupported type '" + type + "'");
126 }
127
128 } else if (dtype == H5O_TYPE_DATASET) {
129 auto xhandle = dhandle.openDataSet(dset_name);
130 if (num_rows != ritsuko::hdf5::get_1d_length(xhandle.getSpace(), false)) {
131 throw std::runtime_error("expected column to have length equal to the number of rows");
132 }
133
134 auto type = ritsuko::hdf5::open_and_load_scalar_string_attribute(xhandle, "type");
135 if (type == "string") {
136 if (!ritsuko::hdf5::is_utf8_string(xhandle)) {
137 throw std::runtime_error("expected a datatype for '" + dset_name + "' that can be represented by a UTF-8 encoded string");
138 }
139 auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(xhandle, missing_attr_name);
140 std::string format = internal_string::fetch_format_attribute(xhandle);
141 internal_string::validate_string_format(xhandle, num_rows, format, missingness, options.hdf5_buffer_size);
142
143 } else {
144 if (type == "integer") {
145 if (ritsuko::hdf5::exceeds_integer_limit(xhandle, 32, true)) {
146 throw std::runtime_error("expected integer column to use a datatype that is a subset of a 32-bit signed integer");
147 }
148 } else if (type == "boolean") {
149 if (ritsuko::hdf5::exceeds_integer_limit(xhandle, 32, true)) {
150 throw std::runtime_error("expected boolean column to use a datatype that is a subset of a 32-bit signed integer");
151 }
152 } else if (type == "number") {
153 if (ritsuko::hdf5::exceeds_float_limit(xhandle, 64)) {
154 throw std::runtime_error("expected number column to use a datatype that is a subset of a 64-bit float");
155 }
156 } else {
157 throw std::runtime_error("unknown column type '" + type + "'");
158 }
159
160 if (xhandle.attrExists(missing_attr_name)) {
161 auto ahandle = xhandle.openAttribute(missing_attr_name);
162 ritsuko::hdf5::check_numeric_missing_placeholder_attribute(xhandle, ahandle);
163 }
164 }
165
166 } else {
167 throw std::runtime_error("unknown HDF5 object type");
168 }
169
170} catch (std::exception& e) {
171 throw std::runtime_error("failed to validate column at '" + ritsuko::hdf5::get_name(dhandle) + "/" + dset_name + "'; " + std::string(e.what()));
172}
182inline void validate(const std::filesystem::path& path, const ObjectMetadata& metadata, Options& options) {
183 const std::string type_name = "data_frame"; // use a separate variable to avoid dangling reference warnings from GCC.
184 const auto& vstring = internal_json::extract_version_for_type(metadata.other, type_name);
185 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true);
186 if (version.major != 1) {
187 throw std::runtime_error("unsupported version '" + vstring + "'");
188 }
189
190 auto handle = ritsuko::hdf5::open_file(path / "basic_columns.h5");
191 auto ghandle = ritsuko::hdf5::open_group(handle, type_name.c_str());
192
193 // Checking the number of rows.
194 auto attr = ritsuko::hdf5::open_scalar_attribute(ghandle, "row-count");
195 if (ritsuko::hdf5::exceeds_integer_limit(attr, 64, false)) {
196 throw std::runtime_error("'row-count' attribute should have a datatype that fits in a 64-bit unsigned integer");
197 }
198 uint64_t num_rows = 0;
199 attr.read(H5::PredType::NATIVE_UINT64, &num_rows);
200
201 // Checking row and column names.
202 if (ghandle.exists("row_names")) {
203 validate_row_names(ghandle, num_rows, options);
204 }
205 size_t NC = validate_column_names(ghandle, options);
206
207 // Finally iterating through the columns.
208 auto dhandle = ritsuko::hdf5::open_group(ghandle, "data");
209
210 hsize_t num_basic = 0;
211 auto other_dir = path / "other_columns";
212
213 for (size_t c = 0; c < NC; ++c) {
214 std::string dset_name = std::to_string(c);
215
216 if (!dhandle.exists(dset_name)) {
217 auto opath = other_dir / dset_name;
218 auto ometa = read_object_metadata(opath);
219 try {
220 ::takane::validate(opath, ometa, options);
221 } catch (std::exception& e) {
222 throw std::runtime_error("failed to validate 'other' column " + dset_name + "; " + std::string(e.what()));
223 }
224 if (::takane::height(opath, ometa, options) != num_rows) {
225 throw std::runtime_error("height of column " + dset_name + " of class '" + ometa.type + "' is not the same as the number of rows");
226 }
227
228 } else {
229 validate_column(dhandle, dset_name, num_rows, version, options);
230 ++num_basic;
231 }
232 }
233
234 if (std::filesystem::exists(other_dir)) {
235 if (internal_other::count_directory_entries(other_dir) != NC - num_basic) {
236 throw std::runtime_error("more objects than expected inside the 'other_columns' directory");
237 }
238 }
239
240 if (num_basic != dhandle.getNumObjs()) {
241 throw std::runtime_error("more objects present in the 'data_frame/data' group than expected");
242 }
243
244 internal_other::validate_mcols(path, "column_annotations", NC, options);
245 internal_other::validate_metadata(path, "other_annotations", options);
246}
247
254inline size_t height(const std::filesystem::path& path, [[maybe_unused]] const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
255 // Assume it's all valid already.
256 auto handle = ritsuko::hdf5::open_file(path / "basic_columns.h5");
257 auto ghandle = handle.openGroup("data_frame");
258 return ritsuko::hdf5::load_scalar_numeric_attribute<uint64_t>(ghandle.openAttribute("row-count"));
259}
260
267inline std::vector<size_t> dimensions(const std::filesystem::path& path, [[maybe_unused]] const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
268 // Assume it's all valid already.
269 auto handle = ritsuko::hdf5::open_file(path / "basic_columns.h5");
270 auto ghandle = handle.openGroup("data_frame");
271 std::vector<size_t> output(2);
272 output[0] = ritsuko::hdf5::load_scalar_numeric_attribute<uint64_t>(ghandle.openAttribute("row-count"));
273 output[1] = ritsuko::hdf5::get_1d_length(ghandle.openDataSet("column_names"), false);
274 return output;
275}
276
277}
278
279}
280
281#endif
takane validation functions.
Definition _derived_from.hpp:15
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition _height.hpp:88
ObjectMetadata read_object_metadata(const std::filesystem::path &path)
Definition utils_public.hpp:74
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition _validate.hpp:107
std::vector< size_t > dimensions(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition _dimensions.hpp:69
Object metadata, including the type and other fields.
Definition utils_public.hpp:26
std::unordered_map< std::string, std::shared_ptr< millijson::Base > > other
Definition utils_public.hpp:35
Validation options.
Definition utils_public.hpp:94
Exported utilities.