takane
Validators for ArtifactDB file formats
Loading...
Searching...
No Matches
data_frame.hpp
Go to the documentation of this file.
1#ifndef TAKANE_HDF5_FRAME_HPP
2#define TAKANE_HDF5_FRAME_HPP
3
4#include "H5Cpp.h"
5#include "ritsuko/ritsuko.hpp"
6#include "ritsuko/hdf5/hdf5.hpp"
7
8#include <cstdint>
9#include <string>
10#include <stdexcept>
11#include <vector>
12#include <filesystem>
13#include <unordered_set>
14
15#include "utils_public.hpp"
16#include "utils_string.hpp"
17#include "utils_factor.hpp"
18#include "utils_other.hpp"
19#include "utils_json.hpp"
20
26namespace takane {
27
// Forward declarations of the generic takane entry points. Their definitions are
// not in this header; they are declared here so that data_frame::validate() can
// call ::takane::validate() and ::takane::height() on each nested object found
// under the 'other_columns' directory.
31void validate(const std::filesystem::path&, const ObjectMetadata&, Options& options);
32size_t height(const std::filesystem::path&, const ObjectMetadata&, Options& options);
37namespace data_frame {
38
42inline void validate_row_names(const H5::Group& handle, hsize_t num_rows, const Options& options) try {
43 if (handle.childObjType("row_names") != H5O_TYPE_DATASET) {
44 throw std::runtime_error("expected a 'row_names' dataset when row names are present");
45 }
46
47 auto rnhandle = handle.openDataSet("row_names");
48 if (!ritsuko::hdf5::is_utf8_string(rnhandle)) {
49 throw std::runtime_error("expected a datatype for 'row_names' that can be represented by a UTF-8 encoded string");
50 }
51
52 if (ritsuko::hdf5::get_1d_length(rnhandle.getSpace(), false) != num_rows) {
53 throw std::runtime_error("expected 'row_names' to have length equal to the number of rows");
54 }
55 ritsuko::hdf5::validate_1d_string_dataset(rnhandle, num_rows, options.hdf5_buffer_size);
56} catch (std::exception& e) {
57 throw std::runtime_error("failed to validate the row names for '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what()));
58}
59
60inline hsize_t validate_column_names(const H5::Group& ghandle, const Options& options) try {
61 auto cnhandle = ritsuko::hdf5::open_dataset(ghandle, "column_names");
62 if (!ritsuko::hdf5::is_utf8_string(cnhandle)) {
63 throw std::runtime_error("expected a datatype for 'column_names' that can be represented by a UTF-8 encoded string");
64 }
65
66 auto num_cols = ritsuko::hdf5::get_1d_length(cnhandle.getSpace(), false);
67
68 std::unordered_set<std::string> column_names;
69 ritsuko::hdf5::Stream1dStringDataset stream(&cnhandle, num_cols, options.hdf5_buffer_size);
70 for (size_t c = 0; c < num_cols; ++c, stream.next()) {
71 auto x = stream.steal();
72 if (x.empty()) {
73 throw std::runtime_error("column names should not be empty strings");
74 }
75 if (column_names.find(x) != column_names.end()) {
76 throw std::runtime_error("duplicated column name '" + x + "'");
77 }
78 column_names.insert(std::move(x));
79 }
80
81 return num_cols;
82
83} catch (std::exception& e) {
84 throw std::runtime_error("failed to validate the column names for '" + ritsuko::hdf5::get_name(ghandle) + "'; " + std::string(e.what()));
85}
86
87inline void validate_column(const H5::Group& dhandle, const std::string& dset_name, hsize_t num_rows, const Options& options) try {
88 auto dtype = dhandle.childObjType(dset_name);
89 if (dtype == H5O_TYPE_GROUP) {
90 auto fhandle = dhandle.openGroup(dset_name);
91 auto type = ritsuko::hdf5::open_and_load_scalar_string_attribute(fhandle, "type");
92 if (type != "factor") {
93 throw std::runtime_error("expected HDF5 groups to have a 'type' attribute set to 'factor'");
94 }
95
96 internal_factor::check_ordered_attribute(fhandle);
97
98 auto num_levels = internal_factor::validate_factor_levels(fhandle, "levels", options.hdf5_buffer_size);
99 auto num_codes = internal_factor::validate_factor_codes(fhandle, "codes", num_levels, options.hdf5_buffer_size);
100 if (num_codes != num_rows) {
101 throw std::runtime_error("expected column to have length equal to the number of rows");
102 }
103
104 } else if (dtype == H5O_TYPE_DATASET) {
105 auto xhandle = dhandle.openDataSet(dset_name);
106 if (num_rows != ritsuko::hdf5::get_1d_length(xhandle.getSpace(), false)) {
107 throw std::runtime_error("expected column to have length equal to the number of rows");
108 }
109
110 const char* missing_attr_name = "missing-value-placeholder";
111
112 auto type = ritsuko::hdf5::open_and_load_scalar_string_attribute(xhandle, "type");
113 if (type == "string") {
114 if (!ritsuko::hdf5::is_utf8_string(xhandle)) {
115 throw std::runtime_error("expected a datatype for '" + dset_name + "' that can be represented by a UTF-8 encoded string");
116 }
117 auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(xhandle, missing_attr_name);
118 std::string format = internal_string::fetch_format_attribute(xhandle);
119 internal_string::validate_string_format(xhandle, num_rows, format, missingness.first, missingness.second, options.hdf5_buffer_size);
120
121 } else {
122 if (type == "integer") {
123 if (ritsuko::hdf5::exceeds_integer_limit(xhandle, 32, true)) {
124 throw std::runtime_error("expected integer column to use a datatype that is a subset of a 32-bit signed integer");
125 }
126 } else if (type == "boolean") {
127 if (ritsuko::hdf5::exceeds_integer_limit(xhandle, 32, true)) {
128 throw std::runtime_error("expected boolean column to use a datatype that is a subset of a 32-bit signed integer");
129 }
130 } else if (type == "number") {
131 if (ritsuko::hdf5::exceeds_float_limit(xhandle, 64)) {
132 throw std::runtime_error("expected number column to use a datatype that is a subset of a 64-bit float");
133 }
134 } else {
135 throw std::runtime_error("unknown column type '" + type + "'");
136 }
137
138 if (xhandle.attrExists(missing_attr_name)) {
139 auto ahandle = xhandle.openAttribute(missing_attr_name);
140 ritsuko::hdf5::check_missing_placeholder_attribute(xhandle, ahandle);
141 }
142 }
143
144 } else {
145 throw std::runtime_error("unknown HDF5 object type");
146 }
147
148} catch (std::exception& e) {
149 throw std::runtime_error("failed to validate column at '" + ritsuko::hdf5::get_name(dhandle) + "/" + dset_name + "'; " + std::string(e.what()));
150}
160inline void validate(const std::filesystem::path& path, const ObjectMetadata& metadata, Options& options) {
161 const auto& vstring = internal_json::extract_version_for_type(metadata.other, "data_frame");
162 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true);
163 if (version.major != 1) {
164 throw std::runtime_error("unsupported version '" + vstring + "'");
165 }
166
167 auto handle = ritsuko::hdf5::open_file(path / "basic_columns.h5");
168 auto ghandle = ritsuko::hdf5::open_group(handle, "data_frame");
169
170 // Checking the number of rows.
171 auto attr = ritsuko::hdf5::open_scalar_attribute(ghandle, "row-count");
172 if (ritsuko::hdf5::exceeds_integer_limit(attr, 64, false)) {
173 throw std::runtime_error("'row-count' attribute should have a datatype that fits in a 64-bit unsigned integer");
174 }
175 uint64_t num_rows = 0;
176 attr.read(H5::PredType::NATIVE_UINT64, &num_rows);
177
178 // Checking row and column names.
179 if (ghandle.exists("row_names")) {
180 validate_row_names(ghandle, num_rows, options);
181 }
182 size_t NC = validate_column_names(ghandle, options);
183
184 // Finally iterating through the columns.
185 auto dhandle = ritsuko::hdf5::open_group(ghandle, "data");
186
187 hsize_t num_basic = 0;
188 auto other_dir = path / "other_columns";
189
190 for (size_t c = 0; c < NC; ++c) {
191 std::string dset_name = std::to_string(c);
192
193 if (!dhandle.exists(dset_name)) {
194 auto opath = other_dir / dset_name;
195 auto ometa = read_object_metadata(opath);
196 try {
197 ::takane::validate(opath, ometa, options);
198 } catch (std::exception& e) {
199 throw std::runtime_error("failed to validate 'other' column " + dset_name + "; " + std::string(e.what()));
200 }
201 if (::takane::height(opath, ometa, options) != num_rows) {
202 throw std::runtime_error("height of column " + dset_name + " of class '" + ometa.type + "' is not the same as the number of rows");
203 }
204
205 } else {
206 validate_column(dhandle, dset_name, num_rows, options);
207 ++num_basic;
208 }
209 }
210
211 if (std::filesystem::exists(other_dir)) {
212 if (internal_other::count_directory_entries(other_dir) != NC - num_basic) {
213 throw std::runtime_error("more objects than expected inside the 'other_columns' directory");
214 }
215 }
216
217 if (num_basic != dhandle.getNumObjs()) {
218 throw std::runtime_error("more objects present in the 'data_frame/data' group than expected");
219 }
220
221 internal_other::validate_mcols(path, "column_annotations", NC, options);
222 internal_other::validate_metadata(path, "other_annotations", options);
223}
224
231inline size_t height(const std::filesystem::path& path, [[maybe_unused]] const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
232 // Assume it's all valid already.
233 auto handle = ritsuko::hdf5::open_file(path / "basic_columns.h5");
234 auto ghandle = handle.openGroup("data_frame");
235 return ritsuko::hdf5::load_scalar_numeric_attribute<uint64_t>(ghandle.openAttribute("row-count"));
236}
237
244inline std::vector<size_t> dimensions(const std::filesystem::path& path, [[maybe_unused]] const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
245 // Assume it's all valid already.
246 auto handle = ritsuko::hdf5::open_file(path / "basic_columns.h5");
247 auto ghandle = handle.openGroup("data_frame");
248 std::vector<size_t> output(2);
249 output[0] = ritsuko::hdf5::load_scalar_numeric_attribute<uint64_t>(ghandle.openAttribute("row-count"));
250 output[1] = ritsuko::hdf5::get_1d_length(ghandle.openDataSet("column_names"), false);
251 return output;
252}
253
254}
255
256}
257
258#endif
takane validation functions.
Definition _derived_from.hpp:15
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition _height.hpp:88
ObjectMetadata read_object_metadata(const std::filesystem::path &path)
Definition utils_public.hpp:74
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition _validate.hpp:107
std::vector< size_t > dimensions(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition _dimensions.hpp:69
Object metadata, including the type and other fields.
Definition utils_public.hpp:26
std::unordered_map< std::string, std::shared_ptr< millijson::Base > > other
Definition utils_public.hpp:35
Validation options.
Definition utils_public.hpp:94
Exported utilities.