1#ifndef TAKANE_HDF5_FRAME_HPP
2#define TAKANE_HDF5_FRAME_HPP
5#include "ritsuko/ritsuko.hpp"
6#include "ritsuko/hdf5/hdf5.hpp"
7#include "ritsuko/hdf5/vls/vls.hpp"
14#include <unordered_set>
17#include "utils_string.hpp"
18#include "utils_factor.hpp"
19#include "utils_other.hpp"
20#include "utils_json.hpp"
32void validate(
const std::filesystem::path&,
const ObjectMetadata&, Options& options);
33size_t height(
const std::filesystem::path&,
const ObjectMetadata&, Options& options);
43inline void validate_row_names(
const H5::Group& handle, hsize_t num_rows,
const Options& options)
try {
44 if (handle.childObjType(
"row_names") != H5O_TYPE_DATASET) {
45 throw std::runtime_error(
"expected a 'row_names' dataset when row names are present");
48 auto rnhandle = handle.openDataSet(
"row_names");
49 if (!ritsuko::hdf5::is_utf8_string(rnhandle)) {
50 throw std::runtime_error(
"expected a datatype for 'row_names' that can be represented by a UTF-8 encoded string");
53 if (ritsuko::hdf5::get_1d_length(rnhandle.getSpace(),
false) != num_rows) {
54 throw std::runtime_error(
"expected 'row_names' to have length equal to the number of rows");
56 ritsuko::hdf5::validate_1d_string_dataset(rnhandle, num_rows, options.hdf5_buffer_size);
57}
catch (std::exception& e) {
58 throw std::runtime_error(
"failed to validate the row names for '" + ritsuko::hdf5::get_name(handle) +
"'; " + std::string(e.what()));
61inline hsize_t validate_column_names(
const H5::Group& ghandle,
const Options& options)
try {
62 auto cnhandle = ritsuko::hdf5::open_dataset(ghandle,
"column_names");
63 if (!ritsuko::hdf5::is_utf8_string(cnhandle)) {
64 throw std::runtime_error(
"expected a datatype for 'column_names' that can be represented by a UTF-8 encoded string");
67 auto num_cols = ritsuko::hdf5::get_1d_length(cnhandle.getSpace(),
false);
69 std::unordered_set<std::string> column_names;
70 ritsuko::hdf5::Stream1dStringDataset stream(&cnhandle, num_cols, options.hdf5_buffer_size);
71 for (
size_t c = 0; c < num_cols; ++c, stream.next()) {
72 auto x = stream.steal();
74 throw std::runtime_error(
"column names should not be empty strings");
76 if (column_names.find(x) != column_names.end()) {
77 throw std::runtime_error(
"duplicated column name '" + x +
"'");
79 column_names.insert(std::move(x));
84}
catch (std::exception& e) {
85 throw std::runtime_error(
"failed to validate the column names for '" + ritsuko::hdf5::get_name(ghandle) +
"'; " + std::string(e.what()));
88inline void validate_column(
const H5::Group& dhandle,
const std::string& dset_name, hsize_t num_rows,
const ritsuko::Version& version,
const Options& options)
try {
89 const char* missing_attr_name =
"missing-value-placeholder";
91 auto dtype = dhandle.childObjType(dset_name);
92 if (dtype == H5O_TYPE_GROUP) {
93 auto ghandle = dhandle.openGroup(dset_name);
94 auto type = ritsuko::hdf5::open_and_load_scalar_string_attribute(ghandle,
"type");
96 if (type ==
"factor") {
97 internal_factor::check_ordered_attribute(ghandle);
98 auto num_levels = internal_factor::validate_factor_levels(ghandle,
"levels", options.hdf5_buffer_size);
99 auto num_codes = internal_factor::validate_factor_codes(ghandle,
"codes", num_levels, options.hdf5_buffer_size);
100 if (num_codes != num_rows) {
101 throw std::runtime_error(
"expected column to have length equal to the number of rows");
104 }
else if (type ==
"vls") {
105 if (version.lt(1, 1, 0)) {
106 throw std::runtime_error(
"unsupported type '" + type +
"'");
109 auto phandle = ritsuko::hdf5::vls::open_pointers(ghandle,
"pointers", 64, 64);
110 auto vlen = ritsuko::hdf5::get_1d_length(phandle.getSpace(),
false);
111 if (vlen != num_rows) {
112 throw std::runtime_error(
"expected column to have length equal to the number of rows");
115 auto hhandle = ritsuko::hdf5::vls::open_heap(ghandle,
"heap");
116 auto hlen = ritsuko::hdf5::get_1d_length(hhandle.getSpace(),
false);
117 ritsuko::hdf5::vls::validate_1d_array<uint64_t, uint64_t>(phandle, vlen, hlen, options.hdf5_buffer_size);
119 if (phandle.attrExists(missing_attr_name)) {
120 auto attr = phandle.openAttribute(missing_attr_name);
121 ritsuko::hdf5::check_string_missing_placeholder_attribute(attr);
125 throw std::runtime_error(
"unsupported type '" + type +
"'");
128 }
else if (dtype == H5O_TYPE_DATASET) {
129 auto xhandle = dhandle.openDataSet(dset_name);
130 if (num_rows != ritsuko::hdf5::get_1d_length(xhandle.getSpace(),
false)) {
131 throw std::runtime_error(
"expected column to have length equal to the number of rows");
134 auto type = ritsuko::hdf5::open_and_load_scalar_string_attribute(xhandle,
"type");
135 if (type ==
"string") {
136 if (!ritsuko::hdf5::is_utf8_string(xhandle)) {
137 throw std::runtime_error(
"expected a datatype for '" + dset_name +
"' that can be represented by a UTF-8 encoded string");
139 auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(xhandle, missing_attr_name);
140 std::string format = internal_string::fetch_format_attribute(xhandle);
141 internal_string::validate_string_format(xhandle, num_rows, format, missingness, options.hdf5_buffer_size);
144 if (type ==
"integer") {
145 if (ritsuko::hdf5::exceeds_integer_limit(xhandle, 32,
true)) {
146 throw std::runtime_error(
"expected integer column to use a datatype that is a subset of a 32-bit signed integer");
148 }
else if (type ==
"boolean") {
149 if (ritsuko::hdf5::exceeds_integer_limit(xhandle, 32,
true)) {
150 throw std::runtime_error(
"expected boolean column to use a datatype that is a subset of a 32-bit signed integer");
152 }
else if (type ==
"number") {
153 if (ritsuko::hdf5::exceeds_float_limit(xhandle, 64)) {
154 throw std::runtime_error(
"expected number column to use a datatype that is a subset of a 64-bit float");
157 throw std::runtime_error(
"unknown column type '" + type +
"'");
160 if (xhandle.attrExists(missing_attr_name)) {
161 auto ahandle = xhandle.openAttribute(missing_attr_name);
162 ritsuko::hdf5::check_numeric_missing_placeholder_attribute(xhandle, ahandle);
167 throw std::runtime_error(
"unknown HDF5 object type");
170}
catch (std::exception& e) {
171 throw std::runtime_error(
"failed to validate column at '" + ritsuko::hdf5::get_name(dhandle) +
"/" + dset_name +
"'; " + std::string(e.what()));
183 const std::string type_name =
"data_frame";
184 const auto& vstring = internal_json::extract_version_for_type(metadata.
other, type_name);
185 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(),
true);
186 if (version.major != 1) {
187 throw std::runtime_error(
"unsupported version '" + vstring +
"'");
190 auto handle = ritsuko::hdf5::open_file(path /
"basic_columns.h5");
191 auto ghandle = ritsuko::hdf5::open_group(handle, type_name.c_str());
194 auto attr = ritsuko::hdf5::open_scalar_attribute(ghandle,
"row-count");
195 if (ritsuko::hdf5::exceeds_integer_limit(attr, 64,
false)) {
196 throw std::runtime_error(
"'row-count' attribute should have a datatype that fits in a 64-bit unsigned integer");
198 uint64_t num_rows = 0;
199 attr.read(H5::PredType::NATIVE_UINT64, &num_rows);
202 if (ghandle.exists(
"row_names")) {
203 validate_row_names(ghandle, num_rows, options);
205 size_t NC = validate_column_names(ghandle, options);
208 auto dhandle = ritsuko::hdf5::open_group(ghandle,
"data");
210 hsize_t num_basic = 0;
211 auto other_dir = path /
"other_columns";
213 for (
size_t c = 0; c < NC; ++c) {
214 std::string dset_name = std::to_string(c);
216 if (!dhandle.exists(dset_name)) {
217 auto opath = other_dir / dset_name;
221 }
catch (std::exception& e) {
222 throw std::runtime_error(
"failed to validate 'other' column " + dset_name +
"; " + std::string(e.what()));
225 throw std::runtime_error(
"height of column " + dset_name +
" of class '" + ometa.type +
"' is not the same as the number of rows");
229 validate_column(dhandle, dset_name, num_rows, version, options);
234 if (std::filesystem::exists(other_dir)) {
235 if (internal_other::count_directory_entries(other_dir) != NC - num_basic) {
236 throw std::runtime_error(
"more objects than expected inside the 'other_columns' directory");
240 if (num_basic != dhandle.getNumObjs()) {
241 throw std::runtime_error(
"more objects present in the 'data_frame/data' group than expected");
244 internal_other::validate_mcols(path,
"column_annotations", NC, options);
245 internal_other::validate_metadata(path,
"other_annotations", options);
256 auto handle = ritsuko::hdf5::open_file(path /
"basic_columns.h5");
257 auto ghandle = handle.openGroup(
"data_frame");
258 return ritsuko::hdf5::load_scalar_numeric_attribute<uint64_t>(ghandle.openAttribute(
"row-count"));
269 auto handle = ritsuko::hdf5::open_file(path /
"basic_columns.h5");
270 auto ghandle = handle.openGroup(
"data_frame");
271 std::vector<size_t> output(2);
272 output[0] = ritsuko::hdf5::load_scalar_numeric_attribute<uint64_t>(ghandle.openAttribute(
"row-count"));
273 output[1] = ritsuko::hdf5::get_1d_length(ghandle.openDataSet(
"column_names"),
false);
// Cross-references left over from the generated documentation listing:
//
// - takane validation functions.
//   Definition: _derived_from.hpp:15
// - size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
//   Definition: _height.hpp:88
// - ObjectMetadata read_object_metadata(const std::filesystem::path &path)
//   Definition: utils_public.hpp:74
// - void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
//   Definition: _validate.hpp:107
// - std::vector< size_t > dimensions(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
//   Definition: _dimensions.hpp:69
// - Validation options.
//   Definition: utils_public.hpp:94