1#ifndef TAKANE_HDF5_FRAME_HPP
2#define TAKANE_HDF5_FRAME_HPP
5#include "ritsuko/ritsuko.hpp"
6#include "ritsuko/hdf5/hdf5.hpp"
13#include <unordered_set>
16#include "utils_string.hpp"
17#include "utils_factor.hpp"
18#include "utils_other.hpp"
19#include "utils_json.hpp"
// Forward declaration only: no body for this overload is attached here.
// Takes a filesystem path, an ObjectMetadata, and a mutable Options reference.
// NOTE(review): presumably the generic dispatching validator used for nested objects - confirm against its definition.
31void validate(
const std::filesystem::path&,
const ObjectMetadata&, Options& options);
// Forward declaration only: same parameter list as validate() above, returning a size_t.
// NOTE(review): presumably the generic dispatching height query - confirm against its definition.
32size_t height(
const std::filesystem::path&,
const ObjectMetadata&, Options& options);
42inline void validate_row_names(
const H5::Group& handle, hsize_t num_rows,
const Options& options)
try {
43 if (handle.childObjType(
"row_names") != H5O_TYPE_DATASET) {
44 throw std::runtime_error(
"expected a 'row_names' dataset when row names are present");
47 auto rnhandle = handle.openDataSet(
"row_names");
48 if (!ritsuko::hdf5::is_utf8_string(rnhandle)) {
49 throw std::runtime_error(
"expected a datatype for 'row_names' that can be represented by a UTF-8 encoded string");
52 if (ritsuko::hdf5::get_1d_length(rnhandle.getSpace(),
false) != num_rows) {
53 throw std::runtime_error(
"expected 'row_names' to have length equal to the number of rows");
55 ritsuko::hdf5::validate_1d_string_dataset(rnhandle, num_rows, options.hdf5_buffer_size);
56}
catch (std::exception& e) {
57 throw std::runtime_error(
"failed to validate the row names for '" + ritsuko::hdf5::get_name(handle) +
"'; " + std::string(e.what()));
60inline hsize_t validate_column_names(
const H5::Group& ghandle,
const Options& options)
try {
61 auto cnhandle = ritsuko::hdf5::open_dataset(ghandle,
"column_names");
62 if (!ritsuko::hdf5::is_utf8_string(cnhandle)) {
63 throw std::runtime_error(
"expected a datatype for 'column_names' that can be represented by a UTF-8 encoded string");
66 auto num_cols = ritsuko::hdf5::get_1d_length(cnhandle.getSpace(),
false);
68 std::unordered_set<std::string> column_names;
69 ritsuko::hdf5::Stream1dStringDataset stream(&cnhandle, num_cols, options.hdf5_buffer_size);
70 for (
size_t c = 0; c < num_cols; ++c, stream.next()) {
71 auto x = stream.steal();
73 throw std::runtime_error(
"column names should not be empty strings");
75 if (column_names.find(x) != column_names.end()) {
76 throw std::runtime_error(
"duplicated column name '" + x +
"'");
78 column_names.insert(std::move(x));
83}
catch (std::exception& e) {
84 throw std::runtime_error(
"failed to validate the column names for '" + ritsuko::hdf5::get_name(ghandle) +
"'; " + std::string(e.what()));
87inline void validate_column(
const H5::Group& dhandle,
const std::string& dset_name, hsize_t num_rows,
const Options& options)
try {
88 auto dtype = dhandle.childObjType(dset_name);
89 if (dtype == H5O_TYPE_GROUP) {
90 auto fhandle = dhandle.openGroup(dset_name);
91 auto type = ritsuko::hdf5::open_and_load_scalar_string_attribute(fhandle,
"type");
92 if (type !=
"factor") {
93 throw std::runtime_error(
"expected HDF5 groups to have a 'type' attribute set to 'factor'");
96 internal_factor::check_ordered_attribute(fhandle);
98 auto num_levels = internal_factor::validate_factor_levels(fhandle,
"levels", options.hdf5_buffer_size);
99 auto num_codes = internal_factor::validate_factor_codes(fhandle,
"codes", num_levels, options.hdf5_buffer_size);
100 if (num_codes != num_rows) {
101 throw std::runtime_error(
"expected column to have length equal to the number of rows");
104 }
else if (dtype == H5O_TYPE_DATASET) {
105 auto xhandle = dhandle.openDataSet(dset_name);
106 if (num_rows != ritsuko::hdf5::get_1d_length(xhandle.getSpace(),
false)) {
107 throw std::runtime_error(
"expected column to have length equal to the number of rows");
110 const char* missing_attr_name =
"missing-value-placeholder";
112 auto type = ritsuko::hdf5::open_and_load_scalar_string_attribute(xhandle,
"type");
113 if (type ==
"string") {
114 if (!ritsuko::hdf5::is_utf8_string(xhandle)) {
115 throw std::runtime_error(
"expected a datatype for '" + dset_name +
"' that can be represented by a UTF-8 encoded string");
117 auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(xhandle, missing_attr_name);
118 std::string format = internal_string::fetch_format_attribute(xhandle);
119 internal_string::validate_string_format(xhandle, num_rows, format, missingness.first, missingness.second, options.hdf5_buffer_size);
122 if (type ==
"integer") {
123 if (ritsuko::hdf5::exceeds_integer_limit(xhandle, 32,
true)) {
124 throw std::runtime_error(
"expected integer column to use a datatype that is a subset of a 32-bit signed integer");
126 }
else if (type ==
"boolean") {
127 if (ritsuko::hdf5::exceeds_integer_limit(xhandle, 32,
true)) {
128 throw std::runtime_error(
"expected boolean column to use a datatype that is a subset of a 32-bit signed integer");
130 }
else if (type ==
"number") {
131 if (ritsuko::hdf5::exceeds_float_limit(xhandle, 64)) {
132 throw std::runtime_error(
"expected number column to use a datatype that is a subset of a 64-bit float");
135 throw std::runtime_error(
"unknown column type '" + type +
"'");
138 if (xhandle.attrExists(missing_attr_name)) {
139 auto ahandle = xhandle.openAttribute(missing_attr_name);
140 ritsuko::hdf5::check_missing_placeholder_attribute(xhandle, ahandle);
145 throw std::runtime_error(
"unknown HDF5 object type");
148}
catch (std::exception& e) {
149 throw std::runtime_error(
"failed to validate column at '" + ritsuko::hdf5::get_name(dhandle) +
"/" + dset_name +
"'; " + std::string(e.what()));
// NOTE(review): the enclosing function signature is not visible in this listing; the statements
// below use `path`, `metadata` and `options`, so this is presumably the data frame validate() body - confirm.
// Fetch the version string recorded for the "data_frame" type and parse it; only major version 1 is accepted.
161 const auto& vstring = internal_json::extract_version_for_type(metadata.
other,
"data_frame");
162 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(),
true);
163 if (version.major != 1) {
164 throw std::runtime_error(
"unsupported version '" + vstring +
"'");
// Open "basic_columns.h5" inside `path` and its top-level "data_frame" group.
167 auto handle = ritsuko::hdf5::open_file(path /
"basic_columns.h5");
168 auto ghandle = ritsuko::hdf5::open_group(handle,
"data_frame");
// The scalar "row-count" attribute is checked against the 64-bit limit and then read as a uint64_t.
171 auto attr = ritsuko::hdf5::open_scalar_attribute(ghandle,
"row-count");
172 if (ritsuko::hdf5::exceeds_integer_limit(attr, 64,
false)) {
173 throw std::runtime_error(
"'row-count' attribute should have a datatype that fits in a 64-bit unsigned integer");
175 uint64_t num_rows = 0;
176 attr.read(H5::PredType::NATIVE_UINT64, &num_rows);
// Row names are optional: they are validated only when a "row_names" child exists.
179 if (ghandle.exists(
"row_names")) {
180 validate_row_names(ghandle, num_rows, options);
// NC is the number of column names reported by validate_column_names().
182 size_t NC = validate_column_names(ghandle, options);
185 auto dhandle = ritsuko::hdf5::open_group(ghandle,
"data");
// NOTE(review): num_basic is compared against the number of objects in "data" further down;
// the statement that increments it is not visible in this listing - confirm.
187 hsize_t num_basic = 0;
188 auto other_dir = path /
"other_columns";
// Visit each column by index; the child name is the decimal string of the index.
190 for (
size_t c = 0; c < NC; ++c) {
191 std::string dset_name = std::to_string(c);
// A column absent from the "data" group is looked up under other_columns/<index>.
// NOTE(review): the statements that load and validate that object are elided here; only
// the error handling around them is visible.
193 if (!dhandle.exists(dset_name)) {
194 auto opath = other_dir / dset_name;
198 }
catch (std::exception& e) {
199 throw std::runtime_error(
"failed to validate 'other' column " + dset_name +
"; " + std::string(e.what()));
202 throw std::runtime_error(
"height of column " + dset_name +
" of class '" + ometa.type +
"' is not the same as the number of rows");
// NOTE(review): presumably the branch for columns that do exist in the "data" group - confirm.
206 validate_column(dhandle, dset_name, num_rows, options);
// If the "other_columns" directory exists, its entry count must equal NC - num_basic.
211 if (std::filesystem::exists(other_dir)) {
212 if (internal_other::count_directory_entries(other_dir) != NC - num_basic) {
213 throw std::runtime_error(
"more objects than expected inside the 'other_columns' directory");
// The "data" group must contain exactly num_basic objects.
217 if (num_basic != dhandle.getNumObjs()) {
218 throw std::runtime_error(
"more objects present in the 'data_frame/data' group than expected");
// Finally validate "column_annotations" (passed NC) and "other_annotations" under `path`.
221 internal_other::validate_mcols(path,
"column_annotations", NC, options);
222 internal_other::validate_metadata(path,
"other_annotations", options);
// NOTE(review): the enclosing function signature is not visible in this listing - presumably the data frame height().
// Opens "basic_columns.h5" under `path` and returns the "row-count" attribute of the "data_frame" group, loaded as a uint64_t.
233 auto handle = ritsuko::hdf5::open_file(path /
"basic_columns.h5");
234 auto ghandle = handle.openGroup(
"data_frame");
235 return ritsuko::hdf5::load_scalar_numeric_attribute<uint64_t>(ghandle.openAttribute(
"row-count"));
// NOTE(review): the enclosing function signature and the return statement are not visible in this listing - presumably the data frame dimensions().
// Opens "basic_columns.h5" under `path` and fills a two-element vector from the "data_frame" group.
246 auto handle = ritsuko::hdf5::open_file(path /
"basic_columns.h5");
247 auto ghandle = handle.openGroup(
"data_frame");
248 std::vector<size_t> output(2);
// Element 0 is the "row-count" attribute, loaded as a uint64_t.
249 output[0] = ritsuko::hdf5::load_scalar_numeric_attribute<uint64_t>(ghandle.openAttribute(
"row-count"));
// Element 1 is the 1-d length of the "column_names" dataset.
250 output[1] = ritsuko::hdf5::get_1d_length(ghandle.openDataSet(
"column_names"),
false);
takane validation functions. (Definition: _derived_from.hpp:15)
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options) (Definition: _height.hpp:88)
ObjectMetadata read_object_metadata(const std::filesystem::path &path) (Definition: utils_public.hpp:74)
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options) (Definition: _validate.hpp:107)
std::vector< size_t > dimensions(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options) (Definition: _dimensions.hpp:69)
Validation options. (Definition: utils_public.hpp:94)