1#ifndef UZUKI2_PARSE_HPP
2#define UZUKI2_PARSE_HPP
11#include <unordered_set>
17#include "ExternalTracker.hpp"
21#include "ritsuko/ritsuko.hpp"
22#include "ritsuko/hdf5/hdf5.hpp"
23#include "ritsuko/hdf5/vls/vls.hpp"
45inline H5::DataSet check_scalar_dataset(
const H5::Group& handle,
const char* name) {
46 if (handle.childObjType(name) != H5O_TYPE_DATASET) {
47 throw std::runtime_error(
"expected '" + std::string(name) +
"' to be a dataset");
49 auto dhandle = handle.openDataSet(name);
50 if (!ritsuko::hdf5::is_scalar(dhandle)) {
51 throw std::runtime_error(
"expected '" + std::string(name) +
"'to be a scalar dataset");
// Load an integer dataset from `handle` into `ptr`, reading values as 32-bit signed integers.
// Any std::exception raised while loading is re-thrown as a std::runtime_error that names the dataset.
// NOTE(review): this listing omits source lines wherever the embedded numbering jumps
// (e.g. 59 -> 62, 77 -> 87, 92 -> 97); the comments below describe only the visible statements.
56template<
class Host_,
class Function_>
57void parse_integer_like(
const H5::DataSet& handle, Host_* ptr,
bool is_scalar, Function_ check,
const Version& version, hsize_t buffer_size)
try {
// Refuse datasets whose stored type cannot be held in a 32-bit signed integer.
58 if (ritsuko::hdf5::exceeds_integer_limit(handle, 32,
true)) {
59 throw std::runtime_error(
"dataset cannot be represented by 32-bit signed integers");
62 bool has_missing =
false;
// Default placeholder: the smallest 32-bit signed integer (-2^31).
63 int32_t missing_value = -2147483648;
64 if (version.equals(1, 0)) {
// A "missing-value-placeholder" attribute on the dataset marks the presence of missing values;
// its value is read as a 32-bit integer and replaces the default placeholder.
67 const char* placeholder_name =
"missing-value-placeholder";
68 has_missing = handle.attrExists(placeholder_name);
70 auto attr = handle.openAttribute(placeholder_name);
71 ritsuko::hdf5::check_numeric_missing_placeholder_attribute(handle, attr, version.lt(1, 2));
72 attr.read(H5::PredType::NATIVE_INT32, &missing_value);
// Per-element setter: a value equal to the placeholder is handled separately from ordinary values.
76 auto set = [&](hsize_t i, int32_t x) ->
void {
77 if (has_missing && x == missing_value) {
87 handle.read(&value, H5::PredType::NATIVE_INT32);
// The number of values to stream is taken from the host object, not from the dataset.
90 hsize_t full_length = ptr->size();
91 ritsuko::hdf5::Stream1dNumericDataset<int32_t> stream(&handle, full_length, buffer_size);
92 for (hsize_t i = 0; i < full_length; ++i, stream.next()) {
97}
// Prefix the original message with the dataset's name.
catch (std::exception& e) {
98 throw std::runtime_error(
"failed to load integer dataset at '" + ritsuko::hdf5::get_name(handle) +
"'; " + std::string(e.what()));
// Load a string dataset from `handle` into `ptr`.
// Any std::exception raised while loading is re-thrown as a std::runtime_error that names the dataset.
// NOTE(review): this listing omits source lines wherever the embedded numbering jumps
// (e.g. 104 -> 107, 109 -> 113, 113 -> 118); the comments below describe only the visible statements.
101template<
class Host_,
class Function_>
102void parse_string_like(
const H5::DataSet& handle, Host_* ptr,
bool is_scalar, Function_ check, hsize_t buffer_size)
try {
// Only datatypes that can be represented as UTF-8 strings are accepted.
103 if (!ritsuko::hdf5::is_utf8_string(handle)) {
104 throw std::runtime_error(
"expected a datatype that can be represented by a UTF-8 encoded string");
// Optional placeholder string, loaded from the "missing-value-placeholder" attribute.
107 auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(handle,
"missing-value-placeholder");
// Per-element setter: a string equal to the placeholder is handled separately; other strings are
// moved into the host at position `i`.
108 auto set = [&](hsize_t i, std::string x) ->
void {
109 if (missingness.has_value() && x == *missingness) {
113 ptr->set(i, std::move(x));
// A scalar dataset is loaded in one call and stored at index 0.
118 auto x = ritsuko::hdf5::load_scalar_string_dataset(handle);
119 set(0, std::move(x));
// Otherwise the dataset is streamed; the number of strings is taken from the host object.
121 hsize_t full_length = ptr->size();
122 ritsuko::hdf5::Stream1dStringDataset stream(&handle, full_length, buffer_size);
123 for (hsize_t i = 0; i < full_length; ++i, stream.next()) {
124 set(i, stream.steal());
128}
// Prefix the original message with the dataset's name.
catch (std::exception& e) {
129 throw std::runtime_error(
"failed to load string dataset at '" + ritsuko::hdf5::get_name(handle) +
"'; " + std::string(e.what()));
// Load a numeric dataset from `handle` into `ptr`, reading values as doubles.
// Any std::exception raised while loading is re-thrown as a std::runtime_error that names the dataset.
// NOTE(review): this listing omits source lines wherever the embedded numbering jumps
// (e.g. 136 -> 139, 140 -> 144, 172 -> 182); the comments below describe only the visible statements.
132template<
class Host_,
class Function_>
133void parse_numbers(
const H5::DataSet& handle, Host_* ptr,
bool is_scalar, Function_ check,
const Version& version, hsize_t buffer_size)
try {
// Versions before 1.3 require the dataset's type class to be floating-point.
134 if (version.lt(1, 3)) {
135 if (handle.getTypeClass() != H5T_FLOAT) {
136 throw std::runtime_error(
"expected a floating-point dataset");
// Datasets that cannot be held in a 64-bit float are rejected.
139 if (ritsuko::hdf5::exceeds_float_limit(handle, 64)) {
140 throw std::runtime_error(
"dataset cannot be represented by 64-bit floats");
144 bool has_missing =
false;
145 double missing_value = 0;
146 if (version.equals(1, 0)) {
148 missing_value = ritsuko::r_missing_value();
// A "missing-value-placeholder" attribute on the dataset marks the presence of missing values;
// its value is read as a double.
150 const char* placeholder_name =
"missing-value-placeholder";
151 has_missing = handle.attrExists(placeholder_name);
153 auto attr = handle.openAttribute(placeholder_name);
154 ritsuko::hdf5::check_numeric_missing_placeholder_attribute(handle, attr, version.lt(1, 2));
155 attr.read(H5::PredType::NATIVE_DOUBLE, &missing_value);
// Placeholder matching has three modes:
//  - versions before 1.3: ritsuko::are_floats_identical() decides;
//  - otherwise, if the placeholder is NaN: any NaN value matches;
//  - otherwise: ordinary == comparison.
159 bool should_compare_nan = version.lt(1, 3);
160 bool is_placeholder_nan = std::isnan(missing_value);
161 auto is_missing_value = [&](
double val) ->
bool {
162 if (should_compare_nan) {
163 return ritsuko::are_floats_identical(&val, &missing_value);
164 }
else if (is_placeholder_nan) {
165 return std::isnan(val);
167 return val == missing_value;
// Per-element setter: a value matching the placeholder is handled separately from ordinary values.
171 auto set = [&](hsize_t i,
double x) ->
void {
172 if (has_missing && is_missing_value(x)) {
182 handle.read(&val, H5::PredType::NATIVE_DOUBLE);
// The number of values to stream is taken from the host object, not from the dataset.
185 hsize_t full_length = ptr->size();
186 ritsuko::hdf5::Stream1dNumericDataset<double> stream(&handle, full_length, buffer_size);
187 for (hsize_t i = 0; i < full_length; ++i, stream.next()) {
188 set(i, stream.get());
192}
// Prefix the original message with the dataset's name.
catch (std::exception& e) {
193 throw std::runtime_error(
"failed to load floating-point dataset at '" + ritsuko::hdf5::get_name(handle) +
"'; " + std::string(e.what()));
// Read the "names" dataset inside `handle` and store each name on `ptr` via set_name().
// Any std::exception raised while loading is re-thrown as a std::runtime_error that names the group.
// NOTE(review): the template header that declares Host_ (listing line 196) and the condition guarding
// the length-mismatch error (listing line 209) are not present in this listing.
197void extract_names(
const H5::Group& handle, Host_* ptr, hsize_t buffer_size)
try {
// "names" must be a dataset ...
198 if (handle.childObjType(
"names") != H5O_TYPE_DATASET) {
199 throw std::runtime_error(
"expected a dataset");
// ... whose datatype can be represented as UTF-8 strings.
202 auto nhandle = handle.openDataSet(
"names");
203 if (!ritsuko::hdf5::is_utf8_string(nhandle)) {
204 throw std::runtime_error(
"expected a datatype that can be represented by a UTF-8 encoded string");
// `len` is the host object's length, `nlen` the length of the "names" dataset.
207 size_t len = ptr->size();
208 size_t nlen = ritsuko::hdf5::get_1d_length(nhandle.getSpace(),
false);
210 throw std::runtime_error(
"number of names should be equal to the object length");
// Stream the names and hand each one to the host in order.
213 ritsuko::hdf5::Stream1dStringDataset stream(&nhandle, nlen, buffer_size);
214 for (
size_t i = 0; i < nlen; ++i, stream.next()) {
215 ptr->set_name(i, stream.steal());
217}
// Prefix the original message with the group's name.
catch (std::exception& e) {
218 throw std::runtime_error(
"failed to load names at '" + ritsuko::hdf5::get_name(handle) +
"'; " + std::string(e.what()));
// Parse the object stored in the group `handle` into a Base-derived object created through
// Provisioner_, recursing into list elements. The "uzuki_object" attribute selects the branch:
// "list", "vector", "nothing" or "external"; anything else is an error.
// Any std::exception raised while parsing is re-thrown as a std::runtime_error that names the group.
// NOTE(review): this listing omits source lines wherever the embedded numbering jumps
// (e.g. 232 -> 236, 262 -> 268, 446 -> 452); the comments below describe only the visible statements.
221template<
class Provisioner_,
class Externals_>
222std::shared_ptr<Base> parse_inner(
const H5::Group& handle, Externals_& ext,
const Version& version, hsize_t buffer_size)
try {
224 auto object_type = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle,
"uzuki_object");
225 std::shared_ptr<Base> output;
// --- Lists: the number of children of the "data" group gives the list length. ---
227 if (object_type ==
"list") {
228 auto dhandle = ritsuko::hdf5::open_group(handle,
"data");
229 size_t len = dhandle.getNumObjs();
231 bool named = handle.exists(
"names");
232 auto lptr = Provisioner_::new_List(len, named);
// Each element is a subgroup of "data" named by its decimal index, parsed recursively.
236 for (
size_t i = 0; i < len; ++i) {
237 auto istr = std::to_string(i);
238 auto lhandle = ritsuko::hdf5::open_group(dhandle, istr.c_str());
239 lptr->set(i, parse_inner<Provisioner_>(lhandle, ext, version, buffer_size));
241 }
catch (std::exception& e) {
242 throw std::runtime_error(
"failed to parse list contents in 'data'; " + std::string(e.what()));
246 extract_names(handle, lptr, buffer_size);
// --- Vectors: the "uzuki_type" attribute selects the element type. ---
249 }
else if (object_type ==
"vector") {
250 auto vector_type = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle,
"uzuki_type");
252 auto dhandle = ritsuko::hdf5::open_dataset(handle,
"data");
// A reported length of 0 is treated as a scalar dataset.
// NOTE(review): presumably the `true` argument lets get_1d_length accept scalar dataspaces - confirm against ritsuko.
253 size_t len = ritsuko::hdf5::get_1d_length(dhandle.getSpace(),
true);
254 bool is_scalar = (len == 0);
259 bool named = handle.exists(
"names");
// Integer vectors: no extra per-value validation (the check callback is a no-op).
261 if (vector_type ==
"integer") {
262 auto iptr = Provisioner_::new_Integer(len, named, is_scalar);
268 [](int32_t) ->
void {},
// Boolean vectors: every value must be 0 or 1.
273 }
else if (vector_type ==
"boolean") {
274 auto bptr = Provisioner_::new_Boolean(len, named, is_scalar);
280 [&](int32_t x) ->
void {
281 if (x != 0 && x != 1) {
282 throw std::runtime_error(
"boolean values should be 0 or 1");
// Factors; the "ordered" vector type is additionally accepted for version 1.0.
289 }
else if (vector_type ==
"factor" || (version.equals(1, 0) && vector_type ==
"ordered")) {
290 auto levhandle = ritsuko::hdf5::open_dataset(handle,
"levels");
291 if (!ritsuko::hdf5::is_utf8_string(levhandle)) {
292 throw std::runtime_error(
"expected a datatype that can be represented by a UTF-8 encoded string");
// NOTE(review): the length is stored in an int32_t here, whereas other call sites of get_1d_length
// store it in size_t/hsize_t - confirm that very large level counts cannot overflow.
295 int32_t levlen = ritsuko::hdf5::get_1d_length(levhandle.getSpace(),
false);
296 bool ordered =
false;
297 if (vector_type ==
"ordered") {
// Otherwise an optional scalar "ordered" dataset supplies the flag; any value > 0 means ordered.
299 }
else if (handle.exists(
"ordered")) {
300 auto ohandle = check_scalar_dataset(handle,
"ordered");
301 if (ritsuko::hdf5::exceeds_integer_limit(ohandle, 32,
true)) {
302 throw std::runtime_error(
"'ordered' value cannot be represented by a 32-bit integer");
304 int32_t tmp_ordered = 0;
305 ohandle.read(&tmp_ordered, H5::PredType::NATIVE_INT32);
306 ordered = tmp_ordered > 0;
309 auto fptr = Provisioner_::new_Factor(len, named, is_scalar, levlen, ordered);
// Factor codes must lie in [0, levlen).
315 [&](int32_t x) ->
void {
316 if (x < 0 || x >= levlen) {
317 throw std::runtime_error(
"factor codes should be non-negative and less than the number of levels");
// Levels are streamed in order and must be unique; `present` records the ones already seen.
324 std::unordered_set<std::string> present;
325 ritsuko::hdf5::Stream1dStringDataset stream(&levhandle, levlen, buffer_size);
326 for (int32_t i = 0; i < levlen; ++i, stream.next()) {
327 auto x = stream.steal();
328 if (present.find(x) != present.end()) {
329 throw std::runtime_error(
"levels should be unique");
331 fptr->set_level(i, x);
332 present.insert(std::move(x));
// "vls" string vectors are only recognized from version 1.4 onwards; the string bytes live in a
// separate "heap" dataset addressed by (offset, length) pointers stored in "data".
335 }
else if (vector_type ==
"vls" && !version.lt(1, 4)) {
336 ritsuko::hdf5::vls::validate_pointer_datatype(dhandle.getCompType(), 64, 64);
337 auto hhandle = ritsuko::hdf5::vls::open_heap(handle,
"heap");
338 auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(dhandle,
"missing-value-placeholder");
340 auto ptr = Provisioner_::new_String(len, named, is_scalar, StringVector::NONE);
// Single-pointer path: read one pointer, then read `length` bytes starting at `offset` from the heap.
344 ritsuko::hdf5::vls::Pointer<uint64_t, uint64_t> vlsptr;
345 dhandle.read(&vlsptr, ritsuko::hdf5::vls::define_pointer_datatype<uint64_t, uint64_t>());
347 hsize_t len = vlsptr.length;
348 H5::DataSpace mspace(1, &len);
349 hsize_t offset = vlsptr.offset;
350 hsize_t hlen = ritsuko::hdf5::get_1d_length(hhandle,
false);
351 H5::DataSpace dspace(1, &hlen);
352 dspace.selectHyperslab(H5S_SELECT_SET, &len, &offset);
354 std::vector<uint8_t> buffer(vlsptr.length);
355 hhandle.read(buffer.data(), H5::PredType::NATIVE_UINT8, mspace, dspace);
// The string's extent within the buffer is determined by find_string_length().
356 auto cptr =
reinterpret_cast<const char*
>(buffer.data());
357 std::string str(cptr, cptr + ritsuko::hdf5::find_string_length(cptr, vlsptr.length));
359 if (missingness.has_value() && str == *missingness) {
362 ptr->set(0, std::move(str));
// Streaming path: one string per pointer, with strings equal to the placeholder handled separately.
366 ritsuko::hdf5::vls::Stream1dArray<uint64_t, uint64_t> stream(&dhandle, &hhandle, len, buffer_size);
367 for (hsize_t i = 0; i < len; ++i, stream.next()) {
368 auto x = stream.steal();
369 if (missingness.has_value() && x == *missingness) {
372 ptr->set(i, std::move(x));
// Plain strings; the "date" and "date-time" vector types are additionally accepted for version 1.0.
377 }
else if (vector_type ==
"string" || (version.equals(1, 0) && (vector_type ==
"date" || vector_type ==
"date-time"))) {
// Version 1.0 derives the format from the vector type ...
379 if (version.equals(1, 0)) {
380 if (vector_type ==
"date") {
381 format = StringVector::DATE;
382 }
else if (vector_type ==
"date-time") {
383 format = StringVector::DATETIME;
// ... otherwise it comes from an optional scalar UTF-8 "format" dataset.
386 }
else if (handle.exists(
"format")) {
387 auto fhandle = check_scalar_dataset(handle,
"format");
388 if (!ritsuko::hdf5::is_utf8_string(fhandle)) {
389 throw std::runtime_error(
"expected a datatype that can be represented by a UTF-8 encoded string");
391 auto x = ritsuko::hdf5::load_scalar_string_dataset(fhandle);
393 format = StringVector::DATE;
394 }
else if (x ==
"date-time") {
395 format = StringVector::DATETIME;
397 throw std::runtime_error(
"unsupported format '" + x +
"'");
401 auto sptr = Provisioner_::new_String(len, named, is_scalar, format);
// The per-string check depends on the format: none, date (ritsuko::is_date) or
// date-time (ritsuko::is_rfc3339).
403 if (format == StringVector::NONE) {
408 [](
const std::string&) ->
void {},
412 }
else if (format == StringVector::DATE) {
417 [&](
const std::string& x) ->
void {
418 if (!ritsuko::is_date(x.c_str(), x.size())) {
419 throw std::runtime_error(
"dates should follow YYYY-MM-DD formatting");
425 }
else if (format == StringVector::DATETIME) {
430 [&](
const std::string& x) ->
void {
431 if (!ritsuko::is_rfc3339(x.c_str(), x.size())) {
432 throw std::runtime_error(
"date-times should follow the Internet Date/Time format");
// Number vectors: no extra per-value validation (the check callback is a no-op).
439 }
else if (vector_type ==
"number") {
440 auto dptr = Provisioner_::new_Number(len, named, is_scalar);
446 [](
double) ->
void {},
452 throw std::runtime_error(
"unknown vector type '" + vector_type +
"'");
// Names are extracted through the Vector interface of the object held in `output`.
456 auto vptr =
static_cast<Vector*
>(output.get());
457 extract_names(handle, vptr, buffer_size);
// --- Nothing ---
460 }
else if (object_type ==
"nothing") {
461 output.reset(Provisioner_::new_Nothing());
// --- External references: "index" is read as a 32-bit signed integer and must lie in [0, ext.size()). ---
463 }
else if (object_type ==
"external") {
464 auto ihandle = ritsuko::hdf5::open_dataset(handle,
"index");
465 if (ritsuko::hdf5::exceeds_integer_limit(ihandle, 32,
true)) {
466 throw std::runtime_error(
"external index at 'index' cannot be represented by a 32-bit signed integer");
469 auto ispace = ihandle.getSpace();
470 int idims = ispace.getSimpleExtentNdims();
472 throw std::runtime_error(
"expected scalar dataset at 'index'");
476 ihandle.read(&idx, H5::PredType::NATIVE_INT32);
477 if (idx < 0 ||
static_cast<size_t>(idx) >= ext.size()) {
478 throw std::runtime_error(
"external index out of range at 'index'");
481 output.reset(Provisioner_::new_External(ext.get(idx)));
484 throw std::runtime_error(
"unknown uzuki2 object type '" + object_type +
"'");
488}
// Prefix the original message with the group's name.
catch (std::exception& e) {
489 throw std::runtime_error(
"failed to load object at '" + ritsuko::hdf5::get_name(handle) +
"'; " + std::string(e.what()));
// Parse the object stored in an HDF5 group and return it as a ParsedList together with its version.
// NOTE(review): the signature line (listing line 557) is not present in this listing, and further
// lines are omitted wherever the embedded numbering jumps; only the visible statements are described.
556template<
class Provisioner_,
class Externals_>
// If the group has a "uzuki_version" attribute, its major/minor numbers are stored in `version`.
559 if (handle.attrExists(
"uzuki_version")) {
560 auto ver_str = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle,
"uzuki_version");
561 auto vraw = ritsuko::parse_version_string(ver_str.c_str(), ver_str.size(),
true);
562 version.major = vraw.major;
563 version.minor = vraw.minor;
// The caller's externals are moved into an ExternalTracker, which is what parse_inner() receives.
566 ExternalTracker etrack(std::move(ext));
567 auto ptr = parse_inner<Provisioner_>(handle, etrack, version, options.
buffer_size);
570 throw std::runtime_error(
"top-level object should represent an R list");
574 return ParsedList(std::move(ptr), std::move(version));
// File-based overload: opens `file` read-only, opens the group `name` inside it, and forwards to the
// group-based parse() with the same externals and options.
// NOTE(review): the signature line is not present in this listing.
593template<
class Provisioner_,
class Externals_>
595 H5::H5File handle(file, H5F_ACC_RDONLY);
596 return parse<Provisioner_>(ritsuko::hdf5::open_group(handle, name.c_str()), std::move(ext), options);
// Validation entry point taking an already-open HDF5 group, the number of external references and
// the parsing options.
// NOTE(review): the body is not present in this listing - confirm its behavior against the upstream header.
607inline void validate(
const H5::Group& handle,
int num_external,
const Options& options) {
// Validation entry point taking a file path and the name of a group inside that file, plus the
// number of external references and the parsing options.
// NOTE(review): the body is not present in this listing - confirm its behavior against the upstream header.
620inline void validate(
const std::string& file,
const std::string& name,
int num_external,
const Options& options) {
Dummy classes for parsing without storing the results.
Class to hold the parsed list.
Dummy class satisfying the Externals_ interface of hdf5::parse().
Definition Dummy.hpp:130
Format
Definition interfaces.hpp:158
Defines the interfaces to use in HDF5 parsing.
void validate(const H5::Group &handle, int num_external, const Options &options)
Definition parse_hdf5.hpp:607
ParsedList parse(const H5::Group &handle, Externals_ ext, const Options &options)
Definition parse_hdf5.hpp:557
Parse an R list from an HDF5 or JSON file.
Definition parse_json.hpp:28
Results of parsing a list from file.
Definition ParsedList.hpp:19
Options for HDF5 file parsing.
Definition parse_hdf5.hpp:499
hsize_t buffer_size
Definition parse_hdf5.hpp:503
bool strict_list
Definition parse_hdf5.hpp:508