1#ifndef UZUKI2_PARSE_HPP
2#define UZUKI2_PARSE_HPP
11#include <unordered_set>
17#include "ExternalTracker.hpp"
19#include "ParsedList.hpp"
21#include "ritsuko/ritsuko.hpp"
22#include "ritsuko/hdf5/hdf5.hpp"
44inline H5::DataSet check_scalar_dataset(
const H5::Group& handle,
const char* name) {
45 if (handle.childObjType(name) != H5O_TYPE_DATASET) {
46 throw std::runtime_error(
"expected '" + std::string(name) +
"' to be a dataset");
48 auto dhandle = handle.openDataSet(name);
49 if (!ritsuko::hdf5::is_scalar(dhandle)) {
50 throw std::runtime_error(
"expected '" + std::string(name) +
"'to be a scalar dataset");
// Stream a 1-D dataset as 32-bit signed integers, with optional handling of a
// missing-value placeholder.
//
// handle:      dataset to read; its name is embedded in the rethrown error.
// ptr:         destination object; `ptr->size()` fixes how many elements are streamed.
// check:       caller-supplied callable. Its call site is not visible in this
//              listing (NOTE(review): presumably applied to each non-missing
//              value - confirm against the full source).
// version:     format version, consulted when deciding how missing values are encoded.
// buffer_size: forwarded to the streaming reader.
//
// Any std::exception raised in the body is rethrown as a std::runtime_error
// prefixed with "failed to load integer dataset at '<name>'; ".
//
// NOTE(review): this listing is lossy - the embedded source numbering skips
// 59-60, 64-65, 68, 72-74 and 80-86, so closing braces, `else` arms and the
// loop body are not shown here.
55template<
class Host,
class Function>
56void parse_integer_like(
const H5::DataSet& handle, Host* ptr, Function check,
const Version& version, hsize_t buffer_size)
try {
// Reject datasets that fail the 32-bit signed integer limit check.
57 if (ritsuko::hdf5::exceeds_integer_limit(handle, 32,
true)) {
58 throw std::runtime_error(
"dataset cannot be represented by 32-bit signed integers");
// Default placeholder is -2^31, the smallest 32-bit signed value.
61 bool has_missing =
false;
62 int32_t missing_value = -2147483648;
// NOTE(review): the lines directly after this condition are missing from the
// listing, so it cannot be determined here whether the attribute lookup below
// sits inside this branch or in an `else` arm.
63 if (version.equals(1, 0)) {
66 const char* placeholder_name =
"missing-value-placeholder";
67 has_missing = handle.attrExists(placeholder_name);
// Validate the placeholder attribute, then overwrite the default with its value.
69 auto attr = handle.openAttribute(placeholder_name);
70 ritsuko::hdf5::check_missing_placeholder_attribute(handle, attr, version.lt(1, 2));
71 attr.read(H5::PredType::NATIVE_INT32, &missing_value);
// Walk the dataset element by element using the buffered stream.
75 hsize_t full_length = ptr->size();
76 ritsuko::hdf5::Stream1dNumericDataset<int32_t> stream(&handle, full_length, buffer_size);
77 for (hsize_t i = 0; i < full_length; ++i, stream.next()) {
78 auto current = stream.get();
// A value counts as missing only when a placeholder is in effect and matches exactly.
// NOTE(review): both arms of this branch are missing from the listing.
79 if (has_missing && current == missing_value) {
87}
// Function-try-block handler: add the dataset's name to the error message.
catch (std::exception& e) {
88 throw std::runtime_error(
"failed to load integer dataset at '" + ritsuko::hdf5::get_name(handle) +
"'; " + std::string(e.what()));
// Stream a 1-D string dataset into `ptr`, with optional handling of a
// missing-value placeholder.
//
// handle:      dataset to read; must pass `ritsuko::hdf5::is_utf8_string()`.
// ptr:         destination object; `ptr->size()` fixes how many strings are
//              streamed, and `ptr->set(i, ...)` receives each string.
// check:       caller-supplied callable. Its call site is not visible in this
//              listing (NOTE(review): presumably validates each non-missing
//              string before it is stored - confirm).
// buffer_size: forwarded to the streaming reader.
//
// Any std::exception raised in the body is rethrown as a std::runtime_error
// prefixed with "failed to load string dataset at '<name>'; ".
//
// NOTE(review): this listing is lossy - the embedded source numbering skips
// 95-96, 100, 106-108 and 110-112, so closing braces and the missing-value
// branch are not shown here.
91template<
class Host,
class Function>
92void parse_string_like(
const H5::DataSet& handle, Host* ptr, Function check, hsize_t buffer_size)
try {
93 if (!ritsuko::hdf5::is_utf8_string(handle)) {
94 throw std::runtime_error(
"expected a datatype that can be represented by a UTF-8 encoded string");
// The helper returns a pair: (whether a placeholder exists, the placeholder string).
97 auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(handle,
"missing-value-placeholder");
98 bool has_missing = missingness.first;
99 std::string missing_val = missingness.second;
101 hsize_t full_length = ptr->size();
102 ritsuko::hdf5::Stream1dStringDataset stream(&handle, full_length, buffer_size);
103 for (hsize_t i = 0; i < full_length; ++i, stream.next()) {
// steal() hands over the current string so that it can be moved into `ptr` below.
104 auto x = stream.steal();
// NOTE(review): the body of this branch is missing from the listing.
105 if (has_missing && x == missing_val) {
109 ptr->set(i, std::move(x));
113}
// Function-try-block handler: add the dataset's name to the error message.
catch (std::exception& e) {
114 throw std::runtime_error(
"failed to load string dataset at '" + ritsuko::hdf5::get_name(handle) +
"'; " + std::string(e.what()));
// Stream a 1-D dataset as doubles into `ptr`, with optional handling of a
// missing-value placeholder.
//
// handle:      dataset to read; its name is embedded in the rethrown error.
// ptr:         destination object; `ptr->size()` fixes how many values are
//              streamed, and `ptr->set(i, ...)` receives each value.
// check:       caller-supplied callable. Its call site is not visible in this
//              listing (NOTE(review): presumably applied to each non-missing
//              value - confirm).
// version:     format version; selects the type check and the rule used to
//              compare values against the placeholder.
// buffer_size: forwarded to the streaming reader.
//
// Any std::exception raised in the body is rethrown as a std::runtime_error
// prefixed with "failed to load floating-point dataset at '<name>'; ".
//
// NOTE(review): this listing is lossy - the embedded source numbering skips
// 122-123, 125-128, 131, 133, 135, 138, 142-144, 152, 154-156, 162-164 and
// 166-168, so closing braces and `else` arms are not shown here.
117template<
class Host,
class Function>
118void parse_numbers(
const H5::DataSet& handle, Host* ptr, Function check,
const Version& version, hsize_t buffer_size)
try {
// Versions before 1.3 require the dataset's type class to be H5T_FLOAT.
// NOTE(review): the line separating this check from the 64-bit float limit
// check below is missing; the latter is presumably the `else` arm - confirm.
119 if (version.lt(1, 3)) {
120 if (handle.getTypeClass() != H5T_FLOAT) {
121 throw std::runtime_error(
"expected a floating-point dataset");
124 if (ritsuko::hdf5::exceeds_float_limit(handle, 64)) {
125 throw std::runtime_error(
"dataset cannot be represented by 64-bit floats");
129 bool has_missing =
false;
130 double missing_value = 0;
// Version 1.0 takes its placeholder from ritsuko::r_missing_value().
// NOTE(review): the lines between that assignment and the attribute lookup are
// missing, so the exact branch boundaries cannot be read from this listing.
132 if (version.equals(1, 0)) {
134 missing_value = ritsuko::r_missing_value();
136 const char* placeholder_name =
"missing-value-placeholder";
137 has_missing = handle.attrExists(placeholder_name);
// Validate the placeholder attribute, then read its value as a double.
139 auto attr = handle.openAttribute(placeholder_name);
140 ritsuko::hdf5::check_missing_placeholder_attribute(handle, attr, version.lt(1, 2));
141 attr.read(H5::PredType::NATIVE_DOUBLE, &missing_value);
// Placeholder comparison rule:
//  - before version 1.3: delegate to ritsuko::are_floats_identical();
//  - otherwise, if the placeholder is NaN: any NaN value matches;
//  - otherwise: ordinary `==` comparison.
145 bool should_compare_nan = version.lt(1, 3);
146 bool is_placeholder_nan = std::isnan(missing_value);
147 auto is_missing_value = [&](
double val) ->
bool {
148 if (should_compare_nan) {
149 return ritsuko::are_floats_identical(&val, &missing_value);
150 }
else if (is_placeholder_nan) {
151 return std::isnan(val);
153 return val == missing_value;
// Walk the dataset element by element using the buffered stream.
157 hsize_t full_length = ptr->size();
158 ritsuko::hdf5::Stream1dNumericDataset<double> stream(&handle, full_length, buffer_size);
159 for (hsize_t i = 0; i < full_length; ++i, stream.next()) {
160 auto current = stream.get();
// NOTE(review): the body of this branch is missing from the listing.
161 if (has_missing && is_missing_value(current)) {
165 ptr->set(i, current);
169}
// Function-try-block handler: add the dataset's name to the error message.
catch (std::exception& e) {
170 throw std::runtime_error(
"failed to load floating-point dataset at '" + ritsuko::hdf5::get_name(handle) +
"'; " + std::string(e.what()));
// Read the "names" dataset of `handle` and store each entry on `ptr` through
// `ptr->set_name(i, ...)`.
//
// handle:      group that must contain a child dataset called "names".
// ptr:         object receiving the names; `ptr->size()` is read as the
//              expected length.
// buffer_size: forwarded to the streaming reader.
//
// Any std::exception raised in the body is rethrown as a std::runtime_error
// prefixed with "failed to load names at '<name>'; ".
//
// NOTE(review): this listing is lossy. The template header that declares
// `Host` (embedded number 173) is missing, as are the condition guarding the
// length-mismatch throw (186) and several closing braces.
174void extract_names(
const H5::Group& handle, Host* ptr, hsize_t buffer_size)
try {
175 if (handle.childObjType(
"names") != H5O_TYPE_DATASET) {
176 throw std::runtime_error(
"expected a dataset");
179 auto nhandle = handle.openDataSet(
"names");
180 if (!ritsuko::hdf5::is_utf8_string(nhandle)) {
181 throw std::runtime_error(
"expected a datatype that can be represented by a UTF-8 encoded string");
// `len` is the object's length and `nlen` the number of stored names.
// NOTE(review): the comparison that triggers the throw below is not visible;
// going by the message it is presumably `len != nlen` - confirm.
184 size_t len = ptr->size();
185 size_t nlen = ritsuko::hdf5::get_1d_length(nhandle.getSpace(),
false);
187 throw std::runtime_error(
"number of names should be equal to the object length");
// Stream the names, handing each string over to `ptr`.
190 ritsuko::hdf5::Stream1dStringDataset stream(&nhandle, nlen, buffer_size);
191 for (
size_t i = 0; i < nlen; ++i, stream.next()) {
192 ptr->set_name(i, stream.steal());
194}
// Function-try-block handler: add the group's name to the error message.
catch (std::exception& e) {
195 throw std::runtime_error(
"failed to load names at '" + ritsuko::hdf5::get_name(handle) +
"'; " + std::string(e.what()));
// Recursively parse the object stored in the group `handle`, dispatching on
// its "uzuki_object" string attribute: "list", "vector", "nothing" or
// "external". Any other value raises "unknown uzuki2 object type".
//
// Provisioner: supplies the static factories used below (new_List, new_Integer,
//              new_Boolean, new_Factor, new_String, new_Number, new_Nothing,
//              new_External).
// handle:      group describing the object.
// ext:         external-object provider; this code uses `ext.size()` and `ext.get(idx)`.
// version:     format version, which enables some legacy type names for 1.0.
// buffer_size: forwarded to the dataset readers.
//
// Returns a std::shared_ptr<Base>. Any std::exception raised in the body is
// rethrown as a std::runtime_error prefixed with "failed to load object at '<name>'; ".
//
// NOTE(review): this listing is lossy - many source lines (closing braces,
// `else` arms, assignments to `output`, the declaration of `format`, the final
// return) are missing, so some control flow below cannot be read off directly.
198template<
class Provisioner,
class Externals>
199std::shared_ptr<Base> parse_inner(
const H5::Group& handle, Externals& ext,
const Version& version, hsize_t buffer_size)
try {
201 auto object_type = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle,
"uzuki_object");
202 std::shared_ptr<Base> output;
// --- "list": children live in the "data" subgroup, one group per element. ---
204 if (object_type ==
"list") {
205 auto dhandle = ritsuko::hdf5::open_group(handle,
"data");
// The list length is the number of objects inside "data".
206 size_t len = dhandle.getNumObjs();
208 bool named = handle.exists(
"names");
209 auto lptr = Provisioner::new_List(len, named);
// Each element is stored in a child group named by its decimal index
// ("0", "1", ...) and is parsed recursively.
// NOTE(review): the `try` matching the `catch` below is not visible in this listing.
213 for (
size_t i = 0; i < len; ++i) {
214 auto istr = std::to_string(i);
215 auto lhandle = ritsuko::hdf5::open_group(dhandle, istr.c_str());
216 lptr->set(i, parse_inner<Provisioner>(lhandle, ext, version, buffer_size));
218 }
catch (std::exception& e) {
219 throw std::runtime_error(
"failed to parse list contents in 'data'; " + std::string(e.what()));
// NOTE(review): any guard on `named` around this call is not visible here.
223 extract_names(handle, lptr, buffer_size);
226 }
// --- "vector": typed 1-D (or scalar) data held in the "data" dataset. ---
else if (object_type ==
"vector") {
227 auto vector_type = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle,
"uzuki_type");
229 auto dhandle = ritsuko::hdf5::open_dataset(handle,
"data");
// A reported length of zero is taken to mean a scalar dataset.
// NOTE(review): the lines following `is_scalar` are missing; `len` is
// presumably adjusted for the scalar case there - confirm.
230 size_t len = ritsuko::hdf5::get_1d_length(dhandle.getSpace(),
true);
231 bool is_scalar = (len == 0);
236 bool named = handle.exists(
"names");
// Integers: no extra per-value validation (the check is a no-op).
238 if (vector_type ==
"integer") {
239 auto iptr = Provisioner::new_Integer(len, named, is_scalar);
241 parse_integer_like(dhandle, iptr, [](int32_t) ->
void {}, version, buffer_size);
243 }
// Booleans: stored as integers; the check rejects anything other than 0 or 1.
else if (vector_type ==
"boolean") {
244 auto bptr = Provisioner::new_Boolean(len, named, is_scalar);
246 parse_integer_like(dhandle, bptr, [&](int32_t x) ->
void {
247 if (x != 0 && x != 1) {
248 throw std::runtime_error(
"boolean values should be 0 or 1");
250 }, version, buffer_size);
252 }
// Factors: integer codes in "data" plus string levels in "levels".
// The type name "ordered" is accepted only for version 1.0.
else if (vector_type ==
"factor" || (version.equals(1, 0) && vector_type ==
"ordered")) {
253 auto levhandle = ritsuko::hdf5::open_dataset(handle,
"levels");
254 if (!ritsuko::hdf5::is_utf8_string(levhandle)) {
255 throw std::runtime_error(
"expected a datatype that can be represented by a UTF-8 encoded string");
// NOTE(review): the level count is stored in an int32_t; the value returned by
// get_1d_length() is converted implicitly.
258 int32_t levlen = ritsuko::hdf5::get_1d_length(levhandle.getSpace(),
false);
259 bool ordered =
false;
// NOTE(review): the body of the `vector_type == "ordered"` branch is missing
// from the listing (presumably it sets `ordered = true` - confirm).
260 if (vector_type ==
"ordered") {
262 }
// Otherwise an optional scalar "ordered" dataset decides; any value > 0 means ordered.
else if (handle.exists(
"ordered")) {
263 auto ohandle = check_scalar_dataset(handle,
"ordered");
264 if (ritsuko::hdf5::exceeds_integer_limit(ohandle, 32,
true)) {
265 throw std::runtime_error(
"'ordered' value cannot be represented by a 32-bit integer");
267 int32_t tmp_ordered = 0;
268 ohandle.read(&tmp_ordered, H5::PredType::NATIVE_INT32);
269 ordered = tmp_ordered > 0;
272 auto fptr = Provisioner::new_Factor(len, named, is_scalar, levlen, ordered);
// Codes must satisfy 0 <= x < levlen.
274 parse_integer_like(dhandle, fptr, [&](int32_t x) ->
void {
275 if (x < 0 || x >= levlen) {
276 throw std::runtime_error(
"factor codes should be non-negative and less than the number of levels");
278 }, version, buffer_size);
// Stream the levels, rejecting duplicates by tracking those already seen.
280 std::unordered_set<std::string> present;
281 ritsuko::hdf5::Stream1dStringDataset stream(&levhandle, levlen, buffer_size);
282 for (int32_t i = 0; i < levlen; ++i, stream.next()) {
283 auto x = stream.steal();
284 if (present.find(x) != present.end()) {
285 throw std::runtime_error(
"levels should be unique");
// Pass the level to the factor first, then move the string into the set.
287 fptr->set_level(i, x);
288 present.insert(std::move(x));
291 }
// Strings: the type names "date" and "date-time" are accepted only for version 1.0.
else if (vector_type ==
"string" || (version.equals(1, 0) && (vector_type ==
"date" || vector_type ==
"date-time"))) {
// NOTE(review): the declaration of `format` is missing from the listing; the
// comparison against StringVector::NONE further down suggests that is its
// initial value - confirm.
// Version 1.0 derives the format from the vector type name itself.
293 if (version.equals(1, 0)) {
294 if (vector_type ==
"date") {
295 format = StringVector::DATE;
296 }
else if (vector_type ==
"date-time") {
297 format = StringVector::DATETIME;
300 }
// Otherwise an optional scalar string dataset called "format" selects the format.
else if (handle.exists(
"format")) {
301 auto fhandle = check_scalar_dataset(handle,
"format");
302 if (!ritsuko::hdf5::is_utf8_string(fhandle)) {
303 throw std::runtime_error(
"expected a datatype that can be represented by a UTF-8 encoded string");
305 auto x = ritsuko::hdf5::load_scalar_string_dataset(fhandle);
// NOTE(review): the condition selecting DATE is missing from the listing
// (presumably `x == "date"`); any other unrecognised value reaches the throw.
307 format = StringVector::DATE;
308 }
else if (x ==
"date-time") {
309 format = StringVector::DATETIME;
311 throw std::runtime_error(
"unsupported format '" + x +
"'");
315 auto sptr = Provisioner::new_String(len, named, is_scalar, format);
// Choose the per-string validator according to the format.
317 if (format == StringVector::NONE) {
318 parse_string_like(dhandle, sptr, [](
const std::string&) ->
void {}, buffer_size);
320 }
else if (format == StringVector::DATE) {
321 parse_string_like(dhandle, sptr, [&](
const std::string& x) ->
void {
322 if (!ritsuko::is_date(x.c_str(), x.size())) {
323 throw std::runtime_error(
"dates should follow YYYY-MM-DD formatting");
327 }
else if (format == StringVector::DATETIME) {
328 parse_string_like(dhandle, sptr, [&](
const std::string& x) ->
void {
329 if (!ritsuko::is_rfc3339(x.c_str(), x.size())) {
330 throw std::runtime_error(
"date-times should follow the Internet Date/Time format");
335 }
// Numbers: doubles with no extra per-value validation (the check is a no-op).
else if (vector_type ==
"number") {
336 auto dptr = Provisioner::new_Number(len, named, is_scalar);
338 parse_numbers(dhandle, dptr, [](
double) ->
void {}, version, buffer_size);
// Reached for any vector type not handled above.
341 throw std::runtime_error(
"unknown vector type '" + vector_type +
"'");
// Attach names through the Vector interface of the object held by `output`.
// NOTE(review): any guard on `named` around these lines is not visible here.
345 auto vptr =
static_cast<Vector*
>(output.get());
346 extract_names(handle, vptr, buffer_size);
349 }
// --- "nothing": placeholder object with no payload read from the file. ---
else if (object_type ==
"nothing") {
350 output.reset(Provisioner::new_Nothing());
352 }
// --- "external": an integer index into the caller-supplied externals. ---
else if (object_type ==
"external") {
353 auto ihandle = ritsuko::hdf5::open_dataset(handle,
"index");
354 if (ritsuko::hdf5::exceeds_integer_limit(ihandle, 32,
true)) {
355 throw std::runtime_error(
"external index at 'index' cannot be represented by a 32-bit signed integer");
// NOTE(review): the condition on `idims` that triggers the throw below, and the
// declaration of `idx`, are missing from the listing.
358 auto ispace = ihandle.getSpace();
359 int idims = ispace.getSimpleExtentNdims();
361 throw std::runtime_error(
"expected scalar dataset at 'index'");
365 ihandle.read(&idx, H5::PredType::NATIVE_INT32);
// The index must fall within [0, ext.size()).
366 if (idx < 0 ||
static_cast<size_t>(idx) >= ext.size()) {
367 throw std::runtime_error(
"external index out of range at 'index'");
370 output.reset(Provisioner::new_External(ext.get(idx)));
// Reached for any object type not handled above.
373 throw std::runtime_error(
"unknown uzuki2 object type '" + object_type +
"'");
377}
// Function-try-block handler: add the group's name to the error message.
catch (std::exception& e) {
378 throw std::runtime_error(
"failed to load object at '" + ritsuko::hdf5::get_name(handle) +
"'; " + std::string(e.what()));
// Top-level entry point: determine the format version, parse the object
// rooted at `handle`, and wrap the result in a ParsedList.
//
// NOTE(review): this listing is lossy. The signature line (embedded number
// 445) and the declaration of `version` (446) are missing, as are the closing
// braces. The names `handle`, `ext` and `options` used below are therefore
// parameters whose declarations are not visible in this block.
444template<
class Provisioner,
class Externals>
// If the "uzuki_version" attribute exists, it overrides the version's major/minor fields.
447 if (handle.attrExists(
"uzuki_version")) {
448 auto ver_str = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle,
"uzuki_version");
449 auto vraw = ritsuko::parse_version_string(ver_str.c_str(), ver_str.size(),
true);
450 version.major = vraw.major;
451 version.minor = vraw.minor;
// Wrap the externals in an ExternalTracker before they are handed to the recursive parser.
454 ExternalTracker etrack(std::move(ext));
455 auto ptr = parse_inner<Provisioner>(handle, etrack, version, options.buffer_size);
// With `strict_list` set, anything other than a list at the top level is an error.
457 if (options.strict_list && ptr->type() != LIST) {
458 throw std::runtime_error(
"top-level object should represent an R list");
462 return ParsedList(std::move(ptr), std::move(version));
479template<
class Provisioner>
500template<
class Provisioner,
class Externals>
521template<
class Provisioner>
Defines the interfaces to use in HDF5 parsing.
ParsedList parse(const H5::Group &handle, Externals ext, Options options=Options())
Definition parse_hdf5.hpp:445
void validate(const H5::Group &handle, int num_external=0, Options options=Options())
Definition parse_hdf5.hpp:535
Parse an R list from an HDF5 or JSON file.
Definition parse_json.hpp:30
Format
Definition interfaces.hpp:148
Options for HDF5 file parsing.
Definition parse_hdf5.hpp:387
hsize_t buffer_size
Definition parse_hdf5.hpp:391
bool strict_list
Definition parse_hdf5.hpp:396