uzuki2
Recovering R lists faithfully from HDF5 or JSON
Loading...
Searching...
No Matches
parse_hdf5.hpp
Go to the documentation of this file.
1#ifndef UZUKI2_PARSE_HPP
2#define UZUKI2_PARSE_HPP
3
#include <memory>
#include <vector>
#include <cctype>
#include <cmath>
#include <limits>
#include <string>
#include <cstring>
#include <stdexcept>
#include <cstdint>
#include <unordered_set>
12
13#include "H5Cpp.h"
14
15#include "interfaces.hpp"
16#include "Dummy.hpp"
17#include "ExternalTracker.hpp"
18#include "Version.hpp"
19#include "ParsedList.hpp"
20
21#include "ritsuko/ritsuko.hpp"
22#include "ritsuko/hdf5/hdf5.hpp"
23#include "ritsuko/hdf5/vls/vls.hpp"
24
30namespace uzuki2 {
31
40namespace hdf5 {
41
45inline H5::DataSet check_scalar_dataset(const H5::Group& handle, const char* name) {
46 if (handle.childObjType(name) != H5O_TYPE_DATASET) {
47 throw std::runtime_error("expected '" + std::string(name) + "' to be a dataset");
48 }
49 auto dhandle = handle.openDataSet(name);
50 if (!ritsuko::hdf5::is_scalar(dhandle)) {
51 throw std::runtime_error("expected '" + std::string(name) + "'to be a scalar dataset");
52 }
53 return dhandle;
54}
55
56template<class Host_, class Function_>
57void parse_integer_like(const H5::DataSet& handle, Host_* ptr, bool is_scalar, Function_ check, const Version& version, hsize_t buffer_size) try {
58 if (ritsuko::hdf5::exceeds_integer_limit(handle, 32, true)) {
59 throw std::runtime_error("dataset cannot be represented by 32-bit signed integers");
60 }
61
62 bool has_missing = false;
63 int32_t missing_value = -2147483648;
64 if (version.equals(1, 0)) {
65 has_missing = true;
66 } else {
67 const char* placeholder_name = "missing-value-placeholder";
68 has_missing = handle.attrExists(placeholder_name);
69 if (has_missing) {
70 auto attr = handle.openAttribute(placeholder_name);
71 ritsuko::hdf5::check_numeric_missing_placeholder_attribute(handle, attr, /* type_class_only = */ version.lt(1, 2));
72 attr.read(H5::PredType::NATIVE_INT32, &missing_value);
73 }
74 }
75
76 auto set = [&](hsize_t i, int32_t x) -> void {
77 if (has_missing && x == missing_value) {
78 ptr->set_missing(i);
79 } else {
80 check(x);
81 ptr->set(i, x);
82 }
83 };
84
85 if (is_scalar) {
86 int32_t value;
87 handle.read(&value, H5::PredType::NATIVE_INT32);
88 set(0, value);
89 } else {
90 hsize_t full_length = ptr->size();
91 ritsuko::hdf5::Stream1dNumericDataset<int32_t> stream(&handle, full_length, buffer_size);
92 for (hsize_t i = 0; i < full_length; ++i, stream.next()) {
93 set(i, stream.get());
94 }
95 }
96
97} catch (std::exception& e) {
98 throw std::runtime_error("failed to load integer dataset at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what()));
99}
100
101template<class Host_, class Function_>
102void parse_string_like(const H5::DataSet& handle, Host_* ptr, bool is_scalar, Function_ check, hsize_t buffer_size) try {
103 if (!ritsuko::hdf5::is_utf8_string(handle)) {
104 throw std::runtime_error("expected a datatype that can be represented by a UTF-8 encoded string");
105 }
106
107 auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(handle, "missing-value-placeholder");
108 auto set = [&](hsize_t i, std::string x) -> void {
109 if (missingness.has_value() && x == *missingness) {
110 ptr->set_missing(i);
111 } else {
112 check(x);
113 ptr->set(i, std::move(x));
114 }
115 };
116
117 if (is_scalar) {
118 auto x = ritsuko::hdf5::load_scalar_string_dataset(handle);
119 set(0, std::move(x));
120 } else {
121 hsize_t full_length = ptr->size();
122 ritsuko::hdf5::Stream1dStringDataset stream(&handle, full_length, buffer_size);
123 for (hsize_t i = 0; i < full_length; ++i, stream.next()) {
124 set(i, stream.steal());
125 }
126 }
127
128} catch (std::exception& e) {
129 throw std::runtime_error("failed to load string dataset at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what()));
130}
131
132template<class Host_, class Function_>
133void parse_numbers(const H5::DataSet& handle, Host_* ptr, bool is_scalar, Function_ check, const Version& version, hsize_t buffer_size) try {
134 if (version.lt(1, 3)) {
135 if (handle.getTypeClass() != H5T_FLOAT) {
136 throw std::runtime_error("expected a floating-point dataset");
137 }
138 } else {
139 if (ritsuko::hdf5::exceeds_float_limit(handle, 64)) {
140 throw std::runtime_error("dataset cannot be represented by 64-bit floats");
141 }
142 }
143
144 bool has_missing = false;
145 double missing_value = 0;
146 if (version.equals(1, 0)) {
147 has_missing = true;
148 missing_value = ritsuko::r_missing_value();
149 } else {
150 const char* placeholder_name = "missing-value-placeholder";
151 has_missing = handle.attrExists(placeholder_name);
152 if (has_missing) {
153 auto attr = handle.openAttribute(placeholder_name);
154 ritsuko::hdf5::check_numeric_missing_placeholder_attribute(handle, attr, /* type_class_only = */ version.lt(1, 2));
155 attr.read(H5::PredType::NATIVE_DOUBLE, &missing_value);
156 }
157 }
158
159 bool should_compare_nan = version.lt(1, 3);
160 bool is_placeholder_nan = std::isnan(missing_value);
161 auto is_missing_value = [&](double val) -> bool {
162 if (should_compare_nan) {
163 return ritsuko::are_floats_identical(&val, &missing_value);
164 } else if (is_placeholder_nan) {
165 return std::isnan(val);
166 } else {
167 return val == missing_value;
168 }
169 };
170
171 auto set = [&](hsize_t i, double x) -> void {
172 if (has_missing && is_missing_value(x)) {
173 ptr->set_missing(i);
174 } else {
175 check(x);
176 ptr->set(i, x);
177 }
178 };
179
180 if (is_scalar) {
181 double val;
182 handle.read(&val, H5::PredType::NATIVE_DOUBLE);
183 set(0, val);
184 } else {
185 hsize_t full_length = ptr->size();
186 ritsuko::hdf5::Stream1dNumericDataset<double> stream(&handle, full_length, buffer_size);
187 for (hsize_t i = 0; i < full_length; ++i, stream.next()) {
188 set(i, stream.get());
189 }
190 }
191
192} catch (std::exception& e) {
193 throw std::runtime_error("failed to load floating-point dataset at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what()));
194}
195
196template<class Host_>
197void extract_names(const H5::Group& handle, Host_* ptr, hsize_t buffer_size) try {
198 if (handle.childObjType("names") != H5O_TYPE_DATASET) {
199 throw std::runtime_error("expected a dataset");
200 }
201
202 auto nhandle = handle.openDataSet("names");
203 if (!ritsuko::hdf5::is_utf8_string(nhandle)) {
204 throw std::runtime_error("expected a datatype that can be represented by a UTF-8 encoded string");
205 }
206
207 size_t len = ptr->size();
208 size_t nlen = ritsuko::hdf5::get_1d_length(nhandle.getSpace(), false);
209 if (nlen != len) {
210 throw std::runtime_error("number of names should be equal to the object length");
211 }
212
213 ritsuko::hdf5::Stream1dStringDataset stream(&nhandle, nlen, buffer_size);
214 for (size_t i = 0; i < nlen; ++i, stream.next()) {
215 ptr->set_name(i, stream.steal());
216 }
217} catch (std::exception& e) {
218 throw std::runtime_error("failed to load names at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what()));
219}
220
221template<class Provisioner_, class Externals_>
222std::shared_ptr<Base> parse_inner(const H5::Group& handle, Externals_& ext, const Version& version, hsize_t buffer_size) try {
223 // Deciding what type we're dealing with.
224 auto object_type = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle, "uzuki_object");
225 std::shared_ptr<Base> output;
226
227 if (object_type == "list") {
228 auto dhandle = ritsuko::hdf5::open_group(handle, "data");
229 size_t len = dhandle.getNumObjs();
230
231 bool named = handle.exists("names");
232 auto lptr = Provisioner_::new_List(len, named);
233 output.reset(lptr);
234
235 try {
236 for (size_t i = 0; i < len; ++i) {
237 auto istr = std::to_string(i);
238 auto lhandle = ritsuko::hdf5::open_group(dhandle, istr.c_str());
239 lptr->set(i, parse_inner<Provisioner_>(lhandle, ext, version, buffer_size));
240 }
241 } catch (std::exception& e) {
242 throw std::runtime_error("failed to parse list contents in 'data'; " + std::string(e.what()));
243 }
244
245 if (named) {
246 extract_names(handle, lptr, buffer_size);
247 }
248
249 } else if (object_type == "vector") {
250 auto vector_type = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle, "uzuki_type");
251
252 auto dhandle = ritsuko::hdf5::open_dataset(handle, "data");
253 size_t len = ritsuko::hdf5::get_1d_length(dhandle.getSpace(), true);
254 bool is_scalar = (len == 0);
255 if (is_scalar) {
256 len = 1;
257 }
258
259 bool named = handle.exists("names");
260
261 if (vector_type == "integer") {
262 auto iptr = Provisioner_::new_Integer(len, named, is_scalar);
263 output.reset(iptr);
264 parse_integer_like(
265 dhandle,
266 iptr,
267 is_scalar,
268 [](int32_t) -> void {},
269 version,
270 buffer_size
271 );
272
273 } else if (vector_type == "boolean") {
274 auto bptr = Provisioner_::new_Boolean(len, named, is_scalar);
275 output.reset(bptr);
276 parse_integer_like(
277 dhandle,
278 bptr,
279 is_scalar,
280 [&](int32_t x) -> void {
281 if (x != 0 && x != 1) {
282 throw std::runtime_error("boolean values should be 0 or 1");
283 }
284 },
285 version,
286 buffer_size
287 );
288
289 } else if (vector_type == "factor" || (version.equals(1, 0) && vector_type == "ordered")) {
290 auto levhandle = ritsuko::hdf5::open_dataset(handle, "levels");
291 if (!ritsuko::hdf5::is_utf8_string(levhandle)) {
292 throw std::runtime_error("expected a datatype that can be represented by a UTF-8 encoded string");
293 }
294
295 int32_t levlen = ritsuko::hdf5::get_1d_length(levhandle.getSpace(), false);
296 bool ordered = false;
297 if (vector_type == "ordered") {
298 ordered = true;
299 } else if (handle.exists("ordered")) {
300 auto ohandle = check_scalar_dataset(handle, "ordered");
301 if (ritsuko::hdf5::exceeds_integer_limit(ohandle, 32, true)) {
302 throw std::runtime_error("'ordered' value cannot be represented by a 32-bit integer");
303 }
304 int32_t tmp_ordered = 0;
305 ohandle.read(&tmp_ordered, H5::PredType::NATIVE_INT32);
306 ordered = tmp_ordered > 0;
307 }
308
309 auto fptr = Provisioner_::new_Factor(len, named, is_scalar, levlen, ordered);
310 output.reset(fptr);
311 parse_integer_like(
312 dhandle,
313 fptr,
314 is_scalar,
315 [&](int32_t x) -> void {
316 if (x < 0 || x >= levlen) {
317 throw std::runtime_error("factor codes should be non-negative and less than the number of levels");
318 }
319 },
320 version,
321 buffer_size
322 );
323
324 std::unordered_set<std::string> present;
325 ritsuko::hdf5::Stream1dStringDataset stream(&levhandle, levlen, buffer_size);
326 for (int32_t i = 0; i < levlen; ++i, stream.next()) {
327 auto x = stream.steal();
328 if (present.find(x) != present.end()) {
329 throw std::runtime_error("levels should be unique");
330 }
331 fptr->set_level(i, x);
332 present.insert(std::move(x));
333 }
334
335 } else if (vector_type == "vls" && !version.lt(1, 4)) {
336 ritsuko::hdf5::vls::validate_pointer_datatype(dhandle.getCompType(), 64, 64);
337 auto hhandle = ritsuko::hdf5::vls::open_heap(handle, "heap");
338 auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(dhandle, "missing-value-placeholder");
339
340 auto ptr = Provisioner_::new_String(len, named, is_scalar, StringVector::NONE);
341 output.reset(ptr);
342
343 if (is_scalar) {
344 ritsuko::hdf5::vls::Pointer<uint64_t, uint64_t> vlsptr;
345 dhandle.read(&vlsptr, ritsuko::hdf5::vls::define_pointer_datatype<uint64_t, uint64_t>());
346
347 hsize_t len = vlsptr.length;
348 H5::DataSpace mspace(1, &len);
349 hsize_t offset = vlsptr.offset;
350 hsize_t hlen = ritsuko::hdf5::get_1d_length(hhandle, false);
351 H5::DataSpace dspace(1, &hlen);
352 dspace.selectHyperslab(H5S_SELECT_SET, &len, &offset);
353
354 std::vector<uint8_t> buffer(vlsptr.length);
355 hhandle.read(buffer.data(), H5::PredType::NATIVE_UINT8, mspace, dspace);
356 auto cptr = reinterpret_cast<const char*>(buffer.data());
357 std::string str(cptr, cptr + ritsuko::hdf5::find_string_length(cptr, vlsptr.length));
358
359 if (missingness.has_value() && str == *missingness) {
360 ptr->set_missing(0);
361 } else {
362 ptr->set(0, std::move(str));
363 }
364
365 } else {
366 ritsuko::hdf5::vls::Stream1dArray<uint64_t, uint64_t> stream(&dhandle, &hhandle, len, buffer_size);
367 for (hsize_t i = 0; i < len; ++i, stream.next()) {
368 auto x = stream.steal();
369 if (missingness.has_value() && x == *missingness) {
370 ptr->set_missing(i);
371 } else {
372 ptr->set(i, std::move(x));
373 }
374 }
375 }
376
377 } else if (vector_type == "string" || (version.equals(1, 0) && (vector_type == "date" || vector_type == "date-time"))) {
378 StringVector::Format format = StringVector::NONE;
379 if (version.equals(1, 0)) {
380 if (vector_type == "date") {
381 format = StringVector::DATE;
382 } else if (vector_type == "date-time") {
383 format = StringVector::DATETIME;
384 }
385
386 } else if (handle.exists("format")) {
387 auto fhandle = check_scalar_dataset(handle, "format");
388 if (!ritsuko::hdf5::is_utf8_string(fhandle)) {
389 throw std::runtime_error("expected a datatype that can be represented by a UTF-8 encoded string");
390 }
391 auto x = ritsuko::hdf5::load_scalar_string_dataset(fhandle);
392 if (x == "date") {
393 format = StringVector::DATE;
394 } else if (x == "date-time") {
395 format = StringVector::DATETIME;
396 } else {
397 throw std::runtime_error("unsupported format '" + x + "'");
398 }
399 }
400
401 auto sptr = Provisioner_::new_String(len, named, is_scalar, format);
402 output.reset(sptr);
403 if (format == StringVector::NONE) {
404 parse_string_like(
405 dhandle,
406 sptr,
407 is_scalar,
408 [](const std::string&) -> void {},
409 buffer_size
410 );
411
412 } else if (format == StringVector::DATE) {
413 parse_string_like(
414 dhandle,
415 sptr,
416 is_scalar,
417 [&](const std::string& x) -> void {
418 if (!ritsuko::is_date(x.c_str(), x.size())) {
419 throw std::runtime_error("dates should follow YYYY-MM-DD formatting");
420 }
421 },
422 buffer_size
423 );
424
425 } else if (format == StringVector::DATETIME) {
426 parse_string_like(
427 dhandle,
428 sptr,
429 is_scalar,
430 [&](const std::string& x) -> void {
431 if (!ritsuko::is_rfc3339(x.c_str(), x.size())) {
432 throw std::runtime_error("date-times should follow the Internet Date/Time format");
433 }
434 },
435 buffer_size
436 );
437 }
438
439 } else if (vector_type == "number") {
440 auto dptr = Provisioner_::new_Number(len, named, is_scalar);
441 output.reset(dptr);
442 parse_numbers(
443 dhandle,
444 dptr,
445 is_scalar,
446 [](double) -> void {},
447 version,
448 buffer_size
449 );
450
451 } else {
452 throw std::runtime_error("unknown vector type '" + vector_type + "'");
453 }
454
455 if (named) {
456 auto vptr = static_cast<Vector*>(output.get());
457 extract_names(handle, vptr, buffer_size);
458 }
459
460 } else if (object_type == "nothing") {
461 output.reset(Provisioner_::new_Nothing());
462
463 } else if (object_type == "external") {
464 auto ihandle = ritsuko::hdf5::open_dataset(handle, "index");
465 if (ritsuko::hdf5::exceeds_integer_limit(ihandle, 32, true)) {
466 throw std::runtime_error("external index at 'index' cannot be represented by a 32-bit signed integer");
467 }
468
469 auto ispace = ihandle.getSpace();
470 int idims = ispace.getSimpleExtentNdims();
471 if (idims != 0) {
472 throw std::runtime_error("expected scalar dataset at 'index'");
473 }
474
475 int32_t idx;
476 ihandle.read(&idx, H5::PredType::NATIVE_INT32);
477 if (idx < 0 || static_cast<size_t>(idx) >= ext.size()) {
478 throw std::runtime_error("external index out of range at 'index'");
479 }
480
481 output.reset(Provisioner_::new_External(ext.get(idx)));
482
483 } else {
484 throw std::runtime_error("unknown uzuki2 object type '" + object_type + "'");
485 }
486
487 return output;
488} catch (std::exception& e) {
489 throw std::runtime_error("failed to load object at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what()));
490 return nullptr; // for consistency.
491}
499struct Options {
503 hsize_t buffer_size = 10000;
504
508 bool strict_list = true;
509};
510
556template<class Provisioner_, class Externals_>
557ParsedList parse(const H5::Group& handle, Externals_ ext, const Options& options) {
558 Version version;
559 if (handle.attrExists("uzuki_version")) {
560 auto ver_str = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle, "uzuki_version");
561 auto vraw = ritsuko::parse_version_string(ver_str.c_str(), ver_str.size(), /* skip_patch = */ true);
562 version.major = vraw.major;
563 version.minor = vraw.minor;
564 }
565
566 ExternalTracker etrack(std::move(ext));
567 auto ptr = parse_inner<Provisioner_>(handle, etrack, version, options.buffer_size);
568
569 if (options.strict_list && ptr->type() != LIST) {
570 throw std::runtime_error("top-level object should represent an R list");
571 }
572 etrack.validate();
573
574 return ParsedList(std::move(ptr), std::move(version));
575}
576
593template<class Provisioner_, class Externals_>
594ParsedList parse(const std::string& file, const std::string& name, Externals_ ext, Options options = Options()) {
595 H5::H5File handle(file, H5F_ACC_RDONLY);
596 return parse<Provisioner_>(ritsuko::hdf5::open_group(handle, name.c_str()), std::move(ext), options);
597}
598
607inline void validate(const H5::Group& handle, int num_external, const Options& options) {
608 parse<DummyProvisioner>(handle, DummyExternals(num_external), options);
609}
610
620inline void validate(const std::string& file, const std::string& name, int num_external, const Options& options) {
621 parse<DummyProvisioner>(file, name, DummyExternals(num_external), options);
622}
623
624}
625
626}
627
628#endif
Dummy classes for parsing without storing the results.
Class to hold the parsed list.
Dummy class satisfying the Externals_ interface of hdf5::parse().
Definition Dummy.hpp:130
Format
Definition interfaces.hpp:158
Defines the interfaces to use in HDF5 parsing.
void validate(const H5::Group &handle, int num_external, const Options &options)
Definition parse_hdf5.hpp:607
ParsedList parse(const H5::Group &handle, Externals_ ext, const Options &options)
Definition parse_hdf5.hpp:557
Parse an R list from an HDF5 or JSON file.
Definition parse_json.hpp:28
Results of parsing a list from file.
Definition ParsedList.hpp:19
Options for HDF5 file parsing.
Definition parse_hdf5.hpp:499
hsize_t buffer_size
Definition parse_hdf5.hpp:503
bool strict_list
Definition parse_hdf5.hpp:508