uzuki2
Recovering R lists faithfully from HDF5 or JSON
Loading...
Searching...
No Matches
parse_hdf5.hpp
Go to the documentation of this file.
1#ifndef UZUKI2_PARSE_HPP
2#define UZUKI2_PARSE_HPP
3
#include <memory>
#include <vector>
#include <cctype>
#include <cmath>
#include <string>
#include <cstring>
#include <stdexcept>
#include <cstdint>
#include <unordered_set>

#include "H5Cpp.h"

#include "interfaces.hpp"
#include "Dummy.hpp"
#include "ExternalTracker.hpp"
#include "Version.hpp"
#include "ParsedList.hpp"

#include "ritsuko/ritsuko.hpp"
#include "ritsuko/hdf5/hdf5.hpp"
23
29namespace uzuki2 {
30
39namespace hdf5 {
40
44inline H5::DataSet check_scalar_dataset(const H5::Group& handle, const char* name) {
45 if (handle.childObjType(name) != H5O_TYPE_DATASET) {
46 throw std::runtime_error("expected '" + std::string(name) + "' to be a dataset");
47 }
48 auto dhandle = handle.openDataSet(name);
49 if (!ritsuko::hdf5::is_scalar(dhandle)) {
50 throw std::runtime_error("expected '" + std::string(name) + "'to be a scalar dataset");
51 }
52 return dhandle;
53}
54
55template<class Host, class Function>
56void parse_integer_like(const H5::DataSet& handle, Host* ptr, Function check, const Version& version, hsize_t buffer_size) try {
57 if (ritsuko::hdf5::exceeds_integer_limit(handle, 32, true)) {
58 throw std::runtime_error("dataset cannot be represented by 32-bit signed integers");
59 }
60
61 bool has_missing = false;
62 int32_t missing_value = -2147483648;
63 if (version.equals(1, 0)) {
64 has_missing = true;
65 } else {
66 const char* placeholder_name = "missing-value-placeholder";
67 has_missing = handle.attrExists(placeholder_name);
68 if (has_missing) {
69 auto attr = handle.openAttribute(placeholder_name);
70 ritsuko::hdf5::check_missing_placeholder_attribute(handle, attr, /* type_class_only = */ version.lt(1, 2));
71 attr.read(H5::PredType::NATIVE_INT32, &missing_value);
72 }
73 }
74
75 hsize_t full_length = ptr->size();
76 ritsuko::hdf5::Stream1dNumericDataset<int32_t> stream(&handle, full_length, buffer_size);
77 for (hsize_t i = 0; i < full_length; ++i, stream.next()) {
78 auto current = stream.get();
79 if (has_missing && current == missing_value) {
80 ptr->set_missing(i);
81 } else {
82 check(current);
83 ptr->set(i, current);
84 }
85 }
86
87} catch (std::exception& e) {
88 throw std::runtime_error("failed to load integer dataset at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what()));
89}
90
91template<class Host, class Function>
92void parse_string_like(const H5::DataSet& handle, Host* ptr, Function check, hsize_t buffer_size) try {
93 if (!ritsuko::hdf5::is_utf8_string(handle)) {
94 throw std::runtime_error("expected a datatype that can be represented by a UTF-8 encoded string");
95 }
96
97 auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(handle, "missing-value-placeholder");
98 bool has_missing = missingness.first;
99 std::string missing_val = missingness.second;
100
101 hsize_t full_length = ptr->size();
102 ritsuko::hdf5::Stream1dStringDataset stream(&handle, full_length, buffer_size);
103 for (hsize_t i = 0; i < full_length; ++i, stream.next()) {
104 auto x = stream.steal();
105 if (has_missing && x == missing_val) {
106 ptr->set_missing(i);
107 } else {
108 check(x);
109 ptr->set(i, std::move(x));
110 }
111 }
112
113} catch (std::exception& e) {
114 throw std::runtime_error("failed to load string dataset at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what()));
115}
116
117template<class Host, class Function>
118void parse_numbers(const H5::DataSet& handle, Host* ptr, Function check, const Version& version, hsize_t buffer_size) try {
119 if (version.lt(1, 3)) {
120 if (handle.getTypeClass() != H5T_FLOAT) {
121 throw std::runtime_error("expected a floating-point dataset");
122 }
123 } else {
124 if (ritsuko::hdf5::exceeds_float_limit(handle, 64)) {
125 throw std::runtime_error("dataset cannot be represented by 64-bit floats");
126 }
127 }
128
129 bool has_missing = false;
130 double missing_value = 0;
131
132 if (version.equals(1, 0)) {
133 has_missing = true;
134 missing_value = ritsuko::r_missing_value();
135 } else {
136 const char* placeholder_name = "missing-value-placeholder";
137 has_missing = handle.attrExists(placeholder_name);
138 if (has_missing) {
139 auto attr = handle.openAttribute(placeholder_name);
140 ritsuko::hdf5::check_missing_placeholder_attribute(handle, attr, /* type_class_only = */ version.lt(1, 2));
141 attr.read(H5::PredType::NATIVE_DOUBLE, &missing_value);
142 }
143 }
144
145 bool should_compare_nan = version.lt(1, 3);
146 bool is_placeholder_nan = std::isnan(missing_value);
147 auto is_missing_value = [&](double val) -> bool {
148 if (should_compare_nan) {
149 return ritsuko::are_floats_identical(&val, &missing_value);
150 } else if (is_placeholder_nan) {
151 return std::isnan(val);
152 } else {
153 return val == missing_value;
154 }
155 };
156
157 hsize_t full_length = ptr->size();
158 ritsuko::hdf5::Stream1dNumericDataset<double> stream(&handle, full_length, buffer_size);
159 for (hsize_t i = 0; i < full_length; ++i, stream.next()) {
160 auto current = stream.get();
161 if (has_missing && is_missing_value(current)) {
162 ptr->set_missing(i);
163 } else {
164 check(current);
165 ptr->set(i, current);
166 }
167 }
168
169} catch (std::exception& e) {
170 throw std::runtime_error("failed to load floating-point dataset at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what()));
171}
172
173template<class Host>
174void extract_names(const H5::Group& handle, Host* ptr, hsize_t buffer_size) try {
175 if (handle.childObjType("names") != H5O_TYPE_DATASET) {
176 throw std::runtime_error("expected a dataset");
177 }
178
179 auto nhandle = handle.openDataSet("names");
180 if (!ritsuko::hdf5::is_utf8_string(nhandle)) {
181 throw std::runtime_error("expected a datatype that can be represented by a UTF-8 encoded string");
182 }
183
184 size_t len = ptr->size();
185 size_t nlen = ritsuko::hdf5::get_1d_length(nhandle.getSpace(), false);
186 if (nlen != len) {
187 throw std::runtime_error("number of names should be equal to the object length");
188 }
189
190 ritsuko::hdf5::Stream1dStringDataset stream(&nhandle, nlen, buffer_size);
191 for (size_t i = 0; i < nlen; ++i, stream.next()) {
192 ptr->set_name(i, stream.steal());
193 }
194} catch (std::exception& e) {
195 throw std::runtime_error("failed to load names at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what()));
196}
197
198template<class Provisioner, class Externals>
199std::shared_ptr<Base> parse_inner(const H5::Group& handle, Externals& ext, const Version& version, hsize_t buffer_size) try {
200 // Deciding what type we're dealing with.
201 auto object_type = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle, "uzuki_object");
202 std::shared_ptr<Base> output;
203
204 if (object_type == "list") {
205 auto dhandle = ritsuko::hdf5::open_group(handle, "data");
206 size_t len = dhandle.getNumObjs();
207
208 bool named = handle.exists("names");
209 auto lptr = Provisioner::new_List(len, named);
210 output.reset(lptr);
211
212 try {
213 for (size_t i = 0; i < len; ++i) {
214 auto istr = std::to_string(i);
215 auto lhandle = ritsuko::hdf5::open_group(dhandle, istr.c_str());
216 lptr->set(i, parse_inner<Provisioner>(lhandle, ext, version, buffer_size));
217 }
218 } catch (std::exception& e) {
219 throw std::runtime_error("failed to parse list contents in 'data'; " + std::string(e.what()));
220 }
221
222 if (named) {
223 extract_names(handle, lptr, buffer_size);
224 }
225
226 } else if (object_type == "vector") {
227 auto vector_type = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle, "uzuki_type");
228
229 auto dhandle = ritsuko::hdf5::open_dataset(handle, "data");
230 size_t len = ritsuko::hdf5::get_1d_length(dhandle.getSpace(), true);
231 bool is_scalar = (len == 0);
232 if (is_scalar) {
233 len = 1;
234 }
235
236 bool named = handle.exists("names");
237
238 if (vector_type == "integer") {
239 auto iptr = Provisioner::new_Integer(len, named, is_scalar);
240 output.reset(iptr);
241 parse_integer_like(dhandle, iptr, [](int32_t) -> void {}, version, buffer_size);
242
243 } else if (vector_type == "boolean") {
244 auto bptr = Provisioner::new_Boolean(len, named, is_scalar);
245 output.reset(bptr);
246 parse_integer_like(dhandle, bptr, [&](int32_t x) -> void {
247 if (x != 0 && x != 1) {
248 throw std::runtime_error("boolean values should be 0 or 1");
249 }
250 }, version, buffer_size);
251
252 } else if (vector_type == "factor" || (version.equals(1, 0) && vector_type == "ordered")) {
253 auto levhandle = ritsuko::hdf5::open_dataset(handle, "levels");
254 if (!ritsuko::hdf5::is_utf8_string(levhandle)) {
255 throw std::runtime_error("expected a datatype that can be represented by a UTF-8 encoded string");
256 }
257
258 int32_t levlen = ritsuko::hdf5::get_1d_length(levhandle.getSpace(), false);
259 bool ordered = false;
260 if (vector_type == "ordered") {
261 ordered = true;
262 } else if (handle.exists("ordered")) {
263 auto ohandle = check_scalar_dataset(handle, "ordered");
264 if (ritsuko::hdf5::exceeds_integer_limit(ohandle, 32, true)) {
265 throw std::runtime_error("'ordered' value cannot be represented by a 32-bit integer");
266 }
267 int32_t tmp_ordered = 0;
268 ohandle.read(&tmp_ordered, H5::PredType::NATIVE_INT32);
269 ordered = tmp_ordered > 0;
270 }
271
272 auto fptr = Provisioner::new_Factor(len, named, is_scalar, levlen, ordered);
273 output.reset(fptr);
274 parse_integer_like(dhandle, fptr, [&](int32_t x) -> void {
275 if (x < 0 || x >= levlen) {
276 throw std::runtime_error("factor codes should be non-negative and less than the number of levels");
277 }
278 }, version, buffer_size);
279
280 std::unordered_set<std::string> present;
281 ritsuko::hdf5::Stream1dStringDataset stream(&levhandle, levlen, buffer_size);
282 for (int32_t i = 0; i < levlen; ++i, stream.next()) {
283 auto x = stream.steal();
284 if (present.find(x) != present.end()) {
285 throw std::runtime_error("levels should be unique");
286 }
287 fptr->set_level(i, x);
288 present.insert(std::move(x));
289 }
290
291 } else if (vector_type == "string" || (version.equals(1, 0) && (vector_type == "date" || vector_type == "date-time"))) {
292 StringVector::Format format = StringVector::NONE;
293 if (version.equals(1, 0)) {
294 if (vector_type == "date") {
295 format = StringVector::DATE;
296 } else if (vector_type == "date-time") {
297 format = StringVector::DATETIME;
298 }
299
300 } else if (handle.exists("format")) {
301 auto fhandle = check_scalar_dataset(handle, "format");
302 if (!ritsuko::hdf5::is_utf8_string(fhandle)) {
303 throw std::runtime_error("expected a datatype that can be represented by a UTF-8 encoded string");
304 }
305 auto x = ritsuko::hdf5::load_scalar_string_dataset(fhandle);
306 if (x == "date") {
307 format = StringVector::DATE;
308 } else if (x == "date-time") {
309 format = StringVector::DATETIME;
310 } else {
311 throw std::runtime_error("unsupported format '" + x + "'");
312 }
313 }
314
315 auto sptr = Provisioner::new_String(len, named, is_scalar, format);
316 output.reset(sptr);
317 if (format == StringVector::NONE) {
318 parse_string_like(dhandle, sptr, [](const std::string&) -> void {}, buffer_size);
319
320 } else if (format == StringVector::DATE) {
321 parse_string_like(dhandle, sptr, [&](const std::string& x) -> void {
322 if (!ritsuko::is_date(x.c_str(), x.size())) {
323 throw std::runtime_error("dates should follow YYYY-MM-DD formatting");
324 }
325 }, buffer_size);
326
327 } else if (format == StringVector::DATETIME) {
328 parse_string_like(dhandle, sptr, [&](const std::string& x) -> void {
329 if (!ritsuko::is_rfc3339(x.c_str(), x.size())) {
330 throw std::runtime_error("date-times should follow the Internet Date/Time format");
331 }
332 }, buffer_size);
333 }
334
335 } else if (vector_type == "number") {
336 auto dptr = Provisioner::new_Number(len, named, is_scalar);
337 output.reset(dptr);
338 parse_numbers(dhandle, dptr, [](double) -> void {}, version, buffer_size);
339
340 } else {
341 throw std::runtime_error("unknown vector type '" + vector_type + "'");
342 }
343
344 if (named) {
345 auto vptr = static_cast<Vector*>(output.get());
346 extract_names(handle, vptr, buffer_size);
347 }
348
349 } else if (object_type == "nothing") {
350 output.reset(Provisioner::new_Nothing());
351
352 } else if (object_type == "external") {
353 auto ihandle = ritsuko::hdf5::open_dataset(handle, "index");
354 if (ritsuko::hdf5::exceeds_integer_limit(ihandle, 32, true)) {
355 throw std::runtime_error("external index at 'index' cannot be represented by a 32-bit signed integer");
356 }
357
358 auto ispace = ihandle.getSpace();
359 int idims = ispace.getSimpleExtentNdims();
360 if (idims != 0) {
361 throw std::runtime_error("expected scalar dataset at 'index'");
362 }
363
364 int32_t idx;
365 ihandle.read(&idx, H5::PredType::NATIVE_INT32);
366 if (idx < 0 || static_cast<size_t>(idx) >= ext.size()) {
367 throw std::runtime_error("external index out of range at 'index'");
368 }
369
370 output.reset(Provisioner::new_External(ext.get(idx)));
371
372 } else {
373 throw std::runtime_error("unknown uzuki2 object type '" + object_type + "'");
374 }
375
376 return output;
377} catch (std::exception& e) {
378 throw std::runtime_error("failed to load object at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what()));
379}
387struct Options {
391 hsize_t buffer_size = 10000;
392
396 bool strict_list = true;
397};
398
444template<class Provisioner, class Externals>
445ParsedList parse(const H5::Group& handle, Externals ext, Options options = Options()) {
446 Version version;
447 if (handle.attrExists("uzuki_version")) {
448 auto ver_str = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle, "uzuki_version");
449 auto vraw = ritsuko::parse_version_string(ver_str.c_str(), ver_str.size(), /* skip_patch = */ true);
450 version.major = vraw.major;
451 version.minor = vraw.minor;
452 }
453
454 ExternalTracker etrack(std::move(ext));
455 auto ptr = parse_inner<Provisioner>(handle, etrack, version, options.buffer_size);
456
457 if (options.strict_list && ptr->type() != LIST) {
458 throw std::runtime_error("top-level object should represent an R list");
459 }
460 etrack.validate();
461
462 return ParsedList(std::move(ptr), std::move(version));
463}
464
479template<class Provisioner>
480ParsedList parse(const H5::Group& handle, Options options = Options()) {
481 return parse<Provisioner>(handle, uzuki2::DummyExternals(0), std::move(options));
482}
483
500template<class Provisioner, class Externals>
501ParsedList parse(const std::string& file, const std::string& name, Externals ext, Options options = Options()) {
502 H5::H5File handle(file, H5F_ACC_RDONLY);
503 return parse<Provisioner>(ritsuko::hdf5::open_group(handle, name.c_str()), std::move(ext), std::move(options));
504}
505
521template<class Provisioner>
522ParsedList parse(const std::string& file, const std::string& name, Options options = Options()) {
523 H5::H5File handle(file, H5F_ACC_RDONLY);
524 return parse<Provisioner>(ritsuko::hdf5::open_group(handle, name.c_str()), uzuki2::DummyExternals(0), std::move(options));
525}
526
535inline void validate(const H5::Group& handle, int num_external = 0, Options options = Options()) {
538 return;
539}
540
550inline void validate(const std::string& file, const std::string& name, int num_external = 0, Options options = Options()) {
553 return;
554}
555
556}
557
558}
559
560#endif
Defines the interfaces to use in HDF5 parsing.
ParsedList parse(const H5::Group &handle, Externals ext, Options options=Options())
Definition parse_hdf5.hpp:445
void validate(const H5::Group &handle, int num_external=0, Options options=Options())
Definition parse_hdf5.hpp:535
Parse an R list from an HDF5 or JSON file.
Definition parse_json.hpp:30
Format
Definition interfaces.hpp:148
Options for HDF5 file parsing.
Definition parse_hdf5.hpp:387
hsize_t buffer_size
Definition parse_hdf5.hpp:391
bool strict_list
Definition parse_hdf5.hpp:396