uzuki2
Recovering R lists faithfully from HDF5 or JSON
Loading...
Searching...
No Matches
parse_json.hpp
Go to the documentation of this file.
1#ifndef UZUKI2_PARSE_JSON_HPP
2#define UZUKI2_PARSE_JSON_HPP
3
4#include <memory>
5#include <vector>
6#include <cctype>
7#include <string>
8#include <stdexcept>
9#include <cmath>
10#include <unordered_map>
11#include <unordered_set>
12#include <type_traits>
13
14#include "byteme/byteme.hpp"
15#include "millijson/millijson.hpp"
16#include "ritsuko/ritsuko.hpp"
17
18#include "interfaces.hpp"
19#include "Dummy.hpp"
20#include "ExternalTracker.hpp"
21#include "ParsedList.hpp"
22
28namespace uzuki2 {
29
38namespace json {
39
43inline const std::vector<std::shared_ptr<millijson::Base> >& extract_array(
44 const std::unordered_map<std::string, std::shared_ptr<millijson::Base> >& properties,
45 const std::string& name,
46 const std::string& path)
47{
48 auto vIt = properties.find(name);
49 if (vIt == properties.end()) {
50 throw std::runtime_error("expected '" + name + "' property for object at '" + path + "'");
51 }
52
53 const auto& values_ptr = vIt->second;
54 if (values_ptr->type() != millijson::ARRAY) {
55 throw std::runtime_error("expected an array in '" + path + "." + name + "'");
56 }
57
58 return static_cast<const millijson::Array*>(values_ptr.get())->value();
59}
60
61inline const millijson::Array* has_names(const std::unordered_map<std::string, std::shared_ptr<millijson::Base> >& properties, const std::string& path) {
62 auto nIt = properties.find("names");
63 if (nIt == properties.end()) {
64 return NULL;
65 }
66
67 const auto name_ptr = nIt->second;
68 if (name_ptr->type() != millijson::ARRAY) {
69 throw std::runtime_error("expected an array in '" + path + ".names'");
70 }
71 return static_cast<const millijson::Array*>(name_ptr.get());
72}
73
74template<class Destination_>
75void fill_names(const millijson::Array* names_ptr, Destination_* dest, const std::string& path) {
76 const auto& names = names_ptr->value();
77 if (names.size() != dest->size()) {
78 throw std::runtime_error("length of 'names' and 'values' should be the same in '" + path + "'");
79 }
80
81 for (size_t i = 0; i < names.size(); ++i) {
82 if (names[i]->type() != millijson::STRING) {
83 throw std::runtime_error("expected a string at '" + path + ".names[" + std::to_string(i) + "]'");
84 }
85 dest->set_name(i, static_cast<const millijson::String*>(names[i].get())->value());
86 }
87}
88
89template<class Function_>
90auto process_array_or_scalar_values(
91 const std::unordered_map<std::string, std::shared_ptr<millijson::Base> >& properties,
92 const std::string& path,
93 Function_ fun)
94{
95 auto vIt = properties.find("values");
96 if (vIt == properties.end()) {
97 throw std::runtime_error("expected 'values' property for object at '" + path + "'");
98 }
99
100 auto names_ptr = has_names(properties, path);
101 bool has_names = names_ptr != NULL;
102
103 typename std::invoke_result<Function_,std::vector<std::shared_ptr<millijson::Base> >,bool,bool>::type out_ptr;
104
105 const auto& values_ptr = vIt->second;
106 if (values_ptr->type() == millijson::ARRAY) {
107 out_ptr = fun(static_cast<const millijson::Array*>(values_ptr.get())->value(), has_names, false);
108 } else {
109 std::vector<std::shared_ptr<millijson::Base> > temp { values_ptr };
110 out_ptr = fun(temp, has_names, true);
111 }
112
113 if (has_names) {
114 fill_names(names_ptr, out_ptr, path);
115 }
116 return out_ptr;
117}
118
119template<class Destination_, class Function_>
120void extract_integers(const std::vector<std::shared_ptr<millijson::Base> >& values, Destination_* dest, Function_ check, const std::string& path, const Version& version) {
121 for (size_t i = 0; i < values.size(); ++i) {
122 if (values[i]->type() == millijson::NOTHING) {
123 dest->set_missing(i);
124 continue;
125 }
126
127 if (values[i]->type() != millijson::NUMBER) {
128 throw std::runtime_error("expected a number at '" + path + ".values[" + std::to_string(i) + "]'");
129 }
130
131 auto val = static_cast<const millijson::Number*>(values[i].get())->value();
132 if (val != std::floor(val)) {
133 throw std::runtime_error("expected an integer at '" + path + ".values[" + std::to_string(i) + "]'");
134 }
135
136 constexpr double upper = std::numeric_limits<int32_t>::max();
137 constexpr double lower = std::numeric_limits<int32_t>::min();
138 if (val < lower || val > upper) {
139 throw std::runtime_error("value at '" + path + ".values[" + std::to_string(i) + "]' cannot be represented by a 32-bit signed integer");
140 }
141
142 int32_t ival = val;
143 if (version.equals(1, 0) && val == -2147483648) {
144 dest->set_missing(i);
145 continue;
146 }
147
148 check(ival);
149 dest->set(i, ival);
150 }
151}
152
153template<class Destination_, class Function_>
154void extract_strings(const std::vector<std::shared_ptr<millijson::Base> >& values, Destination_* dest, Function_ check, const std::string& path) {
155 for (size_t i = 0; i < values.size(); ++i) {
156 if (values[i]->type() == millijson::NOTHING) {
157 dest->set_missing(i);
158 continue;
159 }
160
161 if (values[i]->type() != millijson::STRING) {
162 throw std::runtime_error("expected a string at '" + path + ".values[" + std::to_string(i) + "]'");
163 }
164
165 const auto& str = static_cast<const millijson::String*>(values[i].get())->value();
166 check(str);
167 dest->set(i, str);
168 }
169}
170
171template<class Provisioner_, class Externals_>
172std::shared_ptr<Base> parse_object(const millijson::Base* contents, Externals_& ext, const std::string& path, const Version& version) {
173 if (contents->type() != millijson::OBJECT) {
174 throw std::runtime_error("each R object should be represented by a JSON object at '" + path + "'");
175 }
176 const auto& map = static_cast<const millijson::Object*>(contents)->value();
177
178 auto tIt = map.find("type");
179 if (tIt == map.end()) {
180 throw std::runtime_error("missing 'type' property for JSON object at '" + path + "'");
181 }
182 const auto& type_ptr = tIt->second;
183 if (type_ptr->type() != millijson::STRING) {
184 throw std::runtime_error("expected a string at '" + path + ".type'");
185 }
186 const auto& type = static_cast<const millijson::String*>(type_ptr.get())->value();
187
188 std::shared_ptr<Base> output;
189 if (type == "nothing") {
190 output.reset(Provisioner_::new_Nothing());
191
192 } else if (type == "external") {
193 auto iIt = map.find("index");
194 if (iIt == map.end()) {
195 throw std::runtime_error("expected 'index' property for 'external' type at '" + path + "'");
196 }
197 const auto& index_ptr = iIt->second;
198 if (index_ptr->type() != millijson::NUMBER) {
199 throw std::runtime_error("expected a number at '" + path + ".index'");
200 }
201 auto index = static_cast<const millijson::Number*>(index_ptr.get())->value();
202
203 if (index != std::floor(index)) {
204 throw std::runtime_error("expected an integer at '" + path + ".index'");
205 } else if (index < 0 || index >= static_cast<double>(ext.size())) {
206 throw std::runtime_error("external index out of range at '" + path + ".index'");
207 }
208 output.reset(Provisioner_::new_External(ext.get(index)));
209
210 } else if (type == "integer") {
211 process_array_or_scalar_values(map, path, [&](const auto& vals, bool named, bool scalar) -> auto {
212 auto ptr = Provisioner_::new_Integer(vals.size(), named, scalar);
213 output.reset(ptr);
214 extract_integers(vals, ptr, [](int32_t) -> void {}, path, version);
215 return ptr;
216 });
217
218 } else if (type == "factor" || (version.equals(1, 0) && type == "ordered")) {
219 bool ordered = false;
220 if (type == "ordered") {
221 ordered = true;
222 } else {
223 auto oIt = map.find("ordered");
224 if (oIt != map.end()) {
225 if (oIt->second->type() != millijson::BOOLEAN) {
226 throw std::runtime_error("expected a boolean at '" + path + ".ordered'");
227 }
228 ordered = static_cast<const millijson::Boolean*>((oIt->second).get())->value();
229 }
230 }
231
232 const std::string levels_name = "levels"; // avoid dangling reference from casting of string literal.
233 const auto& lvals = extract_array(map, levels_name, path);
234 int32_t nlevels = lvals.size();
235 auto fptr = process_array_or_scalar_values(map, path, [&](const auto& vals, bool named, bool scalar) -> auto {
236 auto ptr = Provisioner_::new_Factor(vals.size(), named, scalar, nlevels, ordered);
237 output.reset(ptr);
238 extract_integers(vals, ptr, [&](int32_t x) -> void {
239 if (x < 0 || x >= nlevels) {
240 throw std::runtime_error("factor indices of out of range of levels in '" + path + "'");
241 }
242 }, path, version);
243 return ptr;
244 });
245
246 std::unordered_set<std::string> existing;
247 for (size_t l = 0; l < lvals.size(); ++l) {
248 if (lvals[l]->type() != millijson::STRING) {
249 throw std::runtime_error("expected strings at '" + path + ".levels[" + std::to_string(l) + "]'");
250 }
251
252 const auto& level = static_cast<const millijson::String*>(lvals[l].get())->value();
253 if (existing.find(level) != existing.end()) {
254 throw std::runtime_error("detected duplicate string at '" + path + ".levels[" + std::to_string(l) + "]'");
255 }
256 fptr->set_level(l, level);
257 existing.insert(level);
258 }
259
260 } else if (type == "boolean") {
261 process_array_or_scalar_values(map, path, [&](const auto& vals, bool named, bool scalar) -> auto {
262 auto ptr = Provisioner_::new_Boolean(vals.size(), named, scalar);
263 output.reset(ptr);
264
265 for (size_t i = 0; i < vals.size(); ++i) {
266 if (vals[i]->type() == millijson::NOTHING) {
267 ptr->set_missing(i);
268 continue;
269 }
270
271 if (vals[i]->type() != millijson::BOOLEAN) {
272 throw std::runtime_error("expected a boolean at '" + path + ".values[" + std::to_string(i) + "]'");
273 }
274 ptr->set(i, static_cast<const millijson::Boolean*>(vals[i].get())->value());
275 }
276
277 return ptr;
278 });
279
280 } else if (type == "number") {
281 process_array_or_scalar_values(map, path, [&](const auto& vals, bool named, bool scalar) -> auto {
282 auto ptr = Provisioner_::new_Number(vals.size(), named, scalar);
283 output.reset(ptr);
284
285 for (size_t i = 0; i < vals.size(); ++i) {
286 if (vals[i]->type() == millijson::NOTHING) {
287 ptr->set_missing(i);
288 continue;
289 }
290
291 if (vals[i]->type() == millijson::NUMBER) {
292 ptr->set(i, static_cast<const millijson::Number*>(vals[i].get())->value());
293 } else if (vals[i]->type() == millijson::STRING) {
294 auto str = static_cast<const millijson::String*>(vals[i].get())->value();
295 if (str == "NaN") {
296 ptr->set(i, std::numeric_limits<double>::quiet_NaN());
297 } else if (str == "Inf") {
298 ptr->set(i, std::numeric_limits<double>::infinity());
299 } else if (str == "-Inf") {
300 ptr->set(i, -std::numeric_limits<double>::infinity());
301 } else {
302 throw std::runtime_error("unsupported string '" + str + "' at '" + path + ".values[" + std::to_string(i) + "]'");
303 }
304 } else {
305 throw std::runtime_error("expected a number at '" + path + ".values[" + std::to_string(i) + "]'");
306 }
307 }
308
309 return ptr;
310 });
311
312 } else if (type == "string" || (version.equals(1, 0) && (type == "date" || type == "date-time"))) {
313 StringVector::Format format = StringVector::NONE;
314 if (version.equals(1, 0)) {
315 if (type == "date") {
316 format = StringVector::DATE;
317 } else if (type == "date-time") {
318 format = StringVector::DATETIME;
319 }
320 } else {
321 auto fIt = map.find("format");
322 if (fIt != map.end()) {
323 if (fIt->second->type() != millijson::STRING) {
324 throw std::runtime_error("expected a string at '" + path + ".format'");
325 }
326 auto fptr = static_cast<const millijson::String*>(fIt->second.get());
327 if (fptr->value() == "date") {
328 format = StringVector::DATE;
329 } else if (fptr->value() == "date-time") {
330 format = StringVector::DATETIME;
331 } else {
332 throw std::runtime_error("unsupported format '" + fptr->value() + "' at '" + path + ".format'");
333 }
334 }
335 }
336
337 process_array_or_scalar_values(map, path, [&](const auto& vals, bool named, bool scalar) -> auto {
338 auto ptr = Provisioner_::new_String(vals.size(), named, scalar, format);
339 output.reset(ptr);
340
341 if (format == StringVector::NONE) {
342 extract_strings(vals, ptr, [](const std::string&) -> void {}, path);
343 } else if (format == StringVector::DATE) {
344 extract_strings(vals, ptr, [&](const std::string& x) -> void {
345 if (!ritsuko::is_date(x.c_str(), x.size())) {
346 throw std::runtime_error("dates should follow YYYY-MM-DD formatting in '" + path + ".values'");
347 }
348 }, path);
349 } else if (format == StringVector::DATETIME) {
350 extract_strings(vals, ptr, [&](const std::string& x) -> void {
351 if (!ritsuko::is_rfc3339(x.c_str(), x.size())) {
352 throw std::runtime_error("date-times should follow the Internet Date/Time format in '" + path + ".values'");
353 }
354 }, path);
355 }
356
357 return ptr;
358 });
359
360 } else if (type == "list") {
361 auto names_ptr = has_names(map, path);
362 bool has_names = names_ptr != NULL;
363
364 const std::string values_name = "values"; // avoid dangling reference from casting of string literal.
365 const auto& vals = extract_array(map, values_name, path);
366 auto ptr = Provisioner_::new_List(vals.size(), has_names);
367 output.reset(ptr);
368
369 for (size_t i = 0; i < vals.size(); ++i) {
370 ptr->set(i, parse_object<Provisioner_>(vals[i].get(), ext, path + ".values[" + std::to_string(i) + "]", version));
371 }
372
373 if (has_names) {
374 fill_names(names_ptr, ptr, path);
375 }
376
377 } else {
378 throw std::runtime_error("unknown object type '" + type + "' at '" + path + ".type'");
379 }
380
381 return output;
382}
/**
 * @brief Options for JSON file parsing.
 */
struct Options {
    /**
     * Selects the parallel branch of `parse()` when setting up the byte source.
     * NOTE(review): presumably this overlaps reading with parsing; confirm against the byteme documentation.
     */
    bool parallel{false};

    /**
     * Whether `parse()` should throw if the top-level object is not a list.
     */
    bool strict_list{true};

    /**
     * Buffer size forwarded to the reader options in `parse_file()` and `parse_buffer()`.
     * NOTE(review): presumably expressed in bytes; confirm against the byteme documentation.
     */
    size_t buffer_size{65536};
};
408
426template<class Provisioner_, class Externals_>
427ParsedList parse(byteme::Reader& reader, Externals_ ext, const Options& options) {
428 std::unique_ptr<byteme::PerByteInterface<char> > pb;
429 if (options.parallel) {
431 } else {
433 }
434 auto contents = millijson::parse(*pb);
435
436 Version version;
437 if (contents->type() == millijson::OBJECT) {
438 const auto& map = static_cast<const millijson::Object*>(contents.get())->value();
439 auto vIt = map.find("version");
440 if (vIt != map.end()) {
441 if (vIt->second->type() != millijson::STRING) {
442 throw std::runtime_error("expected a string in 'version'");
443 }
444 const auto& vstr = static_cast<const millijson::String*>(vIt->second.get())->value();
445 auto vraw = ritsuko::parse_version_string(vstr.c_str(), vstr.size(), /* skip_patch = */ true);
446 version.major = vraw.major;
447 version.minor = vraw.minor;
448 }
449 }
450
451 ExternalTracker etrack(std::move(ext));
452 auto output = parse_object<Provisioner_>(contents.get(), etrack, "", version);
453
454 if (options.strict_list && output->type() != LIST) {
455 throw std::runtime_error("top-level object should represent an R list");
456 }
457 etrack.validate();
458
459 return ParsedList(std::move(output), std::move(version));
460}
461
479template<class Provisioner_, class Externals_>
480ParsedList parse_file(const std::string& file, Externals_ ext, const Options& options) {
481 byteme::SomeFileReader reader(file.c_str(), [&]{
482 byteme::SomeFileReaderOptions sopt;
483 sopt.buffer_size = options.buffer_size;
484 return sopt;
485 }());
486 return parse<Provisioner_>(reader, std::move(ext), options);
487}
488
507template<class Provisioner_, class Externals_>
508ParsedList parse_buffer(const unsigned char* buffer, size_t len, Externals_ ext, const Options& options) {
509 byteme::SomeBufferReader reader(buffer, len, [&]{
511 sopt.buffer_size = options.buffer_size;
512 return sopt;
513 }());
514 return parse<Provisioner_>(reader, std::move(ext), options);
515}
516
525inline void validate(byteme::Reader& reader, int num_external, const Options& options) {
526 parse<DummyProvisioner>(reader, DummyExternals(num_external), options);
527}
528
537inline void validate_file(const std::string& file, int num_external, const Options& options) {
538 parse_file<DummyProvisioner>(file, DummyExternals(num_external), options);
539}
540
550inline void validate_buffer(const unsigned char* buffer, size_t len, int num_external, const Options& options) {
551 parse_buffer<DummyProvisioner>(buffer, len, DummyExternals(num_external), options);
552}
553
554}
555
556}
557
558#endif
Dummy classes for parsing without storing the results.
Class to hold the parsed list.
Dummy class satisfying the Externals_ interface of hdf5::parse().
Definition Dummy.hpp:130
Format
Definition interfaces.hpp:158
Defines the interfaces to use in HDF5 parsing.
void validate_file(const std::string &file, int num_external, const Options &options)
Definition parse_json.hpp:537
ParsedList parse(byteme::Reader &reader, Externals_ ext, const Options &options)
Definition parse_json.hpp:427
void validate_buffer(const unsigned char *buffer, size_t len, int num_external, const Options &options)
Definition parse_json.hpp:550
void validate(byteme::Reader &reader, int num_external, const Options &options)
Definition parse_json.hpp:525
ParsedList parse_buffer(const unsigned char *buffer, size_t len, Externals_ ext, const Options &options)
Definition parse_json.hpp:508
ParsedList parse_file(const std::string &file, Externals_ ext, const Options &options)
Definition parse_json.hpp:480
Parse an R list from an HDF5 or JSON file.
Definition parse_json.hpp:28
Results of parsing a list from file.
Definition ParsedList.hpp:19
Options for JSON file parsing.
Definition parse_json.hpp:390
bool strict_list
Definition parse_json.hpp:400
bool parallel
Definition parse_json.hpp:395
size_t buffer_size
Definition parse_json.hpp:406