uzuki2
Recovering R lists faithfully from HDF5 or JSON
Loading...
Searching...
No Matches
parse_json.hpp
Go to the documentation of this file.
1#ifndef UZUKI2_PARSE_JSON_HPP
2#define UZUKI2_PARSE_JSON_HPP
3
4#include <memory>
5#include <vector>
6#include <cctype>
7#include <string>
8#include <stdexcept>
9#include <cmath>
10#include <unordered_map>
11#include <unordered_set>
12#include <type_traits>
13
14#include "byteme/PerByte.hpp"
15#include "byteme/SomeFileReader.hpp"
16#include "byteme/SomeBufferReader.hpp"
17#include "millijson/millijson.hpp"
18#include "ritsuko/ritsuko.hpp"
19
20#include "interfaces.hpp"
21#include "Dummy.hpp"
22#include "ExternalTracker.hpp"
23#include "ParsedList.hpp"
24
30namespace uzuki2 {
31
40namespace json {
41
45inline const std::vector<std::shared_ptr<millijson::Base> >& extract_array(
46 const std::unordered_map<std::string, std::shared_ptr<millijson::Base> >& properties,
47 const std::string& name,
48 const std::string& path)
49{
50 auto vIt = properties.find(name);
51 if (vIt == properties.end()) {
52 throw std::runtime_error("expected '" + name + "' property for object at '" + path + "'");
53 }
54
55 const auto& values_ptr = vIt->second;
56 if (values_ptr->type() != millijson::ARRAY) {
57 throw std::runtime_error("expected an array in '" + path + "." + name + "'");
58 }
59
60 return static_cast<const millijson::Array*>(values_ptr.get())->values;
61}
62
63inline const millijson::Array* has_names(const std::unordered_map<std::string, std::shared_ptr<millijson::Base> >& properties, const std::string& path) {
64 auto nIt = properties.find("names");
65 if (nIt == properties.end()) {
66 return NULL;
67 }
68
69 const auto name_ptr = nIt->second;
70 if (name_ptr->type() != millijson::ARRAY) {
71 throw std::runtime_error("expected an array in '" + path + ".names'");
72 }
73 return static_cast<const millijson::Array*>(name_ptr.get());
74}
75
76template<class Destination>
77void fill_names(const millijson::Array* names_ptr, Destination* dest, const std::string& path) {
78 const auto& names = names_ptr->values;
79 if (names.size() != dest->size()) {
80 throw std::runtime_error("length of 'names' and 'values' should be the same in '" + path + "'");
81 }
82
83 for (size_t i = 0; i < names.size(); ++i) {
84 if (names[i]->type() != millijson::STRING) {
85 throw std::runtime_error("expected a string at '" + path + ".names[" + std::to_string(i) + "]'");
86 }
87 dest->set_name(i, static_cast<const millijson::String*>(names[i].get())->value);
88 }
89}
90
91template<class Function>
93 const std::unordered_map<std::string, std::shared_ptr<millijson::Base> >& properties,
94 const std::string& path,
96{
97 auto vIt = properties.find("values");
98 if (vIt == properties.end()) {
99 throw std::runtime_error("expected 'values' property for object at '" + path + "'");
100 }
101
103 bool has_names = names_ptr != NULL;
104
105 typename std::invoke_result<Function,std::vector<std::shared_ptr<millijson::Base> >,bool,bool>::type out_ptr;
106
107 const auto& values_ptr = vIt->second;
108 if (values_ptr->type() == millijson::ARRAY) {
109 out_ptr = fun(static_cast<const millijson::Array*>(values_ptr.get())->values, has_names, false);
110 } else {
111 std::vector<std::shared_ptr<millijson::Base> > temp { values_ptr };
112 out_ptr = fun(temp, has_names, true);
113 }
114
115 if (has_names) {
117 }
118 return out_ptr;
119}
120
121template<class Destination, class Function>
122void extract_integers(const std::vector<std::shared_ptr<millijson::Base> >& values, Destination* dest, Function check, const std::string& path, const Version& version) {
123 for (size_t i = 0; i < values.size(); ++i) {
124 if (values[i]->type() == millijson::NOTHING) {
125 dest->set_missing(i);
126 continue;
127 }
128
129 if (values[i]->type() != millijson::NUMBER) {
130 throw std::runtime_error("expected a number at '" + path + ".values[" + std::to_string(i) + "]'");
131 }
132
133 auto val = static_cast<const millijson::Number*>(values[i].get())->value;
134 if (val != std::floor(val)) {
135 throw std::runtime_error("expected an integer at '" + path + ".values[" + std::to_string(i) + "]'");
136 }
137
138 constexpr double upper = std::numeric_limits<int32_t>::max();
139 constexpr double lower = std::numeric_limits<int32_t>::min();
141 throw std::runtime_error("value at '" + path + ".values[" + std::to_string(i) + "]' cannot be represented by a 32-bit signed integer");
142 }
143
144 int32_t ival = val;
145 if (version.equals(1, 0) && val == -2147483648) {
146 dest->set_missing(i);
147 continue;
148 }
149
150 check(ival);
151 dest->set(i, ival);
152 }
153}
154
155template<class Destination, class Function>
156void extract_strings(const std::vector<std::shared_ptr<millijson::Base> >& values, Destination* dest, Function check, const std::string& path) {
157 for (size_t i = 0; i < values.size(); ++i) {
158 if (values[i]->type() == millijson::NOTHING) {
159 dest->set_missing(i);
160 continue;
161 }
162
163 if (values[i]->type() != millijson::STRING) {
164 throw std::runtime_error("expected a string at '" + path + ".values[" + std::to_string(i) + "]'");
165 }
166
167 const auto& str = static_cast<const millijson::String*>(values[i].get())->value;
168 check(str);
169 dest->set(i, str);
170 }
171}
172
173template<class Provisioner, class Externals>
174std::shared_ptr<Base> parse_object(const millijson::Base* contents, Externals& ext, const std::string& path, const Version& version) {
175 if (contents->type() != millijson::OBJECT) {
176 throw std::runtime_error("each R object should be represented by a JSON object at '" + path + "'");
177 }
178
179 auto optr = static_cast<const millijson::Object*>(contents);
180 const auto& map = optr->values;
181
182 auto tIt = map.find("type");
183 if (tIt == map.end()) {
184 throw std::runtime_error("missing 'type' property for JSON object at '" + path + "'");
185 }
186 const auto& type_ptr = tIt->second;
187 if (type_ptr->type() != millijson::STRING) {
188 throw std::runtime_error("expected a string at '" + path + ".type'");
189 }
190 const auto& type = static_cast<const millijson::String*>(type_ptr.get())->value;
191
192 std::shared_ptr<Base> output;
193 if (type == "nothing") {
194 output.reset(Provisioner::new_Nothing());
195
196 } else if (type == "external") {
197 auto iIt = map.find("index");
198 if (iIt == map.end()) {
199 throw std::runtime_error("expected 'index' property for 'external' type at '" + path + "'");
200 }
201 const auto& index_ptr = iIt->second;
202 if (index_ptr->type() != millijson::NUMBER) {
203 throw std::runtime_error("expected a number at '" + path + ".index'");
204 }
205 auto index = static_cast<const millijson::Number*>(index_ptr.get())->value;
206
207 if (index != std::floor(index)) {
208 throw std::runtime_error("expected an integer at '" + path + ".index'");
209 } else if (index < 0 || index >= static_cast<double>(ext.size())) {
210 throw std::runtime_error("external index out of range at '" + path + ".index'");
211 }
212 output.reset(Provisioner::new_External(ext.get(index)));
213
214 } else if (type == "integer") {
215 process_array_or_scalar_values(map, path, [&](const auto& vals, bool named, bool scalar) -> auto {
216 auto ptr = Provisioner::new_Integer(vals.size(), named, scalar);
217 output.reset(ptr);
218 extract_integers(vals, ptr, [](int32_t) -> void {}, path, version);
219 return ptr;
220 });
221
222 } else if (type == "factor" || (version.equals(1, 0) && type == "ordered")) {
223 bool ordered = false;
224 if (type == "ordered") {
225 ordered = true;
226 } else {
227 auto oIt = map.find("ordered");
228 if (oIt != map.end()) {
229 if (oIt->second->type() != millijson::BOOLEAN) {
230 throw std::runtime_error("expected a boolean at '" + path + ".ordered'");
231 }
232 auto optr = static_cast<const millijson::Boolean*>((oIt->second).get());
233 ordered = optr->value;
234 }
235 }
236
237 const auto& lvals = extract_array(map, "levels", path);
238 int32_t nlevels = lvals.size();
239 auto fptr = process_array_or_scalar_values(map, path, [&](const auto& vals, bool named, bool scalar) -> auto {
240 auto ptr = Provisioner::new_Factor(vals.size(), named, scalar, nlevels, ordered);
241 output.reset(ptr);
242 extract_integers(vals, ptr, [&](int32_t x) -> void {
243 if (x < 0 || x >= nlevels) {
244 throw std::runtime_error("factor indices of out of range of levels in '" + path + "'");
245 }
246 }, path, version);
247 return ptr;
248 });
249
250 std::unordered_set<std::string> existing;
251 for (size_t l = 0; l < lvals.size(); ++l) {
252 if (lvals[l]->type() != millijson::STRING) {
253 throw std::runtime_error("expected strings at '" + path + ".levels[" + std::to_string(l) + "]'");
254 }
255
256 const auto& level = static_cast<const millijson::String*>(lvals[l].get())->value;
257 if (existing.find(level) != existing.end()) {
258 throw std::runtime_error("detected duplicate string at '" + path + ".levels[" + std::to_string(l) + "]'");
259 }
260 fptr->set_level(l, level);
261 existing.insert(level);
262 }
263
264 } else if (type == "boolean") {
265 process_array_or_scalar_values(map, path, [&](const auto& vals, bool named, bool scalar) -> auto {
266 auto ptr = Provisioner::new_Boolean(vals.size(), named, scalar);
267 output.reset(ptr);
268
269 for (size_t i = 0; i < vals.size(); ++i) {
270 if (vals[i]->type() == millijson::NOTHING) {
271 ptr->set_missing(i);
272 continue;
273 }
274
275 if (vals[i]->type() != millijson::BOOLEAN) {
276 throw std::runtime_error("expected a boolean at '" + path + ".values[" + std::to_string(i) + "]'");
277 }
278 ptr->set(i, static_cast<const millijson::Boolean*>(vals[i].get())->value);
279 }
280
281 return ptr;
282 });
283
284 } else if (type == "number") {
285 process_array_or_scalar_values(map, path, [&](const auto& vals, bool named, bool scalar) -> auto {
286 auto ptr = Provisioner::new_Number(vals.size(), named, scalar);
287 output.reset(ptr);
288
289 for (size_t i = 0; i < vals.size(); ++i) {
290 if (vals[i]->type() == millijson::NOTHING) {
291 ptr->set_missing(i);
292 continue;
293 }
294
295 if (vals[i]->type() == millijson::NUMBER) {
296 ptr->set(i, static_cast<const millijson::Number*>(vals[i].get())->value);
297 } else if (vals[i]->type() == millijson::STRING) {
298 auto str = static_cast<const millijson::String*>(vals[i].get())->value;
299 if (str == "NaN") {
300 ptr->set(i, std::numeric_limits<double>::quiet_NaN());
301 } else if (str == "Inf") {
302 ptr->set(i, std::numeric_limits<double>::infinity());
303 } else if (str == "-Inf") {
304 ptr->set(i, -std::numeric_limits<double>::infinity());
305 } else {
306 throw std::runtime_error("unsupported string '" + str + "' at '" + path + ".values[" + std::to_string(i) + "]'");
307 }
308 } else {
309 throw std::runtime_error("expected a number at '" + path + ".values[" + std::to_string(i) + "]'");
310 }
311 }
312
313 return ptr;
314 });
315
316 } else if (type == "string" || (version.equals(1, 0) && (type == "date" || type == "date-time"))) {
317 StringVector::Format format = StringVector::NONE;
318 if (version.equals(1, 0)) {
319 if (type == "date") {
320 format = StringVector::DATE;
321 } else if (type == "date-time") {
322 format = StringVector::DATETIME;
323 }
324 } else {
325 auto fIt = map.find("format");
326 if (fIt != map.end()) {
327 if (fIt->second->type() != millijson::STRING) {
328 throw std::runtime_error("expected a string at '" + path + ".format'");
329 }
330 auto fptr = static_cast<const millijson::String*>(fIt->second.get());
331 if (fptr->value == "date") {
332 format = StringVector::DATE;
333 } else if (fptr->value == "date-time") {
334 format = StringVector::DATETIME;
335 } else {
336 throw std::runtime_error("unsupported format '" + fptr->value + "' at '" + path + ".format'");
337 }
338 }
339 }
340
341 process_array_or_scalar_values(map, path, [&](const auto& vals, bool named, bool scalar) -> auto {
342 auto ptr = Provisioner::new_String(vals.size(), named, scalar, format);
343 output.reset(ptr);
344
345 if (format == StringVector::NONE) {
346 extract_strings(vals, ptr, [](const std::string&) -> void {}, path);
347 } else if (format == StringVector::DATE) {
348 extract_strings(vals, ptr, [&](const std::string& x) -> void {
349 if (!ritsuko::is_date(x.c_str(), x.size())) {
350 throw std::runtime_error("dates should follow YYYY-MM-DD formatting in '" + path + ".values'");
351 }
352 }, path);
353 } else if (format == StringVector::DATETIME) {
354 extract_strings(vals, ptr, [&](const std::string& x) -> void {
355 if (!ritsuko::is_rfc3339(x.c_str(), x.size())) {
356 throw std::runtime_error("date-times should follow the Internet Date/Time format in '" + path + ".values'");
357 }
358 }, path);
359 }
360
361 return ptr;
362 });
363
364 } else if (type == "list") {
365 auto names_ptr = has_names(map, path);
366 bool has_names = names_ptr != NULL;
367
368 const auto& vals = extract_array(map, "values", path);
369 auto ptr = Provisioner::new_List(vals.size(), has_names);
370 output.reset(ptr);
371
372 for (size_t i = 0; i < vals.size(); ++i) {
373 ptr->set(i, parse_object<Provisioner>(vals[i].get(), ext, path + ".values[" + std::to_string(i) + "]", version));
374 }
375
376 if (has_names) {
378 }
379
380 } else {
381 throw std::runtime_error("unknown object type '" + type + "' at '" + path + ".type'");
382 }
383
384 return output;
385}
/**
 * @brief Options for JSON file parsing.
 */
struct Options {
    /**
     * Requests parallel reading of the input. `parse()` consults this flag when
     * choosing between `byteme::PerByte` and `byteme::PerByteParallel`.
     * Defaults to `false`.
     */
    bool parallel = false;

    /**
     * Whether `parse()` should throw when the top-level object is not an R list.
     * Defaults to `true`.
     */
    bool strict_list = true;
};
405
423template<class Provisioner, class Externals>
425 std::shared_ptr<millijson::Base> contents;
426 if (options.parallel) {
427 byteme::PerByte bytestream(&reader);
428 contents = millijson::parse(bytestream);
429 } else {
430 byteme::PerByteParallel bytestream(&reader);
431 contents = millijson::parse(bytestream);
432 }
433
435 if (contents->type() == millijson::OBJECT) {
436 auto optr = static_cast<const millijson::Object*>(contents.get());
437 const auto& map = optr->values;
438 auto vIt = map.find("version");
439 if (vIt != map.end()) {
440 if (vIt->second->type() != millijson::STRING) {
441 throw std::runtime_error("expected a string in 'version'");
442 }
443 const auto& vstr = static_cast<const millijson::String*>(vIt->second.get())->value;
444 auto vraw = ritsuko::parse_version_string(vstr.c_str(), vstr.size(), /* skip_patch = */ true);
445 version.major = vraw.major;
446 version.minor = vraw.minor;
447 }
448 }
449
450 ExternalTracker etrack(std::move(ext));
452
453 if (options.strict_list && output->type() != LIST) {
454 throw std::runtime_error("top-level object should represent an R list");
455 }
456 etrack.validate();
457
458 return ParsedList(std::move(output), std::move(version));
459}
460
475template<class Provisioner>
478 return parse<Provisioner>(reader, std::move(ext), std::move(options));
479}
480
498template<class Provisioner, class Externals>
500 byteme::SomeFileReader reader(file.c_str());
501 return parse<Provisioner>(reader, std::move(ext), std::move(options));
502}
503
518template<class Provisioner>
521 return parse_file<Provisioner>(file, std::move(ext), std::move(options));
522}
523
542template<class Provisioner, class Externals>
543ParsedList parse_buffer(const unsigned char* buffer, size_t len, Externals ext, Options options = Options()) {
544 byteme::SomeBufferReader reader(buffer, len);
545 return parse<Provisioner>(reader, std::move(ext), std::move(options));
546}
547
563template<class Provisioner>
564ParsedList parse_buffer(const unsigned char* buffer, size_t len, Options options = Options()) {
566 return parse_buffer<Provisioner>(buffer, len, std::move(ext), std::move(options));
567}
568
577inline void validate(byteme::Reader& reader, int num_external = 0, Options options = Options()) {
579 parse<DummyProvisioner>(reader, std::move(ext), std::move(options));
580 return;
581}
582
591inline void validate_file(const std::string& file, int num_external = 0, Options options = Options()) {
593 parse_file<DummyProvisioner>(file, std::move(ext), std::move(options));
594 return;
595}
596
606inline void validate_buffer(const unsigned char* buffer, size_t len, int num_external = 0, Options options = Options()) {
608 parse_buffer<DummyProvisioner>(buffer, len, std::move(ext), std::move(options));
609 return;
610}
611
612}
613
614}
615
616#endif
Defines the interfaces to use in HDF5 parsing.
ParsedList parse_file(const std::string &file, Externals ext, Options options=Options())
Definition parse_json.hpp:499
ParsedList parse(byteme::Reader &reader, Externals ext, Options options=Options())
Definition parse_json.hpp:424
void validate_file(const std::string &file, int num_external=0, Options options=Options())
Definition parse_json.hpp:591
void validate(byteme::Reader &reader, int num_external=0, Options options=Options())
Definition parse_json.hpp:577
void validate_buffer(const unsigned char *buffer, size_t len, int num_external=0, Options options=Options())
Definition parse_json.hpp:606
ParsedList parse_buffer(const unsigned char *buffer, size_t len, Externals ext, Options options=Options())
Definition parse_json.hpp:543
Parse an R list from an HDF5 or JSON file.
Definition parse_json.hpp:30
Format
Definition interfaces.hpp:148
Options for JSON file parsing.
Definition parse_json.hpp:393
bool strict_list
Definition parse_json.hpp:403
bool parallel
Definition parse_json.hpp:398