uzuki2
Recovering R lists faithfully from HDF5 or JSON
Loading...
Searching...
No Matches
parse_json.hpp
Go to the documentation of this file.
1#ifndef UZUKI2_PARSE_JSON_HPP
2#define UZUKI2_PARSE_JSON_HPP
3
#include <memory>
#include <vector>
#include <cctype>
#include <cstdint>
#include <limits>
#include <string>
#include <stdexcept>
#include <cmath>
#include <unordered_map>
#include <unordered_set>
#include <type_traits>

#include "byteme/PerByte.hpp"
#include "byteme/SomeFileReader.hpp"
#include "byteme/SomeBufferReader.hpp"
#include "millijson/millijson.hpp"

#include "interfaces.hpp"
#include "Dummy.hpp"
#include "ExternalTracker.hpp"
#include "ParsedList.hpp"
23
29namespace uzuki2 {
30
39namespace json {
40
44inline const std::vector<std::shared_ptr<millijson::Base> >& extract_array(
45 const std::unordered_map<std::string, std::shared_ptr<millijson::Base> >& properties,
46 const std::string& name,
47 const std::string& path)
48{
49 auto vIt = properties.find(name);
50 if (vIt == properties.end()) {
51 throw std::runtime_error("expected '" + name + "' property for object at '" + path + "'");
52 }
53
54 const auto& values_ptr = vIt->second;
55 if (values_ptr->type() != millijson::ARRAY) {
56 throw std::runtime_error("expected an array in '" + path + "." + name + "'");
57 }
58
59 return static_cast<const millijson::Array*>(values_ptr.get())->values;
60}
61
62inline const millijson::Array* has_names(const std::unordered_map<std::string, std::shared_ptr<millijson::Base> >& properties, const std::string& path) {
63 auto nIt = properties.find("names");
64 if (nIt == properties.end()) {
65 return NULL;
66 }
67
68 const auto name_ptr = nIt->second;
69 if (name_ptr->type() != millijson::ARRAY) {
70 throw std::runtime_error("expected an array in '" + path + ".names'");
71 }
72 return static_cast<const millijson::Array*>(name_ptr.get());
73}
74
75template<class Destination>
76void fill_names(const millijson::Array* names_ptr, Destination* dest, const std::string& path) {
77 const auto& names = names_ptr->values;
78 if (names.size() != dest->size()) {
79 throw std::runtime_error("length of 'names' and 'values' should be the same in '" + path + "'");
80 }
81
82 for (size_t i = 0; i < names.size(); ++i) {
83 if (names[i]->type() != millijson::STRING) {
84 throw std::runtime_error("expected a string at '" + path + ".names[" + std::to_string(i) + "]'");
85 }
86 dest->set_name(i, static_cast<const millijson::String*>(names[i].get())->value);
87 }
88}
89
90template<class Function>
92 const std::unordered_map<std::string, std::shared_ptr<millijson::Base> >& properties,
93 const std::string& path,
95{
96 auto vIt = properties.find("values");
97 if (vIt == properties.end()) {
98 throw std::runtime_error("expected 'values' property for object at '" + path + "'");
99 }
100
102 bool has_names = names_ptr != NULL;
103
104 typename std::invoke_result<Function,std::vector<std::shared_ptr<millijson::Base> >,bool,bool>::type out_ptr;
105
106 const auto& values_ptr = vIt->second;
107 if (values_ptr->type() == millijson::ARRAY) {
108 out_ptr = fun(static_cast<const millijson::Array*>(values_ptr.get())->values, has_names, false);
109 } else {
110 std::vector<std::shared_ptr<millijson::Base> > temp { values_ptr };
111 out_ptr = fun(temp, has_names, true);
112 }
113
114 if (has_names) {
116 }
117 return out_ptr;
118}
119
120template<class Destination, class Function>
121void extract_integers(const std::vector<std::shared_ptr<millijson::Base> >& values, Destination* dest, Function check, const std::string& path, const Version& version) {
122 for (size_t i = 0; i < values.size(); ++i) {
123 if (values[i]->type() == millijson::NOTHING) {
124 dest->set_missing(i);
125 continue;
126 }
127
128 if (values[i]->type() != millijson::NUMBER) {
129 throw std::runtime_error("expected a number at '" + path + ".values[" + std::to_string(i) + "]'");
130 }
131
132 auto val = static_cast<const millijson::Number*>(values[i].get())->value;
133 if (val != std::floor(val)) {
134 throw std::runtime_error("expected an integer at '" + path + ".values[" + std::to_string(i) + "]'");
135 }
136
137 constexpr double upper = std::numeric_limits<int32_t>::max();
138 constexpr double lower = std::numeric_limits<int32_t>::min();
140 throw std::runtime_error("value at '" + path + ".values[" + std::to_string(i) + "]' cannot be represented by a 32-bit signed integer");
141 }
142
143 int32_t ival = val;
144 if (version.equals(1, 0) && val == -2147483648) {
145 dest->set_missing(i);
146 continue;
147 }
148
149 check(ival);
150 dest->set(i, ival);
151 }
152}
153
154template<class Destination, class Function>
155void extract_strings(const std::vector<std::shared_ptr<millijson::Base> >& values, Destination* dest, Function check, const std::string& path) {
156 for (size_t i = 0; i < values.size(); ++i) {
157 if (values[i]->type() == millijson::NOTHING) {
158 dest->set_missing(i);
159 continue;
160 }
161
162 if (values[i]->type() != millijson::STRING) {
163 throw std::runtime_error("expected a string at '" + path + ".values[" + std::to_string(i) + "]'");
164 }
165
166 const auto& str = static_cast<const millijson::String*>(values[i].get())->value;
167 check(str);
168 dest->set(i, str);
169 }
170}
171
172template<class Provisioner, class Externals>
173std::shared_ptr<Base> parse_object(const millijson::Base* contents, Externals& ext, const std::string& path, const Version& version) {
174 if (contents->type() != millijson::OBJECT) {
175 throw std::runtime_error("each R object should be represented by a JSON object at '" + path + "'");
176 }
177
178 auto optr = static_cast<const millijson::Object*>(contents);
179 const auto& map = optr->values;
180
181 auto tIt = map.find("type");
182 if (tIt == map.end()) {
183 throw std::runtime_error("missing 'type' property for JSON object at '" + path + "'");
184 }
185 const auto& type_ptr = tIt->second;
186 if (type_ptr->type() != millijson::STRING) {
187 throw std::runtime_error("expected a string at '" + path + ".type'");
188 }
189 const auto& type = static_cast<const millijson::String*>(type_ptr.get())->value;
190
191 std::shared_ptr<Base> output;
192 if (type == "nothing") {
193 output.reset(Provisioner::new_Nothing());
194
195 } else if (type == "external") {
196 auto iIt = map.find("index");
197 if (iIt == map.end()) {
198 throw std::runtime_error("expected 'index' property for 'external' type at '" + path + "'");
199 }
200 const auto& index_ptr = iIt->second;
201 if (index_ptr->type() != millijson::NUMBER) {
202 throw std::runtime_error("expected a number at '" + path + ".index'");
203 }
204 auto index = static_cast<const millijson::Number*>(index_ptr.get())->value;
205
206 if (index != std::floor(index)) {
207 throw std::runtime_error("expected an integer at '" + path + ".index'");
208 } else if (index < 0 || index >= static_cast<double>(ext.size())) {
209 throw std::runtime_error("external index out of range at '" + path + ".index'");
210 }
211 output.reset(Provisioner::new_External(ext.get(index)));
212
213 } else if (type == "integer") {
214 process_array_or_scalar_values(map, path, [&](const auto& vals, bool named, bool scalar) -> auto {
215 auto ptr = Provisioner::new_Integer(vals.size(), named, scalar);
216 output.reset(ptr);
217 extract_integers(vals, ptr, [](int32_t) -> void {}, path, version);
218 return ptr;
219 });
220
221 } else if (type == "factor" || (version.equals(1, 0) && type == "ordered")) {
222 bool ordered = false;
223 if (type == "ordered") {
224 ordered = true;
225 } else {
226 auto oIt = map.find("ordered");
227 if (oIt != map.end()) {
228 if (oIt->second->type() != millijson::BOOLEAN) {
229 throw std::runtime_error("expected a boolean at '" + path + ".ordered'");
230 }
231 auto optr = static_cast<const millijson::Boolean*>((oIt->second).get());
232 ordered = optr->value;
233 }
234 }
235
236 const auto& lvals = extract_array(map, "levels", path);
237 int32_t nlevels = lvals.size();
238 auto fptr = process_array_or_scalar_values(map, path, [&](const auto& vals, bool named, bool scalar) -> auto {
239 auto ptr = Provisioner::new_Factor(vals.size(), named, scalar, nlevels, ordered);
240 output.reset(ptr);
241 extract_integers(vals, ptr, [&](int32_t x) -> void {
242 if (x < 0 || x >= nlevels) {
243 throw std::runtime_error("factor indices of out of range of levels in '" + path + "'");
244 }
245 }, path, version);
246 return ptr;
247 });
248
249 std::unordered_set<std::string> existing;
250 for (size_t l = 0; l < lvals.size(); ++l) {
251 if (lvals[l]->type() != millijson::STRING) {
252 throw std::runtime_error("expected strings at '" + path + ".levels[" + std::to_string(l) + "]'");
253 }
254
255 const auto& level = static_cast<const millijson::String*>(lvals[l].get())->value;
256 if (existing.find(level) != existing.end()) {
257 throw std::runtime_error("detected duplicate string at '" + path + ".levels[" + std::to_string(l) + "]'");
258 }
259 fptr->set_level(l, level);
260 existing.insert(level);
261 }
262
263 } else if (type == "boolean") {
264 process_array_or_scalar_values(map, path, [&](const auto& vals, bool named, bool scalar) -> auto {
265 auto ptr = Provisioner::new_Boolean(vals.size(), named, scalar);
266 output.reset(ptr);
267
268 for (size_t i = 0; i < vals.size(); ++i) {
269 if (vals[i]->type() == millijson::NOTHING) {
270 ptr->set_missing(i);
271 continue;
272 }
273
274 if (vals[i]->type() != millijson::BOOLEAN) {
275 throw std::runtime_error("expected a boolean at '" + path + ".values[" + std::to_string(i) + "]'");
276 }
277 ptr->set(i, static_cast<const millijson::Boolean*>(vals[i].get())->value);
278 }
279
280 return ptr;
281 });
282
283 } else if (type == "number") {
284 process_array_or_scalar_values(map, path, [&](const auto& vals, bool named, bool scalar) -> auto {
285 auto ptr = Provisioner::new_Number(vals.size(), named, scalar);
286 output.reset(ptr);
287
288 for (size_t i = 0; i < vals.size(); ++i) {
289 if (vals[i]->type() == millijson::NOTHING) {
290 ptr->set_missing(i);
291 continue;
292 }
293
294 if (vals[i]->type() == millijson::NUMBER) {
295 ptr->set(i, static_cast<const millijson::Number*>(vals[i].get())->value);
296 } else if (vals[i]->type() == millijson::STRING) {
297 auto str = static_cast<const millijson::String*>(vals[i].get())->value;
298 if (str == "NaN") {
299 ptr->set(i, std::numeric_limits<double>::quiet_NaN());
300 } else if (str == "Inf") {
301 ptr->set(i, std::numeric_limits<double>::infinity());
302 } else if (str == "-Inf") {
303 ptr->set(i, -std::numeric_limits<double>::infinity());
304 } else {
305 throw std::runtime_error("unsupported string '" + str + "' at '" + path + ".values[" + std::to_string(i) + "]'");
306 }
307 } else {
308 throw std::runtime_error("expected a number at '" + path + ".values[" + std::to_string(i) + "]'");
309 }
310 }
311
312 return ptr;
313 });
314
315 } else if (type == "string" || (version.equals(1, 0) && (type == "date" || type == "date-time"))) {
316 StringVector::Format format = StringVector::NONE;
317 if (version.equals(1, 0)) {
318 if (type == "date") {
319 format = StringVector::DATE;
320 } else if (type == "date-time") {
321 format = StringVector::DATETIME;
322 }
323 } else {
324 auto fIt = map.find("format");
325 if (fIt != map.end()) {
326 if (fIt->second->type() != millijson::STRING) {
327 throw std::runtime_error("expected a string at '" + path + ".format'");
328 }
329 auto fptr = static_cast<const millijson::String*>(fIt->second.get());
330 if (fptr->value == "date") {
331 format = StringVector::DATE;
332 } else if (fptr->value == "date-time") {
333 format = StringVector::DATETIME;
334 } else {
335 throw std::runtime_error("unsupported format '" + fptr->value + "' at '" + path + ".format'");
336 }
337 }
338 }
339
340 process_array_or_scalar_values(map, path, [&](const auto& vals, bool named, bool scalar) -> auto {
341 auto ptr = Provisioner::new_String(vals.size(), named, scalar, format);
342 output.reset(ptr);
343
344 if (format == StringVector::NONE) {
345 extract_strings(vals, ptr, [](const std::string&) -> void {}, path);
346 } else if (format == StringVector::DATE) {
347 extract_strings(vals, ptr, [&](const std::string& x) -> void {
348 if (!ritsuko::is_date(x.c_str(), x.size())) {
349 throw std::runtime_error("dates should follow YYYY-MM-DD formatting in '" + path + ".values'");
350 }
351 }, path);
352 } else if (format == StringVector::DATETIME) {
353 extract_strings(vals, ptr, [&](const std::string& x) -> void {
354 if (!ritsuko::is_rfc3339(x.c_str(), x.size())) {
355 throw std::runtime_error("date-times should follow the Internet Date/Time format in '" + path + ".values'");
356 }
357 }, path);
358 }
359
360 return ptr;
361 });
362
363 } else if (type == "list") {
364 auto names_ptr = has_names(map, path);
365 bool has_names = names_ptr != NULL;
366
367 const auto& vals = extract_array(map, "values", path);
368 auto ptr = Provisioner::new_List(vals.size(), has_names);
369 output.reset(ptr);
370
371 for (size_t i = 0; i < vals.size(); ++i) {
372 ptr->set(i, parse_object<Provisioner>(vals[i].get(), ext, path + ".values[" + std::to_string(i) + "]", version));
373 }
374
375 if (has_names) {
377 }
378
379 } else {
380 throw std::runtime_error("unknown object type '" + type + "' at '" + path + ".type'");
381 }
382
383 return output;
384}
/**
 * @brief Options for JSON file parsing.
 */
struct Options {
    /**
     * Whether parallel reading of the input is requested.
     * This is used by `parse()` to choose between byteme's serial and parallel per-byte readers.
     */
    bool parallel = false;

    /**
     * Whether `parse()` should throw an error if the top-level object is not an R list.
     */
    bool strict_list = true;
};
404
422template<class Provisioner, class Externals>
424 std::shared_ptr<millijson::Base> contents;
425 if (options.parallel) {
426 byteme::PerByte bytestream(&reader);
427 contents = millijson::parse(bytestream);
428 } else {
429 byteme::PerByteParallel bytestream(&reader);
430 contents = millijson::parse(bytestream);
431 }
432
434 if (contents->type() == millijson::OBJECT) {
435 auto optr = static_cast<const millijson::Object*>(contents.get());
436 const auto& map = optr->values;
437 auto vIt = map.find("version");
438 if (vIt != map.end()) {
439 if (vIt->second->type() != millijson::STRING) {
440 throw std::runtime_error("expected a string in 'version'");
441 }
442 const auto& vstr = static_cast<const millijson::String*>(vIt->second.get())->value;
443 auto vraw = ritsuko::parse_version_string(vstr.c_str(), vstr.size(), /* skip_patch = */ true);
444 version.major = vraw.major;
445 version.minor = vraw.minor;
446 }
447 }
448
449 ExternalTracker etrack(std::move(ext));
451
452 if (options.strict_list && output->type() != LIST) {
453 throw std::runtime_error("top-level object should represent an R list");
454 }
455 etrack.validate();
456
457 return ParsedList(std::move(output), std::move(version));
458}
459
474template<class Provisioner>
477 return parse<Provisioner>(reader, std::move(ext), std::move(options));
478}
479
497template<class Provisioner, class Externals>
499 byteme::SomeFileReader reader(file.c_str());
500 return parse<Provisioner>(reader, std::move(ext), std::move(options));
501}
502
517template<class Provisioner>
520 return parse_file<Provisioner>(file, std::move(ext), std::move(options));
521}
522
541template<class Provisioner, class Externals>
542ParsedList parse_buffer(const unsigned char* buffer, size_t len, Externals ext, Options options = Options()) {
543 byteme::SomeBufferReader reader(buffer, len);
544 return parse<Provisioner>(reader, std::move(ext), std::move(options));
545}
546
562template<class Provisioner>
563ParsedList parse_buffer(const unsigned char* buffer, size_t len, Options options = Options()) {
565 return parse_buffer<Provisioner>(buffer, len, std::move(ext), std::move(options));
566}
567
576inline void validate(byteme::Reader& reader, int num_external = 0, Options options = Options()) {
578 parse<DummyProvisioner>(reader, std::move(ext), std::move(options));
579 return;
580}
581
590inline void validate_file(const std::string& file, int num_external = 0, Options options = Options()) {
592 parse_file<DummyProvisioner>(file, std::move(ext), std::move(options));
593 return;
594}
595
605inline void validate_buffer(const unsigned char* buffer, size_t len, int num_external = 0, Options options = Options()) {
607 parse_buffer<DummyProvisioner>(buffer, len, std::move(ext), std::move(options));
608 return;
609}
610
611}
612
613}
614
615#endif
Defines the interfaces to use in HDF5 parsing.
ParsedList parse_file(const std::string &file, Externals ext, Options options=Options())
Definition parse_json.hpp:498
ParsedList parse(byteme::Reader &reader, Externals ext, Options options=Options())
Definition parse_json.hpp:423
void validate_file(const std::string &file, int num_external=0, Options options=Options())
Definition parse_json.hpp:590
void validate(byteme::Reader &reader, int num_external=0, Options options=Options())
Definition parse_json.hpp:576
void validate_buffer(const unsigned char *buffer, size_t len, int num_external=0, Options options=Options())
Definition parse_json.hpp:605
ParsedList parse_buffer(const unsigned char *buffer, size_t len, Externals ext, Options options=Options())
Definition parse_json.hpp:542
Parse an R list from an HDF5 or JSON file.
Definition parse_json.hpp:29
Format
Definition interfaces.hpp:148
Options for JSON file parsing.
Definition parse_json.hpp:392
bool strict_list
Definition parse_json.hpp:402
bool parallel
Definition parse_json.hpp:397