uzuki2
Recovering R lists faithfully from HDF5 or JSON
Loading...
Searching...
No Matches
parse_json.hpp
Go to the documentation of this file.
1#ifndef UZUKI2_PARSE_JSON_HPP
2#define UZUKI2_PARSE_JSON_HPP
3
4#include <memory>
5#include <vector>
6#include <cctype>
7#include <string>
8#include <stdexcept>
9#include <cmath>
10#include <unordered_map>
11#include <unordered_set>
12#include <type_traits>
13
14#include "byteme/PerByte.hpp"
15#include "byteme/SomeFileReader.hpp"
16#include "byteme/SomeBufferReader.hpp"
17#include "millijson/millijson.hpp"
18#include "ritsuko/ritsuko.hpp"
19
20#include "interfaces.hpp"
21#include "Dummy.hpp"
22#include "ExternalTracker.hpp"
23#include "ParsedList.hpp"
24
30namespace uzuki2 {
31
40namespace json {
41
45inline const std::vector<std::shared_ptr<millijson::Base> >& extract_array(
46 const std::unordered_map<std::string, std::shared_ptr<millijson::Base> >& properties,
47 const std::string& name,
48 const std::string& path)
49{
50 auto vIt = properties.find(name);
51 if (vIt == properties.end()) {
52 throw std::runtime_error("expected '" + name + "' property for object at '" + path + "'");
53 }
54
55 const auto& values_ptr = vIt->second;
56 if (values_ptr->type() != millijson::ARRAY) {
57 throw std::runtime_error("expected an array in '" + path + "." + name + "'");
58 }
59
60 return static_cast<const millijson::Array*>(values_ptr.get())->values;
61}
62
63inline const millijson::Array* has_names(const std::unordered_map<std::string, std::shared_ptr<millijson::Base> >& properties, const std::string& path) {
64 auto nIt = properties.find("names");
65 if (nIt == properties.end()) {
66 return NULL;
67 }
68
69 const auto name_ptr = nIt->second;
70 if (name_ptr->type() != millijson::ARRAY) {
71 throw std::runtime_error("expected an array in '" + path + ".names'");
72 }
73 return static_cast<const millijson::Array*>(name_ptr.get());
74}
75
76template<class Destination>
77void fill_names(const millijson::Array* names_ptr, Destination* dest, const std::string& path) {
78 const auto& names = names_ptr->values;
79 if (names.size() != dest->size()) {
80 throw std::runtime_error("length of 'names' and 'values' should be the same in '" + path + "'");
81 }
82
83 for (size_t i = 0; i < names.size(); ++i) {
84 if (names[i]->type() != millijson::STRING) {
85 throw std::runtime_error("expected a string at '" + path + ".names[" + std::to_string(i) + "]'");
86 }
87 dest->set_name(i, static_cast<const millijson::String*>(names[i].get())->value);
88 }
89}
90
91template<class Function>
92auto process_array_or_scalar_values(
93 const std::unordered_map<std::string, std::shared_ptr<millijson::Base> >& properties,
94 const std::string& path,
95 Function fun)
96{
97 auto vIt = properties.find("values");
98 if (vIt == properties.end()) {
99 throw std::runtime_error("expected 'values' property for object at '" + path + "'");
100 }
101
102 auto names_ptr = has_names(properties, path);
103 bool has_names = names_ptr != NULL;
104
105 typename std::invoke_result<Function,std::vector<std::shared_ptr<millijson::Base> >,bool,bool>::type out_ptr;
106
107 const auto& values_ptr = vIt->second;
108 if (values_ptr->type() == millijson::ARRAY) {
109 out_ptr = fun(static_cast<const millijson::Array*>(values_ptr.get())->values, has_names, false);
110 } else {
111 std::vector<std::shared_ptr<millijson::Base> > temp { values_ptr };
112 out_ptr = fun(temp, has_names, true);
113 }
114
115 if (has_names) {
116 fill_names(names_ptr, out_ptr, path);
117 }
118 return out_ptr;
119}
120
121template<class Destination, class Function>
122void extract_integers(const std::vector<std::shared_ptr<millijson::Base> >& values, Destination* dest, Function check, const std::string& path, const Version& version) {
123 for (size_t i = 0; i < values.size(); ++i) {
124 if (values[i]->type() == millijson::NOTHING) {
125 dest->set_missing(i);
126 continue;
127 }
128
129 if (values[i]->type() != millijson::NUMBER) {
130 throw std::runtime_error("expected a number at '" + path + ".values[" + std::to_string(i) + "]'");
131 }
132
133 auto val = static_cast<const millijson::Number*>(values[i].get())->value;
134 if (val != std::floor(val)) {
135 throw std::runtime_error("expected an integer at '" + path + ".values[" + std::to_string(i) + "]'");
136 }
137
138 constexpr double upper = std::numeric_limits<int32_t>::max();
139 constexpr double lower = std::numeric_limits<int32_t>::min();
140 if (val < lower || val > upper) {
141 throw std::runtime_error("value at '" + path + ".values[" + std::to_string(i) + "]' cannot be represented by a 32-bit signed integer");
142 }
143
144 int32_t ival = val;
145 if (version.equals(1, 0) && val == -2147483648) {
146 dest->set_missing(i);
147 continue;
148 }
149
150 check(ival);
151 dest->set(i, ival);
152 }
153}
154
155template<class Destination, class Function>
156void extract_strings(const std::vector<std::shared_ptr<millijson::Base> >& values, Destination* dest, Function check, const std::string& path) {
157 for (size_t i = 0; i < values.size(); ++i) {
158 if (values[i]->type() == millijson::NOTHING) {
159 dest->set_missing(i);
160 continue;
161 }
162
163 if (values[i]->type() != millijson::STRING) {
164 throw std::runtime_error("expected a string at '" + path + ".values[" + std::to_string(i) + "]'");
165 }
166
167 const auto& str = static_cast<const millijson::String*>(values[i].get())->value;
168 check(str);
169 dest->set(i, str);
170 }
171}
172
173template<class Provisioner, class Externals>
174std::shared_ptr<Base> parse_object(const millijson::Base* contents, Externals& ext, const std::string& path, const Version& version) {
175 if (contents->type() != millijson::OBJECT) {
176 throw std::runtime_error("each R object should be represented by a JSON object at '" + path + "'");
177 }
178
179 auto optr = static_cast<const millijson::Object*>(contents);
180 const auto& map = optr->values;
181
182 auto tIt = map.find("type");
183 if (tIt == map.end()) {
184 throw std::runtime_error("missing 'type' property for JSON object at '" + path + "'");
185 }
186 const auto& type_ptr = tIt->second;
187 if (type_ptr->type() != millijson::STRING) {
188 throw std::runtime_error("expected a string at '" + path + ".type'");
189 }
190 const auto& type = static_cast<const millijson::String*>(type_ptr.get())->value;
191
192 std::shared_ptr<Base> output;
193 if (type == "nothing") {
194 output.reset(Provisioner::new_Nothing());
195
196 } else if (type == "external") {
197 auto iIt = map.find("index");
198 if (iIt == map.end()) {
199 throw std::runtime_error("expected 'index' property for 'external' type at '" + path + "'");
200 }
201 const auto& index_ptr = iIt->second;
202 if (index_ptr->type() != millijson::NUMBER) {
203 throw std::runtime_error("expected a number at '" + path + ".index'");
204 }
205 auto index = static_cast<const millijson::Number*>(index_ptr.get())->value;
206
207 if (index != std::floor(index)) {
208 throw std::runtime_error("expected an integer at '" + path + ".index'");
209 } else if (index < 0 || index >= static_cast<double>(ext.size())) {
210 throw std::runtime_error("external index out of range at '" + path + ".index'");
211 }
212 output.reset(Provisioner::new_External(ext.get(index)));
213
214 } else if (type == "integer") {
215 process_array_or_scalar_values(map, path, [&](const auto& vals, bool named, bool scalar) -> auto {
216 auto ptr = Provisioner::new_Integer(vals.size(), named, scalar);
217 output.reset(ptr);
218 extract_integers(vals, ptr, [](int32_t) -> void {}, path, version);
219 return ptr;
220 });
221
222 } else if (type == "factor" || (version.equals(1, 0) && type == "ordered")) {
223 bool ordered = false;
224 if (type == "ordered") {
225 ordered = true;
226 } else {
227 auto oIt = map.find("ordered");
228 if (oIt != map.end()) {
229 if (oIt->second->type() != millijson::BOOLEAN) {
230 throw std::runtime_error("expected a boolean at '" + path + ".ordered'");
231 }
232 auto optr = static_cast<const millijson::Boolean*>((oIt->second).get());
233 ordered = optr->value;
234 }
235 }
236
237 const std::string levels_name = "levels"; // avoid dangling reference from casting of string literal.
238 const auto& lvals = extract_array(map, levels_name, path);
239 int32_t nlevels = lvals.size();
240 auto fptr = process_array_or_scalar_values(map, path, [&](const auto& vals, bool named, bool scalar) -> auto {
241 auto ptr = Provisioner::new_Factor(vals.size(), named, scalar, nlevels, ordered);
242 output.reset(ptr);
243 extract_integers(vals, ptr, [&](int32_t x) -> void {
244 if (x < 0 || x >= nlevels) {
245 throw std::runtime_error("factor indices of out of range of levels in '" + path + "'");
246 }
247 }, path, version);
248 return ptr;
249 });
250
251 std::unordered_set<std::string> existing;
252 for (size_t l = 0; l < lvals.size(); ++l) {
253 if (lvals[l]->type() != millijson::STRING) {
254 throw std::runtime_error("expected strings at '" + path + ".levels[" + std::to_string(l) + "]'");
255 }
256
257 const auto& level = static_cast<const millijson::String*>(lvals[l].get())->value;
258 if (existing.find(level) != existing.end()) {
259 throw std::runtime_error("detected duplicate string at '" + path + ".levels[" + std::to_string(l) + "]'");
260 }
261 fptr->set_level(l, level);
262 existing.insert(level);
263 }
264
265 } else if (type == "boolean") {
266 process_array_or_scalar_values(map, path, [&](const auto& vals, bool named, bool scalar) -> auto {
267 auto ptr = Provisioner::new_Boolean(vals.size(), named, scalar);
268 output.reset(ptr);
269
270 for (size_t i = 0; i < vals.size(); ++i) {
271 if (vals[i]->type() == millijson::NOTHING) {
272 ptr->set_missing(i);
273 continue;
274 }
275
276 if (vals[i]->type() != millijson::BOOLEAN) {
277 throw std::runtime_error("expected a boolean at '" + path + ".values[" + std::to_string(i) + "]'");
278 }
279 ptr->set(i, static_cast<const millijson::Boolean*>(vals[i].get())->value);
280 }
281
282 return ptr;
283 });
284
285 } else if (type == "number") {
286 process_array_or_scalar_values(map, path, [&](const auto& vals, bool named, bool scalar) -> auto {
287 auto ptr = Provisioner::new_Number(vals.size(), named, scalar);
288 output.reset(ptr);
289
290 for (size_t i = 0; i < vals.size(); ++i) {
291 if (vals[i]->type() == millijson::NOTHING) {
292 ptr->set_missing(i);
293 continue;
294 }
295
296 if (vals[i]->type() == millijson::NUMBER) {
297 ptr->set(i, static_cast<const millijson::Number*>(vals[i].get())->value);
298 } else if (vals[i]->type() == millijson::STRING) {
299 auto str = static_cast<const millijson::String*>(vals[i].get())->value;
300 if (str == "NaN") {
301 ptr->set(i, std::numeric_limits<double>::quiet_NaN());
302 } else if (str == "Inf") {
303 ptr->set(i, std::numeric_limits<double>::infinity());
304 } else if (str == "-Inf") {
305 ptr->set(i, -std::numeric_limits<double>::infinity());
306 } else {
307 throw std::runtime_error("unsupported string '" + str + "' at '" + path + ".values[" + std::to_string(i) + "]'");
308 }
309 } else {
310 throw std::runtime_error("expected a number at '" + path + ".values[" + std::to_string(i) + "]'");
311 }
312 }
313
314 return ptr;
315 });
316
317 } else if (type == "string" || (version.equals(1, 0) && (type == "date" || type == "date-time"))) {
318 StringVector::Format format = StringVector::NONE;
319 if (version.equals(1, 0)) {
320 if (type == "date") {
321 format = StringVector::DATE;
322 } else if (type == "date-time") {
323 format = StringVector::DATETIME;
324 }
325 } else {
326 auto fIt = map.find("format");
327 if (fIt != map.end()) {
328 if (fIt->second->type() != millijson::STRING) {
329 throw std::runtime_error("expected a string at '" + path + ".format'");
330 }
331 auto fptr = static_cast<const millijson::String*>(fIt->second.get());
332 if (fptr->value == "date") {
333 format = StringVector::DATE;
334 } else if (fptr->value == "date-time") {
335 format = StringVector::DATETIME;
336 } else {
337 throw std::runtime_error("unsupported format '" + fptr->value + "' at '" + path + ".format'");
338 }
339 }
340 }
341
342 process_array_or_scalar_values(map, path, [&](const auto& vals, bool named, bool scalar) -> auto {
343 auto ptr = Provisioner::new_String(vals.size(), named, scalar, format);
344 output.reset(ptr);
345
346 if (format == StringVector::NONE) {
347 extract_strings(vals, ptr, [](const std::string&) -> void {}, path);
348 } else if (format == StringVector::DATE) {
349 extract_strings(vals, ptr, [&](const std::string& x) -> void {
350 if (!ritsuko::is_date(x.c_str(), x.size())) {
351 throw std::runtime_error("dates should follow YYYY-MM-DD formatting in '" + path + ".values'");
352 }
353 }, path);
354 } else if (format == StringVector::DATETIME) {
355 extract_strings(vals, ptr, [&](const std::string& x) -> void {
356 if (!ritsuko::is_rfc3339(x.c_str(), x.size())) {
357 throw std::runtime_error("date-times should follow the Internet Date/Time format in '" + path + ".values'");
358 }
359 }, path);
360 }
361
362 return ptr;
363 });
364
365 } else if (type == "list") {
366 auto names_ptr = has_names(map, path);
367 bool has_names = names_ptr != NULL;
368
369 const std::string values_name = "values"; // avoid dangling reference from casting of string literal.
370 const auto& vals = extract_array(map, values_name, path);
371 auto ptr = Provisioner::new_List(vals.size(), has_names);
372 output.reset(ptr);
373
374 for (size_t i = 0; i < vals.size(); ++i) {
375 ptr->set(i, parse_object<Provisioner>(vals[i].get(), ext, path + ".values[" + std::to_string(i) + "]", version));
376 }
377
378 if (has_names) {
379 fill_names(names_ptr, ptr, path);
380 }
381
382 } else {
383 throw std::runtime_error("unknown object type '" + type + "' at '" + path + ".type'");
384 }
385
386 return output;
387}
/**
 * Options for JSON parsing.
 */
struct Options {
    /**
     * Selects which byteme byte-stream wrapper `parse()` uses to read its input
     * (`byteme::PerByte` versus `byteme::PerByteParallel`).
     */
    bool parallel { false };

    /**
     * If `true`, `parse()` throws when the top-level object is not a list.
     */
    bool strict_list { true };
};
407
425template<class Provisioner, class Externals>
426ParsedList parse(byteme::Reader& reader, Externals ext, Options options = Options()) {
427 std::shared_ptr<millijson::Base> contents;
428 if (options.parallel) {
429 byteme::PerByte bytestream(&reader);
430 contents = millijson::parse(bytestream);
431 } else {
432 byteme::PerByteParallel bytestream(&reader);
433 contents = millijson::parse(bytestream);
434 }
435
436 Version version;
437 if (contents->type() == millijson::OBJECT) {
438 auto optr = static_cast<const millijson::Object*>(contents.get());
439 const auto& map = optr->values;
440 auto vIt = map.find("version");
441 if (vIt != map.end()) {
442 if (vIt->second->type() != millijson::STRING) {
443 throw std::runtime_error("expected a string in 'version'");
444 }
445 const auto& vstr = static_cast<const millijson::String*>(vIt->second.get())->value;
446 auto vraw = ritsuko::parse_version_string(vstr.c_str(), vstr.size(), /* skip_patch = */ true);
447 version.major = vraw.major;
448 version.minor = vraw.minor;
449 }
450 }
451
452 ExternalTracker etrack(std::move(ext));
453 auto output = parse_object<Provisioner>(contents.get(), etrack, "", version);
454
455 if (options.strict_list && output->type() != LIST) {
456 throw std::runtime_error("top-level object should represent an R list");
457 }
458 etrack.validate();
459
460 return ParsedList(std::move(output), std::move(version));
461}
462
477template<class Provisioner>
478ParsedList parse(byteme::Reader& reader, Options options = Options()) {
479 DummyExternals ext(0);
480 return parse<Provisioner>(reader, std::move(ext), std::move(options));
481}
482
500template<class Provisioner, class Externals>
501ParsedList parse_file(const std::string& file, Externals ext, Options options = Options()) {
502 byteme::SomeFileReader reader(file.c_str());
503 return parse<Provisioner>(reader, std::move(ext), std::move(options));
504}
505
520template<class Provisioner>
521ParsedList parse_file(const std::string& file, Options options = Options()) {
522 DummyExternals ext(0);
523 return parse_file<Provisioner>(file, std::move(ext), std::move(options));
524}
525
544template<class Provisioner, class Externals>
545ParsedList parse_buffer(const unsigned char* buffer, size_t len, Externals ext, Options options = Options()) {
546 byteme::SomeBufferReader reader(buffer, len);
547 return parse<Provisioner>(reader, std::move(ext), std::move(options));
548}
549
565template<class Provisioner>
566ParsedList parse_buffer(const unsigned char* buffer, size_t len, Options options = Options()) {
567 DummyExternals ext(0);
568 return parse_buffer<Provisioner>(buffer, len, std::move(ext), std::move(options));
569}
570
579inline void validate(byteme::Reader& reader, int num_external = 0, Options options = Options()) {
580 DummyExternals ext(num_external);
581 parse<DummyProvisioner>(reader, std::move(ext), std::move(options));
582 return;
583}
584
593inline void validate_file(const std::string& file, int num_external = 0, Options options = Options()) {
594 DummyExternals ext(num_external);
595 parse_file<DummyProvisioner>(file, std::move(ext), std::move(options));
596 return;
597}
598
608inline void validate_buffer(const unsigned char* buffer, size_t len, int num_external = 0, Options options = Options()) {
609 DummyExternals ext(num_external);
610 parse_buffer<DummyProvisioner>(buffer, len, std::move(ext), std::move(options));
611 return;
612}
613
614}
615
616}
617
618#endif
Defines the interfaces to use in HDF5 parsing.
ParsedList parse_file(const std::string &file, Externals ext, Options options=Options())
Definition parse_json.hpp:501
ParsedList parse(byteme::Reader &reader, Externals ext, Options options=Options())
Definition parse_json.hpp:426
void validate_file(const std::string &file, int num_external=0, Options options=Options())
Definition parse_json.hpp:593
void validate(byteme::Reader &reader, int num_external=0, Options options=Options())
Definition parse_json.hpp:579
void validate_buffer(const unsigned char *buffer, size_t len, int num_external=0, Options options=Options())
Definition parse_json.hpp:608
ParsedList parse_buffer(const unsigned char *buffer, size_t len, Externals ext, Options options=Options())
Definition parse_json.hpp:545
Parse an R list from an HDF5 or JSON file.
Definition parse_json.hpp:30
Format
Definition interfaces.hpp:148
Options for JSON file parsing.
Definition parse_json.hpp:395
bool strict_list
Definition parse_json.hpp:405
bool parallel
Definition parse_json.hpp:400