comservatory
Strict validation of CSV files in C++
Loading...
Searching...
No Matches
Parser.hpp
Go to the documentation of this file.
1#ifndef COMSERVATORY_PARSER_HPP
2#define COMSERVATORY_PARSER_HPP
3
4#include <vector>
5#include <string>
6#include <stdexcept>
7#include <unordered_set>
8#include <algorithm>
9#include <memory>
10#include <thread>
11
12#include "convert.hpp"
13#include "Field.hpp"
14#include "Creator.hpp"
15
16#include "byteme/PerByte.hpp"
17
24namespace comservatory {
25
29struct Parser;
37struct Contents {
41 std::vector<std::unique_ptr<Field> > fields;
42
47 std::vector<std::string> names;
48
52 size_t num_fields () const {
53 return names.size();
54 }
55
59 size_t num_records () const {
60 return (num_fields() ? fields[0]->size() : fallback);
61 }
62
63private:
64 friend Parser;
65 size_t fallback = 0;
66};
67
71struct Parser {
72public:
73 Parser(const FieldCreator* f) : creator(f) {}
74
75public:
76 Parser& set_check_store(bool s = false) {
77 check_store = s;
78 return *this;
79 }
80
81 template<class NameIter>
82 Parser& set_store_by_name(NameIter start, NameIter end) {
83 to_store_by_name = std::unordered_set<std::string>(start, end);
84 return *this;
85 }
86
87 template<class NameContainer>
88 Parser& set_store_by_name(const NameContainer& k) {
89 return set_store_by_name(k.begin(), k.end());
90 }
91
92 template<class IndexIter>
93 Parser& set_store_by_index(IndexIter start, IndexIter end) {
94 to_store_by_index = std::unordered_set<size_t>(start, end);
95 return *this;
96 }
97
98 template<class IndexContainer>
99 Parser& set_store_by_index(const IndexContainer& k) {
100 return set_store_by_index(k.begin(), k.end());
101 }
102
103private:
104 static Field* fetch_column(Contents& info, size_t column, size_t line) {
105 auto& everything = info.fields;
106 if (column >= everything.size()) {
107 throw std::runtime_error("more fields on line " + std::to_string(line + 1) + " than expected from the header");
108 }
109 return everything[column].get();
110 }
111
112 Field* check_column_type(Contents& info, Type observed, size_t column, size_t line) const {
113 Field* current = fetch_column(info, column, line);
114 auto expected = current->type();
115
116 if (expected == UNKNOWN) {
117 bool use_dummy = check_store &&
118 to_store_by_name.find(info.names[column]) == to_store_by_name.end() &&
119 to_store_by_index.find(column) == to_store_by_index.end();
120
121 auto ptr = creator->create(observed, current->size(), use_dummy);
122 info.fields[column].reset(ptr);
123 current = info.fields[column].get();
124 } else if (expected != observed) {
125 throw std::runtime_error("previous and current types do not match up");
126 }
127
128 return current;
129 }
130
131 template<class Input>
132 void store_nan(Input& input, Contents& info, size_t column, size_t line) const {
133 input.advance();
134 expect_fixed(input, "an", "AN", column, line); // i.e., NaN or any of its capitalizations.
135 auto* current = check_column_type(info, NUMBER, column, line);
136 static_cast<NumberField*>(current)->push_back(std::numeric_limits<double>::quiet_NaN());
137 }
138
139 template<class Input>
140 void store_inf(Input& input, Contents& info, size_t column, size_t line, bool negative) const {
141 input.advance();
142 expect_fixed(input, "nf", "NF", column, line); // i.e., Inf or any of its capitalizations.
143 auto* current = check_column_type(info, NUMBER, column, line);
144
145 double val = std::numeric_limits<double>::infinity();
146 if (negative) {
147 val *= -1;
148 }
149 static_cast<NumberField*>(current)->push_back(val);
150 }
151
152 template<class Input>
153 void store_na_or_nan(Input& input, Contents& info, size_t column, size_t line) const {
154 // Some shenanigans required here to distinguish between
155 // NAN/NaN/etc. and NA, given that both are allowed.
156 input.advance();
157 if (!input.valid()) {
158 throw std::runtime_error("truncated keyword in " + get_location(column, line));
159 }
160
161 char second = input.get();
162 bool is_missing = true;
163 if (second == 'a') {
164 is_missing = false;
165 } else if (second != 'A') {
166 throw std::runtime_error("unknown keyword in " + get_location(column, line));
167 }
168
169 input.advance();
170 if (!input.valid()) {
171 if (is_missing) {
172 throw std::runtime_error("line " + std::to_string(line + 1) + " should terminate with a newline");
173 } else {
174 throw std::runtime_error("truncated keyword in " + get_location(column, line));
175 }
176 }
177
178 char next = input.get();
179 if (next == 'n' || next == 'N') {
180 auto* current = check_column_type(info, NUMBER, column, line);
181 static_cast<NumberField*>(current)->push_back(std::numeric_limits<double>::quiet_NaN());
182 input.advance(); // for consistency with the NA case, in the sense that we are always past the keyword regardless of whether the keyword is NaN or NA.
183 } else if (is_missing) {
184 auto raw = fetch_column(info, column, line);
185 raw->add_missing();
186 } else {
187 throw std::runtime_error("unknown keyword in " + get_location(column, line));
188 }
189 }
190
191 template<class Input>
192 void store_number_or_complex(Input& input, Contents& info, size_t column, size_t line, bool negative) const {
193 auto first = to_number(input, column, line);
194 if (negative) {
195 first *= -1;
196 }
197
198 char next = input.get(); // no need to check validity, as to_number always leaves us on a valid position (or throws itself).
199 if (next == ',' || next == '\n') {
200 auto* current = check_column_type(info, NUMBER, column, line);
201 static_cast<NumberField*>(current)->push_back(first);
202 return;
203 }
204
205 char second_neg = false;
206 if (next == '-') {
207 second_neg = true;
208 } else if (next != '+') {
209 throw std::runtime_error("incorrectly formatted number in " + get_location(column, line));
210 }
211
212 input.advance();
213 if (!input.valid()) {
214 throw std::runtime_error("truncated complex number in " + get_location(column, line));
215 } else if (!std::isdigit(input.get())) {
216 throw std::runtime_error("incorrectly formatted complex number in " + get_location(column, line));
217 }
218
219 auto second = to_number(input, column, line);
220 if (second_neg) {
221 second *= -1;
222 }
223 if (input.get() != 'i') { // again, no need to check validity.
224 throw std::runtime_error("incorrectly formatted complex number in " + get_location(column, line));
225 }
226 input.advance(); // for consistency with the numbers, in the sense that we are always past the keyword regardless of whether we're a NUMBER or COMPLEX.
227
228 auto* current = check_column_type(info, COMPLEX, column, line);
229 static_cast<ComplexField*>(current)->push_back(std::complex<double>(first, second));
230 }
231
232private:
233 template<class Input>
234 void parse_loop(Input& input, Contents& info) const {
235 if (!input.valid()) {
236 throw std::runtime_error("CSV file is empty");
237 }
238
239 // Special case for a new-line only file.
240 if (input.get() == '\n') {
241 auto& line = info.fallback;
242 while (1) {
243 input.advance();
244 if (!input.valid()) {
245 break;
246 }
247 ++line;
248 if (input.get() != '\n') {
249 throw std::runtime_error("more fields on line " + std::to_string(line + 1) + " than expected from the header");
250 }
251 }
252 return;
253 }
254
255 // Processing the header.
256 std::vector<std::string> header_names;
257 while (1) {
258 char c = input.get();
259 if (c != '"') {
260 throw std::runtime_error("all headers should be quoted strings");
261 }
262
263 header_names.push_back(to_string(input, info.names.size(), 0)); // no need to check validity, as to_string always leaves us on a valid position (or throws itself).
264
265 char next = input.get();
266 input.advance();
267 if (next == '\n') {
268 break;
269 } else if (next != ',') {
270 throw std::runtime_error("header " + std::to_string(info.names.size()) + " contains trailing character '" + std::string(1, next) + "'");
271 }
272 }
273
274 {
275 std::unordered_set<std::string> copy;
276 for (const auto& x : header_names) {
277 if (copy.find(x) != copy.end()) {
278 throw std::runtime_error("detected duplicated header name '" + x + "'");
279 }
280 copy.insert(x);
281 }
282 }
283
284 // If information isn't already provided, we fill it in from the file,
285 // otherwise we check its consistency.
286 if (info.names.empty()) {
287 info.names.swap(header_names);
288 } else {
289 if (info.names.size() != header_names.size()) {
290 throw std::runtime_error("provided number of names is not equal to the number of header names");
291 }
292 for (size_t i = 0, end = header_names.size(); i < end; ++i) {
293 if (info.names[i] != header_names[i]) {
294 throw std::runtime_error("mismatch between provided and observed header name for column " + std::to_string(i + 1) + " ('" + info.names[i] + "', '" + header_names[i] + "')");
295 }
296 }
297 }
298
299 if (info.fields.empty()) {
300 info.fields.resize(info.names.size());
301 for (auto& o : info.fields) {
302 o.reset(new UnknownField);
303 }
304 } else if (info.fields.size() != info.names.size()) {
305 throw std::runtime_error("provided number of fields is not equal to the number of header names");
306 }
307
308 // Special case if there are no records, i.e., it's header-only.
309 if (!input.valid()) {
310 return;
311 }
312
313 // Processing the records in a CSV.
314 size_t column = 0;
315 size_t line = 1;
316 while (1) {
317 switch (input.get()) {
318 case '"':
319 {
320 auto* current = check_column_type(info, STRING, column, line);
321 static_cast<StringField*>(current)->push_back(to_string(input, column, line));
322 }
323 break;
324
325 case 't': case 'T':
326 {
327 input.advance();
328 expect_fixed(input, "rue", "RUE", column, line);
329 auto* current = check_column_type(info, BOOLEAN, column, line);
330 static_cast<BooleanField*>(current)->push_back(true);
331 }
332 break;
333
334 case 'f': case 'F':
335 {
336 input.advance();
337 expect_fixed(input, "alse", "ALSE", column, line);
338 auto* current = check_column_type(info, BOOLEAN, column, line);
339 static_cast<BooleanField*>(current)->push_back(false);
340 }
341 break;
342
343 case 'N':
344 store_na_or_nan(input, info, column, line);
345 break;
346
347 case 'n':
348 store_nan(input, info, column, line);
349 break;
350
351 case 'i': case 'I':
352 store_inf(input, info, column, line, false);
353 break;
354
355 case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
356 store_number_or_complex(input, info, column, line, false);
357 break;
358
359 case '+':
360 input.advance();
361 if (!input.valid()) {
362 throw std::runtime_error("truncated field in " + get_location(column, line));
363 } else if (!std::isdigit(input.get())) {
364 throw std::runtime_error("invalid number in " + get_location(column, line));
365 }
366 store_number_or_complex(input, info, column, line, false);
367 break;
368
369 case '-':
370 {
371 input.advance();
372 if (!input.valid()) {
373 throw std::runtime_error("truncated field in " + get_location(column, line));
374 }
375
376 char next = input.get();
377 if (next == 'i' || next == 'I') {
378 store_inf(input, info, column, line, true);
379 } else if (next == 'n' || next == 'N') {
380 store_nan(input, info, column, line);
381 } else if (std::isdigit(next)) {
382 store_number_or_complex(input, info, column, line, true);
383 } else {
384 throw std::runtime_error("incorrectly formatted number in " + get_location(column, line));
385 }
386 }
387 break;
388
389 case '\n':
390 throw std::runtime_error(get_location(column, line) + " is empty");
391
392 default:
393 throw std::runtime_error("unknown type starting with '" + std::string(1, input.get()) + "' in " + get_location(column, line));
394 }
395
396 if (!input.valid()) {
397 throw std::runtime_error("last line must be terminated by a single newline");
398 }
399
400 char next = input.get();
401 input.advance();
402 if (next == ',') {
403 ++column;
404 if (!input.valid()) {
405 throw std::runtime_error("line " + std::to_string(line + 1) + " is truncated at column " + std::to_string(column + 1));
406 }
407 } else if (next == '\n') {
408 if (column + 1 != info.names.size()) {
409 throw std::runtime_error("line " + std::to_string(line + 1) + " has fewer fields than expected from the header");
410 }
411 if (!input.valid()) {
412 break;
413 }
414 column = 0;
415 ++line;
416 } else {
417 throw std::runtime_error(get_location(column, line) + " contains trailing character '" + std::string(1, next) + "'");
418 }
419 }
420 }
421
422public:
423 template<class Reader>
424 void parse(Reader& reader, Contents& info, bool parallel) const {
425 if (parallel) {
426 byteme::PerByteParallel input(&reader);
427 parse_loop(input, info);
428 } else {
429 byteme::PerByte input(&reader);
430 parse_loop(input, info);
431 }
432 }
433
434 const FieldCreator* creator;
435
436 bool check_store = false;
437 std::unordered_set<std::string> to_store_by_name;
438 std::unordered_set<size_t> to_store_by_index;
439};
444}
445
446#endif
Defines the FieldCreator class and defaults.
Defines the Field virtual class and concrete implementations.
Contains all comservatory functions and classes.
Definition: Field.hpp:16
TypedField< std::string, STRING > StringField
Definition: Field.hpp:189
TypedField< bool, BOOLEAN > BooleanField
Virtual class for a Field of booleans.
Definition: Field.hpp:219
TypedField< std::complex< double >, COMPLEX > ComplexField
Definition: Field.hpp:234
TypedField< double, NUMBER > NumberField
Definition: Field.hpp:204
The parsed contents of a CSV file.
Definition: Parser.hpp:37
std::vector< std::unique_ptr< Field > > fields
Definition: Parser.hpp:41
size_t num_fields() const
Definition: Parser.hpp:52
size_t num_records() const
Definition: Parser.hpp:59
std::vector< std::string > names
Definition: Parser.hpp:47