1#ifndef COMSERVATORY_PARSER_HPP
2#define COMSERVATORY_PARSER_HPP
7#include <unordered_set>
16#include "byteme/PerByte.hpp"
// Parsed columns, held as owning pointers to Field objects. Elsewhere in this
// file the vector is indexed by column number and, when empty, is resized to
// the number of header names.
41 std::vector<std::unique_ptr<Field> >
fields;
// Construct a parser around a field factory.
//
// f: pointer stored verbatim in `creator`; it is later dereferenced to create
//    a concrete field once a column's type is first observed.
//    NOTE(review): only the raw pointer is kept, so the pointee presumably has
//    to outlive this Parser - confirm with callers.
73 Parser(
const FieldCreator* f) : creator(f) {}
// Setter taking a boolean `s` (default false) and returning a Parser reference.
// NOTE(review): the body is not visible in this excerpt; presumably it assigns
// `s` to `check_store` and returns *this for chaining - confirm.
76 Parser& set_check_store(
bool s =
false) {
// Replace the set of column names to store with the names in [start, end).
//
// NameIter: iterator type accepted by the std::unordered_set<std::string>
//           range constructor.
// Any previously configured names are discarded, since the member is
// overwritten rather than extended.
// NOTE(review): the return statement is not visible here; presumably *this.
81 template<
class NameIter>
82 Parser& set_store_by_name(NameIter start, NameIter end) {
83 to_store_by_name = std::unordered_set<std::string>(start, end);
// Container convenience overload: forwards k.begin() and k.end() to the
// iterator-based set_store_by_name() and returns its result.
//
// NameContainer: any type exposing begin()/end() over name values.
87 template<
class NameContainer>
88 Parser& set_store_by_name(
const NameContainer& k) {
89 return set_store_by_name(k.begin(), k.end());
// Replace the set of column indices to store with the indices in [start, end).
//
// IndexIter: iterator type accepted by the std::unordered_set<size_t> range
//            constructor.
// Any previously configured indices are discarded, since the member is
// overwritten rather than extended.
// NOTE(review): the return statement is not visible here; presumably *this.
92 template<
class IndexIter>
93 Parser& set_store_by_index(IndexIter start, IndexIter end) {
94 to_store_by_index = std::unordered_set<size_t>(start, end);
// Container convenience overload: forwards k.begin() and k.end() to the
// iterator-based set_store_by_index() and returns its result.
//
// IndexContainer: any type exposing begin()/end() over index values.
98 template<
class IndexContainer>
99 Parser& set_store_by_index(
const IndexContainer& k) {
100 return set_store_by_index(k.begin(), k.end());
// Look up the field object for a given column.
//
// info:   parsed contents whose `fields` vector is indexed.
// column: zero-based column index.
// line:   zero-based line index; used only for the error message, where it is
//         reported one-based (line + 1).
// Returns a non-owning pointer to the field; ownership stays with info.fields.
// Throws std::runtime_error when `column` is not a valid index into
// info.fields, i.e. the current line has more fields than the header declared.
104 static Field* fetch_column(Contents& info,
size_t column,
size_t line) {
105 auto& everything = info.fields;
106 if (column >= everything.size()) {
107 throw std::runtime_error(
"more fields on line " + std::to_string(line + 1) +
" than expected from the header");
109 return everything[column].get();
// Reconcile the type just observed for a value with the type already recorded
// for its column.
//
// info:     parsed contents; info.fields[column] may be replaced.
// observed: type of the value currently being parsed.
// column:   zero-based column index.
// line:     zero-based line index, forwarded to fetch_column() for errors.
//
// If the column's current type is UNKNOWN, a new field of the observed type is
// requested from the creator and installed in place of the existing one.
// If the column already has a different type, std::runtime_error is thrown.
// fetch_column() may also throw when `column` is out of range.
// NOTE(review): the final return is not visible in this excerpt; callers use
// the result as the field to push into, so presumably `current` is returned.
112 Field* check_column_type(Contents& info, Type observed,
size_t column,
size_t line)
const {
113 Field* current = fetch_column(info, column, line);
114 auto expected = current->type();
116 if (expected == UNKNOWN) {
// A dummy is requested only when store-checking is enabled and this column is
// listed neither by name nor by index.
117 bool use_dummy = check_store &&
118 to_store_by_name.find(info.names[column]) == to_store_by_name.end() &&
119 to_store_by_index.find(column) == to_store_by_index.end();
// The existing field's size is handed to the creator - presumably so the new
// field accounts for records already seen; confirm against FieldCreator.
121 auto ptr = creator->create(observed, current->size(), use_dummy);
// reset() takes ownership of `ptr` and destroys the old field, so `current`
// (which pointed at the old field) must be refreshed on the next line.
122 info.fields[column].reset(ptr);
123 current = info.fields[column].get();
124 }
else if (expected != observed) {
125 throw std::runtime_error(
"previous and current types do not match up");
// Record a NaN value in a numeric column.
//
// input:  byte source positioned inside the keyword.
// info:   parsed contents receiving the value.
// column: zero-based column index.
// line:   zero-based line index (used for error reporting by the helpers).
//
// Calls expect_fixed() with "an"/"AN" - presumably to consume and validate the
// remaining letters of a case-insensitive "nan" keyword; confirm against that
// helper. The column is then required to be (or become) NUMBER via
// check_column_type(), and a quiet NaN is appended.
131 template<
class Input>
132 void store_nan(Input& input, Contents& info,
size_t column,
size_t line)
const {
134 expect_fixed(input,
"an",
"AN", column, line);
135 auto* current = check_column_type(info, NUMBER, column, line);
// The downcast relies on check_column_type() having accepted NUMBER for this
// column.
136 static_cast<NumberField*
>(current)->push_back(std::numeric_limits<double>::quiet_NaN());
// Record an infinite value in a numeric column.
//
// input:    byte source positioned inside the keyword.
// info:     parsed contents receiving the value.
// column:   zero-based column index.
// line:     zero-based line index (used for error reporting by the helpers).
// negative: whether the value was preceded by a minus sign at the call site.
//
// Calls expect_fixed() with "nf"/"NF" - presumably to consume and validate the
// remaining letters of a case-insensitive "inf" keyword; confirm against that
// helper. The column is then required to be (or become) NUMBER and the value
// is appended.
// NOTE(review): the statements between the initialisation of `val` and the
// push_back are not visible in this excerpt; presumably `val` is negated when
// `negative` is true - confirm.
139 template<
class Input>
140 void store_inf(Input& input, Contents& info,
size_t column,
size_t line,
bool negative)
const {
142 expect_fixed(input,
"nf",
"NF", column, line);
143 auto* current = check_column_type(info, NUMBER, column, line);
145 double val = std::numeric_limits<double>::infinity();
149 static_cast<NumberField*
>(current)->push_back(val);
// Disambiguate between a missing-value keyword and a NaN keyword, both of
// which reach this function from the same dispatch point, and store the result.
//
// input:  byte source positioned inside the keyword.
// info:   parsed contents receiving the value.
// column: zero-based column index.
// line:   zero-based line index; reported one-based in messages.
//
// Throws std::runtime_error when the input ends mid-keyword ("truncated
// keyword"), when the second character is not an accepted one ("unknown
// keyword"), or when the line is not newline-terminated.
// NOTE(review): several statements (including the first branch of the
// second-character test and the advance calls) are not visible in this
// excerpt, so the exact set of accepted spellings cannot be confirmed here.
152 template<
class Input>
153 void store_na_or_nan(Input& input, Contents& info,
size_t column,
size_t line)
const {
157 if (!input.valid()) {
158 throw std::runtime_error(
"truncated keyword in " + get_location(column, line));
161 char second = input.get();
// Starts out assuming a missing value; presumably cleared by the branch that
// is not visible here - confirm.
162 bool is_missing =
true;
165 }
else if (second !=
'A') {
166 throw std::runtime_error(
"unknown keyword in " + get_location(column, line));
170 if (!input.valid()) {
172 throw std::runtime_error(
"line " + std::to_string(line + 1) +
" should terminate with a newline");
174 throw std::runtime_error(
"truncated keyword in " + get_location(column, line));
178 char next = input.get();
// A third letter of 'n'/'N' completes a NaN keyword: stored as a quiet NaN in
// a NUMBER column.
179 if (next ==
'n' || next ==
'N') {
180 auto* current = check_column_type(info, NUMBER, column, line);
181 static_cast<NumberField*
>(current)->push_back(std::numeric_limits<double>::quiet_NaN());
183 }
else if (is_missing) {
// The missing-value path fetches the column without a type check, so the
// column's recorded type is not constrained by this value.
184 auto raw = fetch_column(info, column, line);
187 throw std::runtime_error(
"unknown keyword in " + get_location(column, line));
// Parse a numeric token and store it either as a real number or, when it is
// followed by a signed imaginary part ending in 'i', as a complex number.
//
// input:    byte source positioned at the first digit.
// info:     parsed contents receiving the value.
// column:   zero-based column index.
// line:     zero-based line index, used in error locations.
// negative: whether a minus sign preceded the first number at the call site.
//
// Throws std::runtime_error for a malformed number, a truncated or malformed
// imaginary part, or a missing trailing 'i'. check_column_type() may also throw
// on a type conflict with earlier records.
// NOTE(review): the handling of `negative`, of the '-' branch for the second
// sign, and of the input advances is not visible in this excerpt.
191 template<
class Input>
192 void store_number_or_complex(Input& input, Contents& info,
size_t column,
size_t line,
bool negative)
const {
193 auto first = to_number(input, column, line);
198 char next = input.get();
// A field or line delimiter straight after the number means a plain real.
199 if (next ==
',' || next ==
'\n') {
200 auto* current = check_column_type(info, NUMBER, column, line);
201 static_cast<NumberField*
>(current)->push_back(first);
// NOTE(review): declared as char but initialised with a boolean literal; bool
// would state the intent more clearly.
205 char second_neg =
false;
208 }
else if (next !=
'+') {
209 throw std::runtime_error(
"incorrectly formatted number in " + get_location(column, line));
// The imaginary part must start with a digit.
213 if (!input.valid()) {
214 throw std::runtime_error(
"truncated complex number in " + get_location(column, line));
215 }
else if (!std::isdigit(input.get())) {
216 throw std::runtime_error(
"incorrectly formatted complex number in " + get_location(column, line));
219 auto second = to_number(input, column, line);
// The imaginary part must be terminated by a lowercase 'i'.
223 if (input.get() !=
'i') {
224 throw std::runtime_error(
"incorrectly formatted complex number in " + get_location(column, line));
228 auto* current = check_column_type(info, COMPLEX, column, line);
229 static_cast<ComplexField*
>(current)->push_back(std::complex<double>(first, second));
// Core parsing routine: reads the header, validates it against any names and
// fields already present in `info`, then parses the remaining bytes field by
// field, dispatching on the first character of each field.
//
// input: byte source offering valid() and get() (other members are used on
//        lines not visible in this excerpt).
// info:  parsed contents; `names`, `fields` and `fallback` are read and/or
//        filled in.
//
// Throws std::runtime_error on any format violation; the messages below state
// the individual conditions.
// NOTE(review): many statements of this function (loop headers, case labels,
// advances, declarations of `column` and `line` for the body) are not visible
// in this excerpt, so the comments describe only the visible checks.
233 template<
class Input>
234 void parse_loop(Input& input, Contents& info)
const {
235 if (!input.valid()) {
236 throw std::runtime_error(
"CSV file is empty");
// Input that starts with a newline has no header fields. This branch works on
// info.fallback and rejects any subsequent line that is not just a newline.
// Presumably `fallback` counts the records of a zero-column file - confirm.
240 if (input.get() ==
'\n') {
241 auto& line = info.fallback;
244 if (!input.valid()) {
248 if (input.get() !=
'\n') {
249 throw std::runtime_error(
"more fields on line " + std::to_string(line + 1) +
" than expected from the header");
// Header parsing: names are collected into a local vector first so they can be
// compared with any names supplied in `info`.
256 std::vector<std::string> header_names;
258 char c = input.get();
260 throw std::runtime_error(
"all headers should be quoted strings");
263 header_names.push_back(to_string(input, info.names.size(), 0));
265 char next = input.get();
269 }
else if (next !=
',') {
270 throw std::runtime_error(
"header " + std::to_string(info.names.size()) +
" contains trailing character '" + std::string(1, next) +
"'");
// Header names must be unique.
275 std::unordered_set<std::string> copy;
276 for (
const auto& x : header_names) {
277 if (copy.find(x) != copy.end()) {
278 throw std::runtime_error(
"detected duplicated header name '" + x +
"'");
// With no names supplied, adopt the observed header; otherwise the supplied
// names must match the header exactly, in count and in order.
286 if (info.names.empty()) {
287 info.names.swap(header_names);
289 if (info.names.size() != header_names.size()) {
290 throw std::runtime_error(
"provided number of names is not equal to the number of header names");
292 for (
size_t i = 0, end = header_names.size(); i < end; ++i) {
293 if (info.names[i] != header_names[i]) {
294 throw std::runtime_error(
"mismatch between provided and observed header name for column " + std::to_string(i + 1) +
" ('" + info.names[i] +
"', '" + header_names[i] +
"')");
// With no fields supplied, start every column as an UnknownField placeholder;
// otherwise the supplied field count must equal the number of names.
299 if (info.fields.empty()) {
300 info.fields.resize(info.names.size());
301 for (
auto& o : info.fields) {
302 o.reset(
new UnknownField);
304 }
else if (info.fields.size() != info.names.size()) {
305 throw std::runtime_error(
"provided number of fields is not equal to the number of header names");
309 if (!input.valid()) {
// Per-field dispatch on the first character of the field.
317 switch (input.get()) {
320 auto* current = check_column_type(info, STRING, column, line);
321 static_cast<StringField*
>(current)->push_back(to_string(input, column, line));
328 expect_fixed(input,
"rue",
"RUE", column, line);
329 auto* current = check_column_type(info, BOOLEAN, column, line);
337 expect_fixed(input,
"alse",
"ALSE", column, line);
338 auto* current = check_column_type(info, BOOLEAN, column, line);
344 store_na_or_nan(input, info, column, line);
348 store_nan(input, info, column, line);
352 store_inf(input, info, column, line,
false);
355 case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
356 store_number_or_complex(input, info, column, line,
false);
// This path requires a digit to follow before parsing a non-negative number;
// presumably it handles an explicit leading '+' (case label not visible).
361 if (!input.valid()) {
362 throw std::runtime_error(
"truncated field in " + get_location(column, line));
363 }
else if (!std::isdigit(input.get())) {
364 throw std::runtime_error(
"invalid number in " + get_location(column, line));
366 store_number_or_complex(input, info, column, line,
false);
// This path passes negative=true for infinity and numbers; presumably it
// handles a leading '-' (case label not visible).
372 if (!input.valid()) {
373 throw std::runtime_error(
"truncated field in " + get_location(column, line));
376 char next = input.get();
377 if (next ==
'i' || next ==
'I') {
378 store_inf(input, info, column, line,
true);
379 }
else if (next ==
'n' || next ==
'N') {
380 store_nan(input, info, column, line);
381 }
else if (std::isdigit(next)) {
382 store_number_or_complex(input, info, column, line,
true);
384 throw std::runtime_error(
"incorrectly formatted number in " + get_location(column, line));
390 throw std::runtime_error(get_location(column, line) +
" is empty");
393 throw std::runtime_error(
"unknown type starting with '" + std::string(1, input.get()) +
"' in " + get_location(column, line));
// After each field, the next byte must be a separator or a newline, and a
// newline is only accepted once every header column has been filled.
396 if (!input.valid()) {
397 throw std::runtime_error(
"last line must be terminated by a single newline");
400 char next = input.get();
404 if (!input.valid()) {
405 throw std::runtime_error(
"line " + std::to_string(line + 1) +
" is truncated at column " + std::to_string(column + 1));
407 }
else if (next ==
'\n') {
408 if (column + 1 != info.names.size()) {
409 throw std::runtime_error(
"line " + std::to_string(line + 1) +
" has fewer fields than expected from the header");
411 if (!input.valid()) {
417 throw std::runtime_error(get_location(column, line) +
" contains trailing character '" + std::string(1, next) +
"'");
// Entry point that wraps a reader in a byte-wise adaptor and runs parse_loop().
//
// reader:   source object whose address is handed to the byteme adaptor.
// info:     parsed contents to fill; see parse_loop().
// parallel: selects between the two adaptors below.
//           NOTE(review): the branching statement itself is not visible in
//           this excerpt; presumably true selects byteme::PerByteParallel and
//           false selects byteme::PerByte - confirm.
// Any exception thrown by parse_loop() propagates to the caller.
423 template<
class Reader>
424 void parse(Reader& reader, Contents& info,
bool parallel)
const {
426 byteme::PerByteParallel input(&reader);
427 parse_loop(input, info);
429 byteme::PerByte input(&reader);
430 parse_loop(input, info);
// Factory used to create a concrete field when a column's type is first
// observed; set from the constructor argument and not owned here as far as
// this excerpt shows.
434 const FieldCreator* creator;
// When true, columns that appear in neither set below are created as dummy
// fields (see the `use_dummy` computation in check_column_type()).
436 bool check_store =
false;
// Names of columns to store; consulted only when check_store is true.
437 std::unordered_set<std::string> to_store_by_name;
// Zero-based indices of columns to store; consulted only when check_store is
// true.
438 std::unordered_set<size_t> to_store_by_index;
Defines the FieldCreator class and defaults.
Defines the Field virtual class and concrete implementations.
Contains all comservatory functions and classes.
Definition: Field.hpp:16
TypedField< std::string, STRING > StringField
Definition: Field.hpp:189
TypedField< bool, BOOLEAN > BooleanField
Virtual class for a Field of booleans.
Definition: Field.hpp:219
TypedField< std::complex< double >, COMPLEX > ComplexField
Definition: Field.hpp:234
TypedField< double, NUMBER > NumberField
Definition: Field.hpp:204
The parsed contents of a CSV file.
Definition: Parser.hpp:37
std::vector< std::unique_ptr< Field > > fields
Definition: Parser.hpp:41
size_t num_fields() const
Definition: Parser.hpp:52
size_t num_records() const
Definition: Parser.hpp:59
std::vector< std::string > names
Definition: Parser.hpp:47