takane
Validators for ArtifactDB file formats
Loading...
Searching...
No Matches
utils_csv.hpp
Go to the documentation of this file.
1#ifndef TAKANE_UTILS_CSV_HPP
2#define TAKANE_UTILS_CSV_HPP
3
#include "ritsuko/ritsuko.hpp"
#include "comservatory/comservatory.hpp"

#include <cmath>
#include <cstddef>
#include <memory>
#include <stdexcept>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
12
18namespace takane {
19
31 ~CsvFieldCreator() = default;
40 virtual comservatory::NumberField* integer() {
41 return number();
42 }
43
47 virtual comservatory::StringField* string() = 0;
48
52 virtual comservatory::NumberField* number() = 0;
53
57 virtual comservatory::BooleanField* boolean() = 0;
58};
59
68 comservatory::StringField* string() {
69 return new comservatory::DummyStringField;
70 }
71
72 comservatory::NumberField* number() {
73 return new comservatory::DummyNumberField;
74 }
75
76 comservatory::BooleanField* boolean() {
77 return new comservatory::DummyBooleanField;
78 }
79};
80
89 std::vector<std::unique_ptr<comservatory::Field> > fields;
90
94 void reconstitute(std::vector<std::unique_ptr<comservatory::Field> >& source) {
95 for (size_t c = 0, total = source.size(); c < total; ++c) {
96 if (!fields[c]) {
97 fields[c].swap(source[c]);
98 }
99 }
100 }
104};
105
109struct CsvNameField : public comservatory::StringField {
110 CsvNameField(bool ar, comservatory::StringField* p) : as_rownames(ar), child(p) {}
111
112public:
113 bool as_rownames;
114 comservatory::StringField* child;
115
116public:
117 void add_missing() {
118 throw std::runtime_error("missing values should not be present in the " + (as_rownames ? std::string("row names") : std::string("names")) + " column");
119 }
120
121 void push_back(std::string x) {
122 child->push_back(std::move(x));
123 }
124
125 size_t size() const {
126 return child->size();
127 }
128
129 bool filled() const {
130 return true;
131 }
132};
133
/**
 * Largest value of a 32-bit signed integer, expressed in the requested type.
 *
 * @tparam T Type to return the limit as.
 * @return 2147483647 (i.e., 2^31 - 1) converted to `T`.
 */
template<typename T>
constexpr T upper_integer_limit() {
    return 0x7FFFFFFF; // == 2147483647
}
138
/**
 * Smallest value of a 32-bit signed integer, expressed in the requested type.
 *
 * @tparam T Type to return the limit as.
 * @return -2147483648 (i.e., -2^31) converted to `T`.
 */
template<typename T>
constexpr T lower_integer_limit() {
    // Written as a subtraction because "-2147483648" is a unary minus applied to the
    // literal 2147483648, which does not fit in an int; some compilers give that
    // literal an unsigned type, so the negation would not produce a negative number.
    // Both operands here fit in an int, so the result is always exactly -2^31.
    return -2147483647 - 1;
}
143
144struct CsvIntegerField : public comservatory::NumberField {
145 CsvIntegerField(int cid, comservatory::NumberField* p) : column_id(cid), child(p) {}
146
147public:
148 int column_id;
149 comservatory::NumberField* child;
150
151public:
152 void add_missing() {
153 child->add_missing();
154 }
155
156 void push_back(double x) {
157 if (x < lower_integer_limit<double>() || x > upper_integer_limit<double>()) { // constrain within limits.
158 throw std::runtime_error("value in column " + std::to_string(column_id + 1) + " does not fit inside a 32-bit signed integer");
159 }
160 if (x != std::floor(x)) {
161 throw std::runtime_error("value in column " + std::to_string(column_id + 1) + " is not an integer");
162 }
163 child->push_back(x);
164 }
165
166 size_t size() const {
167 return child->size();
168 }
169
170 bool filled() const {
171 return true;
172 }
173};
174
175struct CsvNonNegativeIntegerField : public comservatory::NumberField {
176 CsvNonNegativeIntegerField(int cid, comservatory::NumberField* p) : column_id(cid), child(p) {}
177
178public:
179 int column_id;
180 comservatory::NumberField* child;
181
182public:
183 void add_missing() {
184 child->add_missing();
185 }
186
187 void push_back(double x) {
188 if (x < 0) {
189 throw std::runtime_error("value in column " + std::to_string(column_id + 1) + " should not be negative");
190 }
191 if (x > upper_integer_limit<double>()) {
192 throw std::runtime_error("value in column " + std::to_string(column_id + 1) + " does not fit inside a 32-bit signed integer");
193 }
194 if (x != std::floor(x)) {
195 throw std::runtime_error("value in column " + std::to_string(column_id + 1) + " is not an integer");
196 }
197 child->push_back(x);
198 }
199
200 size_t size() const {
201 return child->size();
202 }
203
204 bool filled() const {
205 return true;
206 }
207};
208
209struct CsvDateField : public comservatory::StringField {
210 CsvDateField(int cid, comservatory::StringField* p) : column_id(cid), child(p) {}
211
212public:
213 int column_id;
214 comservatory::StringField* child;
215
216public:
217 void push_back(std::string x) {
218 if (!ritsuko::is_date(x.c_str(), x.size())) {
219 throw std::runtime_error("expected a date in column " + std::to_string(column_id + 1) + ", got '" + x + "' instead");
220 }
221 child->push_back(std::move(x));
222 }
223
224 void add_missing() {
225 child->add_missing();
226 }
227
228 size_t size() const {
229 return child->size();
230 }
231
232 bool filled() const {
233 return true;
234 }
235};
236
237struct CsvDateTimeField : public comservatory::StringField {
238 CsvDateTimeField(int cid, comservatory::StringField* p) : column_id(cid), child(p) {}
239
240public:
241 int column_id;
242 comservatory::StringField* child;
243
244public:
245 void push_back(std::string x) {
246 if (!ritsuko::is_rfc3339(x.c_str(), x.size())) {
247 throw std::runtime_error("expected an Internet date/time in column " + std::to_string(column_id + 1) + ", got '" + x + "' instead");
248 }
249 child->push_back(std::move(x));
250 }
251
252 void add_missing() {
253 child->add_missing();
254 }
255
256 size_t size() const {
257 return child->size();
258 }
259
260 bool filled() const {
261 return true;
262 }
263};
264
265struct CsvFactorV1Field : public comservatory::StringField {
266 CsvFactorV1Field(int cid, const std::unordered_set<std::string>* l, comservatory::StringField* p) : column_id(cid), levels(l), child(p) {}
267
268public:
269 int column_id;
270 const std::unordered_set<std::string>* levels;
271 comservatory::StringField* child;
272
273public:
274 void push_back(std::string x) {
275 if (levels->find(x) == levels->end()) {
276 throw std::runtime_error("value '" + x + "' in column " + std::to_string(column_id + 1) + " does not refer to a valid level");
277 }
278 child->push_back(std::move(x));
279 }
280
281 void add_missing() {
282 child->add_missing();
283 }
284
285 size_t size() const {
286 return child->size();
287 }
288
289 bool filled() const {
290 return true;
291 }
292};
293
294struct CsvFactorV2Field : public comservatory::NumberField {
295 CsvFactorV2Field(int cid, size_t l, comservatory::NumberField* p) : column_id(cid), nlevels(l), child(p) {
296 if (nlevels > upper_integer_limit<size_t>()) {
297 throw std::runtime_error("number of levels must fit into a 32-bit signed integer");
298 }
299 }
300
301public:
302 int column_id;
303 double nlevels; // casting for an easier comparison.
304 comservatory::NumberField* child;
305
306public:
307 void push_back(double x) {
308 if (x < 0 || x >= nlevels) {
309 throw std::runtime_error("code in column " + std::to_string(column_id + 1) + " should be non-negative and less than the number of levels");
310 }
311 if (x != std::floor(x)) {
312 throw std::runtime_error("value in column " + std::to_string(column_id + 1) + " is not an integer");
313 }
314 child->push_back(x);
315 }
316
317 void add_missing() {
318 child->add_missing();
319 }
320
321 size_t size() const {
322 return child->size();
323 }
324
325 bool filled() const {
326 return true;
327 }
328};
329
330struct CsvCompressedLengthField : public CsvNonNegativeIntegerField {
331 CsvCompressedLengthField(int cid, comservatory::NumberField* p) : CsvNonNegativeIntegerField(cid, p) {}
332
333 void add_missing() {
334 throw std::runtime_error("lengths should not be missing");
335 }
336
337 void push_back(double x) {
338 CsvNonNegativeIntegerField::push_back(x);
339 total += static_cast<size_t>(x);
340 }
341
342 size_t total = 0;
343};
344
345struct CsvUniqueStringField : public comservatory::StringField {
346 CsvUniqueStringField(int cid, comservatory::StringField* p) : column_id(cid), child(p) {}
347
348private:
349 int column_id;
350 comservatory::StringField* child;
351 std::unordered_set<std::string> collected;
352
353public:
354 void add_missing() {
355 throw std::runtime_error("missing values should not be present in column " + std::to_string(column_id));
356 }
357
358 void push_back(std::string x) {
359 if (collected.find(x) != collected.end()) {
360 throw std::runtime_error("duplicated value '" + x + "' in column " + std::to_string(column_id));
361 }
362 collected.insert(x);
363 child->push_back(std::move(x));
364 }
365
366 size_t size() const {
367 return child->size();
368 }
369
370 bool filled() const {
371 return true;
372 }
373};
378}
379
380#endif
takane validation functions.
Definition _derived_from.hpp:15
Contents of the parsed CSV.
Definition utils_csv.hpp:84
std::vector< std::unique_ptr< comservatory::Field > > fields
Definition utils_csv.hpp:89
Create comservatory::Field objects to capture column contents.
Definition utils_csv.hpp:27
virtual comservatory::NumberField * integer()
Definition utils_csv.hpp:40
virtual comservatory::NumberField * number()=0
virtual comservatory::BooleanField * boolean()=0
virtual comservatory::StringField * string()=0
Dummy column creator.
Definition utils_csv.hpp:67
comservatory::StringField * string()
Definition utils_csv.hpp:68
comservatory::BooleanField * boolean()
Definition utils_csv.hpp:76
comservatory::NumberField * number()
Definition utils_csv.hpp:72