takane
Validators for ArtifactDB file formats
Loading...
Searching...
No Matches
sequence_string_set.hpp
Go to the documentation of this file.
1#ifndef TAKANE_SEQUENCE_STRING_SET_HPP
2#define TAKANE_SEQUENCE_STRING_SET_HPP
3
4#include "byteme/byteme.hpp"
5
6#include "ritsuko/ritsuko.hpp"
7#include "utils_other.hpp"
8
9#include <array>
10#include <algorithm>
11#include <stdexcept>
12#include <limits>
13#include <cctype>
14#include <filesystem>
15
21namespace takane {
22
27namespace sequence_string_set {
28
32namespace internal {
33
34inline int char2int(char val) {
35 return static_cast<int>(val) - static_cast<int>(std::numeric_limits<char>::min());
36}
37
38template<bool has_quality_, bool parallel_>
39size_t parse_sequences(const std::filesystem::path& path, std::array<bool, 255> allowed, char lowest_quality) {
40 auto gzreader = internal_other::open_reader<byteme::GzipFileReader>(path);
41 typedef typename std::conditional<parallel_, byteme::PerByteParallel<>, byteme::PerByte<> >::type PB;
42 PB pb(&gzreader);
43
44 size_t nseq = 0;
45 size_t line_count = 0;
46 auto advance_and_check = [&]() -> char {
47 if (!pb.advance()) {
48 throw std::runtime_error("premature end of the file at line " + std::to_string(line_count + 1));
49 }
50 return pb.get();
51 };
52
53 while (pb.valid()) {
54 // Processing the name.
55 char val = pb.get();
56 if constexpr(!has_quality_) {
57 if (val != '>') {
58 throw std::runtime_error("sequence name should start with '>' at line " + std::to_string(line_count + 1));
59 }
60 } else {
61 if (val != '@') {
62 throw std::runtime_error("sequence name should start with '@' at line " + std::to_string(line_count + 1));
63 }
64 }
65
66 val = advance_and_check();
67 size_t proposed = 0;
68 bool empty = true;
69 while (val != '\n') {
70 if (!std::isdigit(val)) {
71 throw std::runtime_error("sequence name should be a non-negative integer at line " + std::to_string(line_count + 1));
72 }
73 empty = false;
74 proposed *= 10;
75 proposed += (val - '0');
76 val = advance_and_check();
77 }
78 if (empty || proposed != nseq) {
79 throw std::runtime_error("sequence name should be its index at line " + std::to_string(line_count + 1));
80 }
81 ++line_count;
82
83 if constexpr(!has_quality_) {
84 // Processing the sequence itself until we get to a '>' or we run out of bytes.
85 val = advance_and_check();
86 while (true) {
87 if (val == '\n') {
88 ++line_count;
89 if (!pb.advance()) {
90 break;
91 }
92 val = pb.get();
93 if (val == '>') {
94 break;
95 }
96 } else {
97 if (!allowed[char2int(val)]) {
98 throw std::runtime_error("forbidden character '" + std::string(1, val) + "' in sequence at line " + std::to_string(line_count + 1));
99 }
100 val = advance_and_check();
101 }
102 }
103
104 } else {
105 // Processing the sequence itself until we get to a '+'.
106 val = advance_and_check();
107 size_t seq_length = 0;
108 while (true) {
109 if (val == '\n') {
110 ++line_count;
111 val = advance_and_check();
112 if (val == '+') {
113 break;
114 }
115 } else {
116 if (!allowed[char2int(val)]) {
117 throw std::runtime_error("forbidden character '" + std::string(1, val) + "' in sequence at line " + std::to_string(line_count + 1));
118 }
119 ++seq_length;
120 val = advance_and_check();
121 }
122 }
123
124 // Next should be a single line; starting with '+' is implicit from above.
125 do {
126 val = advance_and_check();
127 } while (val != '\n');
128 ++line_count;
129
130 // Processing the qualities. Extraction is allowed to fail if we're at
131 // the end of the file. Note that we can't check for '@' as a
132 // delimitor, as this can be a valid score, so instead we check at each
133 // newline whether we've reached the specified length, and quit if so.
134 size_t qual_length = 0;
135 while (true) {
136 val = advance_and_check();
137 if (val == '\n') {
138 ++line_count;
139 if (qual_length >= seq_length) {
140 while (pb.advance() && pb.get() == '\n') {} // sneak past any newlines.
141 break;
142 }
143 } else {
144 if (val < lowest_quality) {
145 throw std::runtime_error("out-of-range quality score '" + std::string(1, val) + "' detected at line " + std::to_string(line_count + 1));
146 }
147 ++qual_length;
148 }
149 }
150
151 if (qual_length != seq_length) {
152 throw std::runtime_error("unequal lengths for quality and sequence strings at line " + std::to_string(line_count + 1) + ")");
153 }
154 }
155
156 ++nseq;
157 }
158
159 return nseq;
160}
161
162template<bool parallel_>
163size_t parse_names(const std::filesystem::path& path) {
164 auto gzreader = internal_other::open_reader<byteme::GzipFileReader>(path);
165 typedef typename std::conditional<parallel_, byteme::PerByteParallel<>, byteme::PerByte<> >::type PB;
166 PB pb(&gzreader);
167
168 size_t nseq = 0;
169 size_t line_count = 0;
170 auto advance_and_check = [&]() -> char {
171 if (!pb.advance()) {
172 throw std::runtime_error("premature end of the file at line " + std::to_string(line_count + 1));
173 }
174 return pb.get();
175 };
176
177 while (pb.valid()) {
178 char val = pb.get();
179 if (val != '"') {
180 throw std::runtime_error("name should start with a quote");
181 }
182
183 while (true) {
184 val = advance_and_check();
185 if (val == '"') {
186 val = advance_and_check();
187 if (val == '\n') {
188 ++nseq;
189 ++line_count;
190 pb.advance();
191 break;
192 } else if (val != '"') {
193 throw std::runtime_error("characters present after end quote at line " + std::to_string(line_count + 1));
194 }
195 } else if (val == '\n') {
196 ++line_count;
197 }
198 }
199 }
200
201 return nseq;
202}
203
204}
214inline void validate(const std::filesystem::path& path, const ObjectMetadata& metadata, Options& options) {
215 const std::string type_name = "sequence_string_set"; // use a separate variable to avoid dangling reference warnings from GCC.
216 const auto& obj = internal_json::extract_typed_object_from_metadata(metadata.other, type_name);
217
218 const std::string version_name = "version"; // again, avoid dangling reference warnings.
219 const auto& vstring = internal_json::extract_string_from_typed_object(obj, version_name, type_name);
220 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true);
221 if (version.major != 1) {
222 throw std::runtime_error("unsupported version string '" + vstring + "'");
223 }
224
225 size_t expected_nseq = 0;
226 {
227 auto lIt = obj.find("length");
228 if (lIt == obj.end()) {
229 throw std::runtime_error("expected a 'sequence_string_set.length' property");
230 }
231
232 const auto& val = lIt->second;
233 if (val->type() != millijson::NUMBER) {
234 throw std::runtime_error("'sequence_string_set.length' property should be a JSON number");
235 }
236
237 auto num = reinterpret_cast<const millijson::Number*>(val.get())->value;
238 if (num < 0 || std::floor(num) != num) {
239 throw std::runtime_error("'sequence_string_set.length' should be a non-negative integer");
240 }
241
242 expected_nseq = num;
243 }
244
245 std::array<bool, 255> allowed;
246 std::fill(allowed.begin(), allowed.end(), false);
247 {
248 const std::string stype_name = "sequence_type"; // avoid dangling reference again.
249 const std::string& stype = internal_json::extract_string(obj, stype_name, [&](std::exception& e) -> void {
250 throw std::runtime_error("failed to extract 'sequence_string_set." + stype_name + "' from the object metadata; " + std::string(e.what()));
251 });
252
253 std::string allowable;
254 if (stype == "DNA" || stype == "RNA") {
255 allowable = "ACGRYSWKMBDHVN";
256 if (stype == "DNA") {
257 allowable += "T";
258 } else {
259 allowable += "U";
260 }
261 } else if (stype == "AA") {
262 allowable = "ACDEFGHIKLMNPQRSTVWY";
263 } else if (stype == "custom") {
264 std::fill(allowed.begin() + internal::char2int('!'), allowed.begin() + internal::char2int('~') + 1, true);
265 } else {
266 throw std::runtime_error("invalid string '" + stype + "' in the 'sequence_string_set." + stype_name + "' property");
267 }
268
269 for (auto a : allowable) {
270 allowed[internal::char2int(a)] = true;
271 allowed[internal::char2int(std::tolower(a))] = true;
272 }
273 allowed[internal::char2int('.')] = true;
274 allowed[internal::char2int('-')] = true;
275 }
276
277 bool has_qualities = false;
278 char lowest_quality = 0;
279 {
280 auto xIt = obj.find("quality_type");
281 if (xIt != obj.end()) {
282 const auto& val = xIt->second;
283 if (val->type() != millijson::STRING) {
284 throw std::runtime_error("'sequence_string_set.quality_type' property should be a JSON string");
285 }
286
287 const auto& qtype = reinterpret_cast<const millijson::String*>(val.get())->value;
288 has_qualities = true;
289
290 if (qtype == "phred") {
291 auto oIt = obj.find("quality_offset");
292 if (oIt == obj.end()) {
293 throw std::runtime_error("expected a 'sequence_string_set.quality_offset' property for Phred quality scores");
294 }
295
296 const auto& val = oIt->second;
297 if (val->type() != millijson::NUMBER) {
298 throw std::runtime_error("'sequence_string_set.quality_offset' property should be a JSON number");
299 }
300
301 double offset = reinterpret_cast<const millijson::Number*>(val.get())->value;
302 if (offset == 33) {
303 lowest_quality = '!';
304 } else if (offset == 64) {
305 lowest_quality = '@';
306 } else {
307 throw std::runtime_error("'sequence_string_set.quality_offset' property should be either 33 or 64");
308 }
309
310 } else if (qtype == "solexa") {
311 lowest_quality = ';';
312
313 } else if (qtype == "none") {
314 has_qualities = false;
315
316 } else {
317 throw std::runtime_error("invalid string '" + qtype + "' for the 'sequence_string_set.quality_type' property");
318 }
319 }
320 }
321
322 size_t nseq = 0;
323 if (has_qualities) {
324 auto spath = path / "sequences.fastq.gz";
325 if (options.parallel_reads) {
326 nseq = internal::parse_sequences<true, true>(spath, allowed, lowest_quality);
327 } else {
328 nseq = internal::parse_sequences<true, false>(spath, allowed, lowest_quality);
329 }
330 } else {
331 auto spath = path / "sequences.fasta.gz";
332 if (options.parallel_reads) {
333 nseq = internal::parse_sequences<false, true>(spath, allowed, lowest_quality);
334 } else {
335 nseq = internal::parse_sequences<false, false>(spath, allowed, lowest_quality);
336 }
337 }
338 if (nseq != expected_nseq) {
339 throw std::runtime_error("observed number of sequences is different from the expected number (" + std::to_string(nseq) + " to " + std::to_string(expected_nseq) + ")");
340 }
341
342 auto npath = path / "names.txt.gz";
343 if (std::filesystem::exists(npath)) {
344 size_t nnames = 0;
345 if (options.parallel_reads) {
346 nnames = internal::parse_names<true>(npath);
347 } else {
348 nnames = internal::parse_names<false>(npath);
349 }
350 if (nnames != expected_nseq) {
351 throw std::runtime_error("number of names is different from the number of sequences (" + std::to_string(nnames) + " to " + std::to_string(expected_nseq) + ")");
352 }
353 }
354
355 internal_other::validate_mcols(path, "sequence_annotations", nseq, options);
356 internal_other::validate_metadata(path, "other_annotations", options);
357}
358
365inline size_t height([[maybe_unused]] const std::filesystem::path& path, const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
366 const auto& obj = internal_json::extract_typed_object_from_metadata(metadata.other, "sequence_string_set");
367 auto lIt = obj.find("length");
368 const auto& val = lIt->second;
369 return reinterpret_cast<const millijson::Number*>(val.get())->value;
370}
371
372}
373
374}
375
376#endif
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition sequence_string_set.hpp:365
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition sequence_string_set.hpp:214
takane validation functions.
Definition _derived_from.hpp:15
Object metadata, including the type and other fields.
Definition utils_public.hpp:26
std::unordered_map< std::string, std::shared_ptr< millijson::Base > > other
Definition utils_public.hpp:35
Validation options.
Definition utils_public.hpp:94
bool parallel_reads
Definition utils_public.hpp:98