takane
Validators for ArtifactDB file formats
Loading...
Searching...
No Matches
sequence_string_set.hpp
Go to the documentation of this file.
1#ifndef TAKANE_SEQUENCE_STRING_SET_HPP
2#define TAKANE_SEQUENCE_STRING_SET_HPP
3
4#include "byteme/byteme.hpp"
5
6#include "ritsuko/ritsuko.hpp"
7#include "utils_other.hpp"
8
9#include <array>
10#include <algorithm>
11#include <stdexcept>
12#include <limits>
13#include <cctype>
14#include <filesystem>
15
21namespace takane {
22
27namespace sequence_string_set {
28
32namespace internal {
33
34inline int char2int(char val) {
35 return static_cast<int>(val) - static_cast<int>(std::numeric_limits<char>::min());
36}
37
38template<bool has_quality_, bool parallel_>
39size_t parse_sequences(const std::filesystem::path& path, std::array<bool, 255> allowed, char lowest_quality) {
40 auto gzreader = internal_other::open_reader<byteme::GzipFileReader>(path);
41 typedef typename std::conditional<parallel_, byteme::PerByteParallel<>, byteme::PerByte<> >::type PB;
42 PB pb(&gzreader);
43
44 size_t nseq = 0;
45 size_t line_count = 0;
46 auto advance_and_check = [&]() -> char {
47 if (!pb.advance()) {
48 throw std::runtime_error("premature end of the file at line " + std::to_string(line_count + 1));
49 }
50 return pb.get();
51 };
52
53 while (pb.valid()) {
54 // Processing the name.
55 char val = pb.get();
56 if constexpr(!has_quality_) {
57 if (val != '>') {
58 throw std::runtime_error("sequence name should start with '>' at line " + std::to_string(line_count + 1));
59 }
60 } else {
61 if (val != '@') {
62 throw std::runtime_error("sequence name should start with '@' at line " + std::to_string(line_count + 1));
63 }
64 }
65
66 val = advance_and_check();
67 size_t proposed = 0;
68 bool empty = true;
69 while (val != '\n') {
70 if (!std::isdigit(val)) {
71 throw std::runtime_error("sequence name should be a non-negative integer at line " + std::to_string(line_count + 1));
72 }
73 empty = false;
74 proposed *= 10;
75 proposed += (val - '0');
76 val = advance_and_check();
77 }
78 if (empty || proposed != nseq) {
79 throw std::runtime_error("sequence name should be its index at line " + std::to_string(line_count + 1));
80 }
81 ++line_count;
82
83 if constexpr(!has_quality_) {
84 // Processing the sequence itself until we get to a '>' or we run out of bytes.
85 val = advance_and_check();
86 while (true) {
87 if (val == '\n') {
88 ++line_count;
89 if (!pb.advance()) {
90 break;
91 }
92 val = pb.get();
93 if (val == '>') {
94 break;
95 }
96 } else {
97 if (!allowed[char2int(val)]) {
98 throw std::runtime_error("forbidden character '" + std::string(1, val) + "' in sequence at line " + std::to_string(line_count + 1));
99 }
100 val = advance_and_check();
101 }
102 }
103
104 } else {
105 // Processing the sequence itself until we get to a '+'.
106 val = advance_and_check();
107 size_t seq_length = 0;
108 while (true) {
109 if (val == '\n') {
110 ++line_count;
111 val = advance_and_check();
112 if (val == '+') {
113 break;
114 }
115 } else {
116 if (!allowed[char2int(val)]) {
117 throw std::runtime_error("forbidden character '" + std::string(1, val) + "' in sequence at line " + std::to_string(line_count + 1));
118 }
119 ++seq_length;
120 val = advance_and_check();
121 }
122 }
123
124 // Next should be a single line; starting with '+' is implicit from above.
125 do {
126 val = advance_and_check();
127 } while (val != '\n');
128 ++line_count;
129
130 // Processing the qualities. Extraction is allowed to fail if we're at
131 // the end of the file. Note that we can't check for '@' as a
132 // delimitor, as this can be a valid score, so instead we check at each
133 // newline whether we've reached the specified length, and quit if so.
134 size_t qual_length = 0;
135 while (true) {
136 val = advance_and_check();
137 if (val == '\n') {
138 ++line_count;
139 if (qual_length >= seq_length) {
140 while (pb.advance() && pb.get() == '\n') {} // sneak past any newlines.
141 break;
142 }
143 } else {
144 if (val < lowest_quality) {
145 throw std::runtime_error("out-of-range quality score '" + std::string(1, val) + "' detected at line " + std::to_string(line_count + 1));
146 }
147 ++qual_length;
148 }
149 }
150
151 if (qual_length != seq_length) {
152 throw std::runtime_error("unequal lengths for quality and sequence strings at line " + std::to_string(line_count + 1) + ")");
153 }
154 }
155
156 ++nseq;
157 }
158
159 return nseq;
160}
161
162template<bool parallel_>
163size_t parse_names(const std::filesystem::path& path) {
164 auto gzreader = internal_other::open_reader<byteme::GzipFileReader>(path);
165 typedef typename std::conditional<parallel_, byteme::PerByteParallel<>, byteme::PerByte<> >::type PB;
166 PB pb(&gzreader);
167
168 size_t nseq = 0;
169 size_t line_count = 0;
170 auto advance_and_check = [&]() -> char {
171 if (!pb.advance()) {
172 throw std::runtime_error("premature end of the file at line " + std::to_string(line_count + 1));
173 }
174 return pb.get();
175 };
176
177 while (pb.valid()) {
178 char val = pb.get();
179 if (val != '"') {
180 throw std::runtime_error("name should start with a quote");
181 }
182
183 while (true) {
184 val = advance_and_check();
185 if (val == '"') {
186 val = advance_and_check();
187 if (val == '\n') {
188 ++nseq;
189 ++line_count;
190 pb.advance();
191 break;
192 } else if (val != '"') {
193 throw std::runtime_error("characters present after end quote at line " + std::to_string(line_count + 1));
194 }
195 } else if (val == '\n') {
196 ++line_count;
197 }
198 }
199 }
200
201 return nseq;
202}
203
204}
214inline void validate(const std::filesystem::path& path, const ObjectMetadata& metadata, Options& options) {
215 const auto& obj = internal_json::extract_typed_object_from_metadata(metadata.other, "sequence_string_set");
216 const auto& vstring = internal_json::extract_string_from_typed_object(obj, "version", "sequence_string_set");
217 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true);
218 if (version.major != 1) {
219 throw std::runtime_error("unsupported version string '" + vstring + "'");
220 }
221
222 size_t expected_nseq = 0;
223 {
224 auto lIt = obj.find("length");
225 if (lIt == obj.end()) {
226 throw std::runtime_error("expected a 'sequence_string_set.length' property");
227 }
228
229 const auto& val = lIt->second;
230 if (val->type() != millijson::NUMBER) {
231 throw std::runtime_error("'sequence_string_set.length' property should be a JSON number");
232 }
233
234 auto num = reinterpret_cast<const millijson::Number*>(val.get())->value;
235 if (num < 0 || std::floor(num) != num) {
236 throw std::runtime_error("'sequence_string_set.length' should be a non-negative integer");
237 }
238
239 expected_nseq = num;
240 }
241
242 std::array<bool, 255> allowed;
243 std::fill(allowed.begin(), allowed.end(), false);
244 {
245 const std::string& stype = internal_json::extract_string(obj, "sequence_type", [&](std::exception& e) -> void {
246 throw std::runtime_error("failed to extract 'sequence_string_set.sequence_type' from the object metadata; " + std::string(e.what()));
247 });
248
249 std::string allowable;
250 if (stype == "DNA" || stype == "RNA") {
251 allowable = "ACGRYSWKMBDHVN";
252 if (stype == "DNA") {
253 allowable += "T";
254 } else {
255 allowable += "U";
256 }
257 } else if (stype == "AA") {
258 allowable = "ACDEFGHIKLMNPQRSTVWY";
259 } else if (stype == "custom") {
260 std::fill(allowed.begin() + internal::char2int('!'), allowed.begin() + internal::char2int('~') + 1, true);
261 } else {
262 throw std::runtime_error("invalid string '" + stype + "' in the 'sequence_string_set.sequence_type' property");
263 }
264
265 for (auto a : allowable) {
266 allowed[internal::char2int(a)] = true;
267 allowed[internal::char2int(std::tolower(a))] = true;
268 }
269 allowed[internal::char2int('.')] = true;
270 allowed[internal::char2int('-')] = true;
271 }
272
273 bool has_qualities = false;
274 char lowest_quality = 0;
275 {
276 auto xIt = obj.find("quality_type");
277 if (xIt != obj.end()) {
278 const auto& val = xIt->second;
279 if (val->type() != millijson::STRING) {
280 throw std::runtime_error("'sequence_string_set.quality_type' property should be a JSON string");
281 }
282
283 const auto& qtype = reinterpret_cast<const millijson::String*>(val.get())->value;
284 has_qualities = true;
285
286 if (qtype == "phred") {
287 auto oIt = obj.find("quality_offset");
288 if (oIt == obj.end()) {
289 throw std::runtime_error("expected a 'sequence_string_set.quality_offset' property for Phred quality scores");
290 }
291
292 const auto& val = oIt->second;
293 if (val->type() != millijson::NUMBER) {
294 throw std::runtime_error("'sequence_string_set.quality_offset' property should be a JSON number");
295 }
296
297 double offset = reinterpret_cast<const millijson::Number*>(val.get())->value;
298 if (offset == 33) {
299 lowest_quality = '!';
300 } else if (offset == 64) {
301 lowest_quality = '@';
302 } else {
303 throw std::runtime_error("'sequence_string_set.quality_offset' property should be either 33 or 64");
304 }
305
306 } else if (qtype == "solexa") {
307 lowest_quality = ';';
308
309 } else if (qtype == "none") {
310 has_qualities = false;
311
312 } else {
313 throw std::runtime_error("invalid string '" + qtype + "' for the 'sequence_string_set.quality_type' property");
314 }
315 }
316 }
317
318 size_t nseq = 0;
319 if (has_qualities) {
320 auto spath = path / "sequences.fastq.gz";
321 if (options.parallel_reads) {
322 nseq = internal::parse_sequences<true, true>(spath, allowed, lowest_quality);
323 } else {
324 nseq = internal::parse_sequences<true, false>(spath, allowed, lowest_quality);
325 }
326 } else {
327 auto spath = path / "sequences.fasta.gz";
328 if (options.parallel_reads) {
329 nseq = internal::parse_sequences<false, true>(spath, allowed, lowest_quality);
330 } else {
331 nseq = internal::parse_sequences<false, false>(spath, allowed, lowest_quality);
332 }
333 }
334 if (nseq != expected_nseq) {
335 throw std::runtime_error("observed number of sequences is different from the expected number (" + std::to_string(nseq) + " to " + std::to_string(expected_nseq) + ")");
336 }
337
338 auto npath = path / "names.txt.gz";
339 if (std::filesystem::exists(npath)) {
340 size_t nnames = 0;
341 if (options.parallel_reads) {
342 nnames = internal::parse_names<true>(npath);
343 } else {
344 nnames = internal::parse_names<false>(npath);
345 }
346 if (nnames != expected_nseq) {
347 throw std::runtime_error("number of names is different from the number of sequences (" + std::to_string(nnames) + " to " + std::to_string(expected_nseq) + ")");
348 }
349 }
350
351 internal_other::validate_mcols(path, "sequence_annotations", nseq, options);
352 internal_other::validate_metadata(path, "other_annotations", options);
353}
354
361inline size_t height([[maybe_unused]] const std::filesystem::path& path, const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
362 const auto& obj = internal_json::extract_typed_object_from_metadata(metadata.other, "sequence_string_set");
363 auto lIt = obj.find("length");
364 const auto& val = lIt->second;
365 return reinterpret_cast<const millijson::Number*>(val.get())->value;
366}
367
368}
369
370}
371
372#endif
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition sequence_string_set.hpp:361
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition sequence_string_set.hpp:214
takane validation functions.
Definition _derived_from.hpp:15
Object metadata, including the type and other fields.
Definition utils_public.hpp:26
std::unordered_map< std::string, std::shared_ptr< millijson::Base > > other
Definition utils_public.hpp:35
Validation options.
Definition utils_public.hpp:94
bool parallel_reads
Definition utils_public.hpp:98