takane
Validators for ArtifactDB file formats
Loading...
Searching...
No Matches
sequence_string_set.hpp
Go to the documentation of this file.
1#ifndef TAKANE_SEQUENCE_STRING_SET_HPP
2#define TAKANE_SEQUENCE_STRING_SET_HPP
3
4#include "byteme/byteme.hpp"
5
6#include "ritsuko/ritsuko.hpp"
7#include "utils_other.hpp"
8
9#include <array>
10#include <algorithm>
11#include <stdexcept>
12#include <limits>
13#include <cctype>
14#include <filesystem>
15
21namespace takane {
22
27namespace sequence_string_set {
28
32namespace internal {
33
34inline int char2int(char val) {
35 return static_cast<int>(val) - static_cast<int>(std::numeric_limits<char>::min());
36}
37
38template<bool has_quality_>
39size_t parse_sequences(const std::filesystem::path& path, std::array<bool, 255> allowed, char lowest_quality, bool parallel) {
40 auto gzreader = internal_other::open_reader<byteme::GzipFileReader>(path, byteme::GzipFileReaderOptions());
41 auto pb = internal_other::wrap_reader_for_bytes<char>(std::move(gzreader), parallel);
42
43 size_t nseq = 0;
44 size_t line_count = 0;
45 auto advance_and_check = [&]() -> char {
46 if (!pb->advance()) {
47 throw std::runtime_error("premature end of the file at line " + std::to_string(line_count + 1));
48 }
49 return pb->get();
50 };
51
52 while (pb->valid()) {
53 // Processing the name.
54 char val = pb->get();
55 if constexpr(!has_quality_) {
56 if (val != '>') {
57 throw std::runtime_error("sequence name should start with '>' at line " + std::to_string(line_count + 1));
58 }
59 } else {
60 if (val != '@') {
61 throw std::runtime_error("sequence name should start with '@' at line " + std::to_string(line_count + 1));
62 }
63 }
64
65 val = advance_and_check();
66 size_t proposed = 0;
67 bool empty = true;
68 while (val != '\n') {
69 if (!std::isdigit(val)) {
70 throw std::runtime_error("sequence name should be a non-negative integer at line " + std::to_string(line_count + 1));
71 }
72 empty = false;
73 proposed *= 10;
74 proposed += (val - '0');
75 val = advance_and_check();
76 }
77 if (empty || proposed != nseq) {
78 throw std::runtime_error("sequence name should be its index at line " + std::to_string(line_count + 1));
79 }
80 ++line_count;
81
82 if constexpr(!has_quality_) {
83 // Processing the sequence itself until we get to a '>' or we run out of bytes.
84 val = advance_and_check();
85 while (true) {
86 if (val == '\n') {
87 ++line_count;
88 if (!pb->advance()) {
89 break;
90 }
91 val = pb->get();
92 if (val == '>') {
93 break;
94 }
95 } else {
96 if (!allowed[char2int(val)]) {
97 throw std::runtime_error("forbidden character '" + std::string(1, val) + "' in sequence at line " + std::to_string(line_count + 1));
98 }
99 val = advance_and_check();
100 }
101 }
102
103 } else {
104 // Processing the sequence itself until we get to a '+'.
105 val = advance_and_check();
106 size_t seq_length = 0;
107 while (true) {
108 if (val == '\n') {
109 ++line_count;
110 val = advance_and_check();
111 if (val == '+') {
112 break;
113 }
114 } else {
115 if (!allowed[char2int(val)]) {
116 throw std::runtime_error("forbidden character '" + std::string(1, val) + "' in sequence at line " + std::to_string(line_count + 1));
117 }
118 ++seq_length;
119 val = advance_and_check();
120 }
121 }
122
123 // Next should be a single line; starting with '+' is implicit from above.
124 do {
125 val = advance_and_check();
126 } while (val != '\n');
127 ++line_count;
128
129 // Processing the qualities. Extraction is allowed to fail if we're at
130 // the end of the file. Note that we can't check for '@' as a
131 // delimitor, as this can be a valid score, so instead we check at each
132 // newline whether we've reached the specified length, and quit if so.
133 size_t qual_length = 0;
134 while (true) {
135 val = advance_and_check();
136 if (val == '\n') {
137 ++line_count;
138 if (qual_length >= seq_length) {
139 while (pb->advance() && pb->get() == '\n') {} // sneak past any newlines.
140 break;
141 }
142 } else {
143 if (val < lowest_quality) {
144 throw std::runtime_error("out-of-range quality score '" + std::string(1, val) + "' detected at line " + std::to_string(line_count + 1));
145 }
146 ++qual_length;
147 }
148 }
149
150 if (qual_length != seq_length) {
151 throw std::runtime_error("unequal lengths for quality and sequence strings at line " + std::to_string(line_count + 1) + ")");
152 }
153 }
154
155 ++nseq;
156 }
157
158 return nseq;
159}
160
161inline size_t parse_names(const std::filesystem::path& path, bool parallel) {
162 auto gzreader = internal_other::open_reader<byteme::GzipFileReader>(path, byteme::GzipFileReaderOptions());
163 auto pb = internal_other::wrap_reader_for_bytes<char>(std::move(gzreader), parallel);
164
165 size_t nseq = 0;
166 size_t line_count = 0;
167 auto advance_and_check = [&]() -> char {
168 if (!pb->advance()) {
169 throw std::runtime_error("premature end of the file at line " + std::to_string(line_count + 1));
170 }
171 return pb->get();
172 };
173
174 while (pb->valid()) {
175 char val = pb->get();
176 if (val != '"') {
177 throw std::runtime_error("name should start with a quote");
178 }
179
180 while (true) {
181 val = advance_and_check();
182 if (val == '"') {
183 val = advance_and_check();
184 if (val == '\n') {
185 ++nseq;
186 ++line_count;
187 pb->advance();
188 break;
189 } else if (val != '"') {
190 throw std::runtime_error("characters present after end quote at line " + std::to_string(line_count + 1));
191 }
192 } else if (val == '\n') {
193 ++line_count;
194 }
195 }
196 }
197
198 return nseq;
199}
200
201}
211inline void validate(const std::filesystem::path& path, const ObjectMetadata& metadata, Options& options) {
212 const std::string type_name = "sequence_string_set"; // use a separate variable to avoid dangling reference warnings from GCC.
213 const auto& obj = internal_json::extract_typed_object_from_metadata(metadata.other, type_name);
214
215 const std::string version_name = "version"; // again, avoid dangling reference warnings.
216 const auto& vstring = internal_json::extract_string_from_typed_object(obj, version_name, type_name);
217 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true);
218 if (version.major != 1) {
219 throw std::runtime_error("unsupported version string '" + vstring + "'");
220 }
221
222 size_t expected_nseq = 0;
223 {
224 auto lIt = obj.find("length");
225 if (lIt == obj.end()) {
226 throw std::runtime_error("expected a 'sequence_string_set.length' property");
227 }
228
229 const auto& val = lIt->second;
230 if (val->type() != millijson::NUMBER) {
231 throw std::runtime_error("'sequence_string_set.length' property should be a JSON number");
232 }
233
234 auto num = reinterpret_cast<const millijson::Number*>(val.get())->value();
235 if (num < 0 || std::floor(num) != num) {
236 throw std::runtime_error("'sequence_string_set.length' should be a non-negative integer");
237 }
238
239 expected_nseq = num;
240 }
241
242 std::array<bool, 255> allowed;
243 std::fill(allowed.begin(), allowed.end(), false);
244 {
245 const std::string stype_name = "sequence_type"; // avoid dangling reference again.
246 const std::string& stype = internal_json::extract_string(obj, stype_name, [&](std::exception& e) -> void {
247 throw std::runtime_error("failed to extract 'sequence_string_set." + stype_name + "' from the object metadata; " + std::string(e.what()));
248 });
249
250 std::string allowable;
251 if (stype == "DNA" || stype == "RNA") {
252 allowable = "ACGRYSWKMBDHVN";
253 if (stype == "DNA") {
254 allowable += "T";
255 } else {
256 allowable += "U";
257 }
258 } else if (stype == "AA") {
259 allowable = "ACDEFGHIKLMNPQRSTVWY";
260 } else if (stype == "custom") {
261 std::fill(allowed.begin() + internal::char2int('!'), allowed.begin() + internal::char2int('~') + 1, true);
262 } else {
263 throw std::runtime_error("invalid string '" + stype + "' in the 'sequence_string_set." + stype_name + "' property");
264 }
265
266 for (auto a : allowable) {
267 allowed[internal::char2int(a)] = true;
268 allowed[internal::char2int(std::tolower(a))] = true;
269 }
270 allowed[internal::char2int('.')] = true;
271 allowed[internal::char2int('-')] = true;
272 }
273
274 bool has_qualities = false;
275 char lowest_quality = 0;
276 {
277 auto xIt = obj.find("quality_type");
278 if (xIt != obj.end()) {
279 const auto& val = xIt->second;
280 if (val->type() != millijson::STRING) {
281 throw std::runtime_error("'sequence_string_set.quality_type' property should be a JSON string");
282 }
283
284 const auto& qtype = reinterpret_cast<const millijson::String*>(val.get())->value();
285 has_qualities = true;
286
287 if (qtype == "phred") {
288 auto oIt = obj.find("quality_offset");
289 if (oIt == obj.end()) {
290 throw std::runtime_error("expected a 'sequence_string_set.quality_offset' property for Phred quality scores");
291 }
292
293 const auto& val = oIt->second;
294 if (val->type() != millijson::NUMBER) {
295 throw std::runtime_error("'sequence_string_set.quality_offset' property should be a JSON number");
296 }
297
298 double offset = reinterpret_cast<const millijson::Number*>(val.get())->value();
299 if (offset == 33) {
300 lowest_quality = '!';
301 } else if (offset == 64) {
302 lowest_quality = '@';
303 } else {
304 throw std::runtime_error("'sequence_string_set.quality_offset' property should be either 33 or 64");
305 }
306
307 } else if (qtype == "solexa") {
308 lowest_quality = ';';
309
310 } else if (qtype == "none") {
311 has_qualities = false;
312
313 } else {
314 throw std::runtime_error("invalid string '" + qtype + "' for the 'sequence_string_set.quality_type' property");
315 }
316 }
317 }
318
319 size_t nseq = 0;
320 if (has_qualities) {
321 auto spath = path / "sequences.fastq.gz";
322 nseq = internal::parse_sequences<true>(spath, allowed, lowest_quality, options.parallel_reads);
323 } else {
324 auto spath = path / "sequences.fasta.gz";
325 nseq = internal::parse_sequences<false>(spath, allowed, lowest_quality, options.parallel_reads);
326 }
327 if (nseq != expected_nseq) {
328 throw std::runtime_error("observed number of sequences is different from the expected number (" + std::to_string(nseq) + " to " + std::to_string(expected_nseq) + ")");
329 }
330
331 auto npath = path / "names.txt.gz";
332 if (std::filesystem::exists(npath)) {
333 size_t nnames = internal::parse_names(npath, options.parallel_reads);
334 if (nnames != expected_nseq) {
335 throw std::runtime_error("number of names is different from the number of sequences (" + std::to_string(nnames) + " to " + std::to_string(expected_nseq) + ")");
336 }
337 }
338
339 internal_other::validate_mcols(path, "sequence_annotations", nseq, options);
340 internal_other::validate_metadata(path, "other_annotations", options);
341}
342
349inline size_t height([[maybe_unused]] const std::filesystem::path& path, const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
350 const std::string type_name = "sequence_string_set"; // use a separate variable to avoid dangling reference warnings from GCC.
351 const auto& obj = internal_json::extract_typed_object_from_metadata(metadata.other, type_name);
352 auto lIt = obj.find("length");
353 const auto& val = lIt->second;
354 return reinterpret_cast<const millijson::Number*>(val.get())->value();
355}
356
357}
358
359}
360
361#endif
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition sequence_string_set.hpp:349
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition sequence_string_set.hpp:211
takane validation functions.
Definition _derived_from.hpp:15
Object metadata, including the type and other fields.
Definition utils_public.hpp:26
std::unordered_map< std::string, std::shared_ptr< millijson::Base > > other
Definition utils_public.hpp:35
Validation options.
Definition utils_public.hpp:94
bool parallel_reads
Definition utils_public.hpp:98