1#ifndef TAKANE_SEQUENCE_STRING_SET_HPP
2#define TAKANE_SEQUENCE_STRING_SET_HPP
4#include "byteme/byteme.hpp"
6#include "ritsuko/ritsuko.hpp"
7#include "utils_other.hpp"
27namespace sequence_string_set {
/**
 * Convert a `char` into a non-negative offset suitable for indexing a lookup table.
 *
 * The offset is measured from the smallest value representable by `char`, so the
 * result is 0 for `std::numeric_limits<char>::min()` regardless of whether `char`
 * is signed or unsigned on the current platform.
 *
 * @param val Character to convert.
 * @return Distance of `val` from the minimum `char` value; always >= 0.
 */
inline int char2int(char val) {
    // Widen both operands to int before subtracting so a signed char cannot overflow.
    return static_cast<int>(val) - static_cast<int>(std::numeric_limits<char>::min());
}
// Scans the Gzip-compressed sequence file at `path` one byte at a time and validates its contents.
//
// Template parameters:
//   has_quality_ - when false, the "sequence name should start with '>'" check is compiled in;
//                  the '@' message and the quality-string checks further down presumably belong
//                  to the has_quality_ == true case (the matching `else` is not visible here).
//   parallel_    - selects byteme::PerByteParallel<> instead of byteme::PerByte<> as the reader type PB.
//
// Parameters:
//   path           - file opened through internal_other::open_reader<byteme::GzipFileReader>.
//   allowed        - lookup table indexed by char2int(); a false entry makes that sequence character forbidden.
//   lowest_quality - quality characters comparing less than this value are rejected.
//
// Returns a size_t; the return statement is not visible in this listing - presumably the number of
// sequences read (TODO confirm against the full file).
//
// Throws std::runtime_error for every validation failure; most messages report `line_count + 1`.
//
// NOTE(review): this listing is missing lines (declarations of `val`, `pb`, `nseq`, `empty`,
// `proposed`, loop headers and closing braces are not shown).
// NOTE(review): `allowed` has 255 entries but char2int() yields max(char) - min(char) for the largest
// char, which is 255 for an 8-bit char - one past the last valid index. Confirm whether the table
// should hold 256 entries.
38template<
bool has_quality_,
bool parallel_>
39size_t parse_sequences(
const std::filesystem::path& path, std::array<bool, 255> allowed,
char lowest_quality) {
40 auto gzreader = internal_other::open_reader<byteme::GzipFileReader>(path);
41 typedef typename std::conditional<parallel_, byteme::PerByteParallel<>, byteme::PerByte<> >::type PB;
// Counter used only for error reporting in the visible code (messages print line_count + 1).
45 size_t line_count = 0;
// Helper returning the next character; the visible part throws when the file ends prematurely.
46 auto advance_and_check = [&]() ->
char {
48 throw std::runtime_error(
"premature end of the file at line " + std::to_string(line_count + 1));
// The expected leading character of a sequence name depends on has_quality_.
56 if constexpr(!has_quality_) {
58 throw std::runtime_error(
"sequence name should start with '>' at line " + std::to_string(line_count + 1));
62 throw std::runtime_error(
"sequence name should start with '@' at line " + std::to_string(line_count + 1));
66 val = advance_and_check();
// The name is accumulated digit by digit into `proposed`; any non-digit is an error.
70 if (!std::isdigit(val)) {
71 throw std::runtime_error(
"sequence name should be a non-negative integer at line " + std::to_string(line_count + 1));
75 proposed += (val -
'0');
76 val = advance_and_check();
// An empty name, or a name different from `nseq`, is rejected.
78 if (empty || proposed != nseq) {
79 throw std::runtime_error(
"sequence name should be its index at line " + std::to_string(line_count + 1));
83 if constexpr(!has_quality_) {
85 val = advance_and_check();
// Each sequence character must be flagged true in `allowed`.
97 if (!allowed[char2int(val)]) {
98 throw std::runtime_error(
"forbidden character '" + std::string(1, val) +
"' in sequence at line " + std::to_string(line_count + 1));
100 val = advance_and_check();
106 val = advance_and_check();
// seq_length is compared with qual_length further down.
107 size_t seq_length = 0;
111 val = advance_and_check();
116 if (!allowed[char2int(val)]) {
117 throw std::runtime_error(
"forbidden character '" + std::string(1, val) +
"' in sequence at line " + std::to_string(line_count + 1));
120 val = advance_and_check();
126 val = advance_and_check();
127 }
while (val !=
'\n');
134 size_t qual_length = 0;
136 val = advance_and_check();
// Once qual_length has reached seq_length, consecutive newline characters are consumed.
139 if (qual_length >= seq_length) {
140 while (pb.advance() && pb.get() ==
'\n') {}
// Quality characters below lowest_quality are out of range.
144 if (val < lowest_quality) {
145 throw std::runtime_error(
"out-of-range quality score '" + std::string(1, val) +
"' detected at line " + std::to_string(line_count + 1));
// NOTE(review): the message below ends with ")" but contains no opening "(" - looks like a leftover.
151 if (qual_length != seq_length) {
152 throw std::runtime_error(
"unequal lengths for quality and sequence strings at line " + std::to_string(line_count + 1) +
")");
// Scans the Gzip-compressed names file at `path` one byte at a time and validates its quoting.
//
// Template parameters:
//   parallel_ - selects byteme::PerByteParallel<> instead of byteme::PerByte<> as the reader type PB.
//
// Parameters:
//   path - file opened through internal_other::open_reader<byteme::GzipFileReader>.
//
// Returns a size_t; the return statement is not visible in this listing - presumably the number of
// names read (TODO confirm against the full file).
//
// Throws std::runtime_error when the file ends prematurely, when a name does not start with a quote,
// or when characters follow a closing quote.
//
// NOTE(review): this listing is missing lines (the declaration of `val`, loop headers and closing
// braces are not shown), so the control flow below is only partially visible.
162template<
bool parallel_>
163size_t parse_names(
const std::filesystem::path& path) {
164 auto gzreader = internal_other::open_reader<byteme::GzipFileReader>(path);
165 typedef typename std::conditional<parallel_, byteme::PerByteParallel<>, byteme::PerByte<> >::type PB;
// Counter used for error reporting in the visible code (messages print line_count + 1).
169 size_t line_count = 0;
// Helper returning the next character; the visible part throws when the file ends prematurely.
170 auto advance_and_check = [&]() ->
char {
172 throw std::runtime_error(
"premature end of the file at line " + std::to_string(line_count + 1));
// NOTE(review): unlike the other errors in this function, this message carries no line number.
180 throw std::runtime_error(
"name should start with a quote");
184 val = advance_and_check();
186 val = advance_and_check();
192 }
else if (val !=
'"') {
193 throw std::runtime_error(
"characters present after end quote at line " + std::to_string(line_count + 1));
195 }
else if (val ==
'\n') {
215 const auto& obj = internal_json::extract_typed_object_from_metadata(metadata.
other,
"sequence_string_set");
216 const auto& vstring = internal_json::extract_string_from_typed_object(obj,
"version",
"sequence_string_set");
217 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(),
true);
218 if (version.major != 1) {
219 throw std::runtime_error(
"unsupported version string '" + vstring +
"'");
222 size_t expected_nseq = 0;
224 auto lIt = obj.find(
"length");
225 if (lIt == obj.end()) {
226 throw std::runtime_error(
"expected a 'sequence_string_set.length' property");
229 const auto& val = lIt->second;
230 if (val->type() != millijson::NUMBER) {
231 throw std::runtime_error(
"'sequence_string_set.length' property should be a JSON number");
234 auto num =
reinterpret_cast<const millijson::Number*
>(val.get())->value;
235 if (num < 0 || std::floor(num) != num) {
236 throw std::runtime_error(
"'sequence_string_set.length' should be a non-negative integer");
242 std::array<bool, 255> allowed;
243 std::fill(allowed.begin(), allowed.end(),
false);
245 const std::string& stype = internal_json::extract_string(obj,
"sequence_type", [&](std::exception& e) ->
void {
246 throw std::runtime_error(
"failed to extract 'sequence_string_set.sequence_type' from the object metadata; " + std::string(e.what()));
249 std::string allowable;
250 if (stype ==
"DNA" || stype ==
"RNA") {
251 allowable =
"ACGRYSWKMBDHVN";
252 if (stype ==
"DNA") {
257 }
else if (stype ==
"AA") {
258 allowable =
"ACDEFGHIKLMNPQRSTVWY";
259 }
else if (stype ==
"custom") {
260 std::fill(allowed.begin() + internal::char2int(
'!'), allowed.begin() + internal::char2int(
'~') + 1,
true);
262 throw std::runtime_error(
"invalid string '" + stype +
"' in the 'sequence_string_set.sequence_type' property");
265 for (
auto a : allowable) {
266 allowed[internal::char2int(a)] =
true;
267 allowed[internal::char2int(std::tolower(a))] =
true;
269 allowed[internal::char2int(
'.')] =
true;
270 allowed[internal::char2int(
'-')] =
true;
273 bool has_qualities =
false;
274 char lowest_quality = 0;
276 auto xIt = obj.find(
"quality_type");
277 if (xIt != obj.end()) {
278 const auto& val = xIt->second;
279 if (val->type() != millijson::STRING) {
280 throw std::runtime_error(
"'sequence_string_set.quality_type' property should be a JSON string");
283 const auto& qtype =
reinterpret_cast<const millijson::String*
>(val.get())->value;
284 has_qualities =
true;
286 if (qtype ==
"phred") {
287 auto oIt = obj.find(
"quality_offset");
288 if (oIt == obj.end()) {
289 throw std::runtime_error(
"expected a 'sequence_string_set.quality_offset' property for Phred quality scores");
292 const auto& val = oIt->second;
293 if (val->type() != millijson::NUMBER) {
294 throw std::runtime_error(
"'sequence_string_set.quality_offset' property should be a JSON number");
297 double offset =
reinterpret_cast<const millijson::Number*
>(val.get())->value;
299 lowest_quality =
'!';
300 }
else if (offset == 64) {
301 lowest_quality =
'@';
303 throw std::runtime_error(
"'sequence_string_set.quality_offset' property should be either 33 or 64");
306 }
else if (qtype ==
"solexa") {
307 lowest_quality =
';';
309 }
else if (qtype ==
"none") {
310 has_qualities =
false;
313 throw std::runtime_error(
"invalid string '" + qtype +
"' for the 'sequence_string_set.quality_type' property");
320 auto spath = path /
"sequences.fastq.gz";
322 nseq = internal::parse_sequences<true, true>(spath, allowed, lowest_quality);
324 nseq = internal::parse_sequences<true, false>(spath, allowed, lowest_quality);
327 auto spath = path /
"sequences.fasta.gz";
329 nseq = internal::parse_sequences<false, true>(spath, allowed, lowest_quality);
331 nseq = internal::parse_sequences<false, false>(spath, allowed, lowest_quality);
334 if (nseq != expected_nseq) {
335 throw std::runtime_error(
"observed number of sequences is different from the expected number (" + std::to_string(nseq) +
" to " + std::to_string(expected_nseq) +
")");
338 auto npath = path /
"names.txt.gz";
339 if (std::filesystem::exists(npath)) {
342 nnames = internal::parse_names<true>(npath);
344 nnames = internal::parse_names<false>(npath);
346 if (nnames != expected_nseq) {
347 throw std::runtime_error(
"number of names is different from the number of sequences (" + std::to_string(nnames) +
" to " + std::to_string(expected_nseq) +
")");
351 internal_other::validate_mcols(path,
"sequence_annotations", nseq, options);
352 internal_other::validate_metadata(path,
"other_annotations", options);
362 const auto& obj = internal_json::extract_typed_object_from_metadata(metadata.
other,
"sequence_string_set");
363 auto lIt = obj.find(
"length");
364 const auto& val = lIt->second;
365 return reinterpret_cast<const millijson::Number*
>(val.get())->value;
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition sequence_string_set.hpp:361
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition sequence_string_set.hpp:214
takane validation functions.
Definition _derived_from.hpp:15
Validation options.
Definition utils_public.hpp:94
bool parallel_reads
Definition utils_public.hpp:98