1#ifndef TAKANE_SEQUENCE_STRING_SET_HPP
2#define TAKANE_SEQUENCE_STRING_SET_HPP
4#include "byteme/byteme.hpp"
6#include "ritsuko/ritsuko.hpp"
7#include "utils_other.hpp"
27namespace sequence_string_set {
34inline int char2int(
char val) {
35 return static_cast<int>(val) -
static_cast<int>(std::numeric_limits<char>::min());
38template<
bool has_quality_,
bool parallel_>
39size_t parse_sequences(
const std::filesystem::path& path, std::array<bool, 255> allowed,
char lowest_quality) {
40 auto gzreader = internal_other::open_reader<byteme::GzipFileReader>(path);
41 typedef typename std::conditional<parallel_, byteme::PerByteParallel<>, byteme::PerByte<> >::type PB;
45 size_t line_count = 0;
46 auto advance_and_check = [&]() ->
char {
48 throw std::runtime_error(
"premature end of the file at line " + std::to_string(line_count + 1));
56 if constexpr(!has_quality_) {
58 throw std::runtime_error(
"sequence name should start with '>' at line " + std::to_string(line_count + 1));
62 throw std::runtime_error(
"sequence name should start with '@' at line " + std::to_string(line_count + 1));
66 val = advance_and_check();
70 if (!std::isdigit(val)) {
71 throw std::runtime_error(
"sequence name should be a non-negative integer at line " + std::to_string(line_count + 1));
75 proposed += (val -
'0');
76 val = advance_and_check();
78 if (empty || proposed != nseq) {
79 throw std::runtime_error(
"sequence name should be its index at line " + std::to_string(line_count + 1));
83 if constexpr(!has_quality_) {
85 val = advance_and_check();
97 if (!allowed[char2int(val)]) {
98 throw std::runtime_error(
"forbidden character '" + std::string(1, val) +
"' in sequence at line " + std::to_string(line_count + 1));
100 val = advance_and_check();
106 val = advance_and_check();
107 size_t seq_length = 0;
111 val = advance_and_check();
116 if (!allowed[char2int(val)]) {
117 throw std::runtime_error(
"forbidden character '" + std::string(1, val) +
"' in sequence at line " + std::to_string(line_count + 1));
120 val = advance_and_check();
126 val = advance_and_check();
127 }
while (val !=
'\n');
134 size_t qual_length = 0;
136 val = advance_and_check();
139 if (qual_length >= seq_length) {
140 while (pb.advance() && pb.get() ==
'\n') {}
144 if (val < lowest_quality) {
145 throw std::runtime_error(
"out-of-range quality score '" + std::string(1, val) +
"' detected at line " + std::to_string(line_count + 1));
151 if (qual_length != seq_length) {
152 throw std::runtime_error(
"unequal lengths for quality and sequence strings at line " + std::to_string(line_count + 1) +
")");
162template<
bool parallel_>
163size_t parse_names(
const std::filesystem::path& path) {
164 auto gzreader = internal_other::open_reader<byteme::GzipFileReader>(path);
165 typedef typename std::conditional<parallel_, byteme::PerByteParallel<>, byteme::PerByte<> >::type PB;
169 size_t line_count = 0;
170 auto advance_and_check = [&]() ->
char {
172 throw std::runtime_error(
"premature end of the file at line " + std::to_string(line_count + 1));
180 throw std::runtime_error(
"name should start with a quote");
184 val = advance_and_check();
186 val = advance_and_check();
192 }
else if (val !=
'"') {
193 throw std::runtime_error(
"characters present after end quote at line " + std::to_string(line_count + 1));
195 }
else if (val ==
'\n') {
215 const std::string type_name =
"sequence_string_set";
216 const auto& obj = internal_json::extract_typed_object_from_metadata(metadata.
other, type_name);
218 const std::string version_name =
"version";
219 const auto& vstring = internal_json::extract_string_from_typed_object(obj, version_name, type_name);
220 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(),
true);
221 if (version.major != 1) {
222 throw std::runtime_error(
"unsupported version string '" + vstring +
"'");
225 size_t expected_nseq = 0;
227 auto lIt = obj.find(
"length");
228 if (lIt == obj.end()) {
229 throw std::runtime_error(
"expected a 'sequence_string_set.length' property");
232 const auto& val = lIt->second;
233 if (val->type() != millijson::NUMBER) {
234 throw std::runtime_error(
"'sequence_string_set.length' property should be a JSON number");
237 auto num =
reinterpret_cast<const millijson::Number*
>(val.get())->value;
238 if (num < 0 || std::floor(num) != num) {
239 throw std::runtime_error(
"'sequence_string_set.length' should be a non-negative integer");
245 std::array<bool, 255> allowed;
246 std::fill(allowed.begin(), allowed.end(),
false);
248 const std::string stype_name =
"sequence_type";
249 const std::string& stype = internal_json::extract_string(obj, stype_name, [&](std::exception& e) ->
void {
250 throw std::runtime_error(
"failed to extract 'sequence_string_set." + stype_name +
"' from the object metadata; " + std::string(e.what()));
253 std::string allowable;
254 if (stype ==
"DNA" || stype ==
"RNA") {
255 allowable =
"ACGRYSWKMBDHVN";
256 if (stype ==
"DNA") {
261 }
else if (stype ==
"AA") {
262 allowable =
"ACDEFGHIKLMNPQRSTVWY";
263 }
else if (stype ==
"custom") {
264 std::fill(allowed.begin() + internal::char2int(
'!'), allowed.begin() + internal::char2int(
'~') + 1,
true);
266 throw std::runtime_error(
"invalid string '" + stype +
"' in the 'sequence_string_set." + stype_name +
"' property");
269 for (
auto a : allowable) {
270 allowed[internal::char2int(a)] =
true;
271 allowed[internal::char2int(std::tolower(a))] =
true;
273 allowed[internal::char2int(
'.')] =
true;
274 allowed[internal::char2int(
'-')] =
true;
277 bool has_qualities =
false;
278 char lowest_quality = 0;
280 auto xIt = obj.find(
"quality_type");
281 if (xIt != obj.end()) {
282 const auto& val = xIt->second;
283 if (val->type() != millijson::STRING) {
284 throw std::runtime_error(
"'sequence_string_set.quality_type' property should be a JSON string");
287 const auto& qtype =
reinterpret_cast<const millijson::String*
>(val.get())->value;
288 has_qualities =
true;
290 if (qtype ==
"phred") {
291 auto oIt = obj.find(
"quality_offset");
292 if (oIt == obj.end()) {
293 throw std::runtime_error(
"expected a 'sequence_string_set.quality_offset' property for Phred quality scores");
296 const auto& val = oIt->second;
297 if (val->type() != millijson::NUMBER) {
298 throw std::runtime_error(
"'sequence_string_set.quality_offset' property should be a JSON number");
301 double offset =
reinterpret_cast<const millijson::Number*
>(val.get())->value;
303 lowest_quality =
'!';
304 }
else if (offset == 64) {
305 lowest_quality =
'@';
307 throw std::runtime_error(
"'sequence_string_set.quality_offset' property should be either 33 or 64");
310 }
else if (qtype ==
"solexa") {
311 lowest_quality =
';';
313 }
else if (qtype ==
"none") {
314 has_qualities =
false;
317 throw std::runtime_error(
"invalid string '" + qtype +
"' for the 'sequence_string_set.quality_type' property");
324 auto spath = path /
"sequences.fastq.gz";
326 nseq = internal::parse_sequences<true, true>(spath, allowed, lowest_quality);
328 nseq = internal::parse_sequences<true, false>(spath, allowed, lowest_quality);
331 auto spath = path /
"sequences.fasta.gz";
333 nseq = internal::parse_sequences<false, true>(spath, allowed, lowest_quality);
335 nseq = internal::parse_sequences<false, false>(spath, allowed, lowest_quality);
338 if (nseq != expected_nseq) {
339 throw std::runtime_error(
"observed number of sequences is different from the expected number (" + std::to_string(nseq) +
" to " + std::to_string(expected_nseq) +
")");
342 auto npath = path /
"names.txt.gz";
343 if (std::filesystem::exists(npath)) {
346 nnames = internal::parse_names<true>(npath);
348 nnames = internal::parse_names<false>(npath);
350 if (nnames != expected_nseq) {
351 throw std::runtime_error(
"number of names is different from the number of sequences (" + std::to_string(nnames) +
" to " + std::to_string(expected_nseq) +
")");
355 internal_other::validate_mcols(path,
"sequence_annotations", nseq, options);
356 internal_other::validate_metadata(path,
"other_annotations", options);
366 const auto& obj = internal_json::extract_typed_object_from_metadata(metadata.
other,
"sequence_string_set");
367 auto lIt = obj.find(
"length");
368 const auto& val = lIt->second;
369 return reinterpret_cast<const millijson::Number*
>(val.get())->value;
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition sequence_string_set.hpp:365
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition sequence_string_set.hpp:214
takane validation functions.
Definition _derived_from.hpp:15
Validation options.
Definition utils_public.hpp:94
bool parallel_reads
Definition utils_public.hpp:98