1#ifndef TAKANE_SEQUENCE_STRING_SET_HPP
2#define TAKANE_SEQUENCE_STRING_SET_HPP
4#include "byteme/byteme.hpp"
6#include "ritsuko/ritsuko.hpp"
7#include "utils_other.hpp"
27namespace sequence_string_set {
34inline int char2int(
char val) {
35 return static_cast<int>(val) -
static_cast<int>(std::numeric_limits<char>::min());
38template<
bool has_quality_>
39size_t parse_sequences(
const std::filesystem::path& path, std::array<bool, 255> allowed,
char lowest_quality,
bool parallel) {
40 auto gzreader = internal_other::open_reader<byteme::GzipFileReader>(path, byteme::GzipFileReaderOptions());
41 auto pb = internal_other::wrap_reader_for_bytes<char>(std::move(gzreader), parallel);
44 size_t line_count = 0;
45 auto advance_and_check = [&]() ->
char {
47 throw std::runtime_error(
"premature end of the file at line " + std::to_string(line_count + 1));
55 if constexpr(!has_quality_) {
57 throw std::runtime_error(
"sequence name should start with '>' at line " + std::to_string(line_count + 1));
61 throw std::runtime_error(
"sequence name should start with '@' at line " + std::to_string(line_count + 1));
65 val = advance_and_check();
69 if (!std::isdigit(val)) {
70 throw std::runtime_error(
"sequence name should be a non-negative integer at line " + std::to_string(line_count + 1));
74 proposed += (val -
'0');
75 val = advance_and_check();
77 if (empty || proposed != nseq) {
78 throw std::runtime_error(
"sequence name should be its index at line " + std::to_string(line_count + 1));
82 if constexpr(!has_quality_) {
84 val = advance_and_check();
96 if (!allowed[char2int(val)]) {
97 throw std::runtime_error(
"forbidden character '" + std::string(1, val) +
"' in sequence at line " + std::to_string(line_count + 1));
99 val = advance_and_check();
105 val = advance_and_check();
106 size_t seq_length = 0;
110 val = advance_and_check();
115 if (!allowed[char2int(val)]) {
116 throw std::runtime_error(
"forbidden character '" + std::string(1, val) +
"' in sequence at line " + std::to_string(line_count + 1));
119 val = advance_and_check();
125 val = advance_and_check();
126 }
while (val !=
'\n');
133 size_t qual_length = 0;
135 val = advance_and_check();
138 if (qual_length >= seq_length) {
139 while (pb->advance() && pb->get() ==
'\n') {}
143 if (val < lowest_quality) {
144 throw std::runtime_error(
"out-of-range quality score '" + std::string(1, val) +
"' detected at line " + std::to_string(line_count + 1));
150 if (qual_length != seq_length) {
151 throw std::runtime_error(
"unequal lengths for quality and sequence strings at line " + std::to_string(line_count + 1) +
")");
161inline size_t parse_names(
const std::filesystem::path& path,
bool parallel) {
162 auto gzreader = internal_other::open_reader<byteme::GzipFileReader>(path, byteme::GzipFileReaderOptions());
163 auto pb = internal_other::wrap_reader_for_bytes<char>(std::move(gzreader), parallel);
166 size_t line_count = 0;
167 auto advance_and_check = [&]() ->
char {
168 if (!pb->advance()) {
169 throw std::runtime_error(
"premature end of the file at line " + std::to_string(line_count + 1));
174 while (pb->valid()) {
175 char val = pb->get();
177 throw std::runtime_error(
"name should start with a quote");
181 val = advance_and_check();
183 val = advance_and_check();
189 }
else if (val !=
'"') {
190 throw std::runtime_error(
"characters present after end quote at line " + std::to_string(line_count + 1));
192 }
else if (val ==
'\n') {
212 const std::string type_name =
"sequence_string_set";
213 const auto& obj = internal_json::extract_typed_object_from_metadata(metadata.
other, type_name);
215 const std::string version_name =
"version";
216 const auto& vstring = internal_json::extract_string_from_typed_object(obj, version_name, type_name);
217 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(),
true);
218 if (version.major != 1) {
219 throw std::runtime_error(
"unsupported version string '" + vstring +
"'");
222 size_t expected_nseq = 0;
224 auto lIt = obj.find(
"length");
225 if (lIt == obj.end()) {
226 throw std::runtime_error(
"expected a 'sequence_string_set.length' property");
229 const auto& val = lIt->second;
230 if (val->type() != millijson::NUMBER) {
231 throw std::runtime_error(
"'sequence_string_set.length' property should be a JSON number");
234 auto num =
reinterpret_cast<const millijson::Number*
>(val.get())->value();
235 if (num < 0 || std::floor(num) != num) {
236 throw std::runtime_error(
"'sequence_string_set.length' should be a non-negative integer");
242 std::array<bool, 255> allowed;
243 std::fill(allowed.begin(), allowed.end(),
false);
245 const std::string stype_name =
"sequence_type";
246 const std::string& stype = internal_json::extract_string(obj, stype_name, [&](std::exception& e) ->
void {
247 throw std::runtime_error(
"failed to extract 'sequence_string_set." + stype_name +
"' from the object metadata; " + std::string(e.what()));
250 std::string allowable;
251 if (stype ==
"DNA" || stype ==
"RNA") {
252 allowable =
"ACGRYSWKMBDHVN";
253 if (stype ==
"DNA") {
258 }
else if (stype ==
"AA") {
259 allowable =
"ACDEFGHIKLMNPQRSTVWY";
260 }
else if (stype ==
"custom") {
261 std::fill(allowed.begin() + internal::char2int(
'!'), allowed.begin() + internal::char2int(
'~') + 1,
true);
263 throw std::runtime_error(
"invalid string '" + stype +
"' in the 'sequence_string_set." + stype_name +
"' property");
266 for (
auto a : allowable) {
267 allowed[internal::char2int(a)] =
true;
268 allowed[internal::char2int(std::tolower(a))] =
true;
270 allowed[internal::char2int(
'.')] =
true;
271 allowed[internal::char2int(
'-')] =
true;
274 bool has_qualities =
false;
275 char lowest_quality = 0;
277 auto xIt = obj.find(
"quality_type");
278 if (xIt != obj.end()) {
279 const auto& val = xIt->second;
280 if (val->type() != millijson::STRING) {
281 throw std::runtime_error(
"'sequence_string_set.quality_type' property should be a JSON string");
284 const auto& qtype =
reinterpret_cast<const millijson::String*
>(val.get())->value();
285 has_qualities =
true;
287 if (qtype ==
"phred") {
288 auto oIt = obj.find(
"quality_offset");
289 if (oIt == obj.end()) {
290 throw std::runtime_error(
"expected a 'sequence_string_set.quality_offset' property for Phred quality scores");
293 const auto& val = oIt->second;
294 if (val->type() != millijson::NUMBER) {
295 throw std::runtime_error(
"'sequence_string_set.quality_offset' property should be a JSON number");
298 double offset =
reinterpret_cast<const millijson::Number*
>(val.get())->value();
300 lowest_quality =
'!';
301 }
else if (offset == 64) {
302 lowest_quality =
'@';
304 throw std::runtime_error(
"'sequence_string_set.quality_offset' property should be either 33 or 64");
307 }
else if (qtype ==
"solexa") {
308 lowest_quality =
';';
310 }
else if (qtype ==
"none") {
311 has_qualities =
false;
314 throw std::runtime_error(
"invalid string '" + qtype +
"' for the 'sequence_string_set.quality_type' property");
321 auto spath = path /
"sequences.fastq.gz";
322 nseq = internal::parse_sequences<true>(spath, allowed, lowest_quality, options.
parallel_reads);
324 auto spath = path /
"sequences.fasta.gz";
325 nseq = internal::parse_sequences<false>(spath, allowed, lowest_quality, options.
parallel_reads);
327 if (nseq != expected_nseq) {
328 throw std::runtime_error(
"observed number of sequences is different from the expected number (" + std::to_string(nseq) +
" to " + std::to_string(expected_nseq) +
")");
331 auto npath = path /
"names.txt.gz";
332 if (std::filesystem::exists(npath)) {
333 size_t nnames = internal::parse_names(npath, options.
parallel_reads);
334 if (nnames != expected_nseq) {
335 throw std::runtime_error(
"number of names is different from the number of sequences (" + std::to_string(nnames) +
" to " + std::to_string(expected_nseq) +
")");
339 internal_other::validate_mcols(path,
"sequence_annotations", nseq, options);
340 internal_other::validate_metadata(path,
"other_annotations", options);
350 const std::string type_name =
"sequence_string_set";
351 const auto& obj = internal_json::extract_typed_object_from_metadata(metadata.
other, type_name);
352 auto lIt = obj.find(
"length");
353 const auto& val = lIt->second;
354 return reinterpret_cast<const millijson::Number*
>(val.get())->value();
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition sequence_string_set.hpp:349
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition sequence_string_set.hpp:211
takane validation functions.
Definition _derived_from.hpp:15
Validation options.
Definition utils_public.hpp:94
bool parallel_reads
Definition utils_public.hpp:98