takane
Validators for ArtifactDB file formats
Loading...
Searching...
No Matches
genomic_ranges.hpp
Go to the documentation of this file.
1#ifndef TAKANE_GENOMIC_RANGES_HPP
2#define TAKANE_GENOMIC_RANGES_HPP
3
4#include "H5Cpp.h"
5#include "ritsuko/ritsuko.hpp"
6#include "ritsuko/hdf5/hdf5.hpp"
7
8#include <string>
9#include <filesystem>
10#include <stdexcept>
11#include <cstdint>
12#include <type_traits>
13#include <limits>
14
15#include "utils_string.hpp"
16#include "utils_public.hpp"
17#include "utils_other.hpp"
18#include "utils_json.hpp"
19
25namespace takane {
26
30void validate(const std::filesystem::path&, const ObjectMetadata&, Options& options);
31bool derived_from(const std::string&, const std::string&, const Options& options);
40namespace genomic_ranges {
41
45namespace internal {
46
47struct SequenceLimits {
48 SequenceLimits(size_t n) : has_circular(n), circular(n), has_seqlen(n), seqlen(n) {}
49 std::vector<unsigned char> has_circular, circular, has_seqlen;
50 std::vector<uint64_t> seqlen;
51};
52
53inline SequenceLimits find_sequence_limits(const std::filesystem::path& path, Options& options) {
54 auto smeta = read_object_metadata(path);
55 if (!derived_from(smeta.type, "sequence_information", options)) {
56 throw std::runtime_error("'sequence_information' directory should contain a 'sequence_information' object");
57 }
58 ::takane::validate(path, smeta, options);
59
60 auto handle = ritsuko::hdf5::open_file(path / "info.h5");
61 auto ghandle = handle.openGroup("sequence_information");
62
63 auto lhandle = ghandle.openDataSet("length");
64 auto num_seq = ritsuko::hdf5::get_1d_length(lhandle.getSpace(), false);
65 ritsuko::hdf5::Stream1dNumericDataset<uint64_t> lstream(&lhandle, num_seq, options.hdf5_buffer_size);
66 auto lmissing = ritsuko::hdf5::open_and_load_optional_numeric_missing_placeholder<uint64_t>(lhandle, "missing-value-placeholder");
67
68 auto chandle = ghandle.openDataSet("circular");
69 ritsuko::hdf5::Stream1dNumericDataset<int32_t> cstream(&chandle, num_seq, options.hdf5_buffer_size);
70 auto cmissing = ritsuko::hdf5::open_and_load_optional_numeric_missing_placeholder<int32_t>(chandle, "missing-value-placeholder");
71
72 SequenceLimits output(num_seq);
73 for (size_t i = 0; i < num_seq; ++i, lstream.next(), cstream.next()) {
74 auto slen = lstream.get();
75 auto circ = cstream.get();
76 output.has_seqlen[i] = !(lmissing.first && lmissing.second == slen);
77 output.seqlen[i] = slen;
78 output.has_circular[i] = !(cmissing.first && cmissing.second == circ);
79 output.circular[i] = circ;
80 }
81
82 return output;
83}
84
85}
95inline void validate(const std::filesystem::path& path, const ObjectMetadata& metadata, Options& options) {
96 const auto& vstring = internal_json::extract_version_for_type(metadata.other, "genomic_ranges");
97 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true);
98 if (version.major != 1) {
99 throw std::runtime_error("unsupported version string '" + vstring + "'");
100 }
101
102 // Figuring out the sequence length constraints.
103 auto limits = internal::find_sequence_limits(path / "sequence_information", options);
104 size_t num_sequences = limits.seqlen.size();
105
106 // Now loading all three components.
107 auto handle = ritsuko::hdf5::open_file(path / "ranges.h5");
108 auto ghandle = ritsuko::hdf5::open_group(handle, "genomic_ranges");
109 auto id_handle = ritsuko::hdf5::open_dataset(ghandle, "sequence");
110 auto num_ranges = ritsuko::hdf5::get_1d_length(id_handle, false);
111 if (ritsuko::hdf5::exceeds_integer_limit(id_handle, 64, false)) {
112 throw std::runtime_error("expected 'sequence' to have a datatype that fits into a 64-bit unsigned integer");
113 }
114 ritsuko::hdf5::Stream1dNumericDataset<uint64_t> id_stream(&id_handle, num_ranges, options.hdf5_buffer_size);
115
116 auto start_handle = ritsuko::hdf5::open_dataset(ghandle, "start");
117 if (num_ranges != ritsuko::hdf5::get_1d_length(start_handle, false)) {
118 throw std::runtime_error("'start' and 'sequence' should have the same length");
119 }
120 if (ritsuko::hdf5::exceeds_integer_limit(start_handle, 64, true)) {
121 throw std::runtime_error("expected 'start' to have a datatype that fits into a 64-bit signed integer");
122 }
123 ritsuko::hdf5::Stream1dNumericDataset<int64_t> start_stream(&start_handle, num_ranges, options.hdf5_buffer_size);
124
125 auto width_handle = ritsuko::hdf5::open_dataset(ghandle, "width");
126 if (num_ranges != ritsuko::hdf5::get_1d_length(width_handle, false)) {
127 throw std::runtime_error("'width' and 'sequence' should have the same length");
128 }
129 if (ritsuko::hdf5::exceeds_integer_limit(width_handle, 64, false)) {
130 throw std::runtime_error("expected 'width' to have a datatype that fits into a 64-bit unsigned integer");
131 }
132 ritsuko::hdf5::Stream1dNumericDataset<uint64_t> width_stream(&width_handle, num_ranges, options.hdf5_buffer_size);
133
134 constexpr uint64_t end_limit = std::numeric_limits<int64_t>::max();
135 for (size_t i = 0; i < num_ranges; ++i, id_stream.next(), start_stream.next(), width_stream.next()) {
136 auto id = id_stream.get();
137 if (id >= num_sequences) {
138 throw std::runtime_error("'sequence' must be less than the number of sequences (got " + std::to_string(id) + ")");
139 }
140
141 auto start = start_stream.get();
142 auto width = width_stream.get();
143
144 // If it's definitely non-circular, the start position should be positive.
145 if (limits.has_circular[id] && !limits.circular[id]) {
146 if (start < 1) {
147 throw std::runtime_error("non-positive start position (" + std::to_string(start) + ") for non-circular sequence");
148 }
149
150 if (limits.has_seqlen[id]) {
151 // If the sequence length is provided, the end position shouldn't overflow.
152 auto spos = static_cast<uint64_t>(start);
153 auto limit = limits.seqlen[id];
154 if (spos > limit) {
155 throw std::runtime_error("start position beyond sequence length (" + std::to_string(start) + " > " + std::to_string(limit) + ") for non-circular sequence");
156 }
157
158 // The LHS should not overflow as 'spos >= 1' so 'limit - spos + 1' should still be no greater than 'limit'.
159 if (limit - spos + 1 < width) {
160 throw std::runtime_error("end position beyond sequence length (" +
161 std::to_string(start) + " + " + std::to_string(width) + " > " + std::to_string(limit) +
162 ") for non-circular sequence");
163 }
164 }
165 }
166
167 bool exceeded = false;
168 if (start > 0) {
169 // 'end_limit - start' is always non-negative as 'end_limit' is the largest value of an int64_t and 'start' is also int64_t.
170 exceeded = (end_limit - static_cast<uint64_t>(start) < width);
171 } else {
172 // 'end_limit - start' will not overflow a uint64_t, because 'end_limit' is the largest value of an int64_t and 'start' as also 'int64_t'.
173 exceeded = (end_limit + static_cast<uint64_t>(-start) < width);
174 }
175 if (exceeded) {
176 throw std::runtime_error("end position beyond the range of a 64-bit integer (" + std::to_string(start) + " + " + std::to_string(width) + ")");
177 }
178 }
179
180 {
181 auto strand_handle = ritsuko::hdf5::open_dataset(ghandle, "strand");
182 if (num_ranges != ritsuko::hdf5::get_1d_length(strand_handle, false)) {
183 throw std::runtime_error("'strand' and 'sequence' should have the same length");
184 }
185 if (ritsuko::hdf5::exceeds_integer_limit(strand_handle, 32, true)) {
186 throw std::runtime_error("expected 'strand' to have a datatype that fits into a 32-bit signed integer");
187 }
188
189 ritsuko::hdf5::Stream1dNumericDataset<int32_t> strand_stream(&strand_handle, num_ranges, options.hdf5_buffer_size);
190 for (hsize_t i = 0; i < num_ranges; ++i, strand_stream.next()) {
191 auto x = strand_stream.get();
192 if (x < -1 || x > 1) {
193 throw std::runtime_error("values of 'strand' should be one of 0, -1, or 1 (got " + std::to_string(x) + ")");
194 }
195 }
196 }
197
198 internal_other::validate_mcols(path, "range_annotations", num_ranges, options);
199 internal_other::validate_metadata(path, "other_annotations", options);
200
201 internal_string::validate_names(ghandle, "name", num_ranges, options.hdf5_buffer_size);
202}
203
210inline size_t height(const std::filesystem::path& path, [[maybe_unused]] const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
211 auto handle = ritsuko::hdf5::open_file(path / "ranges.h5");
212 auto ghandle = handle.openGroup("genomic_ranges");
213 auto dhandle = ghandle.openDataSet("sequence");
214 return ritsuko::hdf5::get_1d_length(dhandle, false);
215}
216
217}
218
219}
220
221#endif
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition genomic_ranges.hpp:95
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition genomic_ranges.hpp:210
takane validation functions.
Definition _derived_from.hpp:15
ObjectMetadata read_object_metadata(const std::filesystem::path &path)
Definition utils_public.hpp:74
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition _validate.hpp:107
bool derived_from(const std::string &type, const std::string &base, const Options &options)
Definition _derived_from.hpp:80
Object metadata, including the type and other fields.
Definition utils_public.hpp:26
std::unordered_map< std::string, std::shared_ptr< millijson::Base > > other
Definition utils_public.hpp:35
Validation options.
Definition utils_public.hpp:94
hsize_t hdf5_buffer_size
Definition utils_public.hpp:103
Exported utilities.