takane
Validators for ArtifactDB file formats
Loading...
Searching...
No Matches
genomic_ranges.hpp
Go to the documentation of this file.
1#ifndef TAKANE_GENOMIC_RANGES_HPP
2#define TAKANE_GENOMIC_RANGES_HPP
3
4#include "H5Cpp.h"
5#include "ritsuko/ritsuko.hpp"
6#include "ritsuko/hdf5/hdf5.hpp"
7
8#include <string>
9#include <filesystem>
10#include <stdexcept>
11#include <cstdint>
12#include <type_traits>
13#include <limits>
14
15#include "utils_string.hpp"
16#include "utils_public.hpp"
17#include "utils_other.hpp"
18#include "utils_json.hpp"
19
25namespace takane {
26
30void validate(const std::filesystem::path&, const ObjectMetadata&, Options& options);
31bool derived_from(const std::string&, const std::string&, const Options& options);
40namespace genomic_ranges {
41
45namespace internal {
46
47struct SequenceLimits {
48 SequenceLimits(size_t n) : has_circular(n), circular(n), has_seqlen(n), seqlen(n) {}
49 std::vector<unsigned char> has_circular, circular, has_seqlen;
50 std::vector<uint64_t> seqlen;
51};
52
53inline SequenceLimits find_sequence_limits(const std::filesystem::path& path, Options& options) {
54 const std::string type_name = "sequence_information";
55 auto smeta = read_object_metadata(path);
56 if (!derived_from(smeta.type, type_name, options)) {
57 throw std::runtime_error("'sequence_information' directory should contain a 'sequence_information' object");
58 }
59 ::takane::validate(path, smeta, options);
60
61 auto handle = ritsuko::hdf5::open_file(path / "info.h5");
62 auto ghandle = handle.openGroup(type_name.c_str());
63
64 const char* missing_attr_name = "missing-value-placeholder";
65
66 auto lhandle = ghandle.openDataSet("length");
67 auto num_seq = ritsuko::hdf5::get_1d_length(lhandle.getSpace(), false);
68 ritsuko::hdf5::Stream1dNumericDataset<uint64_t> lstream(&lhandle, num_seq, options.hdf5_buffer_size);
69 auto lmissing = ritsuko::hdf5::open_and_load_optional_numeric_missing_placeholder<uint64_t>(lhandle, missing_attr_name);
70
71 auto chandle = ghandle.openDataSet("circular");
72 ritsuko::hdf5::Stream1dNumericDataset<int32_t> cstream(&chandle, num_seq, options.hdf5_buffer_size);
73 auto cmissing = ritsuko::hdf5::open_and_load_optional_numeric_missing_placeholder<int32_t>(chandle, missing_attr_name);
74
75 SequenceLimits output(num_seq);
76 for (size_t i = 0; i < num_seq; ++i, lstream.next(), cstream.next()) {
77 auto slen = lstream.get();
78 auto circ = cstream.get();
79 output.has_seqlen[i] = !(lmissing.has_value() && *lmissing == slen);
80 output.seqlen[i] = slen;
81 output.has_circular[i] = !(cmissing.has_value() && *cmissing == circ);
82 output.circular[i] = circ;
83 }
84
85 return output;
86}
87
88}
98inline void validate(const std::filesystem::path& path, const ObjectMetadata& metadata, Options& options) {
99 const std::string type_name = "genomic_ranges"; // use a separate variable to avoid dangling reference warnings from GCC.
100 const auto& vstring = internal_json::extract_version_for_type(metadata.other, type_name);
101 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true);
102 if (version.major != 1) {
103 throw std::runtime_error("unsupported version string '" + vstring + "'");
104 }
105
106 // Figuring out the sequence length constraints.
107 auto limits = internal::find_sequence_limits(path / "sequence_information", options);
108 size_t num_sequences = limits.seqlen.size();
109
110 // Now loading all three components.
111 auto handle = ritsuko::hdf5::open_file(path / "ranges.h5");
112 auto ghandle = ritsuko::hdf5::open_group(handle, type_name.c_str());
113 auto id_handle = ritsuko::hdf5::open_dataset(ghandle, "sequence");
114 auto num_ranges = ritsuko::hdf5::get_1d_length(id_handle, false);
115 if (ritsuko::hdf5::exceeds_integer_limit(id_handle, 64, false)) {
116 throw std::runtime_error("expected 'sequence' to have a datatype that fits into a 64-bit unsigned integer");
117 }
118 ritsuko::hdf5::Stream1dNumericDataset<uint64_t> id_stream(&id_handle, num_ranges, options.hdf5_buffer_size);
119
120 auto start_handle = ritsuko::hdf5::open_dataset(ghandle, "start");
121 if (num_ranges != ritsuko::hdf5::get_1d_length(start_handle, false)) {
122 throw std::runtime_error("'start' and 'sequence' should have the same length");
123 }
124 if (ritsuko::hdf5::exceeds_integer_limit(start_handle, 64, true)) {
125 throw std::runtime_error("expected 'start' to have a datatype that fits into a 64-bit signed integer");
126 }
127 ritsuko::hdf5::Stream1dNumericDataset<int64_t> start_stream(&start_handle, num_ranges, options.hdf5_buffer_size);
128
129 auto width_handle = ritsuko::hdf5::open_dataset(ghandle, "width");
130 if (num_ranges != ritsuko::hdf5::get_1d_length(width_handle, false)) {
131 throw std::runtime_error("'width' and 'sequence' should have the same length");
132 }
133 if (ritsuko::hdf5::exceeds_integer_limit(width_handle, 64, false)) {
134 throw std::runtime_error("expected 'width' to have a datatype that fits into a 64-bit unsigned integer");
135 }
136 ritsuko::hdf5::Stream1dNumericDataset<uint64_t> width_stream(&width_handle, num_ranges, options.hdf5_buffer_size);
137
138 constexpr uint64_t end_limit = std::numeric_limits<int64_t>::max();
139 for (size_t i = 0; i < num_ranges; ++i, id_stream.next(), start_stream.next(), width_stream.next()) {
140 auto id = id_stream.get();
141 if (id >= num_sequences) {
142 throw std::runtime_error("'sequence' must be less than the number of sequences (got " + std::to_string(id) + ")");
143 }
144
145 auto start = start_stream.get();
146 auto width = width_stream.get();
147
148 // If it's definitely non-circular, the start position should be positive.
149 if (limits.has_circular[id] && !limits.circular[id]) {
150 if (start < 1) {
151 throw std::runtime_error("non-positive start position (" + std::to_string(start) + ") for non-circular sequence");
152 }
153
154 if (limits.has_seqlen[id]) {
155 // If the sequence length is provided, the end position shouldn't overflow.
156 auto spos = static_cast<uint64_t>(start);
157 auto limit = limits.seqlen[id];
158 if (spos > limit) {
159 throw std::runtime_error("start position beyond sequence length (" + std::to_string(start) + " > " + std::to_string(limit) + ") for non-circular sequence");
160 }
161
162 // The LHS should not overflow as 'spos >= 1' so 'limit - spos + 1' should still be no greater than 'limit'.
163 if (limit - spos + 1 < width) {
164 throw std::runtime_error("end position beyond sequence length (" +
165 std::to_string(start) + " + " + std::to_string(width) + " > " + std::to_string(limit) +
166 ") for non-circular sequence");
167 }
168 }
169 }
170
171 bool exceeded = false;
172 if (start > 0) {
173 // 'end_limit - start' is always non-negative as 'end_limit' is the largest value of an int64_t and 'start' is also int64_t.
174 exceeded = (end_limit - static_cast<uint64_t>(start) < width);
175 } else {
176 // 'end_limit - start' will not overflow a uint64_t, because 'end_limit' is the largest value of an int64_t and 'start' as also 'int64_t'.
177 exceeded = (end_limit + static_cast<uint64_t>(-start) < width);
178 }
179 if (exceeded) {
180 throw std::runtime_error("end position beyond the range of a 64-bit integer (" + std::to_string(start) + " + " + std::to_string(width) + ")");
181 }
182 }
183
184 {
185 auto strand_handle = ritsuko::hdf5::open_dataset(ghandle, "strand");
186 if (num_ranges != ritsuko::hdf5::get_1d_length(strand_handle, false)) {
187 throw std::runtime_error("'strand' and 'sequence' should have the same length");
188 }
189 if (ritsuko::hdf5::exceeds_integer_limit(strand_handle, 32, true)) {
190 throw std::runtime_error("expected 'strand' to have a datatype that fits into a 32-bit signed integer");
191 }
192
193 ritsuko::hdf5::Stream1dNumericDataset<int32_t> strand_stream(&strand_handle, num_ranges, options.hdf5_buffer_size);
194 for (hsize_t i = 0; i < num_ranges; ++i, strand_stream.next()) {
195 auto x = strand_stream.get();
196 if (x < -1 || x > 1) {
197 throw std::runtime_error("values of 'strand' should be one of 0, -1, or 1 (got " + std::to_string(x) + ")");
198 }
199 }
200 }
201
202 internal_other::validate_mcols(path, "range_annotations", num_ranges, options);
203 internal_other::validate_metadata(path, "other_annotations", options);
204
205 internal_string::validate_names(ghandle, "name", num_ranges, options.hdf5_buffer_size);
206}
207
214inline size_t height(const std::filesystem::path& path, [[maybe_unused]] const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
215 auto handle = ritsuko::hdf5::open_file(path / "ranges.h5");
216 auto ghandle = handle.openGroup("genomic_ranges");
217 auto dhandle = ghandle.openDataSet("sequence");
218 return ritsuko::hdf5::get_1d_length(dhandle, false);
219}
220
221}
222
223}
224
225#endif
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition genomic_ranges.hpp:98
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition genomic_ranges.hpp:214
takane validation functions.
Definition _derived_from.hpp:15
ObjectMetadata read_object_metadata(const std::filesystem::path &path)
Definition utils_public.hpp:74
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition _validate.hpp:107
bool derived_from(const std::string &type, const std::string &base, const Options &options)
Definition _derived_from.hpp:80
Object metadata, including the type and other fields.
Definition utils_public.hpp:26
std::unordered_map< std::string, std::shared_ptr< millijson::Base > > other
Definition utils_public.hpp:35
Validation options.
Definition utils_public.hpp:94
hsize_t hdf5_buffer_size
Definition utils_public.hpp:103
Exported utilities.