takane
Validators for ArtifactDB file formats
Loading...
Searching...
No Matches
sequence_information.hpp
Go to the documentation of this file.
1#ifndef TAKANE_SEQUENCE_INFORMATION_HPP
2#define TAKANE_SEQUENCE_INFORMATION_HPP
3
4#include "ritsuko/ritsuko.hpp"
5#include "ritsuko/hdf5/hdf5.hpp"
6
7#include <filesystem>
8#include <stdexcept>
9#include <unordered_set>
10#include <string>
11
12#include "utils_public.hpp"
13#include "utils_json.hpp"
14
20namespace takane {
21
26namespace sequence_information {
27
33inline void validate(const std::filesystem::path& path, const ObjectMetadata& metadata, Options& options) {
34 auto vstring = internal_json::extract_version_for_type(metadata.other, "sequence_information");
35 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true);
36 if (version.major != 1) {
37 throw std::runtime_error("unsupported version string '" + vstring + "'");
38 }
39
40 auto handle = ritsuko::hdf5::open_file(path / "info.h5");
41 auto ghandle = ritsuko::hdf5::open_group(handle, "sequence_information");
42
43 size_t nseq = 0;
44 {
45 auto nhandle = ritsuko::hdf5::open_dataset(ghandle, "name");
46 if (!ritsuko::hdf5::is_utf8_string(nhandle)) {
47 throw std::runtime_error("expected 'name' to have a datatype that can be represented by a UTF-8 encoded string");
48 }
49
50 nseq = ritsuko::hdf5::get_1d_length(nhandle.getSpace(), false);
51 std::unordered_set<std::string> collected;
52 ritsuko::hdf5::Stream1dStringDataset stream(&nhandle, nseq, options.hdf5_buffer_size);
53 for (size_t s = 0; s < nseq; ++s, stream.next()) {
54 auto x = stream.steal();
55 if (collected.find(x) != collected.end()) {
56 throw std::runtime_error("detected duplicated sequence name '" + x + "'");
57 }
58 collected.insert(std::move(x));
59 }
60 }
61
62 const char* missing_attr_name = "missing-value-placeholder";
63
64 {
65 auto lhandle = ritsuko::hdf5::open_dataset(ghandle, "length");
66 if (ritsuko::hdf5::exceeds_integer_limit(lhandle, 64, false)) {
67 throw std::runtime_error("expected a datatype for 'length' that fits in a 64-bit unsigned integer");
68 }
69 if (ritsuko::hdf5::get_1d_length(lhandle.getSpace(), false) != nseq) {
70 throw std::runtime_error("expected lengths of 'length' and 'name' to be equal");
71 }
72 if (lhandle.attrExists(missing_attr_name)) {
73 auto ahandle = lhandle.openAttribute(missing_attr_name);
74 ritsuko::hdf5::check_missing_placeholder_attribute(lhandle, ahandle);
75 }
76 }
77
78 {
79 auto chandle = ritsuko::hdf5::open_dataset(ghandle, "circular");
80 if (ritsuko::hdf5::exceeds_integer_limit(chandle, 32, true)) {
81 throw std::runtime_error("expected a datatype for 'circular' that fits in a 32-bit signed integer");
82 }
83 if (ritsuko::hdf5::get_1d_length(chandle.getSpace(), false) != nseq) {
84 throw std::runtime_error("expected lengths of 'length' and 'circular' to be equal");
85 }
86 if (chandle.attrExists(missing_attr_name)) {
87 auto ahandle = chandle.openAttribute(missing_attr_name);
88 ritsuko::hdf5::check_missing_placeholder_attribute(chandle, ahandle);
89 }
90 }
91
92 {
93 auto gnhandle = ritsuko::hdf5::open_dataset(ghandle, "genome");
94 if (!ritsuko::hdf5::is_utf8_string(gnhandle)) {
95 throw std::runtime_error("expected 'genome' to have a datatype that can be represented by a UTF-8 encoded string");
96 }
97 if (ritsuko::hdf5::get_1d_length(gnhandle.getSpace(), false) != nseq) {
98 throw std::runtime_error("expected lengths of 'length' and 'genome' to be equal");
99 }
100 if (gnhandle.attrExists(missing_attr_name)) {
101 auto ahandle = gnhandle.openAttribute(missing_attr_name);
102 ritsuko::hdf5::check_missing_placeholder_attribute(gnhandle, ahandle);
103 }
104 }
105}
106
107}
108
109}
110
111#endif
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition sequence_information.hpp:33
takane validation functions.
Definition _derived_from.hpp:15
Object metadata, including the type and other fields.
Definition utils_public.hpp:26
std::unordered_map< std::string, std::shared_ptr< millijson::Base > > other
Definition utils_public.hpp:35
Validation options.
Definition utils_public.hpp:94
hsize_t hdf5_buffer_size
Definition utils_public.hpp:103
Exported utilities.