takane
Validators for ArtifactDB file formats
Loading...
Searching...
No Matches
sequence_information.hpp
Go to the documentation of this file.
1#ifndef TAKANE_SEQUENCE_INFORMATION_HPP
2#define TAKANE_SEQUENCE_INFORMATION_HPP
3
4#include "ritsuko/ritsuko.hpp"
5#include "ritsuko/hdf5/hdf5.hpp"
6
7#include <filesystem>
8#include <stdexcept>
9#include <unordered_set>
10#include <string>
11
12#include "utils_public.hpp"
13#include "utils_json.hpp"
14
20namespace takane {
21
26namespace sequence_information {
27
33inline void validate(const std::filesystem::path& path, const ObjectMetadata& metadata, Options& options) {
34 const std::string type_name = "sequence_information"; // use a separate variable to avoid dangling reference warnings from GCC.
35 const auto& vstring = internal_json::extract_version_for_type(metadata.other, type_name);
36 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true);
37 if (version.major != 1) {
38 throw std::runtime_error("unsupported version string '" + vstring + "'");
39 }
40
41 auto handle = ritsuko::hdf5::open_file(path / "info.h5");
42 auto ghandle = ritsuko::hdf5::open_group(handle, type_name.c_str());
43
44 size_t nseq = 0;
45 {
46 auto nhandle = ritsuko::hdf5::open_dataset(ghandle, "name");
47 if (!ritsuko::hdf5::is_utf8_string(nhandle)) {
48 throw std::runtime_error("expected 'name' to have a datatype that can be represented by a UTF-8 encoded string");
49 }
50
51 nseq = ritsuko::hdf5::get_1d_length(nhandle.getSpace(), false);
52 std::unordered_set<std::string> collected;
53 ritsuko::hdf5::Stream1dStringDataset stream(&nhandle, nseq, options.hdf5_buffer_size);
54 for (size_t s = 0; s < nseq; ++s, stream.next()) {
55 auto x = stream.steal();
56 if (collected.find(x) != collected.end()) {
57 throw std::runtime_error("detected duplicated sequence name '" + x + "'");
58 }
59 collected.insert(std::move(x));
60 }
61 }
62
63 const char* missing_attr_name = "missing-value-placeholder";
64
65 {
66 auto lhandle = ritsuko::hdf5::open_dataset(ghandle, "length");
67 if (ritsuko::hdf5::exceeds_integer_limit(lhandle, 64, false)) {
68 throw std::runtime_error("expected a datatype for 'length' that fits in a 64-bit unsigned integer");
69 }
70 if (ritsuko::hdf5::get_1d_length(lhandle.getSpace(), false) != nseq) {
71 throw std::runtime_error("expected lengths of 'length' and 'name' to be equal");
72 }
73 if (lhandle.attrExists(missing_attr_name)) {
74 auto ahandle = lhandle.openAttribute(missing_attr_name);
75 ritsuko::hdf5::check_numeric_missing_placeholder_attribute(lhandle, ahandle);
76 }
77 }
78
79 {
80 auto chandle = ritsuko::hdf5::open_dataset(ghandle, "circular");
81 if (ritsuko::hdf5::exceeds_integer_limit(chandle, 32, true)) {
82 throw std::runtime_error("expected a datatype for 'circular' that fits in a 32-bit signed integer");
83 }
84 if (ritsuko::hdf5::get_1d_length(chandle.getSpace(), false) != nseq) {
85 throw std::runtime_error("expected lengths of 'length' and 'circular' to be equal");
86 }
87 if (chandle.attrExists(missing_attr_name)) {
88 auto ahandle = chandle.openAttribute(missing_attr_name);
89 ritsuko::hdf5::check_numeric_missing_placeholder_attribute(chandle, ahandle);
90 }
91 }
92
93 {
94 auto gnhandle = ritsuko::hdf5::open_dataset(ghandle, "genome");
95 if (!ritsuko::hdf5::is_utf8_string(gnhandle)) {
96 throw std::runtime_error("expected 'genome' to have a datatype that can be represented by a UTF-8 encoded string");
97 }
98 if (ritsuko::hdf5::get_1d_length(gnhandle.getSpace(), false) != nseq) {
99 throw std::runtime_error("expected lengths of 'length' and 'genome' to be equal");
100 }
101 if (gnhandle.attrExists(missing_attr_name)) {
102 auto ahandle = gnhandle.openAttribute(missing_attr_name);
103 ritsuko::hdf5::check_string_missing_placeholder_attribute(ahandle);
104 }
105 }
106}
107
108}
109
110}
111
112#endif
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition sequence_information.hpp:33
takane validation functions.
Definition _derived_from.hpp:15
Object metadata, including the type and other fields.
Definition utils_public.hpp:26
std::unordered_map< std::string, std::shared_ptr< millijson::Base > > other
Definition utils_public.hpp:35
Validation options.
Definition utils_public.hpp:94
hsize_t hdf5_buffer_size
Definition utils_public.hpp:103
Exported utilities.