takane
Validators for ArtifactDB file formats
Loading...
Searching...
No Matches
multi_sample_dataset.hpp
Go to the documentation of this file.
1#ifndef TAKANE_MULTI_SAMPLE_DATASET_HPP
2#define TAKANE_MULTI_SAMPLE_DATASET_HPP
3
#include <filesystem>
#include <string>
#include <cstdint>
#include <stdexcept>
#include <vector>

#include "utils_public.hpp"
#include "utils_other.hpp"
#include "utils_summarized_experiment.hpp"
12
18namespace takane {
19
23void validate(const std::filesystem::path&, const ObjectMetadata&, Options&);
24size_t height(const std::filesystem::path&, const ObjectMetadata&, Options&);
25bool satisfies_interface(const std::string&, const std::string&, const Options&);
34namespace multi_sample_dataset {
35
41inline void validate(const std::filesystem::path& path, const ObjectMetadata& metadata, Options& options) {
42 const auto& dmap = internal_json::extract_typed_object_from_metadata(metadata.other, "multi_sample_dataset");
43
44 const std::string& vstring = internal_json::extract_string_from_typed_object(dmap, "version", "multi_sample_dataset");
45 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true);
46 if (version.major != 1) {
47 throw std::runtime_error("unsupported version string '" + vstring + "'");
48 }
49
50 // Sample data should exist.
51 auto sd_path = path / "sample_data";
52 auto sdmeta = read_object_metadata(sd_path);
53 if (!satisfies_interface(sdmeta.type, "DATA_FRAME", options)) {
54 throw std::runtime_error("object in 'sample_data' should satisfy the 'DATA_FRAME' interface");
55 }
56 try {
57 ::takane::validate(sd_path, sdmeta, options);
58 } catch (std::exception& e) {
59 throw std::runtime_error("failed to validate 'sample_data'; " + std::string(e.what()));
60 }
61 size_t num_samples = ::takane::height(sd_path, sdmeta, options);
62
63 // Checking the experiments.
64 std::vector<size_t> num_columns;
65 auto edir = path / "experiments";
66 if (std::filesystem::exists(edir)) {
67 size_t num_experiments = internal_summarized_experiment::check_names_json(edir);
68 num_columns.reserve(num_experiments);
69
70 for (size_t e = 0; e < num_experiments; ++e) {
71 auto ename = std::to_string(e);
72 auto epath = edir / ename;
73 auto emeta = read_object_metadata(epath);
74
75 if (!satisfies_interface(emeta.type, "SUMMARIZED_EXPERIMENT", options)) {
76 throw std::runtime_error("object in 'experiments/" + ename + "' should satisfy the 'SUMMARIZED_EXPERIMENT' interface");
77 }
78
79 try {
80 ::takane::validate(epath, emeta, options);
81 } catch (std::exception& e) {
82 throw std::runtime_error("failed to validate 'experiments/" + ename + "'; " + std::string(e.what()));
83 }
84
85 auto dims = ::takane::dimensions(epath, emeta, options);
86 num_columns.push_back(dims[1]);
87 }
88
89 size_t num_dir_obj = internal_other::count_directory_entries(edir);
90 if (num_dir_obj - 1 != num_experiments) { // -1 to account for the names.json file itself.
91 throw std::runtime_error("more objects than expected inside the 'experiments' subdirectory");
92 }
93 }
94
95 // Checking the sample map.
96 if (num_columns.size() > 0) {
97 try {
98 auto handle = ritsuko::hdf5::open_file(path / "sample_map.h5");
99 auto ghandle = ritsuko::hdf5::open_group(handle, "multi_sample_dataset");
100
101 for (size_t e = 0, end = num_columns.size(); e < end; ++e) {
102 auto ename = std::to_string(e);
103 auto dhandle = ritsuko::hdf5::open_dataset(ghandle, ename.c_str());
104 if (ritsuko::hdf5::exceeds_integer_limit(dhandle, 64, false)) {
105 throw std::runtime_error("'multi_sample_dataset/" + ename + "' should have a datatype that fits into a 64-bit unsigned integer");
106 }
107
108 auto len = ritsuko::hdf5::get_1d_length(dhandle.getSpace(), false);
109 if (len != num_columns[e]) {
110 throw std::runtime_error("length of 'multi_sample_dataset/" + ename + "' should equal the number of columns of 'experiments/" + ename + "'");
111 }
112
113 ritsuko::hdf5::Stream1dNumericDataset<uint64_t> stream(&dhandle, len, options.hdf5_buffer_size);
114 for (hsize_t i = 0; i < len; ++i, stream.next()) {
115 auto x = stream.get();
116 if (static_cast<size_t>(x) >= num_samples) {
117 throw std::runtime_error("indices in 'multi_sample_dataset/" + ename + "' should be less than the number of samples");
118 }
119 }
120 }
121
122 if (num_columns.size() != ghandle.getNumObjs()) {
123 throw std::runtime_error("more objects present in the 'multi_sample_dataset' group than expected");
124 }
125 } catch (std::exception& e) {
126 throw std::runtime_error("failed to validate the sample mapping; " + std::string(e.what()));
127 }
128 }
129
130 internal_other::validate_metadata(path, "other_data", options);
131}
132
133}
134
135}
136
137#endif
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition multi_sample_dataset.hpp:41
takane validation functions.
Definition _derived_from.hpp:15
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition _height.hpp:88
bool satisfies_interface(const std::string &type, const std::string &interface, const Options &options)
Definition _satisfies_interface.hpp:67
ObjectMetadata read_object_metadata(const std::filesystem::path &path)
Definition utils_public.hpp:74
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition _validate.hpp:107
std::vector< size_t > dimensions(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition _dimensions.hpp:69
Object metadata, including the type and other fields.
Definition utils_public.hpp:26
std::unordered_map< std::string, std::shared_ptr< millijson::Base > > other
Definition utils_public.hpp:35
Validation options.
Definition utils_public.hpp:94
hsize_t hdf5_buffer_size
Definition utils_public.hpp:103
Exported utilities.