takane
Validators for ArtifactDB file formats
Loading...
Searching...
No Matches
multi_sample_dataset.hpp
Go to the documentation of this file.
1#ifndef TAKANE_MULTI_SAMPLE_DATASET_HPP
2#define TAKANE_MULTI_SAMPLE_DATASET_HPP
3
4#include <filesystem>
5#include <string>
6#include <cstdint>
7#include <stdexcept>
8
9#include "utils_public.hpp"
10#include "utils_other.hpp"
11#include "utils_summarized_experiment.hpp"
12
18namespace takane {
19
// Forward declarations of the generic takane entry points that the validator
// below calls on nested objects ('sample_data' and each 'experiments/<i>').
// Their definitions are not in this file; only the declarations are needed here.
// NOTE(review): ::takane::dimensions() is also called further down but has no
// declaration among these three - presumably it is declared on a line hidden
// from this listing or by one of the included headers; confirm.
23void validate(const std::filesystem::path&, const ObjectMetadata&, Options&);
24size_t height(const std::filesystem::path&, const ObjectMetadata&, Options&);
25bool satisfies_interface(const std::string&, const std::string&, const Options&);
34namespace multi_sample_dataset {
35
41inline void validate(const std::filesystem::path& path, const ObjectMetadata& metadata, Options& options) {
42 const std::string type_name = "multi_sample_dataset"; // use a separate variable to avoid dangling reference warnings from GCC.
43 const auto& dmap = internal_json::extract_typed_object_from_metadata(metadata.other, type_name);
44
45 const std::string version_name = "version"; // again, avoid dangling reference warnings.
46 const std::string& vstring = internal_json::extract_string_from_typed_object(dmap, version_name, type_name);
47 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true);
48 if (version.major != 1) {
49 throw std::runtime_error("unsupported version string '" + vstring + "'");
50 }
51
52 // Sample data should exist.
53 auto sd_path = path / "sample_data";
54 auto sdmeta = read_object_metadata(sd_path);
55 if (!satisfies_interface(sdmeta.type, "DATA_FRAME", options)) {
56 throw std::runtime_error("object in 'sample_data' should satisfy the 'DATA_FRAME' interface");
57 }
58 try {
59 ::takane::validate(sd_path, sdmeta, options);
60 } catch (std::exception& e) {
61 throw std::runtime_error("failed to validate 'sample_data'; " + std::string(e.what()));
62 }
63 size_t num_samples = ::takane::height(sd_path, sdmeta, options);
64
65 // Checking the experiments.
66 std::vector<size_t> num_columns;
67 auto edir = path / "experiments";
68 if (std::filesystem::exists(edir)) {
69 size_t num_experiments = internal_summarized_experiment::check_names_json(edir);
70 num_columns.reserve(num_experiments);
71
72 for (size_t e = 0; e < num_experiments; ++e) {
73 auto ename = std::to_string(e);
74 auto epath = edir / ename;
75 auto emeta = read_object_metadata(epath);
76
77 if (!satisfies_interface(emeta.type, "SUMMARIZED_EXPERIMENT", options)) {
78 throw std::runtime_error("object in 'experiments/" + ename + "' should satisfy the 'SUMMARIZED_EXPERIMENT' interface");
79 }
80
81 try {
82 ::takane::validate(epath, emeta, options);
83 } catch (std::exception& e) {
84 throw std::runtime_error("failed to validate 'experiments/" + ename + "'; " + std::string(e.what()));
85 }
86
87 auto dims = ::takane::dimensions(epath, emeta, options);
88 num_columns.push_back(dims[1]);
89 }
90
91 size_t num_dir_obj = internal_other::count_directory_entries(edir);
92 if (num_dir_obj - 1 != num_experiments) { // -1 to account for the names.json file itself.
93 throw std::runtime_error("more objects than expected inside the 'experiments' subdirectory");
94 }
95 }
96
97 // Checking the sample map.
98 if (num_columns.size() > 0) {
99 try {
100 auto handle = ritsuko::hdf5::open_file(path / "sample_map.h5");
101 auto ghandle = ritsuko::hdf5::open_group(handle, type_name.c_str());
102
103 for (size_t e = 0, end = num_columns.size(); e < end; ++e) {
104 auto ename = std::to_string(e);
105 auto dhandle = ritsuko::hdf5::open_dataset(ghandle, ename.c_str());
106 if (ritsuko::hdf5::exceeds_integer_limit(dhandle, 64, false)) {
107 throw std::runtime_error("'multi_sample_dataset/" + ename + "' should have a datatype that fits into a 64-bit unsigned integer");
108 }
109
110 auto len = ritsuko::hdf5::get_1d_length(dhandle.getSpace(), false);
111 if (len != num_columns[e]) {
112 throw std::runtime_error("length of 'multi_sample_dataset/" + ename + "' should equal the number of columns of 'experiments/" + ename + "'");
113 }
114
115 ritsuko::hdf5::Stream1dNumericDataset<uint64_t> stream(&dhandle, len, options.hdf5_buffer_size);
116 for (hsize_t i = 0; i < len; ++i, stream.next()) {
117 auto x = stream.get();
118 if (static_cast<size_t>(x) >= num_samples) {
119 throw std::runtime_error("indices in 'multi_sample_dataset/" + ename + "' should be less than the number of samples");
120 }
121 }
122 }
123
124 if (num_columns.size() != ghandle.getNumObjs()) {
125 throw std::runtime_error("more objects present in the 'multi_sample_dataset' group than expected");
126 }
127 } catch (std::exception& e) {
128 throw std::runtime_error("failed to validate the sample mapping; " + std::string(e.what()));
129 }
130 }
131
132 internal_other::validate_metadata(path, "other_data", options);
133}
134
135}
136
137}
138
139#endif
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition multi_sample_dataset.hpp:41
takane validation functions.
Definition _derived_from.hpp:15
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition _height.hpp:88
bool satisfies_interface(const std::string &type, const std::string &interface, const Options &options)
Definition _satisfies_interface.hpp:67
ObjectMetadata read_object_metadata(const std::filesystem::path &path)
Definition utils_public.hpp:74
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition _validate.hpp:107
std::vector< size_t > dimensions(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition _dimensions.hpp:69
Object metadata, including the type and other fields.
Definition utils_public.hpp:26
std::unordered_map< std::string, std::shared_ptr< millijson::Base > > other
Definition utils_public.hpp:35
Validation options.
Definition utils_public.hpp:94
hsize_t hdf5_buffer_size
Definition utils_public.hpp:103
Exported utilities.