ritsuko
Helper utilities for ArtifactDB C++ code
Loading...
Searching...
No Matches
Stream1dArray.hpp
Go to the documentation of this file.
1#ifndef RITSUKO_HDF5_STREAM_1D_ARRAY_HPP
2#define RITSUKO_HDF5_STREAM_1D_ARRAY_HPP
3
4#include "H5Cpp.h"
5
6#include <vector>
7#include <string>
8#include <stdexcept>
9#include <cstdint>
10
12#include "../get_1d_length.hpp"
13#include "../get_name.hpp"
14#include "../_strings.hpp"
15#include "Pointer.hpp"
16
22namespace ritsuko {
23
24namespace hdf5 {
25
26namespace vls {
27
37template<typename Offset_, typename Length_>
39public:
47 Stream1dArray(const H5::DataSet* pointers, const H5::DataSet* heap, hsize_t length, hsize_t buffer_size) :
48 my_pointers(pointers),
49 my_heap(heap),
50 my_pointer_full_length(length),
51 my_heap_full_length(get_1d_length(my_heap->getSpace(), false)),
52 my_pointer_block_size(pick_1d_block_size(my_pointers->getCreatePlist(), my_pointer_full_length, buffer_size)),
53 my_pointer_mspace(1, &my_pointer_block_size),
54 my_pointer_dspace(1, &my_pointer_full_length),
55 my_heap_dspace(1, &my_heap_full_length),
56 my_pointer_dtype(define_pointer_datatype<Offset_, Length_>()),
57 my_pointer_buffer(my_pointer_block_size),
58 my_final_buffer(my_pointer_block_size)
59 {
60 }
61
70 Stream1dArray(const H5::DataSet* pointers, const H5::DataSet* heap, hsize_t buffer_size) :
71 Stream1dArray(pointers, heap, get_1d_length(pointers->getSpace(), false), buffer_size)
72 {}
73
74public:
78 std::string get() {
79 while (my_consumed >= my_available) {
80 my_consumed -= my_available;
81 load();
82 }
83 return my_final_buffer[my_consumed];
84 }
85
91 std::string steal() {
92 while (my_consumed >= my_available) {
93 my_consumed -= my_available;
94 load();
95 }
96 return std::move(my_final_buffer[my_consumed]);
97 }
98
104 void next(size_t jump = 1) {
105 my_consumed += jump;
106 }
107
111 hsize_t length() const {
112 return my_pointer_full_length;
113 }
114
118 hsize_t position() const {
119 return my_consumed + my_last_loaded;
120 }
121
122private:
123 const H5::DataSet* my_pointers;
124 const H5::DataSet* my_heap;
125 hsize_t my_pointer_full_length, my_heap_full_length;
126 hsize_t my_pointer_block_size;
127 H5::DataSpace my_pointer_mspace, my_pointer_dspace;
128 H5::DataSpace my_heap_mspace, my_heap_dspace;
129
130 H5::DataType my_pointer_dtype;
131 std::vector<Pointer<Offset_, Length_> > my_pointer_buffer;
132 std::vector<uint8_t> my_heap_buffer;
133 std::vector<std::string> my_final_buffer;
134
135 hsize_t my_last_loaded = 0;
136 hsize_t my_consumed = 0;
137 hsize_t my_available = 0;
138
139 void load() {
140 if (my_last_loaded >= my_pointer_full_length) {
141 throw std::runtime_error("requesting data beyond the end of the dataset at '" + get_name(*my_pointers) + "'");
142 }
143 my_available = std::min(my_pointer_full_length - my_last_loaded, my_pointer_block_size);
144
145 constexpr hsize_t zero = 0;
146 my_pointer_mspace.selectHyperslab(H5S_SELECT_SET, &my_available, &zero);
147 my_pointer_dspace.selectHyperslab(H5S_SELECT_SET, &my_available, &my_last_loaded);
148 my_heap_dspace.selectNone();
149 my_pointers->read(my_pointer_buffer.data(), my_pointer_dtype, my_pointer_mspace, my_pointer_dspace);
150
151 for (size_t i = 0; i < my_available; ++i) {
152 const auto& val = my_pointer_buffer[i];
153 hsize_t start = val.offset;
154 hsize_t count = val.length;
155 if (start > my_heap_full_length || start + count > my_heap_full_length) {
156 throw std::runtime_error("VLS array pointers at '" + get_name(*my_pointers) + "' are out of range of the heap at '" + get_name(*my_heap) + "'");
157 }
158
159 auto& curstr = my_final_buffer[i];
160 curstr.clear();
161
162 if (count) {
163 // Don't attempt to batch these reads as we aren't guaranteed
164 // that they are non-overlapping or ordered. Hopefully HDF5 is
165 // keeping enough things in cache for repeated reads.
166 my_heap_mspace.setExtentSimple(1, &count);
167 my_heap_mspace.selectAll();
168 my_heap_dspace.selectHyperslab(H5S_SELECT_SET, &count, &start);
169 my_heap_buffer.resize(count);
170 my_heap->read(my_heap_buffer.data(), H5::PredType::NATIVE_UINT8, my_heap_mspace, my_heap_dspace);
171 const char* text_ptr = reinterpret_cast<const char*>(my_heap_buffer.data());
172 curstr.insert(curstr.end(), text_ptr, text_ptr + find_string_length(text_ptr, count));
173
174 /*
175 * Is it generally portable to reinterpret_cast the bytes in a
176 * uint8_t array? I think so; according to the C standard,
177 * uint8_t is guaranteed to be exactly 8 bits
178 * (https://stackoverflow.com/questions/15039077/uint8-t-8-bits-guarantee),
179 * so a uint8_t value should have the same bit representation
180 * across all implementations that define the uint8_t type. If
181 * we save a byte to HDF5 as a UINT8 on one machine and read it
182 * back into to memory on another machine, we should recover
183 * the same bit pattern. Thus, the reinterpret_cast to a char*
184 * should yield the same bit pattern across machines, allowing
185 * us to portably interpret the array as a string following the
186 * ASCII/UTF-8 spec (which define each character in binary).
187 */
188 }
189 }
190
191 my_last_loaded += my_available;
192 }
193};
194
195}
196
197}
198
199}
200
201#endif
Compound datatype for the VLS heap pointer.
Stream a 1-dimensional VLS array into memory.
Definition Stream1dArray.hpp:38
hsize_t length() const
Definition Stream1dArray.hpp:111
hsize_t position() const
Definition Stream1dArray.hpp:118
std::string get()
Definition Stream1dArray.hpp:78
std::string steal()
Definition Stream1dArray.hpp:91
Stream1dArray(const H5::DataSet *pointers, const H5::DataSet *heap, hsize_t length, hsize_t buffer_size)
Definition Stream1dArray.hpp:47
Stream1dArray(const H5::DataSet *pointers, const H5::DataSet *heap, hsize_t buffer_size)
Definition Stream1dArray.hpp:70
void next(size_t jump=1)
Definition Stream1dArray.hpp:104
Get the length of a 1-dimensional HDF5 dataset.
Get the name of an HDF5 object.
H5::CompType define_pointer_datatype()
Definition Pointer.hpp:60
std::string get_name(const Handle_ &handle)
Definition get_name.hpp:24
hsize_t pick_1d_block_size(const H5::DSetCreatPropList &cplist, hsize_t full_length, hsize_t buffer_size=10000)
Definition pick_1d_block_size.hpp:26
hsize_t get_1d_length(const H5::DataSpace &space, bool allow_scalar)
Definition get_1d_length.hpp:25
Assorted helper functions for parsing and validation.
Definition choose_missing_placeholder.hpp:15
Pick a block size for a 1-dimensional HDF5 dataset.