takane
Validators for ArtifactDB file formats
Loading...
Searching...
No Matches
vcf_experiment.hpp
Go to the documentation of this file.
1#ifndef TAKANE_VCF_EXPERIMENT_HPP
2#define TAKANE_VCF_EXPERIMENT_HPP
3
4#include <string>
5#include <stdexcept>
6#include <filesystem>
7
8#include "ritsuko/hdf5/hdf5.hpp"
9
10#include "utils_public.hpp"
11#include "utils_string.hpp"
12#include "utils_summarized_experiment.hpp"
13#include "utils_json.hpp"
14#include "utils_files.hpp"
15
21namespace takane {
22
27namespace vcf_experiment {
28
32namespace internal {
33
34// Format specification taken from https://samtools.github.io/hts-specs/VCFv4.1.pdf.
35inline std::pair<size_t, size_t> scan_vcf_dimensions(const std::filesystem::path& path, bool expanded, bool parallel) {
36 internal_files::check_gzip_signature(path);
37 auto reader = internal_other::open_reader<byteme::GzipFileReader>(path, byteme::GzipFileReaderOptions());
38 auto pb = internal_other::wrap_reader_for_bytes<char>(std::move(reader), parallel);
39
40 // Checking the signature.
41 {
42 const std::string expected = "##fileformat=VCFv";
43 const size_t len = expected.size();
44 bool okay = pb->valid();
45
46 for (size_t i = 0; i < len; ++i) {
47 if (!okay) {
48 throw std::runtime_error("incomplete VCF file signature");
49 }
50 if (pb->get() != expected[i]) {
51 throw std::runtime_error("incorrect VCF file signature");
52 }
53 okay = pb->advance();
54 }
55 }
56
57 // Scanning until we find a line that doesn't start with '##'.
58 while (true) {
59 if (pb->get() == '\n') {
60 bool escape = false;
61 for (int i = 0; i < 2; ++i) {
62 if (!pb->advance()) {
63 throw std::runtime_error("premature end to the VCF file");
64 }
65 if (pb->get() != '#') {
66 escape = true;
67 break;
68 }
69 }
70 if (escape) {
71 break;
72 }
73 }
74 if (!pb->advance()) {
75 throw std::runtime_error("premature end to the VCF file");
76 }
77 }
78
79 // Scanning the header line to count the number of samples.
80 size_t num_samples = 0;
81 {
82 size_t num_indents = 0;
83 while (true) {
84 char current = pb->get();
85 if (current == '\t') {
86 ++num_indents;
87 } else if (current == '\n') {
88 pb->advance(); // skip past the newline.
89 break;
90 }
91 if (!pb->advance()) {
92 throw std::runtime_error("premature end to the VCF file");
93 }
94 }
95
96 if (num_indents < 8) {
97 throw std::runtime_error("expected at least 9 fields in the VCF header line, including 'FORMAT'");
98 }
99 num_samples = num_indents - 8;
100 }
101
102 size_t expected_rows = 0;
103 if (expanded) {
104 while (pb->valid()) {
105 ++expected_rows;
106
107 // Scanning up to the ALT field, which is the 5th one. We need this to
108 // find the number of alternative alleles (and thus the expansion of rows).
109 size_t num_indents = 0;
110 while (true) {
111 char current = pb->get();
112 if (current == '\t') {
113 ++num_indents;
114 if (num_indents == 4) {
115 if (!pb->advance()) { // get past this indent.
116 throw std::runtime_error("premature end of line for VCF record");
117 }
118 break;
119 }
120 } else if (current == '\n') {
121 throw std::runtime_error("premature end of line for VCF record");
122 }
123 if (!pb->advance()) {
124 throw std::runtime_error("premature end of line for VCF record");
125 }
126 }
127
128 // Checking that we don't have any commas if it's expanded.
129 while (true) {
130 char current = pb->get();
131 if (current == ',') {
132 throw std::runtime_error("expected a 1:1 mapping of rows to alternative alleles when 'vcf_experiment.expanded = true'");
133 } else if (current == '\t') {
134 break;
135 } else if (current == '\n') {
136 throw std::runtime_error("premature end of line for VCF record");
137 }
138 if (!pb->advance()) {
139 throw std::runtime_error("premature end of line for VCF record");
140 }
141 }
142
143 // Chomping the rest of the line; we assume all lines are newline-terminated.
144 while (true) {
145 if (pb->get() == '\n') {
146 pb->advance(); // skip past the newline.
147 break;
148 } else {
149 if (!pb->advance()) {
150 throw std::runtime_error("premature end of line for VCF record");
151 }
152 }
153 }
154 }
155
156 } else {
157 if (pb->valid()) {
158 while (true) {
159 if (pb->get() == '\n') {
160 ++expected_rows; // assume all files are newline-terminated.
161 if (!pb->advance()) {
162 break;
163 }
164 } else {
165 if (!pb->advance()) {
166 throw std::runtime_error("premature end of line for VCF record");
167 }
168 }
169 }
170 }
171 }
172
173 return std::make_pair(expected_rows, num_samples);
174}
175
176}
186inline void validate(const std::filesystem::path& path, const ObjectMetadata& metadata, Options& options) {
187 const std::string type_name = "vcf_experiment"; // use a separate variable to avoid dangling reference warnings from GCC.
188 const auto& vcfmap = internal_json::extract_typed_object_from_metadata(metadata.other, type_name);
189
190 const std::string version_name = "version"; // again, avoid dangling reference warnings.
191 const std::string& vstring = internal_json::extract_string_from_typed_object(vcfmap, version_name, type_name);
192 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true);
193 if (version.major != 1) {
194 throw std::runtime_error("unsupported version string '" + vstring + "'");
195 }
196
197 // Other bits and pieces of the metadata to check.
198 auto dims = internal_summarized_experiment::extract_dimensions_json(vcfmap, "vcf_experiment");
199
200 bool exp = false;
201 {
202 auto eIt = vcfmap.find("expanded");
203 if (eIt == vcfmap.end()) {
204 throw std::runtime_error("expected a 'vcf_experiment.expanded' property");
205 }
206 const auto& val = eIt->second;
207 if (val->type() != millijson::BOOLEAN) {
208 throw std::runtime_error("'vcf_experiment.expanded' property should be a JSON boolean");
209 }
210 exp = reinterpret_cast<const millijson::Boolean*>(val.get())->value();
211 }
212
213 auto ipath = path / "file.vcf.gz";
214 std::pair<size_t, size_t> obs_dims;
215 try {
216 obs_dims = internal::scan_vcf_dimensions(ipath, exp, options.parallel_reads);
217 } catch (std::exception& e) {
218 throw std::runtime_error("failed to parse '" + ipath.string() + "'; " + std::string(e.what()));
219 }
220
221 if (obs_dims.first != dims.first) {
222 throw std::runtime_error("reported 'vcf_experiment.dimensions[0]' does not match the number of records in '" + ipath.string() + "'");
223 }
224 if (obs_dims.second != dims.second) {
225 throw std::runtime_error("reported 'vcf_experiment.dimensions[1]' does not match the number of samples in '" + ipath.string() + "'");
226 }
227}
228
235inline size_t height([[maybe_unused]] const std::filesystem::path& path, const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
236 const std::string type_name = "vcf_experiment"; // use a separate variable to avoid dangling reference warnings from GCC.
237 const auto& vcfmap = internal_json::extract_typed_object_from_metadata(metadata.other, type_name);
238 auto dims = internal_summarized_experiment::extract_dimensions_json(vcfmap, type_name);
239 return dims.first;
240}
241
248inline std::vector<size_t> dimensions([[maybe_unused]] const std::filesystem::path& path, const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
249 const std::string type_name = "vcf_experiment"; // use a separate variable to avoid dangling reference warnings from GCC.
250 const auto& vcfmap = internal_json::extract_typed_object_from_metadata(metadata.other, type_name);
251 auto dims = internal_summarized_experiment::extract_dimensions_json(vcfmap, type_name);
252 return std::vector<size_t>{ dims.first, dims.second };
253}
254
255}
256
257}
258
259#endif
std::vector< size_t > dimensions(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition vcf_experiment.hpp:248
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition vcf_experiment.hpp:235
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition vcf_experiment.hpp:186
takane validation functions.
Definition _derived_from.hpp:15
Object metadata, including the type and other fields.
Definition utils_public.hpp:26
std::unordered_map< std::string, std::shared_ptr< millijson::Base > > other
Definition utils_public.hpp:35
Validation options.
Definition utils_public.hpp:94
bool parallel_reads
Definition utils_public.hpp:98
Exported utilities.