takane
Validators for ArtifactDB file formats
Loading...
Searching...
No Matches
vcf_experiment.hpp
Go to the documentation of this file.
1#ifndef TAKANE_VCF_EXPERIMENT_HPP
2#define TAKANE_VCF_EXPERIMENT_HPP
3
4#include <string>
5#include <stdexcept>
6#include <filesystem>
7
8#include "ritsuko/hdf5/hdf5.hpp"
9
10#include "utils_public.hpp"
11#include "utils_string.hpp"
12#include "utils_summarized_experiment.hpp"
13#include "utils_json.hpp"
14#include "utils_files.hpp"
15
21namespace takane {
22
27namespace vcf_experiment {
28
32namespace internal {
33
34// Format specification taken from https://samtools.github.io/hts-specs/VCFv4.1.pdf.
35template<bool parallel_>
36std::pair<size_t, size_t> scan_vcf_dimensions(const std::filesystem::path& path, bool expanded) {
37 internal_files::check_gzip_signature(path);
38 auto reader = internal_other::open_reader<byteme::GzipFileReader>(path);
39 typename std::conditional<parallel_, byteme::PerByteParallel<>, byteme::PerByte<> >::type pb(&reader);
40
41 // Checking the signature.
42 {
43 const std::string expected = "##fileformat=VCFv";
44 const size_t len = expected.size();
45 bool okay = pb.valid();
46
47 for (size_t i = 0; i < len; ++i) {
48 if (!okay) {
49 throw std::runtime_error("incomplete VCF file signature");
50 }
51 if (pb.get() != expected[i]) {
52 throw std::runtime_error("incorrect VCF file signature");
53 }
54 okay = pb.advance();
55 }
56 }
57
58 // Scanning until we find a line that doesn't start with '##'.
59 while (true) {
60 if (pb.get() == '\n') {
61 bool escape = false;
62 for (int i = 0; i < 2; ++i) {
63 if (!pb.advance()) {
64 throw std::runtime_error("premature end to the VCF file");
65 }
66 if (pb.get() != '#') {
67 escape = true;
68 break;
69 }
70 }
71 if (escape) {
72 break;
73 }
74 }
75 if (!pb.advance()) {
76 throw std::runtime_error("premature end to the VCF file");
77 }
78 }
79
80 // Scanning the header line to count the number of samples.
81 size_t num_samples = 0;
82 {
83 size_t num_indents = 0;
84 while (true) {
85 char current = pb.get();
86 if (current == '\t') {
87 ++num_indents;
88 } else if (current == '\n') {
89 pb.advance(); // skip past the newline.
90 break;
91 }
92 if (!pb.advance()) {
93 throw std::runtime_error("premature end to the VCF file");
94 }
95 }
96
97 if (num_indents < 8) {
98 throw std::runtime_error("expected at least 9 fields in the VCF header line, including 'FORMAT'");
99 }
100 num_samples = num_indents - 8;
101 }
102
103 size_t expected_rows = 0;
104 if (expanded) {
105 while (pb.valid()) {
106 ++expected_rows;
107
108 // Scanning up to the ALT field, which is the 5th one. We need this to
109 // find the number of alternative alleles (and thus the expansion of rows).
110 size_t num_indents = 0;
111 while (true) {
112 char current = pb.get();
113 if (current == '\t') {
114 ++num_indents;
115 if (num_indents == 4) {
116 if (!pb.advance()) { // get past this indent.
117 throw std::runtime_error("premature end of line for VCF record");
118 }
119 break;
120 }
121 } else if (current == '\n') {
122 throw std::runtime_error("premature end of line for VCF record");
123 }
124 if (!pb.advance()) {
125 throw std::runtime_error("premature end of line for VCF record");
126 }
127 }
128
129 // Checking that we don't have any commas if it's expanded.
130 while (true) {
131 char current = pb.get();
132 if (current == ',') {
133 throw std::runtime_error("expected a 1:1 mapping of rows to alternative alleles when 'vcf_experiment.expanded = true'");
134 } else if (current == '\t') {
135 break;
136 } else if (current == '\n') {
137 throw std::runtime_error("premature end of line for VCF record");
138 }
139 if (!pb.advance()) {
140 throw std::runtime_error("premature end of line for VCF record");
141 }
142 }
143
144 // Chomping the rest of the line; we assume all lines are newline-terminated.
145 while (true) {
146 if (pb.get() == '\n') {
147 pb.advance(); // skip past the newline.
148 break;
149 } else {
150 if (!pb.advance()) {
151 throw std::runtime_error("premature end of line for VCF record");
152 }
153 }
154 }
155 }
156
157 } else {
158 if (pb.valid()) {
159 while (true) {
160 if (pb.get() == '\n') {
161 ++expected_rows; // assume all files are newline-terminated.
162 if (!pb.advance()) {
163 break;
164 }
165 } else {
166 if (!pb.advance()) {
167 throw std::runtime_error("premature end of line for VCF record");
168 }
169 }
170 }
171 }
172 }
173
174 return std::make_pair(expected_rows, num_samples);
175}
176
177}
187inline void validate(const std::filesystem::path& path, const ObjectMetadata& metadata, Options& options) {
188 const auto& vcfmap = internal_json::extract_typed_object_from_metadata(metadata.other, "vcf_experiment");
189
190 const std::string& vstring = internal_json::extract_string_from_typed_object(vcfmap, "version", "vcf_experiment");
191 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true);
192 if (version.major != 1) {
193 throw std::runtime_error("unsupported version string '" + vstring + "'");
194 }
195
196 // Other bits and pieces of the metadata to check.
197 auto dims = internal_summarized_experiment::extract_dimensions_json(vcfmap, "vcf_experiment");
198
199 bool exp = false;
200 {
201 auto eIt = vcfmap.find("expanded");
202 if (eIt == vcfmap.end()) {
203 throw std::runtime_error("expected a 'vcf_experiment.expanded' property");
204 }
205 const auto& val = eIt->second;
206 if (val->type() != millijson::BOOLEAN) {
207 throw std::runtime_error("'vcf_experiment.expanded' property should be a JSON boolean");
208 }
209 exp = reinterpret_cast<const millijson::Boolean*>(val.get())->value;
210 }
211
212 auto ipath = path / "file.vcf.gz";
213 std::pair<size_t, size_t> obs_dims;
214 try {
215 if (options.parallel_reads) {
216 obs_dims = internal::scan_vcf_dimensions<true>(ipath, exp);
217 } else {
218 obs_dims = internal::scan_vcf_dimensions<false>(ipath, exp);
219 }
220 } catch (std::exception& e) {
221 throw std::runtime_error("failed to parse '" + ipath.string() + "'; " + std::string(e.what()));
222 }
223
224 if (obs_dims.first != dims.first) {
225 throw std::runtime_error("reported 'vcf_experiment.dimensions[0]' does not match the number of records in '" + ipath.string() + "'");
226 }
227 if (obs_dims.second != dims.second) {
228 throw std::runtime_error("reported 'vcf_experiment.dimensions[1]' does not match the number of samples in '" + ipath.string() + "'");
229 }
230}
231
238inline size_t height([[maybe_unused]] const std::filesystem::path& path, const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
239 const auto& vcfmap = internal_json::extract_typed_object_from_metadata(metadata.other, "vcf_experiment");
240 auto dims = internal_summarized_experiment::extract_dimensions_json(vcfmap, "vcf_experiment");
241 return dims.first;
242}
243
250inline std::vector<size_t> dimensions([[maybe_unused]] const std::filesystem::path& path, const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
251 const auto& vcfmap = internal_json::extract_typed_object_from_metadata(metadata.other, "vcf_experiment");
252 auto dims = internal_summarized_experiment::extract_dimensions_json(vcfmap, "vcf_experiment");
253 return std::vector<size_t>{ dims.first, dims.second };
254}
255
256}
257
258}
259
260#endif
std::vector< size_t > dimensions(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition vcf_experiment.hpp:250
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition vcf_experiment.hpp:238
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition vcf_experiment.hpp:187
takane validation functions.
Definition _derived_from.hpp:15
Object metadata, including the type and other fields.
Definition utils_public.hpp:26
std::unordered_map< std::string, std::shared_ptr< millijson::Base > > other
Definition utils_public.hpp:35
Validation options.
Definition utils_public.hpp:94
bool parallel_reads
Definition utils_public.hpp:98
Exported utilities.