1#ifndef TAKANE_VCF_EXPERIMENT_HPP
2#define TAKANE_VCF_EXPERIMENT_HPP
8#include "ritsuko/hdf5/hdf5.hpp"
9#include "byteme/byteme.hpp"
12#include "utils_string.hpp"
13#include "utils_summarized_experiment.hpp"
14#include "utils_json.hpp"
15#include "utils_files.hpp"
28namespace vcf_experiment {
/**
 * Scan a Gzip-compressed VCF file and report its dimensions.
 *
 * NOTE(review): this text is a rendered source listing; several original lines
 * (loop headers, guard conditions, closing braces) are not present here.
 * The comments below describe only the statements that are visible.
 *
 * @param path Path to the file; its Gzip signature is checked before reading.
 * @param expanded When set, a ',' found in the scanned field of a record is
 * rejected (see the "1:1 mapping" error below).
 * @param parallel Forwarded to `internal_other::wrap_reader_for_bytes()`.
 *
 * @return Pair of (`expected_rows`, `num_samples`).
 *
 * @throws std::runtime_error on a bad file signature, a truncated file, a
 * header line with fewer than 9 fields, or a malformed record.
 */
36inline std::pair<size_t, size_t> scan_vcf_dimensions(
const std::filesystem::path& path,
bool expanded,
bool parallel) {
// Validate the Gzip magic bytes first, then open a Gzip reader and wrap it in a
// byte-wise source ('pb') that exposes valid()/get()/advance().
37 internal_files::check_gzip_signature(path);
38 auto reader = internal_other::open_reader<byteme::GzipFileReader>(path, byteme::GzipFileReaderOptions());
39 auto pb = internal_other::wrap_reader_for_bytes<char>(std::move(reader), parallel);
// The decompressed stream must start with this literal prefix; it is compared
// one character at a time.
43 const std::string expected =
"##fileformat=VCFv";
44 const size_t len = expected.size();
45 bool okay = pb->valid();
47 for (
size_t i = 0; i < len; ++i) {
// The guard for this throw is elided from the listing; presumably it fires when
// 'okay' is false, i.e. the input ended before the prefix was complete - TODO confirm.
49 throw std::runtime_error(
"incomplete VCF file signature");
51 if (pb->get() != expected[i]) {
52 throw std::runtime_error(
"incorrect VCF file signature");
// After a newline, up to two following characters are compared against '#'.
// Presumably this skips the '##' meta-information lines until the column header
// line is reached - the enclosing loop is not visible here.
60 if (pb->get() ==
'\n') {
62 for (
int i = 0; i < 2; ++i) {
64 throw std::runtime_error(
"premature end to the VCF file");
66 if (pb->get() !=
'#') {
76 throw std::runtime_error(
"premature end to the VCF file");
// Header line: tabs are tallied in 'num_indents' and a newline ends the line.
// The bodies of both branches are elided from the listing.
81 size_t num_samples = 0;
83 size_t num_indents = 0;
85 char current = pb->get();
86 if (current ==
'\t') {
88 }
else if (current ==
'\n') {
93 throw std::runtime_error(
"premature end to the VCF file");
// Fewer than 8 tabs means fewer than the 9 required fields (up to and including
// 'FORMAT'); every tab beyond the eighth is counted as one sample column.
97 if (num_indents < 8) {
98 throw std::runtime_error(
"expected at least 9 fields in the VCF header line, including 'FORMAT'");
100 num_samples = num_indents - 8;
// Record scan: runs for as long as the reader still has bytes. The statement
// that increments 'expected_rows' is not visible in this listing.
103 size_t expected_rows = 0;
105 while (pb->valid()) {
// Per-record tab counter; shadows the header-line counter declared above.
110 size_t num_indents = 0;
112 char current = pb->get();
113 if (current ==
'\t') {
// Four tabs seen, so the next field is the fifth one on the line
// (presumably ALT, given the error message further down - TODO confirm).
115 if (num_indents == 4) {
116 if (!pb->advance()) {
117 throw std::runtime_error(
"premature end of line for VCF record");
121 }
else if (current ==
'\n') {
122 throw std::runtime_error(
"premature end of line for VCF record");
124 if (!pb->advance()) {
125 throw std::runtime_error(
"premature end of line for VCF record");
// Scan of a single field: a ',' is an error, a tab ends the field, and a
// newline means the record stopped too early. Presumably this section only runs
// when 'expanded' is true - the guarding condition is elided.
131 char current = pb->get();
132 if (current ==
',') {
133 throw std::runtime_error(
"expected a 1:1 mapping of rows to alternative alleles when 'vcf_experiment.expanded = true'");
134 }
else if (current ==
'\t') {
136 }
else if (current ==
'\n') {
137 throw std::runtime_error(
"premature end of line for VCF record");
139 if (!pb->advance()) {
140 throw std::runtime_error(
"premature end of line for VCF record");
// Consume the rest of the record up to its terminating newline. The two
// newline checks below belong to branches whose surrounding structure is not
// visible in this listing.
146 if (pb->get() ==
'\n') {
150 if (!pb->advance()) {
151 throw std::runtime_error(
"premature end of line for VCF record");
160 if (pb->get() ==
'\n') {
162 if (!pb->advance()) {
166 if (!pb->advance()) {
167 throw std::runtime_error(
"premature end of line for VCF record");
// Result is (number of records counted, number of sample columns).
174 return std::make_pair(expected_rows, num_samples);
// Body of validate(path, metadata, options). The signature line (listing line
// 187) is not part of this excerpt; it is only known from the cross-reference
// text at the end of the file.
// Every check below reports failure by throwing std::runtime_error.
//
// Fetch the "vcf_experiment" object from 'metadata.other'.
188 const std::string type_name =
"vcf_experiment";
189 const auto& vcfmap = internal_json::extract_typed_object_from_metadata(metadata.
other, type_name);
// Read the "version" string, parse it, and accept only major version 1.
191 const std::string version_name =
"version";
192 const std::string& vstring = internal_json::extract_string_from_typed_object(vcfmap, version_name, type_name);
193 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(),
true);
194 if (version.major != 1) {
195 throw std::runtime_error(
"unsupported version string '" + vstring +
"'");
// Dimensions reported by the metadata; used below as a (first, second) pair.
199 auto dims = internal_summarized_experiment::extract_dimensions_json(vcfmap,
"vcf_experiment");
// The "expanded" property must exist and must be a JSON boolean. Its value is
// stored in 'exp', whose declaration is elided from this listing.
203 auto eIt = vcfmap.find(
"expanded");
204 if (eIt == vcfmap.end()) {
205 throw std::runtime_error(
"expected a 'vcf_experiment.expanded' property");
207 const auto& val = eIt->second;
208 if (val->type() != millijson::BOOLEAN) {
209 throw std::runtime_error(
"'vcf_experiment.expanded' property should be a JSON boolean");
// NOTE(review): the type() check above guards this downcast; static_cast would
// be the usual spelling if millijson::Boolean derives from the pointee type -
// confirm against millijson.
211 exp =
reinterpret_cast<const millijson::Boolean*
>(val.get())->value();
// Scan "file.vcf.gz" inside 'path' for its actual dimensions. Any
// std::exception from the scan is rethrown as std::runtime_error with the file
// path prepended to the original message.
214 auto ipath = path /
"file.vcf.gz";
215 std::pair<size_t, size_t> obs_dims;
217 obs_dims = internal::scan_vcf_dimensions(ipath, exp, options.
parallel_reads);
218 }
catch (std::exception& e) {
219 throw std::runtime_error(
"failed to parse '" + ipath.string() +
"'; " + std::string(e.what()));
// Observed record count must equal the reported first dimension.
222 if (obs_dims.first != dims.first) {
223 throw std::runtime_error(
"reported 'vcf_experiment.dimensions[0]' does not match the number of records in '" + ipath.string() +
"'");
// Observed sample count must equal the reported second dimension.
225 if (obs_dims.second != dims.second) {
226 throw std::runtime_error(
"reported 'vcf_experiment.dimensions[1]' does not match the number of samples in '" + ipath.string() +
"'");
// Body of height(path, metadata, options). The signature line (listing line
// 236) and the return statement are not part of this excerpt.
// Looks up the "vcf_experiment" object in 'metadata.other' and extracts its
// dimensions; no file is read in the visible lines.
// Presumably the elided return yields dims.first - TODO confirm.
237 const std::string type_name =
"vcf_experiment";
238 const auto& vcfmap = internal_json::extract_typed_object_from_metadata(metadata.
other, type_name);
239 auto dims = internal_summarized_experiment::extract_dimensions_json(vcfmap, type_name);
// Body of dimensions(path, metadata, options). The signature line (listing
// line 249) is not part of this excerpt.
// Looks up the "vcf_experiment" object in 'metadata.other', extracts its
// dimensions, and returns them as a two-element vector { first, second }.
250 const std::string type_name =
"vcf_experiment";
251 const auto& vcfmap = internal_json::extract_typed_object_from_metadata(metadata.
other, type_name);
252 auto dims = internal_summarized_experiment::extract_dimensions_json(vcfmap, type_name);
253 return std::vector<size_t>{ dims.first, dims.second };
std::vector< size_t > dimensions(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition vcf_experiment.hpp:249
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition vcf_experiment.hpp:236
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition vcf_experiment.hpp:187
takane validation functions.
Definition _derived_from.hpp:15
Validation options.
Definition utils_public.hpp:93
bool parallel_reads
Definition utils_public.hpp:97