1#ifndef TAKANE_VCF_EXPERIMENT_HPP
2#define TAKANE_VCF_EXPERIMENT_HPP
8#include "ritsuko/hdf5/hdf5.hpp"
11#include "utils_string.hpp"
12#include "utils_summarized_experiment.hpp"
13#include "utils_json.hpp"
14#include "utils_files.hpp"
27namespace vcf_experiment {
/**
 * Scan a Gzip-compressed VCF file byte by byte and report its dimensions.
 *
 * @tparam parallel_ Selects `byteme::PerByteParallel<>` instead of `byteme::PerByte<>` as the byte source.
 * @param path Path to the file; it is passed to `check_gzip_signature()` before being opened with a `byteme::GzipFileReader`.
 * @param expanded Not read by any statement visible in this listing. Presumably it gates the comma check on
 * alternative alleles (the error text mentions 'vcf_experiment.expanded = true') - TODO confirm.
 * @return Pair of (`expected_rows`, `num_samples`); the caller compares these against the reported
 * number of records and number of samples respectively.
 * @throws std::runtime_error on a wrong or truncated signature, a truncated file, or a malformed header/record line.
 *
 * NOTE(review): this listing is missing lines (loop headers, end-of-input guards, closing braces),
 * so the comments below describe only what the visible statements establish.
 */
35template<
bool parallel_>
36std::pair<size_t, size_t> scan_vcf_dimensions(
const std::filesystem::path& path,
bool expanded) {
37 internal_files::check_gzip_signature(path);
38 auto reader = internal_other::open_reader<byteme::GzipFileReader>(path);
// The per-byte reader type is chosen at compile time from parallel_.
39 typename std::conditional<parallel_, byteme::PerByteParallel<>, byteme::PerByte<> >::type pb(&reader);
// The file must start with this literal prefix; it is compared one byte at a time.
43 const std::string expected =
"##fileformat=VCFv";
44 const size_t len = expected.size();
45 bool okay = pb.valid();
47 for (
size_t i = 0; i < len; ++i) {
49 throw std::runtime_error(
"incomplete VCF file signature");
51 if (pb.get() != expected[i]) {
52 throw std::runtime_error(
"incorrect VCF file signature");
// A newline is tested for here; the enclosing loop and the action taken are not visible.
60 if (pb.get() ==
'\n') {
// Each of the next two bytes is tested against '#'.
62 for (
int i = 0; i < 2; ++i) {
64 throw std::runtime_error(
"premature end to the VCF file");
66 if (pb.get() !=
'#') {
76 throw std::runtime_error(
"premature end to the VCF file");
// The sample count is derived from the number of tab separators on the header line.
81 size_t num_samples = 0;
83 size_t num_indents = 0;
85 char current = pb.get();
86 if (current ==
'\t') {
88 }
else if (current ==
'\n') {
93 throw std::runtime_error(
"premature end to the VCF file");
// Fewer than 8 tabs means fewer than the 9 fields the error message requires;
// every tab beyond the eighth is counted as one sample.
97 if (num_indents < 8) {
98 throw std::runtime_error(
"expected at least 9 fields in the VCF header line, including 'FORMAT'");
100 num_samples = num_indents - 8;
// Record counter; returned as the first element of the result.
103 size_t expected_rows = 0;
// Tab counter for record lines (a separate declaration from the header-line counter above).
110 size_t num_indents = 0;
112 char current = pb.get();
113 if (current ==
'\t') {
// Reaching the fourth tab of a record triggers extra handling (not fully visible here).
// NOTE(review): presumably the fourth tab marks the start of the ALT column - confirm against the VCF spec.
115 if (num_indents == 4) {
117 throw std::runtime_error(
"premature end of line for VCF record");
121 }
else if (current ==
'\n') {
122 throw std::runtime_error(
"premature end of line for VCF record");
125 throw std::runtime_error(
"premature end of line for VCF record");
// Within this field a comma is rejected, a tab and a newline are handled separately;
// per the error text, a comma would break the 1:1 mapping of rows to alternative alleles.
131 char current = pb.get();
132 if (current ==
',') {
133 throw std::runtime_error(
"expected a 1:1 mapping of rows to alternative alleles when 'vcf_experiment.expanded = true'");
134 }
else if (current ==
'\t') {
136 }
else if (current ==
'\n') {
137 throw std::runtime_error(
"premature end of line for VCF record");
140 throw std::runtime_error(
"premature end of line for VCF record");
// Newline tests; the surrounding loops that consume the rest of the record are not visible.
146 if (pb.get() ==
'\n') {
151 throw std::runtime_error(
"premature end of line for VCF record");
160 if (pb.get() ==
'\n') {
167 throw std::runtime_error(
"premature end of line for VCF record");
174 return std::make_pair(expected_rows, num_samples);
// Body of validate(): checks the "vcf_experiment" metadata and cross-checks it against the VCF file.
// NOTE(review): the signature line and several guards/closing braces are missing from this listing.
// Fetch the "vcf_experiment" object from metadata.other.
188 const auto& vcfmap = internal_json::extract_typed_object_from_metadata(metadata.
other,
"vcf_experiment");
// Only major version 1 is accepted.
190 const std::string& vstring = internal_json::extract_string_from_typed_object(vcfmap,
"version",
"vcf_experiment");
191 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(),
true);
192 if (version.major != 1) {
193 throw std::runtime_error(
"unsupported version string '" + vstring +
"'");
// Dimensions as reported in the metadata; compared with the scanned file further down.
197 auto dims = internal_summarized_experiment::extract_dimensions_json(vcfmap,
"vcf_experiment");
// "expanded" is mandatory and must be a JSON boolean.
201 auto eIt = vcfmap.find(
"expanded");
202 if (eIt == vcfmap.end()) {
203 throw std::runtime_error(
"expected a 'vcf_experiment.expanded' property");
205 const auto& val = eIt->second;
206 if (val->type() != millijson::BOOLEAN) {
207 throw std::runtime_error(
"'vcf_experiment.expanded' property should be a JSON boolean");
// NOTE(review): reinterpret_cast performs the downcast after the type() check; static_cast would be
// the conventional choice if millijson::Boolean derives from the type held by val - confirm.
209 exp =
reinterpret_cast<const millijson::Boolean*
>(val.get())->value;
// The VCF payload is read from <path>/file.vcf.gz.
212 auto ipath = path /
"file.vcf.gz";
213 std::pair<size_t, size_t> obs_dims;
// Same scan with either reader type; the selecting condition is not visible here
// (presumably options.parallel_reads - TODO confirm).
216 obs_dims = internal::scan_vcf_dimensions<true>(ipath, exp);
218 obs_dims = internal::scan_vcf_dimensions<false>(ipath, exp);
220 }
// Re-throw with the file path prepended for context.
// NOTE(review): the exception is caught by non-const reference; const std::exception& would suffice.
catch (std::exception& e) {
221 throw std::runtime_error(
"failed to parse '" + ipath.string() +
"'; " + std::string(e.what()));
// The scanned (records, samples) must equal the reported dimensions.
224 if (obs_dims.first != dims.first) {
225 throw std::runtime_error(
"reported 'vcf_experiment.dimensions[0]' does not match the number of records in '" + ipath.string() +
"'");
227 if (obs_dims.second != dims.second) {
228 throw std::runtime_error(
"reported 'vcf_experiment.dimensions[1]' does not match the number of samples in '" + ipath.string() +
"'");
// Body of height(): reads the reported dimensions from the "vcf_experiment" metadata object.
// NOTE(review): the signature and the return statement are missing from this listing;
// presumably dims.first is returned - TODO confirm.
239 const auto& vcfmap = internal_json::extract_typed_object_from_metadata(metadata.
other,
"vcf_experiment");
240 auto dims = internal_summarized_experiment::extract_dimensions_json(vcfmap,
"vcf_experiment");
// Body of dimensions(): reads the reported dimensions from the "vcf_experiment" metadata object
// and returns them as a two-element vector { dims.first, dims.second }.
// NOTE(review): the signature line is missing from this listing.
251 const auto& vcfmap = internal_json::extract_typed_object_from_metadata(metadata.
other,
"vcf_experiment");
252 auto dims = internal_summarized_experiment::extract_dimensions_json(vcfmap,
"vcf_experiment");
253 return std::vector<size_t>{ dims.first, dims.second };
std::vector< size_t > dimensions(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition vcf_experiment.hpp:250
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition vcf_experiment.hpp:238
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition vcf_experiment.hpp:187
takane validation functions.
Definition _derived_from.hpp:15
Validation options.
Definition utils_public.hpp:94
bool parallel_reads
Definition utils_public.hpp:98