1#ifndef TAKANE_VCF_EXPERIMENT_HPP
2#define TAKANE_VCF_EXPERIMENT_HPP
8#include "ritsuko/hdf5/hdf5.hpp"
11#include "utils_string.hpp"
12#include "utils_summarized_experiment.hpp"
13#include "utils_json.hpp"
14#include "utils_files.hpp"
27namespace vcf_experiment {
// Scans a Gzip-compressed VCF file and returns the pair
// (expected_rows, num_samples), as built by the final return statement.
//
// Parameters:
//   path     - file to scan; its Gzip signature is checked before it is opened.
//   expanded - not referenced by any statement visible in this listing.
//              NOTE(review): presumably gates the check that rejects a comma in a
//              record (see the 1:1 mapping error message below) - confirm against
//              the complete file.
//   parallel - forwarded to wrap_reader_for_bytes.
//
// Throws std::runtime_error for a bad or truncated file signature, a truncated
// header, fewer than 9 header fields, a truncated record, or a comma found by the
// alternative-allele check.
//
// NOTE(review): this listing carries the original line numbers fused onto each
// statement (35, 36, ...). Gaps in that numbering show that closing braces, loop
// headers and some conditions and increments are missing from this view, so the
// comments below describe only the statements that can actually be seen.
35inline std::pair<size_t, size_t> scan_vcf_dimensions(
const std::filesystem::path& path,
bool expanded,
bool parallel) {
// Reject non-Gzip input up front, then open a Gzip reader and wrap it so the
// contents can be consumed one char at a time through pb.
36 internal_files::check_gzip_signature(path);
37 auto reader = internal_other::open_reader<byteme::GzipFileReader>(path, byteme::GzipFileReaderOptions());
38 auto pb = internal_other::wrap_reader_for_bytes<char>(std::move(reader), parallel);
// File signature: every char of this prefix is compared against the stream.
42 const std::string expected =
"##fileformat=VCFv";
43 const size_t len = expected.size();
44 bool okay = pb->valid();
46 for (
size_t i = 0; i < len; ++i) {
// NOTE(review): the condition guarding this throw is not visible; presumably it
// fires when okay is false, i.e. the stream ended inside the signature.
48 throw std::runtime_error(
"incomplete VCF file signature");
50 if (pb->get() != expected[i]) {
51 throw std::runtime_error(
"incorrect VCF file signature");
// Header scan: the visible checks look for a newline and then inspect up to two
// following characters for the hash sign.
// NOTE(review): presumably this skips the double-hash meta lines until the
// single-hash column header is reached; the enclosing loop is not visible.
59 if (pb->get() ==
'\n') {
61 for (
int i = 0; i < 2; ++i) {
63 throw std::runtime_error(
"premature end to the VCF file");
65 if (pb->get() !=
'#') {
75 throw std::runtime_error(
"premature end to the VCF file");
// Column header line: tabs are tallied in num_indents and a newline is handled
// separately (the bodies of both branches are not visible here).
80 size_t num_samples = 0;
82 size_t num_indents = 0;
84 char current = pb->get();
85 if (current ==
'\t') {
87 }
else if (current ==
'\n') {
92 throw std::runtime_error(
"premature end to the VCF file");
// Fewer than 8 tabs means fewer than 9 fields, which is rejected; whatever
// exceeds those 8 tabs is taken as the sample count.
96 if (num_indents < 8) {
97 throw std::runtime_error(
"expected at least 9 fields in the VCF header line, including 'FORMAT'");
99 num_samples = num_indents - 8;
// Record scan: runs for as long as the reader still has data.
// NOTE(review): the statement that increments expected_rows is not visible.
102 size_t expected_rows = 0;
104 while (pb->valid()) {
// This num_indents shadows the header-line counter declared above. On reaching
// the 4th tab the reader is advanced once more; a newline or end of data before
// that point is reported as a truncated record.
// NOTE(review): presumably this positions the reader at the start of the
// alternative-allele field - confirm.
109 size_t num_indents = 0;
111 char current = pb->get();
112 if (current ==
'\t') {
114 if (num_indents == 4) {
115 if (!pb->advance()) {
116 throw std::runtime_error(
"premature end of line for VCF record");
120 }
else if (current ==
'\n') {
121 throw std::runtime_error(
"premature end of line for VCF record");
123 if (!pb->advance()) {
124 throw std::runtime_error(
"premature end of line for VCF record");
// Field check: a comma is rejected with the 1:1 mapping error, a tab has its own
// branch (body not visible), and a newline or end of data is a truncated record.
130 char current = pb->get();
131 if (current ==
',') {
132 throw std::runtime_error(
"expected a 1:1 mapping of rows to alternative alleles when 'vcf_experiment.expanded = true'");
133 }
else if (current ==
'\t') {
135 }
else if (current ==
'\n') {
136 throw std::runtime_error(
"premature end of line for VCF record");
138 if (!pb->advance()) {
139 throw std::runtime_error(
"premature end of line for VCF record");
// Look for the newline that terminates the record; running out of data before it
// is a truncated record.
145 if (pb->get() ==
'\n') {
149 if (!pb->advance()) {
150 throw std::runtime_error(
"premature end of line for VCF record");
// A second newline search follows.
// NOTE(review): the numbering jumps from 150 to 159 here, and the body of the
// first advance check below is not visible - confirm how this differs from the
// search above.
159 if (pb->get() ==
'\n') {
161 if (!pb->advance()) {
165 if (!pb->advance()) {
166 throw std::runtime_error(
"premature end of line for VCF record");
// First element is the record tally, second is the sample count from the header.
173 return std::make_pair(expected_rows, num_samples);
// NOTE(review): the enclosing function signature is not visible in this listing.
// The embedded numbering (187-225) together with the cross-reference footer
// (validate, defined at line 186) suggests this is the body of validate(path,
// metadata, options) - confirm against the complete file.
//
// Visible behaviour: reads the vcf_experiment object from the metadata, checks its
// version and its expanded flag, scans file.vcf.gz under path, and throws
// std::runtime_error when the observed dimensions differ from the declared ones.

// Pull the typed object for this type out of metadata.other.
187 const std::string type_name =
"vcf_experiment";
188 const auto& vcfmap = internal_json::extract_typed_object_from_metadata(metadata.
other, type_name);
// Version gate: any major version other than 1 is rejected.
190 const std::string version_name =
"version";
191 const std::string& vstring = internal_json::extract_string_from_typed_object(vcfmap, version_name, type_name);
192 auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(),
true);
193 if (version.major != 1) {
194 throw std::runtime_error(
"unsupported version string '" + vstring +
"'");
// Declared dimensions, compared against the scanned file further down.
198 auto dims = internal_summarized_experiment::extract_dimensions_json(vcfmap,
"vcf_experiment");
// The expanded property is mandatory and must be a JSON boolean.
202 auto eIt = vcfmap.find(
"expanded");
203 if (eIt == vcfmap.end()) {
204 throw std::runtime_error(
"expected a 'vcf_experiment.expanded' property");
206 const auto& val = eIt->second;
207 if (val->type() != millijson::BOOLEAN) {
208 throw std::runtime_error(
"'vcf_experiment.expanded' property should be a JSON boolean");
// The declaration of exp is not visible in this listing.
// NOTE(review): the downcast relies on the type() check above; if Boolean derives
// from the pointee type, static_cast would be the conventional choice - confirm.
210 exp =
reinterpret_cast<const millijson::Boolean*
>(val.get())->value();
// Scan the VCF file that sits next to the metadata. The matching try line is not
// visible, but the catch below rethrows any failure with the file path prepended.
213 auto ipath = path /
"file.vcf.gz";
214 std::pair<size_t, size_t> obs_dims;
216 obs_dims = internal::scan_vcf_dimensions(ipath, exp, options.
parallel_reads);
217 }
catch (std::exception& e) {
218 throw std::runtime_error(
"failed to parse '" + ipath.string() +
"'; " + std::string(e.what()));
// Observed (records, samples) must equal the declared dimensions.
221 if (obs_dims.first != dims.first) {
222 throw std::runtime_error(
"reported 'vcf_experiment.dimensions[0]' does not match the number of records in '" + ipath.string() +
"'");
224 if (obs_dims.second != dims.second) {
225 throw std::runtime_error(
"reported 'vcf_experiment.dimensions[1]' does not match the number of samples in '" + ipath.string() +
"'");
// NOTE(review): the enclosing signature and the return statement are not visible.
// The embedded numbering (236-238) together with the cross-reference footer
// (height, defined at line 235, returning size_t) suggests this is the body of
// height() - confirm against the complete file.
//
// Visible behaviour: looks up the vcf_experiment object in metadata.other and
// extracts its declared dimensions into dims.
236 const std::string type_name =
"vcf_experiment";
237 const auto& vcfmap = internal_json::extract_typed_object_from_metadata(metadata.
other, type_name);
238 auto dims = internal_summarized_experiment::extract_dimensions_json(vcfmap, type_name);
// NOTE(review): the enclosing signature is not visible. The embedded numbering
// (249-252) together with the cross-reference footer (dimensions, defined at line
// 248) suggests this is the body of dimensions() - confirm against the complete
// file.
//
// Visible behaviour: looks up the vcf_experiment object in metadata.other,
// extracts its declared dimensions, and returns them as a two-element vector in
// the order (first, second). No file is read by the visible statements.
249 const std::string type_name =
"vcf_experiment";
250 const auto& vcfmap = internal_json::extract_typed_object_from_metadata(metadata.
other, type_name);
251 auto dims = internal_summarized_experiment::extract_dimensions_json(vcfmap, type_name);
252 return std::vector<size_t>{ dims.first, dims.second };
std::vector< size_t > dimensions(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition vcf_experiment.hpp:248
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition vcf_experiment.hpp:235
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition vcf_experiment.hpp:186
takane validation functions.
Definition _derived_from.hpp:15
Validation options.
Definition utils_public.hpp:94
bool parallel_reads
Definition utils_public.hpp:98