1#ifndef TAKANE_VCF_EXPERIMENT_HPP 
    2#define TAKANE_VCF_EXPERIMENT_HPP 
    8#include "ritsuko/hdf5/hdf5.hpp" 
   11#include "utils_string.hpp" 
   12#include "utils_summarized_experiment.hpp" 
   13#include "utils_json.hpp" 
   14#include "utils_files.hpp" 
   27namespace vcf_experiment {
 
   35inline std::pair<size_t, size_t> scan_vcf_dimensions(
const std::filesystem::path& path, 
bool expanded, 
bool parallel) {
 
   36    internal_files::check_gzip_signature(path);
 
   37    auto reader = internal_other::open_reader<byteme::GzipFileReader>(path, byteme::GzipFileReaderOptions());
 
   38    auto pb = internal_other::wrap_reader_for_bytes<char>(std::move(reader), parallel);
 
   42        const std::string expected = 
"##fileformat=VCFv";
 
   43        const size_t len = expected.size();
 
   44        bool okay = pb->valid();
 
   46        for (
size_t i = 0; i < len; ++i) {
 
   48                throw std::runtime_error(
"incomplete VCF file signature");
 
   50            if (pb->get() != expected[i]) {
 
   51                throw std::runtime_error(
"incorrect VCF file signature");
 
   59        if (pb->get() == 
'\n') {
 
   61            for (
int i = 0; i < 2; ++i) {
 
   63                    throw std::runtime_error(
"premature end to the VCF file");
 
   65                if (pb->get() != 
'#') {
 
   75            throw std::runtime_error(
"premature end to the VCF file");
 
   80    size_t num_samples = 0;
 
   82        size_t num_indents = 0;
 
   84            char current = pb->get();
 
   85            if (current == 
'\t') {
 
   87            } 
else if (current == 
'\n') {
 
   92                throw std::runtime_error(
"premature end to the VCF file");
 
   96        if (num_indents < 8) {
 
   97            throw std::runtime_error(
"expected at least 9 fields in the VCF header line, including 'FORMAT'");
 
   99        num_samples = num_indents - 8;
 
  102    size_t expected_rows = 0;
 
  104        while (pb->valid()) {
 
  109            size_t num_indents = 0;
 
  111                char current = pb->get();
 
  112                if (current == 
'\t') {
 
  114                    if (num_indents == 4) { 
 
  115                        if (!pb->advance()) { 
 
  116                            throw std::runtime_error(
"premature end of line for VCF record");
 
  120                } 
else if (current == 
'\n') {
 
  121                    throw std::runtime_error(
"premature end of line for VCF record");
 
  123                if (!pb->advance()) {
 
  124                    throw std::runtime_error(
"premature end of line for VCF record");
 
  130                char current = pb->get();
 
  131                if (current == 
',') {
 
  132                    throw std::runtime_error(
"expected a 1:1 mapping of rows to alternative alleles when 'vcf_experiment.expanded = true'");
 
  133                } 
else if (current == 
'\t') {
 
  135                } 
else if (current == 
'\n') {
 
  136                    throw std::runtime_error(
"premature end of line for VCF record");
 
  138                if (!pb->advance()) {
 
  139                    throw std::runtime_error(
"premature end of line for VCF record");
 
  145                if (pb->get() == 
'\n') {
 
  149                    if (!pb->advance()) {
 
  150                        throw std::runtime_error(
"premature end of line for VCF record");
 
  159                if (pb->get() == 
'\n') {
 
  161                    if (!pb->advance()) {
 
  165                    if (!pb->advance()) {
 
  166                        throw std::runtime_error(
"premature end of line for VCF record");
 
  173    return std::make_pair(expected_rows, num_samples);
 
  187    const std::string type_name = 
"vcf_experiment"; 
 
  188    const auto& vcfmap = internal_json::extract_typed_object_from_metadata(metadata.
other, type_name);
 
  190    const std::string version_name = 
"version"; 
 
  191    const std::string& vstring = internal_json::extract_string_from_typed_object(vcfmap, version_name, type_name);
 
  192    auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(),  
true);
 
  193    if (version.major != 1) {
 
  194        throw std::runtime_error(
"unsupported version string '" + vstring + 
"'");
 
  198    auto dims = internal_summarized_experiment::extract_dimensions_json(vcfmap, 
"vcf_experiment");
 
  202        auto eIt = vcfmap.find(
"expanded");
 
  203        if (eIt == vcfmap.end()) {
 
  204            throw std::runtime_error(
"expected a 'vcf_experiment.expanded' property");
 
  206        const auto& val = eIt->second;
 
  207        if (val->type() != millijson::BOOLEAN) {
 
  208            throw std::runtime_error(
"'vcf_experiment.expanded' property should be a JSON boolean");
 
  210        exp = 
reinterpret_cast<const millijson::Boolean*
>(val.get())->value();
 
  213    auto ipath = path / 
"file.vcf.gz";
 
  214    std::pair<size_t, size_t> obs_dims;
 
  216        obs_dims = internal::scan_vcf_dimensions(ipath, exp, options.
parallel_reads);
 
  217    } 
catch (std::exception& e) {
 
  218        throw std::runtime_error(
"failed to parse '" + ipath.string() + 
"'; " + std::string(e.what()));
 
  221    if (obs_dims.first != dims.first) {
 
  222        throw std::runtime_error(
"reported 'vcf_experiment.dimensions[0]' does not match the number of records in '" + ipath.string() + 
"'");
 
  224    if (obs_dims.second != dims.second) {
 
  225        throw std::runtime_error(
"reported 'vcf_experiment.dimensions[1]' does not match the number of samples in '" + ipath.string() + 
"'");
 
 
  236    const std::string type_name = 
"vcf_experiment"; 
 
  237    const auto& vcfmap = internal_json::extract_typed_object_from_metadata(metadata.
other, type_name);
 
  238    auto dims = internal_summarized_experiment::extract_dimensions_json(vcfmap, type_name);
 
 
  249    const std::string type_name = 
"vcf_experiment"; 
 
  250    const auto& vcfmap = internal_json::extract_typed_object_from_metadata(metadata.
other, type_name);
 
  251    auto dims = internal_summarized_experiment::extract_dimensions_json(vcfmap, type_name);
 
  252    return std::vector<size_t>{ dims.first, dims.second };
 
 
std::vector< size_t > dimensions(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition vcf_experiment.hpp:248
size_t height(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition vcf_experiment.hpp:235
void validate(const std::filesystem::path &path, const ObjectMetadata &metadata, Options &options)
Definition vcf_experiment.hpp:186
takane validation functions.
Definition _derived_from.hpp:15
Validation options.
Definition utils_public.hpp:94
bool parallel_reads
Definition utils_public.hpp:98