From b93f4fea59dfab2c8cb06d00b32a567838982bdf Mon Sep 17 00:00:00 2001 From: Adrian Kummerlaender Date: Fri, 6 Oct 2017 16:46:31 +0200 Subject: Implement primitive dictzip index queries --- CMakeLists.txt | 1 + example.cc | 8 ++++++-- src/istream/buffer.cc | 2 +- src/util/query.cc | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/util/query.h | 12 +++++++++++ 5 files changed, 77 insertions(+), 3 deletions(-) create mode 100644 src/util/query.cc create mode 100644 src/util/query.h diff --git a/CMakeLists.txt b/CMakeLists.txt index f67bc42..45d4b67 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,7 @@ add_library( DictzipQuery SHARED src/util/base64.cc + src/util/query.cc src/istream/stream.cc src/istream/buffer.cc ) diff --git a/example.cc b/example.cc index f2a091d..f1fbc8c 100644 --- a/example.cc +++ b/example.cc @@ -1,5 +1,6 @@ #include "istream/stream.h" #include "util/base64.h" +#include "util/query.h" #include #include @@ -17,10 +18,13 @@ std::string get(const std::string& path, std::size_t offset, std::size_t length) } int main() { + // Get location of _Accession_ + const std::string line = dictzip::get_line_starting_with("gcide.index", "Accession"); + // Decode location of _Accession_ // `gcide.index[1089]: "Accession 8Aw Wt" - const std::size_t offset = dictzip::base64_decode("8Aw"); - const std::size_t length = dictzip::base64_decode("Wt"); + const std::size_t offset = dictzip::base64_decode(dictzip::get_encoded_offset(line)); + const std::size_t length = dictzip::base64_decode(dictzip::get_encoded_length(line)); // Print the GCIDE definition of _Accession_ std::cout << get("gcide.dict.dz", offset, length) << std::endl; diff --git a/src/istream/buffer.cc b/src/istream/buffer.cc index 26697f4..2a195e0 100644 --- a/src/istream/buffer.cc +++ b/src/istream/buffer.cc @@ -20,7 +20,7 @@ void IstreamBuf::readChunk(long n) { std::fseek(dictzip_file_, data_offset_ + chunkN.offset, SEEK_SET); - if ( fread(zBuf, 1, chunkN.size, dictzip_file_) 
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <memory>
#include <stdexcept>
#include <string>

#include <fcntl.h>

namespace dictzip {

namespace {

/// Deleter that lets std::unique_ptr own a C stdio stream.
struct FileCloser {
    void operator()(std::FILE* file) const noexcept {
        std::fclose(file);
    }
};

/// Returns true if the `length` bytes starting at `line` begin with `prefix`.
/// Never reads more than `length` bytes, so `line` need not be NUL-terminated.
bool has_prefix(const char* line, std::size_t length, const std::string& prefix) {
    return length >= prefix.size()
        && std::memcmp(line, prefix.data(), prefix.size()) == 0;
}

}

/// Scans the text file at `path` and returns the first line that begins with
/// `substring`, without its terminating '\n'.
///
/// Every line is a candidate, including the very first line of the file and a
/// final line that is not newline-terminated. Lines that straddle the internal
/// read-buffer boundary are reassembled before they are compared.
///
/// @param path      file to scan
/// @param substring prefix the returned line has to start with
/// @return the matching line (no trailing newline)
/// @throws std::runtime_error if the file cannot be opened or no line matches
std::string get_line_starting_with(
    const std::string& path, const std::string& substring) {
    constexpr std::size_t BUFFER_SIZE = 16*1024;

    // Owning handle: the stream is closed on every return and throw path.
    const std::unique_ptr<std::FILE, FileCloser> file(std::fopen(path.c_str(), "r"));

    if ( !file ) {
        throw std::runtime_error("Could not open file: " + path);
    }

#if defined(POSIX_FADV_SEQUENTIAL)
    // Purely advisory read-ahead hint. Use the named constant: its numeric
    // value is platform specific (the literal 1 is POSIX_FADV_RANDOM on Linux).
    posix_fadvise(fileno(file.get()), 0, 0, POSIX_FADV_SEQUENTIAL);
#endif

    char buffer[BUFFER_SIZE];

    // Beginning of a line whose terminating newline has not been read yet.
    // Kept in a string because `buffer` is overwritten by the next fread.
    std::string carry;

    while ( const std::size_t bytes_read = std::fread(buffer, sizeof(char), BUFFER_SIZE, file.get()) ) {
        const char* const chunk_end  = buffer + bytes_read;
        const char*       line_start = buffer;

        while ( line_start != chunk_end ) {
            const char* const newline = static_cast<const char*>(
                std::memchr(line_start, '\n', static_cast<std::size_t>(chunk_end - line_start)));

            if ( newline == nullptr ) {
                break;
            }

            const std::size_t length = static_cast<std::size_t>(newline - line_start);

            if ( carry.empty() ) {
                // Line lies completely inside the current chunk.
                if ( has_prefix(line_start, length, substring) ) {
                    return std::string(line_start, length);
                }
            } else {
                // Line started in an earlier chunk: complete it, then test it.
                carry.append(line_start, length);

                if ( has_prefix(carry.data(), carry.size(), substring) ) {
                    return carry;
                }

                carry.clear();
            }

            line_start = newline + 1;
        }

        // Remember the unterminated tail for the next chunk (or end of file).
        carry.append(line_start, static_cast<std::size_t>(chunk_end - line_start));
    }

    // Last line of a file that does not end in '\n'.
    if ( !carry.empty() && has_prefix(carry.data(), carry.size(), substring) ) {
        return carry;
    }

    throw std::runtime_error("No match found");
}

/// Extracts the encoded offset of an index line, i.e. the text between the
/// first and the last tab character.
///
/// @param line index line of the form `<word>\t<offset>\t<length>`
/// @return the text between the first and the last tab
/// @throws std::runtime_error if `line` contains fewer than two tabs
std::string get_encoded_offset(const std::string& line) {
    const std::size_t start = line.find('\t');
    const std::size_t end   = line.rfind('\t');

    // Without two distinct tabs there is no middle field; the unchecked npos
    // arithmetic would otherwise silently hand back unrelated text.
    if ( start == std::string::npos || start == end ) {
        throw std::runtime_error("Malformed index line");
    }

    return line.substr(start + 1, end - (start + 1));
}

/// Extracts the encoded length of an index line, i.e. everything after the
/// last tab character.
///
/// @param line index line of the form `<word>\t<offset>\t<length>`
/// @return the text following the last tab
/// @throws std::runtime_error if `line` contains no tab
std::string get_encoded_length(const std::string& line) {
    const std::size_t start = line.rfind('\t');

    if ( start == std::string::npos ) {
        throw std::runtime_error("Malformed index line");
    }

    return line.substr(start + 1);
}

}
std::string& line); + +} -- cgit v1.2.3