diff options
-rw-r--r-- | CMakeLists.txt | 1 | ||||
-rw-r--r-- | example.cc | 8 | ||||
-rw-r--r-- | src/istream/buffer.cc | 2 | ||||
-rw-r--r-- | src/util/query.cc | 57 | ||||
-rw-r--r-- | src/util/query.h | 12 |
5 files changed, 77 insertions, 3 deletions
// src/util/query.cc
#include <fcntl.h>

#include <cstddef>
#include <cstdio>
#include <cstring>

#include <memory>
#include <stdexcept>
#include <string>

namespace dictzip {

namespace {

/// Returns true when the `length` bytes starting at `data` begin with `prefix`.
/// The comparison is bounded by `length`, so it never reads past the given range.
bool has_prefix(const char* data, std::size_t length, const std::string& prefix) {
	return length >= prefix.size()
	    && std::memcmp(data, prefix.data(), prefix.size()) == 0;
}

}

/// @brief  Returns the first line of the file at `path` that begins with `substring`.
///
/// The file is scanned in fixed-size chunks. Lines are delimited by '\n'; the
/// returned string does not contain the delimiter. Every line is a candidate,
/// including the first line of the file, a last line without trailing newline
/// and lines that span more than one chunk.
///
/// @param path       File to search.
/// @param substring  Prefix the wanted line has to start with.
/// @return The matching line without its trailing newline.
/// @throws std::runtime_error if the file cannot be opened or read, or if no
///         line starts with `substring`.
std::string get_line_starting_with(
	const std::string& path, const std::string& substring) {
	static constexpr std::size_t BUFFER_SIZE = 16 * 1024;

	// Owning handle: the stream is closed on every exit path, including throws.
	const std::unique_ptr<std::FILE, int (*)(std::FILE*)> file(
		std::fopen(path.c_str(), "r"), &std::fclose);

	if ( !file ) {
		throw std::runtime_error("Could not open file: " + path);
	}

#ifdef POSIX_FADV_SEQUENTIAL
	// Pure read-ahead hint, so its result is deliberately ignored. The named
	// constant is used because the numeric values are platform specific.
	static_cast<void>(
		posix_fadvise(fileno(file.get()), 0, 0, POSIX_FADV_SEQUENTIAL));
#endif

	char buffer[BUFFER_SIZE];

	// Beginning of a line whose terminating newline was not part of the chunk
	// it started in. Non-empty exactly while such a line is being assembled.
	std::string partial_line;

	while ( const std::size_t bytes_read =
	            std::fread(buffer, sizeof(char), BUFFER_SIZE, file.get()) ) {
		const char* const chunk_end  = buffer + bytes_read;
		const char*       line_start = buffer;

		while ( line_start != chunk_end ) {
			const char* const newline = static_cast<const char*>(std::memchr(
				line_start, '\n', static_cast<std::size_t>(chunk_end - line_start)));

			if ( newline == nullptr ) {
				// Line continues in the next chunk (or is the unterminated last line).
				partial_line.append(line_start, chunk_end);
				break;
			}

			if ( partial_line.empty() ) {
				// Common case: the whole line lies inside the current chunk.
				const std::size_t line_length =
					static_cast<std::size_t>(newline - line_start);

				if ( has_prefix(line_start, line_length, substring) ) {
					return std::string(line_start, newline);
				}
			} else {
				partial_line.append(line_start, newline);

				if ( has_prefix(partial_line.data(), partial_line.size(), substring) ) {
					return partial_line;
				}

				partial_line.clear();
			}

			line_start = newline + 1;
		}
	}

	if ( std::ferror(file.get()) ) {
		throw std::runtime_error("Could not read file: " + path);
	}

	// Last line of a file that does not end in a newline.
	if ( !partial_line.empty()
	  && has_prefix(partial_line.data(), partial_line.size(), substring) ) {
		return partial_line;
	}

	throw std::runtime_error("No match found");
}

/// @brief  Extracts the text between the first and the last tab of `line`.
///
/// @param line  Tab separated line, e.g. as returned by get_line_starting_with.
/// @return The characters strictly between the first and the last '\t'.
/// @throws std::runtime_error if `line` contains fewer than two tabs.
std::string get_encoded_offset(const std::string& line) {
	const std::size_t first_tab = line.find('\t');
	const std::size_t last_tab  = line.rfind('\t');

	// Without two distinct tabs there is no middle field to return.
	if ( first_tab == std::string::npos || first_tab == last_tab ) {
		throw std::runtime_error("Malformed index line: missing offset field");
	}

	return line.substr(first_tab + 1, last_tab - (first_tab + 1));
}

/// @brief  Extracts the text following the last tab of `line`.
///
/// @param line  Tab separated line, e.g. as returned by get_line_starting_with.
/// @return The characters after the last '\t' up to the end of `line`.
/// @throws std::runtime_error if `line` contains no tab.
std::string get_encoded_length(const std::string& line) {
	const std::size_t last_tab = line.rfind('\t');

	if ( last_tab == std::string::npos ) {
		throw std::runtime_error("Malformed index line: missing length field");
	}

	return line.substr(last_tab + 1);
}

}