// Reconstructed from commit b93f4fea59dfab2c8cb06d00b32a567838982bdf
//   Author:  Adrian Kummerlaender
//   Date:    Fri, 6 Oct 2017 16:46:31 +0200
//   Subject: Implement primitive dictzip index queries
//
// The commit touches three files:
//   * src/istream/buffer.cc - IstreamBuf::readChunk now calls `std::fread`
//                             instead of the unqualified `fread`.
//   * src/util/query.h      - `#pragma once`, `#include <string>` and the
//                             declarations of the three functions below
//                             inside `namespace dictzip`.
//   * src/util/query.cc     - the implementation that follows.

// Pull in the project header when it is reachable so that the declarations
// and definitions are checked against each other; the definitions below are
// self-contained otherwise.
#if defined(__has_include)
#  if __has_include("query.h")
#    include "query.h"
#  endif
#endif

#include <fcntl.h>

#include <cstddef>
#include <cstdio>
#include <cstring>
#include <memory>
#include <stdexcept>
#include <string>

namespace dictzip {

namespace {

/// Deleter that lets std::unique_ptr own a C stdio stream.
struct FileCloser {
	void operator()(std::FILE* stream) const noexcept {
		std::fclose(stream);
	}
};

/// Returns true when `text` begins with `prefix` (an empty prefix always matches).
bool starts_with(const std::string& text, const std::string& prefix) {
	return text.compare(0, prefix.size(), prefix) == 0;
}

}  // namespace

/// Scans the file at `path` line by line and returns the first line that
/// begins with `substring`.
///
/// @param path      Path of the text file to search.
/// @param substring Prefix the returned line has to start with.
/// @return The matching line without its terminating '\n'.
/// @throws std::runtime_error if the file cannot be opened or no line matches.
///
/// Every line is a candidate: the first line of the file, lines that straddle
/// a read-chunk boundary and a final line that lacks a trailing newline.
std::string get_line_starting_with(
	const std::string& path, const std::string& substring) {
	static constexpr std::size_t BUFFER_SIZE = 16*1024;

	const std::unique_ptr<std::FILE, FileCloser> file{std::fopen(path.c_str(), "r")};

	if ( !file ) {
		throw std::runtime_error("Could not open file for query.");
	}

#ifdef POSIX_FADV_SEQUENTIAL
	// Purely advisory read-ahead hint, so its result is intentionally ignored.
	// The named constant is used because the raw value 1 is not "sequential"
	// on every platform.
	posix_fadvise(fileno(file.get()), 0, 0, POSIX_FADV_SEQUENTIAL);
#endif

	char buffer[BUFFER_SIZE];
	// Accumulates the current line; it survives across chunks so that a line
	// cut in half by a chunk boundary is still compared as a whole.
	std::string line;

	while ( const std::size_t bytes_read = std::fread(buffer, sizeof(char), BUFFER_SIZE, file.get()) ) {
		const char* cursor = buffer;
		const char* const chunk_end = buffer + bytes_read;

		while ( cursor != chunk_end ) {
			const std::size_t remaining = static_cast<std::size_t>(chunk_end - cursor);
			const char* const newline =
				static_cast<const char*>(std::memchr(cursor, '\n', remaining));

			if ( newline == nullptr ) {
				// Partial line: keep it and continue with the next chunk.
				line.append(cursor, remaining);
				break;
			}

			line.append(cursor, static_cast<std::size_t>(newline - cursor));

			if ( starts_with(line, substring) ) {
				return line;
			}

			line.clear();
			cursor = newline + 1;
		}
	}

	// Whatever is left over is a final line without a terminating newline.
	if ( !line.empty() && starts_with(line, substring) ) {
		return line;
	}

	throw std::runtime_error("No match found");
}

/// Extracts the text between the first and the last tab of `line`.
///
/// @param line A line containing at least two tab characters.
/// @return The characters strictly between the first and the last tab.
/// @throws std::runtime_error if `line` contains fewer than two tabs.
std::string get_encoded_offset(const std::string& line) {
	const std::size_t start = line.find('\t');
	const std::size_t end   = line.rfind('\t');

	// start == end covers both "no tab at all" and "exactly one tab".
	if ( start == std::string::npos || start == end ) {
		throw std::runtime_error("Malformed index line");
	}

	return line.substr(start + 1, end - (start + 1));
}

/// Extracts the text after the last tab of `line`.
///
/// @param line A line containing at least one tab character.
/// @return Everything following the last tab.
/// @throws std::runtime_error if `line` contains no tab.
std::string get_encoded_length(const std::string& line) {
	const std::size_t start = line.rfind('\t');

	if ( start == std::string::npos ) {
		throw std::runtime_error("Malformed index line");
	}

	return line.substr(start + 1);
}

}