aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Adrian Kummerlaender 2017-10-06 16:46:31 +0200
committer Adrian Kummerlaender 2017-10-06 16:47:32 +0200
commit b93f4fea59dfab2c8cb06d00b32a567838982bdf (patch)
tree 268e2d9f057acc72a6af42194497059bd0088b6f /src
parent d951a22faf35aabe526de2588006c9381904b137 (diff)
downloadDictzipQuery-b93f4fea59dfab2c8cb06d00b32a567838982bdf.tar
DictzipQuery-b93f4fea59dfab2c8cb06d00b32a567838982bdf.tar.gz
DictzipQuery-b93f4fea59dfab2c8cb06d00b32a567838982bdf.tar.bz2
DictzipQuery-b93f4fea59dfab2c8cb06d00b32a567838982bdf.tar.lz
DictzipQuery-b93f4fea59dfab2c8cb06d00b32a567838982bdf.tar.xz
DictzipQuery-b93f4fea59dfab2c8cb06d00b32a567838982bdf.tar.zst
DictzipQuery-b93f4fea59dfab2c8cb06d00b32a567838982bdf.zip
Implement primitive dictzip index queries
Diffstat (limited to 'src')
-rw-r--r-- src/istream/buffer.cc 2
-rw-r--r-- src/util/query.cc 57
-rw-r--r-- src/util/query.h 12
3 files changed, 70 insertions, 1 deletions
diff --git a/src/istream/buffer.cc b/src/istream/buffer.cc
index 26697f4..2a195e0 100644
--- a/src/istream/buffer.cc
+++ b/src/istream/buffer.cc
@@ -20,7 +20,7 @@ void IstreamBuf::readChunk(long n) {
std::fseek(dictzip_file_, data_offset_ + chunkN.offset, SEEK_SET);
- if ( fread(zBuf, 1, chunkN.size, dictzip_file_) != chunkN.size ) {
+ if ( std::fread(zBuf, 1, chunkN.size, dictzip_file_) != chunkN.size ) {
throw std::runtime_error("Could not read dictzip chunk.");
}
diff --git a/src/util/query.cc b/src/util/query.cc
new file mode 100644
index 0000000..a6fdcd3
--- /dev/null
+++ b/src/util/query.cc
@@ -0,0 +1,57 @@
+#include "query.h"
+
+#include <fcntl.h>
+
+#include <cstdio>
+#include <cstring>
+
+#include <stdexcept>
+
+namespace dictzip {
+
+// Returns the first line of the file at `path` that begins with `substring`.
+// The returned line does not include its terminating newline.
+//
+// The file is read front to back in chunks of BUFFER_SIZE bytes. A line that
+// straddles a chunk boundary is assembled in `pending`, so the result does not
+// depend on where chunk boundaries happen to fall. The very first line of the
+// file and a last line without a terminating newline are candidates as well.
+//
+// Throws std::runtime_error if the file cannot be opened or read, or if no
+// line matches ("No match found").
+std::string get_line_starting_with(
+    const std::string& path, const std::string& substring) {
+    static constexpr std::size_t BUFFER_SIZE = 16 * 1024;
+
+    FILE* const file = std::fopen(path.c_str(), "r");
+    if ( file == nullptr ) {
+        throw std::runtime_error("Could not open file: " + path);
+    }
+
+    // Closes the stream on every exit path, including thrown exceptions.
+    struct FileCloser {
+        FILE* handle;
+        ~FileCloser() { std::fclose(handle); }
+    } closer{file};
+
+    // Purely advisory, so the result is ignored. The named constant is used
+    // because the numeric values of the advice flags are platform-defined.
+    posix_fadvise(fileno(file), 0, 0, POSIX_FADV_SEQUENTIAL);
+
+    // Length-checked prefix comparison; never reads past `length` bytes.
+    const auto has_prefix = [&substring](const char* data, std::size_t length) {
+        return length >= substring.size()
+            && std::memcmp(data, substring.data(), substring.size()) == 0;
+    };
+
+    char buffer[BUFFER_SIZE];
+    std::string pending; // start of a line whose newline has not been read yet
+    std::size_t bytes_read = 0;
+
+    while ( (bytes_read = std::fread(buffer, sizeof(char), BUFFER_SIZE, file)) > 0 ) {
+        const char* cursor = buffer;
+        const char* const chunk_end = buffer + bytes_read;
+
+        while ( cursor != chunk_end ) {
+            const char* const newline = static_cast<const char*>(
+                std::memchr(cursor, '\n', static_cast<std::size_t>(chunk_end - cursor)));
+
+            if ( newline == nullptr ) {
+                // Line continues in the next chunk: keep what we have so far.
+                pending.append(cursor, static_cast<std::size_t>(chunk_end - cursor));
+                break;
+            }
+
+            const std::size_t length = static_cast<std::size_t>(newline - cursor);
+
+            if ( pending.empty() ) {
+                // Common case: the whole line lies inside the current chunk,
+                // so it can be tested in place without copying.
+                if ( has_prefix(cursor, length) ) {
+                    return std::string(cursor, length);
+                }
+            } else {
+                pending.append(cursor, length);
+                if ( has_prefix(pending.data(), pending.size()) ) {
+                    return pending;
+                }
+                pending.clear();
+            }
+
+            cursor = newline + 1;
+        }
+    }
+
+    if ( std::ferror(file) ) {
+        throw std::runtime_error("Could not read file: " + path);
+    }
+
+    // Last line of a file that does not end with a newline.
+    if ( !pending.empty() && has_prefix(pending.data(), pending.size()) ) {
+        return pending;
+    }
+
+    throw std::runtime_error("No match found");
+}
+
+// Returns the part of `line` that lies between its first and its last tab.
+//
+// With fewer than two tabs the unsigned length below wraps around, so the
+// result is everything after the single tab, or the whole line when `line`
+// contains no tab at all.
+std::string get_encoded_offset(const std::string& line) {
+    const auto first_tab = line.find('\t');
+    const auto last_tab = line.rfind('\t');
+
+    const auto field_begin = first_tab + 1;
+    const auto field_length = last_tab - field_begin;
+
+    return line.substr(field_begin, field_length);
+}
+
+// Returns everything in `line` that follows its last tab; a line without any
+// tab is returned unchanged.
+std::string get_encoded_length(const std::string& line) {
+    const auto last_tab = line.rfind('\t');
+
+    return std::string(line, last_tab + 1);
+}
+
+}
diff --git a/src/util/query.h b/src/util/query.h
new file mode 100644
index 0000000..3a25206
--- /dev/null
+++ b/src/util/query.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <string>
+
+namespace dictzip {
+/* Scans the file at `path` for a line that begins with `substring` and
+ * returns that line. Throws std::runtime_error when no such line is found. */
+std::string get_line_starting_with(const std::string& path, const std::string& substring);
+/* Field accessors for a tab-separated `line` (presumably an index entry of
+ * the form word<TAB>offset<TAB>length -- confirm against the callers).
+ * get_encoded_offset returns the text between the first and the last tab,
+ * get_encoded_length the text that follows the last tab. */
+std::string get_encoded_offset(const std::string& line);
+std::string get_encoded_length(const std::string& line);
+
+}