From b93f4fea59dfab2c8cb06d00b32a567838982bdf Mon Sep 17 00:00:00 2001
From: Adrian Kummerlaender
Date: Fri, 6 Oct 2017 16:46:31 +0200
Subject: Implement primitive dictzip index queries

---
 CMakeLists.txt        |  1 +
 example.cc            |  8 ++++++--
 src/istream/buffer.cc |  2 +-
 src/util/query.cc     | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/util/query.h      | 12 +++++++++++
 5 files changed, 77 insertions(+), 3 deletions(-)
 create mode 100644 src/util/query.cc
 create mode 100644 src/util/query.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f67bc42..45d4b67 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,6 +14,7 @@ add_library(
 	DictzipQuery
 	SHARED
 		src/util/base64.cc
+		src/util/query.cc
 		src/istream/stream.cc
 		src/istream/buffer.cc
 )
diff --git a/example.cc b/example.cc
index f2a091d..f1fbc8c 100644
--- a/example.cc
+++ b/example.cc
@@ -1,5 +1,6 @@
 #include "istream/stream.h"
 #include "util/base64.h"
+#include "util/query.h"
 
 #include <string>
 #include <iostream>
@@ -17,10 +18,13 @@ std::string get(const std::string& path, std::size_t offset, std::size_t length)
 }
 
 int main() {
+	// Get location of _Accession_
+	const std::string line = dictzip::get_line_starting_with("gcide.index", "Accession");
+
 	// Decode location of _Accession_
 	//     `gcide.index[1089]: "Accession	8Aw	Wt"
-	const std::size_t offset = dictzip::base64_decode("8Aw");
-	const std::size_t length = dictzip::base64_decode("Wt");
+	const std::size_t offset = dictzip::base64_decode(dictzip::get_encoded_offset(line));
+	const std::size_t length = dictzip::base64_decode(dictzip::get_encoded_length(line));
 
 	// Print the GCIDE definition of _Accession_
 	std::cout << get("gcide.dict.dz", offset, length) << std::endl;
diff --git a/src/istream/buffer.cc b/src/istream/buffer.cc
index 26697f4..2a195e0 100644
--- a/src/istream/buffer.cc
+++ b/src/istream/buffer.cc
@@ -20,7 +20,7 @@ void IstreamBuf::readChunk(long n) {
 	
 	std::fseek(dictzip_file_, data_offset_ + chunkN.offset, SEEK_SET);
 
-	if ( fread(zBuf, 1, chunkN.size, dictzip_file_) != chunkN.size ) {
+	if ( std::fread(zBuf, 1, chunkN.size, dictzip_file_) != chunkN.size ) {
 		throw std::runtime_error("Could not read dictzip chunk.");
 	}
 
diff --git a/src/util/query.cc b/src/util/query.cc
new file mode 100644
index 0000000..a6fdcd3
--- /dev/null
+++ b/src/util/query.cc
@@ -0,0 +1,57 @@
+#include "query.h"
+
+#include <fcntl.h>
+
+#include <cstdio>
+#include <cstring>
+
+#include <stdexcept>
+
+namespace dictzip {
+
+/// Returns the first line of the file at `path` that begins with `substring`,
+/// without its trailing newline. Throws std::runtime_error if the file cannot
+/// be opened or no line matches.
+std::string get_line_starting_with(
+	const std::string& path, const std::string& substring) {
+	FILE* const file = std::fopen(path.c_str(), "r");
+	if ( file == nullptr ) {
+		throw std::runtime_error("Could not open index file.");
+	}
+	// Closes the file on every exit path, including exceptions.
+	struct Closer { FILE* f; ~Closer() { std::fclose(f); } } closer{file};
+	// Advisory hint only. Named constant on purpose: a raw 1 is POSIX_FADV_RANDOM on Linux.
+	posix_fadvise(fileno(file), 0, 0, POSIX_FADV_SEQUENTIAL);
+	char buffer[16*1024];
+	std::string line;
+	bool more = true;
+	while ( more ) {
+		more = std::fgets(buffer, sizeof(buffer), file) != nullptr;
+		if ( more ) {
+			line += buffer;
+			if ( line.empty() || line.back() != '\n' ) continue;  // partial line, keep reading
+			line.pop_back();
+		}
+		// At EOF (!more) an unterminated final line may still be pending.
+		if ( (more || !line.empty()) && line.compare(0, substring.size(), substring) == 0 ) {
+			return line;
+		}
+		line.clear();
+	}
+	throw std::runtime_error("No match found");
+}
+
+/// Returns the field between the first and the last tab of `line`
+/// (for index lines of the form "word<TAB>offset<TAB>length": the offset).
+std::string get_encoded_offset(const std::string& line) {
+	const std::size_t first = line.find('\t') + 1;
+	return line.substr(first, line.rfind('\t') - first);
+}
+
+/// Returns everything after the last tab of `line`, i.e. its final field;
+/// a line without any tab is returned unchanged.
+std::string get_encoded_length(const std::string& line) {
+	return line.substr(line.rfind('\t') + 1);
+}
+
+}
diff --git a/src/util/query.h b/src/util/query.h
new file mode 100644
index 0000000..3a25206
--- /dev/null
+++ b/src/util/query.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <string>
+
+namespace dictzip {
+// Returns a line of the file at `path` that starts with `substring`; throws std::runtime_error if none is found.
+std::string get_line_starting_with(const std::string& path, const std::string& substring);
+// Field accessors for a tab-separated line: text between first and last tab, and text after the last tab.
+std::string get_encoded_offset(const std::string& line);
+std::string get_encoded_length(const std::string& line);
+
+}
-- 
cgit v1.2.3