Provide basic read-only access to dictzip files

`dictzip::Istream` and `dictzip::IstreamBuf` are forked from `alpinocorpus::DzIstream` and `alpinocorpus::DzIstreamBuf` of rug-compling/alpinocorpus.
author: Adrian Kummerlaender 2017-10-05 21:57:08 +0200
committer: Adrian Kummerlaender 2017-10-05 21:57:08 +0200
commit: c953c72c86c281d650b2a8ff856e3d614664e11a (patch)
tree: b04024fa018cc05a1884c57123115a65884ad704 /src
download: DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar
DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar.gz
DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar.bz2
DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar.lz
DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar.xz
DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar.zst
DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.zip
5 files changed, 404 insertions, 0 deletions
diff --git a/src/istream/buffer.cc b/src/istream/buffer.cc
new file mode 100644
index 0000000..aaa93ff
--- /dev/null
+++ b/src/istream/buffer.cc
@@ -0,0 +1,253 @@
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <stdexcept>
+
+#include <zlib.h>
+
+#include "buffer.h"
+#include "util/gzip.h"
+
+namespace dictzip {
+
+/// Load chunk `n` into `buffer_` and make it the current get area.
+///
+/// Seeks to the compressed bytes of the chunk, inflates them as a raw
+/// deflate stream (negative window bits) and points the get area at the
+/// decompressed data. Does nothing if `n` already is the current chunk.
+///
+/// @param n  Zero-based index into `chunks_`.
+/// @throws std::runtime_error if `n` is out of range, if the chunk cannot
+///         be read from the file or if zlib fails to inflate it.
+void IstreamBuf::readChunk(long n) {
+	if ( n == curr_chunk_ ) {
+		return;
+	}
+
+	if ( n < 0 || static_cast<size_t>(n) >= chunks_.size() ) {
+		throw std::runtime_error("Requested dictzip chunk is out of range.");
+	}
+
+	const IstreamBuf::Chunk& chunkN = chunks_[static_cast<size_t>(n)];
+
+	// Holding the compressed bytes in a vector releases them on every exit
+	// path, including all of the throwing ones below.
+	std::vector<unsigned char> zBuf(chunkN.size);
+
+	const long chunkStart = data_offset_ + static_cast<long>(chunkN.offset);
+
+	if ( std::fseek(dictzip_file_, chunkStart, SEEK_SET) != 0 ) {
+		throw std::runtime_error("Could not seek to dictzip chunk.");
+	}
+
+	if ( !zBuf.empty() &&
+	     std::fread(zBuf.data(), 1, zBuf.size(), dictzip_file_) != zBuf.size() ) {
+		throw std::runtime_error("Could not read dictzip chunk.");
+	}
+
+	// zlib requires zalloc, zfree and opaque to be initialized before
+	// inflateInit2 is called; value-initialization zeroes all of them.
+	z_stream zStream{};
+	zStream.next_in   = zBuf.data();
+	zStream.avail_in  = static_cast<uInt>(zBuf.size());
+	zStream.next_out  = buffer_.data();
+	zStream.avail_out = static_cast<uInt>(buffer_.size());
+
+	// zStream.msg may be a null pointer, which must never reach the
+	// std::runtime_error constructor.
+	if ( inflateInit2(&zStream, -15) != Z_OK ) {
+		throw std::runtime_error(
+			zStream.msg ? zStream.msg : "Could not initialize zlib inflate.");
+	}
+
+	const int r = inflate(&zStream, Z_PARTIAL_FLUSH);
+
+	if ( r != Z_OK && r != Z_STREAM_END ) {
+		const std::runtime_error failure(
+			zStream.msg ? zStream.msg : "Could not inflate dictzip chunk.");
+		// Free the internal zlib state before propagating the error.
+		inflateEnd(&zStream);
+		throw failure;
+	}
+
+	const uLong inflatedSize = zStream.total_out;
+
+	if ( inflateEnd(&zStream) != Z_OK ) {
+		throw std::runtime_error(
+			zStream.msg ? zStream.msg : "Could not finalize zlib inflate.");
+	}
+
+	char* const begin = reinterpret_cast<char*>(buffer_.data());
+
+	this->setg(begin, begin, begin + inflatedSize);
+
+	curr_chunk_ = n;
+}
+
+/// Parse the gzip extra field and collect the dictzip chunk table.
+///
+/// Expects the file position to be directly behind the fixed-size gzip
+/// header. Walks all subfields, skipping unknown ones. For the 'RA'
+/// subfield it reads the version, the chunk length and the compressed size
+/// of every chunk into `chunk_length_` and `chunks_`, sizes `buffer_` to
+/// hold one chunk and sets up an exhausted get area. Afterwards the file
+/// position is moved to the first byte behind the extra field.
+///
+/// @throws std::runtime_error on truncated input, an unknown 'RA' version,
+///         a chunk length of zero or a missing 'RA' subfield.
+void IstreamBuf::readExtra() {
+	// Reads one 16 bit value, low byte first. The two fgetc calls are
+	// separate statements because the operands of `+` are evaluated in
+	// unspecified order, and EOF must not leak into the arithmetic.
+	const auto readUInt16 = [this]() -> int {
+		const int low  = std::fgetc(dictzip_file_);
+		const int high = std::fgetc(dictzip_file_);
+
+		if ( low == EOF || high == EOF ) {
+			throw std::runtime_error("Unexpected end of dictzip extra field.");
+		}
+
+		return low + ( high * 256 );
+	};
+
+	const int  extraLen  = readUInt16();
+	const long extraPos  = std::ftell(dictzip_file_);
+
+	if ( extraPos < 0 ) {
+		throw std::runtime_error("Could not determine position of extra dictzip field.");
+	}
+
+	const long nextField = extraPos + extraLen;
+	bool foundChunkTable = false;
+
+	while ( std::ftell(dictzip_file_) < nextField ) {
+		// Read extra field 'header'
+		char si[2];
+		if ( std::fread(si, 1, sizeof(si), dictzip_file_) != sizeof(si) ) {
+			throw std::runtime_error("Could not read extra dictzip field header.");
+		}
+
+		const int len = readUInt16();
+
+		// Check for chunk information
+		if ( si[0] == 'R' && si[1] == 'A' ) {
+			if ( readUInt16() != 1 ) {
+				throw std::runtime_error("Unknown dictzip version.");
+			}
+
+			chunk_length_ = static_cast<size_t>(readUInt16());
+
+			// A zero chunk length would later be used as a divisor when
+			// translating stream positions into chunk indices.
+			if ( chunk_length_ == 0 ) {
+				throw std::runtime_error("Invalid dictzip chunk length.");
+			}
+
+			const size_t chunkCount = static_cast<size_t>(readUInt16());
+
+			buffer_.resize(chunk_length_);
+			char* const begin = reinterpret_cast<char*>(buffer_.data());
+
+			// Start with an exhausted get area so that the first read
+			// triggers underflow().
+			this->setg(begin, begin + chunk_length_, begin + chunk_length_);
+
+			// Offsets are the running sum of the compressed chunk sizes.
+			chunks_.clear();
+			chunks_.reserve(chunkCount);
+			size_t chunkPos = 0;
+
+			for ( size_t i = 0; i < chunkCount; ++i ) {
+				const size_t chunkLen = static_cast<size_t>(readUInt16());
+				chunks_.emplace_back(chunkPos, chunkLen);
+				chunkPos += chunkLen;
+			}
+
+			foundChunkTable = true;
+		} else if ( std::fseek(dictzip_file_, len, SEEK_CUR) != 0 ) {
+			throw std::runtime_error("Could not skip extra dictzip field.");
+		}
+	}
+
+	if ( !foundChunkTable ) {
+		throw std::runtime_error("No chunk information found, given file cannot be a dictzip file.");
+	}
+
+	// The declared length of the extra field is authoritative for where
+	// the following header parts start.
+	if ( std::fseek(dictzip_file_, nextField, SEEK_SET) != 0 ) {
+		throw std::runtime_error("Could not skip extra dictzip field.");
+	}
+}
+
+/// Read the fixed-size gzip header into `header_` and validate it.
+///
+/// @throws std::runtime_error if the header cannot be read completely, if
+///         the gzip magic bytes are missing, if the compression method is
+///         not deflate or if the extra field flag is not set.
+void IstreamBuf::readHeader() {
+	header_.resize(GZ_HEADER_SIZE);
+
+	const size_t bytesRead = std::fread(
+		header_.data(), 1, GZ_HEADER_SIZE, dictzip_file_);
+
+	if ( bytesRead != GZ_HEADER_SIZE ) {
+		throw std::runtime_error("Could not read dictzip header.");
+	}
+
+	const bool hasGzipMagic = header_[GZ_HEADER_ID1] == gzipId1
+	                       && header_[GZ_HEADER_ID2] == gzipId2;
+
+	if ( !hasGzipMagic ) {
+		throw std::runtime_error("Given dictzip file is not a gzip file.");
+	}
+
+	const unsigned char method = header_[GZ_HEADER_CM];
+
+	if ( method != GZ_CM_DEFLATE ) {
+		throw std::runtime_error("Unknown compression method detected.");
+	}
+
+	// The dictzip chunk table is stored in the gzip extra field.
+	const unsigned char flags = header_[GZ_HEADER_FLG];
+
+	if ( (flags & GZ_FLG_EXTRA) == 0 ) {
+		throw std::runtime_error("No extra fields, given file cannot be a dictzip file.");
+	}
+}
+
+/// Skip the optional gzip header parts that follow the extra field.
+///
+/// Depending on the flags stored in `header_` this consumes the
+/// zero-terminated file name, the zero-terminated comment and the two
+/// byte header checksum, in that order.
+///
+/// @throws std::runtime_error if the file ends inside one of these parts.
+void IstreamBuf::skipOptional() {
+	const unsigned char flags = header_[GZ_HEADER_FLG];
+
+	// Consumes bytes up to and including the terminating zero. Once the
+	// end of the file is reached fgetc keeps returning EOF, so waiting
+	// for a zero byte alone would never terminate on a truncated file.
+	const auto skipTerminatedString = [this]() {
+		int c;
+		do {
+			c = std::fgetc(dictzip_file_);
+			if ( c == EOF ) {
+				throw std::runtime_error("Unexpected end of dictzip header.");
+			}
+		} while ( c != 0 );
+	};
+
+	if ( flags & GZ_FLG_NAME ) {
+		skipTerminatedString();
+	}
+
+	if ( flags & GZ_FLG_COMMENT ) {
+		skipTerminatedString();
+	}
+
+	if ( flags & GZ_FLG_HCRC ) {
+		if ( std::fseek(dictzip_file_, 2, SEEK_CUR) != 0 ) {
+			throw std::runtime_error("Could not skip dictzip header checksum.");
+		}
+	}
+}
+
+/// Make at least one character available in the get area.
+///
+/// Returns the character at the get pointer without consuming it. If the
+/// get area is exhausted, the following chunks are loaded until one of
+/// them yields data.
+///
+/// @return The next character converted with traits_type::to_int_type, or
+///         traits_type::eof() if no further chunk is available.
+/// @throws std::runtime_error if loading a chunk fails.
+int IstreamBuf::underflow() {
+	// A loop rather than a single load, so that a chunk which inflates
+	// to zero bytes is skipped instead of being dereferenced.
+	while ( this->gptr() >= this->egptr() ) {
+		if ( curr_chunk_ + 1 >= static_cast<long>(chunks_.size()) ) {
+			return traits_type::eof();
+		}
+
+		this->readChunk(curr_chunk_ + 1);
+	}
+
+	// to_int_type maps the character to a non-negative value. Returning
+	// the plain char would turn the byte 0xFF into EOF where char is
+	// signed.
+	return traits_type::to_int_type(*this->gptr());
+}
+
+// From C++ annotations 8.1.0~pre1, chapter 23.
+/// Copy up to `n` characters into `dest`, crossing chunk boundaries.
+///
+/// @param dest  Destination with room for at least `n` characters.
+/// @param n     Number of characters requested; values <= 0 read nothing.
+/// @return Number of characters actually copied, which is smaller than
+///         `n` only if the end of the data was reached.
+std::streamsize IstreamBuf::xsgetn(char *dest, std::streamsize n) {
+	// Counters use std::streamsize so that large requests do not
+	// truncate or overflow an int.
+	std::streamsize nread = 0;
+
+	while ( n > 0 ) {
+		if ( this->gptr() >= this->egptr() ) {
+			this->underflow();
+
+			// End of data is judged by the get area, not by comparing
+			// the returned value with EOF, which a data byte converted
+			// through a signed char could equal.
+			if ( this->gptr() >= this->egptr() ) {
+				break;
+			}
+		}
+
+		std::streamsize avail = this->egptr() - this->gptr();
+
+		if ( avail > n ) {
+			avail = n;
+		}
+
+		std::memcpy(dest + nread, this->gptr(), static_cast<size_t>(avail));
+		this->gbump(static_cast<int>(avail));
+
+		nread += avail;
+		n     -= avail;
+	}
+
+	return nread;
+}
+
+IstreamBuf::IstreamBuf(char const *filename):
+	dictzip_file_{ fopen(filename, "r") } {
+	if ( dictzip_file_ ) {
+		readHeader();
+		readExtra();
+		skipOptional();
+		data_offset_ = std::ftell(dictzip_file_);
+		curr_chunk_ = -1;
+	} else {
+		throw std::runtime_error("Could not open input dictzip stream.");
+	}
+}
+
+/// Close the underlying dictzip file if one is open.
+IstreamBuf::~IstreamBuf() {
+	if ( dictzip_file_ == nullptr ) {
+		return;
+	}
+
+	std::fclose(dictzip_file_);
+}
+
+/// Move the read position within the uncompressed data.
+///
+/// The target position is split into a chunk index and an offset inside
+/// that chunk; the chunk is loaded and the get pointer is placed at the
+/// offset.
+///
+/// @param off  Offset relative to `dir`.
+/// @param dir  std::ios::beg or std::ios::cur. Seeking relative to the end
+///             is not supported.
+/// @return The new absolute position, or pos_type(off_type(-1)) if the
+///         target lies outside of the data or `dir` is unsupported. When
+///         the target lies behind the end of a short last chunk, that
+///         chunk has already been loaded and the get pointer stays where
+///         readChunk() put it.
+/// @throws std::runtime_error if loading the target chunk fails.
+IstreamBuf::pos_type IstreamBuf::seekoff(off_type off, seekdir dir, openmode) {
+	const pos_type failure = pos_type(off_type(-1));
+	const off_type chunkLength = static_cast<off_type>(chunk_length_);
+
+	off_type targetPos = 0;
+
+	switch ( dir ) {
+		case std::ios::beg: {
+			targetPos = off;
+			break;
+		}
+		case std::ios::cur: {
+			// Signed arithmetic: before the first chunk is loaded
+			// curr_chunk_ is -1 and the get pointer sits one chunk
+			// length behind eback(), which adds up to position 0.
+			const off_type curPos = (static_cast<off_type>(curr_chunk_) * chunkLength)
+			                      + (this->gptr() - this->eback());
+			targetPos = curPos + off;
+			break;
+		}
+		default: {
+			// XXX - We can only determine the uncompressed file length by decompressing the
+			// last chunk. Quite inefficient, haven't made my mind up whether we want to
+			// support this.
+			return failure;
+		}
+	}
+
+	if ( targetPos < 0 || chunkLength <= 0 || chunks_.empty() ) {
+		return failure;
+	}
+
+	const off_type chunkCount = static_cast<off_type>(chunks_.size());
+	off_type targetChunk = targetPos / chunkLength;
+	off_type chunkPos    = targetPos % chunkLength;
+
+	// The position directly behind a full last chunk is valid (it is what
+	// tellg() asks for after everything was read) but has no chunk of its
+	// own, so it is expressed as the end of the last chunk.
+	if ( targetChunk == chunkCount && chunkPos == 0 ) {
+		targetChunk = chunkCount - 1;
+		chunkPos    = chunkLength;
+	}
+
+	if ( targetChunk >= chunkCount ) {
+		return failure;
+	}
+
+	this->readChunk(static_cast<long>(targetChunk));
+
+	// The last chunk may hold fewer bytes than a full chunk.
+	if ( chunkPos > this->egptr() - this->eback() ) {
+		return failure;
+	}
+
+	this->setg(this->eback(), this->eback() + chunkPos, this->egptr());
+
+	return pos_type(targetPos);
+}
+
+/// Seek to the absolute position `off` by delegating to seekoff() with
+/// the beginning of the data as reference point.
+IstreamBuf::pos_type IstreamBuf::seekpos(pos_type off, openmode mode) {
+	const off_type fromStart = off;
+
+	return this->seekoff(fromStart, std::ios::beg, mode);
+}
+
+}
diff --git a/src/istream/buffer.h b/src/istream/buffer.h
new file mode 100644
index 0000000..91bb53f
--- /dev/null
+++ b/src/istream/buffer.h
@@ -0,0 +1,58 @@
+#pragma once
+
+#include <cstdio>
+#include <iostream>
+#include <streambuf>
+#include <vector>
+
+namespace dictzip {
+
+// Read-only stream buffer over the uncompressed content of a dictzip file.
+// The data is decompressed one chunk at a time into an internal buffer.
+//
+// Warning: The inherent statefulness of stream buffers interferes with
+// multithreaded access. Users of this class should perform appropriate
+// locking, since they are in a better position to do so.
+class IstreamBuf : public std::streambuf {
+private:
+	// Location of one compressed chunk inside the dictzip file.
+	struct Chunk {
+		Chunk(size_t offset, size_t size):
+			offset(offset),
+			size(size) {};
+
+		// Offset of the compressed chunk relative to `data_offset_`
+		const size_t offset;
+		// Compressed size of the chunk in bytes
+		const size_t size;
+	};
+
+	FILE* dictzip_file_ = nullptr;
+
+	// Decompressed data of the current chunk
+	std::vector<unsigned char> buffer_;
+	// Raw fixed-size gzip header
+	std::vector<unsigned char> header_;
+	std::vector<Chunk>  chunks_;
+
+	// Uncompressed length of a full chunk
+	size_t chunk_length_ = 0;
+	// File position of the first compressed chunk
+	long data_offset_ = 0;
+	// Index of the chunk held by `buffer_`, -1 if none was loaded yet
+	long curr_chunk_ = -1;
+
+	void readChunk(long n);
+	void readHeader();
+	void readExtra();
+	void skipOptional();
+
+protected:
+	int underflow() override;
+	std::streamsize xsgetn(char *dest, std::streamsize n) override;
+
+public:
+	using pos_type = std::streambuf::pos_type;
+	using off_type = std::streambuf::off_type;
+
+	typedef std::ios::seekdir  seekdir;
+	typedef std::ios::openmode openmode;
+
+	IstreamBuf(char const* filename);
+	~IstreamBuf();
+
+	// The buffer owns its FILE handle and closes it on destruction. A copy
+	// would close the same handle a second time.
+	IstreamBuf(const IstreamBuf&) = delete;
+	IstreamBuf& operator=(const IstreamBuf&) = delete;
+
+	pos_type seekoff(off_type off, seekdir dir, openmode) override;
+	pos_type seekpos(pos_type off, openmode mode) override;
+
+};
+
+}
diff --git a/src/istream/stream.cc b/src/istream/stream.cc
new file mode 100644
index 0000000..aea548f
--- /dev/null
+++ b/src/istream/stream.cc
@@ -0,0 +1,13 @@
+#include <iostream>
+
+#include "stream.h"
+
+namespace dictzip {
+
+/// Create an input stream reading the dictzip file `filename`.
+///
+/// The base stream is constructed without a buffer first, because the
+/// buffer member is only initialized after the base class. It is attached
+/// once it exists.
+Istream::Istream(char const* filename):
+	std::istream(nullptr),
+	buffer_{ std::make_shared<IstreamBuf>(filename) } {
+	IstreamBuf* const dictzipBuffer = buffer_.get();
+
+	this->rdbuf(dictzipBuffer);
+}
+
+}
diff --git a/src/istream/stream.h b/src/istream/stream.h
new file mode 100644
index 0000000..0602d06
--- /dev/null
+++ b/src/istream/stream.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <memory>
+
+#include "buffer.h"
+
+namespace dictzip {
+
+// Input stream over the uncompressed content of a dictzip file.
+// Keeps the IstreamBuf it reads from alive for its own lifetime.
+class Istream : public std::istream {
+public:
+	Istream(char const* filename);
+	~Istream() override = default;
+
+private:
+	std::shared_ptr<IstreamBuf> buffer_;
+
+};
+
+}
diff --git a/src/util/gzip.h b/src/util/gzip.h
new file mode 100644
index 0000000..53e59f6
--- /dev/null
+++ b/src/util/gzip.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include <cstddef>
+
+namespace {
+
+// Size of the fixed part of a gzip header in bytes
+constexpr std::size_t GZ_HEADER_SIZE = 10;
+
+// Byte offsets of the fields inside the fixed gzip header
+constexpr std::size_t GZ_HEADER_ID1   = 0;
+constexpr std::size_t GZ_HEADER_ID2   = 1;
+constexpr std::size_t GZ_HEADER_CM    = 2;
+constexpr std::size_t GZ_HEADER_FLG   = 3;
+constexpr std::size_t GZ_HEADER_MTIME = 4;
+constexpr std::size_t GZ_HEADER_XFL   = 8;
+constexpr std::size_t GZ_HEADER_OS    = 9;
+
+// Magic bytes expected at GZ_HEADER_ID1 and GZ_HEADER_ID2
+constexpr unsigned char gzipId1 = 0x1f;
+constexpr unsigned char gzipId2 = 0x8b;
+
+// Compression method(s) stored at GZ_HEADER_CM
+constexpr unsigned char GZ_CM_DEFLATE = 8;
+
+// Bit masks for the flag byte at GZ_HEADER_FLG
+constexpr unsigned char GZ_FLG_TEXT    = 0x01;
+constexpr unsigned char GZ_FLG_HCRC    = 0x02;
+constexpr unsigned char GZ_FLG_EXTRA   = 0x04;
+constexpr unsigned char GZ_FLG_NAME    = 0x08;
+constexpr unsigned char GZ_FLG_COMMENT = 0x10;
+
+// GZ_HEADER_XFL values for deflate
+constexpr unsigned char GZ_XFL_MAX  = 2;
+constexpr unsigned char GZ_XFL_FAST = 4;
+
+// GZ_HEADER_OS values
+constexpr unsigned char GZ_OS_FAT     = 0;
+constexpr unsigned char GZ_OS_AMIGA   = 1;
+constexpr unsigned char GZ_OS_VMS     = 2;
+constexpr unsigned char GZ_OS_UNIX    = 3;
+constexpr unsigned char GZ_OS_VM_CMS  = 4;
+constexpr unsigned char GZ_OS_TOS     = 5;
+constexpr unsigned char GZ_OS_HPFS    = 6;
+constexpr unsigned char GZ_OS_MAC     = 7;
+constexpr unsigned char GZ_OS_ZSYSTEM = 8;
+constexpr unsigned char GZ_OS_CPM     = 9;
+constexpr unsigned char GZ_OS_TOPS20  = 10;
+constexpr unsigned char GZ_OS_NTFS    = 11;
+constexpr unsigned char GZ_OS_QDOS    = 12;
+constexpr unsigned char GZ_OS_RISCOS  = 13;
+constexpr unsigned char GZ_OS_UNKNOWN = 255;
+
+// Size of the gzip trailer in bytes
+constexpr std::size_t GZ_TRAILER_SIZE = 8;
+
+// Byte offsets of the fields inside the gzip trailer
+constexpr std::size_t GZ_TRAILER_CRC32 = 0;
+constexpr std::size_t GZ_TRAILER_ISIZE = 4;
+
+}
author	Adrian Kummerlaender	2017-10-05 21:57:08 +0200
committer	Adrian Kummerlaender	2017-10-05 21:57:08 +0200
commit	c953c72c86c281d650b2a8ff856e3d614664e11a (patch)
tree	b04024fa018cc05a1884c57123115a65884ad704 /src
download	DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar.gz DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar.bz2 DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar.lz DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar.xz DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar.zst DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.zip