aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAdrian Kummerlaender2017-10-05 21:57:08 +0200
committerAdrian Kummerlaender2017-10-05 21:57:08 +0200
commitc953c72c86c281d650b2a8ff856e3d614664e11a (patch)
treeb04024fa018cc05a1884c57123115a65884ad704 /src
downloadDictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar
DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar.gz
DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar.bz2
DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar.lz
DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar.xz
DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar.zst
DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.zip
Provide basic read only access to dictzip files
`dictzip::Istream` and `dictzip::IstreamBuf` are forked from `alpinocorpus::DzIstream` and `alpinocorpus::DzIstreamBuf` of rug-compling/alpinocorpus.
Diffstat (limited to 'src')
-rw-r--r--src/istream/buffer.cc253
-rw-r--r--src/istream/buffer.h58
-rw-r--r--src/istream/stream.cc13
-rw-r--r--src/istream/stream.h21
-rw-r--r--src/util/gzip.h59
5 files changed, 404 insertions, 0 deletions
diff --git a/src/istream/buffer.cc b/src/istream/buffer.cc
new file mode 100644
index 0000000..aaa93ff
--- /dev/null
+++ b/src/istream/buffer.cc
@@ -0,0 +1,253 @@
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <stdexcept>
+
+#include <zlib.h>
+
+#include "buffer.h"
+#include "util/gzip.h"
+
+namespace dictzip {
+
+// Inflates chunk `n` into buffer_ and makes it the current get area.
+// Throws std::runtime_error on an unknown chunk, read error or zlib failure.
+void IstreamBuf::readChunk(long n) {
+    // A request for the chunk that is already loaded is a no-op
+    if ( n == curr_chunk_ ) {
+        return;
+    }
+
+    // An index outside of the chunk table must not reach chunks_[n]
+    if ( n < 0 || static_cast<size_t>(n) >= chunks_.size() ) {
+        throw std::runtime_error("Requested dictzip chunk does not exist.");
+    }
+
+    const Chunk& chunk = chunks_[n];
+
+    // Owning buffer, released on every exit path including exceptions
+    std::vector<unsigned char> zBuf(chunk.size);
+
+    if ( std::fseek(dictzip_file_, data_offset_ + chunk.offset, SEEK_SET) != 0 ||
+         std::fread(zBuf.data(), 1, zBuf.size(), dictzip_file_) != zBuf.size() ) {
+        throw std::runtime_error("Could not read dictzip chunk.");
+    }
+
+    // Value-initialization also clears zalloc, zfree and opaque, which zlib
+    // requires to be set before inflateInit2 is called
+    z_stream zStream{};
+    zStream.next_in   = zBuf.data();
+    zStream.avail_in  = static_cast<uInt>(zBuf.size());
+    zStream.next_out  = buffer_.data();
+    zStream.avail_out = static_cast<uInt>(buffer_.size());
+
+    // Negative window bits select raw deflate data without zlib header
+    if ( inflateInit2(&zStream, -15) != Z_OK ) {
+        throw std::runtime_error(zStream.msg ? zStream.msg : "Could not initialize zlib.");
+    }
+
+    const int r = inflate(&zStream, Z_PARTIAL_FLUSH);
+    // zStream.msg may be null, which std::runtime_error must never receive
+    const char* const msg = zStream.msg ? zStream.msg : "Could not inflate dictzip chunk.";
+
+    // Always release the zlib state, even if inflating failed
+    if ( inflateEnd(&zStream) != Z_OK || ( r != Z_OK && r != Z_STREAM_END ) ) {
+        throw std::runtime_error(msg);
+    }
+
+    char* const begin = reinterpret_cast<char*>(buffer_.data());
+    this->setg(begin, begin, begin + zStream.total_out);
+
+    curr_chunk_ = n;
+}
+
+// Reads a little-endian 16 bit value, fetching the low byte first. Summing
+// two fgetc calls in one expression leaves their evaluation order unspecified.
+static unsigned readUInt16(FILE* file) {
+    const int low  = std::fgetc(file);
+    const int high = std::fgetc(file);
+    if ( low == EOF || high == EOF ) {
+        throw std::runtime_error("Could not read extra dictzip field.");
+    }
+    return static_cast<unsigned>(low) + ( static_cast<unsigned>(high) * 256 );
+}
+
+// Walks the subfields of the gzip extra field and loads the chunk table from
+// the dictzip 'RA' subfield. Throws std::runtime_error on malformed input.
+void IstreamBuf::readExtra() {
+    const long extraLen  = readUInt16(dictzip_file_);
+    const long nextField = std::ftell(dictzip_file_) + extraLen;
+
+    while ( std::ftell(dictzip_file_) < nextField ) {
+        // Read extra field 'header'
+        char si[2];
+        if ( std::fread(si, 1, sizeof(si), dictzip_file_) != sizeof(si) ) {
+            throw std::runtime_error("Could not read extra dictzip field header.");
+        }
+
+        const long len = readUInt16(dictzip_file_);
+
+        // Check for chunk information
+        if ( si[0] == 'R' && si[1] == 'A' ) {
+            if ( readUInt16(dictzip_file_) != 1 ) {
+                throw std::runtime_error("Unknown dictzip version.");
+            }
+
+            chunk_length_ = readUInt16(dictzip_file_);
+            const size_t chunkCount = readUInt16(dictzip_file_);
+
+            // Start with an exhausted get area so the first read loads a chunk
+            buffer_.resize(chunk_length_);
+            char* const end = reinterpret_cast<char*>(buffer_.data()) + chunk_length_;
+            this->setg(end - chunk_length_, end, end);
+
+            size_t chunkPos = 0;
+            for ( size_t i = 0; i < chunkCount; ++i ) {
+                const size_t chunkLen = readUInt16(dictzip_file_);
+                chunks_.emplace_back(chunkPos, chunkLen);
+                chunkPos += chunkLen;
+            }
+        } else if ( std::fseek(dictzip_file_, len, SEEK_CUR) != 0 ) {
+            throw std::runtime_error("Could not skip extra dictzip field.");
+        }
+    }
+}
+
+// Reads the fixed size gzip header and checks magic, method and extra flag.
+void IstreamBuf::readHeader() {
+    header_.resize(GZ_HEADER_SIZE);
+    const size_t bytesRead = std::fread(
+        header_.data(), 1, GZ_HEADER_SIZE, dictzip_file_);
+    if ( bytesRead != GZ_HEADER_SIZE ) {
+        throw std::runtime_error("Could not read dictzip header.");
+    }
+
+    const bool isGzip = header_[GZ_HEADER_ID1] == gzipId1
+                     && header_[GZ_HEADER_ID2] == gzipId2;
+    if ( !isGzip ) {
+        throw std::runtime_error("Given dictzip file is not a gzip file.");
+    }
+    if ( header_[GZ_HEADER_CM] != GZ_CM_DEFLATE ) {
+        throw std::runtime_error("Unknown compression method detected.");
+    }
+    if ( (header_[GZ_HEADER_FLG] & GZ_FLG_EXTRA) == 0 ) {
+        throw std::runtime_error("No extra fields, given file cannot be a dictzip file.");
+    }
+}
+
+// Skips optional name, comment and header CRC; throws std::runtime_error on EOF.
+void IstreamBuf::skipOptional() {
+    const unsigned char flags = header_[GZ_HEADER_FLG];
+    // Name and comment are zero terminated; fail on EOF instead of spinning
+    const auto skipString = [this]() {
+        for ( int c = std::fgetc(dictzip_file_); c != 0; c = std::fgetc(dictzip_file_) ) {
+            if ( c == EOF ) {
+                throw std::runtime_error("Unterminated string in dictzip header.");
+            }
+        }
+    };
+    if ( flags & GZ_FLG_NAME )    { skipString(); }
+    if ( flags & GZ_FLG_COMMENT ) { skipString(); }
+    if ( flags & GZ_FLG_HCRC )    { std::fseek(dictzip_file_, 2, SEEK_CUR); }
+}
+
+// Returns the next character without consuming it. Loads the following
+// chunk(s) once the get area is exhausted and reports EOF after the last one.
+int IstreamBuf::underflow() {
+    while ( this->gptr() >= this->egptr() ) {
+        if ( curr_chunk_ + 1 >= static_cast<long>(chunks_.size()) ) {
+            return traits_type::eof();
+        }
+        this->readChunk(curr_chunk_ + 1);
+    }
+
+    // to_int_type keeps bytes >= 0x80 positive, so 0xFF is not taken for EOF
+    return traits_type::to_int_type(*this->gptr());
+}
+
+// From C++ annotations 8.1.0~pre1, chapter 23.
+// Copies up to `n` decompressed bytes into `dest`, crossing chunk boundaries
+// as needed. Returns the number of bytes copied; less than `n` only at EOF.
+std::streamsize IstreamBuf::xsgetn(char *dest, std::streamsize n) {
+    // std::streamsize throughout: an int counter overflows on reads > 2 GiB
+    std::streamsize copied = 0;
+
+    while ( copied < n ) {
+        if ( this->gptr() >= this->egptr() &&
+             this->underflow() == traits_type::eof() ) {
+            break;
+        }
+
+        const std::streamsize available = this->egptr() - this->gptr();
+        const std::streamsize remaining = n - copied;
+        const std::streamsize count = available < remaining ? available : remaining;
+
+        std::memcpy(dest + copied, this->gptr(), static_cast<size_t>(count));
+        // The chunk length is a 16 bit value, so the int cast cannot truncate
+        this->gbump(static_cast<int>(count));
+
+        copied += count;
+    }
+
+    return copied;
+}
+
+// Opens `filename` in binary mode and parses the dictzip header. Throws
+// std::runtime_error if the file cannot be opened or its header is invalid.
+IstreamBuf::IstreamBuf(char const *filename):
+    dictzip_file_{ std::fopen(filename, "rb") }, chunk_length_{ 0 }, curr_chunk_{ -1 } {
+    if ( !dictzip_file_ ) {
+        throw std::runtime_error("Could not open input dictzip stream.");
+    }
+    try {
+        readHeader(); readExtra(); skipOptional(); // in on-disk order
+        data_offset_ = std::ftell(dictzip_file_);
+    } catch ( ... ) { std::fclose(dictzip_file_); throw; } // no dtor call on throw
+}
+
+// Releases the file handle held in dictzip_file_. The handle is only closed
+// when it is non-null.
+IstreamBuf::~IstreamBuf() {
+    if ( dictzip_file_ != nullptr ) { fclose(dictzip_file_); }
+}
+
+// Moves the read position of the uncompressed data to `off`, relative to the
+// start (`std::ios::beg`) or the current position (`std::ios::cur`). Returns
+// the new position or pos_type(-1) if the target is invalid or unsupported.
+IstreamBuf::pos_type IstreamBuf::seekoff(off_type off, seekdir dir, openmode) {
+    if ( chunks_.empty() || chunk_length_ == 0 ) {
+        return pos_type(off_type(-1));
+    }
+
+    const off_type chunkLength = static_cast<off_type>(chunk_length_);
+    off_type target = off;
+
+    if ( dir == std::ios::cur ) {
+        // Before the first read curr_chunk_ is -1 and the get pointer sits one
+        // chunk length past eback(), so this evaluates to position zero
+        target += curr_chunk_ * chunkLength + (this->gptr() - this->eback());
+    } else if ( dir != std::ios::beg ) {
+        // XXX - We can only determine the uncompressed file length by
+        // decompressing the last chunk. Quite inefficient, hence unsupported.
+        return pos_type(off_type(-1));
+    }
+
+    if ( target < 0 || target / chunkLength >= static_cast<off_type>(chunks_.size()) ) {
+        return pos_type(off_type(-1));
+    }
+
+    this->readChunk(static_cast<long>(target / chunkLength));
+    // The last chunk may hold fewer bytes than a full chunk length
+    const off_type chunkPos = target % chunkLength;
+    if ( chunkPos > this->egptr() - this->eback() ) {
+        return pos_type(off_type(-1));
+    }
+    this->setg(this->eback(), this->eback() + chunkPos, this->egptr());
+    return pos_type(target);
+}
+
+IstreamBuf::pos_type IstreamBuf::seekpos(pos_type off, openmode mode) {
+    return this->seekoff(static_cast<off_type>(off), std::ios::beg, mode); // absolute seek
+}
+
+}
diff --git a/src/istream/buffer.h b/src/istream/buffer.h
new file mode 100644
index 0000000..91bb53f
--- /dev/null
+++ b/src/istream/buffer.h
@@ -0,0 +1,58 @@
+#pragma once
+
+#include <cstdio>
+#include <iostream>
+#include <streambuf>
+#include <vector>
+
+namespace dictzip {
+
+// Warning: The inherent statefulness of stream buffers interferes with
+// multithreaded access. Users of this class should perform appropriate
+// locking, since they are in a better position to do so.
+class IstreamBuf : public std::streambuf {
+private:
+    // Location of one compressed chunk relative to the start of the data
+    struct Chunk {
+        Chunk(size_t offset, size_t size): offset(offset), size(size) {}
+        const size_t offset; // sum of the sizes of all preceding chunks
+        const size_t size;   // compressed size in bytes
+    };
+
+    FILE* dictzip_file_; // owned, closed by the destructor
+
+    std::vector<unsigned char> buffer_; // decompressed current chunk
+    std::vector<unsigned char> header_; // fixed part of the gzip header
+    std::vector<Chunk> chunks_;         // chunk table of the 'RA' field
+
+    size_t chunk_length_; // uncompressed size of a full chunk
+    long data_offset_;    // file offset of the first chunk
+    long curr_chunk_;     // index of the chunk in buffer_, -1 if none
+
+    void readChunk(long n);
+    void readHeader();
+    void readExtra();
+    void skipOptional();
+
+protected:
+    int underflow() override;
+    std::streamsize xsgetn(char *dest, std::streamsize n) override;
+
+public:
+    using pos_type = std::streambuf::pos_type;
+    using off_type = std::streambuf::off_type;
+
+    typedef std::ios::seekdir seekdir;
+    typedef std::ios::openmode openmode;
+
+    IstreamBuf(char const* filename);
+    ~IstreamBuf();
+    // Non-copyable: a copy would close the shared FILE handle twice
+    IstreamBuf(const IstreamBuf&) = delete;
+    IstreamBuf& operator=(const IstreamBuf&) = delete;
+
+    pos_type seekoff(off_type off, seekdir dir, openmode) override;
+    pos_type seekpos(pos_type off, openmode mode) override;
+};
+
+}
diff --git a/src/istream/stream.cc b/src/istream/stream.cc
new file mode 100644
index 0000000..aea548f
--- /dev/null
+++ b/src/istream/stream.cc
@@ -0,0 +1,13 @@
+#include <iostream>
+
+#include "stream.h"
+
+namespace dictzip {
+
+Istream::Istream(char const* filename):
+    std::istream(nullptr), // bases are built first, the buffer is attached below
+    buffer_(std::make_shared<IstreamBuf>(filename)) {
+    this->rdbuf(buffer_.get());
+}
+
+}
diff --git a/src/istream/stream.h b/src/istream/stream.h
new file mode 100644
index 0000000..0602d06
--- /dev/null
+++ b/src/istream/stream.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <memory>
+
+#include "buffer.h"
+
+namespace dictzip {
+
+// Input stream reading through an IstreamBuf that it owns.
+class Istream : public std::istream {
+public:
+    Istream(char const* filename);
+    ~Istream() override = default;
+
+private:
+    std::shared_ptr<IstreamBuf> buffer_;
+};
+
+}
diff --git a/src/util/gzip.h b/src/util/gzip.h
new file mode 100644
index 0000000..53e59f6
--- /dev/null
+++ b/src/util/gzip.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include <cstddef>
+
+namespace {
+
+constexpr size_t GZ_HEADER_SIZE = 10;
+
+// Byte offsets of the fields within the gzip header
+constexpr size_t GZ_HEADER_ID1 = 0;
+constexpr size_t GZ_HEADER_ID2 = 1;
+constexpr size_t GZ_HEADER_CM = 2;
+constexpr size_t GZ_HEADER_FLG = 3;
+constexpr size_t GZ_HEADER_MTIME = 4;
+constexpr size_t GZ_HEADER_XFL = 8;
+constexpr size_t GZ_HEADER_OS = 9;
+
+// Magic bytes expected at GZ_HEADER_ID1 and GZ_HEADER_ID2
+constexpr unsigned char gzipId1 = 0x1f;
+constexpr unsigned char gzipId2 = 0x8b;
+
+// Compression method(s) stored at GZ_HEADER_CM
+constexpr unsigned char GZ_CM_DEFLATE = 8;
+
+// Bit masks for the flag byte at GZ_HEADER_FLG
+constexpr unsigned char GZ_FLG_TEXT = 0x01;
+constexpr unsigned char GZ_FLG_HCRC = 0x02;
+constexpr unsigned char GZ_FLG_EXTRA = 0x04;
+constexpr unsigned char GZ_FLG_NAME = 0x08;
+constexpr unsigned char GZ_FLG_COMMENT = 0x10;
+
+// Values of GZ_HEADER_XFL for deflate
+constexpr unsigned char GZ_XFL_MAX = 2;
+constexpr unsigned char GZ_XFL_FAST = 4;
+
+// Values of GZ_HEADER_OS
+constexpr unsigned char GZ_OS_FAT = 0;
+constexpr unsigned char GZ_OS_AMIGA = 1;
+constexpr unsigned char GZ_OS_VMS = 2;
+constexpr unsigned char GZ_OS_UNIX = 3;
+constexpr unsigned char GZ_OS_VM_CMS = 4;
+constexpr unsigned char GZ_OS_TOS = 5;
+constexpr unsigned char GZ_OS_HPFS = 6;
+constexpr unsigned char GZ_OS_MAC = 7;
+constexpr unsigned char GZ_OS_ZSYSTEM = 8;
+constexpr unsigned char GZ_OS_CPM = 9;
+constexpr unsigned char GZ_OS_TOPS20 = 10;
+constexpr unsigned char GZ_OS_NTFS = 11;
+constexpr unsigned char GZ_OS_QDOS = 12;
+constexpr unsigned char GZ_OS_RISCOS = 13;
+constexpr unsigned char GZ_OS_UNKNOWN = 255;
+
+constexpr size_t GZ_TRAILER_SIZE = 8;
+
+// Byte offsets of the fields within the gzip trailer
+constexpr size_t GZ_TRAILER_CRC32 = 0;
+constexpr size_t GZ_TRAILER_ISIZE = 4;
+
+}