aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAdrian Kummerlaender2017-10-05 21:57:08 +0200
committerAdrian Kummerlaender2017-10-05 21:57:08 +0200
commitc953c72c86c281d650b2a8ff856e3d614664e11a (patch)
treeb04024fa018cc05a1884c57123115a65884ad704
downloadDictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar
DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar.gz
DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar.bz2
DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar.lz
DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar.xz
DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.tar.zst
DictzipQuery-c953c72c86c281d650b2a8ff856e3d614664e11a.zip
Provide basic read only access to dictzip files
`dictzip::Istream` and `dictzip::IstreamBuf` are forked from `alpinocorpus::DzIstream` and `alpinocorpus::DzIstreamBuf` of rug-compling/alpinocorpus.
-rw-r--r--CMakeLists.txt36
-rw-r--r--LICENSE165
-rw-r--r--README.md9
-rw-r--r--example.cc17
-rw-r--r--src/istream/buffer.cc253
-rw-r--r--src/istream/buffer.h58
-rw-r--r--src/istream/stream.cc13
-rw-r--r--src/istream/stream.h21
-rw-r--r--src/util/gzip.h59
9 files changed, 631 insertions, 0 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..2514635
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,36 @@
+cmake_minimum_required(VERSION 2.8)
+project(DictzipQuery)
+
+# Append the project's flags to whatever the user or toolchain already
+# supplied instead of discarding those settings.
+set(
+    CMAKE_CXX_FLAGS
+    "${CMAKE_CXX_FLAGS} -std=c++0x -W -Wall -Wextra -Winline -pedantic"
+)
+
+include_directories(
+    src/
+)
+
+add_library(
+    DictzipQuery
+    SHARED
+    src/istream/stream.cc
+    src/istream/buffer.cc
+)
+
+# src/istream/buffer.cc calls zlib's inflate functions, so the library
+# itself has to link against zlib. Otherwise every consumer of the shared
+# library would have to know about and link this dependency on its own.
+target_link_libraries(
+    DictzipQuery
+    z
+)
+
+add_executable(
+    example
+    example.cc
+)
+
+target_link_libraries(
+    example
+    DictzipQuery
+    z
+)
+
+install(
+    TARGETS
+    DictzipQuery
+    LIBRARY DESTINATION
+    lib
+)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..0a04128
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,165 @@
+ GNU LESSER GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+ This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+ 0. Additional Definitions.
+
+ As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+ "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+ An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+ A "Combined Work" is a work produced by combining or linking an
+Application with the Library. The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+ The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+ The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+ 1. Exception to Section 3 of the GNU GPL.
+
+ You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+ 2. Conveying Modified Versions.
+
+ If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+ a) under this License, provided that you make a good faith effort to
+ ensure that, in the event an Application does not supply the
+ function or data, the facility still operates, and performs
+ whatever part of its purpose remains meaningful, or
+
+ b) under the GNU GPL, with none of the additional permissions of
+ this License applicable to that copy.
+
+ 3. Object Code Incorporating Material from Library Header Files.
+
+ The object code form of an Application may incorporate material from
+a header file that is part of the Library. You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+ a) Give prominent notice with each copy of the object code that the
+ Library is used in it and that the Library and its use are
+ covered by this License.
+
+ b) Accompany the object code with a copy of the GNU GPL and this license
+ document.
+
+ 4. Combined Works.
+
+ You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+ a) Give prominent notice with each copy of the Combined Work that
+ the Library is used in it and that the Library and its use are
+ covered by this License.
+
+ b) Accompany the Combined Work with a copy of the GNU GPL and this license
+ document.
+
+ c) For a Combined Work that displays copyright notices during
+ execution, include the copyright notice for the Library among
+ these notices, as well as a reference directing the user to the
+ copies of the GNU GPL and this license document.
+
+ d) Do one of the following:
+
+ 0) Convey the Minimal Corresponding Source under the terms of this
+ License, and the Corresponding Application Code in a form
+ suitable for, and under terms that permit, the user to
+ recombine or relink the Application with a modified version of
+ the Linked Version to produce a modified Combined Work, in the
+ manner specified by section 6 of the GNU GPL for conveying
+ Corresponding Source.
+
+ 1) Use a suitable shared library mechanism for linking with the
+ Library. A suitable mechanism is one that (a) uses at run time
+ a copy of the Library already present on the user's computer
+ system, and (b) will operate properly with a modified version
+ of the Library that is interface-compatible with the Linked
+ Version.
+
+ e) Provide Installation Information, but only if you would otherwise
+ be required to provide such information under section 6 of the
+ GNU GPL, and only to the extent that such information is
+ necessary to install and execute a modified version of the
+ Combined Work produced by recombining or relinking the
+ Application with a modified version of the Linked Version. (If
+ you use option 4d0, the Installation Information must accompany
+ the Minimal Corresponding Source and Corresponding Application
+ Code. If you use option 4d1, you must provide the Installation
+ Information in the manner specified by section 6 of the GNU GPL
+ for conveying Corresponding Source.)
+
+ 5. Combined Libraries.
+
+ You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+ a) Accompany the combined library with a copy of the same work based
+ on the Library, uncombined with any other library facilities,
+ conveyed under the terms of this License.
+
+ b) Give prominent notice with the combined library that part of it
+ is a work based on the Library, and explaining where to find the
+ accompanying uncombined form of the same work.
+
+ 6. Revised Versions of the GNU Lesser General Public License.
+
+ The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+ If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..08d22b2
--- /dev/null
+++ b/README.md
@@ -0,0 +1,9 @@
+# DictzipQuery
+
+This library aims to offer a straightforward C++ interface for querying compressed dictionaries distributed in the _dictzip_ format.
+
+## License
+
+This library is available under the terms of the LGPL 3.
+
+It is based on [alpinocorpus](https://github.com/rug-compling/alpinocorpus) - specifically its `DzIstreamBuf` and `DzIstream` classes.
diff --git a/example.cc b/example.cc
new file mode 100644
index 0000000..26bf404
--- /dev/null
+++ b/example.cc
@@ -0,0 +1,17 @@
+#include "istream/stream.h"
+
+#include <string>
+#include <iostream>
+
+// Prints the GCIDE definition of _Accession_ read from `gcide.dict.dz`.
+int main() {
+    dictzip::Istream stream("gcide.dict.dz");
+
+    // Position and size of the entry inside the uncompressed dictionary.
+    // NOTE(review): presumably taken from the dictionary's index file,
+    // which is not visible here.
+    const std::streamoff entryOffset = 245808;
+    const std::size_t entryLength = 1453;
+
+    // The string has to actually contain `entryLength` characters before
+    // its storage may be written to; merely reserving capacity is not enough.
+    std::string data(entryLength, '\0');
+
+    stream.seekg(entryOffset);
+    stream.read(&data[0], static_cast<std::streamsize>(data.size()));
+
+    // Keep only what was really read in case the stream ended early.
+    data.resize(static_cast<std::size_t>(stream.gcount()));
+
+    std::cout << data << std::endl;
+}
diff --git a/src/istream/buffer.cc b/src/istream/buffer.cc
new file mode 100644
index 0000000..aaa93ff
--- /dev/null
+++ b/src/istream/buffer.cc
@@ -0,0 +1,253 @@
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <stdexcept>
+
+#include <zlib.h>
+
+#include "buffer.h"
+#include "util/gzip.h"
+
+namespace dictzip {
+
+// Inflates chunk `n` of the dictzip file into `buffer_` and makes the result
+// the current get area, positioned at the start of the chunk.
+//
+// Does nothing if chunk `n` is already the current chunk.
+// Throws std::runtime_error if the chunk does not exist, cannot be read
+// from the file or cannot be inflated.
+void IstreamBuf::readChunk(long n) {
+    if ( n == curr_chunk_ ) {
+        return;
+    }
+
+    if ( n < 0 || static_cast<size_t>(n) >= chunks_.size() ) {
+        throw std::runtime_error("Requested dictzip chunk does not exist.");
+    }
+
+    const IstreamBuf::Chunk& chunkN = chunks_[n];
+
+    if ( chunkN.size == 0 || buffer_.empty() ) {
+        throw std::runtime_error("Empty dictzip chunk.");
+    }
+
+    // Owned by a vector so that it is released on every exit path,
+    // including the exceptional ones below.
+    std::vector<unsigned char> zBuf(chunkN.size);
+
+    if ( std::fseek(dictzip_file_,
+                    data_offset_ + static_cast<long>(chunkN.offset),
+                    SEEK_SET) != 0 ) {
+        throw std::runtime_error("Could not seek to dictzip chunk.");
+    }
+
+    if ( std::fread(&zBuf[0], 1, zBuf.size(), dictzip_file_) != zBuf.size() ) {
+        throw std::runtime_error("Could not read dictzip chunk.");
+    }
+
+    // Value-initialize so that no field (e.g. `opaque`, `msg`) is left
+    // indeterminate before zlib looks at it.
+    z_stream zStream = z_stream();
+    zStream.next_in = &zBuf[0];
+    zStream.avail_in = static_cast<uInt>(zBuf.size());
+    zStream.next_out = &buffer_[0];
+    zStream.avail_out = static_cast<uInt>(buffer_.size());
+    zStream.zalloc = Z_NULL;
+    zStream.zfree = Z_NULL;
+    zStream.opaque = Z_NULL;
+
+    // Negative window bits: the chunk is inflated as raw deflate data
+    // without a zlib or gzip wrapper.
+    if ( inflateInit2(&zStream, -15) != Z_OK ) {
+        // zlib may leave `msg` as NULL, which must not reach std::runtime_error.
+        throw std::runtime_error(
+            zStream.msg ? zStream.msg : "Could not initialize inflation of dictzip chunk."
+        );
+    }
+
+    const int r = inflate(&zStream, Z_PARTIAL_FLUSH);
+
+    if ( r != Z_OK && r != Z_STREAM_END ) {
+        // Build the exception first, then release zlib's internal state.
+        const std::runtime_error failure(
+            zStream.msg ? zStream.msg : "Could not inflate dictzip chunk."
+        );
+        inflateEnd(&zStream);
+        throw failure;
+    }
+
+    const uLong inflated = zStream.total_out;
+
+    if ( inflateEnd(&zStream) != Z_OK ) {
+        throw std::runtime_error(
+            zStream.msg ? zStream.msg : "Could not finish inflation of dictzip chunk."
+        );
+    }
+
+    char* const start = reinterpret_cast<char*>(&buffer_[0]);
+
+    this->setg(start, start, start + inflated);
+
+    curr_chunk_ = n;
+}
+
+namespace {
+
+// Reads a little-endian 16 bit unsigned integer from `file`.
+// The two bytes are read in separate statements because the order in which
+// the operands of `+` are evaluated is unspecified.
+// Throws std::runtime_error if the file ends before both bytes were read.
+int readUint16(FILE* file) {
+    const int low = std::fgetc(file);
+    const int high = std::fgetc(file);
+
+    if ( low == EOF || high == EOF ) {
+        throw std::runtime_error("Unexpected end of dictzip header.");
+    }
+
+    return low + ( high * 256 );
+}
+
+}
+
+// Parses the gzip extra field that follows the fixed header and extracts the
+// dictzip chunk table from its 'RA' subfield.
+//
+// On success `chunk_length_` holds the chunk length read from the subfield,
+// `buffer_` is sized to hold one such chunk and `chunks_` lists offset and
+// compressed size of every chunk.
+// Throws std::runtime_error if the field is truncated, has an unknown
+// version, declares a chunk length of zero or contains no 'RA' subfield.
+void IstreamBuf::readExtra() {
+    const int extraLen = readUint16(dictzip_file_);
+    const long extraPos = std::ftell(dictzip_file_);
+    const long nextField = extraPos + extraLen;
+
+    bool foundChunkTable = false;
+
+    while ( std::ftell(dictzip_file_) < nextField ) {
+        // Read extra field 'header'
+        char si[2];
+        if ( std::fread(si, 1, sizeof(si), dictzip_file_) != sizeof(si) ) {
+            throw std::runtime_error("Could not read extra dictzip field header.");
+        }
+
+        const int len = readUint16(dictzip_file_);
+
+        // Check for chunk information
+        if ( si[0] == 'R' && si[1] == 'A' ) {
+            const int ver = readUint16(dictzip_file_);
+            if ( ver != 1 ) {
+                throw std::runtime_error("Unknown dictzip version.");
+            }
+
+            chunk_length_ = readUint16(dictzip_file_);
+            if ( chunk_length_ == 0 ) {
+                // Seeking divides by the chunk length.
+                throw std::runtime_error("Invalid dictzip chunk length.");
+            }
+
+            const size_t chunkCount = readUint16(dictzip_file_);
+
+            buffer_.resize(chunk_length_);
+            char* const start = reinterpret_cast<char*>(&buffer_[0]);
+
+            // Nothing is inflated yet: set up an exhausted get area whose
+            // get pointer sits one full chunk past its beginning.
+            this->setg(start, start + chunk_length_, start + chunk_length_);
+
+            size_t chunkPos = 0;
+
+            chunks_.reserve(chunks_.size() + chunkCount);
+            for ( size_t i = 0; i < chunkCount; ++i ) {
+                const size_t chunkLen = readUint16(dictzip_file_);
+                chunks_.emplace_back(chunkPos, chunkLen);
+                chunkPos += chunkLen;
+            }
+
+            foundChunkTable = true;
+        } else {
+            // Not interested in this subfield, skip its payload.
+            std::fseek(dictzip_file_, len, SEEK_CUR);
+        }
+    }
+
+    if ( !foundChunkTable ) {
+        throw std::runtime_error("No chunk table found, given file cannot be a dictzip file.");
+    }
+}
+
+// Loads the fixed-size gzip header into `header_` and verifies that it
+// describes a deflate-compressed gzip file carrying an extra field.
+// Throws std::runtime_error as soon as one of these expectations is not met.
+void IstreamBuf::readHeader() {
+    header_.resize(GZ_HEADER_SIZE);
+    unsigned char* const bytes = &header_[0];
+
+    const size_t bytesRead = std::fread(bytes, 1, GZ_HEADER_SIZE, dictzip_file_);
+    if ( bytesRead != GZ_HEADER_SIZE ) {
+        throw std::runtime_error("Could not read dictzip header.");
+    }
+
+    const bool hasGzipMagic = bytes[GZ_HEADER_ID1] == gzipId1
+                           && bytes[GZ_HEADER_ID2] == gzipId2;
+    if ( !hasGzipMagic ) {
+        throw std::runtime_error("Given dictzip file is not a gzip file.");
+    }
+
+    const bool usesDeflate = bytes[GZ_HEADER_CM] == GZ_CM_DEFLATE;
+    if ( !usesDeflate ) {
+        throw std::runtime_error("Unknown compression method detected.");
+    }
+
+    const bool hasExtraField = ( bytes[GZ_HEADER_FLG] & GZ_FLG_EXTRA ) != 0;
+    if ( !hasExtraField ) {
+        throw std::runtime_error("No extra fields, given file cannot be a dictzip file.");
+    }
+}
+
+// Advances the file position past the optional gzip header fields announced
+// by the flag byte of the previously read header: the zero-terminated file
+// name, the zero-terminated comment and the two byte header checksum.
+//
+// Throws std::runtime_error if the file ends inside one of the
+// zero-terminated fields; without this check `fgetc` would keep returning
+// EOF and the search for the terminator would never finish.
+void IstreamBuf::skipOptional() {
+    const unsigned char flags = header_[GZ_HEADER_FLG];
+
+    // Both fields are skipped the same way, in this order.
+    const unsigned char zeroTerminated[] = { GZ_FLG_NAME, GZ_FLG_COMMENT };
+
+    for ( size_t i = 0; i < sizeof(zeroTerminated); ++i ) {
+        if ( !( flags & zeroTerminated[i] ) ) {
+            continue;
+        }
+
+        int c;
+        do {
+            c = std::fgetc(dictzip_file_);
+            if ( c == EOF ) {
+                throw std::runtime_error("Unexpected end of dictzip header.");
+            }
+        } while ( c != 0 );
+    }
+
+    if ( flags & GZ_FLG_HCRC ) {
+        std::fseek(dictzip_file_, 2, SEEK_CUR);
+    }
+}
+
+// Makes sure a character is available in the get area, inflating following
+// chunks as needed, and returns that character without consuming it.
+// Returns EOF once the last chunk is exhausted.
+//
+// The character is converted using `traits_type::to_int_type` so that bytes
+// with the high bit set are reported as non-negative values and a 0xFF byte
+// cannot be mistaken for EOF.
+int IstreamBuf::underflow() {
+    // Loop instead of a single read: a chunk may inflate to zero bytes, in
+    // which case there is nothing to dereference yet.
+    while ( this->gptr() >= this->egptr() ) {
+        if ( curr_chunk_ + 1 >= static_cast<long>(chunks_.size()) ) {
+            return traits_type::eof();
+        }
+
+        this->readChunk(curr_chunk_ + 1);
+    }
+
+    return traits_type::to_int_type(*this->gptr());
+}
+
+// From C++ annotations 8.1.0~pre1, chapter 23.
+//
+// Copies up to `n` characters into `dest`, refilling the get area through
+// `underflow` whenever it runs empty.
+// Returns the number of characters actually copied, which is smaller than
+// `n` only if the end of the data was reached. Requests with `n <= 0` copy
+// nothing.
+std::streamsize IstreamBuf::xsgetn(char *dest, std::streamsize n) {
+    // Counted in std::streamsize so that large requests are not narrowed.
+    std::streamsize nread = 0;
+
+    while ( n > 0 ) {
+        if ( this->gptr() >= this->egptr() ) {
+            this->underflow();
+
+            // Decide by the get area rather than by the returned character:
+            // still empty means there is nothing left to read.
+            if ( this->gptr() >= this->egptr() ) {
+                break;
+            }
+        }
+
+        std::streamsize avail = this->egptr() - this->gptr();
+
+        if ( avail > n ) {
+            avail = n;
+        }
+
+        std::memcpy(dest + nread, this->gptr(), static_cast<size_t>(avail));
+        // `avail` never exceeds the size of the get area, so it fits the
+        // `int` expected by gbump as long as the chunk buffer does
+        // (presumably always, the chunk length is read as a 16 bit value).
+        this->gbump(static_cast<int>(avail));
+
+        nread += avail;
+        n -= avail;
+    }
+
+    return nread;
+}
+
+// Opens the dictzip file `filename` and parses its gzip header so that
+// chunks can subsequently be located and inflated on demand.
+//
+// Throws std::runtime_error if the file cannot be opened or if its header
+// is rejected by one of the parsing steps.
+IstreamBuf::IstreamBuf(char const *filename):
+    // Binary mode: the file holds compressed data, no newline translation
+    // may be applied to it.
+    dictzip_file_{ std::fopen(filename, "rb") } {
+    if ( !dictzip_file_ ) {
+        throw std::runtime_error("Could not open input dictzip stream.");
+    }
+
+    // Known state before parsing starts: no chunk has been inflated yet.
+    chunk_length_ = 0;
+    data_offset_ = 0;
+    curr_chunk_ = -1;
+
+    try {
+        readHeader();
+        readExtra();
+        skipOptional();
+        data_offset_ = std::ftell(dictzip_file_);
+    } catch ( ... ) {
+        // The destructor does not run for an object whose constructor
+        // throws, so the file has to be closed here.
+        std::fclose(dictzip_file_);
+        dictzip_file_ = nullptr;
+        throw;
+    }
+}
+
+// Closes the underlying dictzip file, provided one is open.
+IstreamBuf::~IstreamBuf() {
+    if ( dictzip_file_ == nullptr ) {
+        return;
+    }
+
+    std::fclose(dictzip_file_);
+}
+
+// Moves the read position to `off`, interpreted relative to the beginning
+// of the uncompressed data (`std::ios::beg`) or to the current position
+// (`std::ios::cur`), inflating the chunk containing the target.
+//
+// Returns the new absolute position, or `pos_type(off_type(-1))` if
+//  - `dir` is `std::ios::end` (not supported, see below),
+//  - the target lies before the beginning of the data,
+//  - the target lies in a chunk that does not exist, or
+//  - the target lies behind the data the target chunk inflates to. In this
+//    last case the target chunk has already been loaded and the read
+//    position is left at its beginning.
+IstreamBuf::pos_type IstreamBuf::seekoff(off_type off, seekdir dir, openmode) {
+    const pos_type failure = pos_type(off_type(-1));
+
+    off_type target = 0;
+
+    switch ( dir ) {
+        case std::ios::beg: {
+            target = off;
+            break;
+        }
+        case std::ios::cur: {
+            // Signed arithmetic: before the first chunk is read
+            // `curr_chunk_` is -1 and the get pointer sits one chunk
+            // length past the start of the buffer, which sums up to 0.
+            const off_type current
+                = static_cast<off_type>(curr_chunk_) * static_cast<off_type>(chunk_length_)
+                + static_cast<off_type>(this->gptr() - this->eback());
+            target = current + off;
+            break;
+        }
+        default: {
+            // XXX - We can only determine the uncompressed file length by decompressing the
+            // last chunk. Quite inefficient, haven't made my mind up whether we want to
+            // support this.
+            return failure;
+        }
+    }
+
+    // A chunk length of zero would lead to a division by zero below.
+    if ( target < 0 || chunk_length_ == 0 || chunks_.empty() ) {
+        return failure;
+    }
+
+    const off_type chunkLength = static_cast<off_type>(chunk_length_);
+    const off_type chunkCount = static_cast<off_type>(chunks_.size());
+
+    off_type targetChunk = target / chunkLength;
+    off_type chunkPos = target % chunkLength;
+
+    // The position directly behind a completely filled last chunk is a
+    // legitimate end-of-data position; express it relative to that chunk.
+    if ( targetChunk == chunkCount && chunkPos == 0 ) {
+        targetChunk -= 1;
+        chunkPos = chunkLength;
+    }
+
+    if ( targetChunk >= chunkCount ) {
+        return failure;
+    }
+
+    this->readChunk(static_cast<long>(targetChunk));
+
+    // The chunk may hold less data than a full chunk length.
+    if ( chunkPos > static_cast<off_type>(this->egptr() - this->eback()) ) {
+        return failure;
+    }
+
+    this->setg(this->eback(), this->eback() + chunkPos, this->egptr());
+
+    return pos_type(target);
+}
+
+// Absolute positioning: forwards to `seekoff`, treating `off` as an offset
+// from the beginning of the data.
+IstreamBuf::pos_type IstreamBuf::seekpos(pos_type off, openmode mode) {
+    const off_type fromBeginning = off;
+
+    return this->seekoff(fromBeginning, std::ios::beg, mode);
+}
+
+}
diff --git a/src/istream/buffer.h b/src/istream/buffer.h
new file mode 100644
index 0000000..91bb53f
--- /dev/null
+++ b/src/istream/buffer.h
@@ -0,0 +1,58 @@
+#pragma once
+
+#include <cstdio>
+#include <iostream>
+#include <streambuf>
+#include <vector>
+
+namespace dictzip {
+
+// Read only stream buffer exposing the uncompressed contents of a dictzip
+// file. The file is inflated one chunk at a time and supports seeking
+// relative to the beginning and to the current position.
+//
+// Instances own the underlying `FILE*` and are therefore not copyable.
+//
+// Warning: The inherent statefulness of stream buffers interferes with
+// multithreaded access. Users of this class should perform appropriate
+// locking, since they are in a better position to do so.
+class IstreamBuf : public std::streambuf {
+private:
+    // Location of one compressed chunk: `offset` is relative to the start
+    // of the compressed data, `size` is the compressed size in bytes.
+    struct Chunk {
+        Chunk(size_t offset, size_t size):
+            offset(offset),
+            size(size) {}
+
+        const size_t offset;
+        const size_t size;
+    };
+
+    FILE* dictzip_file_ = nullptr;
+
+    // Holds the inflated data of the current chunk and backs the get area.
+    std::vector<unsigned char> buffer_;
+    // Raw bytes of the fixed-size gzip header.
+    std::vector<unsigned char> header_;
+    // Chunk table in file order.
+    std::vector<Chunk> chunks_;
+
+    // Uncompressed length of a chunk as announced by the file.
+    size_t chunk_length_ = 0;
+    // File position at which the compressed data starts.
+    long data_offset_ = 0;
+    // Index of the chunk currently held in `buffer_`, -1 if there is none.
+    long curr_chunk_ = -1;
+
+    void readChunk(long n);
+    void readHeader();
+    void readExtra();
+    void skipOptional();
+
+protected:
+    int underflow() override;
+    std::streamsize xsgetn(char *dest, std::streamsize n) override;
+
+public:
+    using pos_type = std::streambuf::pos_type;
+    using off_type = std::streambuf::off_type;
+
+    typedef std::ios::seekdir seekdir;
+    typedef std::ios::openmode openmode;
+
+    // Opens `filename`; throws std::runtime_error on failure.
+    IstreamBuf(char const* filename);
+    ~IstreamBuf();
+
+    // Copies would share `dictzip_file_` and close it twice.
+    IstreamBuf(const IstreamBuf&) = delete;
+    IstreamBuf& operator=(const IstreamBuf&) = delete;
+
+    pos_type seekoff(off_type off, seekdir dir, openmode) override;
+    pos_type seekpos(pos_type off, openmode mode) override;
+
+};
+
+}
diff --git a/src/istream/stream.cc b/src/istream/stream.cc
new file mode 100644
index 0000000..aea548f
--- /dev/null
+++ b/src/istream/stream.cc
@@ -0,0 +1,13 @@
+#include <iostream>
+
+#include "stream.h"
+
+namespace dictzip {
+
+// Starts out as a stream without buffer, then creates the dictzip buffer
+// for `filename` and attaches it as the stream's buffer.
+Istream::Istream(char const* filename):
+    std::istream(nullptr),
+    buffer_(new IstreamBuf(filename)) {
+    IstreamBuf* const attached = buffer_.get();
+    this->rdbuf(attached);
+}
+
+}
diff --git a/src/istream/stream.h b/src/istream/stream.h
new file mode 100644
index 0000000..0602d06
--- /dev/null
+++ b/src/istream/stream.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <memory>
+
+#include "buffer.h"
+
+namespace dictzip {
+
+// Input stream reading the uncompressed contents of a dictzip file.
+// Keeps the stream buffer it reads from alive for its own lifetime.
+class Istream : public std::istream {
+public:
+    // Opens the dictzip file `filename`.
+    Istream(char const* filename);
+    virtual ~Istream() {}
+
+private:
+    std::shared_ptr<IstreamBuf> buffer_;
+
+};
+
+}
diff --git a/src/util/gzip.h b/src/util/gzip.h
new file mode 100644
index 0000000..53e59f6
--- /dev/null
+++ b/src/util/gzip.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include <cstddef>
+
+// Constants describing the layout of a gzip file: positions inside the
+// fixed-size header, magic bytes, flag bits and positions inside the trailer.
+// NOTE(review): names and values look like they follow the gzip file format
+// specification (RFC 1952) - confirm against the specification when changing.
+// They live in an unnamed namespace, so every translation unit including
+// this header gets its own internal copy.
+namespace {
+
+// Number of bytes of the fixed part of the gzip header
+size_t const GZ_HEADER_SIZE = 10;
+
+// Gzip header fields
+// (byte positions inside the fixed header, used as indices into the header bytes)
+size_t const GZ_HEADER_ID1 = 0;
+size_t const GZ_HEADER_ID2 = 1;
+size_t const GZ_HEADER_CM = 2;
+size_t const GZ_HEADER_FLG = 3;
+size_t const GZ_HEADER_MTIME = 4;
+size_t const GZ_HEADER_XFL = 8;
+size_t const GZ_HEADER_OS = 9;
+
+// Gzip file magic
+// (expected values of the header bytes at GZ_HEADER_ID1 and GZ_HEADER_ID2)
+unsigned char const gzipId1 = 0x1f;
+unsigned char const gzipId2 = 0x8b;
+
+// Gzip compression method(s)
+// (expected value of the header byte at GZ_HEADER_CM)
+unsigned char const GZ_CM_DEFLATE = 8;
+
+// Flags in GZ_HEADER_FLG
+// (bit masks, to be tested against the flag byte using bitwise and)
+unsigned char const GZ_FLG_TEXT = 1;
+unsigned char const GZ_FLG_HCRC = 1 << 1;
+unsigned char const GZ_FLG_EXTRA = 1 << 2;
+unsigned char const GZ_FLG_NAME = 1 << 3;
+unsigned char const GZ_FLG_COMMENT = 1 << 4;
+
+// GZ_HEADER_XFL values for deflate
+unsigned char const GZ_XFL_MAX = 2;
+unsigned char const GZ_XFL_FAST = 4;
+
+// GZ_HEADER_OS values
+unsigned char const GZ_OS_FAT = 0;
+unsigned char const GZ_OS_AMIGA = 1;
+unsigned char const GZ_OS_VMS = 2;
+unsigned char const GZ_OS_UNIX = 3;
+unsigned char const GZ_OS_VM_CMS = 4;
+unsigned char const GZ_OS_TOS = 5;
+unsigned char const GZ_OS_HPFS = 6;
+unsigned char const GZ_OS_MAC = 7;
+unsigned char const GZ_OS_ZSYSTEM = 8;
+unsigned char const GZ_OS_CPM = 9;
+unsigned char const GZ_OS_TOPS20 = 10;
+unsigned char const GZ_OS_NTFS = 11;
+unsigned char const GZ_OS_QDOS = 12;
+unsigned char const GZ_OS_RISCOS = 13;
+unsigned char const GZ_OS_UNKNOWN = 255;
+
+// Number of bytes of the gzip trailer
+size_t const GZ_TRAILER_SIZE = 8;
+
+// Gzip trailer fields
+// (byte positions inside the trailer)
+size_t const GZ_TRAILER_CRC32 = 0;
+size_t const GZ_TRAILER_ISIZE = 4;
+
+}