From c953c72c86c281d650b2a8ff856e3d614664e11a Mon Sep 17 00:00:00 2001 From: Adrian Kummerlaender Date: Thu, 5 Oct 2017 21:57:08 +0200 Subject: Provide basic read only access to dictzip files `dictzip::Istream` and `dictzip::IstreamBuf` are forked from `alpinocorpus::DzIstream` and `alpinocorpus::DzIstreamBuf` of rug-compling/alpinocorpus. --- CMakeLists.txt | 36 +++++++ LICENSE | 165 ++++++++++++++++++++++++++++++++ README.md | 9 ++ example.cc | 17 ++++ src/istream/buffer.cc | 253 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/istream/buffer.h | 58 ++++++++++++ src/istream/stream.cc | 13 +++ src/istream/stream.h | 21 +++++ src/util/gzip.h | 59 ++++++++++++ 9 files changed, 631 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 LICENSE create mode 100644 README.md create mode 100644 example.cc create mode 100644 src/istream/buffer.cc create mode 100644 src/istream/buffer.h create mode 100644 src/istream/stream.cc create mode 100644 src/istream/stream.h create mode 100644 src/util/gzip.h diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..2514635 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,36 @@ +cmake_minimum_required(VERSION 2.8) +project(DictzipQuery) + +set( + CMAKE_CXX_FLAGS + "-std=c++0x -W -Wall -Wextra -Winline -pedantic" +) + +include_directories( + src/ +) + +add_library( + DictzipQuery + SHARED + src/istream/stream.cc + src/istream/buffer.cc +) + +add_executable( + example + example.cc +) + +target_link_libraries( + example + DictzipQuery + z +) + +install( + TARGETS + DictzipQuery + LIBRARY DESTINATION + lib +) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0a04128 --- /dev/null +++ b/LICENSE @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. 
+ + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. 
+ + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. 
+ + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. 
+ + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. + + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. 
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..08d22b2
--- /dev/null
+++ b/README.md
@@ -0,0 +1,9 @@
+# DictzipQuery
+
+This library aims to offer a straightforward C++ interface for querying compressed dictionaries distributed in the _dictzip_ format.
+
+## License
+
+This library is available under the terms of the LGPL 3.
+
+It is based on [alpinocorpus](https://github.com/rug-compling/alpinocorpus) - specifically its `DzIstreamBuf` and `DzIstream` classes.
diff --git a/example.cc b/example.cc
new file mode 100644
index 0000000..26bf404
--- /dev/null
+++ b/example.cc
@@ -0,0 +1,21 @@
+#include "istream/stream.h"
+
+#include <iostream>
+#include <string>
+
+int main() {
+    dictzip::Istream stream("gcide.dict.dz");
+
+    // Print the GCIDE definition of _Accession_
+    // The string is sized (not merely reserved) so that reading into its
+    // storage is well defined.
+    std::string data(1453, '\0');
+
+    stream.seekg(245808);
+    stream.read(&data[0], static_cast<std::streamsize>(data.size()));
+
+    // Only keep what was actually read
+    data.resize(static_cast<std::string::size_type>(stream.gcount()));
+
+    std::cout << data << std::endl;
+}
diff --git a/src/istream/buffer.cc b/src/istream/buffer.cc
new file mode 100644
index 0000000..aaa93ff
--- /dev/null
+++ b/src/istream/buffer.cc
@@ -0,0 +1,372 @@
+#include <algorithm>
+#include <cstdio>
+#include <cstring>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <zlib.h>
+
+#include "buffer.h"
+#include "util/gzip.h"
+
+namespace {
+
+// Reads a single byte of the given file.
+// Throws instead of returning EOF so that truncated files can neither yield
+// bogus values nor stall the parsing loops.
+unsigned readByte(FILE* file) {
+    const int byte = std::fgetc(file);
+
+    if ( byte == EOF ) {
+        throw std::runtime_error("Unexpected end of dictzip file.");
+    }
+
+    return static_cast<unsigned>(byte);
+}
+
+// Reads a little endian 16 bit value of the given file.
+// Both bytes are read in separate statements as the evaluation order of the
+// operands of `+` is unspecified.
+unsigned readUint16(FILE* file) {
+    const unsigned low  = readByte(file);
+    const unsigned high = readByte(file);
+
+    return low + ( high * 256 );
+}
+
+// Skips a zero terminated string.
+void skipString(FILE* file) {
+    while ( readByte(file) != 0 ) {}
+}
+
+// Moves the file position, throws if this is not possible.
+void seekFile(FILE* file, long offset, int origin) {
+    if ( std::fseek(file, offset, origin) != 0 ) {
+        throw std::runtime_error("Could not seek in dictzip file.");
+    }
+}
+
+// Returns the current file position, throws if it cannot be determined.
+long tellFile(FILE* file) {
+    const long position = std::ftell(file);
+
+    if ( position < 0 ) {
+        throw std::runtime_error("Could not determine dictzip file position.");
+    }
+
+    return position;
+}
+
+// zlib does not provide a message for every error and constructing a
+// std::runtime_error from a null pointer is undefined.
+std::string zlibMessage(const z_stream& stream) {
+    return stream.msg ? stream.msg : "Could not inflate dictzip chunk.";
+}
+
+}
+
+namespace dictzip {
+
+// Inflates the n-th chunk into buffer_ and exposes it as the new get area.
+// Does nothing if this chunk is already loaded.
+// Throws std::runtime_error if the chunk does not exist, cannot be read or
+// cannot be inflated.
+void IstreamBuf::readChunk(long n) {
+    if ( n == curr_chunk_ ) {
+        return;
+    }
+
+    if ( n < 0 || static_cast<size_t>(n) >= chunks_.size() ) {
+        throw std::runtime_error("Requested dictzip chunk does not exist.");
+    }
+
+    const Chunk& chunk = chunks_[n];
+
+    // The vector releases the compressed data on all paths, including the
+    // throwing ones.
+    std::vector<unsigned char> compressed(chunk.size);
+
+    seekFile(dictzip_file_, data_offset_ + static_cast<long>(chunk.offset), SEEK_SET);
+
+    if ( std::fread(compressed.data(), 1, compressed.size(), dictzip_file_) != compressed.size() ) {
+        throw std::runtime_error("Could not read dictzip chunk.");
+    }
+
+    z_stream zStream;
+    zStream.next_in   = compressed.data();
+    zStream.avail_in  = static_cast<uInt>(compressed.size());
+    zStream.next_out  = buffer_.data();
+    zStream.avail_out = static_cast<uInt>(buffer_.size());
+    zStream.zalloc    = Z_NULL;
+    zStream.zfree     = Z_NULL;
+    zStream.opaque    = Z_NULL;
+    zStream.msg       = Z_NULL;
+
+    // Negative window bits request raw deflate data, i.e. no zlib header
+    // is expected in front of the chunk.
+    if ( inflateInit2(&zStream, -15) != Z_OK ) {
+        throw std::runtime_error(zlibMessage(zStream));
+    }
+
+    // The inflate state is released before the result is checked so that
+    // it is not leaked by the error paths.
+    const int         status   = inflate(&zStream, Z_PARTIAL_FLUSH);
+    const std::string message  = zlibMessage(zStream);
+    const uLong       produced = zStream.total_out;
+    const bool        released = inflateEnd(&zStream) == Z_OK;
+
+    if ( ( status != Z_OK && status != Z_STREAM_END ) || !released ) {
+        throw std::runtime_error(message);
+    }
+
+    char* const begin = reinterpret_cast<char*>(buffer_.data());
+
+    this->setg(begin, begin, begin + produced);
+
+    curr_chunk_ = n;
+}
+
+// Parses the gzip extra field and extracts the chunk table of its 'RA'
+// subfield, i.e. chunk_length_, chunks_ and the size of buffer_.
+// All other subfields are skipped.
+void IstreamBuf::readExtra() {
+    const long extraLen  = readUint16(dictzip_file_);
+    const long nextField = tellFile(dictzip_file_) + extraLen;
+
+    while ( tellFile(dictzip_file_) < nextField ) {
+        // Read extra field 'header'
+        char si[2];
+        if ( std::fread(si, 1, sizeof(si), dictzip_file_) != sizeof(si) ) {
+            throw std::runtime_error("Could not read extra dictzip field header.");
+        }
+
+        const long len = readUint16(dictzip_file_);
+
+        // Check for chunk information
+        if ( si[0] == 'R' && si[1] == 'A' ) {
+            if ( readUint16(dictzip_file_) != 1 ) {
+                throw std::runtime_error("Unknown dictzip version.");
+            }
+
+            chunk_length_ = readUint16(dictzip_file_);
+
+            // A chunk length of zero would cause divisions by zero when
+            // seeking.
+            if ( chunk_length_ == 0 ) {
+                throw std::runtime_error("Invalid dictzip chunk length.");
+            }
+
+            const size_t chunkCount = readUint16(dictzip_file_);
+
+            buffer_.resize(chunk_length_);
+            char* const end = reinterpret_cast<char*>(buffer_.data()) + chunk_length_;
+
+            // Empty get area positioned at the end of the buffer. Together
+            // with curr_chunk_ == -1 this represents stream position zero.
+            this->setg(end - chunk_length_, end, end);
+
+            chunks_.reserve(chunkCount);
+
+            size_t chunkPos = 0;
+
+            for ( size_t i = 0; i < chunkCount; ++i ) {
+                const size_t chunkLen = readUint16(dictzip_file_);
+                chunks_.emplace_back(chunkPos, chunkLen);
+                chunkPos += chunkLen;
+            }
+        } else {
+            seekFile(dictzip_file_, len, SEEK_CUR);
+        }
+    }
+}
+
+// Reads the fixed size gzip header into header_ and verifies that it
+// describes a deflate compressed gzip file carrying an extra field.
+void IstreamBuf::readHeader() {
+    header_.resize(GZ_HEADER_SIZE);
+
+    if ( std::fread(header_.data(), 1, GZ_HEADER_SIZE, dictzip_file_) != GZ_HEADER_SIZE ) {
+        throw std::runtime_error("Could not read dictzip header.");
+    }
+
+    if ( header_[GZ_HEADER_ID1] != gzipId1 ||
+         header_[GZ_HEADER_ID2] != gzipId2 ) {
+        throw std::runtime_error("Given dictzip file is not a gzip file.");
+    }
+
+    if ( header_[GZ_HEADER_CM] != GZ_CM_DEFLATE ) {
+        throw std::runtime_error("Unknown compression method detected.");
+    }
+
+    if ( !(header_[GZ_HEADER_FLG] & GZ_FLG_EXTRA) ) {
+        throw std::runtime_error("No extra fields, given file cannot be a dictzip file.");
+    }
+}
+
+// Skips the optional gzip header fields following the extra field, i.e.
+// the file name, the comment and the header checksum.
+void IstreamBuf::skipOptional() {
+    const unsigned char flags = header_[GZ_HEADER_FLG];
+
+    if ( flags & GZ_FLG_NAME ) {
+        skipString(dictzip_file_);
+    }
+
+    if ( flags & GZ_FLG_COMMENT ) {
+        skipString(dictzip_file_);
+    }
+
+    if ( flags & GZ_FLG_HCRC ) {
+        seekFile(dictzip_file_, 2, SEEK_CUR);
+    }
+}
+
+// Provides the next chunk once the current one is exhausted.
+// Returns the next character without consuming it, EOF behind the last
+// chunk.
+int IstreamBuf::underflow() {
+    if ( this->gptr() == this->egptr() ) {
+        if ( curr_chunk_ + 1 >= static_cast<long>(chunks_.size()) ) {
+            return traits_type::eof();
+        }
+
+        this->readChunk(curr_chunk_ + 1);
+
+        // A chunk may inflate to nothing
+        if ( this->gptr() == this->egptr() ) {
+            return traits_type::eof();
+        }
+    }
+
+    // to_int_type prevents bytes above 0x7f from being sign extended, 0xff
+    // would otherwise be indistinguishable from EOF.
+    return traits_type::to_int_type(*this->gptr());
+}
+
+// From C++ annotations 8.1.0~pre1, chapter 23.
+std::streamsize IstreamBuf::xsgetn(char *dest, std::streamsize n) {
+    std::streamsize nread = 0;
+
+    while ( nread < n ) {
+        if ( this->gptr() == this->egptr() &&
+             this->underflow() == traits_type::eof() ) {
+            break;
+        }
+
+        const std::streamsize avail = std::min<std::streamsize>(
+            this->egptr() - this->gptr(),
+            n - nread
+        );
+
+        std::memcpy(dest + nread, this->gptr(), static_cast<size_t>(avail));
+        this->gbump(static_cast<int>(avail));
+
+        nread += avail;
+    }
+
+    return nread;
+}
+
+// Opens the given dictzip file and parses its gzip header including the
+// chunk table. Throws std::runtime_error if the file cannot be opened or
+// is not a valid dictzip file.
+IstreamBuf::IstreamBuf(char const *filename):
+    dictzip_file_{ std::fopen(filename, "rb") },
+    chunk_length_{ 0 },
+    data_offset_{ 0 },
+    curr_chunk_{ -1 } {
+    if ( !dictzip_file_ ) {
+        throw std::runtime_error("Could not open input dictzip stream.");
+    }
+
+    // The destructor is not executed if the constructor throws, the file
+    // has to be closed explicitly in this case.
+    try {
+        readHeader();
+        readExtra();
+        skipOptional();
+
+        if ( chunk_length_ == 0 ) {
+            throw std::runtime_error("No dictzip chunk information found.");
+        }
+
+        data_offset_ = tellFile(dictzip_file_);
+    } catch ( ... ) {
+        std::fclose(dictzip_file_);
+        throw;
+    }
+}
+
+IstreamBuf::~IstreamBuf() {
+    if ( dictzip_file_ ) {
+        std::fclose(dictzip_file_);
+    }
+}
+
+// Repositions the stream relative to its beginning or its current position,
+// seeking relative to the end is not supported.
+// Returns the new position, pos_type(off_type(-1)) on failure.
+IstreamBuf::pos_type IstreamBuf::seekoff(off_type off, seekdir dir, openmode) {
+    const pos_type failure = pos_type(off_type(-1));
+    off_type targetPos     = 0;
+
+    switch ( dir ) {
+        case std::ios::beg: {
+            targetPos = off;
+            break;
+        }
+        case std::ios::cur: {
+            // Signed arithmetic as curr_chunk_ is -1 until the first chunk
+            // was loaded.
+            const off_type curPos = static_cast<off_type>(curr_chunk_)
+                                  * static_cast<off_type>(chunk_length_)
+                                  + (gptr() - eback());
+            targetPos = curPos + off;
+            break;
+        }
+        default: {
+            // XXX - We can only determine the uncompressed file length by decompressing the
+            // last chunk. Quite inefficient, haven't made my mind up whether we want to
+            // support this.
+            return failure;
+        }
+    }
+
+    if ( targetPos < 0 ) {
+        return failure;
+    }
+
+    const off_type chunkLength = static_cast<off_type>(chunk_length_);
+    off_type targetChunk       = targetPos / chunkLength;
+    off_type chunkPos          = targetPos % chunkLength;
+
+    // The position directly behind the last chunk is addressed as the end
+    // of this chunk, it marks the end of the data if the chunk is complete.
+    if ( chunkPos == 0 && targetChunk > 0 &&
+         targetChunk == static_cast<off_type>(chunks_.size()) ) {
+        targetChunk -= 1;
+        chunkPos     = chunkLength;
+    }
+
+    if ( targetChunk >= static_cast<off_type>(chunks_.size()) ) {
+        return failure;
+    }
+
+    this->readChunk(static_cast<long>(targetChunk));
+
+    // A chunk may hold less than chunk_length_ bytes
+    if ( chunkPos > this->egptr() - this->eback() ) {
+        return failure;
+    }
+
+    this->setg(this->eback(), this->eback() + chunkPos, this->egptr());
+
+    return pos_type(targetPos);
+}
+
+IstreamBuf::pos_type IstreamBuf::seekpos(pos_type off, openmode mode) {
+    return seekoff(off_type(off), std::ios::beg, mode);
+}
+
+}
diff --git a/src/istream/buffer.h b/src/istream/buffer.h
new file mode 100644
index 0000000..91bb53f
--- /dev/null
+++ b/src/istream/buffer.h
@@ -0,0 +1,68 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdio>
+#include <ios>
+#include <streambuf>
+#include <vector>
+
+namespace dictzip {
+
+// Read only stream buffer providing seekable access to the uncompressed
+// contents of a dictzip file. The data is inflated chunk by chunk, only
+// the current chunk is kept in memory.
+//
+// Warning: The inherent statefulness of stream buffers interferes with
+// multithreaded access. Users of this class should perform appropriate
+// locking, since they are in a better position to do so.
+class IstreamBuf : public std::streambuf {
+private:
+    // Size of a compressed chunk and its offset relative to the first chunk
+    struct Chunk {
+        Chunk(size_t offset, size_t size):
+            offset(offset),
+            size(size) {}
+
+        const size_t offset;
+        const size_t size;
+    };
+
+    FILE* dictzip_file_;
+
+    std::vector<unsigned char> buffer_;  // Inflated data of the current chunk
+    std::vector<unsigned char> header_;  // Fixed size part of the gzip header
+    std::vector<Chunk>         chunks_;  // Chunk table of the 'RA' subfield
+
+    size_t chunk_length_;  // Inflated chunk size as stated by the chunk table
+    long   data_offset_;   // File position of the first chunk
+    long   curr_chunk_;    // Index of the chunk held by buffer_, -1 if none
+
+    void readChunk(long n);
+    void readHeader();
+    void readExtra();
+    void skipOptional();
+
+protected:
+    int underflow() override;
+    std::streamsize xsgetn(char *dest, std::streamsize n) override;
+
+public:
+    using pos_type = std::streambuf::pos_type;
+    using off_type = std::streambuf::off_type;
+
+    typedef std::ios::seekdir seekdir;
+    typedef std::ios::openmode openmode;
+
+    IstreamBuf(char const* filename);
+    ~IstreamBuf();
+
+    // The buffer owns its file handle, copies would close it twice.
+    IstreamBuf(const IstreamBuf&) = delete;
+    IstreamBuf& operator=(const IstreamBuf&) = delete;
+
+    pos_type seekoff(off_type off, seekdir dir, openmode) override;
+    pos_type seekpos(pos_type off, openmode mode) override;
+
+};
+
+}
diff --git a/src/istream/stream.cc b/src/istream/stream.cc
new file mode 100644
index 0000000..aea548f
--- /dev/null
+++ b/src/istream/stream.cc
@@ -0,0 +1,14 @@
+#include <istream>
+
+#include "stream.h"
+
+namespace dictzip {
+
+// Creates the stream buffer for the given dictzip file and installs it.
+Istream::Istream(char const* filename):
+    std::istream(nullptr),
+    buffer_{ new IstreamBuf(filename) } {
+    this->rdbuf(buffer_.get());
+}
+
+}
diff --git a/src/istream/stream.h b/src/istream/stream.h
new file mode 100644
index 0000000..0602d06
--- /dev/null
+++ b/src/istream/stream.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <istream>
+#include <memory>
+#include <string>
+
+#include "buffer.h"
+
+namespace dictzip {
+
+// Input stream providing the uncompressed contents of a dictzip file.
+class Istream : public std::istream {
+private:
+    // Stream buffer installed as rdbuf, exclusively owned by this stream
+    std::unique_ptr<IstreamBuf> buffer_;
+
+public:
+    Istream(char const* filename);
+    virtual ~Istream() {}
+
+};
+
+}
diff --git a/src/util/gzip.h b/src/util/gzip.h
new file mode 100644
index 0000000..53e59f6
--- /dev/null
+++ b/src/util/gzip.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include <cstddef>
+
+namespace {
+
+size_t const GZ_HEADER_SIZE = 10;
+
+// Gzip header fields
+size_t const GZ_HEADER_ID1 = 0;
+size_t const GZ_HEADER_ID2 = 1;
+size_t const GZ_HEADER_CM = 2;
+size_t const GZ_HEADER_FLG = 3;
+size_t const GZ_HEADER_MTIME = 4;
+size_t const GZ_HEADER_XFL = 8;
+size_t const GZ_HEADER_OS = 9;
+
+// Gzip file magic
+unsigned char const gzipId1 = 0x1f;
+unsigned char const gzipId2 = 0x8b;
+
+// Gzip compression method(s)
+unsigned char const GZ_CM_DEFLATE = 8;
+
+// Flags in GZ_HEADER_FLG
+unsigned char const GZ_FLG_TEXT = 1;
+unsigned char const GZ_FLG_HCRC = 1 << 1;
+unsigned char const GZ_FLG_EXTRA = 1 << 2;
+unsigned char const GZ_FLG_NAME = 1 << 3;
+unsigned char const GZ_FLG_COMMENT = 1 << 4;
+
+// GZ_HEADER_XFL values for deflate
+unsigned char const GZ_XFL_MAX = 2;
+unsigned char const GZ_XFL_FAST = 4;
+
+// GZ_HEADER_OS values
+unsigned char const GZ_OS_FAT = 0;
+unsigned char const GZ_OS_AMIGA = 1;
+unsigned char const GZ_OS_VMS = 2;
+unsigned char const GZ_OS_UNIX = 3;
+unsigned char const GZ_OS_VM_CMS = 4;
+unsigned char const GZ_OS_TOS = 5;
+unsigned char const GZ_OS_HPFS = 6;
+unsigned char const GZ_OS_MAC = 7;
+unsigned char const GZ_OS_ZSYSTEM = 8;
+unsigned char const GZ_OS_CPM = 9;
+unsigned char const GZ_OS_TOPS20 = 10;
+unsigned char const GZ_OS_NTFS = 11;
+unsigned char const GZ_OS_QDOS = 12;
+unsigned char const GZ_OS_RISCOS = 13;
+unsigned char const GZ_OS_UNKNOWN = 255;
+
+size_t const GZ_TRAILER_SIZE = 8;
+
+// Gzip trailer fields
+size_t const GZ_TRAILER_CRC32 = 0;
+size_t const GZ_TRAILER_ISIZE = 4;
+
+}
-- 
cgit v1.2.3