From c953c72c86c281d650b2a8ff856e3d614664e11a Mon Sep 17 00:00:00 2001 From: Adrian Kummerlaender Date: Thu, 5 Oct 2017 21:57:08 +0200 Subject: Provide basic read only access to dictzip files `dictzip::Istream` and `dictzip::IstreamBuf` are forked from `alpinocorpus::DzIstream` and `alpinocorpus::DzIstreamBuf` of rug-compling/alpinocorpus. --- src/istream/buffer.cc | 253 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/istream/buffer.h | 58 ++++++++++++ src/istream/stream.cc | 13 +++ src/istream/stream.h | 21 +++++ 4 files changed, 345 insertions(+) create mode 100644 src/istream/buffer.cc create mode 100644 src/istream/buffer.h create mode 100644 src/istream/stream.cc create mode 100644 src/istream/stream.h (limited to 'src/istream') diff --git a/src/istream/buffer.cc b/src/istream/buffer.cc new file mode 100644 index 0000000..aaa93ff --- /dev/null +++ b/src/istream/buffer.cc @@ -0,0 +1,253 @@ +#include +#include +#include +#include + +#include + +#include "buffer.h" +#include "util/gzip.h" + +namespace dictzip { + +void IstreamBuf::readChunk(long n) { + if ( n == curr_chunk_ ) { + return; + } + + const IstreamBuf::Chunk chunkN = chunks_[n]; + unsigned char* zBuf = new unsigned char[chunkN.size]; + + std::fseek(dictzip_file_, data_offset_ + chunkN.offset, SEEK_SET); + + if ( fread(zBuf, 1, chunkN.size, dictzip_file_) != chunkN.size ) { + throw std::runtime_error("Could not read dictzip chunk."); + } + + z_stream zStream; + zStream.next_in = zBuf; + zStream.avail_in = chunkN.size; + zStream.next_out = &buffer_[0]; + zStream.avail_out = chunk_length_; + zStream.zalloc = NULL; + zStream.zfree = NULL; + + if ( inflateInit2(&zStream, -15) != Z_OK ) { + delete[] zBuf; + throw std::runtime_error(zStream.msg); + } + + const int r = inflate(&zStream, Z_PARTIAL_FLUSH); + + if ( r != Z_OK && r != Z_STREAM_END ) { + delete[] zBuf; + throw std::runtime_error(zStream.msg); + } + + delete[] zBuf; + + if ( inflateEnd(&zStream) != Z_OK ) { + throw std::runtime_error(zStream.msg); + } + + unsigned char *buffer = &buffer_[0]; + + this->setg( + reinterpret_cast(buffer), + reinterpret_cast(buffer), + reinterpret_cast(buffer) + zStream.total_out + ); + + curr_chunk_ = n; +} + +void IstreamBuf::readExtra() { + const int extraLen = std::fgetc(dictzip_file_) + + ( std::fgetc(dictzip_file_) * 256 ); + const long extraPos = std::ftell(dictzip_file_); + const long nextField = extraPos + extraLen; + + while ( std::ftell(dictzip_file_) < nextField ) { + // Read extra field 'header' + char si[2]; + if ( std::fread(si, 1, sizeof(si), dictzip_file_) != sizeof(si) ) { + throw std::runtime_error("Could not read extra dictzip field header."); + } + + const int len = std::fgetc(dictzip_file_) + + ( std::fgetc(dictzip_file_) * 256 ); + + // Check for chunk information + if ( si[0] == 'R' && si[1] == 'A' ) { + const int ver = std::fgetc(dictzip_file_) + + ( std::fgetc(dictzip_file_) * 256 ); + if ( ver != 1 ) { + throw std::runtime_error("Unknown dictzip version."); + } + + chunk_length_ = std::fgetc(dictzip_file_) + + ( std::fgetc(dictzip_file_) * 256 ); + const size_t chunkCount = std::fgetc(dictzip_file_) + + ( std::fgetc(dictzip_file_) * 256 ); + + buffer_.resize(chunk_length_); + unsigned char *buffer = &buffer_[0]; + + this->setg( + reinterpret_cast(buffer), + reinterpret_cast(buffer) + chunk_length_, + reinterpret_cast(buffer) + chunk_length_ + ); + + size_t chunkPos = 0; + + for ( size_t i = 0; i < chunkCount; ++i ) { + const size_t chunkLen = std::fgetc(dictzip_file_) + + ( std::fgetc(dictzip_file_) * 256 ); + chunks_.emplace_back(chunkPos, chunkLen); + chunkPos += chunkLen; + } + } else { + std::fseek(dictzip_file_, len, SEEK_CUR); + } + } +} + +void IstreamBuf::readHeader() { + header_.resize(GZ_HEADER_SIZE); + unsigned char* header = &header_[0]; + + if ( std::fread(header, 1, GZ_HEADER_SIZE, dictzip_file_) != GZ_HEADER_SIZE ) { + throw std::runtime_error("Could not read dictzip header."); + } + + if ( header[GZ_HEADER_ID1] != gzipId1 || + header[GZ_HEADER_ID2] != gzipId2 ) { + throw std::runtime_error("Given dictzip file is not a gzip file."); + } + + if ( header[GZ_HEADER_CM] != GZ_CM_DEFLATE ) { + throw std::runtime_error("Unknown compression method detected."); + } + + if ( !(header[GZ_HEADER_FLG] & GZ_FLG_EXTRA) ) { + throw std::runtime_error("No extra fields, given file cannot be a dictzip file."); + } +} + +void IstreamBuf::skipOptional() { + const unsigned char* header = &header_[0]; + + if ( header[GZ_HEADER_FLG] & GZ_FLG_NAME ) { + while ( std::fgetc(dictzip_file_) != 0 ) {} + } + + if ( header[GZ_HEADER_FLG] & GZ_FLG_COMMENT ) { + while ( std::fgetc(dictzip_file_) != 0 ) {} + } + + if ( header[GZ_HEADER_FLG] & GZ_FLG_HCRC ) { + std::fseek(dictzip_file_, 2, SEEK_CUR); + } +} + +int IstreamBuf::underflow() { + if ( this->gptr() < this->egptr() ) { + return *this->gptr(); + } + + if ( curr_chunk_ + 1 >= static_cast(chunks_.size()) ) { + return EOF; + } + + this->readChunk(curr_chunk_ + 1); + + return *gptr(); +} + +// From C++ annotations 8.1.0~pre1, chapter 23. +std::streamsize IstreamBuf::xsgetn(char *dest, std::streamsize n) { + int nread = 0; + + while ( n ) { + if ( !this->in_avail() ) { + if ( this->underflow() == EOF ) { + break; + } + } + + int avail = this->in_avail(); + + if (avail > n) { + avail = n; + } + + std::memcpy(dest + nread, gptr(), avail); + this->gbump(avail); + + nread += avail; + n -= avail; + } + + return nread; +} + +IstreamBuf::IstreamBuf(char const *filename): + dictzip_file_{ fopen(filename, "r") } { + if ( dictzip_file_ ) { + readHeader(); + readExtra(); + skipOptional(); + data_offset_ = std::ftell(dictzip_file_); + curr_chunk_ = -1; + } else { + throw std::runtime_error("Could not open input dictzip stream."); + } +} + +IstreamBuf::~IstreamBuf() { + if ( dictzip_file_ ) { + fclose(dictzip_file_); + } +} + +IstreamBuf::pos_type IstreamBuf::seekoff(off_type off, seekdir dir, openmode) { + pos_type targetPos; + + switch ( dir ) { + case std::ios::beg: { + targetPos = off; + break; + } + case std::ios::cur: { + const pos_type curPos = (curr_chunk_ * chunk_length_) + + (gptr() - eback()); + targetPos = curPos + off; + break; + } + default: { + // XXX - We can only detmine the uncompressed file length by decompressing the + // last chunk. Quite inefficient, haven't made my mind up whether we want to + // support this. + return EOF; + } + } + + if ( targetPos < 0 ) { + return -1; + } else { + const int targetChunk = targetPos / chunk_length_; + const int chunkPos = targetPos % chunk_length_; + + this->readChunk(targetChunk); + this->setg(this->eback(), this->eback() + chunkPos, this->egptr()); + + return targetPos; + } +} + +IstreamBuf::pos_type IstreamBuf::seekpos(pos_type off, openmode mode) { + return seekoff(off, std::ios::beg, mode); +} + +} diff --git a/src/istream/buffer.h b/src/istream/buffer.h new file mode 100644 index 0000000..91bb53f --- /dev/null +++ b/src/istream/buffer.h @@ -0,0 +1,58 @@ +#pragma once + +#include +#include +#include +#include + +namespace dictzip { + +// Warning: The inherent statefulness of stream buffers interferes with +// multithreaded access. Users of this class should perform appropriate +// locking, since they are in a better position to do so. +class IstreamBuf : public std::streambuf { +private: + struct Chunk { + Chunk(size_t offset, size_t size): + offset(offset), + size(size) {}; + + const size_t offset; + const size_t size; + }; + + FILE* dictzip_file_; + + std::vector buffer_; + std::vector header_; + std::vector chunks_; + + size_t chunk_length_; + long data_offset_; + long curr_chunk_; + + void readChunk(long n); + void readHeader(); + void readExtra(); + void skipOptional(); + +protected: + int underflow(); + std::streamsize xsgetn(char *dest, std::streamsize n); + +public: + using pos_type = std::streambuf::pos_type; + using off_type = std::streambuf::off_type; + + typedef std::ios::seekdir seekdir; + typedef std::ios::openmode openmode; + + IstreamBuf(char const* filename); + ~IstreamBuf(); + + pos_type seekoff(off_type off, seekdir dir, openmode); + pos_type seekpos(pos_type off, openmode mode); + +}; + +} diff --git a/src/istream/stream.cc b/src/istream/stream.cc new file mode 100644 index 0000000..aea548f --- /dev/null +++ b/src/istream/stream.cc @@ -0,0 +1,13 @@ +#include + +#include "stream.h" + +namespace dictzip { + +Istream::Istream(char const* filename): + std::istream(0), + buffer_{ new IstreamBuf(filename) } { + this->rdbuf(buffer_.get()); +} + +} diff --git a/src/istream/stream.h b/src/istream/stream.h new file mode 100644 index 0000000..0602d06 --- /dev/null +++ b/src/istream/stream.h @@ -0,0 +1,21 @@ +#pragma once + +#include +#include +#include + +#include "buffer.h" + +namespace dictzip { + +class Istream : public std::istream { +private: + std::shared_ptr buffer_; + +public: + Istream(char const* filename); + virtual ~Istream() {} + +}; + +} -- cgit v1.2.3