diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/istream/buffer.cc | 375 | ||||
| -rw-r--r-- | src/istream/buffer.h | 69 | ||||
| -rw-r--r-- | src/istream/stream.cc | 15 | ||||
| -rw-r--r-- | src/istream/stream.h | 25 | ||||
| -rw-r--r-- | src/util/gzip.h | 59 | 
5 files changed, 543 insertions, 0 deletions
diff --git a/src/istream/buffer.cc b/src/istream/buffer.cc
new file mode 100644
index 0000000..aaa93ff
--- /dev/null
+++ b/src/istream/buffer.cc
@@ -0,0 +1,375 @@
+#include <algorithm>
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+#include <zlib.h>
+
+#include "buffer.h"
+#include "util/gzip.h"
+
+namespace dictzip {
+
+namespace {
+
+// Reads an unsigned 16-bit little-endian integer from file. The two bytes
+// are read in separate statements because the evaluation order of the
+// operands of '+' is unspecified, so reading both in one expression could
+// swap them. Throws std::runtime_error when the file ends early.
+unsigned readUint16(FILE* file) {
+	const int low  = std::fgetc(file);
+	const int high = std::fgetc(file);
+
+	if ( low == EOF || high == EOF ) {
+		throw std::runtime_error("Unexpected end of dictzip header.");
+	}
+
+	return static_cast<unsigned>(low) + ( static_cast<unsigned>(high) * 256 );
+}
+
+// Skips a zero-terminated string (gzip file name or comment field). Throws
+// std::runtime_error instead of looping forever on a truncated file.
+void skipZeroTerminated(FILE* file) {
+	int c;
+
+	do {
+		c = std::fgetc(file);
+
+		if ( c == EOF ) {
+			throw std::runtime_error("Unexpected end of dictzip header.");
+		}
+	} while ( c != 0 );
+}
+
+// Builds the exception for a failed zlib call. zlib may leave msg null, and
+// constructing std::runtime_error from a null pointer is undefined behaviour.
+std::runtime_error zlibError(const z_stream& stream) {
+	return std::runtime_error(
+		stream.msg != nullptr ? stream.msg : "Could not inflate dictzip chunk."
+	);
+}
+
+// Calls inflateEnd() on scope exit so that zlib's state is released on every
+// path, including the ones that throw.
+struct InflateGuard {
+	explicit InflateGuard(z_stream& stream): stream_(stream) {}
+	~InflateGuard() { inflateEnd(&stream_); }
+
+	InflateGuard(const InflateGuard&) = delete;
+	InflateGuard& operator=(const InflateGuard&) = delete;
+
+	z_stream& stream_;
+};
+
+}
+
+// Decompresses chunk n into buffer_ and makes it the current get area, with
+// the read position at the start of the chunk. Does nothing when chunk n is
+// already loaded. Throws std::runtime_error when n is out of range or the
+// chunk cannot be read or inflated.
+void IstreamBuf::readChunk(long n) {
+	if ( n == curr_chunk_ ) {
+		return;
+	}
+
+	if ( n < 0 || static_cast<size_t>(n) >= chunks_.size() ) {
+		throw std::runtime_error("Dictzip chunk index out of range.");
+	}
+
+	FILE* const file = dictzip_file_.get();
+	const Chunk& chunk = chunks_[n];
+	const long chunkStart = data_offset_ + static_cast<long>(chunk.offset);
+
+	// A vector owns the compressed bytes, so nothing leaks when we throw.
+	std::vector<unsigned char> compressed(chunk.size);
+
+	if ( std::fseek(file, chunkStart, SEEK_SET) != 0 ||
+	     std::fread(compressed.data(), 1, chunk.size, file) != chunk.size ) {
+		throw std::runtime_error("Could not read dictzip chunk.");
+	}
+
+	// Value-initialise so that every field zlib inspects (opaque included)
+	// has a defined value.
+	z_stream zStream = z_stream();
+	zStream.next_in   = compressed.data();
+	zStream.avail_in  = static_cast<uInt>(chunk.size);
+	zStream.next_out  = buffer_.data();
+	zStream.avail_out = static_cast<uInt>(buffer_.size());
+	zStream.zalloc    = Z_NULL;
+	zStream.zfree     = Z_NULL;
+	zStream.opaque    = Z_NULL;
+
+	// Negative window bits select raw deflate data without a zlib header.
+	if ( inflateInit2(&zStream, -15) != Z_OK ) {
+		throw zlibError(zStream);
+	}
+
+	const InflateGuard guard(zStream);
+	const int r = inflate(&zStream, Z_SYNC_FLUSH);
+
+	if ( r != Z_OK && r != Z_STREAM_END ) {
+		throw zlibError(zStream);
+	}
+
+	char* const begin = reinterpret_cast<char*>(buffer_.data());
+
+	this->setg(begin, begin, begin + zStream.total_out);
+
+	curr_chunk_ = n;
+}
+
+// Parses the gzip extra field: a two-byte length followed by a sequence of
+// subfields. The 'RA' subfield carries the dictzip chunk table (format
+// version, uncompressed chunk length, chunk count and the compressed size of
+// every chunk); all other subfields are skipped. Throws std::runtime_error
+// when the field is truncated or malformed, or holds no chunk table.
+void IstreamBuf::readExtra() {
+	FILE* const file = dictzip_file_.get();
+
+	const long extraLen = readUint16(file);
+	const long extraEnd = std::ftell(file) + extraLen;
+	bool haveChunkTable = false;
+
+	while ( std::ftell(file) < extraEnd ) {
+		// Read extra field 'header'
+		char si[2];
+		if ( std::fread(si, 1, sizeof(si), file) != sizeof(si) ) {
+			throw std::runtime_error("Could not read extra dictzip field header.");
+		}
+
+		const long len      = readUint16(file);
+		const long fieldEnd = std::ftell(file) + len;
+
+		if ( fieldEnd > extraEnd ) {
+			throw std::runtime_error("Malformed dictzip extra field.");
+		}
+
+		// Check for chunk information
+		if ( si[0] == 'R' && si[1] == 'A' ) {
+			if ( readUint16(file) != 1 ) {
+				throw std::runtime_error("Unknown dictzip version.");
+			}
+
+			chunk_length_ = readUint16(file);
+			const size_t chunkCount = readUint16(file);
+
+			// Version, chunk length and chunk count take six bytes, then
+			// two bytes per chunk. A zero chunk length would later divide
+			// by zero when seeking.
+			if ( chunk_length_ == 0 ||
+			     static_cast<size_t>(len) < 6 + ( 2 * chunkCount ) ) {
+				throw std::runtime_error("Malformed dictzip chunk table.");
+			}
+
+			buffer_.resize(chunk_length_);
+			char* const begin = reinterpret_cast<char*>(buffer_.data());
+
+			// No chunk is loaded yet: present an exhausted get area so that
+			// the first read calls underflow(), and so that chunk -1 plus
+			// a full chunk of offset yields stream position 0 in seekoff().
+			this->setg(begin, begin + chunk_length_, begin + chunk_length_);
+
+			chunks_.clear();
+			chunks_.reserve(chunkCount);
+			size_t chunkPos = 0;
+
+			for ( size_t i = 0; i < chunkCount; ++i ) {
+				const size_t chunkLen = readUint16(file);
+				chunks_.emplace_back(chunkPos, chunkLen);
+				chunkPos += chunkLen;
+			}
+
+			haveChunkTable = true;
+		}
+
+		// Continue at the next subfield, whatever was consumed above.
+		if ( std::fseek(file, fieldEnd, SEEK_SET) != 0 ) {
+			throw std::runtime_error("Malformed dictzip extra field.");
+		}
+	}
+
+	if ( !haveChunkTable ) {
+		throw std::runtime_error("No chunk table, given file cannot be a dictzip file.");
+	}
+}
+
+// Reads the fixed-size gzip header and checks that it can belong to a dictzip
+// file: gzip magic, deflate compression and the extra-field flag. Throws
+// std::runtime_error otherwise.
+void IstreamBuf::readHeader() {
+	header_.resize(GZ_HEADER_SIZE);
+
+	if ( std::fread(header_.data(), 1, GZ_HEADER_SIZE, dictzip_file_.get())
+	     != GZ_HEADER_SIZE ) {
+		throw std::runtime_error("Could not read dictzip header.");
+	}
+
+	if ( header_[GZ_HEADER_ID1] != gzipId1 ||
+	     header_[GZ_HEADER_ID2] != gzipId2 ) {
+		throw std::runtime_error("Given dictzip file is not a gzip file.");
+	}
+
+	if ( header_[GZ_HEADER_CM] != GZ_CM_DEFLATE ) {
+		throw std::runtime_error("Unknown compression method detected.");
+	}
+
+	if ( !(header_[GZ_HEADER_FLG] & GZ_FLG_EXTRA) ) {
+		throw std::runtime_error("No extra fields, given file cannot be a dictzip file.");
+	}
+}
+
+// Skips the optional gzip header parts that follow the extra field: file
+// name, comment and header CRC, each present only when its flag is set.
+void IstreamBuf::skipOptional() {
+	FILE* const file = dictzip_file_.get();
+	const unsigned char flags = header_[GZ_HEADER_FLG];
+
+	if ( flags & GZ_FLG_NAME ) {
+		skipZeroTerminated(file);
+	}
+
+	if ( flags & GZ_FLG_COMMENT ) {
+		skipZeroTerminated(file);
+	}
+
+	// The header CRC is two bytes long.
+	if ( ( flags & GZ_FLG_HCRC ) && std::fseek(file, 2, SEEK_CUR) != 0 ) {
+		throw std::runtime_error("Could not read dictzip header.");
+	}
+}
+
+// Returns the next character without consuming it, loading the following
+// chunks as needed. The character is converted with to_int_type() so that
+// bytes above 0x7f are not sign-extended and mistaken for end-of-file.
+IstreamBuf::int_type IstreamBuf::underflow() {
+	while ( this->gptr() == this->egptr() ) {
+		if ( curr_chunk_ + 1 >= static_cast<long>(chunks_.size()) ) {
+			return traits_type::eof();
+		}
+
+		// A chunk may inflate to nothing, so the get area is checked again.
+		this->readChunk(curr_chunk_ + 1);
+	}
+
+	return traits_type::to_int_type(*this->gptr());
+}
+
+// Copies up to n characters into dest and returns the number copied, which is
+// smaller than n only when the end of the data is reached.
+// Based on C++ annotations 8.1.0~pre1, chapter 23.
+std::streamsize IstreamBuf::xsgetn(char *dest, std::streamsize n) {
+	std::streamsize nread = 0;
+
+	while ( nread < n ) {
+		if ( this->gptr() == this->egptr() &&
+		     this->underflow() == traits_type::eof() ) {
+			break;
+		}
+
+		const std::streamsize avail = std::min<std::streamsize>(
+			this->egptr() - this->gptr(), n - nread
+		);
+
+		std::memcpy(dest + nread, this->gptr(), static_cast<size_t>(avail));
+		// avail never exceeds the chunk length, which fits in 16 bits.
+		this->gbump(static_cast<int>(avail));
+
+		nread += avail;
+	}
+
+	return nread;
+}
+
+// Opens filename in binary mode and parses the gzip/dictzip headers; no chunk
+// is inflated until the first read or seek. Throws std::runtime_error when
+// the file cannot be opened or is not a dictzip file. dictzip_file_ owns the
+// handle, so the file is closed again when construction fails.
+IstreamBuf::IstreamBuf(char const *filename):
+	dictzip_file_{ std::fopen(filename, "rb") },
+	chunk_length_{ 0 },
+	data_offset_{ 0 },
+	curr_chunk_{ -1 } {
+	if ( !dictzip_file_ ) {
+		throw std::runtime_error("Could not open input dictzip stream.");
+	}
+
+	readHeader();
+	readExtra();
+	skipOptional();
+	data_offset_ = std::ftell(dictzip_file_.get());
+}
+
+// Moves the read position. Seeking relative to the beginning or the current
+// position is supported; seeking relative to the end is not. Returns the new
+// absolute position, or pos_type(off_type(-1)) when the target is negative,
+// lies beyond the data or uses an unsupported direction.
+IstreamBuf::pos_type IstreamBuf::seekoff(off_type off, seekdir dir, openmode) {
+	const pos_type failure = pos_type(off_type(-1));
+
+	// Signed arithmetic: curr_chunk_ is -1 before the first chunk is loaded.
+	const off_type chunkLength = static_cast<off_type>(chunk_length_);
+	const off_type curPos = ( static_cast<off_type>(curr_chunk_) * chunkLength )
+	                      + ( this->gptr() - this->eback() );
+	off_type targetPos;
+
+	switch ( dir ) {
+		case std::ios::beg: {
+			targetPos = off;
+			break;
+		}
+		case std::ios::cur: {
+			if ( off == 0 ) {
+				// Position query (tellg): nothing has to be loaded.
+				return pos_type(curPos);
+			}
+
+			targetPos = curPos + off;
+			break;
+		}
+		default: {
+			// XXX - We can only determine the uncompressed file length by
+			// decompressing the last chunk. Quite inefficient, haven't made
+			// my mind up whether we want to support this.
+			return failure;
+		}
+	}
+
+	if ( targetPos < 0 ) {
+		return failure;
+	}
+
+	const off_type chunkCount = static_cast<off_type>(chunks_.size());
+	off_type targetChunk = targetPos / chunkLength;
+	off_type chunkPos    = targetPos % chunkLength;
+
+	// A position right behind a completely filled last chunk is the end of
+	// the data; express it as the end of that chunk.
+	if ( targetChunk == chunkCount && chunkPos == 0 && chunkCount > 0 ) {
+		--targetChunk;
+		chunkPos = chunkLength;
+	}
+
+	if ( targetChunk >= chunkCount ) {
+		return failure;
+	}
+
+	this->readChunk(static_cast<long>(targetChunk));
+
+	// A chunk may hold fewer than chunk_length_ bytes of data.
+	if ( chunkPos > this->egptr() - this->eback() ) {
+		return failure;
+	}
+
+	this->setg(this->eback(), this->eback() + chunkPos, this->egptr());
+
+	return pos_type(targetPos);
+}
+
+// Moves the read position to the absolute position off.
+IstreamBuf::pos_type IstreamBuf::seekpos(pos_type off, openmode mode) {
+	return seekoff(off_type(off), std::ios::beg, mode);
+}
+
+}
diff --git a/src/istream/buffer.h b/src/istream/buffer.h
new file mode 100644
index 0000000..91bb53f
--- /dev/null
+++ b/src/istream/buffer.h
@@ -0,0 +1,69 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdio>
+#include <iostream>
+#include <memory>
+#include <streambuf>
+#include <vector>
+
+namespace dictzip {
+
+// Read-only, seekable stream buffer over a dictzip file. Every chunk listed
+// in the file's chunk table is inflated on its own, so a seek only has to
+// decompress the chunk that holds the target position.
+//
+// Warning: The inherent statefulness of stream buffers interferes with
+// multithreaded access. Users of this class should perform appropriate
+// locking, since they are in a better position to do so.
+class IstreamBuf : public std::streambuf {
+private:
+	struct Chunk {
+		Chunk(size_t offset, size_t size):
+			offset(offset),
+			size(size) {}
+
+		const size_t offset; // Start of the compressed chunk, relative to data_offset_
+		const size_t size;   // Compressed size in bytes
+	};
+
+	struct FileCloser {
+		void operator()(FILE* file) const { std::fclose(file); }
+	};
+
+	// Owning handle: closes the file on destruction and makes the buffer
+	// non-copyable, so the same FILE* is never closed twice.
+	std::unique_ptr<FILE, FileCloser> dictzip_file_;
+
+	std::vector<unsigned char> buffer_; // Inflated data of the current chunk
+	std::vector<unsigned char> header_; // Fixed-size gzip header
+	std::vector<Chunk>  chunks_;
+
+	size_t chunk_length_; // Uncompressed chunk length from the chunk table
+	long data_offset_;    // File offset of the first compressed chunk
+	long curr_chunk_;     // Index of the chunk held in buffer_, -1 when none
+
+	void readChunk(long n);
+	void readHeader();
+	void readExtra();
+	void skipOptional();
+
+protected:
+	int_type underflow() override;
+	std::streamsize xsgetn(char *dest, std::streamsize n) override;
+
+public:
+	using pos_type = std::streambuf::pos_type;
+	using off_type = std::streambuf::off_type;
+
+	using seekdir  = std::ios::seekdir;
+	using openmode = std::ios::openmode;
+
+	explicit IstreamBuf(char const* filename);
+
+	pos_type seekoff(off_type off, seekdir dir, openmode) override;
+	pos_type seekpos(pos_type off, openmode mode) override;
+
+};
+
+}
diff --git a/src/istream/stream.cc b/src/istream/stream.cc
new file mode 100644
index 0000000..aea548f
--- /dev/null
+++ b/src/istream/stream.cc
@@ -0,0 +1,15 @@
+#include <iostream>
+
+#include "stream.h"
+
+namespace dictzip {
+
+// The base class is constructed before buffer_ exists, so it starts without
+// a stream buffer; rdbuf() attaches the buffer once it has been created.
+Istream::Istream(char const* filename):
+	std::istream(nullptr),
+	buffer_{ new IstreamBuf(filename) } {
+	this->rdbuf(buffer_.get());
+}
+
+}
diff --git a/src/istream/stream.h b/src/istream/stream.h
new file mode 100644
index 0000000..0602d06
--- /dev/null
+++ b/src/istream/stream.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <memory>
+
+#include "buffer.h"
+
+namespace dictzip {
+
+// Input stream over the uncompressed contents of a dictzip file. The
+// constructor throws std::runtime_error when the file cannot be opened or
+// is not a dictzip file.
+class Istream : public std::istream {
+private:
+	// Sole owner of the stream buffer that is installed via rdbuf().
+	std::unique_ptr<IstreamBuf> buffer_;
+
+public:
+	explicit Istream(char const* filename);
+	~Istream() override = default;
+
+};
+
+}
diff --git a/src/util/gzip.h b/src/util/gzip.h
new file mode 100644
index 0000000..53e59f6
--- /dev/null
+++ b/src/util/gzip.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include <cstddef>
+
+// Constants describing the gzip file format. Namespace-scope constexpr
+// variables have internal linkage, so this header can be included from
+// several translation units without wrapping them in an unnamed namespace.
+
+constexpr std::size_t GZ_HEADER_SIZE = 10;
+
+// Gzip header fields
+constexpr std::size_t GZ_HEADER_ID1 = 0;
+constexpr std::size_t GZ_HEADER_ID2 = 1;
+constexpr std::size_t GZ_HEADER_CM = 2;
+constexpr std::size_t GZ_HEADER_FLG = 3;
+constexpr std::size_t GZ_HEADER_MTIME = 4;
+constexpr std::size_t GZ_HEADER_XFL = 8;
+constexpr std::size_t GZ_HEADER_OS = 9;
+
+// Gzip file magic
+constexpr unsigned char gzipId1 = 0x1f;
+constexpr unsigned char gzipId2 = 0x8b;
+
+// Gzip compression method(s)
+constexpr unsigned char GZ_CM_DEFLATE = 8;
+
+// Flags in GZ_HEADER_FLG
+constexpr unsigned char GZ_FLG_TEXT = 1;
+constexpr unsigned char GZ_FLG_HCRC = 1 << 1;
+constexpr unsigned char GZ_FLG_EXTRA = 1 << 2;
+constexpr unsigned char GZ_FLG_NAME = 1 << 3;
+constexpr unsigned char GZ_FLG_COMMENT = 1 << 4;
+
+// GZ_HEADER_XFL values for deflate
+constexpr unsigned char GZ_XFL_MAX = 2;
+constexpr unsigned char GZ_XFL_FAST = 4;
+
+// GZ_HEADER_OS values
+constexpr unsigned char GZ_OS_FAT = 0;
+constexpr unsigned char GZ_OS_AMIGA = 1;
+constexpr unsigned char GZ_OS_VMS = 2;
+constexpr unsigned char GZ_OS_UNIX = 3;
+constexpr unsigned char GZ_OS_VM_CMS = 4;
+constexpr unsigned char GZ_OS_TOS = 5;
+constexpr unsigned char GZ_OS_HPFS = 6;
+constexpr unsigned char GZ_OS_MAC = 7;
+constexpr unsigned char GZ_OS_ZSYSTEM = 8;
+constexpr unsigned char GZ_OS_CPM = 9;
+constexpr unsigned char GZ_OS_TOPS20 = 10;
+constexpr unsigned char GZ_OS_NTFS = 11;
+constexpr unsigned char GZ_OS_QDOS = 12;
+constexpr unsigned char GZ_OS_RISCOS = 13;
+constexpr unsigned char GZ_OS_UNKNOWN = 255;
+
+constexpr std::size_t GZ_TRAILER_SIZE = 8;
+
+// Gzip trailer fields
+constexpr std::size_t GZ_TRAILER_CRC32 = 0;
+constexpr std::size_t GZ_TRAILER_ISIZE = 4;
