diff options
Extracted helper functions and bitmasks into separate compilation unit
* utility.h and utility.cc now contain the UTF-8 code point and code unit bitmasks and the match / write helper functions
* Modified users of these functions and enums accordingly
* Added the new compilation unit to the Makefile
* Changed bitmask specification from plain integer literals to shift expressions for better readability
-rw-r--r-- | Makefile | 3 | ||||
-rw-r--r-- | src/codepoint_iterator.cc | 135 | ||||
-rw-r--r-- | src/utility.cc | 18 | ||||
-rw-r--r-- | src/utility.h | 29 |
4 files changed, 102 insertions, 83 deletions
@@ -1,7 +1,8 @@ CXX = g++ CXXFLAGS = -std=c++11 -W -Wall -Wextra -pedantic -SRC = src/codepoint_iterator.cc \ +SRC = src/utility.cc \ + src/codepoint_iterator.cc \ test.cc OBJ = $(subst .cc,.o,$(SRC)) diff --git a/src/codepoint_iterator.cc b/src/codepoint_iterator.cc index 21a8c36..fb638d1 100644 --- a/src/codepoint_iterator.cc +++ b/src/codepoint_iterator.cc @@ -1,35 +1,6 @@ #include "codepoint_iterator.h" -#include <cstdint> - -namespace { - -enum class CodeUnitType : uint8_t { - CONTINUATION = 128, // 10000000 - LEADING = 64, // 01000000 - THREE = 32, // 00100000 - FOUR = 16, // 00010000 -}; - -enum class CodePoint : uint8_t { - CONTINUATION = 63, // 00111111 - TWO = 31, // 00011111 - THREE = 15, // 00001111 - FOUR = 7, // 00000111 -}; - -inline bool match(const uint8_t& codeUnit, CodeUnitType&& type) { - return codeUnit & static_cast<uint8_t>(type); -} - -inline void write(char32_t& codePoint, - const uint8_t& codeUnit, - CodePoint&& mask, - const uint8_t& offset) { - codePoint += (codeUnit & static_cast<uint8_t>(mask)) << offset; -} - -} +#include "utility.h" namespace UTF8 { @@ -75,48 +46,48 @@ char32_t CodepointIterator::operator*() { this->dereferenced_ = true; this->codepoint_ = 0; - if ( match(currByte, CodeUnitType::CONTINUATION) ) { - if ( match(currByte, CodeUnitType::THREE) ) { - if ( match(currByte, CodeUnitType::FOUR) ) { - write(this->codepoint_, - currByte, - CodePoint::FOUR, - 18); - write(this->codepoint_, - *(this->iterator_ + 1), - CodePoint::CONTINUATION, - 12); - write(this->codepoint_, - *(this->iterator_ + 2), - CodePoint::CONTINUATION, - 6); - write(this->codepoint_, - *(this->iterator_ + 3), - CodePoint::CONTINUATION, - 0); + if ( match(currByte, dtl::CodeUnitType::CONTINUATION) ) { + if ( match(currByte, dtl::CodeUnitType::THREE) ) { + if ( match(currByte, dtl::CodeUnitType::FOUR) ) { + dtl::write(this->codepoint_, + currByte, + dtl::CodePoint::FOUR, + 18); + dtl::write(this->codepoint_, + *(this->iterator_ + 1), + 
dtl::CodePoint::CONTINUATION, + 12); + dtl::write(this->codepoint_, + *(this->iterator_ + 2), + dtl::CodePoint::CONTINUATION, + 6); + dtl::write(this->codepoint_, + *(this->iterator_ + 3), + dtl::CodePoint::CONTINUATION, + 0); } else { - write(this->codepoint_, - currByte, - CodePoint::THREE, - 12); - write(this->codepoint_, - *(this->iterator_ + 1), - CodePoint::CONTINUATION, - 6); - write(this->codepoint_, - *(this->iterator_ + 2), - CodePoint::CONTINUATION, - 0); + dtl::write(this->codepoint_, + currByte, + dtl::CodePoint::THREE, + 12); + dtl::write(this->codepoint_, + *(this->iterator_ + 1), + dtl::CodePoint::CONTINUATION, + 6); + dtl::write(this->codepoint_, + *(this->iterator_ + 2), + dtl::CodePoint::CONTINUATION, + 0); } } else { - write(this->codepoint_, - currByte, - CodePoint::TWO, - 6); - write(this->codepoint_, - *(this->iterator_ + 1), - CodePoint::CONTINUATION, - 0); + dtl::write(this->codepoint_, + currByte, + dtl::CodePoint::TWO, + 6); + dtl::write(this->codepoint_, + *(this->iterator_ + 1), + dtl::CodePoint::CONTINUATION, + 0); } } else { this->codepoint_ = currByte; @@ -131,9 +102,9 @@ CodepointIterator& CodepointIterator::operator++() { uint8_t currByte = *(this->iterator_); std::string::difference_type offset = 1; - if ( match(currByte, CodeUnitType::CONTINUATION) ) { - if ( match(currByte, CodeUnitType::THREE) ) { - if ( match(currByte, CodeUnitType::FOUR) ) { + if ( match(currByte, dtl::CodeUnitType::CONTINUATION) ) { + if ( match(currByte, dtl::CodeUnitType::THREE) ) { + if ( match(currByte, dtl::CodeUnitType::FOUR) ) { offset = 4; } else { offset = 3; @@ -150,18 +121,18 @@ CodepointIterator& CodepointIterator::operator++() { CodepointIterator& CodepointIterator::operator--() { this->dereferenced_ = false; - --this->iterator_; + this->iterator_.operator--(); - if ( match(*(this->iterator_), CodeUnitType::CONTINUATION) ) { - --this->iterator_; + if ( match(*(this->iterator_), dtl::CodeUnitType::CONTINUATION) ) { + this->iterator_.operator--(); 
- if ( !match(*(this->iterator_), CodeUnitType::LEADING) ) { - --this->iterator_; + if ( !match(*(this->iterator_), dtl::CodeUnitType::LEADING) ) { + this->iterator_.operator--(); - if ( !match(*(this->iterator_), CodeUnitType::LEADING) ) { - --this->iterator_; + if ( !match(*(this->iterator_), dtl::CodeUnitType::LEADING) ) { + this->iterator_.operator--(); - if ( !match(*(this->iterator_), CodeUnitType::LEADING) ) { + if ( !match(*(this->iterator_), dtl::CodeUnitType::LEADING) ) { throw codepoint_invalid(); } } @@ -174,7 +145,7 @@ CodepointIterator& CodepointIterator::operator--() { CodepointIterator CodepointIterator::operator++(int) { CodepointIterator oldIter(*this); - ++(*this); + this->operator++(); return oldIter; } @@ -182,7 +153,7 @@ CodepointIterator CodepointIterator::operator++(int) { CodepointIterator CodepointIterator::operator--(int) { CodepointIterator oldIter(*this); - --(*this); + this->operator--(); return oldIter; } diff --git a/src/utility.cc b/src/utility.cc new file mode 100644 index 0000000..92ba7b9 --- /dev/null +++ b/src/utility.cc @@ -0,0 +1,18 @@ +#include "utility.h" + +namespace UTF8 { +namespace dtl { + +bool match(const uint8_t& codeUnit, CodeUnitType&& type) { + return codeUnit & static_cast<uint8_t>(type); +} + +void write(char32_t& codePoint, + const uint8_t& codeUnit, + CodePoint&& mask, + const uint8_t& offset) { + codePoint += (codeUnit & static_cast<uint8_t>(mask)) << offset; +} + +} +} diff --git a/src/utility.h b/src/utility.h new file mode 100644 index 0000000..dcdcf75 --- /dev/null +++ b/src/utility.h @@ -0,0 +1,29 @@ +#ifndef CODEPOINT_ITERATOR_UTILITY_H_ +#define CODEPOINT_ITERATOR_UTILITY_H_ + +#include <cstdint> + +namespace UTF8 { +namespace dtl { + +enum class CodeUnitType : uint8_t { + CONTINUATION = (128 >> 0), // 10000000 + LEADING = (128 >> 1), // 01000000 + THREE = (128 >> 2), // 00100000 + FOUR = (128 >> 3), // 00010000 +}; + +enum class CodePoint : uint8_t { + CONTINUATION = (UINT8_MAX >> 2), // 00111111 + TWO 
= (UINT8_MAX >> 3), // 00011111 + THREE = (UINT8_MAX >> 4), // 00001111 + FOUR = (UINT8_MAX >> 5), // 00000111 +}; + +bool match(const uint8_t&, CodeUnitType&&); +void write(char32_t&, const uint8_t&, CodePoint&&, const uint8_t&); + +} +} + +#endif // CODEPOINT_ITERATOR_UTILITY_H_ |