aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Adrian Kummerländer, 2014-02-15 12:48:35 +0100
committer: Adrian Kummerländer, 2014-02-15 12:48:35 +0100
commit: 79a65ce58ad8f3b2b1c9eeaba4b0b4710dc09e2c (patch)
tree: 09a09025f16b79d5d46201c77bb5cab2d2bdd4f6
parent: 609be30bf9562a86182ed0958a238b6ba9392ebf (diff)
downloadCodepointIterator-79a65ce58ad8f3b2b1c9eeaba4b0b4710dc09e2c.tar
CodepointIterator-79a65ce58ad8f3b2b1c9eeaba4b0b4710dc09e2c.tar.gz
CodepointIterator-79a65ce58ad8f3b2b1c9eeaba4b0b4710dc09e2c.tar.bz2
CodepointIterator-79a65ce58ad8f3b2b1c9eeaba4b0b4710dc09e2c.tar.lz
CodepointIterator-79a65ce58ad8f3b2b1c9eeaba4b0b4710dc09e2c.tar.xz
CodepointIterator-79a65ce58ad8f3b2b1c9eeaba4b0b4710dc09e2c.tar.zst
CodepointIterator-79a65ce58ad8f3b2b1c9eeaba4b0b4710dc09e2c.zip
Extracted helper functions and bitmasks into separate compilation unit
* utility.h and utility.cc now contain the UTF-8 code point and code unit bitmasks and the read / write functions
* Modified users of these functions and enums accordingly
* Added the new compilation unit to the Makefile
* Changed bitmask specification from plain integer literals to shift expressions for better readability
-rw-r--r-- Makefile 3
-rw-r--r-- src/codepoint_iterator.cc 135
-rw-r--r-- src/utility.cc 18
-rw-r--r-- src/utility.h 29
4 files changed, 102 insertions, 83 deletions
diff --git a/Makefile b/Makefile
index 1c9b781..6d82904 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,8 @@
CXX = g++
CXXFLAGS = -std=c++11 -W -Wall -Wextra -pedantic
-SRC = src/codepoint_iterator.cc \
+SRC = src/utility.cc \
+ src/codepoint_iterator.cc \
test.cc
OBJ = $(subst .cc,.o,$(SRC))
diff --git a/src/codepoint_iterator.cc b/src/codepoint_iterator.cc
index 21a8c36..fb638d1 100644
--- a/src/codepoint_iterator.cc
+++ b/src/codepoint_iterator.cc
@@ -1,35 +1,6 @@
#include "codepoint_iterator.h"
-#include <cstdint>
-
-namespace {
-
-enum class CodeUnitType : uint8_t {
- CONTINUATION = 128, // 10000000
- LEADING = 64, // 01000000
- THREE = 32, // 00100000
- FOUR = 16, // 00010000
-};
-
-enum class CodePoint : uint8_t {
- CONTINUATION = 63, // 00111111
- TWO = 31, // 00011111
- THREE = 15, // 00001111
- FOUR = 7, // 00000111
-};
-
-inline bool match(const uint8_t& codeUnit, CodeUnitType&& type) {
- return codeUnit & static_cast<uint8_t>(type);
-}
-
-inline void write(char32_t& codePoint,
- const uint8_t& codeUnit,
- CodePoint&& mask,
- const uint8_t& offset) {
- codePoint += (codeUnit & static_cast<uint8_t>(mask)) << offset;
-}
-
-}
+#include "utility.h"
namespace UTF8 {
@@ -75,48 +46,48 @@ char32_t CodepointIterator::operator*() {
this->dereferenced_ = true;
this->codepoint_ = 0;
- if ( match(currByte, CodeUnitType::CONTINUATION) ) {
- if ( match(currByte, CodeUnitType::THREE) ) {
- if ( match(currByte, CodeUnitType::FOUR) ) {
- write(this->codepoint_,
- currByte,
- CodePoint::FOUR,
- 18);
- write(this->codepoint_,
- *(this->iterator_ + 1),
- CodePoint::CONTINUATION,
- 12);
- write(this->codepoint_,
- *(this->iterator_ + 2),
- CodePoint::CONTINUATION,
- 6);
- write(this->codepoint_,
- *(this->iterator_ + 3),
- CodePoint::CONTINUATION,
- 0);
+ if ( match(currByte, dtl::CodeUnitType::CONTINUATION) ) {
+ if ( match(currByte, dtl::CodeUnitType::THREE) ) {
+ if ( match(currByte, dtl::CodeUnitType::FOUR) ) {
+ dtl::write(this->codepoint_,
+ currByte,
+ dtl::CodePoint::FOUR,
+ 18);
+ dtl::write(this->codepoint_,
+ *(this->iterator_ + 1),
+ dtl::CodePoint::CONTINUATION,
+ 12);
+ dtl::write(this->codepoint_,
+ *(this->iterator_ + 2),
+ dtl::CodePoint::CONTINUATION,
+ 6);
+ dtl::write(this->codepoint_,
+ *(this->iterator_ + 3),
+ dtl::CodePoint::CONTINUATION,
+ 0);
} else {
- write(this->codepoint_,
- currByte,
- CodePoint::THREE,
- 12);
- write(this->codepoint_,
- *(this->iterator_ + 1),
- CodePoint::CONTINUATION,
- 6);
- write(this->codepoint_,
- *(this->iterator_ + 2),
- CodePoint::CONTINUATION,
- 0);
+ dtl::write(this->codepoint_,
+ currByte,
+ dtl::CodePoint::THREE,
+ 12);
+ dtl::write(this->codepoint_,
+ *(this->iterator_ + 1),
+ dtl::CodePoint::CONTINUATION,
+ 6);
+ dtl::write(this->codepoint_,
+ *(this->iterator_ + 2),
+ dtl::CodePoint::CONTINUATION,
+ 0);
}
} else {
- write(this->codepoint_,
- currByte,
- CodePoint::TWO,
- 6);
- write(this->codepoint_,
- *(this->iterator_ + 1),
- CodePoint::CONTINUATION,
- 0);
+ dtl::write(this->codepoint_,
+ currByte,
+ dtl::CodePoint::TWO,
+ 6);
+ dtl::write(this->codepoint_,
+ *(this->iterator_ + 1),
+ dtl::CodePoint::CONTINUATION,
+ 0);
}
} else {
this->codepoint_ = currByte;
@@ -131,9 +102,9 @@ CodepointIterator& CodepointIterator::operator++() {
uint8_t currByte = *(this->iterator_);
std::string::difference_type offset = 1;
- if ( match(currByte, CodeUnitType::CONTINUATION) ) {
- if ( match(currByte, CodeUnitType::THREE) ) {
- if ( match(currByte, CodeUnitType::FOUR) ) {
+ if ( match(currByte, dtl::CodeUnitType::CONTINUATION) ) {
+ if ( match(currByte, dtl::CodeUnitType::THREE) ) {
+ if ( match(currByte, dtl::CodeUnitType::FOUR) ) {
offset = 4;
} else {
offset = 3;
@@ -150,18 +121,18 @@ CodepointIterator& CodepointIterator::operator++() {
CodepointIterator& CodepointIterator::operator--() {
this->dereferenced_ = false;
- --this->iterator_;
+ this->iterator_.operator--();
- if ( match(*(this->iterator_), CodeUnitType::CONTINUATION) ) {
- --this->iterator_;
+ if ( match(*(this->iterator_), dtl::CodeUnitType::CONTINUATION) ) {
+ this->iterator_.operator--();
- if ( !match(*(this->iterator_), CodeUnitType::LEADING) ) {
- --this->iterator_;
+ if ( !match(*(this->iterator_), dtl::CodeUnitType::LEADING) ) {
+ this->iterator_.operator--();
- if ( !match(*(this->iterator_), CodeUnitType::LEADING) ) {
- --this->iterator_;
+ if ( !match(*(this->iterator_), dtl::CodeUnitType::LEADING) ) {
+ this->iterator_.operator--();
- if ( !match(*(this->iterator_), CodeUnitType::LEADING) ) {
+ if ( !match(*(this->iterator_), dtl::CodeUnitType::LEADING) ) {
throw codepoint_invalid();
}
}
@@ -174,7 +145,7 @@ CodepointIterator& CodepointIterator::operator--() {
CodepointIterator CodepointIterator::operator++(int) {
CodepointIterator oldIter(*this);
- ++(*this);
+ this->operator++();
return oldIter;
}
@@ -182,7 +153,7 @@ CodepointIterator CodepointIterator::operator++(int) {
CodepointIterator CodepointIterator::operator--(int) {
CodepointIterator oldIter(*this);
- --(*this);
+ this->operator--();
return oldIter;
}
diff --git a/src/utility.cc b/src/utility.cc
new file mode 100644
index 0000000..92ba7b9
--- /dev/null
+++ b/src/utility.cc
@@ -0,0 +1,18 @@
+#include "utility.h"
+
+namespace UTF8 {
+namespace dtl {
+
+bool match(const uint8_t& codeUnit, CodeUnitType&& type) {
+ return codeUnit & static_cast<uint8_t>(type);
+}
+
+void write(char32_t& codePoint,
+ const uint8_t& codeUnit,
+ CodePoint&& mask,
+ const uint8_t& offset) {
+ codePoint += (codeUnit & static_cast<uint8_t>(mask)) << offset;
+}
+
+}
+}
diff --git a/src/utility.h b/src/utility.h
new file mode 100644
index 0000000..dcdcf75
--- /dev/null
+++ b/src/utility.h
@@ -0,0 +1,29 @@
+#ifndef CODEPOINT_ITERATOR_UTILITY_H_
+#define CODEPOINT_ITERATOR_UTILITY_H_
+
+#include <cstdint>
+
+namespace UTF8 {
+namespace dtl {
+
+enum class CodeUnitType : uint8_t {
+ CONTINUATION = (128 >> 0), // 10000000
+ LEADING = (128 >> 1), // 01000000
+ THREE = (128 >> 2), // 00100000
+ FOUR = (128 >> 3), // 00010000
+};
+
+enum class CodePoint : uint8_t {
+ CONTINUATION = (UINT8_MAX >> 2), // 00111111
+ TWO = (UINT8_MAX >> 3), // 00011111
+ THREE = (UINT8_MAX >> 4), // 00001111
+ FOUR = (UINT8_MAX >> 5), // 00000111
+};
+
+bool match(const uint8_t&, CodeUnitType&&);
+void write(char32_t&, const uint8_t&, CodePoint&&, const uint8_t&);
+
+}
+}
+
+#endif // CODEPOINT_ITERATOR_UTILITY_H_