aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/codepoint_iterator.cc190
-rw-r--r--src/codepoint_iterator.h49
-rw-r--r--test.cc91
3 files changed, 330 insertions, 0 deletions
diff --git a/src/codepoint_iterator.cc b/src/codepoint_iterator.cc
new file mode 100644
index 0000000..21a8c36
--- /dev/null
+++ b/src/codepoint_iterator.cc
@@ -0,0 +1,190 @@
+#include "codepoint_iterator.h"
+
+#include <cstdint>
+
+namespace {
+
+enum class CodeUnitType : uint8_t {  // single-bit probes used to classify a code unit
+    CONTINUATION = 128,  // 10000000: clear only on single-byte code units
+    LEADING      = 64,   // 01000000: marks the first byte of a multi-byte sequence
+    THREE        = 32,   // 00100000: lead byte of a sequence of at least three bytes
+    FOUR         = 16,   // 00010000: lead byte of a four-byte sequence
+};
+
+enum class CodePoint : uint8_t {  // masks selecting the payload bits of a code unit
+    CONTINUATION = 63,  // 00111111: continuation byte
+    TWO          = 31,  // 00011111: lead byte of a two-byte sequence
+    THREE        = 15,  // 00001111: lead byte of a three-byte sequence
+    FOUR         = 7,   // 00000111: lead byte of a four-byte sequence
+};
+
+// Reports whether the probe bit `type` is set in `codeUnit`.
+inline bool match(uint8_t codeUnit, CodeUnitType type) {
+    return (codeUnit & static_cast<uint8_t>(type)) != 0;
+}
+
+// Adds the bits of `codeUnit` selected by `mask`, shifted left by `offset`, to
+// `codePoint`.  The value is widened first so an 18-bit shift cannot overflow.
+inline void write(char32_t& codePoint, uint8_t codeUnit, CodePoint mask, uint8_t offset) {
+    codePoint += static_cast<char32_t>(codeUnit & static_cast<uint8_t>(mask)) << offset;
+}
+
+}
+
+namespace UTF8 {
+
+// Starts iteration at `iter`; decoding is deferred until the first operator*().
+CodepointIterator::CodepointIterator(std::string::const_iterator iter)
+    : iterator_(iter), dereferenced_(false), codepoint_(0) {
+}
+
+// Copies the wrapped position together with the cached decode state
+// (dereferenced_ / codepoint_), i.e. a plain member-wise copy.  Defaulted here
+// rather than in the class because the header only declares the constructor.
+CodepointIterator::CodepointIterator(const CodepointIterator& src) = default;
+
+// Copy assignment: takes over the position and the cached decode state of
+// `src` member by member and yields *this, so assignments can be chained.
+// Self-assignment is harmless because each member is a simple value.
+//
+// Defaulted out of line; the compiler-generated body performs exactly these
+// three member assignments.
+CodepointIterator& CodepointIterator::operator=(const CodepointIterator& src) = default;
+
+bool CodepointIterator::operator==(const CodepointIterator& src) const {
+    return src.iterator_ == iterator_;  // position only; the cached code point is ignored
+}
+
+bool CodepointIterator::operator!=(const CodepointIterator& src) const {
+    return !(iterator_ == src.iterator_);  // true when the byte positions differ
+}
+
+// Compares the wrapped position with a raw string iterator such as text.cend().
+bool CodepointIterator::operator==(const std::string::const_iterator& src) const {
+    return src == iterator_;
+}
+
+// Negated raw-position comparison; lets `iter != text.cend()` bound a loop.
+bool CodepointIterator::operator!=(const std::string::const_iterator& src) const {
+    return !(iterator_ == src);
+}
+
+// Returns the code point that begins at the current position.
+//
+// The value is cached: the first call after construction or after ++/--
+// decodes the bytes and stores the result in codepoint_; further calls
+// return the stored value until the iterator moves again.  The operator
+// writes these cache members and is therefore not const.
+//
+// The sequence length is read off the lead byte alone:
+//   high bit clear          -> one byte, which is the code point itself
+//   high bit set, no THREE  -> two bytes   (5 + 6 payload bits)
+//   THREE set, no FOUR      -> three bytes (4 + 6 + 6 payload bits)
+//   THREE and FOUR set      -> four bytes  (3 + 6 + 6 + 6 payload bits)
+//
+// Each payload is placed with write(): the lead byte's bits go highest and
+// every following byte contributes six bits below them.
+//
+// Nothing is validated and the trailing bytes are read without a bounds
+// check, so the position must be the start of a complete sequence.
+char32_t CodepointIterator::operator*() {
+    if ( this->dereferenced_ ) {
+        return this->codepoint_;
+    }
+
+    // Not decoded yet: start from the lead byte at the current position.
+    const uint8_t lead = *(this->iterator_);
+    this->dereferenced_ = true;
+    this->codepoint_    = 0;  // write() accumulates, so begin from zero
+
+    if ( !match(lead, CodeUnitType::CONTINUATION) ) {
+        // One byte.
+        this->codepoint_ = lead;
+    } else if ( !match(lead, CodeUnitType::THREE) ) {
+        // Two bytes.
+        write(this->codepoint_, lead, CodePoint::TWO, 6);
+        write(this->codepoint_, *(this->iterator_ + 1),
+              CodePoint::CONTINUATION, 0);
+    } else if ( !match(lead, CodeUnitType::FOUR) ) {
+        // Three bytes.
+        write(this->codepoint_, lead, CodePoint::THREE, 12);
+        write(this->codepoint_, *(this->iterator_ + 1),
+              CodePoint::CONTINUATION, 6);
+        write(this->codepoint_, *(this->iterator_ + 2),
+              CodePoint::CONTINUATION, 0);
+    } else {
+        // Four bytes.
+        write(this->codepoint_, lead, CodePoint::FOUR, 18);
+        write(this->codepoint_, *(this->iterator_ + 1),
+              CodePoint::CONTINUATION, 12);
+        write(this->codepoint_, *(this->iterator_ + 2),
+              CodePoint::CONTINUATION, 6);
+        write(this->codepoint_, *(this->iterator_ + 3),
+              CodePoint::CONTINUATION, 0);
+    }
+
+    return this->codepoint_;
+}
+
+// Pre-increment: moves to the first byte of the next code point.
+// The width of the current sequence (1-4 bytes) is derived from its lead
+// byte with the same bit probes as operator*(); the end of the string is not
+// checked.  The cached code point is dropped so the next operator*() decodes.
+CodepointIterator& CodepointIterator::operator++() {
+    const uint8_t lead = *(this->iterator_);
+    std::string::difference_type width = 1;
+
+    if ( match(lead, CodeUnitType::CONTINUATION) ) {
+        if ( !match(lead, CodeUnitType::THREE) ) {
+            width = 2;
+        } else {
+            width = match(lead, CodeUnitType::FOUR) ? 4 : 3;
+        }
+    }
+
+    this->iterator_    += width;
+    this->dereferenced_ = false;
+
+    return *this;
+}
+
+// Pre-decrement: moves back to the lead byte of the preceding code point.
+// If the byte just before the current position has its high bit set, up to
+// three further bytes are examined, stopping at the first whose LEADING bit
+// is set; if none qualifies codepoint_invalid is thrown.  The start of the
+// string is not checked.
+CodepointIterator& CodepointIterator::operator--() {
+    this->dereferenced_ = false;
+    --this->iterator_;
+
+    if ( !match(*(this->iterator_), CodeUnitType::CONTINUATION) ) {
+        return *this;  // high bit clear: a single-byte code point
+    }
+
+    for ( int remaining = 3; remaining > 0; --remaining ) {
+        --this->iterator_;
+        if ( match(*(this->iterator_), CodeUnitType::LEADING) ) {
+            return *this;
+        }
+    }
+
+    throw codepoint_invalid();
+}
+
+// Post-increment: advances via the prefix form and returns a copy of the
+// iterator as it was before the move (including its cached decode state).
+CodepointIterator CodepointIterator::operator++(int) {
+    CodepointIterator previous = *this;
+    this->operator++();
+    return previous;
+}
+
+// Post-decrement: steps back via the prefix form and returns a copy of the
+// iterator as it was before the move (including its cached decode state).
+CodepointIterator CodepointIterator::operator--(int) {
+    CodepointIterator previous = *this;
+    this->operator--();
+    return previous;
+}
+
+}
diff --git a/src/codepoint_iterator.h b/src/codepoint_iterator.h
new file mode 100644
index 0000000..938f53d
--- /dev/null
+++ b/src/codepoint_iterator.h
@@ -0,0 +1,49 @@
+#ifndef CODEPOINT_ITERATOR_H_
+#define CODEPOINT_ITERATOR_H_
+
+#include <iterator>
+#include <string>
+#include <exception>
+
+namespace UTF8 {
+
+// Bidirectional iterator over the code points of a UTF-8 encoded std::string.
+// std::iterator is deprecated since C++17, so the traits are member typedefs.
+class CodepointIterator {
+ public:
+    typedef std::bidirectional_iterator_tag iterator_category;
+    typedef char32_t                        value_type;
+    typedef std::string::difference_type    difference_type;
+    typedef const char32_t*                 pointer;
+    typedef const char32_t&                 reference;
+
+    CodepointIterator(std::string::const_iterator);  // start at a byte position
+    CodepointIterator(const CodepointIterator&);
+    CodepointIterator& operator=(const CodepointIterator&);
+
+    bool operator==(const CodepointIterator&) const;  // positions only
+    bool operator==(const std::string::const_iterator&) const;
+    bool operator!=(const CodepointIterator&) const;
+    bool operator!=(const std::string::const_iterator&) const;
+
+    char32_t operator*();  // decodes on first use, then serves the cached value
+    CodepointIterator& operator++();
+    CodepointIterator& operator--();  // throws codepoint_invalid if no lead byte is found
+    CodepointIterator operator++(int);
+    CodepointIterator operator--(int);
+
+ private:
+    std::string::const_iterator iterator_;  // current byte position
+    bool dereferenced_;                     // true while codepoint_ holds the decoded value
+    char32_t codepoint_;                    // cache written by operator*()
+};
+
+// Thrown by CodepointIterator::operator--() when no lead byte is found.
+class codepoint_invalid: public std::exception {
+ public:
+    const char* what() const noexcept override { return "codepoint_invalid"; }
+};
+
+}
+
+#endif // CODEPOINT_ITERATOR_H_
diff --git a/test.cc b/test.cc
new file mode 100644
index 0000000..0693970
--- /dev/null
+++ b/test.cc
@@ -0,0 +1,91 @@
+#include "gtest/gtest.h"
+#include "src/codepoint_iterator.h"
+
+#include <string>
+
+struct SampleText {  // one fixture entry: encoded text plus its expected decoding
+    std::string text;                  // UTF-8 encoded sample
+    size_t length = 0;                 // never filled in; zeroed so copies read a defined value
+    std::vector<char32_t> codepoints;  // code points `text` should decode to, in order
+};
+
+// Fixture holding UTF-8 sample strings, each paired with the code points
+// it is expected to decode to.
+class CodepointIteratorTest : public ::testing::Test {
+ protected:
+    virtual void SetUp() {
+        // Stores one sample: the encoded text and its expected decoding.
+        auto add = [this](const char* utf8,
+                          const std::vector<char32_t>& expected) {
+            SampleText entry;
+            entry.text       = utf8;
+            entry.codepoints = expected;
+            this->sample_.push_back(entry);
+        };
+
+        // All expected values are below U+10000, so the samples cover one-,
+        // two- and three-byte sequences but no four-byte ones.
+        add(u8"Hellø Uni¢od€!",
+            { 72, 101, 108, 108, 248, 32, 85, 110, 105, 162, 111, 100, 8364, 33 });
+        add(u8"ᛖᚴ ᚷᛖᛏ ᛖᛏᛁ ᚧ ᚷᛚᛖᚱ ᛘᚾ ᚦᛖᛋᛋ ᚨᚧ ᚡᛖ ᚱᚧᚨ ᛋᚨᚱ",
+            { 5846, 5812, 32, 5815, 5846, 5839, 32, 5846, 5839, 5825, 32, 5799, 32, 5815, 5850, 5846, 5809, 32, 5848, 5822, 32, 5798, 5846, 5835, 5835, 32, 5800, 5799, 32, 5793, 5846, 32, 5809, 5799, 5800, 32, 5835, 5800, 5809 });
+        add(u8"⠊⠀⠉⠁⠝⠀⠑⠁⠞⠀⠛⠇⠁⠎⠎⠀⠁⠝⠙⠀⠊⠞⠀⠙⠕⠑⠎⠝⠞⠀⠓⠥⠗⠞⠀⠍⠑",
+            { 10250, 10240, 10249, 10241, 10269, 10240, 10257, 10241, 10270, 10240, 10267, 10247, 10241, 10254, 10254, 10240, 10241, 10269, 10265, 10240, 10250, 10270, 10240, 10265, 10261, 10257, 10254, 10269, 10270, 10240, 10259, 10277, 10263, 10270, 10240, 10253, 10257 });
+        add(u8"Ég get etið gler án þess að meiða mig",
+            { 201, 103, 32, 103, 101, 116, 32, 101, 116, 105, 240, 32, 103, 108, 101, 114, 32, 225, 110, 32, 254, 101, 115, 115, 32, 97, 240, 32, 109, 101, 105, 240, 97, 32, 109, 105, 103 });
+        add(u8"جام ييه بلورم بڭا ضررى طوقونمز",
+            { 1580, 1575, 1605, 32, 1610, 1610, 1607, 32, 1576, 1604, 1608, 1585, 1605, 32, 1576, 1709, 1575, 32, 1590, 1585, 1585, 1609, 32, 1591, 1608, 1602, 1608, 1606, 1605, 1586 });
+        add(u8"මට වීදුරු කෑමට හැකියි. එයින් මට කිසි හානියක් සිදු නොවේ",
+            { 3512, 3495, 32, 3520, 3539, 3503, 3540, 3515, 3540, 32, 3482, 3537, 3512, 3495, 32, 3524, 3536, 3482, 3538, 3514, 3538, 46, 32, 3473, 3514, 3538, 3505, 3530, 32, 3512, 3495, 32, 3482, 3538, 3523, 3538, 32, 3524, 3535, 3505, 3538, 3514, 3482, 3530, 32, 3523, 3538, 3503, 3540, 32, 3505, 3548, 3520, 3546 });
+    }
+
+    std::vector<SampleText> sample_;  // filled by SetUp()
+};
+
+// Walks every sample from begin to end and checks that the number of steps
+// equals the expected number of code points.
+TEST_F(CodepointIteratorTest, ForwardIteration) {
+    for ( const auto& sample : this->sample_ ) {  // by reference: avoids copying each sample
+        size_t visited = 0;
+        UTF8::CodepointIterator iter(sample.text.cbegin());
+        while ( iter != sample.text.cend() ) {
+            ++iter;
+            ++visited;
+        }
+        EXPECT_EQ(sample.codepoints.size(), visited);
+    }
+}
+
+// Walks every sample from end back to begin and checks that the number of
+// steps equals the expected number of code points.
+TEST_F(CodepointIteratorTest, ReverseIteration) {
+    for ( const auto& sample : this->sample_ ) {  // by reference: avoids copying each sample
+        size_t visited = 0;
+        UTF8::CodepointIterator iter(sample.text.cend());
+        while ( iter != sample.text.cbegin() ) {
+            --iter;
+            ++visited;
+        }
+        EXPECT_EQ(sample.codepoints.size(), visited);
+    }
+}
+
+// Decodes each sample front to back, comparing every code point with the expectation.
+TEST_F(CodepointIteratorTest, Dereferencing) {
+    for ( const auto& sample : this->sample_ ) {  // by reference: avoids copying each sample
+        size_t index = 0;
+        for ( UTF8::CodepointIterator iter(sample.text.cbegin());
+              iter != sample.text.cend();
+              ++iter, ++index ) {
+            ASSERT_LT(index, sample.codepoints.size());  // guards the subscript below
+            EXPECT_EQ(sample.codepoints[index], *iter);
+        }
+        EXPECT_EQ(sample.codepoints.size(), index);  // nothing was left undecoded
+    }
+}
+
+// Hands the command line to gtest, then returns what RUN_ALL_TESTS() reports.
+int main(int argc, char **argv) {
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();  // the result becomes the process exit status
+}