From 4d1b9a918f8a189ba1e9887c6a9da04e7392db90 Mon Sep 17 00:00:00 2001
From: Adrian Kummerländer
Date: Sat, 5 Oct 2013 12:41:07 +0200
Subject: Initial commit

* CodepointIterator is a simple C++ iterator class which iterates through unicode codepoints in a UTF8-encoded string
* It is derived from std::iterator and implements the std::bidirectional_iterator_tag
* Dereferencing an instance of the class provides the codepoint as char32_t
* Tests require Google Test and use UTF8-samples from http://www.columbia.edu/~fdc/utf8/
---
Note: this is a revised version of the original patch. The blob ids in the
"index" lines below still name the originally committed file contents.

 src/codepoint_iterator.cc | 189 ++++++++++++++++++++++++++++++++++++++++++++++
 src/codepoint_iterator.h  |  61 +++++++++++++++
 test.cc                   | 118 +++++++++++++++++++++++++++++
 3 files changed, 368 insertions(+)
 create mode 100644 src/codepoint_iterator.cc
 create mode 100644 src/codepoint_iterator.h
 create mode 100644 test.cc

diff --git a/src/codepoint_iterator.cc b/src/codepoint_iterator.cc
new file mode 100644
index 0000000..21a8c36
--- /dev/null
+++ b/src/codepoint_iterator.cc
@@ -0,0 +1,189 @@
+#include "codepoint_iterator.h"
+
+#include <cstdint>
+#include <string>
+
+namespace {
+
+// Single-bit masks used to test the high bits of a UTF-8 code unit.
+enum class CodeUnitType : uint8_t {
+  CONTINUATION = 128, // 10000000
+  LEADING      = 64,  // 01000000
+  THREE        = 32,  // 00100000
+  FOUR         = 16,  // 00010000
+};
+
+// Masks selecting the payload bits of a continuation unit and of the leading
+// unit of a two, three or four unit sequence.
+enum class CodePoint : uint8_t {
+  CONTINUATION = 63,  // 00111111
+  TWO          = 31,  // 00011111
+  THREE        = 15,  // 00001111
+  FOUR         = 7,   // 00000111
+};
+
+// Returns true if the bit selected by `type` is set in `codeUnit`.
+inline bool match(uint8_t codeUnit, CodeUnitType type) {
+  return (codeUnit & static_cast<uint8_t>(type)) != 0;
+}
+
+// Adds the bits of `codeUnit` selected by `mask`, shifted left by `offset`
+// bits, to `codePoint`.
+inline void write(char32_t& codePoint,
+                  uint8_t codeUnit,
+                  CodePoint mask,
+                  unsigned offset) {
+  const uint8_t payload = codeUnit & static_cast<uint8_t>(mask);
+
+  codePoint += static_cast<char32_t>(payload) << offset;
+}
+
+// Returns the number of code units (1 to 4) announced by the high bits of
+// `leadingUnit`. The unit itself is not validated.
+inline std::string::difference_type sequenceLength(uint8_t leadingUnit) {
+  if ( !match(leadingUnit, CodeUnitType::CONTINUATION) ) {
+    return 1; // 0xxxxxxx
+  }
+
+  if ( !match(leadingUnit, CodeUnitType::THREE) ) {
+    return 2; // 110xxxxx
+  }
+
+  if ( !match(leadingUnit, CodeUnitType::FOUR) ) {
+    return 3; // 1110xxxx
+  }
+
+  return 4; // 11110xxx
+}
+
+}
+
+namespace UTF8 {
+
+CodepointIterator::CodepointIterator(std::string::const_iterator iter):
+  iterator_(iter),
+  dereferenced_(false),
+  codepoint_(0) { }
+
+// Copying is member-wise and includes the cached codepoint.
+CodepointIterator::CodepointIterator(const CodepointIterator&) = default;
+
+CodepointIterator& CodepointIterator::operator=(
+  const CodepointIterator&) = default;
+
+bool CodepointIterator::operator==(const CodepointIterator& src) const {
+  return this->iterator_ == src.iterator_;
+}
+
+bool CodepointIterator::operator!=(const CodepointIterator& src) const {
+  return this->iterator_ != src.iterator_;
+}
+
+bool CodepointIterator::operator==(
+  const std::string::const_iterator& src) const {
+  return this->iterator_ == src;
+}
+
+bool CodepointIterator::operator!=(
+  const std::string::const_iterator& src) const {
+  return this->iterator_ != src;
+}
+
+// Decodes the sequence at the current position on first use and caches the
+// result until the iterator is moved. The sequence is not validated and the
+// end of the string is unknown here, so the position must be the start of a
+// complete sequence.
+char32_t CodepointIterator::operator*() const {
+  if ( !this->dereferenced_ ) {
+    const uint8_t leadingUnit = static_cast<uint8_t>(*(this->iterator_));
+    const std::string::difference_type length = sequenceLength(leadingUnit);
+    char32_t codepoint = 0;
+
+    switch ( length ) {
+      case 1:
+        codepoint = leadingUnit;
+        break;
+      case 2:
+        write(codepoint, leadingUnit, CodePoint::TWO, 6);
+        break;
+      case 3:
+        write(codepoint, leadingUnit, CodePoint::THREE, 12);
+        break;
+      default:
+        write(codepoint, leadingUnit, CodePoint::FOUR, 18);
+        break;
+    }
+
+    // Every continuation unit carries six bits; the last unit of the sequence
+    // supplies the lowest six bits of the codepoint.
+    for ( std::string::difference_type i = 1; i < length; ++i ) {
+      write(codepoint,
+            static_cast<uint8_t>(*(this->iterator_ + i)),
+            CodePoint::CONTINUATION,
+            static_cast<unsigned>(6 * (length - 1 - i)));
+    }
+
+    this->codepoint_ = codepoint;
+    this->dereferenced_ = true;
+  }
+
+  return this->codepoint_;
+}
+
+// Advances by the number of code units announced by the current leading unit.
+CodepointIterator& CodepointIterator::operator++() {
+  const uint8_t leadingUnit = static_cast<uint8_t>(*(this->iterator_));
+
+  this->dereferenced_ = false;
+  this->iterator_ += sequenceLength(leadingUnit);
+
+  return *this;
+}
+
+// Moves back to the leading unit of the preceding sequence. Throws
+// codepoint_invalid, leaving the iterator unchanged, if the preceding unit
+// belongs to a multi unit sequence but none of the three units before it is a
+// leading unit.
+CodepointIterator& CodepointIterator::operator--() {
+  std::string::const_iterator cursor = this->iterator_;
+  --cursor;
+
+  if ( match(static_cast<uint8_t>(*cursor), CodeUnitType::CONTINUATION) ) {
+    bool leadingFound = false;
+
+    // A sequence has at most three continuation units, so its leading unit is
+    // at most three steps further back.
+    for ( int step = 0; step < 3 && !leadingFound; ++step ) {
+      --cursor;
+      leadingFound = match(static_cast<uint8_t>(*cursor),
+                           CodeUnitType::LEADING);
+    }
+
+    if ( !leadingFound ) {
+      throw codepoint_invalid();
+    }
+  }
+
+  this->iterator_ = cursor;
+  this->dereferenced_ = false;
+
+  return *this;
+}
+
+CodepointIterator CodepointIterator::operator++(int) {
+  CodepointIterator oldIter(*this);
+
+  ++(*this);
+
+  return oldIter;
+}
+
+CodepointIterator CodepointIterator::operator--(int) {
+  CodepointIterator oldIter(*this);
+
+  --(*this);
+
+  return oldIter;
+}
+
+}
diff --git a/src/codepoint_iterator.h b/src/codepoint_iterator.h
new file mode 100644
index 0000000..938f53d
--- /dev/null
+++ b/src/codepoint_iterator.h
@@ -0,0 +1,61 @@
+#ifndef CODEPOINT_ITERATOR_H_
+#define CODEPOINT_ITERATOR_H_
+
+#include <cstddef>
+#include <exception>
+#include <iterator>
+#include <string>
+
+namespace UTF8 {
+
+// Bidirectional iterator over the unicode codepoints of a UTF8-encoded
+// std::string. It wraps a std::string::const_iterator and does not know the
+// bounds of the underlying string: callers compare against cbegin() / cend()
+// themselves and must not dereference or step past them.
+class CodepointIterator : public std::iterator<std::bidirectional_iterator_tag,
+                                               char32_t,
+                                               std::ptrdiff_t,
+                                               const char32_t*,
+                                               const char32_t&> {
+ public:
+  CodepointIterator(std::string::const_iterator);
+  CodepointIterator(const CodepointIterator&);
+
+  CodepointIterator& operator=(const CodepointIterator&);
+
+  bool operator==(const CodepointIterator&) const;
+  bool operator==(const std::string::const_iterator&) const;
+
+  bool operator!=(const CodepointIterator&) const;
+  bool operator!=(const std::string::const_iterator&) const;
+
+  // Returns the codepoint encoded at the current position.
+  char32_t operator*() const;
+
+  CodepointIterator& operator++();
+  // Throws codepoint_invalid if no leading code unit is found.
+  CodepointIterator& operator--();
+
+  CodepointIterator operator++(int);
+  CodepointIterator operator--(int);
+
+ private:
+  std::string::const_iterator iterator_;
+
+  // Decoding cache filled by operator*; mutable so that dereferencing works
+  // on const iterators.
+  mutable bool dereferenced_;
+  mutable char32_t codepoint_;
+};
+
+// Thrown by CodepointIterator::operator-- on malformed input.
+class codepoint_invalid: public std::exception {
+ public:
+  const char* what() const noexcept override {
+    return "codepoint_invalid";
+  }
+};
+
+}
+
+#endif // CODEPOINT_ITERATOR_H_
diff --git a/test.cc b/test.cc
new file mode 100644
index 0000000..0693970
--- /dev/null
+++ b/test.cc
@@ -0,0 +1,118 @@
+#include "gtest/gtest.h"
+#include "src/codepoint_iterator.h"
+
+#include <cstddef>
+#include <string>
+#include <vector>
+
+// A UTF8-encoded sample text and the codepoints it is expected to decode to.
+struct SampleText {
+  std::string text;
+  std::vector<char32_t> codepoints;
+};
+
+class CodepointIteratorTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    SampleText tmp;
+
+    tmp.text = u8"Hellø Uni¢od€!";
+    tmp.codepoints = { 72, 101, 108, 108, 248, 32, 85, 110, 105, 162, 111, 100, 8364, 33 };
+    this->sample_.push_back(tmp);
+
+    tmp.text = u8"ᛖᚴ ᚷᛖᛏ ᛖᛏᛁ ᚧ ᚷᛚᛖᚱ ᛘᚾ ᚦᛖᛋᛋ ᚨᚧ ᚡᛖ ᚱᚧᚨ ᛋᚨᚱ";
+    tmp.codepoints = { 5846, 5812, 32, 5815, 5846, 5839, 32, 5846, 5839, 5825, 32, 5799, 32, 5815, 5850, 5846, 5809, 32, 5848, 5822, 32, 5798, 5846, 5835, 5835, 32, 5800, 5799, 32, 5793, 5846, 32, 5809, 5799, 5800, 32, 5835, 5800, 5809 };
+    this->sample_.push_back(tmp);
+
+    tmp.text = u8"⠊⠀⠉⠁⠝⠀⠑⠁⠞⠀⠛⠇⠁⠎⠎⠀⠁⠝⠙⠀⠊⠞⠀⠙⠕⠑⠎⠝⠞⠀⠓⠥⠗⠞⠀⠍⠑";
+    tmp.codepoints = { 10250, 10240, 10249, 10241, 10269, 10240, 10257, 10241, 10270, 10240, 10267, 10247, 10241, 10254, 10254, 10240, 10241, 10269, 10265, 10240, 10250, 10270, 10240, 10265, 10261, 10257, 10254, 10269, 10270, 10240, 10259, 10277, 10263, 10270, 10240, 10253, 10257 };
+    this->sample_.push_back(tmp);
+
+    tmp.text = u8"Ég get etið gler án þess að meiða mig";
+    tmp.codepoints = { 201, 103, 32, 103, 101, 116, 32, 101, 116, 105, 240, 32, 103, 108, 101, 114, 32, 225, 110, 32, 254, 101, 115, 115, 32, 97, 240, 32, 109, 101, 105, 240, 97, 32, 109, 105, 103 };
+    this->sample_.push_back(tmp);
+
+    tmp.text = u8"جام ييه بلورم بڭا ضررى طوقونمز";
+    tmp.codepoints = { 1580, 1575, 1605, 32, 1610, 1610, 1607, 32, 1576, 1604, 1608, 1585, 1605, 32, 1576, 1709, 1575, 32, 1590, 1585, 1585, 1609, 32, 1591, 1608, 1602, 1608, 1606, 1605, 1586 };
+    this->sample_.push_back(tmp);
+
+    tmp.text = u8"මට වීදුරු කෑමට හැකියි. එයින් මට කිසි හානියක් සිදු නොවේ";
+    tmp.codepoints = { 3512, 3495, 32, 3520, 3539, 3503, 3540, 3515, 3540, 32, 3482, 3537, 3512, 3495, 32, 3524, 3536, 3482, 3538, 3514, 3538, 46, 32, 3473, 3514, 3538, 3505, 3530, 32, 3512, 3495, 32, 3482, 3538, 3523, 3538, 32, 3524, 3535, 3505, 3538, 3514, 3482, 3530, 32, 3523, 3538, 3503, 3540, 32, 3505, 3548, 3520, 3546 };
+    this->sample_.push_back(tmp);
+  }
+
+  std::vector<SampleText> sample_;
+};
+
+// Stepping forwards visits exactly one position per expected codepoint.
+TEST_F(CodepointIteratorTest, ForwardIteration) {
+  for ( const auto& sample : this->sample_ ) {
+    size_t length = 0;
+
+    for ( UTF8::CodepointIterator iter(sample.text.cbegin());
+          iter != sample.text.cend();
+          ++iter ) {
+      ++length;
+    }
+
+    EXPECT_EQ(sample.codepoints.size(), length);
+  }
+}
+
+// Stepping backwards visits exactly one position per expected codepoint.
+TEST_F(CodepointIteratorTest, ReverseIteration) {
+  for ( const auto& sample : this->sample_ ) {
+    size_t length = 0;
+
+    for ( UTF8::CodepointIterator iter(sample.text.cend());
+          iter != sample.text.cbegin();
+          --iter ) {
+      ++length;
+    }
+
+    EXPECT_EQ(sample.codepoints.size(), length);
+  }
+}
+
+// Forward dereferencing yields the expected codepoints in order.
+TEST_F(CodepointIteratorTest, Dereferencing) {
+  for ( const auto& sample : this->sample_ ) {
+    size_t index = 0;
+
+    for ( UTF8::CodepointIterator iter(sample.text.cbegin());
+          iter != sample.text.cend();
+          ++iter ) {
+      // Guard the lookup below against decoding too many codepoints.
+      ASSERT_LT(index, sample.codepoints.size());
+      EXPECT_EQ(sample.codepoints[index], *iter);
+
+      ++index;
+    }
+
+    EXPECT_EQ(sample.codepoints.size(), index);
+  }
+}
+
+// Backward dereferencing yields the expected codepoints in reverse order.
+TEST_F(CodepointIteratorTest, ReverseDereferencing) {
+  for ( const auto& sample : this->sample_ ) {
+    size_t index = sample.codepoints.size();
+    UTF8::CodepointIterator iter(sample.text.cend());
+
+    while ( iter != sample.text.cbegin() ) {
+      --iter;
+
+      ASSERT_GT(index, 0u);
+      --index;
+      EXPECT_EQ(sample.codepoints[index], *iter);
+    }
+
+    EXPECT_EQ(0u, index);
+  }
+}
+
+int main(int argc, char **argv) {
+  testing::InitGoogleTest(&argc, argv);
+
+  return RUN_ALL_TESTS();
+}
-- 
cgit v1.2.3