From ecab2aa84ab1993b7ca3f6d258767cb136312a3b Mon Sep 17 00:00:00 2001
From: Adrian Kummerlaender
Date: Sun, 3 Apr 2016 18:32:08 +0200
Subject: Implement support for UTF8 multi-byte code points

---
 src/line_accumulator.cc | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/line_accumulator.cc b/src/line_accumulator.cc
index 9922f04..d12ef54 100644
--- a/src/line_accumulator.cc
+++ b/src/line_accumulator.cc
@@ -34,6 +34,22 @@ std::vector<std::uint8_t> getRandomIndizes(
 	return indizes;
 }
 
+// Returns the number of UTF-8 code points in `token` rather than its byte count.
+std::size_t getCharacterLength(const std::string& token) {
+	std::size_t codePointCount = 0;
+
+	// walk the string's full extent so an embedded NUL cannot truncate the count
+	for ( const char codeUnit : token ) {
+		// count every unit that is not a continuation byte (bit pattern 10xxxxxx)
+		// see RFC3629 for further information
+		if ( ( codeUnit & 0b11000000 ) != 0b10000000 ) {
+			++codePointCount;
+		}
+	}
+
+	return codePointCount;
+}
+
 }
 
 namespace justify {
@@ -52,12 +68,14 @@ std::uint8_t LineAccumulator::getMissing() const {
 }
 
 void LineAccumulator::operator()(const std::string& token) {
-	if ( ( this->length_ + token.length() ) > this->max_length_ ) {
+	const std::size_t tokenLength = getCharacterLength(token);
+
+	if ( ( this->length_ + tokenLength ) > this->max_length_ ) {
 		this->discharge(true);
 	}
 
 	this->tokens_.emplace_back(token, 0);
-	this->length_ += token.length();
+	this->length_ += tokenLength;
 
 	if ( this->length_ < this->max_length_ ) {
 		this->tokens_.back().second += 1;
-- 
cgit v1.2.3