diff options
author | Adrian Kummerlaender | 2016-04-03 18:32:08 +0200 |
---|---|---|
committer | Adrian Kummerlaender | 2016-04-03 19:49:04 +0200 |
commit | ecab2aa84ab1993b7ca3f6d258767cb136312a3b (patch) | |
tree | 0dd8349617ba1cac482a56b0b69c5f5b0670afce /src | |
parent | 0f6f4c9db81ce100baa2502075dc611fa50aa116 (diff) | |
download | justify-ecab2aa84ab1993b7ca3f6d258767cb136312a3b.tar justify-ecab2aa84ab1993b7ca3f6d258767cb136312a3b.tar.gz justify-ecab2aa84ab1993b7ca3f6d258767cb136312a3b.tar.bz2 justify-ecab2aa84ab1993b7ca3f6d258767cb136312a3b.tar.lz justify-ecab2aa84ab1993b7ca3f6d258767cb136312a3b.tar.xz justify-ecab2aa84ab1993b7ca3f6d258767cb136312a3b.tar.zst justify-ecab2aa84ab1993b7ca3f6d258767cb136312a3b.zip |
Implement support for UTF8 multi-byte code points
Diffstat (limited to 'src')
-rw-r--r-- | src/line_accumulator.cc | 22 |
1 file changed, 20 insertions, 2 deletions
```diff
diff --git a/src/line_accumulator.cc b/src/line_accumulator.cc
index 9922f04..d12ef54 100644
--- a/src/line_accumulator.cc
+++ b/src/line_accumulator.cc
@@ -34,6 +34,22 @@ std::vector<std::uint8_t> getRandomIndizes(
 	return indizes;
 }
 
+std::size_t getCharacterLength(const std::string& token) {
+	std::size_t codeUnitIndex = 0;
+	std::size_t codePointIndex = 0;
+
+	while ( token.data()[codeUnitIndex] ) {
+		// advance `codePointIndex` if current unit is not a continuation byte
+		// see RFC3629 for further information
+		if ( (token.data()[codeUnitIndex] & 0b11000000 ) != 0b10000000 ) {
+			++codePointIndex;
+		}
+		++codeUnitIndex;
+	}
+
+	return codePointIndex;
+}
+
 }
 
 namespace justify {
@@ -52,12 +68,14 @@ std::uint8_t LineAccumulator::getMissing() const {
 }
 
 void LineAccumulator::operator()(const std::string& token) {
-	if ( ( this->length_ + token.length() ) > this->max_length_ ) {
+	const std::size_t tokenLength = getCharacterLength(token);
+
+	if ( ( this->length_ + tokenLength ) > this->max_length_ ) {
 		this->discharge(true);
 	}
 
 	this->tokens_.emplace_back(token, 0);
-	this->length_ += token.length();
+	this->length_ += tokenLength;
 
 	if ( this->length_ < this->max_length_ ) {
 		this->tokens_.back().second += 1;
```