aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author: Adrian Kummerlaender, 2016-04-03 18:32:08 +0200
committer: Adrian Kummerlaender, 2016-04-03 19:49:04 +0200
commit: ecab2aa84ab1993b7ca3f6d258767cb136312a3b (patch)
tree: 0dd8349617ba1cac482a56b0b69c5f5b0670afce /src
parent: 0f6f4c9db81ce100baa2502075dc611fa50aa116 (diff)
download: justify-ecab2aa84ab1993b7ca3f6d258767cb136312a3b.tar
justify-ecab2aa84ab1993b7ca3f6d258767cb136312a3b.tar.gz
justify-ecab2aa84ab1993b7ca3f6d258767cb136312a3b.tar.bz2
justify-ecab2aa84ab1993b7ca3f6d258767cb136312a3b.tar.lz
justify-ecab2aa84ab1993b7ca3f6d258767cb136312a3b.tar.xz
justify-ecab2aa84ab1993b7ca3f6d258767cb136312a3b.tar.zst
justify-ecab2aa84ab1993b7ca3f6d258767cb136312a3b.zip
Implement support for UTF8 multi-byte code points
Diffstat (limited to 'src')
-rw-r--r-- src/line_accumulator.cc 22
1 files changed, 20 insertions, 2 deletions
diff --git a/src/line_accumulator.cc b/src/line_accumulator.cc
index 9922f04..d12ef54 100644
--- a/src/line_accumulator.cc
+++ b/src/line_accumulator.cc
@@ -34,6 +34,22 @@ std::vector<std::uint8_t> getRandomIndizes(
return indizes;
}
+std::size_t getCharacterLength(const std::string& token) { // returns how many bytes of `token` are not UTF-8 continuation bytes, i.e. its code point count for well-formed UTF-8
+ std::size_t codeUnitIndex = 0; // index of the byte (code unit) currently being inspected
+ std::size_t codePointIndex = 0; // running count of code points; this is the value returned
+ // NOTE(review): the scan ends at the first zero byte, so it depends on the terminator behind data() and would stop early at an embedded NUL - confirm tokens never contain one
+ while ( token.data()[codeUnitIndex] ) {
+ // Count the current byte unless it is a continuation byte, i.e. its two
+ // top bits are `10`; see RFC 3629 for the UTF-8 byte layout.
+ if ( (token.data()[codeUnitIndex] & 0b11000000 ) != 0b10000000 ) {
+ ++codePointIndex;
+ }
+ ++codeUnitIndex;
+ }
+
+ return codePointIndex;
+}
+
}
namespace justify {
@@ -52,12 +68,14 @@ std::uint8_t LineAccumulator::getMissing() const {
}
void LineAccumulator::operator()(const std::string& token) {
- if ( ( this->length_ + token.length() ) > this->max_length_ ) {
+ const std::size_t tokenLength = getCharacterLength(token);
+
+ if ( ( this->length_ + tokenLength ) > this->max_length_ ) {
this->discharge(true);
}
this->tokens_.emplace_back(token, 0);
- this->length_ += token.length();
+ this->length_ += tokenLength;
if ( this->length_ < this->max_length_ ) {
this->tokens_.back().second += 1;