Utf8Decoder class
Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The parameters can set an offset into a list of bytes (as int), limit the length of the values to be decoded, and override the default Unicode replacement character. Set replacementCodepoint to null to throw an ArgumentError rather than replace the bad value. Because the class implements Iterator<int>, step through the decoded codepoints with moveNext() and current, or collect all remaining codepoints at once with decodeRest().
/**
 * Iterator that turns a range of UTF-8 encoded bytes into Unicode codepoints.
 *
 * Each call to [moveNext] consumes one lead byte plus the continuation bytes
 * that belong to it and exposes the decoded codepoint through [current].
 * Malformed input is reported as [replacementCodepoint], or as an
 * [ArgumentError] when [replacementCodepoint] is null.
 */
class Utf8Decoder implements Iterator<int> {
  /** Cursor over the selected range of the encoded input bytes. */
  final _ListRangeIterator utf8EncodedBytesIterator;

  /**
   * Codepoint emitted in place of a malformed sequence. When null, malformed
   * input makes [moveNext] throw an [ArgumentError] instead.
   */
  final int replacementCodepoint;

  /** Most recently decoded codepoint; reset to null on every [moveNext]. */
  int _current = null;

  /**
   * Creates a decoder over [utf8EncodedBytes]. [offset] and [length] are
   * forwarded unchanged to the `_ListRange` that selects the bytes to decode.
   */
  Utf8Decoder(List<int> utf8EncodedBytes,
      [int offset = 0,
      int length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
      : utf8EncodedBytesIterator =
            (new _ListRange(utf8EncodedBytes, offset, length)).iterator;

  /** Creates a decoder that reads from the iterator of an existing range. */
  Utf8Decoder._fromListRangeIterator(_ListRange source,
      [this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
      : utf8EncodedBytesIterator = source.iterator;

  /**
   * Decode the remainder of the characters in this decoder
   * into a [List<int>].
   */
  List<int> decodeRest() {
    // One slot per remaining byte is an upper bound: every decoded codepoint
    // consumes at least one byte.
    List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining);
    int i = 0;
    while (moveNext()) {
      codepoints[i++] = current;
    }
    if (i == codepoints.length) {
      return codepoints;
    }
    // Fewer codepoints than bytes were produced; return a right-sized copy.
    List<int> truncCodepoints = new List<int>(i);
    truncCodepoints.setRange(0, i, codepoints);
    return truncCodepoints;
  }

  /** The codepoint produced by the last successful [moveNext], else null. */
  int get current => _current;

  /**
   * Decodes the next codepoint.
   *
   * Returns false when no bytes are left. Otherwise returns true with
   * [current] set either to the decoded codepoint or, for malformed input,
   * to [replacementCodepoint]. Throws an [ArgumentError] for malformed input
   * when [replacementCodepoint] is null.
   */
  bool moveNext() {
    _current = null;
    if (!utf8EncodedBytesIterator.moveNext()) return false;

    int value = utf8EncodedBytesIterator.current;
    int additionalBytes = 0;

    if (value < 0) {
      // Negative values are not bytes at all.
      return _replaceOrThrow();
    } else if (value <= _UTF8_ONE_BYTE_MAX) {
      // Single-byte sequence: the byte is the codepoint.
      _current = value;
      return true;
    } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
      // A continuation byte cannot start a sequence.
      return _replaceOrThrow();
    } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_TWO_BASE;
      additionalBytes = 1;
    } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_THREE_BASE;
      additionalBytes = 2;
    } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE;
      additionalBytes = 3;
    } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE;
      additionalBytes = 4;
    } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {
      value -= _UTF8_FIRST_BYTE_OF_SIX_BASE;
      additionalBytes = 5;
    } else {
      // Value is at or beyond the exclusive upper bound for a lead byte.
      return _replaceOrThrow();
    }

    // Fold in up to [additionalBytes] continuation bytes, six bits each.
    int j = 0;
    while (j < additionalBytes && utf8EncodedBytesIterator.moveNext()) {
      int nextValue = utf8EncodedBytesIterator.current;
      bool isContinuation = nextValue > _UTF8_ONE_BYTE_MAX &&
          nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE;
      if (!isContinuation) {
        // The byte does not belong to this sequence, so hand it back for the
        // next moveNext() call. Previously only lead bytes were handed back,
        // so a single-byte character following a truncated sequence was
        // silently dropped from the output.
        utf8EncodedBytesIterator.backup();
        break;
      }
      value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK));
      j++;
    }

    // Complete sequence that does not land in the UTF-16 reserved range.
    bool validSequence = (j == additionalBytes &&
        (value < UNICODE_UTF16_RESERVED_LO ||
            value > UNICODE_UTF16_RESERVED_HI));
    // Reject encodings longer than necessary. Sequences with 4 or 5
    // additional bytes never satisfy this test and are always rejected.
    bool nonOverlong =
        (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||
        (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||
        (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);
    bool inRange = value <= UNICODE_VALID_RANGE_MAX;

    if (validSequence && nonOverlong && inRange) {
      _current = value;
      return true;
    }
    // NOTE(review): assumes backup() moves `position` back by one so that
    // `position - j` refers to the lead byte - confirm in _ListRangeIterator.
    return _replaceOrThrow(j);
  }

  /**
   * Reports malformed input: stores [replacementCodepoint] in [current] and
   * returns true, or throws an [ArgumentError] when no replacement is
   * configured. [rewind] is subtracted from the iterator position in the
   * error message.
   */
  bool _replaceOrThrow([int rewind = 0]) {
    if (replacementCodepoint == null) {
      throw new ArgumentError(
          "Invalid UTF8 at ${utf8EncodedBytesIterator.position - rewind}");
    }
    _current = replacementCodepoint;
    return true;
  }
}
Implements
Constructors
new Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length, int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) #
/**
 * Creates a decoder over [utf8EncodedBytes].
 *
 * [offset] and [length] are forwarded unchanged to the `_ListRange` whose
 * iterator supplies the bytes; [replacementCodepoint] is stored as given.
 */
Utf8Decoder(List<int> utf8EncodedBytes,
    [int offset = 0,
    int length,
    this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
    : utf8EncodedBytesIterator =
          new _ListRange(utf8EncodedBytes, offset, length).iterator;
Properties
final _ListRangeIterator utf8EncodedBytesIterator #
/** Iterator over the range of encoded bytes that this decoder reads from. */
final _ListRangeIterator utf8EncodedBytesIterator
Methods
List<int> decodeRest() #
Decode the remainder of the characters in this decoder
into a [List<int>].
/**
 * Decodes everything that is left in this decoder and returns the
 * codepoints as a [List<int>] sized exactly to the number decoded.
 */
List<int> decodeRest() {
  // Allocate one slot per remaining byte, then trim afterwards if fewer
  // codepoints than bytes were produced.
  final List<int> buffer = new List<int>(utf8EncodedBytesIterator.remaining);
  int count = 0;
  while (moveNext()) {
    buffer[count] = current;
    count++;
  }
  if (count != buffer.length) {
    final List<int> trimmed = new List<int>(count);
    trimmed.setRange(0, count, buffer);
    return trimmed;
  }
  return buffer;
}
bool moveNext() #
Moves to the next element. Returns true if current contains the next element. Returns false, if no element was left.
It is safe to invoke moveNext even when the iterator is already positioned after the last element. In this case moveNext has no effect.
/**
 * Decodes the next codepoint.
 *
 * Returns false when no bytes are left. Otherwise returns true with the
 * current value set either to the decoded codepoint or, for malformed input,
 * to [replacementCodepoint]. Throws an [ArgumentError] for malformed input
 * when [replacementCodepoint] is null.
 */
bool moveNext() {
  _current = null;
  if (!utf8EncodedBytesIterator.moveNext()) return false;

  int value = utf8EncodedBytesIterator.current;
  int additionalBytes = 0;

  if (value < 0) {
    // Negative values are not bytes at all.
    if (replacementCodepoint != null) {
      _current = replacementCodepoint;
      return true;
    } else {
      throw new ArgumentError(
          "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
    }
  } else if (value <= _UTF8_ONE_BYTE_MAX) {
    // Single-byte sequence: the byte is the codepoint.
    _current = value;
    return true;
  } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
    // A continuation byte cannot start a sequence.
    if (replacementCodepoint != null) {
      _current = replacementCodepoint;
      return true;
    } else {
      throw new ArgumentError(
          "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
    }
  } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {
    value -= _UTF8_FIRST_BYTE_OF_TWO_BASE;
    additionalBytes = 1;
  } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {
    value -= _UTF8_FIRST_BYTE_OF_THREE_BASE;
    additionalBytes = 2;
  } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {
    value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE;
    additionalBytes = 3;
  } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {
    value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE;
    additionalBytes = 4;
  } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {
    value -= _UTF8_FIRST_BYTE_OF_SIX_BASE;
    additionalBytes = 5;
  } else if (replacementCodepoint != null) {
    // Value is at or beyond the exclusive upper bound for a lead byte.
    _current = replacementCodepoint;
    return true;
  } else {
    throw new ArgumentError(
        "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
  }

  // Fold in up to additionalBytes continuation bytes, six bits each.
  int j = 0;
  while (j < additionalBytes && utf8EncodedBytesIterator.moveNext()) {
    int nextValue = utf8EncodedBytesIterator.current;
    bool isContinuation = nextValue > _UTF8_ONE_BYTE_MAX &&
        nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE;
    if (!isContinuation) {
      // The byte does not belong to this sequence, so hand it back for the
      // next moveNext() call. Previously only lead bytes were handed back,
      // so a single-byte character following a truncated sequence was
      // silently dropped from the output.
      utf8EncodedBytesIterator.backup();
      break;
    }
    value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK));
    j++;
  }

  // Complete sequence that does not land in the UTF-16 reserved range.
  bool validSequence = (j == additionalBytes &&
      (value < UNICODE_UTF16_RESERVED_LO ||
          value > UNICODE_UTF16_RESERVED_HI));
  // Reject encodings longer than necessary. Sequences with 4 or 5
  // additional bytes never satisfy this test and are always rejected.
  bool nonOverlong =
      (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||
      (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||
      (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);
  bool inRange = value <= UNICODE_VALID_RANGE_MAX;

  if (validSequence && nonOverlong && inRange) {
    _current = value;
    return true;
  } else if (replacementCodepoint != null) {
    _current = replacementCodepoint;
    return true;
  } else {
    // NOTE(review): assumes backup() moves `position` back by one so that
    // `position - j` refers to the lead byte - confirm in _ListRangeIterator.
    throw new ArgumentError(
        "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}");
  }
}