Dart API Reference > dart:utf > Utf8Decoder

Utf8Decoder class

Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The constructor parameters can set an offset into the list of bytes (given as ints), limit the number of values to be decoded, and override the default Unicode replacement character. Set replacementCodepoint to null to throw an ArgumentError rather than replace a bad value. The decoder is itself an Iterator<int>: advance it with moveNext() and read current, or call decodeRest() to collect all remaining codepoints into a list.

/**
 * An [Iterator] that decodes UTF-8 encoded bytes into Unicode codepoints.
 *
 * Each successful [moveNext] consumes one encoded sequence from the
 * underlying byte range and exposes the decoded codepoint through [current].
 * Malformed input yields [replacementCodepoint]; when that field is null an
 * [ArgumentError] is thrown instead.
 */
class Utf8Decoder implements Iterator<int> {
  /** Cursor over the range of encoded units being decoded. */
  final _ListRangeIterator utf8EncodedBytesIterator;

  /**
   * Codepoint substituted for malformed input, or null to make [moveNext]
   * throw an [ArgumentError] on malformed input.
   */
  final int replacementCodepoint;

  // Codepoint produced by the latest moveNext(); null when there is none.
  int _current;

  /**
   * Creates a decoder over [utf8EncodedBytes].
   *
   * [offset] and [length] are passed straight to the `_ListRange` that
   * backs the decoder. [replacementCodepoint] defaults to
   * [UNICODE_REPLACEMENT_CHARACTER_CODEPOINT].
   */
  Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length,
      this.replacementCodepoint =
      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf8EncodedBytesIterator =
          (new _ListRange(utf8EncodedBytes, offset, length)).iterator;

  /**
   * Creates a decoder that reads from the iterator of an existing [source]
   * range, with the same [replacementCodepoint] default as the public
   * constructor.
   */
  Utf8Decoder._fromListRangeIterator(_ListRange source, [
      this.replacementCodepoint =
      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf8EncodedBytesIterator = source.iterator;

  /**
   * Decodes the remainder of the characters in this decoder
   * into a [List<int>].
   *
   * The result list has exactly one entry per codepoint produced by
   * [moveNext], including any replacement codepoints.
   */
  List<int> decodeRest() {
    // Sized from the iterator's `remaining` count; a multi-unit sequence
    // yields a single codepoint, so fewer slots may end up being used.
    List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining);
    int i = 0;
    while (moveNext()) {
      codepoints[i++] = current;
    }
    if (i == codepoints.length) {
      return codepoints;
    }
    // Copy the filled prefix into a list of the exact decoded length.
    List<int> truncCodepoints = new List<int>(i);
    truncCodepoints.setRange(0, i, codepoints);
    return truncCodepoints;
  }

  /**
   * The codepoint decoded by the latest [moveNext] call, or null if no
   * codepoint is available (before the first call, or once [moveNext] has
   * returned false).
   */
  int get current => _current;

  /**
   * Decodes the next codepoint into [current].
   *
   * Returns false when the underlying iterator is exhausted. Otherwise
   * returns true with [current] set either to the decoded codepoint or, for
   * malformed input, to [replacementCodepoint]. Throws an [ArgumentError]
   * for malformed input when [replacementCodepoint] is null.
   *
   * A sequence is rejected when it is truncated, overlong, lies in the
   * [UNICODE_UTF16_RESERVED_LO]..[UNICODE_UTF16_RESERVED_HI] range, or
   * exceeds [UNICODE_VALID_RANGE_MAX]. Lead units announcing four or five
   * additional units are parsed so their continuation units are consumed,
   * but such sequences are never accepted.
   *
   * When a sequence is cut short by a unit that is not a continuation
   * unit, that unit is left unread so the following call decodes it on its
   * own; it is never absorbed into the malformed sequence.
   */
  bool moveNext() {
    _current = null;

    if (!utf8EncodedBytesIterator.moveNext()) return false;

    int value = utf8EncodedBytesIterator.current;
    int additionalBytes = 0;

    if (value < 0) {
      return _replaceOrThrow();
    } else if (value <= _UTF8_ONE_BYTE_MAX) {
      // Single-unit sequence: the unit is the codepoint.
      _current = value;
      return true;
    } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
      // A continuation unit where a sequence start was expected.
      return _replaceOrThrow();
    } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_TWO_BASE;
      additionalBytes = 1;
    } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_THREE_BASE;
      additionalBytes = 2;
    } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE;
      additionalBytes = 3;
    } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE;
      additionalBytes = 4;
    } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {
      value -= _UTF8_FIRST_BYTE_OF_SIX_BASE;
      additionalBytes = 5;
    } else {
      // Beyond the range of any sequence-starting unit.
      return _replaceOrThrow();
    }

    // Fold in the continuation units, six payload bits each.
    int j = 0;
    while (j < additionalBytes && utf8EncodedBytesIterator.moveNext()) {
      int nextValue = utf8EncodedBytesIterator.current;
      if (nextValue <= _UTF8_ONE_BYTE_MAX ||
          nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) {
        // Not a continuation unit: the sequence is truncated. Un-read the
        // unit so the next call starts on it. Previously only
        // sequence-starting units were backed up, which silently dropped a
        // single-byte unit that followed a truncated sequence.
        utf8EncodedBytesIterator.backup();
        break;
      }
      value = (value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK);
      j++;
    }

    bool validSequence = (j == additionalBytes && (
        value < UNICODE_UTF16_RESERVED_LO ||
        value > UNICODE_UTF16_RESERVED_HI));
    // No clause for 4 or 5 additional units, so those forms always fail.
    bool nonOverlong =
        (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||
        (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||
        (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);
    bool inRange = value <= UNICODE_VALID_RANGE_MAX;
    if (validSequence && nonOverlong && inRange) {
      _current = value;
      return true;
    }
    // Step back over the j continuation units that were consumed so the
    // reported offset refers to the start of the bad sequence.
    return _replaceOrThrow(j);
  }

  /**
   * Handles malformed input: stores [replacementCodepoint] as the current
   * value and returns true, or throws an [ArgumentError] when no
   * replacement is configured.
   *
   * [consumed] is subtracted from the iterator position to form the offset
   * reported in the error message.
   */
  bool _replaceOrThrow([int consumed = 0]) {
    if (replacementCodepoint != null) {
      _current = replacementCodepoint;
      return true;
    }
    int at = utf8EncodedBytesIterator.position - consumed;
    throw new ArgumentError("Invalid UTF8 at $at");
  }
}

Implements

Iterator<int>

Constructors

new Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length, int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) #

/**
 * Creates a decoder over [utf8EncodedBytes].
 *
 * [offset] and [length] are passed straight to the `_ListRange` whose
 * iterator backs the decoder. [replacementCodepoint] defaults to
 * [UNICODE_REPLACEMENT_CHARACTER_CODEPOINT].
 */
Utf8Decoder(List<int> utf8EncodedBytes,
    [int offset = 0,
    int length,
    this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
    : utf8EncodedBytesIterator =
          new _ListRange(utf8EncodedBytes, offset, length).iterator;

Properties

final int current #

Returns the current element.

Return null if the iterator has not yet been moved to the first element, or if the iterator has been moved after the last element of the Iterable.

docs inherited from Iterator<int>
/** The codepoint stored by the latest moveNext() call, or null if none. */
int get current {
  return _current;
}

final int replacementCodepoint #

final int replacementCodepoint

final _ListRangeIterator utf8EncodedBytesIterator #

final _ListRangeIterator utf8EncodedBytesIterator

Methods

List<int> decodeRest() #

Decodes the remainder of the characters in this decoder into a [List<int>].

/**
 * Drains this decoder and returns every remaining codepoint as a list.
 *
 * The returned list holds exactly one entry per successful [moveNext]
 * call, in decoding order.
 */
List<int> decodeRest() {
  // Scratch buffer sized from the iterator's `remaining` count (presumably
  // the number of undecoded units); it may end up only partly filled.
  final List<int> buffer = new List<int>(utf8EncodedBytesIterator.remaining);
  int count = 0;
  for (; moveNext(); count++) {
    buffer[count] = current;
  }
  if (count != buffer.length) {
    // Fewer codepoints than slots: hand back an exactly sized copy.
    final List<int> trimmed = new List<int>(count);
    trimmed.setRange(0, count, buffer);
    return trimmed;
  }
  return buffer;
}

bool moveNext() #

Moves to the next element. Returns true if current contains the next element. Returns false, if no element was left.

It is safe to invoke moveNext even when the iterator is already positioned after the last element. In this case moveNext has no effect.

docs inherited from Iterator<int>
/**
 * Decodes the next codepoint and stores it as the current value.
 *
 * Returns false when the underlying iterator is exhausted. Otherwise
 * returns true with the current value set either to the decoded codepoint
 * or, for malformed input, to [replacementCodepoint]. Throws an
 * [ArgumentError] for malformed input when [replacementCodepoint] is null.
 *
 * A sequence is rejected when it is truncated, overlong, lies in the
 * [UNICODE_UTF16_RESERVED_LO]..[UNICODE_UTF16_RESERVED_HI] range, or
 * exceeds [UNICODE_VALID_RANGE_MAX]. Lead units announcing four or five
 * additional units are parsed so their continuation units are consumed,
 * but such sequences are never accepted.
 *
 * When a sequence is cut short by a unit that is not a continuation unit,
 * that unit is left unread so the following call decodes it on its own.
 */
bool moveNext() {
  _current = null;

  if (!utf8EncodedBytesIterator.moveNext()) return false;

  int value = utf8EncodedBytesIterator.current;
  int additionalBytes = 0;

  if (value < 0) {
    if (replacementCodepoint != null) {
      _current = replacementCodepoint;
      return true;
    } else {
      throw new ArgumentError(
          "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
    }
  } else if (value <= _UTF8_ONE_BYTE_MAX) {
    // Single-unit sequence: the unit is the codepoint.
    _current = value;
    return true;
  } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
    // A continuation unit where a sequence start was expected.
    if (replacementCodepoint != null) {
      _current = replacementCodepoint;
      return true;
    } else {
      throw new ArgumentError(
          "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
    }
  } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {
    value -= _UTF8_FIRST_BYTE_OF_TWO_BASE;
    additionalBytes = 1;
  } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {
    value -= _UTF8_FIRST_BYTE_OF_THREE_BASE;
    additionalBytes = 2;
  } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {
    value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE;
    additionalBytes = 3;
  } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {
    value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE;
    additionalBytes = 4;
  } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {
    value -= _UTF8_FIRST_BYTE_OF_SIX_BASE;
    additionalBytes = 5;
  } else if (replacementCodepoint != null) {
    // Beyond the range of any sequence-starting unit.
    _current = replacementCodepoint;
    return true;
  } else {
    throw new ArgumentError(
        "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
  }

  // Fold in the continuation units, six payload bits each.
  int j = 0;
  while (j < additionalBytes && utf8EncodedBytesIterator.moveNext()) {
    int nextValue = utf8EncodedBytesIterator.current;
    if (nextValue <= _UTF8_ONE_BYTE_MAX ||
        nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) {
      // Not a continuation unit: the sequence is truncated. Un-read the
      // unit so the next call starts on it. Previously only
      // sequence-starting units were backed up, which silently dropped a
      // single-byte unit that followed a truncated sequence.
      utf8EncodedBytesIterator.backup();
      break;
    }
    value = (value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK);
    j++;
  }

  bool validSequence = (j == additionalBytes && (
      value < UNICODE_UTF16_RESERVED_LO ||
      value > UNICODE_UTF16_RESERVED_HI));
  // No clause for 4 or 5 additional units, so those forms always fail.
  bool nonOverlong =
      (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||
      (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||
      (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);
  bool inRange = value <= UNICODE_VALID_RANGE_MAX;
  if (validSequence && nonOverlong && inRange) {
    _current = value;
    return true;
  } else if (replacementCodepoint != null) {
    _current = replacementCodepoint;
    return true;
  } else {
    // Subtract the j consumed continuation units so the offset refers to
    // the start of the bad sequence.
    throw new ArgumentError(
        "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}");
  }
}