├── .project
├── .properties
├── LICENSE
├── README.md
└── UTF8String
    ├── ByteArray.extension.st
    ├── String.extension.st
    ├── UTF8String.class.st
    ├── UTF8StringTest.class.st
    └── package.st


/.project:
--------------------------------------------------------------------------------
1 | {
2 | 	'srcDirectory' : ''
3 | }


--------------------------------------------------------------------------------
/.properties:
--------------------------------------------------------------------------------
1 | {
2 | 	#format : #tonel
3 | }


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Sven Van Caekenberghe
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # UTF8String
  2 | 
  3 | A proof of concept / prototype alternative String implementation for Pharo
  4 | using a variable length UTF8 encoded internal representation.
  5 | 
  6 | 
  7 | ## Introduction
  8 | 
  9 | In Pharo Strings, sequences of Characters, are implemented by storing the Unicode code points of the Characters.
 10 | In general, 32 bits are needed for Unicode code points. However, the most common ASCII and Latin1 code points fit in 8 bits.
 11 | Two subclasses of String, WideString and ByteString respectively, cover these cases, transparently.
 12 | 
 13 | When doing IO or using FFI, Strings must be encoded using an encoder, to and from a ByteArray or binary stream.
 14 | Many encodings are in common use, but today, UTF8 has basically won, as it is the default encoding almost everywhere.
 15 | 
 16 | There is a real cost associated with encoding and decoding, especially with a variable length encoding such as UTF8.
 17 | 
 18 | So one might ask the question: could we not use UTF8 as the internal representation of Strings?
 19 | Some other programming languages, most notably Swift, took this road years ago.
 20 | 
 21 | 
 22 | ## Implementation
 23 | 
 24 | UTF8String is a proof of concept / prototype alternative String implementation for Pharo
 25 | using a variable length UTF8 encoded internal representation to explore this idea.
 26 | Furthermore UTF8String is readonly (no #at:put:).
 27 | 
 28 | The main problem with UTF8 is that it is a variable length encoding, with Characters being encoded using 1 to 4 bytes.
 29 | This means two things: indexing is much harder, as it basically comes down to a linear scan
 30 | and similarly, knowing the length in number of Characters can only be done after a linear scan.
 31 | 
 32 | Replacing one character with another is almost impossible, since this might shift things.
 33 | 
 34 | There are two clear advantages: IO and FFI can be done with zero cost (to UTF8 obviously, not to other encodings)
 35 | and space usage is more efficient in most cases (when at least one character does not fit in 8 bits).
 36 | 
 37 | 
 38 | ## Indexing and length caching
 39 | 
 40 | The UTF8String implementation just stores the UTF8 encoded bytes.
 41 | It tries to avoid indexing and counting if at all possible.
 42 | If indexing or the character count are needed, a single scan is performed,
 43 | that creates an index every stride (32) characters,
 44 | while also storing the length (#computeCountAndIndex).
 45 | Further operations can then be performed faster.
 46 | The key internal operations are:
 47 | 
 48 | - #byteIndexAt: characterIndex
 49 | - #characterIndexAt: byteIndex
 50 | 
 51 | By using the index, the linear search is limited to stride (32) characters at the most.
 52 | 
 53 | 
 54 | ## Operations
 55 | 
 56 | A surprising number of operations are possible that avoid indexing
 57 | or the character count:
 58 | 
 59 | - equality (#=)
 60 | - hashing (#hash)
 61 | - character inclusion (#includes:)
 62 | - empty test (#isEmpty)
 63 | - substring searching (#includesSubstring:)
 64 | - prefix/suffix matching (#beginsWith: #endsWith:)
 65 | - concatenation (#,)
 66 | 
 67 | Many other operations can be written using only a single (partial) scan:
 68 | 
 69 | - finding tokens (#findTokens:)
 70 | - formatting by interpolation (#format:)
 71 | - printing (#printOn:)
 72 | - comparing/sorting (#threeWayCompareTo: #< #<= #>= #>)
 73 | - partial copying (#copyUpTo:)
 74 | - enumeration (#do: #reverseDo: #collect: #readStream)
 75 | 
 76 | On the other hand, many traditional operations trigger indexing and character counting:
 77 | 
 78 | - indexing (#at:)
 79 | - counting the characters (#size)
 80 | - convenience accessors (#first #last)
 81 | - finding the index of a character or substring (#indexOf:[startingAt:] #indexOfSubCollection:)
 82 | - substring selection (#copyFrom:to:)
 83 | 
 84 | 
 85 | ## Discussion
 86 | 
 87 | The implementation was written to see if it could be done and how it would feel.
 88 | Not every algorithm is fully optimal, more specific loops are possible.
 89 | 
 90 | When creating a UTF8String on UTF8 encoded bytes, this is a zero cost operation
 91 | only if we assume the encoding is correct. A validate operation is available
 92 | to check this, but that defeats the speed advantage for the most part.
 93 | BTW, validate automatically does indexing and character counting.
 94 | 
 95 | An aspect that was ignored is the concept of Unicode normalization with respect to concatenation.
 96 | This is a hard subject that has been solved in Pharo using external code, but it is not integrated in this implementation.
 97 | 
 98 | The concept of readonly strings is worth considering and feels acceptable, but requires a certain mindset.
 99 | 
100 | 
101 | ## Conclusion
102 | 
103 | Although this experiment went well, it is not meant for actual use.
104 | 


--------------------------------------------------------------------------------
/UTF8String/ByteArray.extension.st:
--------------------------------------------------------------------------------
1 | Extension { #name : #ByteArray }
2 | 
3 | { #category : #'*UTF8String' }
4 | ByteArray >> asUTF8String [ "Answer a UTF8String on me, taking my contents as UTF8 encoded bytes (see UTF8String class>>#on:)"
5 | 	^ UTF8String on: self
6 | ]
7 | 


--------------------------------------------------------------------------------
/UTF8String/String.extension.st:
--------------------------------------------------------------------------------
1 | Extension { #name : #String }
2 | 
3 | { #category : #'*UTF8String' }
4 | String >> asUTF8String [ "Answer a UTF8String with my characters (see UTF8String class>>#fromSystemString:)"
5 | 	^ UTF8String fromSystemString: self
6 | ]
7 | 


--------------------------------------------------------------------------------
/UTF8String/UTF8String.class.st:
--------------------------------------------------------------------------------
  1 | "
  2 | I am UTF8String, a new kind of String.
  3 | 
  4 | I hold an ordered collection or sequence of Character objects.
  5 | 
  6 | I use variable length UTF8 encoding as my internal representation so that I can offer IO operations with zero cost for binary streams or FFI. This representation is also space efficient.
  7 | 
  8 | I use a lazily computed count and index to deal with the variable length encoding. A character count requires a full linear scan. Character based indexing (#at:) needs such a linear scan as well. The index maps character indices to byte indices every #stride characters to speed things up.
  9 | 
 10 | I know when I am pure ASCII (an important optimization case).
 11 | 
 12 | I am readonly (and cannot be modified, I do not understand #at:put:).
 13 | 
 14 | I am compatible with the most important classic String API, while encouraging operations that avoid computing the count or index by scanning.
 15 | Instance variables: bytes is the ByteArray with my UTF8 encoding; count is my number of characters, nil until computed; index is an Array whose k-th element is the byte index where character (k * stride) + 1 starts, nil until computed.
 16 | "
 17 | Class {
 18 | 	#name : #UTF8String,
 19 | 	#superclass : #Object,
 20 | 	#instVars : [
 21 | 		'bytes',
 22 | 		'count',
 23 | 		'index'
 24 | 	],
 25 | 	#category : #UTF8String
 26 | }
 27 | 
 28 | { #category : #requirements }
 29 | UTF8String class >> empty [
 30 | 	"Answer an empty UTF8String, already marked as ASCII. Being readonly, it could be cached."
 31 | 	| emptyString |
 32 | 	(emptyString := self on: #[]) beAscii.
 33 | 	^ emptyString
 34 | ]
 35 | 
 36 | { #category : #accessing }
 37 | UTF8String class >> encoder [
 38 | 	"Return the ZnCharacterEncoder that I use, obviously UTF8: the default ZnUTF8Encoder instance."
 39 | 	
 40 | 	^ ZnUTF8Encoder default
 41 | ]
 42 | 
 43 | { #category : #'instance creation' }
 44 | UTF8String class >> fromCharacter: character [
 45 | 	"Answer a new UTF8String holding exactly one character, character, in its UTF8 encoding"
 46 | 	
 47 | 	| utf8Encoder value encoded |
 48 | 	utf8Encoder := self encoder.
 49 | 	value := character asInteger.
 50 | 	encoded := ByteArray
 51 | 		new: (utf8Encoder encodedByteCountFor: value)
 52 | 		streamContents: [ :stream | utf8Encoder nextPutCodePoint: value toStream: stream ].
 53 | 	^ self on: encoded
 54 | ]
 55 | 
 56 | { #category : #'instance creation' }
 57 | UTF8String class >> fromSystemString: systemString [
 58 | 	"Return a new UTF8String containing the characters in systemString, by UTF8 encoding it."
 59 | 	
 60 | 	^ self on: systemString utf8Encoded 
 61 | ]
 62 | 
 63 | { #category : #'instance creation' }
 64 | UTF8String class >> on: utf8EncodedBytes [
 65 | 	"Return a new UTF8String containing the characters encoded in utf8EncodedBytes, a ByteArray.
 66 | 	The bytes are kept as is, they are not copied. Note that no validation is performed. See #validate"
 67 | 	
 68 | 	^ self new on: utf8EncodedBytes; yourself
 69 | ]
 70 | 
 71 | { #category : #'instance creation' }
 72 | UTF8String class >> streamContents: block [
 73 | 	"Answer a new UTF8String made of the characters that block writes to the character WriteStream it is given"
 74 | 	
 75 | 	^ self on: (ByteArray streamContents: [ :byteStream | 
 76 | 		| characterStream |
 77 | 		characterStream := ZnCharacterWriteStream on: byteStream.
 78 | 		block value: characterStream ])
 79 | ]
 80 | 
 81 | { #category : #copying }
 82 | UTF8String >> , anotherUTF8String [
 83 | 	"Return a new string by concatenating myself with anotherUTF8String. I am left untouched:
 84 | 	the result is created through my class, the instance side #on: would overwrite my own bytes.
 85 | 	In principle we should do 'Normalization Preserving Concatenation', for now, assume we don't need it,
 86 | 	see https://medium.com/concerning-pharo/an-implementation-of-unicode-normalization-7c6719068f43"
 87 | 	
 88 | 	^ self class on: self utf8EncodedBytes , anotherUTF8String utf8EncodedBytes
 89 | ]
 90 | 
 91 | { #category : #comparing }
 92 | UTF8String >> < anotherUTF8String [
 93 | 	"Return true when I come lexicographically before anotherUTF8String, false otherwise.
 94 | 	Characters are compared one by one, see #threeWayCompareTo:"
 95 | 	^ (self threeWayCompareTo: anotherUTF8String) < 0 
 96 | ]
 97 | 
 98 | { #category : #comparing }
 99 | UTF8String >> <= anotherUTF8String [
100 | 	"Return true when I come lexicographically before or am equal to anotherUTF8String, false otherwise.
101 | 	Characters are compared one by one, see #threeWayCompareTo:"
102 | 	^ (self threeWayCompareTo: anotherUTF8String) <= 0 
103 | ]
104 | 
105 | { #category : #comparing }
106 | UTF8String >> = anotherUTF8String [
107 | 	"Answer true when anotherUTF8String is me, or is an instance of my class with the same UTF8 bytes"
108 | 	
109 | 	^ self == anotherUTF8String or: [ 
110 | 		self class = anotherUTF8String class 
111 | 			and: [ bytes = anotherUTF8String utf8EncodedBytes ] ]
112 | ]
113 | 
114 | { #category : #comparing }
115 | UTF8String >> > anotherUTF8String [
116 | 	"Return true when I come lexicographically after anotherUTF8String, false otherwise.
117 | 	Characters are compared one by one, see #threeWayCompareTo:"
118 | 	^ (self threeWayCompareTo: anotherUTF8String) > 0 
119 | ]
120 | 
121 | { #category : #comparing }
122 | UTF8String >> >= anotherUTF8String [
123 | 	"Return true when I come lexicographically after or am equal to anotherUTF8String, false otherwise.
124 | 	Characters are compared one by one, see #threeWayCompareTo:"
125 | 	^ (self threeWayCompareTo: anotherUTF8String) >= 0 
126 | ]
127 | 
128 | { #category : #converting }
129 | UTF8String >> asByteArray [
130 | 	"Convert me to a ByteArray containing my UTF8 encoded representation.
131 | 	The result is what #utf8EncodedBytes answers, my own bytes, not a copy."
132 | 	^ self utf8EncodedBytes
133 | ]
134 | 
135 | { #category : #converting }
136 | UTF8String >> asLowercase [
137 | 	"Return a copy of me with each character converted to lowercase.
138 | 	When I am empty, #collect: answers myself instead of a copy."
139 | 	^ self collect: [ :each | each asLowercase ]
140 | ]
141 | 
142 | { #category : #converting }
143 | UTF8String >> asString [
144 | 	"Convert me to a system String, by UTF8 decoding my bytes"
145 | 	
146 | 	^ bytes utf8Decoded
147 | ]
148 | 
149 | { #category : #converting }
150 | UTF8String >> asUTF8Bytes [
151 | 	"Return a ByteArray with my UTF8 encoded representation.
152 | 	The result is what #utf8EncodedBytes answers, my own bytes, not a copy."
153 | 	^ self utf8EncodedBytes
154 | ]
155 | 
156 | { #category : #converting }
157 | UTF8String >> asUTF8String [
158 | 	"Convert me to an UTF8String: I already am one, so answer myself"
159 | 	
160 | 	^ self
161 | ]
162 | 
163 | { #category : #converting }
164 | UTF8String >> asUppercase [
165 | 	"Return a copy of me with each character converted to uppercase.
166 | 	When I am empty, #collect: answers myself instead of a copy."
167 | 	^ self collect: [ :each | each asUppercase ]
168 | ]
169 | 
170 | { #category : #accessing }
171 | UTF8String >> at: characterIndex [
172 | 	"Answer the Character at characterIndex, a 1-based character (not byte) index.
173 | 	Signal CollectionIsEmpty when I am empty, SubscriptOutOfBounds when characterIndex is out of range."
174 | 	
175 | 	self emptyCheck; boundsCheck: characterIndex.
176 | 	^ self isAscii 
177 | 		ifTrue: [ (bytes at: characterIndex) asCharacter ]
178 | 		ifFalse: [ self encodedCharacterAt: characterIndex ]
179 | 	
180 | ]
181 | 
182 | { #category : #initialization }
183 | UTF8String >> beAscii [
184 | 	"Mark me as being pure 7-bit ASCII, which means each character is encoded as one byte.
185 | 	This is an important optimisation as it prevents counting, scanning and indexing.
186 | 	My count becomes my byte size, which is what #isAscii tests, and my index becomes empty. Nothing is verified."
187 | 	count := bytes size.
188 | 	index := Array empty
189 | ]
190 | 
191 | { #category : #testing }
192 | UTF8String >> beginsWith: prefixUTF8String [
193 | 	"Return true when I begin with prefixUTF8String, false otherwise.
194 | 	The test is done on the UTF8 encoded bytes of both of us."
195 | 	^ bytes beginsWith: prefixUTF8String utf8EncodedBytes
196 | ]
197 | 
198 | { #category : #private }
199 | UTF8String >> boundsCheck: characterIndex [
200 | 	"Signal SubscriptOutOfBounds unless characterIndex is between 1 and my character count"
201 | 	(characterIndex between: 1 and: self size) ifFalse: [ self errorOutOfBounds: characterIndex ]
202 | ]
203 | 
204 | { #category : #private }
205 | UTF8String >> byteIndexAt: characterIndex [	
206 | 	"Answer the byte index where the character at characterIndex starts.
207 | 	Start from the closest preceding index entry, then step over less than stride characters."
208 | 	| zeroBased cursor |
209 | 	self isAscii ifTrue: [ ^ characterIndex ].
210 | 	zeroBased := characterIndex - 1.
211 | 	cursor := zeroBased < self stride ifTrue: [ 1 ] ifFalse: [ index at: zeroBased // self stride ].
212 | 	zeroBased \\ self stride timesRepeat: [ cursor := cursor + (self encodedByteCountAt: cursor) ].
213 | 	^ cursor
214 | 	
215 | ]
216 | 
217 | { #category : #converting }
218 | UTF8String >> capitalized [
219 | 	"Return a copy of me with my first character converted to uppercase, all others unchanged"
220 | 	
221 | 	| pending |
222 | 	pending := true.
223 | 	^ self collect: [ :each | 
224 | 			pending
225 | 				ifFalse: [ each ]
226 | 				ifTrue: [ pending := false. each asUppercase ] ]
227 | ]
228 | 
229 | { #category : #private }
230 | UTF8String >> characterIndexAt: byteIndex [ "Answer the index of the character that starts at byteIndex, 0 when no character starts there"
231 | 	| characterIndex byteCursor |
232 | 	self isAscii ifTrue: [ ^ byteIndex ].
233 | 	characterIndex := byteCursor := 1.
234 | 	index ifNotEmpty: [  
235 | 		index 
236 | 			findBinaryIndex: [ :x | byteIndex - x ] 
237 | 			do: [ :found | ^ (found * self stride) + 1 ] "exact hit: entry found holds the start of character (found * stride) + 1"
238 | 			ifNone: [ :lower :upper | "presumably lower is the position of the closest entry below byteIndex, 0 when there is none - confirm against #findBinaryIndex:do:ifNone:"
239 | 				lower = 0 
240 | 			 		ifFalse: [ 
241 | 						byteCursor := index at: lower. 
242 | 						characterIndex := (lower * self stride) + 1 ] ] ].
243 | 	[ byteCursor <= bytes size ] whileTrue: [ "scan forward one character at a time from the starting point chosen above"
244 | 		byteCursor = byteIndex 
245 | 			ifTrue: [ ^ characterIndex ].
246 | 		byteCursor := byteCursor + (self encodedByteCountAt: byteCursor).
247 | 		characterIndex := characterIndex + 1 ].
248 | 	^ 0 "the scan never landed on byteIndex: it is not the first byte of one of my characters"
249 | ]
250 | 
251 | { #category : #enumerating }
252 | UTF8String >> collect: block [
253 | 	"Answer a new string with the results of applying block to each of my Characters, from first to last.
254 | 	When I am empty, block is not evaluated and I answer myself."
255 | 	^ self isEmpty
256 | 		ifTrue: [ self ]
257 | 		ifFalse: [ self class streamContents: [ :out | 
258 | 			self do: [ :each | out nextPut: (block value: each) ] ] ]
259 | ]
260 | 
261 | { #category : #private }
262 | UTF8String >> computeCountAndIndex [
263 | 	"Set my count and index. The number of characters cannot be larger than the byte size, so no index is built for at most stride bytes"
264 | 	self isEmpty ifTrue: [ count := 0. ^ self ].
265 | 	bytes size > self stride
266 | 		ifTrue: [ self computeCountAndIndexFull ]
267 | 		ifFalse: [ self computeCountNoIndex ]
268 | ]
269 | 
270 | { #category : #private }
271 | UTF8String >> computeCountAndIndexFull [
272 | 	"Count my characters in one pass, recording the byte index where characters stride + 1, (2 * stride) + 1, ... start"
273 | 	count := 0.
274 | 	index := Array streamContents: [ :out | | cursor |
275 | 		cursor := 1.
276 | 		[ cursor > bytes size ] whileFalse: [
277 | 			(count > 0 and: [ count \\ self stride = 0 ]) ifTrue: [ out nextPut: cursor ].
278 | 			count := count + 1.
279 | 			cursor := cursor + (self encodedByteCountAt: cursor) ] ]
280 | ]
281 | 
282 | { #category : #private }
283 | UTF8String >> computeCountNoIndex [
284 | 	"Count my characters in one pass, hopping from leading byte to leading byte; my index is set to empty"
285 | 	| cursor |
286 | 	index := Array empty.
287 | 	count := 0.
288 | 	cursor := 1.
289 | 	[ cursor > bytes size ] whileFalse: [
290 | 		count := count + 1. cursor := cursor + (self encodedByteCountAt: cursor) ]
291 | ]
292 | 
293 | { #category : #copying }
294 | UTF8String >> copyFrom: startIndex to: stopIndex [
295 | 	"Return a new string, a copy of me from startIndex to stopIndex, both inclusive character indices.
296 | 	Signal SubscriptOutOfBounds when either index is not between 1 and my size."
297 | 	
298 | 	| firstByte lastByte copy |
299 | 	self boundsCheck: startIndex; boundsCheck: stopIndex.
300 | 	firstByte := self byteIndexAt: startIndex.
301 | 	lastByte := self byteIndexAt: stopIndex.
302 | 	"extend to the last byte of the character at stopIndex"
303 | 	lastByte := lastByte + (self encodedByteCountAt: lastByte) - 1.
304 | 	copy := self class on: (bytes copyFrom: firstByte to: lastByte).
305 | 	self isAscii ifTrue: [ copy beAscii ].
306 | 	^ copy
307 | ]
308 | 
309 | { #category : #copying }
310 | UTF8String >> copyUpTo: aCharacter [
311 | 	"Answer a new string with my characters from the first one up to, but not including,
312 | 	the first occurrence of aCharacter; all my characters when aCharacter does not occur."
313 | 	
314 | 	| in |
315 | 	in := self readStream.
316 | 	^ self class streamContents: [ :out |
317 | 		[ in atEnd not and: [ in peek ~= aCharacter ] ] whileTrue: [ 
318 | 			out nextPut: in next ] ]
319 | ]
320 | 
321 | { #category : #private }
322 | UTF8String >> decodeCharacterAt: byteIndex [ "Decode and answer the Character whose encoding starts at byteIndex, a 1-based index in my bytes"
323 | 	^ self encoder nextFromStream: (bytes readStream position: byteIndex - 1)
324 | ]
325 | 
326 | { #category : #enumerating }
327 | UTF8String >> do: block [
328 | 	"Evaluate block with each of my Characters as argument, in order, from first to last"
329 | 	
330 | 	| in |
331 | 	self isEmpty ifFalse: [ 
332 | 		in := self readStream.
333 | 		[ in atEnd ] whileFalse: [ 
334 | 			block value: in next ] ]
335 | ]
336 | 
337 | { #category : #private }
338 | UTF8String >> emptyCheck [ "Signal CollectionIsEmpty (see #errorEmptyCollection) when I am empty"
339 | 	self isEmpty ifTrue: [ self errorEmptyCollection ]
340 | ]
341 | 
342 | { #category : #accessing }
343 | UTF8String >> encodedByteCount [
344 | 	"Return the byte count of my UTF8 encoded representation.
345 | 	This is the size of my bytes, no scanning is involved."
346 | 	^ bytes size
347 | ]
348 | 
349 | { #category : #private }
350 | UTF8String >> encodedByteCountAt: byteIndex [ "Answer how many bytes (1 to 4) the character starting at byteIndex takes, judging by its leading byte only"
351 | 	| byte |
352 | 	(byte := bytes at: byteIndex) < 128 ifTrue: [ ^ 1 ]. "0xxxxxxx"
353 | 	(byte bitAnd: 2r11100000) = 2r11000000 ifTrue: [ ^ 2 ]. "110xxxxx"
354 | 	(byte bitAnd: 2r11110000) = 2r11100000 ifTrue: [ ^ 3 ]. "1110xxxx"
355 | 	(byte bitAnd: 2r11111000) = 2r11110000 ifTrue: [ ^ 4 ]. "11110xxx"
356 | 	self errorIllegalLeadingByte  
357 | ]
358 | 
359 | { #category : #private }
360 | UTF8String >> encodedBytesFor: codePoint [
361 | 	"Answer a new ByteArray with the UTF8 encoding of the single code point codePoint"
362 | 	| utf8Encoder |
363 | 	utf8Encoder := self encoder.
364 | 	^ ByteArray new: (utf8Encoder encodedByteCountFor: codePoint) streamContents: [ :out | utf8Encoder nextPutCodePoint: codePoint toStream: out ]
365 | ]
366 | 
367 | { #category : #private }
368 | UTF8String >> encodedCharacterAt: characterIndex [	"Answer the Character at characterIndex: map it to a byte index, then decode from there"
369 | 	^ self decodeCharacterAt: (self byteIndexAt: characterIndex)
370 | ]
371 | 
372 | { #category : #private }
373 | UTF8String >> encoder [ "Answer the encoder that I use, the one of my class"
374 | 	^ self class encoder
375 | ]
376 | 
377 | { #category : #testing }
378 | UTF8String >> endsWith: suffixUTF8String [
379 | 	"Return true when I end with suffixUTF8String, false otherwise.
380 | 	The test is done on the UTF8 encoded bytes of both of us."
381 | 	^ bytes endsWith: suffixUTF8String utf8EncodedBytes
382 | ]
383 | 
384 | { #category : #'error handling' }
385 | UTF8String >> errorEmptyCollection [ "Signal CollectionIsEmpty with me as the collection"
386 | 	 CollectionIsEmpty signalWith: self 
387 | ]
388 | 
389 | { #category : #'error handling' }
390 | UTF8String >> errorIllegalContinuationByte [ "Delegate to my encoder to report an illegal UTF8 continuation byte"
391 | 	^ self encoder errorIllegalContinuationByte
392 | ]
393 | 
394 | { #category : #'error handling' }
395 | UTF8String >> errorIllegalLeadingByte [ "Delegate to my encoder to report an illegal UTF8 leading byte"
396 | 	^ self encoder errorIllegalLeadingByte
397 | ]
398 | 
399 | { #category : #'error handling' }
400 | UTF8String >> errorOutOfBounds: characterIndex [ "Signal SubscriptOutOfBounds for characterIndex, with 1 and my size as the legal bounds"
401 | 	SubscriptOutOfBounds 
402 | 		signalFor: characterIndex 
403 | 		lowerBound: 1 
404 | 		upperBound: self size 
405 | 		in: self 
406 | ]
407 | 
408 | { #category : #accessing }
409 | UTF8String >> findTokens: delimiters [
410 | 	"Answer an OrderedCollection with the tokens, substrings, found in me between delimiters.
411 | 	Delimiters is either a single character or a collection of characters.
412 | 	Delimiters are never part of a token and a run of delimiters counts as one, so no token is empty."
413 | 	
414 | 	| delimiterSet source atDelimiter |
415 | 	delimiterSet := delimiters isCharacter ifTrue: [ { delimiters } ] ifFalse: [ delimiters ].
416 | 	source := self readStream.
417 | 	atDelimiter := [ source atEnd not and: [ delimiterSet includes: source peek ] ].
418 | 	^ OrderedCollection streamContents: [ :tokens |
419 | 		[ source atEnd ] whileFalse: [ | token |
420 | 			token := self class streamContents: [ :out |
421 | 				[ source atEnd or: atDelimiter ] whileFalse: [ out nextPut: source next ] ].
422 | 			token isEmpty ifFalse: [ tokens nextPut: token ].
423 | 			"skip the run of delimiters that follows"
424 | 			atDelimiter whileTrue: [ source next ] ] ]
425 | ]
426 | 
427 | { #category : #accessing }
428 | UTF8String >> first [
429 | 	"Answer my first Character.
430 | 	Signal CollectionIsEmpty when I am empty."
431 | 
432 | 	self emptyCheck.
433 | 	self isAscii ifFalse: [ ^ self decodeCharacterAt: 1 ].
434 | 	"pure ASCII: my first byte is my first character"
435 | 	^ bytes first asCharacter
436 | ]
437 | 
438 | { #category : #formatting }
439 | UTF8String >> format: collection [ 
440 | 	"Format the receiver by interpolating elements from collection (see String>>#format:).
441 | 	{key} is replaced by (collection at: key) asString, key being an Integer when it reads as one, else the text itself; \ copies the next character literally."
442 | 	^ self class streamContents: [ :out | | in |
443 | 		in := self readStream.
444 | 		[ in atEnd ] whileFalse: [ | currentChar | 
445 | 			(currentChar := in next) == ${
446 | 				ifTrue: [ | expression key | 
447 | 					expression := in upTo: $}.
448 | 					key := Integer readFrom: expression ifFail: [ expression ].
449 | 					out nextPutAll: (collection at: key) asString asUTF8String ]
450 | 				ifFalse: [
451 | 					currentChar == $\
452 | 						ifTrue: [ in atEnd ifFalse: [ out nextPut: in next ] ]
453 | 						ifFalse: [ out nextPut: currentChar ] ] ] ]
454 | ]
455 | 
456 | { #category : #comparing }
457 | UTF8String >> hash [
458 | 	"Return my hash, the hash of my bytes. It depends on my contents, not on my identity,
459 | 	which keeps it consistent with #= that compares bytes."
460 | 	^ bytes hash
461 | ]
462 | 
463 | { #category : #testing }
464 | UTF8String >> includes: aCharacter [
465 | 	"Answer true when aCharacter occurs in me, false otherwise.
466 | 	When I am not pure ASCII, my bytes are searched for the UTF8 encoding of aCharacter."
467 | 	
468 | 	| value encoded |
469 | 	self isEmpty ifTrue: [ ^ false ].
470 | 	value := aCharacter asInteger.
471 | 	self isAscii ifTrue: [ 
472 | 		"a code point above 127 cannot occur in a pure ASCII string"
473 | 		^ value <= 127 
474 | 			and: [ bytes includes: value ] ].
475 | 	"UTF8 is self-synchronizing: the encoding of a character
476 | 	can only be found where a character actually starts"
477 | 	encoded := self encodedBytesFor: value.
478 | 	^ (bytes indexOfSubCollection: encoded startingAt: 1) ~= 0
479 | ]
480 | 
481 | { #category : #testing }
482 | UTF8String >> includesSubstring: aSubUTF8String [
483 | 	"Answer true when aSubUTF8String occurs in me, false otherwise. The search is done on bytes.
484 | 	When I am empty I contain nothing; otherwise I always contain the empty string."
485 | 	
486 | 	^ self isEmpty not and: [ 
487 | 		aSubUTF8String isEmpty or: [ 
488 | 			(bytes indexOfSubCollection: aSubUTF8String utf8EncodedBytes startingAt: 1) ~= 0 ] ]
489 | ]
490 | 
491 | { #category : #accessing }
492 | UTF8String >> indexOf: aCharacter [
493 | 	"Return the character index of the first occurrence of aCharacter in me.
494 | 	If I do not contain aCharacter, return 0. See #indexOf:startingAt:"
495 | 	
496 | 	^ self indexOf: aCharacter startingAt: 1
497 | ]
498 | 
499 | { #category : #accessing }
500 | UTF8String >> indexOf: aCharacter startingAt: characterOffset [
501 | 	"Return the character index of the first occurrence of aCharacter at or after characterOffset in me.
502 | 	Return 0, always an Integer, when there is no such occurrence, which includes the case that I am empty.
503 | 	Signal SubscriptOutOfBounds when I am not empty and characterOffset is not between 1 and my size."
504 | 	| codePoint found |
505 | 	self isEmpty ifTrue: [ ^ 0 ].
506 | 	self boundsCheck: characterOffset. 
507 | 	codePoint := aCharacter asInteger.
508 | 	^ self isAscii
509 | 		ifTrue: [ 
510 | 			codePoint > 127 
511 | 				ifTrue: [ 0 ] 
512 | 				ifFalse: [ bytes indexOf: codePoint startingAt: (self byteIndexAt: characterOffset) ] ] 
513 | 		ifFalse: [ 
514 | 			found := bytes 
515 | 				indexOfSubCollection: (self encodedBytesFor: codePoint)
516 | 				startingAt: (self byteIndexAt: characterOffset).
517 | 			found = 0 ifTrue: [ 0 ] ifFalse: [ self characterIndexAt: found ] ]
518 | ]
519 | 
520 | { #category : #accessing }
521 | UTF8String >> indexOfSubCollection: aSubUTF8String [
522 | 	"Answer the character index where aSubUTF8String first occurs in me, 0 when it does not occur.
523 | 	When I am empty the answer is 0; otherwise an empty aSubUTF8String is found at index 1."
524 | 	
525 | 	| matchStart |
526 | 	self isEmpty ifTrue: [ ^ 0 ].
527 | 	aSubUTF8String isEmpty ifTrue: [ ^ 1 ].
528 | 	matchStart := bytes indexOfSubCollection: aSubUTF8String utf8EncodedBytes startingAt: 1.
529 | 	matchStart = 0 ifTrue: [ ^ 0 ].
530 | 	"translate the byte index of the match to a character index"
531 | 	^ self characterIndexAt: matchStart
532 | ]
533 | 
534 | { #category : #testing }
535 | UTF8String >> isAscii [
536 | 	"Return true when I am pure 7-bit ASCII, which means each character is encoded as one byte,
537 | 	so that my character count equals my byte size. It is an important optimisation to know this upfront,
538 | 	see #beAscii, because otherwise my count (and index) must be computed here first."
539 | 	
540 | 	count ifNil: [ self computeCountAndIndex ].
541 | 	^ bytes size = count
542 | ]
543 | 
544 | { #category : #testing }
545 | UTF8String >> isByteString [
546 | 	"Return false: we are not compatible with a system ByteString, even though we hold bytes"
547 | 	
548 | 	^ false
549 | ]
550 | 
551 | { #category : #testing }
552 | UTF8String >> isEmpty [
553 | 	"Return true when I am empty, when I do not contain any characters.
554 | 	This is decided on my bytes alone, no counting is needed."
555 | 	^ bytes isEmpty
556 | ]
557 | 
558 | { #category : #accessing }
559 | UTF8String >> last [
560 | 	"Answer my last Character.
561 | 	Signal CollectionIsEmpty when I am empty."
562 | 
563 | 	| tail |
564 | 	self emptyCheck.
565 | 	self isAscii ifTrue: [ ^ bytes last asCharacter ].
566 | 	tail := bytes readStream setToEnd.
567 | 	"step back over my last encoded character, then decode it going forward"
568 | 	^ self encoder
569 | 		backOnStream: tail;
570 | 		nextFromStream: tail
571 | ]
572 | 
573 | { #category : #initialization }
574 | UTF8String >> on: utf8EncodedBytes [
575 | 	"Initialize me on utf8EncodedBytes, which I keep as is, without copying. No validation takes place. See #validate.
576 | 	This replaces my bytes: use the class side #on: to create a new string."
577 | 	bytes := utf8EncodedBytes 
578 | ]
579 | 
580 | { #category : #printing }
581 | UTF8String >> printDebugOn: stream [
582 | 	"Print a debug view of me on stream: at most stride // 2 characters, quoted with embedded quotes doubled, ... when truncated, #count when known"
583 | 	| max readStream |
584 | 	stream nextPutAll: 'a UTF8String('; nextPut: $'.
585 | 	max := self stride // 2.
586 | 	readStream := self readStream.
587 | 	[ readStream atEnd or: [ max = 0 ] ] whileFalse: [ | character |
588 | 		(character := readStream next) = $' ifTrue: [ stream nextPut: $' ].
589 | 		stream nextPut: character. max := max - 1 ].
590 | 	readStream atEnd ifFalse: [ stream nextPutAll: '...' ].
591 | 	stream nextPut: $'.
592 | 	count ifNotNil: [ stream space; nextPut: $#; print: count ].
593 | 	stream nextPut: $)
594 | ]
595 | 
596 | { #category : #printing }
597 | UTF8String >> printOn: stream [
598 | 	"Print me on stream as a single quoted string literal, with each embedded single quote doubled"
599 | 	stream nextPut: $'.
600 | 	self do: [ :each |
601 | 		"test my own character, not what can be peeked from the output stream"
602 | 		each = $' ifTrue: [ stream nextPut: $' ].
603 | 		stream nextPut: each ].
604 | 	stream nextPut: $'
605 | ]
606 | 
607 | { #category : #streaming }
608 | UTF8String >> putOn: aStream [ "Write me on aStream, by passing myself as a whole to its #nextPutAll:"
609 | 	aStream nextPutAll: self
610 | ]
611 | 
612 | { #category : #enumerating }
613 | UTF8String >> readStream [
614 | 	"Return a Character ReadStream to iterate over my contents in order, from first to last.
615 | 	It is a new ZnCharacterReadStream that decodes my bytes as it goes."
616 | 	^ ZnCharacterReadStream on: bytes readStream
617 | ]
618 | 
619 | { #category : #converting }
620 | UTF8String >> reverse [
621 | 	"Answer a new string with my characters in reverse order; when I am pure ASCII my bytes are simply reversed"
622 | 	self isAscii ifTrue: [ ^ (self class on: bytes reverse) beAscii ].
623 | 	^ self class streamContents: [ :out |
624 | 			self reverseDo: [ :each | out nextPut: each ] ]
625 | ]
626 | 
627 | { #category : #enumerating }
628 | UTF8String >> reverseDo: block [
629 | 	"Execute block with each of my Characters as argument, in reverse order, from last to first.
630 | 	Each round steps back over one encoded character and decodes it; the test steps back again and stops at position 0."
631 | 	| binaryReadStream |
632 | 	self isEmpty ifTrue: [ ^ self ].
633 | 	binaryReadStream := bytes readStream setToEnd.
634 | 	[ block value: (self encoder
635 | 		backOnStream: binaryReadStream;
636 | 		nextFromStream: binaryReadStream) ] doWhileTrue: [
637 | 			self encoder backOnStream: binaryReadStream.
638 | 			binaryReadStream position > 0 ]
639 | ]
640 | 
641 | { #category : #accessing }
642 | UTF8String >> size [
643 | 	"Answer how many characters I contain.
644 | 	Unless my count is already known, it is computed now, together with my index, and kept."
645 | 	
646 | 	self isEmpty ifTrue: [ ^ 0 ].
647 | 	^ count ifNil: [ self computeCountAndIndex. count ]
648 | ]
649 | 
650 | { #category : #private }
651 | UTF8String >> stride [
652 | 	"Return the stride, the constant I use internally to build my index:
653 | 	the number of characters between two consecutive index entries"
654 | 	^ 32
655 | ]
656 | 
657 | { #category : #comparing }
658 | UTF8String >> threeWayCompareTo: anotherUTF8String [
659 | 	"Compare me character by character with anotherUTF8String (see Magnitude>>#threeWayCompareTo:).
660 | 	Answer -1, 0 or 1 when I come before, am equal to or come after it."
661 | 	
662 | 	| mine theirs |
663 | 	mine := self readStream.
664 | 	theirs := anotherUTF8String readStream.
665 | 	[ mine atEnd not and: [ theirs atEnd not ] ] whileTrue: [ | a b |
666 | 		a := mine next.
667 | 		b := theirs next.
668 | 		a = b ifFalse: [ ^ a < b ifTrue: [ -1 ] ifFalse: [ 1 ] ] ].
669 | 	"no difference so far: whoever ran out of characters first, if any, comes first"
670 | 	mine atEnd ifFalse: [ ^ 1 ].
671 | 	^ theirs atEnd ifTrue: [ 0 ] ifFalse: [ -1 ]
672 | ]
673 | 
674 | { #category : #converting }
675 | UTF8String >> utf8Encoded [
676 | 	"Return a ByteArray with my UTF8 encoded representation.
677 | 	The result is what #utf8EncodedBytes answers, my own bytes, not a copy."
678 | 	^ self utf8EncodedBytes
679 | ]
680 | 
681 | { #category : #accessing }
682 | UTF8String >> utf8EncodedBytes [
683 | 	"Return a ByteArray with a UTF8 encoded representation of me.
684 | 	This is the ByteArray that I hold myself, not a copy: modifying it would modify me."
685 | 	^ bytes
686 | ]
687 | 
688 | { #category : #initialization }
689 | UTF8String >> validate [
690 | 	"Check that my bytes are correctly encoded UTF8 and answer true when they are, false otherwise.
691 | 	The check decodes everything, so my count and index get computed along the way.
692 | 	An empty receiver is valid and has a count of 0."
693 | 
694 | 	self isEmpty
695 | 		ifTrue: [ count := 0. ^ true ].
696 | 	"with at most stride bytes I hold at most stride characters, so the scan without index entries suffices"
697 | 	bytes size > self stride ifTrue: [ ^ self validateFullIndex ].
698 | 	^ self validateNoIndex
699 | ]
700 | 
701 | { #category : #private }
702 | UTF8String >> validateFullIndex [
703 | 	"Decode all my characters, setting count and adding the byte stream position to index right after reading characters stride + 1, 2 * stride + 1 and so on.
704 | 	Answer true when everything decodes. On an encoding error answer false, resetting count and index to nil so that no partial result stays cached (#size only recomputes a nil count)."
705 | 	count := 0.
706 | 	[ index := Array streamContents: [ :out | | readStream |
707 | 		readStream := self readStream.
708 | 		[ readStream atEnd ] whileFalse: [
709 | 			count := count + 1.
710 | 			readStream next.
711 | 			(count > self stride and: [ (count - 1 \\ self stride) = 0 ]) ifTrue: [ out nextPut: readStream wrappedStream position ] ] ]
712 | 	] on: ZnCharacterEncodingError do: [ count := nil. index := nil. ^ false ].
713 | 	^ true
714 | ]
715 | 
716 | { #category : #private }
717 | UTF8String >> validateNoIndex [
718 | 	"Decode all my characters, setting count and leaving index empty. Answer true when everything decodes.
719 | 	On an encoding error answer false, resetting count and index to nil so that no partial result stays cached (#size only recomputes a nil count)."
720 | 	count := 0.
721 | 	index := Array empty.
722 | 	[ | readStream |
723 | 		readStream := self readStream.
724 | 		[ readStream atEnd ] whileFalse: [ readStream next. count := count + 1 ]
725 | 	] on: ZnCharacterEncodingError do: [ count := nil. index := nil. ^ false ].
726 | 	^ true
727 | ]
728 | 


--------------------------------------------------------------------------------
/UTF8String/UTF8StringTest.class.st:
--------------------------------------------------------------------------------
  1 | "
  2 | I am UTF8StringTest, holding unit tests for UTF8String.
  3 | I am a TestCase. My accessing methods answer the system Strings that my tests use as fixtures.
  4 | "
  5 | Class {
  6 | 	#name : #UTF8StringTest,
  7 | 	#superclass : #TestCase,
  8 | 	#category : #UTF8String
  9 | }
 10 | 
 11 | { #category : #accessing }
 12 | UTF8StringTest >> firstString [
 13 | 	^ 'First string: élève Français à 10 € - 1' "fixture mixing ASCII with accented letters and the euro sign"
 14 | ]
 15 | 
 16 | { #category : #accessing }
 17 | UTF8StringTest >> normalAsciiString [
 18 | 	^ String loremIpsum "fixture that testNormalAscii expects to be pure ASCII"
 19 | ]
 20 | 
 21 | { #category : #accessing }
 22 | UTF8StringTest >> normalString [
 23 | 	^ 'Les élèves Français à 10 € - Les élèves Français à 10 € - 0123456789 - Les élèves Français à 10 € - Les élèves Français à 10 €' "long fixture: the smallString phrase four times plus a run of digits"
 24 | ]
 25 | 
 26 | { #category : #accessing }
 27 | UTF8StringTest >> secondString [
 28 | 	^ 'Second string: élève Français à 20 € - 2' "companion of firstString, the second operand in testConcatenation"
 29 | ]
 30 | 
 31 | { #category : #accessing }
 32 | UTF8StringTest >> smallAsciiString [
 33 | 	^ 'ABC123' "short fixture holding ASCII letters and digits only"
 34 | ]
 35 | 
 36 | { #category : #accessing }
 37 | UTF8StringTest >> smallString [
 38 | 	^ 'Les élèves Français à 10 €' "short fixture with accented letters and the euro sign"
 39 | ]
 40 | 
 41 | { #category : #tests }
 42 | UTF8StringTest >> testAsLowercase [
 43 | 	"Lowercasing a UTF8String must equal the UTF8String made from the lowercased system String"
 44 | 	self assert: (UTF8String fromSystemString: self smallString) asLowercase
 45 | 		equals: (UTF8String fromSystemString: self smallString asLowercase)
 46 | ]
 47 | 
 48 | { #category : #tests }
 49 | UTF8StringTest >> testAsUTF8String [
 50 | 	"asUTF8String must convert Strings and ByteArrays, and answer the receiver itself when sent to a UTF8String"
 51 | 	| fromBytes |
 52 | 	self assert: (UTF8String fromSystemString: self smallString) equals: self smallString asUTF8String.
 53 | 	"65 66 67 are the ASCII codes of A, B and C"
 54 | 	fromBytes := #[ 65 66 67 ] asUTF8String.
 55 | 	self assert: fromBytes equals: 'ABC' asUTF8String.
 56 | 	self assert: fromBytes identicalTo: fromBytes asUTF8String
 57 | ]
 58 | 
 59 | { #category : #tests }
 60 | UTF8StringTest >> testAsUppercase [
 61 | 	"Uppercasing a UTF8String must equal the UTF8String made from the uppercased system String"
 62 | 	self assert: (UTF8String fromSystemString: self smallString) asUppercase
 63 | 		equals: (UTF8String fromSystemString: self smallString asUppercase)
 64 | ]
 65 | 
 66 | { #category : #tests }
 67 | UTF8StringTest >> testAscii [
 68 | 	"isAscii must hold for pure ASCII content and for the empty string, and must not hold once a non-ASCII letter appears"
 69 | 	| marked |
 70 | 	self assert: 'ABC' asUTF8String isAscii.
 71 | 	"the same content again, this time after sending #beAscii"
 72 | 	marked := 'ABC' asUTF8String.
 73 | 	marked beAscii.
 74 | 	self assert: marked isAscii.
 75 | 	"one accented letter is enough to make a string non-ASCII"
 76 | 	self deny: 'DéF' asUTF8String isAscii.
 77 | 	self assert: Character alphabet asString asUTF8String isAscii.
 78 | 	self assert: UTF8String empty isAscii
 79 | ]
 80 | 
 81 | { #category : #tests }
 82 | UTF8StringTest >> testCapitalized [
 83 | 	"Capitalizing a UTF8String must equal the UTF8String made from the capitalized system String"
 84 | 	self assert: (UTF8String fromSystemString: self smallString) capitalized
 85 | 		equals: (UTF8String fromSystemString: self smallString capitalized)
 86 | ]
 87 | 
 88 | { #category : #tests }
 89 | UTF8StringTest >> testConcatenation [
 90 | 	"Concatenating two UTF8Strings must equal the UTF8String of the concatenated system Strings"
 91 | 	| expected |
 92 | 	expected := UTF8String fromSystemString: self firstString , self secondString.
 93 | 	self assert: (UTF8String fromSystemString: self firstString) , (UTF8String fromSystemString: self secondString)
 94 | 		equals: expected
 95 | ]
 96 | 
 97 | { #category : #tests }
 98 | UTF8StringTest >> testEmpty [
 99 | 	"The empty UTF8String must have size 0, refuse element access and equal every other empty instance"
100 | 	| blank |
101 | 	blank := UTF8String empty.
102 | 	self assert: blank isEmpty; assert: blank size equals: 0.
103 | 	self should: [ blank at: 1 ] raise: CollectionIsEmpty.
104 | 	self assert: blank equals: UTF8String empty.
105 | 	self assert: blank equals: (UTF8String on: #[]).
106 | 	self assert: blank utf8EncodedBytes equals: #[].
107 | 	self deny: blank equals: (UTF8String fromSystemString: 'ABC')
108 | ]
109 | 
110 | { #category : #tests }
111 | UTF8StringTest >> testEnumeration [
112 | 	"#do: and #reverseDo: must visit exactly the characters of the system String, forwards and backwards"
113 | 	| subject cursor |
114 | 	subject := UTF8String fromSystemString: self normalString.
115 | 	"forwards: cursor ends one past the last character when every character was visited"
116 | 	cursor := 1.
117 | 	subject do: [ :character |
118 | 		self assert: character equals: (self normalString at: cursor).
119 | 		cursor := cursor + 1 ].
120 | 	self assert: cursor equals: self normalString size + 1.
121 | 	"backwards: cursor ends at 0"
122 | 	cursor := self normalString size.
123 | 	subject reverseDo: [ :character | self assert: character equals: (self normalString at: cursor). cursor := cursor - 1 ].
124 | 	self assert: cursor equals: 0
125 | ]
126 | 
127 | { #category : #tests }
128 | UTF8StringTest >> testFindTokens [
129 | 	"findTokens: must split on a Character or on every character of a delimiter string, never answering empty tokens"
130 | 	| abCdEf |
131 | 	abCdEf := #('AB' 'CD' 'EF') collect: [ :each | each asUTF8String ].
132 | 	self assert: ('AB/CD/EF' asUTF8String findTokens: $/) asArray equals: abCdEf.
133 | 	"a delimiter at the end or at the start must not add an empty token"
134 | 	self assert: ('AB/CD/EF/' asUTF8String findTokens: $/) asArray equals: abCdEf.
135 | 	self
136 | 		assert: ('/AB/é€è/EF' asUTF8String findTokens: $/) asArray
137 | 		equals: (#('AB' 'é€è' 'EF') collect: [ :each | each asUTF8String ]).
138 | 	"delimiters passed as a UTF8String: each of its characters separates tokens"
139 | 	self assert: ('AB - CD - EF' asUTF8String findTokens: ' -' asUTF8String) asArray equals: abCdEf.
140 | 	"no delimiter present: the whole receiver is the single token"
141 | 	self
142 | 		assert: ('ABC' asUTF8String findTokens: '+-€' asUTF8String) asArray
143 | 		equals: { 'ABC' asUTF8String }.
144 | 	"nothing to split in the empty string"
145 | 	self assert: (UTF8String empty findTokens: '+-' asUTF8String) asArray
146 | 		equals: #()
147 | ]
148 | 
149 | { #category : #tests }
150 | UTF8StringTest >> testFirst [
151 | 	self assert: self firstString asUTF8String first equals: $F "firstString begins with 'First'"
152 | ]
153 | 
154 | { #category : #tests }
155 | UTF8StringTest >> testFormat [
156 | 	"format: must fill in positional {1} as well as named {one} placeholders around non-ASCII text"
157 | 	| expected |
158 | 	expected := '100 € ~= 200 €' asUTF8String.
159 | 	self assert: ('{1} € ~= {2} €' asUTF8String format: #(100 200)) equals: expected.
160 | 	self assert: ('{one} € ~= {two} €' asUTF8String format: { #one->100. #two->200 } asDictionary)
161 | 		equals: expected
162 | ]
163 | 
164 | { #category : #tests }
165 | UTF8StringTest >> testHashing [
166 | 	"UTF8Strings must be usable as Dictionary keys and Set elements, found again through equal instances"
167 | 	| tokens lookup unique |
168 | 	tokens := (String loremIpsum findTokens: ' .') , (self firstString findTokens: ' :-').
169 | 	tokens := tokens collect: [ :each | each asUTF8String ].
170 | 	lookup := Dictionary new.
171 | 	tokens do: [ :each | lookup at: each put: each ].
172 | 	{ tokens first. tokens last } do: [ :each | self assert: (lookup at: each) equals: each ].
173 | 	"an equal but separately created key must be found too"
174 | 	self assert: (lookup includesKey: 'Français' asUTF8String).
175 | 	"the same expectations for membership in a Set"
176 | 	unique := tokens asSet.
177 | 	{ tokens first. tokens last } do: [ :each | self assert: (unique includes: each) ].
178 | 	self assert: (unique includes: 'Français' asUTF8String)
179 | ]
180 | 
181 | { #category : #tests }
182 | UTF8StringTest >> testIncludes [
183 | 	"Every character of the system String must be reported as included in the UTF8String"
184 | 	| subject |
185 | 	subject := UTF8String fromSystemString: self normalString.
186 | 	self normalString do: [ :character | self assert: (subject includes: character) ]
187 | ]
188 | 
189 | { #category : #tests }
190 | UTF8StringTest >> testIncludesSubstring [
191 | 	"Every word of the system String must be found as a substring of the UTF8String"
192 | 	| subject |
193 | 	subject := UTF8String fromSystemString: self normalString.
194 | 	(self normalString findTokens: ' -') do: [ :word | self assert: (subject includesSubstring: (UTF8String fromSystemString: word)) ]
195 | ]
196 | 
197 | { #category : #tests }
198 | UTF8StringTest >> testIndexNormal [
199 | 	"characterIndexAt: must map the answer of byteIndexAt: back, for every character of the long fixture"
200 | 	| subject pairs |
201 | 	subject := UTF8String fromSystemString: self normalString.
202 | 	"collect all character index -> byte index associations first, then map each one back"
203 | 	pairs := (1 to: subject size) collect: [ :characterIndex |
204 | 		characterIndex -> (subject byteIndexAt: characterIndex) ].
205 | 	pairs do: [ :pair | self assert: (subject characterIndexAt: pair value) equals: pair key ]
206 | ]
207 | 
208 | { #category : #tests }
209 | UTF8StringTest >> testIndexOf [
210 | 	"indexOf: must answer the character index of the first occurrence, or 0 when the character is absent"
211 | 	| long short |
212 | 	long := UTF8String fromSystemString: self normalString.
213 | 	self assert: (long indexOf: $L) equals: 1.
214 | 	self assert: (long indexOf: $€) equals: 26. "the first euro sign is character 26, after several accented letters"
215 | 	self assert: (long indexOf: $A) equals: 0.
216 | 	short := UTF8String fromSystemString: 'áb'.
217 | 	self assert: (short indexOf: $á) equals: 1; assert: (short indexOf: $b) equals: 2.
218 | 	self assert: (short indexOf: $X) equals: 0
219 | ]
220 | 
221 | { #category : #tests }
222 | UTF8StringTest >> testIndexSmall [
223 | 	"characterIndexAt: must map the answer of byteIndexAt: back, for every character of the short fixture"
224 | 	| subject pairs |
225 | 	subject := UTF8String fromSystemString: self smallString.
226 | 	"collect all character index -> byte index associations first, then map each one back"
227 | 	pairs := (1 to: subject size) collect: [ :characterIndex |
228 | 		characterIndex -> (subject byteIndexAt: characterIndex) ].
229 | 	pairs do: [ :pair | self assert: (subject characterIndexAt: pair value) equals: pair key ]
230 | ]
231 | 
232 | { #category : #tests }
233 | UTF8StringTest >> testLast [
234 | 	self assert: self firstString asUTF8String last equals: $1 "firstString ends with '- 1'"
235 | ]
236 | 
237 | { #category : #tests }
238 | UTF8StringTest >> testNormal [
239 | 	"The long non-ASCII fixture must keep its size and every character when held as a UTF8String"
240 | 	| subject |
241 | 	subject := UTF8String fromSystemString: self normalString.
242 | 	self assert: subject size equals: self normalString size.
243 | 	(1 to: subject size) do: [ :position |
244 | 		self assert: (subject at: position) equals: (self normalString at: position) ].
245 | 	self deny: subject isAscii; deny: subject isEmpty
246 | ]
247 | 
248 | { #category : #tests }
249 | UTF8StringTest >> testNormalAscii [
250 | 	"The long ASCII fixture must keep its size and every character, and be recognized as ASCII"
251 | 	| subject |
252 | 	subject := UTF8String fromSystemString: self normalAsciiString.
253 | 	self assert: subject size equals: self normalAsciiString size.
254 | 	(1 to: subject size) do: [ :position | self assert: (subject at: position) equals: (self normalAsciiString at: position) ].
255 | 	self assert: subject isAscii.
256 | 	self deny: subject isEmpty
257 | ]
258 | 
259 | { #category : #tests }
260 | UTF8StringTest >> testReverse [
261 | 	"Reversing a UTF8String must equal the UTF8String of the reversed system String"
262 | 	| forward backward |
263 | 	forward := UTF8String fromSystemString: self normalString.
264 | 	backward := UTF8String fromSystemString: self normalString reverse.
265 | 	self assert: forward reverse equals: backward
266 | ]
267 | 
268 | { #category : #tests }
269 | UTF8StringTest >> testSmall [
270 | 	"The short non-ASCII fixture must keep its size and every character when held as a UTF8String"
271 | 	| subject |
272 | 	subject := UTF8String fromSystemString: self smallString.
273 | 	self assert: subject size equals: self smallString size.
274 | 	(1 to: subject size) do: [ :position | self assert: (subject at: position) equals: (self smallString at: position) ].
275 | 	self deny: subject isAscii
276 | ]
277 | 
278 | { #category : #tests }
279 | UTF8StringTest >> testSmallAscii [
280 | 	"The short ASCII fixture must keep its size and every character, and be recognized as ASCII"
281 | 	| subject |
282 | 	subject := UTF8String fromSystemString: self smallAsciiString.
283 | 	self assert: subject size equals: self smallAsciiString size.
284 | 	(1 to: subject size) do: [ :position | self assert: (subject at: position) equals: (self smallAsciiString at: position) ].
285 | 	self assert: subject isAscii
286 | ]
287 | 
288 | { #category : #tests }
289 | UTF8StringTest >> testSorting [
290 | 	"Sorting UTF8Strings must give the same order as sorting the corresponding system Strings"
291 | 	| tokens converted |
292 | 	tokens := (String loremIpsum findTokens: ' .') , (self firstString findTokens: ' :-').
293 | 	converted := tokens collect: [ :each | each asUTF8String ].
294 | 	self assert: converted sorted equals: (tokens sorted collect: [ :each | each asUTF8String ]).
295 | 	"the same must hold for descending order"
296 | 	self assert: (converted sorted: #yourself descending)
297 | 		equals: ((tokens sorted: #yourself descending) collect: [ :each | each asUTF8String ])
298 | ]
299 | 
300 | { #category : #tests }
301 | UTF8StringTest >> testStreamContents [
302 | 	"A UTF8String assembled character by character through a stream must equal the directly converted one"
303 | 	| expected assembled |
304 | 	expected := UTF8String fromSystemString: self normalString.
305 | 	assembled := UTF8String streamContents: [ :out | self normalString do: [ :character | out nextPut: character ] ].
306 | 	self assert: assembled equals: expected
307 | ]
308 | 
309 | { #category : #tests }
310 | UTF8StringTest >> testValidate [
311 | 	"validate must accept every fixture string and reject byte sequences that are not well-formed UTF8"
312 | 	#( firstString secondString normalString normalAsciiString smallString smallAsciiString ) do: [ :fixture |
313 | 		self assert: (UTF8String fromSystemString: (self perform: fixture)) validate ].
314 | 	{ #[235 140 4]. #[255]. #[255 128] } do: [ :malformed | self deny: (UTF8String on: malformed) validate ]
315 | ]
316 | 


--------------------------------------------------------------------------------
/UTF8String/package.st:
--------------------------------------------------------------------------------
1 | Package { #name : #UTF8String }
2 | 


--------------------------------------------------------------------------------