├── composer.json
├── LICENSE
├── README.md
├── characters.json
└── gpt3-encoder.php

/composer.json:
--------------------------------------------------------------------------------
1 | {
2 |     "name": "coderevolutionplugins/gpt-3-encoder-php",
3 |     "description": "PHP BPE Text Encoder/Decoder for GPT-2 / GPT-3",
4 |     "keywords": [
5 |         "php",
6 |         "encoder",
7 |         "decoder",
8 |         "gpt-2",
9 |         "gpt-2-simple",
10 |         "gpt-3",
11 |         "gpt-3-prompts",
12 |         "gpt-2-text-generation",
13 |         "gpt-3-text-generation",
14 |         "gpt-3-prompt",
15 |         "gpt-3-tokenizer",
16 |         "gpt-2-tokenizer",
17 |         "gpt3-encoder",
18 |         "gpt3-decoder"
19 |     ],
20 |     "license": "MIT",
21 |     "authors": [
22 |         {
23 |             "name": "CodeRevolution",
24 |             "homepage": "https://coderevolution.ro/"
25 |         }
26 |     ],
27 |     "require": {
28 |         "php": ">=5.6"
29 |     },
30 |     "support": {
31 |         "issues": "https://github.com/CodeRevolutionPlugins/GPT-3-Encoder-PHP/issues",
32 |         "wiki": "https://github.com/CodeRevolutionPlugins/GPT-3-Encoder-PHP/wiki",
33 |         "source": "https://github.com/CodeRevolutionPlugins/GPT-3-Encoder-PHP"
34 |     }
35 | }
36 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2022 Szabolcs-Istvan Kisded
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # GPT-3-Encoder-Decoder-PHP
2 | PHP BPE Text Encoder/Decoder for GPT-2 / GPT-3
3 | 
4 | ## About
5 | GPT-2 and GPT-3 use byte pair encoding to turn text into a series of integers to feed into the model. This is a PHP implementation of OpenAI's original Python encoder and decoder, which can be found [here](https://github.com/openai/gpt-2). The main source of inspiration for writing this port was the Node.js version of the encoder, found [here](https://github.com/latitudegames/GPT-3-Encoder).
6 | 
7 | You can verify the results by comparing the output of this script with the [official tokenizer page from OpenAI](https://beta.openai.com/tokenizer).
8 | 
9 | This encoder/decoder is used in the [Aiomatic WordPress plugin](https://1.envato.market/aiomatic) to count the number of tokens a string will use when sent to the OpenAI API. Check out more of my work on my [website](https://coderevolution.ro/).
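10 | 
11 | For example, to check how many tokens a prompt will consume before sending it to the API (a minimal sketch; the 4000-token limit below is only an illustrative number, not a real model constant):
12 | 
13 | ```php
14 | require_once 'gpt3-encoder.php';
15 | 
16 | $prompt = 'How many tokens will this prompt use?';
17 | $token_count = count(gpt_encode($prompt)); // each array entry is one BPE token
18 | if ($token_count > 4000) { // illustrative limit only
19 |     // trim or split the prompt before sending it to the API
20 | }
21 | ```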
22 | 
23 | ## Usage
24 | 
25 | The mbstring PHP extension is needed for this tool to work correctly when non-ASCII characters are present in the tokenized text: [details on how to install mbstring](https://www.php.net/manual/en/mbstring.installation.php). Note that gpt3-encoder.php loads its data files (characters.json, encoder.json and vocab.bpe) from its own directory, so keep them next to it.
26 | 
27 | ```php
28 | require_once 'gpt3-encoder.php';
29 | 
30 | $prompt = "Many words map to one token, but some don't: indivisible. Unicode characters like emojis may be split into many tokens containing the underlying bytes: 🤚🏾 Sequences of characters commonly found next to each other may be grouped together: 1234567890";
31 | 
32 | $token_array = gpt_encode($prompt);
33 | 
34 | $original_text = gpt_decode($token_array);
35 | ```
36 | 
--------------------------------------------------------------------------------
/characters.json:
--------------------------------------------------------------------------------
1 | {"0":"Ā","1":"ā","2":"Ă","3":"ă","4":"Ą","5":"ą","6":"Ć","7":"ć","8":"Ĉ","9":"ĉ","10":"Ċ","11":"ċ","12":"Č","13":"č","14":"Ď","15":"ď","16":"Đ","17":"đ","18":"Ē","19":"ē","20":"Ĕ","21":"ĕ","22":"Ė","23":"ė","24":"Ę","25":"ę","26":"Ě","27":"ě","28":"Ĝ","29":"ĝ","30":"Ğ","31":"ğ","32":"Ġ","33":"!","34":"\"","35":"#","36":"$","37":"%","38":"&","39":"'","40":"(","41":")","42":"*","43":"+","44":",","45":"-","46":".","47":"/","48":"0","49":"1","50":"2","51":"3","52":"4","53":"5","54":"6","55":"7","56":"8","57":"9","58":":","59":";","60":"<","61":"=","62":">","63":"?","64":"@","65":"A","66":"B","67":"C","68":"D","69":"E","70":"F","71":"G","72":"H","73":"I","74":"J","75":"K","76":"L","77":"M","78":"N","79":"O","80":"P","81":"Q","82":"R","83":"S","84":"T","85":"U","86":"V","87":"W","88":"X","89":"Y","90":"Z","91":"[","92":"\\","93":"]","94":"^","95":"_","96":"`","97":"a","98":"b","99":"c","100":"d","101":"e","102":"f","103":"g","104":"h","105":"i","106":"j","107":"k","108":"l","109":"m","110":"n","111":"o","112":"p","113":"q","114":"r","115":"s","116":"t","117":"u","118":"v","119":"w","120":"x","121":"y","122":"z","123":"{","124":"|","125":"}","126":"~","127":"ġ","128":"Ģ","129":"ģ","130":"Ĥ","131":"ĥ","132":"Ħ","133":"ħ","134":"Ĩ","135":"ĩ","136":"Ī","137":"ī","138":"Ĭ","139":"ĭ","140":"Į","141":"į","142":"İ","143":"ı","144":"IJ","145":"ij","146":"Ĵ","147":"ĵ","148":"Ķ","149":"ķ","150":"ĸ","151":"Ĺ","152":"ĺ","153":"Ļ","154":"ļ","155":"Ľ","156":"ľ","157":"Ŀ","158":"ŀ","159":"Ł","160":"ł","161":"¡","162":"¢","163":"£","164":"¤","165":"¥","166":"¦","167":"§","168":"¨","169":"©","170":"ª","171":"«","172":"¬","173":"Ń","174":"®","175":"¯","176":"°","177":"±","178":"²","179":"³","180":"´","181":"µ","182":"¶","183":"·","184":"¸","185":"¹","186":"º","187":"»","188":"¼","189":"½","190":"¾","191":"¿","192":"À","193":"Á","194":"Â","195":"Ã","196":"Ä","197":"Å","198":"Æ","199":"Ç","200":"È","201":"É","202":"Ê","203":"Ë","204":"Ì","205":"Í","206":"Î","207":"Ï","208":"Ð","209":"Ñ","210":"Ò","211":"Ó","212":"Ô","213":"Õ","214":"Ö","215":"×","216":"Ø","217":"Ù","218":"Ú","219":"Û","220":"Ü","221":"Ý","222":"Þ","223":"ß","224":"à","225":"á","226":"â","227":"ã","228":"ä","229":"å","230":"æ","231":"ç","232":"è","233":"é","234":"ê","235":"ë","236":"ì","237":"í","238":"î","239":"ï","240":"ð","241":"ñ","242":"ò","243":"ó","244":"ô","245":"õ","246":"ö","247":"÷","248":"ø","249":"ù","250":"ú","251":"û","252":"ü","253":"ý","254":"þ","255":"ÿ"}
--------------------------------------------------------------------------------
/gpt3-encoder.php:
--------------------------------------------------------------------------------
1 | <?php
2 | function gpt_utf8_encode($str) // ISO-8859-1 to UTF-8, same technique as the classic utf8_encode() polyfill
3 | {
4 |     $str .= $str; // append a copy: read from the second half while rewriting the first half in place
5 |     $len = \strlen($str);
6 | 
7 |     for ($i = $len >> 1, $j = 0; $i < $len; ++$i, ++$j) {
8 |         switch (true) {
9 |             case $str[$i] < "\x80": $str[$j] = $str[$i]; break;
10 |             case $str[$i] < "\xC0": $str[$j] = "\xC2"; $str[++$j] = $str[$i]; break;
11 |             default: $str[$j] = "\xC3"; $str[++$j] = \chr(\ord($str[$i]) - 64); break;
12 |         }
13 |     }
14 |     return substr($str, 0, $j);
15 | }
16 | function gpt_encode($text) // turns a string into an array of BPE token ids
17 | {
18 |     $bpe_tokens = array();
19 |     if($text === null || $text === '') // strict check so the string "0" still gets encoded
20 |     {
21 |         return $bpe_tokens;
22 |     }
23 |     $raw_chars = file_get_contents(dirname(__FILE__) . "/characters.json");
24 |     $byte_encoder = json_decode($raw_chars, true); // byte value => printable stand-in character
25 |     if(empty($byte_encoder))
26 |     {
27 |         error_log('Failed to load characters.json: ' . $raw_chars);
28 |         return $bpe_tokens;
29 |     }
30 |     $rencoder = file_get_contents(dirname(__FILE__) . "/encoder.json");
31 |     $encoder = json_decode($rencoder, true); // BPE string => token id
32 |     if(empty($encoder))
33 |     {
34 |         error_log('Failed to load encoder.json: ' . $rencoder);
35 |         return $bpe_tokens;
36 |     }
37 | 
38 |     $bpe_file = file_get_contents(dirname(__FILE__) . "/vocab.bpe");
39 |     if(empty($bpe_file))
40 |     {
41 |         error_log('Failed to load vocab.bpe');
42 |         return $bpe_tokens;
43 |     }
44 | 
45 |     preg_match_all("#'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+#u", $text, $matches); // GPT-2/GPT-3 pre-tokenization regex
46 |     if(!isset($matches[0]) || count($matches[0]) == 0)
47 |     {
48 |         error_log('Failed to match string: ' . $text);
49 |         return $bpe_tokens;
50 |     }
51 |     $lines = preg_split('/\r\n|\r|\n/', $bpe_file);
52 |     $bpe_merges = array();
53 |     $bpe_merges_temp = array_slice($lines, 1, count($lines), true); // skip the version header line
54 |     foreach($bpe_merges_temp as $bmt)
55 |     {
56 |         $split_bmt = preg_split('#(\s+)#', $bmt);
57 |         $split_bmt = array_filter($split_bmt, 'gpt_my_filter');
58 |         if(count($split_bmt) > 0)
59 |         {
60 |             $bpe_merges[] = $split_bmt;
61 |         }
62 |     }
63 |     $bpe_ranks = gpt_dictZip($bpe_merges, range(0, count($bpe_merges) - 1)); // "first,second" => merge priority
64 | 
65 |     $cache = array();
66 |     foreach($matches[0] as $token)
67 |     {
68 |         $new_tokens = array();
69 |         $chars = array();
70 |         $token = gpt_utf8_encode($token);
71 |         if(function_exists('mb_strlen'))
72 |         {
73 |             $len = mb_strlen($token, 'UTF-8');
74 |             for ($i = 0; $i < $len; $i++)
75 |             {
76 |                 $chars[] = mb_substr($token, $i, 1, 'UTF-8');
77 |             }
78 |         }
79 |         else
80 |         {
81 |             $chars = str_split($token);
82 |         }
83 |         $result_word = '';
84 |         foreach($chars as $char)
85 |         {
86 |             if(isset($byte_encoder[gpt_unichr($char)]))
87 |             {
88 |                 $result_word .= $byte_encoder[gpt_unichr($char)];
89 |             }
90 |         }
91 |         $new_tokens_bpe = gpt_bpe($result_word, $bpe_ranks, $cache);
92 |         $new_tokens_bpe = explode(' ', $new_tokens_bpe);
93 |         foreach($new_tokens_bpe as $x)
94 |         {
95 |             if(isset($encoder[$x]))
96 |             {
97 |                 if(isset($new_tokens[$x])) // duplicate BPE string: salt the key so the token is not overwritten
98 |                 {
99 |                     $new_tokens[rand() . '---' . $x] = $encoder[$x];
100 |                 }
101 |                 else
102 |                 {
103 |                     $new_tokens[$x] = $encoder[$x];
104 |                 }
105 |             }
106 |             else
107 |             {
108 |                 if(isset($new_tokens[$x]))
109 |                 {
110 |                     $new_tokens[rand() . '---' . $x] = $x;
111 |                 }
112 |                 else
113 |                 {
114 |                     $new_tokens[$x] = $x;
115 |                 }
116 |             }
117 |         }
118 |         foreach($new_tokens as $ninx => $nval)
119 |         {
120 |             if(isset($bpe_tokens[$ninx]))
121 |             {
122 |                 $bpe_tokens[rand() . '---' . $ninx] = $nval;
123 |             }
124 |             else
125 |             {
126 |                 $bpe_tokens[$ninx] = $nval;
127 |             }
128 |         }
129 |     }
130 |     return $bpe_tokens;
131 | }
132 | 
133 | function gpt_decode($tokens)
134 | {
135 |     $rencoder = file_get_contents(dirname(__FILE__) . "/encoder.json");
136 |     $encoder = json_decode($rencoder, true);
137 |     if(empty($encoder))
138 |     {
139 |         error_log('Failed to load encoder.json: ' . $rencoder);
140 |         return false;
141 |     }
142 |     $decoder = array();
143 |     foreach($encoder as $index => $val)
144 |     {
145 |         $decoder[$val] = $index; // token id => BPE string
146 |     }
147 |     $raw_chars = file_get_contents(dirname(__FILE__) . "/characters.json");
148 |     $byte_encoder = json_decode($raw_chars, true);
149 |     if(empty($byte_encoder))
150 |     {
151 |         error_log('Failed to load characters.json: ' . $raw_chars);
152 |         return false;
153 |     }
154 |     $byte_decoder = array();
155 |     foreach($byte_encoder as $index => $val)
156 |     {
157 |         $byte_decoder[$val] = $index; // stand-in character => original byte value
158 |     }
159 |     $text = '';
160 |     $mych_arr = [];
161 |     foreach($tokens as $myt)
162 |     {
163 |         if(isset($decoder[$myt]))
164 |         {
165 |             $mych_arr[] = $decoder[$myt];
166 |         }
167 |         else
168 |         {
169 |             error_log('Token not found in decoder: ' . $myt);
170 |         }
171 |     }
172 |     $text = implode('', $mych_arr);
173 |     $text_arr = preg_split('//u', $text, -1, PREG_SPLIT_NO_EMPTY);
174 |     $final_arr = array();
175 |     foreach($text_arr as $txa)
176 |     {
177 |         if(isset($byte_decoder[$txa]))
178 |         {
179 |             $final_arr[] = $byte_decoder[$txa];
180 |         }
181 |         else
182 |         {
183 |             error_log('Character not found in byte_decoder: ' . $txa);
184 |         }
185 |     }
186 |     $output = '';
187 |     for ($i = 0, $j = count($final_arr); $i < $j; ++$i) {
188 |         $output .= chr($final_arr[$i]); // reassemble the raw UTF-8 byte stream
189 |     }
190 |     return $output;
191 | }
192 | function gpt_my_filter($var)
193 | {
194 |     return ($var !== NULL && $var !== FALSE && $var !== '');
195 | }
196 | 
197 | function gpt_unichr($c) // Unicode code point of the first UTF-8 character in $c
198 | {
199 |     if (ord($c[0]) >= 0 && ord($c[0]) <= 127)
200 |     {
201 |         return ord($c[0]);
202 |     }
203 |     if (ord($c[0]) >= 192 && ord($c[0]) <= 223)
204 |     {
205 |         return (ord($c[0])-192)*64 + (ord($c[1])-128);
206 |     }
207 |     if (ord($c[0]) >= 224 && ord($c[0]) <= 239)
208 |     {
209 |         return (ord($c[0])-224)*4096 + (ord($c[1])-128)*64 + (ord($c[2])-128);
210 |     }
211 |     if (ord($c[0]) >= 240 && ord($c[0]) <= 247)
212 |     {
213 |         return (ord($c[0])-240)*262144 + (ord($c[1])-128)*4096 + (ord($c[2])-128)*64 + (ord($c[3])-128);
214 |     }
215 |     if (ord($c[0]) >= 248 && ord($c[0]) <= 251)
216 |     {
217 |         return (ord($c[0])-248)*16777216 + (ord($c[1])-128)*262144 + (ord($c[2])-128)*4096 + (ord($c[3])-128)*64 + (ord($c[4])-128);
218 |     }
219 |     if (ord($c[0]) >= 252 && ord($c[0]) <= 253)
220 |     {
221 |         return (ord($c[0])-252)*1073741824 + (ord($c[1])-128)*16777216 + (ord($c[2])-128)*262144 + (ord($c[3])-128)*4096 + (ord($c[4])-128)*64 + (ord($c[5])-128);
222 |     }
223 |     if (ord($c[0]) >= 254 && ord($c[0]) <= 255)
224 |     {
225 |         return 0; // invalid UTF-8 lead byte
226 |     }
227 |     return 0;
228 | }
229 | function gpt_dictZip($x, $y) // maps each "first,second" pair in $x to its merge rank; $y is unused and kept for signature parity
230 | {
231 |     $result = array();
232 |     $cnt = 0;
233 |     foreach($x as $i)
234 |     {
235 |         if(isset($i[1]) && isset($i[0]))
236 |         {
237 |             $result[$i[0] . ',' . $i[1]] = $cnt;
238 |             $cnt++;
239 |         }
240 |     }
241 |     return $result;
242 | }
243 | function gpt_get_pairs($word) // returns all adjacent symbol pairs in $word
244 | {
245 |     $pairs = array();
246 |     $prev_char = $word[0];
247 |     for ($i = 1; $i < count($word); $i++)
248 |     {
249 |         $char = $word[$i];
250 |         $pairs[] = array($prev_char, $char);
251 |         $prev_char = $char;
252 |     }
253 |     return $pairs;
254 | }
255 | function gpt_split($str, $len = 1) // multibyte-safe str_split()
256 | {
257 |     $arr = [];
258 |     if(function_exists('mb_strlen'))
259 |     {
260 |         $length = mb_strlen($str, 'UTF-8');
261 |     }
262 |     else
263 |     {
264 |         $length = strlen($str);
265 |     }
266 | 
267 |     for ($i = 0; $i < $length; $i += $len)
268 |     {
269 |         if(function_exists('mb_substr'))
270 |         {
271 |             $arr[] = mb_substr($str, $i, $len, 'UTF-8');
272 |         }
273 |         else
274 |         {
275 |             $arr[] = substr($str, $i, $len);
276 |         }
277 |     }
278 |     return $arr;
279 | }
280 | function gpt_bpe($token, $bpe_ranks, &$cache) // applies the BPE merge rules to one pre-token; returns its BPE strings separated by spaces
281 | {
282 |     if(array_key_exists($token, $cache))
283 |     {
284 |         return $cache[$token];
285 |     }
286 |     $word = gpt_split($token);
287 |     $init_len = count($word);
288 |     $pairs = gpt_get_pairs($word);
289 |     if(!$pairs)
290 |     {
291 |         return $token;
292 |     }
293 |     while (true)
294 |     {
295 |         $minPairs = array();
296 |         foreach($pairs as $pair)
297 |         {
298 |             if(array_key_exists($pair[0] . ',' . $pair[1], $bpe_ranks))
299 |             {
300 |                 $rank = $bpe_ranks[$pair[0] . ',' . $pair[1]];
301 |                 $minPairs[$rank] = $pair;
302 |             }
303 |             else
304 |             {
305 |                 $minPairs[10e10] = $pair; // unknown pair: treat its rank as effectively infinite
306 |             }
307 |         }
308 |         ksort($minPairs);
309 |         reset($minPairs); // the lowest-rank pair is now first
310 |         $min_key = key($minPairs); // key() instead of array_key_first(), which needs PHP 7.3 while composer.json allows 5.6
311 |         $bigram = $minPairs[$min_key];
312 |         if(!array_key_exists($bigram[0] . ',' . $bigram[1], $bpe_ranks))
313 |         {
314 |             break; // best candidate is not a known merge: we are done
315 |         }
316 |         $first = $bigram[0];
317 |         $second = $bigram[1];
318 |         $new_word = array();
319 |         $i = 0;
320 |         while ($i < count($word))
321 |         {
322 |             $j = gpt_indexOf($word, $first, $i);
323 |             if ($j === -1) // no further occurrence of $first: copy the rest of the word
324 |             {
325 |                 $new_word = array_merge($new_word, array_slice($word, $i, null, true));
326 |                 break;
327 |             }
328 |             if($i > $j || $j == 0)
329 |             {
330 |                 $slicer = array();
331 |             }
332 |             else
333 |             {
334 |                 $slicer = array_slice($word, $i, $j - $i, true);
335 |             }
336 |             $new_word = array_merge($new_word, $slicer);
337 |             if(count($new_word) > $init_len) // safety guard: merging should never grow the word
338 |             {
339 |                 break;
340 |             }
341 |             $i = $j;
342 |             if ($word[$i] === $first && $i < count($word) - 1 && $word[$i + 1] === $second)
343 |             {
344 |                 array_push($new_word, $first . $second); // merge the bigram into one symbol
345 |                 $i = $i + 2;
346 |             }
347 |             else
348 |             {
349 |                 array_push($new_word, $word[$i]);
350 |                 $i = $i + 1;
351 |             }
352 |         }
353 |         if($word == $new_word) // nothing merged this pass
354 |         {
355 |             break;
356 |         }
357 |         $word = $new_word;
358 |         if (count($word) === 1)
359 |         {
360 |             break;
361 |         }
362 |         else
363 |         {
364 |             $pairs = gpt_get_pairs($word);
365 |         }
366 |     }
367 |     $word = implode(' ', $word);
368 |     $cache[$token] = $word;
369 |     return $word;
370 | }
371 | function gpt_indexOf($arr, $searchElement, $fromIndex) // like JavaScript's Array.indexOf(): first key >= $fromIndex whose value matches
372 | {
373 |     foreach($arr as $index => $value)
374 |     {
375 |         if($index < $fromIndex)
376 |         {
377 |             continue;
378 |         }
379 |         if($value == $searchElement)
380 |         {
381 |             return $index;
382 |         }
383 |     }
384 |     return -1;
385 | }
386 | 
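387 | // Illustrative convenience wrapper (not part of the original library): counts
388 | // how many BPE tokens a string would use, e.g. to check a prompt against a
389 | // model's context limit before calling the OpenAI API.
390 | function gpt_token_count($text)
391 | {
392 |     return count(gpt_encode($text));
393 | }
394 | 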
395 | // Demo: runs only when this file is executed directly from the command line,
396 | // so that merely require-ing the library does not write to the error log.
397 | if (PHP_SAPI === 'cli' && isset($argv[0]) && realpath($argv[0]) === __FILE__) {
398 |     $prompt = "Many words map to one token, but some don't: indivisible. Unicode characters like emojis may be split into many tokens containing the underlying bytes: 🤚🏾 Sequences of characters commonly found next to each other may be grouped together: 1234567890";
399 |     $token_array = gpt_encode($prompt);
400 |     error_log('Token array: ' . print_r($token_array, true));
401 |     error_log('Count: ' . count($token_array));
402 |     $original_text = gpt_decode($token_array);
403 |     error_log('Original text: ' . $original_text);
404 | }
405 | 
--------------------------------------------------------------------------------