├── composer.json
├── LICENSE
├── README.md
├── characters.json
└── gpt3-encoder.php

/composer.json:
--------------------------------------------------------------------------------
1 | {
2 |     "name": "coderevolutionplugins/gpt-3-encoder-php",
3 |     "description": "PHP BPE Text Encoder/Decoder for GPT-2 / GPT-3",
4 |     "keywords": [
5 |         "php",
6 |         "encoder",
7 |         "decoder",
8 |         "gpt-2",
9 |         "gpt-2-simple",
10 |         "gpt-3",
11 |         "gpt-3-prompts",
12 |         "gpt-2-text-generation",
13 |         "gpt-3-text-generation",
14 |         "gpt-3-prompt",
15 |         "gpt-3-tokenizer",
16 |         "gpt-2-tokenizer",
17 |         "gpt3-encoder",
18 |         "gpt3-decoder"
19 |     ],
20 |     "license": "MIT",
21 |     "authors": [
22 |         {
23 |             "name": "CodeRevolution",
24 |             "homepage": "https://coderevolution.ro/"
25 |         }
26 |     ],
27 |     "require": {
28 |         "php": ">=5.6"
29 |     },
30 |     "support": {
31 |         "issues": "https://github.com/CodeRevolutionPlugins/GPT-3-Encoder-PHP/issues",
32 |         "wiki": "https://github.com/CodeRevolutionPlugins/GPT-3-Encoder-PHP/wiki",
33 |         "source": "https://github.com/CodeRevolutionPlugins/GPT-3-Encoder-PHP"
34 |     }
35 | }
36 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2022 Szabolcs-Istvan Kisded
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # GPT-3-Encoder-Decoder-PHP
2 | PHP BPE Text Encoder/Decoder for GPT-2 / GPT-3
3 | 
4 | ## About
5 | GPT-2 and GPT-3 use byte pair encoding to turn text into a series of integers to feed into the model. This is a PHP implementation of OpenAI's original Python encoder and decoder, which can be found [here](https://github.com/openai/gpt-2). The main source of inspiration for writing this port was the Node.js version of the encoder, found [here](https://github.com/latitudegames/GPT-3-Encoder).
6 | 
7 | You can verify the results by comparing the output of this script with the [official tokenizer page from OpenAI](https://beta.openai.com/tokenizer).
8 | 
9 | This encoder/decoder is used in the [Aiomatic WordPress plugin](https://1.envato.market/aiomatic) to count the number of tokens a string will use when sent to the OpenAI API. Check out more of my work on my [website](https://coderevolution.ro/).
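10 | 
11 | For example, to check how many tokens a prompt will consume before sending it to the API (a minimal sketch; the 4000-token limit below is only an illustrative number, not a real model constant):
12 | 
13 | ```php
14 | require_once 'gpt3-encoder.php';
15 | 
16 | $prompt = 'How many tokens will this prompt use?';
17 | $token_count = count(gpt_encode($prompt)); // each array entry is one BPE token
18 | if ($token_count > 4000) { // illustrative limit only
19 |     // trim or split the prompt before sending it to the API
20 | }
21 | ```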
22 | 
23 | ## Usage
24 | 
25 | The mbstring PHP extension is needed for this tool to work correctly when non-ASCII characters are present in the tokenized text: [details on how to install mbstring](https://www.php.net/manual/en/mbstring.installation.php). Note that gpt3-encoder.php loads its data files (characters.json, encoder.json and vocab.bpe) from its own directory, so keep them next to it.
26 | 
27 | ```php
28 | require_once 'gpt3-encoder.php';
29 | 
30 | $prompt = "Many words map to one token, but some don't: indivisible. Unicode characters like emojis may be split into many tokens containing the underlying bytes: 🤚🏾 Sequences of characters commonly found next to each other may be grouped together: 1234567890";
31 | 
32 | $token_array = gpt_encode($prompt);
33 | 
34 | $original_text = gpt_decode($token_array);
35 | ```
36 | 
--------------------------------------------------------------------------------
/characters.json:
--------------------------------------------------------------------------------
1 | {"0":"Ā","1":"ā","2":"Ă","3":"ă","4":"Ą","5":"ą","6":"Ć","7":"ć","8":"Ĉ","9":"ĉ","10":"Ċ","11":"ċ","12":"Č","13":"č","14":"Ď","15":"ď","16":"Đ","17":"đ","18":"Ē","19":"ē","20":"Ĕ","21":"ĕ","22":"Ė","23":"ė","24":"Ę","25":"ę","26":"Ě","27":"ě","28":"Ĝ","29":"ĝ","30":"Ğ","31":"ğ","32":"Ġ","33":"!","34":"\"","35":"#","36":"$","37":"%","38":"&","39":"'","40":"(","41":")","42":"*","43":"+","44":",","45":"-","46":".","47":"/","48":"0","49":"1","50":"2","51":"3","52":"4","53":"5","54":"6","55":"7","56":"8","57":"9","58":":","59":";","60":"<","61":"=","62":">","63":"?","64":"@","65":"A","66":"B","67":"C","68":"D","69":"E","70":"F","71":"G","72":"H","73":"I","74":"J","75":"K","76":"L","77":"M","78":"N","79":"O","80":"P","81":"Q","82":"R","83":"S","84":"T","85":"U","86":"V","87":"W","88":"X","89":"Y","90":"Z","91":"[","92":"\\","93":"]","94":"^","95":"_","96":"`","97":"a","98":"b","99":"c","100":"d","101":"e","102":"f","103":"g","104":"h","105":"i","106":"j","107":"k","108":"l","109":"m","110":"n","111":"o","112":"p","113":"q","114":"r","115":"s","116":"t","117":"u","118":"v","119":"w","120":"x","121":"y","122":"z","123":"{","124":"|","125":"}","126":"~","127":"ġ","128":"Ģ","129":"ģ","130":"Ĥ","131":"ĥ","132":"Ħ","133":"ħ","134":"Ĩ","135":"ĩ","136":"Ī","137":"ī","138":"Ĭ","139":"ĭ","140":"Į","141":"į","142":"İ","143":"ı","144":"IJ","145":"ij","146":"Ĵ","147":"ĵ","148":"Ķ","149":"ķ","150":"ĸ","151":"Ĺ","152":"ĺ","153":"Ļ","154":"ļ","155":"Ľ","156":"ľ","157":"Ŀ","158":"ŀ","159":"Ł","160":"ł","161":"¡","162":"¢","163":"£","164":"¤","165":"¥","166":"¦","167":"§","168":"¨","169":"©","170":"ª","171":"«","172":"¬","173":"Ń","174":"®","175":"¯","176":"°","177":"±","178":"²","179":"³","180":"´","181":"µ","182":"¶","183":"·","184":"¸","185":"¹","186":"º","187":"»","188":"¼","189":"½","190":"¾","191":"¿","192":"À","193":"Á","194":"Â","195":"Ã","196":"Ä","197":"Å","198":"Æ","199":"Ç","200":"È","201":"É","202":"Ê","203":"Ë","204":"Ì","205":"Í","206":"Î","207":"Ï","208":"Ð","209":"Ñ","210":"Ò","211":"Ó","212":"Ô","213":"Õ","214":"Ö","215":"×","216":"Ø","217":"Ù","218":"Ú","219":"Û","220":"Ü","221":"Ý","222":"Þ","223":"ß","224":"à","225":"á","226":"â","227":"ã","228":"ä","229":"å","230":"æ","231":"ç","232":"è","233":"é","234":"ê","235":"ë","236":"ì","237":"í","238":"î","239":"ï","240":"ð","241":"ñ","242":"ò","243":"ó","244":"ô","245":"õ","246":"ö","247":"÷","248":"ø","249":"ù","250":"ú","251":"û","252":"ü","253":"ý","254":"þ","255":"ÿ"}
--------------------------------------------------------------------------------
/gpt3-encoder.php:
--------------------------------------------------------------------------------
1 | <?php
2 | function gpt_utf8_encode($str) // ISO-8859-1 to UTF-8, same technique as the classic utf8_encode() polyfill
3 | {
4 |     $str .= $str; // append a copy: read from the second half while rewriting the first half in place
5 |     $len = \strlen($str);
6 | 
7 |     for ($i = $len >> 1, $j = 0; $i < $len; ++$i, ++$j) {
8 |         switch (true) {
9 |             case $str[$i] < "\x80": $str[$j] = $str[$i]; break;
10 |             case $str[$i] < "\xC0": $str[$j] = "\xC2"; $str[++$j] = $str[$i]; break;
11 |             default: $str[$j] = "\xC3"; $str[++$j] = \chr(\ord($str[$i]) - 64); break;
12 |         }
13 |     }
14 |     return substr($str, 0, $j);
15 | }
16 | function gpt_encode($text) // turns a string into an array of BPE token ids
17 | {
18 |     $bpe_tokens = array();
19 |     if($text === null || $text === '') // strict check so the string "0" still gets encoded
20 |     {
21 |         return $bpe_tokens;
22 |     }
23 |     $raw_chars = file_get_contents(dirname(__FILE__) . "/characters.json");
24 |     $byte_encoder = json_decode($raw_chars, true); // byte value => printable stand-in character
25 |     if(empty($byte_encoder))
26 |     {
27 |         error_log('Failed to load characters.json: ' . $raw_chars);
28 |         return $bpe_tokens;
29 |     }
30 |     $rencoder = file_get_contents(dirname(__FILE__) . "/encoder.json");
31 |     $encoder = json_decode($rencoder, true); // BPE string => token id
32 |     if(empty($encoder))
33 |     {
34 |         error_log('Failed to load encoder.json: ' . $rencoder);
35 |         return $bpe_tokens;
36 |     }
37 | 
38 |     $bpe_file = file_get_contents(dirname(__FILE__) . "/vocab.bpe");
39 |     if(empty($bpe_file))
40 |     {
41 |         error_log('Failed to load vocab.bpe');
42 |         return $bpe_tokens;
43 |     }
44 | 
45 |     preg_match_all("#'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+#u", $text, $matches); // GPT-2/GPT-3 pre-tokenization regex
46 |     if(!isset($matches[0]) || count($matches[0]) == 0)
47 |     {
48 |         error_log('Failed to match string: ' . $text);
49 |         return $bpe_tokens;
50 |     }
51 |     $lines = preg_split('/\r\n|\r|\n/', $bpe_file);
52 |     $bpe_merges = array();
53 |     $bpe_merges_temp = array_slice($lines, 1, count($lines), true); // skip the version header line
54 |     foreach($bpe_merges_temp as $bmt)
55 |     {
56 |         $split_bmt = preg_split('#(\s+)#', $bmt);
57 |         $split_bmt = array_filter($split_bmt, 'gpt_my_filter');
58 |         if(count($split_bmt) > 0)
59 |         {
60 |             $bpe_merges[] = $split_bmt;
61 |         }
62 |     }
63 |     $bpe_ranks = gpt_dictZip($bpe_merges, range(0, count($bpe_merges) - 1)); // "first,second" => merge priority
64 | 
65 |     $cache = array();
66 |     foreach($matches[0] as $token)
67 |     {
68 |         $new_tokens = array();
69 |         $chars = array();
70 |         $token = gpt_utf8_encode($token);
71 |         if(function_exists('mb_strlen'))
72 |         {
73 |             $len = mb_strlen($token, 'UTF-8');
74 |             for ($i = 0; $i < $len; $i++)
75 |             {
76 |                 $chars[] = mb_substr($token, $i, 1, 'UTF-8');
77 |             }
78 |         }
79 |         else
80 |         {
81 |             $chars = str_split($token);
82 |         }
83 |         $result_word = '';
84 |         foreach($chars as $char)
85 |         {
86 |             if(isset($byte_encoder[gpt_unichr($char)]))
87 |             {
88 |                 $result_word .= $byte_encoder[gpt_unichr($char)];
89 |             }
90 |         }
91 |         $new_tokens_bpe = gpt_bpe($result_word, $bpe_ranks, $cache);
92 |         $new_tokens_bpe = explode(' ', $new_tokens_bpe);
93 |         foreach($new_tokens_bpe as $x)
94 |         {
95 |             if(isset($encoder[$x]))
96 |             {
97 |                 if(isset($new_tokens[$x])) // duplicate BPE string: salt the key so the token is not overwritten
98 |                 {
99 |                     $new_tokens[rand() . '---' . $x] = $encoder[$x];
100 |                 }
101 |                 else
102 |                 {
103 |                     $new_tokens[$x] = $encoder[$x];
104 |                 }
105 |             }
106 |             else
107 |             {
108 |                 if(isset($new_tokens[$x]))
109 |                 {
110 |                     $new_tokens[rand() . '---' . $x] = $x;
111 |                 }
112 |                 else
113 |                 {
114 |                     $new_tokens[$x] = $x;
115 |                 }
116 |             }
117 |         }
118 |         foreach($new_tokens as $ninx => $nval)
119 |         {
120 |             if(isset($bpe_tokens[$ninx]))
121 |             {
122 |                 $bpe_tokens[rand() . '---' . $ninx] = $nval;
123 |             }
124 |             else
125 |             {
126 |                 $bpe_tokens[$ninx] = $nval;
127 |             }
128 |         }
129 |     }
130 |     return $bpe_tokens;
131 | }
132 | 
133 | function gpt_decode($tokens)
134 | {
135 |     $rencoder = file_get_contents(dirname(__FILE__) . "/encoder.json");
136 |     $encoder = json_decode($rencoder, true);
137 |     if(empty($encoder))
138 |     {
139 |         error_log('Failed to load encoder.json: ' . $rencoder);
140 |         return false;
141 |     }
142 |     $decoder = array();
143 |     foreach($encoder as $index => $val)
144 |     {
145 |         $decoder[$val] = $index; // token id => BPE string
146 |     }
147 |     $raw_chars = file_get_contents(dirname(__FILE__) . "/characters.json");
148 |     $byte_encoder = json_decode($raw_chars, true);
149 |     if(empty($byte_encoder))
150 |     {
151 |         error_log('Failed to load characters.json: ' . $raw_chars);
152 |         return false;
153 |     }
154 |     $byte_decoder = array();
155 |     foreach($byte_encoder as $index => $val)
156 |     {
157 |         $byte_decoder[$val] = $index; // stand-in character => original byte value
158 |     }
159 |     $text = '';
160 |     $mych_arr = [];
161 |     foreach($tokens as $myt)
162 |     {
163 |         if(isset($decoder[$myt]))
164 |         {
165 |             $mych_arr[] = $decoder[$myt];
166 |         }
167 |         else
168 |         {
169 |             error_log('Token not found in decoder: ' . $myt);
170 |         }
171 |     }
172 |     $text = implode('', $mych_arr);
173 |     $text_arr = preg_split('//u', $text, -1, PREG_SPLIT_NO_EMPTY);
174 |     $final_arr = array();
175 |     foreach($text_arr as $txa)
176 |     {
177 |         if(isset($byte_decoder[$txa]))
178 |         {
179 |             $final_arr[] = $byte_decoder[$txa];
180 |         }
181 |         else
182 |         {
183 |             error_log('Character not found in byte_decoder: ' . $txa);
184 |         }
185 |     }
186 |     $output = '';
187 |     for ($i = 0, $j = count($final_arr); $i < $j; ++$i) {
188 |         $output .= chr($final_arr[$i]); // reassemble the raw UTF-8 byte stream
189 |     }
190 |     return $output;
191 | }
192 | function gpt_my_filter($var)
193 | {
194 |     return ($var !== NULL && $var !== FALSE && $var !== '');
195 | }
196 | 
197 | function gpt_unichr($c) // Unicode code point of the first UTF-8 character in $c
198 | {
199 |     if (ord($c[0]) >= 0 && ord($c[0]) <= 127)
200 |     {
201 |         return ord($c[0]);
202 |     }
203 |     if (ord($c[0]) >= 192 && ord($c[0]) <= 223)
204 |     {
205 |         return (ord($c[0])-192)*64 + (ord($c[1])-128);
206 |     }
207 |     if (ord($c[0]) >= 224 && ord($c[0]) <= 239)
208 |     {
209 |         return (ord($c[0])-224)*4096 + (ord($c[1])-128)*64 + (ord($c[2])-128);
210 |     }
211 |     if (ord($c[0]) >= 240 && ord($c[0]) <= 247)
212 |     {
213 |         return (ord($c[0])-240)*262144 + (ord($c[1])-128)*4096 + (ord($c[2])-128)*64 + (ord($c[3])-128);
214 |     }
215 |     if (ord($c[0]) >= 248 && ord($c[0]) <= 251)
216 |     {
217 |         return (ord($c[0])-248)*16777216 + (ord($c[1])-128)*262144 + (ord($c[2])-128)*4096 + (ord($c[3])-128)*64 + (ord($c[4])-128);
218 |     }
219 |     if (ord($c[0]) >= 252 && ord($c[0]) <= 253)
220 |     {
221 |         return (ord($c[0])-252)*1073741824 + (ord($c[1])-128)*16777216 + (ord($c[2])-128)*262144 + (ord($c[3])-128)*4096 + (ord($c[4])-128)*64 + (ord($c[5])-128);
222 |     }
223 |     if (ord($c[0]) >= 254 && ord($c[0]) <= 255)
224 |     {
225 |         return 0; // invalid UTF-8 lead byte
226 |     }
227 |     return 0;
228 | }
229 | function gpt_dictZip($x, $y) // maps each "first,second" pair in $x to its merge rank; $y is unused and kept for signature parity
230 | {
231 |     $result = array();
232 |     $cnt = 0;
233 |     foreach($x as $i)
234 |     {
235 |         if(isset($i[1]) && isset($i[0]))
236 |         {
237 |             $result[$i[0] . ',' . $i[1]] = $cnt;
238 |             $cnt++;
239 |         }
240 |     }
241 |     return $result;
242 | }
243 | function gpt_get_pairs($word) // returns all adjacent symbol pairs in $word
244 | {
245 |     $pairs = array();
246 |     $prev_char = $word[0];
247 |     for ($i = 1; $i < count($word); $i++)
248 |     {
249 |         $char = $word[$i];
250 |         $pairs[] = array($prev_char, $char);
251 |         $prev_char = $char;
252 |     }
253 |     return $pairs;
254 | }
255 | function gpt_split($str, $len = 1) // multibyte-safe str_split()
256 | {
257 |     $arr = [];
258 |     if(function_exists('mb_strlen'))
259 |     {
260 |         $length = mb_strlen($str, 'UTF-8');
261 |     }
262 |     else
263 |     {
264 |         $length = strlen($str);
265 |     }
266 | 
267 |     for ($i = 0; $i < $length; $i += $len)
268 |     {
269 |         if(function_exists('mb_substr'))
270 |         {
271 |             $arr[] = mb_substr($str, $i, $len, 'UTF-8');
272 |         }
273 |         else
274 |         {
275 |             $arr[] = substr($str, $i, $len);
276 |         }
277 |     }
278 |     return $arr;
279 | }
280 | function gpt_bpe($token, $bpe_ranks, &$cache) // applies the BPE merge rules to one pre-token; returns its BPE strings separated by spaces
281 | {
282 |     if(array_key_exists($token, $cache))
283 |     {
284 |         return $cache[$token];
285 |     }
286 |     $word = gpt_split($token);
287 |     $init_len = count($word);
288 |     $pairs = gpt_get_pairs($word);
289 |     if(!$pairs)
290 |     {
291 |         return $token;
292 |     }
293 |     while (true)
294 |     {
295 |         $minPairs = array();
296 |         foreach($pairs as $pair)
297 |         {
298 |             if(array_key_exists($pair[0] . ',' . $pair[1], $bpe_ranks))
299 |             {
300 |                 $rank = $bpe_ranks[$pair[0] . ',' . $pair[1]];
301 |                 $minPairs[$rank] = $pair;
302 |             }
303 |             else
304 |             {
305 |                 $minPairs[10e10] = $pair; // unknown pair: treat its rank as effectively infinite
306 |             }
307 |         }
308 |         ksort($minPairs);
309 |         reset($minPairs); // the lowest-rank pair is now first
310 |         $min_key = key($minPairs); // key() instead of array_key_first(), which needs PHP 7.3 while composer.json allows 5.6
311 |         $bigram = $minPairs[$min_key];
312 |         if(!array_key_exists($bigram[0] . ',' . $bigram[1], $bpe_ranks))
313 |         {
314 |             break; // best candidate is not a known merge: we are done
315 |         }
316 |         $first = $bigram[0];
317 |         $second = $bigram[1];
318 |         $new_word = array();
319 |         $i = 0;
320 |         while ($i < count($word))
321 |         {
322 |             $j = gpt_indexOf($word, $first, $i);
323 |             if ($j === -1) // no further occurrence of $first: copy the rest of the word
324 |             {
325 |                 $new_word = array_merge($new_word, array_slice($word, $i, null, true));
326 |                 break;
327 |             }
328 |             if($i > $j || $j == 0)
329 |             {
330 |                 $slicer = array();
331 |             }
332 |             else
333 |             {
334 |                 $slicer = array_slice($word, $i, $j - $i, true);
335 |             }
336 |             $new_word = array_merge($new_word, $slicer);
337 |             if(count($new_word) > $init_len) // safety guard: merging should never grow the word
338 |             {
339 |                 break;
340 |             }
341 |             $i = $j;
342 |             if ($word[$i] === $first && $i < count($word) - 1 && $word[$i + 1] === $second)
343 |             {
344 |                 array_push($new_word, $first . $second); // merge the bigram into one symbol
345 |                 $i = $i + 2;
346 |             }
347 |             else
348 |             {
349 |                 array_push($new_word, $word[$i]);
350 |                 $i = $i + 1;
351 |             }
352 |         }
353 |         if($word == $new_word) // nothing merged this pass
354 |         {
355 |             break;
356 |         }
357 |         $word = $new_word;
358 |         if (count($word) === 1)
359 |         {
360 |             break;
361 |         }
362 |         else
363 |         {
364 |             $pairs = gpt_get_pairs($word);
365 |         }
366 |     }
367 |     $word = implode(' ', $word);
368 |     $cache[$token] = $word;
369 |     return $word;
370 | }
371 | function gpt_indexOf($arr, $searchElement, $fromIndex) // like JavaScript's Array.indexOf(): first key >= $fromIndex whose value matches
372 | {
373 |     foreach($arr as $index => $value)
374 |     {
375 |         if($index < $fromIndex)
376 |         {
377 |             continue;
378 |         }
379 |         if($value == $searchElement)
380 |         {
381 |             return $index;
382 |         }
383 |     }
384 |     return -1;
385 | }
386 | 
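387 | // Illustrative convenience wrapper (not part of the original library): counts
388 | // how many BPE tokens a string would use, e.g. to check a prompt against a
389 | // model's context limit before calling the OpenAI API.
390 | function gpt_token_count($text)
391 | {
392 |     return count(gpt_encode($text));
393 | }
394 | 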
395 | // Demo: runs only when this file is executed directly from the command line,
396 | // so that merely require-ing the library does not write to the error log.
397 | if (PHP_SAPI === 'cli' && isset($argv[0]) && realpath($argv[0]) === __FILE__) {
398 |     $prompt = "Many words map to one token, but some don't: indivisible. Unicode characters like emojis may be split into many tokens containing the underlying bytes: 🤚🏾 Sequences of characters commonly found next to each other may be grouped together: 1234567890";
399 |     $token_array = gpt_encode($prompt);
400 |     error_log('Token array: ' . print_r($token_array, true));
401 |     error_log('Count: ' . count($token_array));
402 |     $original_text = gpt_decode($token_array);
403 |     error_log('Original text: ' . $original_text);
404 | }
405 | 
--------------------------------------------------------------------------------