├── .gitignore
├── jest.config.js
├── example.js
├── package.json
├── .github
│   └── workflows
│       └── node.js.yml
├── README.md
├── Encoder.test.js
├── LICENSE
├── encoder.py
└── Encoder.js

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
node_modules

--------------------------------------------------------------------------------
/jest.config.js:
--------------------------------------------------------------------------------
// For a detailed explanation regarding each configuration property, visit:
// https://jestjs.io/docs/en/configuration.html

module.exports = {
  // Automatically clear mock calls and instances between every test
  clearMocks: true,
  // Indicates which provider should be used to instrument code for coverage
  coverageProvider: "v8",
  // The test environment that will be used for testing
  testEnvironment: "node",
};

--------------------------------------------------------------------------------
/example.js:
--------------------------------------------------------------------------------
// The path must match the file name exactly ("Encoder.js", not "encoder.js")
// or the require will fail on case-sensitive filesystems.
const {encode, decode} = require('./Encoder.js')

const str = 'This is an example sentence to try encoding out on!'
const encoded = encode(str)
console.log('Encoded this string looks like: ', encoded)

console.log('We can look at each token and what it represents')
for (let token of encoded) {
  console.log({token, string: decode([token])})
}

const decoded = decode(encoded)
console.log('We can decode it back into:\n', decoded)

--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "gpt-3-encoder",
  "version": "1.1.3",
  "description": "JavaScript BPE encoder/decoder for GPT-2 / GPT-3",
  "main": "Encoder.js",
  "files": [
    "encoder.json",
    "vocab.bpe"
  ],
  "scripts": {
    "test": "jest"
  },
  "repository": {
    "type": "git",
    "url": "git+https://github.com/AIDungeon/GPT-3-Encoder.git"
  },
  "author": "",
  "license": "MIT",
  "bugs": {
    "url": "https://github.com/AIDungeon/GPT-3-Encoder/issues"
  },
  "homepage": "https://github.com/AIDungeon/GPT-3-Encoder#readme",
  "devDependencies": {
    "jest": "^26.4.2"
  }
}

--------------------------------------------------------------------------------
/.github/workflows/node.js.yml:
--------------------------------------------------------------------------------
# This workflow does a clean install of node dependencies, builds the source
# code, and runs tests across different versions of node.
# For more information see:
# https://help.github.com/actions/language-and-framework-guides/using-nodejs-with-github-actions

name: Node.js CI

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:

    runs-on: ubuntu-latest

    strategy:
      matrix:
        node-version: [12.x, 14.x]

    steps:
    - uses: actions/checkout@v2
    - name: Use Node.js ${{ matrix.node-version }}
      uses: actions/setup-node@v1
      with:
        node-version: ${{ matrix.node-version }}
    - run: npm ci
    - run: npm run build --if-present
    - run: npm test

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# GPT-3-Encoder

JavaScript BPE encoder/decoder for GPT-2 / GPT-3.

## About

GPT-2 and GPT-3 use byte pair encoding (BPE) to turn text into a series of integers to feed into the model. This is a JavaScript implementation of OpenAI's original Python encoder/decoder, which can be found [here](https://github.com/openai/gpt-2).

## Install with npm

`npm install gpt-3-encoder`

## Usage

Compatible with Node >= 12.

```
const {encode, decode} = require('gpt-3-encoder')

const str = 'This is an example sentence to try encoding out on!'
const encoded = encode(str)
console.log('Encoded this string looks like: ', encoded)

console.log('We can look at each token and what it represents')
for(let token of encoded){
  console.log({token, string: decode([token])})
}

const decoded = decode(encoded)
console.log('We can decode it back into:\n', decoded)
```
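
Because `encode` returns a plain array of token ids, its length is the token count for a string. A minimal sketch of using this to keep a prompt inside a model's context window (the `MAX_TOKENS` value below is an illustrative assumption; substitute the limit for your model):

```
const {encode} = require('gpt-3-encoder')

// Hypothetical context-window limit, chosen for illustration only
const MAX_TOKENS = 2048

const prompt = 'This is an example sentence to try encoding out on!'
if (encode(prompt).length > MAX_TOKENS) {
  console.log('Prompt is too long and needs to be shortened')
}
```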

--------------------------------------------------------------------------------
/Encoder.test.js:
--------------------------------------------------------------------------------
const {encode, decode} = require('./Encoder.js');

test('empty string', () => {
  const str = "";
  expect(encode(str)).toEqual([])
  expect(decode(encode(str))).toEqual(str)
});

test('space', () => {
  const str = " ";
  expect(encode(str)).toEqual([220])
  expect(decode(encode(str))).toEqual(str)
});

test('tab', () => {
  const str = "\t";
  expect(encode(str)).toEqual([197])
  expect(decode(encode(str))).toEqual(str)
});

test('simple text', () => {
  const str = "This is some text";
  expect(encode(str)).toEqual([1212, 318, 617, 2420])
  expect(decode(encode(str))).toEqual(str)
});

test('multi-token word', () => {
  const str = "indivisible";
  expect(encode(str)).toEqual([521, 452, 12843])
  expect(decode(encode(str))).toEqual(str)
});

test('emojis', () => {
  const str = "hello 👋 world 🌍";
  expect(encode(str)).toEqual([31373, 50169, 233, 995, 12520, 234, 235])
  expect(decode(encode(str))).toEqual(str)
});

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 AIDungeon

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/encoder.py:
--------------------------------------------------------------------------------
# This file includes code which was modified from https://github.com/openai/gpt-2

import json
import regex as re
from functools import lru_cache


@lru_cache()
def bytes_to_unicode():
    # Map every byte value to a printable unicode character so BPE can operate
    # on byte sequences without whitespace or control characters.
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2 ** 8):
        if b not in bs:
            bs.append(b)
            cs.append(2 ** 8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))


def get_pairs(word):
    # Return the set of adjacent symbol pairs in a word (tuple of symbols).
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


class Encoder:
    def __init__(self, encoder, bpe_merges, errors="replace"):
        self.encoder = encoder
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}
        self.pat = re.compile(
            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
        )

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)

        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            # Merge the lowest-ranked (i.e. most frequent) pair first.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)

        word = " ".join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))

            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    def decode(self, tokens):
        text = "".join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
        return text

def get_encoder():
    with open("encoder.json", "r") as f:
        encoder = json.load(f)
    with open("vocab.bpe", "r", encoding="utf-8") as f:
        bpe_data = f.read()
    # Skip the version header line and the trailing empty line.
    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
    return Encoder(encoder=encoder, bpe_merges=bpe_merges)

# encoder = get_encoder()
# print('encoded is ', encoder.encode('hello 👋 world 🌍 This is a long string to test whether or not the emoji issue was fixed!'))

--------------------------------------------------------------------------------
/Encoder.js:
--------------------------------------------------------------------------------
// This file includes code which was modified from https://github.com/openai/gpt-2
const fs = require('fs')
const path = require('path');

const encoder = JSON.parse(fs.readFileSync(path.join(__dirname, './encoder.json')));
const bpe_file = fs.readFileSync(path.join(__dirname, './vocab.bpe'), 'utf-8');

// Like Python's range(x, y): integers from x up to but excluding y.
const range = (x, y) => {
  const res = Array.from(Array(y).keys()).slice(x)
  return res
}

const ord = x => {
  return x.charCodeAt(0)
}

const chr = x => {
  return String.fromCharCode(x)
}

// TextEncoder always encodes to UTF-8, so it takes no argument.
const textEncoder = new TextEncoder()
const encodeStr = str => {
  return Array.from(textEncoder.encode(str)).map(x => x.toString())
}

const textDecoder = new TextDecoder("utf-8")
const decodeStr = arr => {
  return textDecoder.decode(new Uint8Array(arr));
}

const dictZip = (x, y) => {
  const result = {}
  x.map((_, i) => { result[x[i]] = y[i] })
  return result
}

function bytes_to_unicode() {
  const bs = range(ord('!'), ord('~') + 1).concat(range(ord('¡'), ord('¬') + 1), range(ord('®'), ord('ÿ') + 1))

  let cs = bs.slice()
  let n = 0
  for (let b = 0; b < 2 ** 8; b++) {
    if (!bs.includes(b)) {
      bs.push(b)
      cs.push(2 ** 8 + n)
      n = n + 1
    }
  }

  cs = cs.map(x => chr(x))

  const result = {}
  bs.map((_, i) => { result[bs[i]] = cs[i] })
  return result
}

function get_pairs(word) {
  const pairs = new Set()
  let prev_char = word[0]
  for (let i = 1; i < word.length; i++) {
    const char = word[i]
    pairs.add([prev_char, char])
    prev_char = char
  }
  return pairs
}

const pat = /'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu

const decoder = {}
Object.keys(encoder).map(x => { decoder[encoder[x]] = x })

const lines = bpe_file.split('\n')

// bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
const bpe_merges = lines.slice(1, lines.length - 1).map(x => {
  return x.split(/(\s+)/).filter(function(e) { return e.trim().length > 0 })
})

const byte_encoder = bytes_to_unicode()
const byte_decoder = {}
Object.keys(byte_encoder).map(x => { byte_decoder[byte_encoder[x]] = x })

const bpe_ranks = dictZip(bpe_merges, range(0, bpe_merges.length))
const cache = {}

function bpe(token) {
  if (token in cache) {
    return cache[token]
  }

  let word = token.split('')

  let pairs = get_pairs(word)

  // A single-character token has no pairs and is already fully merged.
  if (pairs.size === 0) {
    return token
  }

  while (true) {
    const minPairs = {}
    Array.from(pairs).map(pair => {
      const rank = bpe_ranks[pair]
      // Unranked pairs get a large sentinel rank so they are never chosen.
      minPairs[(isNaN(rank) ? 10e10 : rank)] = pair
    })

    const bigram = minPairs[Math.min(...Object.keys(minPairs).map(x => {
      return parseInt(x)
    }))]

    if (!(bigram in bpe_ranks)) {
      break
    }

    const first = bigram[0]
    const second = bigram[1]
    let new_word = []
    let i = 0

    while (i < word.length) {
      const j = word.indexOf(first, i)
      if (j === -1) {
        new_word = new_word.concat(word.slice(i))
        break
      }
      new_word = new_word.concat(word.slice(i, j))
      i = j

      if (word[i] === first && i < word.length - 1 && word[i + 1] === second) {
        new_word.push(first + second)
        i = i + 2
      } else {
        new_word.push(word[i])
        i = i + 1
      }
    }

    word = new_word
    if (word.length === 1) {
      break
    } else {
      pairs = get_pairs(word)
    }
  }

  word = word.join(' ')
  cache[token] = word

  return word
}

function encode(text) {
  let bpe_tokens = []
  const matches = Array.from(text.matchAll(pat)).map(x => x[0])
  for (let token of matches) {
    token = encodeStr(token).map(x => {
      return byte_encoder[x]
    }).join('')

    const new_tokens = bpe(token).split(' ').map(x => encoder[x])
    bpe_tokens = bpe_tokens.concat(new_tokens)
  }
  return bpe_tokens
}

function decode(tokens) {
  let text = tokens.map(x => decoder[x]).join('')
  text = decodeStr(text.split('').map(x => byte_decoder[x]))
  return text
}

module.exports = {
  encode,
  decode
};
--------------------------------------------------------------------------------