├── .gitignore
├── jest.config.js
├── example.js
├── package.json
├── .github
│   └── workflows
│       └── node.js.yml
├── README.md
├── Encoder.test.js
├── LICENSE
├── encoder.py
└── Encoder.js

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
node_modules

--------------------------------------------------------------------------------
/jest.config.js:
--------------------------------------------------------------------------------
// For a detailed explanation regarding each configuration property, visit:
// https://jestjs.io/docs/en/configuration.html

module.exports = {
  // Automatically clear mock calls and instances between every test
  clearMocks: true,
  // Indicates which provider should be used to instrument code for coverage
  coverageProvider: "v8",
  // The test environment that will be used for testing
  testEnvironment: "node",
};

--------------------------------------------------------------------------------
/example.js:
--------------------------------------------------------------------------------
// The path must match the file name exactly ("Encoder.js", not "encoder.js")
// or the require will fail on case-sensitive filesystems.
const {encode, decode} = require('./Encoder.js')

const str = 'This is an example sentence to try encoding out on!'
const encoded = encode(str)
console.log('Encoded this string looks like: ', encoded)

console.log('We can look at each token and what it represents')
for (let token of encoded) {
  console.log({token, string: decode([token])})
}

const decoded = decode(encoded)
console.log('We can decode it back into:\n', decoded)

--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "gpt-3-encoder",
  "version": "1.1.3",
  "description": "JavaScript BPE encoder/decoder for GPT-2 / GPT-3",
  "main": "Encoder.js",
  "files": [
    "encoder.json",
    "vocab.bpe"
  ],
  "scripts": {
    "test": "jest"
  },
  "repository": {
    "type": "git",
    "url": "git+https://github.com/AIDungeon/GPT-3-Encoder.git"
  },
  "author": "",
  "license": "MIT",
  "bugs": {
    "url": "https://github.com/AIDungeon/GPT-3-Encoder/issues"
  },
  "homepage": "https://github.com/AIDungeon/GPT-3-Encoder#readme",
  "devDependencies": {
    "jest": "^26.4.2"
  }
}

--------------------------------------------------------------------------------
/.github/workflows/node.js.yml:
--------------------------------------------------------------------------------
# This workflow does a clean install of node dependencies, builds the source
# code, and runs tests across different versions of node.
# For more information see:
# https://help.github.com/actions/language-and-framework-guides/using-nodejs-with-github-actions

name: Node.js CI

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:

    runs-on: ubuntu-latest

    strategy:
      matrix:
        node-version: [12.x, 14.x]

    steps:
    - uses: actions/checkout@v2
    - name: Use Node.js ${{ matrix.node-version }}
      uses: actions/setup-node@v1
      with:
        node-version: ${{ matrix.node-version }}
    - run: npm ci
    - run: npm run build --if-present
    - run: npm test

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# GPT-3-Encoder

JavaScript BPE encoder/decoder for GPT-2 / GPT-3.

## About

GPT-2 and GPT-3 use byte pair encoding (BPE) to turn text into a series of integers to feed into the model. This is a JavaScript implementation of OpenAI's original Python encoder/decoder, which can be found [here](https://github.com/openai/gpt-2).

## Install with npm

`npm install gpt-3-encoder`

## Usage

Compatible with Node >= 12.

```
const {encode, decode} = require('gpt-3-encoder')

const str = 'This is an example sentence to try encoding out on!'
const encoded = encode(str)
console.log('Encoded this string looks like: ', encoded)

console.log('We can look at each token and what it represents')
for(let token of encoded){
  console.log({token, string: decode([token])})
}

const decoded = decode(encoded)
console.log('We can decode it back into:\n', decoded)
```
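
Because `encode` returns a plain array of token ids, its length is the token count for a string. A minimal sketch of using this to keep a prompt inside a model's context window (the `MAX_TOKENS` value below is an illustrative assumption; substitute the limit for your model):

```
const {encode} = require('gpt-3-encoder')

// Hypothetical context-window limit, chosen for illustration only
const MAX_TOKENS = 2048

const prompt = 'This is an example sentence to try encoding out on!'
if (encode(prompt).length > MAX_TOKENS) {
  console.log('Prompt is too long and needs to be shortened')
}
```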

--------------------------------------------------------------------------------
/Encoder.test.js:
--------------------------------------------------------------------------------
const {encode, decode} = require('./Encoder.js');

test('empty string', () => {
  const str = "";
  expect(encode(str)).toEqual([])
  expect(decode(encode(str))).toEqual(str)
});

test('space', () => {
  const str = " ";
  expect(encode(str)).toEqual([220])
  expect(decode(encode(str))).toEqual(str)
});

test('tab', () => {
  const str = "\t";
  expect(encode(str)).toEqual([197])
  expect(decode(encode(str))).toEqual(str)
});

test('simple text', () => {
  const str = "This is some text";
  expect(encode(str)).toEqual([1212, 318, 617, 2420])
  expect(decode(encode(str))).toEqual(str)
});

test('multi-token word', () => {
  const str = "indivisible";
  expect(encode(str)).toEqual([521, 452, 12843])
  expect(decode(encode(str))).toEqual(str)
});

test('emojis', () => {
  const str = "hello 👋 world 🌍";
  expect(encode(str)).toEqual([31373, 50169, 233, 995, 12520, 234, 235])
  expect(decode(encode(str))).toEqual(str)
});

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 AIDungeon

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/encoder.py:
--------------------------------------------------------------------------------
# This file includes code which was modified from https://github.com/openai/gpt-2

import json
import regex as re
from functools import lru_cache


@lru_cache()
def bytes_to_unicode():
    # Map every byte value to a printable unicode character so BPE can operate
    # on byte sequences without whitespace or control characters.
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2 ** 8):
        if b not in bs:
            bs.append(b)
            cs.append(2 ** 8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))


def get_pairs(word):
    # Return the set of adjacent symbol pairs in a word (tuple of symbols).
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


class Encoder:
    def __init__(self, encoder, bpe_merges, errors="replace"):
        self.encoder = encoder
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}
        self.pat = re.compile(
            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
        )

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)

        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            # Merge the lowest-ranked (i.e. most frequent) pair first.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)

        word = " ".join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))

            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    def decode(self, tokens):
        text = "".join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
        return text

def get_encoder():
    with open("encoder.json", "r") as f:
        encoder = json.load(f)
    with open("vocab.bpe", "r", encoding="utf-8") as f:
        bpe_data = f.read()
    # Skip the version header line and the trailing empty line.
    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
    return Encoder(encoder=encoder, bpe_merges=bpe_merges)

# encoder = get_encoder()
# print('encoded is ', encoder.encode('hello 👋 world 🌍 This is a long string to test whether or not the emoji issue was fixed!'))

--------------------------------------------------------------------------------
/Encoder.js:
--------------------------------------------------------------------------------
// This file includes code which was modified from https://github.com/openai/gpt-2
const fs = require('fs')
const path = require('path');

const encoder = JSON.parse(fs.readFileSync(path.join(__dirname, './encoder.json')));
const bpe_file = fs.readFileSync(path.join(__dirname, './vocab.bpe'), 'utf-8');

// Like Python's range(x, y): integers from x up to but excluding y.
const range = (x, y) => {
  const res = Array.from(Array(y).keys()).slice(x)
  return res
}

const ord = x => {
  return x.charCodeAt(0)
}

const chr = x => {
  return String.fromCharCode(x)
}

// TextEncoder always encodes to UTF-8, so it takes no argument.
const textEncoder = new TextEncoder()
const encodeStr = str => {
  return Array.from(textEncoder.encode(str)).map(x => x.toString())
}

const textDecoder = new TextDecoder("utf-8")
const decodeStr = arr => {
  return textDecoder.decode(new Uint8Array(arr));
}

const dictZip = (x, y) => {
  const result = {}
  x.map((_, i) => { result[x[i]] = y[i] })
  return result
}

function bytes_to_unicode() {
  const bs = range(ord('!'), ord('~') + 1).concat(range(ord('¡'), ord('¬') + 1), range(ord('®'), ord('ÿ') + 1))

  let cs = bs.slice()
  let n = 0
  for (let b = 0; b < 2 ** 8; b++) {
    if (!bs.includes(b)) {
      bs.push(b)
      cs.push(2 ** 8 + n)
      n = n + 1
    }
  }

  cs = cs.map(x => chr(x))

  const result = {}
  bs.map((_, i) => { result[bs[i]] = cs[i] })
  return result
}

function get_pairs(word) {
  const pairs = new Set()
  let prev_char = word[0]
  for (let i = 1; i < word.length; i++) {
    const char = word[i]
    pairs.add([prev_char, char])
    prev_char = char
  }
  return pairs
}

const pat = /'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu

const decoder = {}
Object.keys(encoder).map(x => { decoder[encoder[x]] = x })

const lines = bpe_file.split('\n')

// bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
const bpe_merges = lines.slice(1, lines.length - 1).map(x => {
  return x.split(/(\s+)/).filter(function(e) { return e.trim().length > 0 })
})

const byte_encoder = bytes_to_unicode()
const byte_decoder = {}
Object.keys(byte_encoder).map(x => { byte_decoder[byte_encoder[x]] = x })

const bpe_ranks = dictZip(bpe_merges, range(0, bpe_merges.length))
const cache = {}

function bpe(token) {
  if (token in cache) {
    return cache[token]
  }

  let word = token.split('')

  let pairs = get_pairs(word)

  // A single-character token has no pairs and is already fully merged.
  if (pairs.size === 0) {
    return token
  }

  while (true) {
    const minPairs = {}
    Array.from(pairs).map(pair => {
      const rank = bpe_ranks[pair]
      // Unranked pairs get a large sentinel rank so they are never chosen.
      minPairs[(isNaN(rank) ? 10e10 : rank)] = pair
    })

    const bigram = minPairs[Math.min(...Object.keys(minPairs).map(x => {
      return parseInt(x)
    }))]

    if (!(bigram in bpe_ranks)) {
      break
    }

    const first = bigram[0]
    const second = bigram[1]
    let new_word = []
    let i = 0

    while (i < word.length) {
      const j = word.indexOf(first, i)
      if (j === -1) {
        new_word = new_word.concat(word.slice(i))
        break
      }
      new_word = new_word.concat(word.slice(i, j))
      i = j

      if (word[i] === first && i < word.length - 1 && word[i + 1] === second) {
        new_word.push(first + second)
        i = i + 2
      } else {
        new_word.push(word[i])
        i = i + 1
      }
    }

    word = new_word
    if (word.length === 1) {
      break
    } else {
      pairs = get_pairs(word)
    }
  }

  word = word.join(' ')
  cache[token] = word

  return word
}

function encode(text) {
  let bpe_tokens = []
  const matches = Array.from(text.matchAll(pat)).map(x => x[0])
  for (let token of matches) {
    token = encodeStr(token).map(x => {
      return byte_encoder[x]
    }).join('')

    const new_tokens = bpe(token).split(' ').map(x => encoder[x])
    bpe_tokens = bpe_tokens.concat(new_tokens)
  }
  return bpe_tokens
}

function decode(tokens) {
  let text = tokens.map(x => decoder[x]).join('')
  text = decodeStr(text.split('').map(x => byte_decoder[x]))
  return text
}

module.exports = {
  encode,
  decode
};
--------------------------------------------------------------------------------