├── .dockerignore
├── .env.example
├── .eslintrc.js
├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── app.js
├── fly.toml.example
├── functions
│   ├── checkInventory.js
│   ├── checkPrice.js
│   ├── function-manifest.js
│   ├── placeOrder.js
│   └── transferCall.js
├── package-lock.json
├── package.json
├── scripts
│   ├── inbound-call.js
│   └── outbound-call.js
├── services
│   ├── gpt-service.js
│   ├── recording-service.js
│   ├── stream-service.js
│   ├── transcription-service.js
│   └── tts-service.js
└── test
    ├── checkInventory.test.js
    ├── checkPrice.test.js
    ├── placeOrder.test.js
    └── transferCall.test.js
/.dockerignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/node 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=node 3 | 4 | ### Node ### 5 | # Logs 6 | logs 7 | *.log 8 | npm-debug.log* 9 | yarn-debug.log* 10 | yarn-error.log* 11 | lerna-debug.log* 12 | .pnpm-debug.log* 13 | 14 | # Diagnostic reports (https://nodejs.org/api/report.html) 15 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 16 | 17 | # Runtime data 18 | pids 19 | *.pid 20 | *.seed 21 | *.pid.lock 22 | 23 | # Directory for instrumented libs generated by jscoverage/JSCover 24 | lib-cov 25 | 26 | # Coverage directory used by tools like istanbul 27 | coverage 28 | *.lcov 29 | 30 | # nyc test coverage 31 | .nyc_output 32 | 33 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 34 | .grunt 35 | 36 | # Bower dependency directory (https://bower.io/) 37 | bower_components 38 | 39 | # node-waf configuration 40 | .lock-wscript 41 | 42 | # Compiled binary addons (https://nodejs.org/api/addons.html) 43 | build/Release 44 | 45 | # Dependency directories 46 | node_modules/ 47 | jspm_packages/ 48 | 49 | # Snowpack dependency directory (https://snowpack.dev/) 50 | web_modules/ 51 | 52 | # TypeScript cache 53 | *.tsbuildinfo 54 | 55 | # Optional npm cache directory 56 | .npm 57 | 58 | # Optional eslint cache 59 | .eslintcache 60 | 61 | # Optional stylelint cache 62 | .stylelintcache 63 | 64 | # Microbundle cache 65 | .rpt2_cache/ 66 | .rts2_cache_cjs/ 67 | .rts2_cache_es/ 68 | .rts2_cache_umd/ 69 | 70 | # Optional REPL history 71 | .node_repl_history 72 | 73 | # Output of 'npm pack' 74 | *.tgz 75 | 76 | # Yarn Integrity file 77 | .yarn-integrity 78 | 79 | # dotenv environment variable files 80 | .env 81 | .env.development.local 82 | .env.test.local 83 | .env.production.local 84 | .env.local 85 | 86 | # parcel-bundler cache (https://parceljs.org/) 87 | .cache 88 | .parcel-cache 89 | 90 | # Next.js build output 91 | .next 92 | out 93 | 94 | # Nuxt.js build / generate output 95 | .nuxt 96 | dist 97 | 98 | # Gatsby files 99 | .cache/ 100 | # Comment in the public line in if your project uses Gatsby and not Next.js 101 | # https://nextjs.org/blog/next-9-1#public-directory-support 102 | # public 103 | 104 | # vuepress build output 105 | .vuepress/dist 106 | 107 | # vuepress v2.x temp and cache directory 108 | .temp 109 | 110 | # Docusaurus cache and generated files 111 | .docusaurus 112 | 113 | # Serverless directories 114 | .serverless/ 115 | 116 | # FuseBox cache 117 | .fusebox/ 118 | 119 | # DynamoDB Local files 120 | .dynamodb/ 121 | 122 | # TernJS port file 123 | .tern-port 124 | 125 | # Stores VSCode versions used for testing VSCode extensions 126 | .vscode-test 127 | 128 | # yarn v2 129 | .yarn/cache 130 | .yarn/unplugged 131 | .yarn/build-state.yml 132 | .yarn/install-state.gz 133 | .pnp.* 134 | 135 | ### Node Patch 
### 136 | # Serverless Webpack directories 137 | .webpack/ 138 | 139 | # Optional stylelint cache 140 | 141 | # SvelteKit build / generate output 142 | .svelte-kit 143 | 144 | # End of https://www.toptal.com/developers/gitignore/api/node -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # Optional: Configure your Twilio credentials if you want 2 | # to make test calls using '$ npm run outbound'. 3 | TWILIO_ACCOUNT_SID=YOUR-ACCOUNT-SID 4 | TWILIO_AUTH_TOKEN=YOUR-AUTH-TOKEN 5 | FROM_NUMBER='+12223334444' 6 | APP_NUMBER='+13334445555' 7 | YOUR_NUMBER='+14445556666' 8 | 9 | # Your ngrok or server URL 10 | # E.g. 123.ngrok.io or myserver.fly.dev 11 | SERVER='myserver.website.com' 12 | 13 | # Service API Keys 14 | OPENAI_API_KEY= 15 | DEEPGRAM_API_KEY= 16 | 17 | # Deepgram voice model, see more options here: https://developers.deepgram.com/docs/tts-models 18 | VOICE_MODEL=aura-asteria-en 19 | 20 | # Call Recording 21 | # Important: Legal implications of call recording 22 | 23 | # If you choose to record voice or video calls, you need to comply with certain laws and regulations, 24 | # including those regarding obtaining consent to record (such as California's Invasion of Privacy Act 25 | # and similar laws in other jurisdictions). Additional information on the legal implications of call 26 | # recording can be found in the "Legal Considerations with Recording Voice and Video Communications" 27 | # Help Center article: https://help.twilio.com/articles/360011522553-Legal-Considerations-with-Recording-Voice-and-Video-Communications 28 | 29 | # Notice: Twilio recommends that you consult with your legal counsel to make sure that you are complying 30 | # with all applicable laws in connection with communications you record or store using Twilio. 
31 | RECORDING_ENABLED='false' -------------------------------------------------------------------------------- /.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | 'env': { 3 | 'browser': true, 4 | 'commonjs': true, 5 | 'es2021': true 6 | }, 7 | 'extends': 'eslint:recommended', 8 | 'overrides': [ 9 | { 10 | 'env': { 11 | 'node': true 12 | }, 13 | 'files': [ 14 | '.eslintrc.{js,cjs}' 15 | ], 16 | 'parserOptions': { 17 | 'sourceType': 'script' 18 | } 19 | } 20 | ], 21 | 'globals' : { 22 | 'expect': 'writeable', 23 | 'test': 'writeable', 24 | 'process': 'readable' 25 | }, 26 | 'parserOptions': { 27 | 'ecmaVersion': 'latest' 28 | }, 29 | 'rules': { 30 | 'indent': [ 31 | 'error', 32 | 2 33 | ], 34 | 'linebreak-style': [ 35 | 'error', 36 | 'unix' 37 | ], 38 | 'quotes': [ 39 | 'error', 40 | 'single' 41 | ], 42 | 'semi': [ 43 | 'error', 44 | 'always' 45 | ] 46 | } 47 | }; 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/node 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=node 3 | 4 | ### Node ### 5 | # Logs 6 | logs 7 | *.log 8 | npm-debug.log* 9 | yarn-debug.log* 10 | yarn-error.log* 11 | lerna-debug.log* 12 | .pnpm-debug.log* 13 | 14 | # Diagnostic reports (https://nodejs.org/api/report.html) 15 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 16 | 17 | # Runtime data 18 | pids 19 | *.pid 20 | *.seed 21 | *.pid.lock 22 | 23 | # Directory for instrumented libs generated by jscoverage/JSCover 24 | lib-cov 25 | 26 | # Coverage directory used by tools like istanbul 27 | coverage 28 | *.lcov 29 | 30 | # nyc test coverage 31 | .nyc_output 32 | 33 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 34 | .grunt 35 | 36 | # Bower dependency directory (https://bower.io/) 37 | bower_components 38 | 39 | # node-waf configuration 40 | .lock-wscript 41 | 42 | # Compiled binary addons (https://nodejs.org/api/addons.html) 43 | build/Release 44 | 45 | # Dependency directories 46 | node_modules/ 47 | jspm_packages/ 48 | 49 | # Snowpack dependency directory (https://snowpack.dev/) 50 | web_modules/ 51 | 52 | # TypeScript cache 53 | *.tsbuildinfo 54 | 55 | # Optional npm cache directory 56 | .npm 57 | 58 | # Optional eslint cache 59 | .eslintcache 60 | 61 | # Optional stylelint cache 62 | .stylelintcache 63 | 64 | # Microbundle cache 65 | .rpt2_cache/ 66 | .rts2_cache_cjs/ 67 | .rts2_cache_es/ 68 | .rts2_cache_umd/ 69 | 70 | # Optional REPL history 71 | .node_repl_history 72 | 73 | # Output of 'npm pack' 74 | *.tgz 75 | 76 | # Yarn Integrity file 77 | .yarn-integrity 78 | 79 | # dotenv environment variable files 80 | .env 81 | .env.development.local 82 | .env.test.local 83 | .env.production.local 84 | .env.local 85 | 86 | # parcel-bundler cache (https://parceljs.org/) 87 | .cache 88 | .parcel-cache 89 | 90 | # Next.js build output 91 | .next 92 | out 93 | 94 | # Nuxt.js build / generate output 95 | .nuxt 96 | dist 97 | 98 | # Gatsby files 99 | .cache/ 100 | # Comment in the public line in if your project uses Gatsby and not Next.js 101 | # https://nextjs.org/blog/next-9-1#public-directory-support 102 | # public 103 | 104 | # vuepress build output 105 | .vuepress/dist 106 | 107 | # vuepress v2.x temp and cache directory 108 | .temp 109 | 110 | # Docusaurus cache and generated files 111 | .docusaurus 112 
| 113 | # Serverless directories 114 | .serverless/ 115 | 116 | # FuseBox cache 117 | .fusebox/ 118 | 119 | # DynamoDB Local files 120 | .dynamodb/ 121 | 122 | # TernJS port file 123 | .tern-port 124 | 125 | # Stores VSCode versions used for testing VSCode extensions 126 | .vscode-test 127 | 128 | # yarn v2 129 | .yarn/cache 130 | .yarn/unplugged 131 | .yarn/build-state.yml 132 | .yarn/install-state.gz 133 | .pnp.* 134 | 135 | ### Node Patch ### 136 | # Serverless Webpack directories 137 | .webpack/ 138 | 139 | # Optional stylelint cache 140 | 141 | # SvelteKit build / generate output 142 | .svelte-kit 143 | 144 | # Ignore Fly.io configuration file 145 | fly.toml 146 | 147 | # End of https://www.toptal.com/developers/gitignore/api/node -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax = docker/dockerfile:1 2 | 3 | # Adjust NODE_VERSION as desired 4 | ARG NODE_VERSION=18.9.0 5 | FROM node:${NODE_VERSION}-slim as base 6 | 7 | LABEL fly_launch_runtime="Node.js" 8 | 9 | # Node.js app lives here 10 | WORKDIR /app 11 | 12 | # Set production environment 13 | ENV NODE_ENV="production" 14 | 15 | 16 | # Throw-away build stage to reduce size of final image 17 | FROM base as build 18 | 19 | # Install packages needed to build node modules 20 | RUN apt-get update -qq && \ 21 | apt-get install -y build-essential pkg-config python-is-python3 22 | 23 | # Install node modules 24 | COPY --link package-lock.json package.json ./ 25 | RUN npm ci 26 | 27 | # Copy application code 28 | COPY --link . . 29 | 30 | 31 | # Final stage for app image 32 | FROM base 33 | 34 | # Copy built application 35 | COPY --from=build /app /app 36 | 37 | # Start the server by default, this can be overwritten at runtime 38 | EXPOSE 3000 39 | CMD [ "node", "app.js" ] 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Craig Dennis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Call GPT: Generative AI Phone Calling 2 | 3 | Wouldn't it be neat if you could build an app that allowed you to chat with ChatGPT on the phone? 4 | 5 | Twilio gives you a superpower called [Media Streams](https://twilio.com/media-streams). Media Streams provides a WebSocket connection to both sides of a phone call. You can get audio streamed to you, process it, and send audio back. 6 | 7 | This app serves as a demo exploring two services: 8 | - [Deepgram](https://deepgram.com/) for Speech to Text and Text to Speech 9 | - [OpenAI](https://openai.com) for GPT prompt completion 10 | 11 | These services combine to create a voice application that is remarkably better at transcribing, understanding, and speaking than traditional IVR systems. 12 | 13 | Features: 14 | - 🏁 Returns responses with low latency, typically 1 second, by utilizing streaming. 15 | - ❗️ Allows the user to interrupt the GPT assistant and ask a different question. 16 | - 📔 Maintains chat history with GPT. 17 | - 🛠️ Allows the GPT to call external tools. 18 | 19 | ## Setting up for Development 20 | 21 | ### Prerequisites 22 | Sign up for the following services and get an API key for each: 23 | - [Deepgram](https://console.deepgram.com/signup) 24 | - [OpenAI](https://platform.openai.com/signup) 25 | 26 | If you're hosting the app locally, we also recommend using a tunneling service like [ngrok](https://ngrok.com) so that Twilio can forward audio to your app. 27 | 28 | ### 1. Start Ngrok 29 | Start an [ngrok](https://ngrok.com) tunnel for port `3000`: 30 | 31 | ```bash 32 | ngrok http 3000 33 | ``` 34 | Ngrok will give you a unique URL, like `abc123.ngrok.io`. Copy the URL without http:// or https://. You'll need this URL in the next step. 35 | 36 | ### 2. Configure Environment Variables 37 | Copy `.env.example` to `.env` and configure the following environment variables: 38 | 39 | ```bash 40 | # Your ngrok or server URL 41 | # E.g. 123.ngrok.io or myserver.fly.dev (exclude https://) 42 | SERVER="yourserverdomain.com" 43 | 44 | # Service API Keys 45 | OPENAI_API_KEY="sk-XXXXXX" 46 | DEEPGRAM_API_KEY="YOUR-DEEPGRAM-API-KEY" 47 | 48 | # Configure your Twilio credentials if you want 49 | # to make test calls using '$ npm run outbound'. 50 | TWILIO_ACCOUNT_SID="YOUR-ACCOUNT-SID" 51 | TWILIO_AUTH_TOKEN="YOUR-AUTH-TOKEN" 52 | FROM_NUMBER='+12223334444' 53 | YOUR_NUMBER='+14445556666' 54 | ``` 55 | 56 | ### 3. Install Dependencies with NPM 57 | Install the necessary packages: 58 | 59 | ```bash 60 | npm install 61 | ``` 62 | 63 | ### 4. Start Your Server in Development Mode 64 | Run the following command: 65 | ```bash 66 | npm run dev 67 | ``` 68 | This will start your app using `nodemon`, so any change to your code automatically restarts the server. 69 | 70 | ### 5. Configure an Incoming Phone Number 71 | 72 | Connect a phone number using the [Twilio Console](https://console.twilio.com/us1/develop/phone-numbers/manage/incoming). 73 | 74 | You can also use the Twilio CLI: 75 | 76 | ```bash 77 | twilio phone-numbers:update +1[your-twilio-number] --voice-url=https://your-server.ngrok.io/incoming 78 | ``` 79 | This configuration tells Twilio to send incoming call audio to your app when someone calls your number. The app responds to the incoming call webhook with a [Stream](https://www.twilio.com/docs/voice/twiml/stream) TwiML verb that connects an audio media stream to your WebSocket server.
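For reference, the TwiML built by the `/incoming` route in `app.js` renders roughly like this (the host is filled in from your `SERVER` environment variable, so `yourserverdomain.com` here is just a placeholder):

```xml
<Response>
  <Connect>
    <Stream url="wss://yourserverdomain.com/connection" />
  </Connect>
</Response>
```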
80 | 81 | ## Application Workflow 82 | CallGPT coordinates the data flow among several services, including Deepgram, OpenAI, and Twilio Media Streams: 83 | ![Call GPT Flow](https://github.com/twilio-labs/call-gpt/assets/1418949/0b7fcc0b-d5e5-4527-bc4c-2ffb8931139c) 84 | 85 | 86 | ## Modifying the ChatGPT Context & Prompt 87 | Within `gpt-service.js` you'll find the settings for the GPT's initial context and prompt. For example: 88 | 89 | ```javascript 90 | this.userContext = [ 91 | { "role": "system", "content": "You are an outbound sales representative selling Apple Airpods. You have a youthful and cheery personality. Keep your responses as brief as possible but make every attempt to keep the caller on the phone without being rude. Don't ask more than 1 question at a time. Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous. Speak out all prices to include the currency. Please help them decide between the airpods, airpods pro and airpods max by asking questions like 'Do you prefer headphones that go in your ear or over the ear?'. If they are trying to choose between the airpods and airpods pro try asking them if they need noise canceling. Once you know which model they would like ask them how many they would like to purchase and try to get them to place an order. Add a '•' symbol every 5 to 10 words at natural pauses where your response can be split for text to speech." }, 92 | { "role": "assistant", "content": "Hello! I understand you're looking for a pair of AirPods, is that correct?" }, 93 | ], 94 | ``` 95 | ### About the `system` Attribute 96 | The `system` attribute is background information for the GPT. As you build your use-case, play around with modifying the context. A good starting point would be to imagine training a new employee on their first day and giving them the basics of how to help a customer. 97 | 98 | There are some context prompts that will likely be helpful to include by default. For example: 99 | 100 | - You have a [cheerful, wise, empathetic, etc.] personality. 101 | - Keep your responses as brief as possible but make every attempt to keep the caller on the phone without being rude. 102 | - Don't ask more than 1 question at a time. 103 | - Don't make assumptions about what values to plug into functions. 104 | - Ask for clarification if a user request is ambiguous. 105 | - Add a '•' symbol every 5 to 10 words at natural pauses where your response can be split for text to speech. 106 | 107 | These context items help shape a GPT so that it will act more naturally in a phone conversation. 108 | 109 | The `•` symbol context in particular helps the app break sentences into natural chunks. This speeds up text-to-speech processing so that users hear audio faster. 110 | 111 | ### About the `content` Attribute 112 | This attribute is your default conversation starter for the GPT. However, you could consider making it more complex and customizing it based on personalized user data. 113 | 114 | In this case, our bot will start off by saying, "Hello! I understand you're looking for a pair of AirPods, is that correct?" 115 | 116 | ## Using Function Calls with GPT 117 | You can use function calls to interact with external APIs and data sources.
For example, your GPT could check live inventory, check an item's price, or place an order. 118 | 119 | ### How Function Calling Works 120 | Function calling is handled within the `gpt-service.js` file in the following sequence: 121 | 122 | 1. `gpt-service` loads `function-manifest.js` and requires (imports) all functions defined there from the `functions` directory. Our app will call these functions later when GPT gives us a function name and parameters. 123 | ```javascript 124 | tools.forEach((tool) => { 125 | const functionName = tool.function.name; 126 | availableFunctions[functionName] = require(`../functions/${functionName}`); 127 | }); 128 | ``` 129 | 130 | 2. When we call GPT for completions, we also pass in the same `function-manifest` JSON as the tools parameter. This allows the GPT to "know" what functions are available: 131 | 132 | ```javascript 133 | const stream = await this.openai.chat.completions.create({ 134 | model: 'gpt-4', 135 | messages: this.userContext, 136 | tools, // <-- function-manifest definition 137 | stream: true, 138 | }); 139 | ``` 140 | 3. When the GPT responds, it will send us a stream of chunks for the text completion. The GPT will tell us whether each text chunk is something to say to the user, or if it's a tool call that our app needs to execute. This is indicated by the `deltas.tool_calls` key: 141 | ```javascript 142 | if (deltas.tool_calls) { 143 | // handle function calling 144 | } 145 | ``` 146 | 4. Once we have gathered all of the stream chunks about the tool call, our application can run the actual function code that we imported during the first step. The function name and parameters are provided by GPT: 147 | ```javascript 148 | const functionToCall = availableFunctions[functionName]; 149 | const functionResponse = functionToCall(functionArgs); 150 | ``` 151 | 5. As the final step, we add the function response data into the conversation context like this: 152 | 153 | ```javascript 154 | this.userContext.push({ 155 | role: 'function', 156 | name: functionName, 157 | content: functionResponse, 158 | }); 159 | ``` 160 | We then ask the GPT to generate another completion including what it knows from the function call. This allows the GPT to respond to the user with details gathered from the external data source. 161 | 162 | ### Adding Custom Function Calls 163 | You can have your GPT call external data sources by adding functions to the `/functions` directory. Follow these steps: 164 | 165 | 1. Create a function (e.g. `checkInventory.js` in `/functions`) 166 | 1. Within `checkInventory.js`, write a function called `checkInventory`. 167 | 1. Add information about your function to the `function-manifest.js` file. This information provides context to GPT about what arguments the function takes. 168 | 169 | **Important:** Your function's name must be the same as the file name that contains the function (excluding the .js extension). For example, our function is called `checkInventory` so we have named the file `checkInventory.js`, and set the `name` attribute in `function-manifest.js` to be `checkInventory`.
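For reference, here is the matching function file from this repo, `functions/checkInventory.js` — it simulates an inventory lookup and returns the result as a JSON string:

```javascript
async function checkInventory(functionArgs) {
  const model = functionArgs.model;
  console.log('GPT -> called checkInventory function');

  if (model?.toLowerCase().includes('pro')) {
    return JSON.stringify({ stock: 10 });
  } else if (model?.toLowerCase().includes('max')) {
    return JSON.stringify({ stock: 0 });
  } else {
    return JSON.stringify({ stock: 100 });
  }
}

module.exports = checkInventory;
```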
170 | 171 | Example function manifest entry: 172 | 173 | ```javascript 174 | { 175 | type: "function", 176 | function: { 177 | name: "checkInventory", 178 | say: "Let me check our inventory right now.", 179 | description: "Check the inventory of airpods, airpods pro or airpods max.", 180 | parameters: { 181 | type: "object", 182 | properties: { 183 | model: { 184 | type: "string", 185 | "enum": ["airpods", "airpods pro", "airpods max"], 186 | description: "The model of airpods, either the airpods, airpods pro or airpods max", 187 | }, 188 | }, 189 | required: ["model"], 190 | }, 191 | returns: { 192 | type: "object", 193 | properties: { 194 | stock: { 195 | type: "integer", 196 | description: "An integer containing how many of the model are currently in stock." 197 | } 198 | } 199 | } 200 | }, 201 | } 202 | ``` 203 | #### Using `say` in the Function Manifest 204 | The `say` key in the function manifest allows you to define a sentence for the app to speak to the user before calling a function. For example, if a function will take a long time to run, you might say "Give me a few moments to look that up for you..." 205 | 206 | ### Receiving Function Arguments 207 | When ChatGPT calls a function, it will provide an object with multiple attributes as a single argument. The parameters included in the object are based on the definition in your `function-manifest.js` file. 208 | 209 | In the `checkInventory` example above, `model` is a required argument, so the data passed to the function will be a single object like this: 210 | 211 | ```javascript 212 | { 213 | model: "airpods pro" 214 | } 215 | ``` 216 | For our `placeOrder` function, the arguments passed will look like this: 217 | 218 | ```javascript 219 | { 220 | model: "airpods pro", 221 | quantity: 10 222 | } 223 | ``` 224 | ### Returning Data to GPT 225 | Your function should always return a value: GPT will get confused when the function returns nothing, and may continue trying to call the function expecting an answer. If your function doesn't have any data to return to the GPT, you should still return a response with an instruction like "Tell the user that their request was processed successfully." This prevents the GPT from calling the function repeatedly and wasting tokens. 226 | 227 | Any data that you return to the GPT should match the expected format listed in the `returns` key of `function-manifest.js`. 228 | 229 | ## Utility Scripts for Placing Calls 230 | The `scripts` directory contains two files that allow you to place test calls: 231 | - `npm run inbound` will place an automated call from a Twilio number to your app and speak a script. You can adjust this to your use-case, e.g. as an automated test. 232 | - `npm run outbound` will place an outbound call that connects to your app. This can be useful if you want the app to call your phone so that you can manually test it. 233 | 234 | ## Using Eleven Labs for Text to Speech 235 | Replace the Deepgram API call and audio-buffer handling in `tts-service.js` with the following call to Eleven Labs. Note that sometimes Eleven Labs will hit a rate limit (especially on the free trial) and return 400 errors with no audio (or a clicking sound).
236 | 237 | ```javascript 238 | try { 239 | const response = await fetch( 240 | `https://api.elevenlabs.io/v1/text-to-speech/21m00Tcm4TlvDq8ikWAM/stream?output_format=ulaw_8000&optimize_streaming_latency=3`, 241 | { 242 | method: 'POST', 243 | headers: { 244 | 'xi-api-key': process.env.XI_API_KEY, 245 | 'Content-Type': 'application/json', 246 | accept: 'audio/wav', 247 | }, 248 | body: JSON.stringify({ 249 | model_id: process.env.XI_MODEL_ID, 250 | text: partialResponse, 251 | }), 252 | } 253 | ); 254 | 255 | if (response.status === 200) { 256 | const audioArrayBuffer = await response.arrayBuffer(); 257 | this.emit('speech', partialResponseIndex, Buffer.from(audioArrayBuffer).toString('base64'), partialResponse, interactionCount); 258 | } else { 259 | console.log('Eleven Labs Error:'); 260 | console.log(response); 261 | } 262 | } catch (err) { 263 | console.error('Error occurred in XI Labs TextToSpeech service'); 264 | console.error(err); 265 | } 266 | ``` 267 | 268 | 269 | ## Testing with Jest 270 | Repeatedly calling the app can be a time-consuming way to test your tool function calls. This project contains example unit tests that can help you test your functions without relying on the GPT to call them. 271 | 272 | Simple example tests are available in the `/test` directory. To run them, simply run `npm run test`. 273 | 274 | ## Deploy via Fly.io 275 | Fly.io is a hosting service similar to Heroku that simplifies the deployment process. Because Twilio Media Streams are sent and received from us-east-1, it's recommended to choose Fly's Ashburn, VA (IAD) region. 276 | 277 | > Deploying to Fly.io is not required to try the app, but can be helpful if your home internet speed is variable. 278 | 279 | Modify the app name in `fly.toml` to be a unique value (this must be globally unique).
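For example (the app name below is hypothetical — pick your own unique one):

```toml
app = 'my-callgpt-demo'
```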
280 | 281 | Deploy the app using the Fly.io CLI: 282 | ```bash 283 | fly launch 284 | 285 | fly deploy 286 | ``` 287 | 288 | Import your secrets from your .env file to your deployed app: 289 | ```bash 290 | fly secrets import < .env 291 | ``` 292 | -------------------------------------------------------------------------------- /app.js: -------------------------------------------------------------------------------- 1 | require('dotenv').config(); 2 | require('colors'); 3 | 4 | const express = require('express'); 5 | const ExpressWs = require('express-ws'); 6 | 7 | const { GptService } = require('./services/gpt-service'); 8 | const { StreamService } = require('./services/stream-service'); 9 | const { TranscriptionService } = require('./services/transcription-service'); 10 | const { TextToSpeechService } = require('./services/tts-service'); 11 | const { recordingService } = require('./services/recording-service'); 12 | 13 | const VoiceResponse = require('twilio').twiml.VoiceResponse; 14 | 15 | const app = express(); 16 | ExpressWs(app); 17 | 18 | const PORT = process.env.PORT || 3000; 19 | 20 | app.post('/incoming', (req, res) => { 21 | try { 22 | const response = new VoiceResponse(); 23 | const connect = response.connect(); 24 | connect.stream({ url: `wss://${process.env.SERVER}/connection` }); 25 | 26 | res.type('text/xml'); 27 | res.end(response.toString()); 28 | } catch (err) { 29 | console.log(err); 30 | } 31 | }); 32 | 33 | app.ws('/connection', (ws) => { 34 | try { 35 | ws.on('error', console.error); 36 | // Filled in from start message 37 | let streamSid; 38 | let callSid; 39 | 40 | const gptService = new GptService(); 41 | const streamService = new StreamService(ws); 42 | const transcriptionService = new TranscriptionService(); 43 | const ttsService = new TextToSpeechService({}); 44 | 45 | let marks = []; 46 | let interactionCount = 0; 47 | 48 | // Incoming from MediaStream 49 | ws.on('message', function message(data) { 50 | const msg = JSON.parse(data); 51 | if (msg.event === 'start') { 52 | streamSid = msg.start.streamSid; 53 | callSid = msg.start.callSid; 54 | 55 | streamService.setStreamSid(streamSid); 56 | gptService.setCallSid(callSid); 57 | 58 | // Set RECORDING_ENABLED='true' in .env to record calls 59 | recordingService(ttsService, callSid).then(() => { 60 | console.log(`Twilio -> Starting Media Stream for ${streamSid}`.underline.red); 61 | ttsService.generate({partialResponseIndex: null, partialResponse: 'Hello! 
I understand you\'re looking for a pair of AirPods, is that correct?'}, 0); 62 | }); 63 | } else if (msg.event === 'media') { 64 | transcriptionService.send(msg.media.payload); 65 | } else if (msg.event === 'mark') { 66 | const label = msg.mark.name; 67 | console.log(`Twilio -> Audio completed mark (${msg.sequenceNumber}): ${label}`.red); 68 | marks = marks.filter(m => m !== msg.mark.name); 69 | } else if (msg.event === 'stop') { 70 | console.log(`Twilio -> Media stream ${streamSid} ended.`.underline.red); 71 | } 72 | }); 73 | 74 | transcriptionService.on('utterance', async (text) => { 75 | // This is a bit of a hack to filter out empty utterances 76 | if(marks.length > 0 && text?.length > 5) { 77 | console.log('Twilio -> Interruption, Clearing stream'.red); 78 | ws.send( 79 | JSON.stringify({ 80 | streamSid, 81 | event: 'clear', 82 | }) 83 | ); 84 | } 85 | }); 86 | 87 | transcriptionService.on('transcription', async (text) => { 88 | if (!text) { return; } 89 | console.log(`Interaction ${interactionCount} – STT -> GPT: ${text}`.yellow); 90 | gptService.completion(text, interactionCount); 91 | interactionCount += 1; 92 | }); 93 | 94 | gptService.on('gptreply', async (gptReply, icount) => { 95 | console.log(`Interaction ${icount}: GPT -> TTS: ${gptReply.partialResponse}`.green ); 96 | ttsService.generate(gptReply, icount); 97 | }); 98 | 99 | ttsService.on('speech', (responseIndex, audio, label, icount) => { 100 | console.log(`Interaction ${icount}: TTS -> TWILIO: ${label}`.blue); 101 | 102 | streamService.buffer(responseIndex, audio); 103 | }); 104 | 105 | streamService.on('audiosent', (markLabel) => { 106 | marks.push(markLabel); 107 | }); 108 | } catch (err) { 109 | console.log(err); 110 | } 111 | }); 112 | 113 | app.listen(PORT); 114 | console.log(`Server running on port ${PORT}`); 115 | -------------------------------------------------------------------------------- /fly.toml.example: -------------------------------------------------------------------------------- 1 | # fly.toml app configuration file generated for cweems-genai-phone-call on 2024-03-01T14:37:33-08:00 2 | # 3 | # See https://fly.io/docs/reference/configuration/ for information about how to use this file. 4 | # 5 | 6 | app = '' 7 | 8 | # Recommend IAD for proximity to Twilio's Media Stream Servers. 9 | primary_region = 'iad' 10 | 11 | [build] 12 | 13 | [http_service] 14 | internal_port = 3000 15 | force_https = true 16 | auto_stop_machines = true 17 | auto_start_machines = true 18 | 19 | # Recommend keeping a machine running to reduce spin-up time 20 | # note this will increase hosting costs. 
21 | min_machines_running = 1 22 | processes = ['app'] 23 | 24 | [[vm]] 25 | memory = '1gb' 26 | cpu_kind = 'shared' 27 | cpus = 1 28 | -------------------------------------------------------------------------------- /functions/checkInventory.js: -------------------------------------------------------------------------------- 1 | async function checkInventory(functionArgs) { 2 | const model = functionArgs.model; 3 | console.log('GPT -> called checkInventory function'); 4 | 5 | if (model?.toLowerCase().includes('pro')) { 6 | return JSON.stringify({ stock: 10 }); 7 | } else if (model?.toLowerCase().includes('max')) { 8 | return JSON.stringify({ stock: 0 }); 9 | } else { 10 | return JSON.stringify({ stock: 100 }); 11 | } 12 | } 13 | 14 | module.exports = checkInventory; -------------------------------------------------------------------------------- /functions/checkPrice.js: -------------------------------------------------------------------------------- 1 | async function checkPrice(functionArgs) { 2 | let model = functionArgs.model; 3 | console.log('GPT -> called checkPrice function'); 4 | if (model?.toLowerCase().includes('pro')) { 5 | return JSON.stringify({ price: 249 }); 6 | } else if (model?.toLowerCase().includes('max')) { 7 | return JSON.stringify({ price: 549 }); 8 | } else { 9 | return JSON.stringify({ price: 149 }); 10 | } 11 | } 12 | 13 | module.exports = checkPrice; -------------------------------------------------------------------------------- /functions/function-manifest.js: -------------------------------------------------------------------------------- 1 | // create metadata for all the available functions to pass to completions API 2 | const tools = [ 3 | { 4 | type: 'function', 5 | function: { 6 | name: 'checkInventory', 7 | say: 'Let me check our inventory right now.', 8 | description: 'Check the inventory of airpods, airpods pro or airpods max.', 9 | parameters: { 10 | type: 'object', 11 | properties: { 12 | model: { 13 | type: 'string', 14 | 'enum': ['airpods', 'airpods pro', 'airpods max'], 15 | description: 'The model of airpods, either the airpods, airpods pro or airpods max', 16 | }, 17 | }, 18 | required: ['model'], 19 | }, 20 | returns: { 21 | type: 'object', 22 | properties: { 23 | stock: { 24 | type: 'integer', 25 | description: 'An integer containing how many of the model are currently in stock.' 
26 | } 27 | } 28 | } 29 | }, 30 | }, 31 | { 32 | type: 'function', 33 | function: { 34 | name: 'checkPrice', 35 | say: 'Let me check the price, one moment.', 36 | description: 'Check the price of given model of airpods, airpods pro or airpods max.', 37 | parameters: { 38 | type: 'object', 39 | properties: { 40 | model: { 41 | type: 'string', 42 | 'enum': ['airpods', 'airpods pro', 'airpods max'], 43 | description: 'The model of airpods, either the airpods, airpods pro or airpods max', 44 | }, 45 | }, 46 | required: ['model'], 47 | }, 48 | returns: { 49 | type: 'object', 50 | properties: { 51 | price: { 52 | type: 'integer', 53 | description: 'The price of the model' 54 | } 55 | } 56 | } 57 | }, 58 | }, 59 | { 60 | type: 'function', 61 | function: { 62 | name: 'placeOrder', 63 | say: 'All right, I\'m just going to ring that up in our system.', 64 | description: 'Places an order for a set of airpods.', 65 | parameters: { 66 | type: 'object', 67 | properties: { 68 | model: { 69 | type: 'string', 70 | 'enum': ['airpods', 'airpods pro'], 71 | description: 'The model of airpods, either the regular or pro', 72 | }, 73 | quantity: { 74 | type: 'integer', 75 | description: 'The number of airpods they want to order', 76 | }, 77 | }, 78 | required: ['model', 'quantity'], 79 | }, 80 | returns: { 81 | type: 'object', 82 | properties: { 83 | price: { 84 | type: 'integer', 85 | description: 'The total price of the order including tax' 86 | }, 87 | orderNumber: { 88 | type: 'integer', 89 | description: 'The order number associated with the order.' 90 | } 91 | } 92 | } 93 | }, 94 | }, 95 | { 96 | type: 'function', 97 | function: { 98 | name: 'transferCall', 99 | say: 'One moment while I transfer your call.', 100 | description: 'Transfers the customer to a live agent in case they request help from a real person.', 101 | parameters: { 102 | type: 'object', 103 | properties: { 104 | callSid: { 105 | type: 'string', 106 | description: 'The unique identifier for the active phone call.', 107 | }, 108 | }, 109 | required: ['callSid'], 110 | }, 111 | returns: { 112 | type: 'object', 113 | properties: { 114 | status: { 115 | type: 'string', 116 | description: 'Whether or not the customer call was successfully transferred' 117 | }, 118 | } 119 | } 120 | }, 121 | }, 122 | ]; 123 | 124 | module.exports = tools; -------------------------------------------------------------------------------- /functions/placeOrder.js: -------------------------------------------------------------------------------- 1 | async function placeOrder(functionArgs) { 2 | const {model, quantity} = functionArgs; 3 | console.log('GPT -> called placeOrder function'); 4 | 5 | // generate a random order number that is 7 digits 6 | const orderNum = Math.floor(Math.random() * (9999999 - 1000000 + 1) + 1000000); 7 | 8 | // check model and return the order number and price with 7.9% sales tax 9 | if (model?.toLowerCase().includes('pro')) { 10 | return JSON.stringify({ orderNumber: orderNum, price: Math.floor(quantity * 249 * 1.079)}); 11 | } else if (model?.toLowerCase().includes('max')) { 12 | return JSON.stringify({ orderNumber: orderNum, price: Math.floor(quantity * 549 * 1.079) }); 13 | } 14 | return JSON.stringify({ orderNumber: orderNum, price: Math.floor(quantity * 179 * 1.079) }); 15 | } 16 | 17 | module.exports = placeOrder; -------------------------------------------------------------------------------- /functions/transferCall.js: -------------------------------------------------------------------------------- 1 | require('dotenv').config(); 2 | 
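// Note: this function reads TRANSFER_NUMBER from the environment; that
// variable is not listed in .env.example, so add it to your .env yourself.
// It should be a dial-able phone number for a live agent (assumed E.164
// format, e.g. '+15556667777').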
3 | const transferCall = async function (call) { 4 | 5 | console.log('Transferring call', call.callSid); 6 | const accountSid = process.env.TWILIO_ACCOUNT_SID; 7 | const authToken = process.env.TWILIO_AUTH_TOKEN; 8 | const client = require('twilio')(accountSid, authToken); 9 | 10 | return await client.calls(call.callSid) 11 | .update({twiml: `<Response><Dial>${process.env.TRANSFER_NUMBER}</Dial></Response>`}) 12 | .then(() => { 13 | return 'The call was transferred successfully, say goodbye to the customer.'; 14 | }) 15 | .catch(() => { 16 | return 'The call was not transferred successfully, advise customer to call back later.'; 17 | }); 18 | }; 19 | 20 | module.exports = transferCall; -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "genai-phone", 3 | "version": "1.1.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "inbound": "node ./scripts/inbound-call.js", 8 | "outbound": "node ./scripts/outbound-call.js", 9 | "test": "jest", 10 | "dev": "nodemon app.js", 11 | "start": "node app.js" 12 | }, 13 | "keywords": [], 14 | "author": "Charlie Weems", 15 | "license": "MIT", 16 | "dependencies": { 17 | "@deepgram/sdk": "^3.3.4", 18 | "colors": "^1.4.0", 19 | "dotenv": "^16.3.1", 20 | "express": "^4.19.2", 21 | "express-ws": "^5.0.2", 22 | "node-fetch": "^2.7.0", 23 | "openai": "^4.20.1", 24 | "twilio": "^4.19.3", 25 | "uuid": "^9.0.1", 26 | "wavefile": "^11.0.0" 27 | }, 28 | "devDependencies": { 29 | "@flydotio/dockerfile": "^0.4.11", 30 | "eslint": "^8.57.0", 31 | "jest": "^29.7.0", 32 | "nodemon": "^3.0.2" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /scripts/inbound-call.js: -------------------------------------------------------------------------------- 1 | require('dotenv').config(); 2 | 3 | // You can use this function to make a 4 | // test call to your application by running 5 | // npm run inbound 6 | async function makeInboundCall() { 7 | const VoiceResponse = require('twilio').twiml.VoiceResponse; 8 | const accountSid = process.env.TWILIO_ACCOUNT_SID; 9 | const authToken = process.env.TWILIO_AUTH_TOKEN; 10 | 11 | const client = require('twilio')(accountSid, authToken); 12 | 13 | let twiml = new VoiceResponse(); 14 | twiml.pause({ length: 10 }); 15 | twiml.say('Which models of airpods do you have available right now?'); 16 | twiml.pause({ length: 30 }); 17 | twiml.hangup(); 18 | 19 | console.log(twiml.toString()); 20 | 21 | await client.calls 22 | .create({ 23 | twiml: twiml.toString(), 24 | to: process.env.APP_NUMBER, 25 | from: process.env.FROM_NUMBER 26 | }) 27 | .then(call => console.log(call.sid)); 28 | } 29 | 30 | makeInboundCall(); -------------------------------------------------------------------------------- /scripts/outbound-call.js: -------------------------------------------------------------------------------- 1 | /* 2 | You can use this script to place an outbound call 3 | to your own mobile phone. 
4 | */ 5 | 6 | require('dotenv').config(); 7 | 8 | async function makeOutBoundCall() { 9 | const accountSid = process.env.TWILIO_ACCOUNT_SID; 10 | const authToken = process.env.TWILIO_AUTH_TOKEN; 11 | 12 | const client = require('twilio')(accountSid, authToken); 13 | 14 | await client.calls 15 | .create({ 16 | url: `https://${process.env.SERVER}/incoming`, 17 | to: process.env.YOUR_NUMBER, 18 | from: process.env.FROM_NUMBER 19 | }) 20 | .then(call => console.log(call.sid)); 21 | } 22 | 23 | makeOutBoundCall(); -------------------------------------------------------------------------------- /services/gpt-service.js: -------------------------------------------------------------------------------- 1 | require('colors'); 2 | const EventEmitter = require('events'); 3 | const OpenAI = require('openai'); 4 | const tools = require('../functions/function-manifest'); 5 | 6 | // Import all functions included in function manifest 7 | // Note: the function name and file name must be the same 8 | const availableFunctions = {}; 9 | tools.forEach((tool) => { 10 | let functionName = tool.function.name; 11 | availableFunctions[functionName] = require(`../functions/${functionName}`); 12 | }); 13 | 14 | class GptService extends EventEmitter { 15 | constructor() { 16 | super(); 17 | this.openai = new OpenAI(); 18 | this.userContext = [ 19 | { 'role': 'system', 'content': 'You are an outbound sales representative selling Apple Airpods. You have a youthful and cheery personality. Keep your responses as brief as possible but make every attempt to keep the caller on the phone without being rude. Don\'t ask more than 1 question at a time. Don\'t make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous. Speak out all prices to include the currency. Please help them decide between the airpods, airpods pro and airpods max by asking questions like \'Do you prefer headphones that go in your ear or over the ear?\'. If they are trying to choose between the airpods and airpods pro try asking them if they need noise canceling. Once you know which model they would like ask them how many they would like to purchase and try to get them to place an order. You must add a \'•\' symbol every 5 to 10 words at natural pauses where your response can be split for text to speech.' }, 20 | { 'role': 'assistant', 'content': 'Hello! I understand you\'re looking for a pair of AirPods, is that correct?' }, 21 | ]; 22 | this.partialResponseIndex = 0; 23 | } 24 | 25 | // Add the callSid to the chat context in case 26 | // ChatGPT decides to transfer the call. 
27 | setCallSid (callSid) { 28 | this.userContext.push({ 'role': 'system', 'content': `callSid: ${callSid}` }); 29 | } 30 | 31 | validateFunctionArgs (args) { 32 | try { 33 | return JSON.parse(args); 34 | } catch (error) { 35 | console.log('Warning: Double function arguments returned by OpenAI:', args); 36 | // Seeing an error where sometimes we have two sets of args 37 | if (args.indexOf('{') != args.lastIndexOf('{')) { 38 | return JSON.parse(args.substring(args.indexOf('{'), args.indexOf('}') + 1)); 39 | } 40 | } 41 | } 42 | 43 | updateUserContext(name, role, text) { 44 | if (name !== 'user') { 45 | this.userContext.push({ 'role': role, 'name': name, 'content': text }); 46 | } else { 47 | this.userContext.push({ 'role': role, 'content': text }); 48 | } 49 | } 50 | 51 | async completion(text, interactionCount, role = 'user', name = 'user') { 52 | this.updateUserContext(name, role, text); 53 | 54 | // Step 1: Send user transcription to Chat GPT 55 | const stream = await this.openai.chat.completions.create({ 56 | model: 'gpt-4-1106-preview', 57 | messages: this.userContext, 58 | tools: tools, 59 | stream: true, 60 | }); 61 | 62 | let completeResponse = ''; 63 | let partialResponse = ''; 64 | let functionName = ''; 65 | let functionArgs = ''; 66 | let finishReason = ''; 67 | 68 | function collectToolInformation(deltas) { 69 | let name = deltas.tool_calls[0]?.function?.name || ''; 70 | if (name != '') { 71 | functionName = name; 72 | } 73 | let args = deltas.tool_calls[0]?.function?.arguments || ''; 74 | if (args != '') { 75 | // args are streamed as JSON string so we need to concatenate all chunks 76 | functionArgs += args; 77 | } 78 | } 79 | 80 | for await (const chunk of stream) { 81 | let content = chunk.choices[0]?.delta?.content || ''; 82 | let deltas = chunk.choices[0].delta; 83 | finishReason = chunk.choices[0].finish_reason; 84 | 85 | // Step 2: check if GPT wanted to call a function 86 | if (deltas.tool_calls) { 87 | // Step 3: Collect the tokens containing function data 88 | collectToolInformation(deltas); 89 | } 90 | 91 | // need to call function on behalf of Chat GPT with the arguments it parsed from the conversation 92 | if (finishReason === 'tool_calls') { 93 | // parse JSON string of args into JSON object 94 | 95 | const functionToCall = availableFunctions[functionName]; 96 | const validatedArgs = this.validateFunctionArgs(functionArgs); 97 | 98 | // Say a pre-configured message from the function manifest 99 | // before running the function. 
100 | const toolData = tools.find(tool => tool.function.name === functionName); 101 | const say = toolData.function.say; 102 | 103 | this.emit('gptreply', { 104 | partialResponseIndex: null, 105 | partialResponse: say 106 | }, interactionCount); 107 | 108 | let functionResponse = await functionToCall(validatedArgs); 109 | 110 | // Step 4: send the info on the function call and function response to GPT 111 | this.updateUserContext(functionName, 'function', functionResponse); 112 | 113 | // call the completion function again but pass in the function response to have OpenAI generate a new assistant response 114 | await this.completion(functionResponse, interactionCount, 'function', functionName); 115 | } else { 116 | // We use completeResponse for userContext 117 | completeResponse += content; 118 | // We use partialResponse to provide a chunk for TTS 119 | partialResponse += content; 120 | // Emit last partial response and add complete response to userContext 121 | if (content.trim().slice(-1) === '•' || finishReason === 'stop') { 122 | const gptReply = { 123 | partialResponseIndex: this.partialResponseIndex, 124 | partialResponse 125 | }; 126 | 127 | this.emit('gptreply', gptReply, interactionCount); 128 | this.partialResponseIndex++; 129 | partialResponse = ''; 130 | } 131 | } 132 | } 133 | this.userContext.push({'role': 'assistant', 'content': completeResponse}); 134 | console.log(`GPT -> user context length: ${this.userContext.length}`.green); 135 | } 136 | } 137 | 138 | module.exports = { GptService }; 139 | -------------------------------------------------------------------------------- /services/recording-service.js: -------------------------------------------------------------------------------- 1 | 2 | require('colors'); 3 | 4 | async function recordingService(ttsService, callSid) { 5 | try { 6 | if (process.env.RECORDING_ENABLED === 'true') { 7 | const client = require('twilio')(process.env.TWILIO_ACCOUNT_SID, process.env.TWILIO_AUTH_TOKEN); 8 | 9 | ttsService.generate({partialResponseIndex: null, partialResponse: 'This call will be recorded.'}, 0); 10 | const recording = await client.calls(callSid) 11 | .recordings 12 | .create({ 13 | recordingChannels: 'dual' 14 | }); 15 | 16 | console.log(`Recording Created: ${recording.sid}`.red); 17 | } 18 | } catch (err) { 19 | console.log(err); 20 | } 21 | } 22 | 23 | module.exports = { recordingService }; -------------------------------------------------------------------------------- /services/stream-service.js: -------------------------------------------------------------------------------- 1 | const EventEmitter = require('events'); 2 | const uuid = require('uuid'); 3 | 4 | class StreamService extends EventEmitter { 5 | constructor(websocket) { 6 | super(); 7 | this.ws = websocket; 8 | this.expectedAudioIndex = 0; 9 | this.audioBuffer = {}; 10 | this.streamSid = ''; 11 | } 12 | 13 | setStreamSid (streamSid) { 14 | this.streamSid = streamSid; 15 | } 16 | 17 | buffer (index, audio) { 18 | // Escape hatch for intro message, which doesn't have an index 19 | if(index === null) { 20 | this.sendAudio(audio); 21 | } else if(index === this.expectedAudioIndex) { 22 | this.sendAudio(audio); 23 | this.expectedAudioIndex++; 24 | 25 | while(Object.prototype.hasOwnProperty.call(this.audioBuffer, this.expectedAudioIndex)) { 26 | const bufferedAudio = this.audioBuffer[this.expectedAudioIndex]; 27 | this.sendAudio(bufferedAudio); 28 | this.expectedAudioIndex++; 29 | } 30 | } else { 31 | this.audioBuffer[index] = audio; 32 | } 33 | } 34 | 35 | sendAudio 
(audio) { 36 | this.ws.send( 37 | JSON.stringify({ 38 | streamSid: this.streamSid, 39 | event: 'media', 40 | media: { 41 | payload: audio, 42 | }, 43 | }) 44 | ); 45 | // When the media completes you will receive a `mark` message with the label 46 | const markLabel = uuid.v4(); 47 | this.ws.send( 48 | JSON.stringify({ 49 | streamSid: this.streamSid, 50 | event: 'mark', 51 | mark: { 52 | name: markLabel 53 | } 54 | }) 55 | ); 56 | this.emit('audiosent', markLabel); 57 | } 58 | } 59 | 60 | module.exports = {StreamService}; -------------------------------------------------------------------------------- /services/transcription-service.js: -------------------------------------------------------------------------------- 1 | require('colors'); 2 | const { createClient, LiveTranscriptionEvents } = require('@deepgram/sdk'); 3 | const { Buffer } = require('node:buffer'); 4 | const EventEmitter = require('events'); 5 | 6 | 7 | class TranscriptionService extends EventEmitter { 8 | constructor() { 9 | super(); 10 | const deepgram = createClient(process.env.DEEPGRAM_API_KEY); 11 | this.dgConnection = deepgram.listen.live({ 12 | encoding: 'mulaw', 13 | sample_rate: '8000', 14 | model: 'nova-2', 15 | punctuate: true, 16 | interim_results: true, 17 | endpointing: 200, 18 | utterance_end_ms: 1000 19 | }); 20 | 21 | this.finalResult = ''; 22 | this.speechFinal = false; // used to determine if we have seen speech_final=true indicating that deepgram detected a natural pause in the speaker's speech. 23 | 24 | this.dgConnection.on(LiveTranscriptionEvents.Open, () => { 25 | this.dgConnection.on(LiveTranscriptionEvents.Transcript, (transcriptionEvent) => { 26 | const alternatives = transcriptionEvent.channel?.alternatives; 27 | let text = ''; 28 | if (alternatives) { 29 | text = alternatives[0]?.transcript; 30 | } 31 | 32 | // if we receive an UtteranceEnd and speech_final has not already happened then we should consider this the end of the human speech and emit the transcription 33 | if (transcriptionEvent.type === 'UtteranceEnd') { 34 | if (!this.speechFinal) { 35 | console.log(`UtteranceEnd received before speechFinal, emit the text collected so far: ${this.finalResult}`.yellow); 36 | this.emit('transcription', this.finalResult); 37 | return; 38 | } else { 39 | console.log('STT -> Speech was already final when UtteranceEnd received'.yellow); 40 | return; 41 | } 42 | } 43 | 44 | // console.log(text, "is_final: ", transcription?.is_final, "speech_final: ", transcription.speech_final); 45 | // if is_final that means that this chunk of the transcription is accurate and we need to add it to the finalResult 46 | if (transcriptionEvent.is_final === true && text.trim().length > 0) { 47 | this.finalResult += ` ${text}`; 48 | // if speech_final and is_final that means this text is accurate and it's a natural pause in the speaker's speech. 
We need to send this to the assistant for processing 49 | if (transcriptionEvent.speech_final === true) { 50 | this.speechFinal = true; // this will prevent an utterance end which shows up after speechFinal from sending another response 51 | this.emit('transcription', this.finalResult); 52 | this.finalResult = ''; 53 | } else { 54 | // if we receive a message without speech_final, reset speechFinal to false; this will allow any subsequent utteranceEnd messages to properly indicate the end of a message 55 | this.speechFinal = false; 56 | } 57 | } else { 58 | this.emit('utterance', text); 59 | } 60 | }); 61 | 62 | this.dgConnection.on(LiveTranscriptionEvents.Error, (error) => { 63 | console.error('STT -> deepgram error'); 64 | console.error(error); 65 | }); 66 | 67 | this.dgConnection.on(LiveTranscriptionEvents.Warning, (warning) => { 68 | console.error('STT -> deepgram warning'); 69 | console.error(warning); 70 | }); 71 | 72 | this.dgConnection.on(LiveTranscriptionEvents.Metadata, (metadata) => { 73 | console.error('STT -> deepgram metadata'); 74 | console.error(metadata); 75 | }); 76 | 77 | this.dgConnection.on(LiveTranscriptionEvents.Close, () => { 78 | console.log('STT -> Deepgram connection closed'.yellow); 79 | }); 80 | }); 81 | } 82 | 83 | /** 84 | * Send the payload to Deepgram 85 | * @param {String} payload A base64 MULAW/8000 audio stream 86 | */ 87 | send(payload) { 88 | if (this.dgConnection.getReadyState() === 1) { 89 | this.dgConnection.send(Buffer.from(payload, 'base64')); 90 | } 91 | } 92 | } 93 | 94 | module.exports = { TranscriptionService }; -------------------------------------------------------------------------------- /services/tts-service.js: -------------------------------------------------------------------------------- 1 | require('dotenv').config(); 2 | const { Buffer } = require('node:buffer'); 3 | const EventEmitter = require('events'); 4 | const fetch = require('node-fetch'); 5 | 6 | class TextToSpeechService extends EventEmitter { 7 | constructor() { 8 | super(); 9 | this.nextExpectedIndex = 0; 10 | this.speechBuffer = {}; 11 | } 12 | 13 | async generate(gptReply, interactionCount) { 14 | const { partialResponseIndex, partialResponse } = gptReply; 15 | 16 | if (!partialResponse) { return; } 17 | 18 | try { 19 | const response = await fetch( 20 | `https://api.deepgram.com/v1/speak?model=${process.env.VOICE_MODEL}&encoding=mulaw&sample_rate=8000&container=none`, 21 | { 22 | method: 'POST', 23 | headers: { 24 | 'Authorization': `Token ${process.env.DEEPGRAM_API_KEY}`, 25 | 'Content-Type': 'application/json', 26 | }, 27 | body: JSON.stringify({ 28 | text: partialResponse, 29 | }), 30 | } 31 | ); 32 | 33 | if (response.status === 200) { 34 | try { 35 | const blob = await response.blob(); 36 | const audioArrayBuffer = await blob.arrayBuffer(); 37 | const base64String = Buffer.from(audioArrayBuffer).toString('base64'); 38 | this.emit('speech', partialResponseIndex, base64String, partialResponse, interactionCount); 39 | } catch (err) { 40 | console.log(err); 41 | } 42 | } else { 43 | console.log('Deepgram TTS error:'); 44 | console.log(response); 45 | } 46 | } catch (err) { 47 | console.error('Error occurred in TextToSpeech service'); 48 | console.error(err); 49 | } 50 | } 51 | } 52 | 53 | module.exports = { TextToSpeechService }; -------------------------------------------------------------------------------- /test/checkInventory.test.js: -------------------------------------------------------------------------------- 1 | const checkInventory = 
require('../functions/checkInventory'); 2 | 3 | test('Expect Airpods Pro to have 10 units', async () => { 4 | expect(await checkInventory({model: 'airpods pro'})).toBe('{"stock":10}'); 5 | }); 6 | 7 | test('Expect Airpods Max to have 0 units', async () => { 8 | expect(await checkInventory({model: 'airpods max'})).toBe('{"stock":0}'); 9 | }); 10 | 11 | test('Expect all other values to have 100 units', async () => { 12 | expect(await checkInventory({model: 'anything'})).toBe('{"stock":100}'); 13 | }); -------------------------------------------------------------------------------- /test/checkPrice.test.js: -------------------------------------------------------------------------------- 1 | const checkPrice = require('../functions/checkPrice'); 2 | 3 | test('Expect Airpods Pro to cost $249', async () => { 4 | expect(await checkPrice({model: 'airpods pro'})).toBe('{"price":249}'); 5 | }); 6 | 7 | test('Expect Airpods Max to cost $549', async () => { 8 | expect(await checkPrice({model: 'airpods max'})).toBe('{"price":549}'); 9 | }); 10 | 11 | test('Expect all other models to cost $149', async () => { 12 | expect(await checkPrice({model: 'anything'})).toBe('{"price":149}'); 13 | }); -------------------------------------------------------------------------------- /test/placeOrder.test.js: -------------------------------------------------------------------------------- 1 | const placeOrder = require('../functions/placeOrder'); 2 | 3 | test('Expect placeOrder to return an object with a price and order number', async () => { 4 | const order = JSON.parse(await placeOrder({model: 'airpods pro', quantity: 10})); 5 | 6 | expect(order).toHaveProperty('orderNumber'); 7 | expect(order).toHaveProperty('price'); 8 | }); -------------------------------------------------------------------------------- /test/transferCall.test.js: -------------------------------------------------------------------------------- 1 | require('dotenv').config(); 2 | const setTimeout = require('timers/promises').setTimeout; 3 | const transferCall = require('../functions/transferCall'); 4 | 5 | test('Expect transferCall to successfully redirect call', async () => { 6 | 7 | async function makeOutBoundCall() { 8 | const accountSid = process.env.TWILIO_ACCOUNT_SID; 9 | const authToken = process.env.TWILIO_AUTH_TOKEN; 10 | 11 | const client = require('twilio')(accountSid, authToken); 12 | 13 | const sid = await client.calls 14 | .create({ 15 | url: `https://${process.env.SERVER}/incoming`, 16 | to: process.env.YOUR_NUMBER, 17 | from: process.env.FROM_NUMBER 18 | }) 19 | .then(call => call.sid); 20 | 21 | return sid; 22 | } 23 | 24 | const callSid = await makeOutBoundCall(); 25 | console.log(callSid); 26 | await setTimeout(10000); 27 | 28 | const transferResult = await transferCall({ callSid }); 29 | 30 | expect(transferResult).toBe('The call was transferred successfully, say goodbye to the customer.'); 31 | }, 20000); --------------------------------------------------------------------------------