├── .gitignore
├── LICENSE
├── README.md
├── autoload
    ├── vim_speech.vim
    └── vim_speech
    │   └── statusline.vim
├── install.sh
├── plugin
    ├── speech_to_text_client.py
    └── vim-speech.vim
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
1 | venv
2 | *.pyc
3 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2018 w0rp <devw0rp@gmail.com>
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # vim-speech
  2 | 
  3 | This project is an attempt at getting some basic speech to text processing
  4 | working in Vim using Google's cloud services.
  5 | 
  6 | **NOTE:** This project is a proof of concept.
  7 | 
  8 | **NOTE:** To use this plugin, you will probably need to pay Google money, at
  9 | least eventually.
 10 | 
 11 | This project uses an MIT licence to allow you to basically do what you want.
 12 | 
 13 | Click the image below to watch a video demonstration.
 14 | 
 15 | [![vim-speech video demo](https://img.youtube.com/vi/UtInOI7LluA/0.jpg)](http://www.youtube.com/watch?v=UtInOI7LluA "vim-speech video demo")
 16 | 
 17 | ## Installation
 18 | 
 19 | Add the directory for this git project to `runtimepath` for Vim somehow.
 20 | You can load the plugin in Vim 8 easily with the built-in plugin mechanism by
 21 | storing it in a path like the following:
 22 | 
 23 | ```
 24 | ~/.vim/pack/git-plugins/start/vim-speech
 25 | ```
 26 | 
 27 | You will also need to install [ALE](https://github.com/w0rp/ale), as this plugin
 28 | currently uses functions from ALE, purely so the plugin could be written more
 29 | quickly. Follow the [instructions for installing ALE](https://github.com/w0rp/ale#installation).
 30 | 
 31 | After the plugin has been installed, you'll need to install all of the
 32 | requirements for your system and build the virtualenv that the project uses
 33 | for the Python speech to text client. You will need...
 34 | 
 35 | 1. Python 2.7 with `virtualenv` installed.
 36 | 2. Google's `google-cloud-sdk` tools.
 37 | 3. `libportaudio2` and `portaudio19-dev` for audio recording.
 38 | 
 39 | You can run the following to set up everything, including installing packages
 40 | on Ubuntu:
 41 | 
 42 | ```
 43 | cd ~/.vim/pack/git-plugins/start/vim-speech
 44 | ./install.sh
 45 | ```
 46 | 
 47 | If you don't like running scripts from the Internet, _as you shouldn't_, go read
 48 | `install.sh`, look at what it does, and figure it out.
 49 | 
 50 | After the Python script has been set up, you will need to tell Vim and the
 51 | script where your Google application credentials are by setting an environment
 52 | variable. The easiest way to do this is to add a line to your `vimrc` file.
 53 | 
 54 | ```
 55 | " This is how I specify the path to the JSON credentials file.
 56 | let $GOOGLE_APPLICATION_CREDENTIALS = $HOME
 57 | \   . '/content/application/speech-to-text-key.json'
 58 | ```
 59 | 
 60 | You have to register a Google cloud service at https://cloud.google.com/ for any
 61 | of this to work. You will be given such a JSON credentials file after you
 62 | register a project with access to the "Cloud Speech API." See Google's
 63 | speech-to-text demo site for more information:
 64 | https://cloud.google.com/speech-to-text/
 65 | 
 66 | ## Usage
 67 | 
 68 | Once you have figured out how to get everything installed, you can use the
 69 | following commands in Vim for recording speech.
 70 | 
 71 | | Command         | Description                                               |
 72 | | --------------- | --------------------------------------------------------- |
 73 | | `:SpeechRecord` | Start recording, and start the job if needed.             |
 74 | | `:SpeechStop`   | Stop recording, and print the output to your buffer.      |
 75 | | `:SpeechQuit`   | Stop the background job and free some memory.             |
 76 | 
 77 | If you don't see any text being outputted into your buffer, you're probably just
 78 | recording from the wrong device on your machine. Mess around in `pavucontrol` or
 79 | whatever device selection application you have until it works.
 80 | 
 81 | ## Running the speech to text client outside of Vim
 82 | 
 83 | Run the script from a terminal where your `GOOGLE_APPLICATION_CREDENTIALS`
 84 | environment variable is set. For example:
 85 | 
 86 | ```bash
 87 | # ~/whatever.json won't work, so use $HOME/whatever.json.
 88 | export GOOGLE_APPLICATION_CREDENTIALS="$HOME/whatever.json"
 89 | ```
 90 | 
 91 | Run `plugin/speech_to_text_client.py` to start the speech-to-text client
 92 | recording audio. It uses a simple text protocol which accepts the following
 93 | commands as lines of input, in a case-insensitive manner.
 94 | 
 95 | | Command        | Description                                                |
 96 | | -------------- | ---------------------------------------------------------- |
 97 | | `record`       | Start recording audio.                                     |
 98 | | `stop`         | Stop recording audio, and get the text from Google.        |
 99 | 
100 | The protocol will respond with the following lines.
101 | 
102 | | Response       | Description                                                |
103 | | -------------- | ---------------------------------------------------------- |
104 | | `record start` | Signals when recording starts.                             |
105 | | `record end`   | Signals when recording ends.                               |
106 | | `speech ...`   | Text data returned from Google.                            |
107 | 
108 | The client will catch SIGINT and stop the client as soon as possible, in a safe
109 | manner. Debug information may be written to stderr. The client won't work at all
110 | on operating systems that aren't Unix-like.
111 | 
112 | Nothing might be coming out from the voice samples when you try to record
113 | speech. If this happens, mess around with `pavucontrol` and select different
114 | audio devices while recording is live. You're probably using the wrong audio
115 | device.
116 | 


--------------------------------------------------------------------------------
/autoload/vim_speech.vim:
--------------------------------------------------------------------------------
  1 | "Author: w0rp <devw0rp@gmail.com>
  2 | "Description: The main file for implementing Vim speech-to-text control.
  3 | 
  4 | " The state below is only given a value when it doesn't exist yet, so
  5 | " sourcing this file again keeps the job ID and the recording flag.
  6 | 
  7 | " The ID of the speech client job. 0 means that no job is running.
  8 | let s:job_id = get(s:, 'job_id', 0)
  9 | 
 10 | " The number of the buffer which transcribed text may be written to.
 11 | let s:buffer_to_write_to = get(s:, 'buffer_to_write_to', 0)
 12 | 
 13 | " Public state for the plugin. 'recording' is 1 while audio is being
 14 | " recorded, and it is what vim_speech#statusline#GetStatus() checks.
 15 | let g:vim_speech_info = get(g:, 'vim_speech_info', {'recording': 0})
 17 | 
 18 | " Exit callback for the speech client job.
 19 | "
 20 | " a:job_id    - the ID of the job which exited.
 21 | " a:exit_code - the exit code of the job, which is not used.
 22 | function! s:HandleExit(job_id, exit_code) abort
 23 |     " Ignore jobs other than the current one, such as a job that
 24 |     " vim_speech#Quit() has already forgotten about.
 25 |     if a:job_id isnot s:job_id
 26 |         return
 27 |     endif
 28 | 
 29 |     " Clear the ID so we start a new job the next time we record.
 30 |     let s:job_id = 0
 31 |     " A client that has exited can't be recording. Without this, the status
 32 |     " line and vim_speech#ToggleRecording() stay stuck in the recording state.
 33 |     let g:vim_speech_info.recording = 0
 34 | endfunction
 24 | 
 25 | " Insert transcribed text into the current line, just after the character
 26 | " under the cursor, and move the cursor to the end of the new text.
 27 | "
 28 | " Nothing happens when a:speech is empty, or when the current buffer is not
 29 | " the one that was current when recording was stopped.
 30 | function! s:HandleSpeech(speech) abort
 31 |     if empty(a:speech)
 32 |         " Do nothing when we get nothing back.
 33 |         return
 34 |     endif
 35 | 
 36 |     if s:buffer_to_write_to isnot bufnr('')
 37 |         return
 38 |     endif
 39 | 
 40 |     " Get the cursor's position and the text for the line.
 41 |     let l:pos = getcurpos()
 42 |     let l:line = getline(l:pos[1])
 43 | 
 44 |     " The cursor column is a 1-based byte offset, while strings are indexed
 45 |     " from 0, so the character under the cursor starts at l:pos[2] - 1.
 46 |     " Split the line right after that character, measuring it in bytes so a
 47 |     " multi-byte character is never cut in half, and so no byte ends up in
 48 |     " both halves of the line.
 49 |     let l:char = matchstr(l:line, '.', l:pos[2] - 1)
 50 |     let l:split = l:pos[2] - 1 + len(l:char)
 51 |     let l:before = strpart(l:line, 0, l:split)
 52 |     let l:after = strpart(l:line, l:split)
 53 |     let l:inserted = a:speech
 54 | 
 55 |     " Add a space before the words, if we need to.
 56 |     if !empty(l:before) && l:before !~? ' $'
 57 |         let l:inserted = ' ' . l:inserted
 58 |     endif
 59 | 
 60 |     " Add a space after the words, if we need to.
 61 |     if !empty(l:after) && l:after !~? '^ '
 62 |         let l:inserted = l:inserted . ' '
 63 |     endif
 64 | 
 65 |     " Put the cursor on the last byte of the inserted text.
 66 |     let l:pos[2] = l:split + len(l:inserted)
 67 | 
 68 |     " Update the line and the cursor's position.
 69 |     call setline(l:pos[1], l:before . l:inserted . l:after)
 70 |     call setpos('.', l:pos)
 71 | endfunction
 58 | 
 59 | " Output callback for the speech to text client.
 60 | "
 61 | " Lines of the form 'speech <text>' pass <text> on to s:HandleSpeech(), and
 62 | " every other line is ignored.
 63 | function! s:HandleResponseLine(job_id, line) abort
 64 |     let l:groups = matchlist(a:line, '\v^(speech) (.*)$')
 65 |     " An empty list means the line isn't a response handled here.
 66 |     if !empty(l:groups)
 67 |         let [l:name, l:payload] = l:groups[1 : 2]
 68 | 
 69 |         if l:name =~? '^speech$'
 70 |             call s:HandleSpeech(l:payload)
 71 |         endif
 72 |     endif
 73 | endfunction
 74 | 
 75 | " Error callback for the speech to text client.
 76 | " The body is empty, so lines the client writes to stderr are discarded.
 77 | function! s:HandleErrorLine(job_id, line) abort
 78 | endfunction
 77 | 
 78 | " Start the speech to text client as a job, unless one is already running.
 79 | "
 80 | " a:buffer - the buffer number handed to ale#job#PrepareCommand().
 81 | "
 82 | " Throws an error when $GOOGLE_APPLICATION_CREDENTIALS is empty.
 83 | function! s:StartJobIfNeeded(buffer) abort
 84 |     if s:job_id > 0
 85 |         return
 86 |     endif
 87 | 
 88 |     if empty($GOOGLE_APPLICATION_CREDENTIALS)
 89 |         throw 'GOOGLE_APPLICATION_CREDENTIALS is not set'
 90 |     endif
 91 | 
 92 |     " Run the client script with the Python from the plugin's virtualenv.
 93 |     let l:python = ale#Escape(g:vim_speech_dir . '/venv/bin/python')
 94 |     let l:script = ale#Escape(g:vim_speech_dir . '/speech_to_text_client.py')
 95 |     let l:shell_command = l:python . ' ' . l:script
 96 | 
 97 |     let l:callbacks = {
 98 |     \   'mode': 'nl',
 99 |     \   'out_cb': function('s:HandleResponseLine'),
100 |     \   'err_cb': function('s:HandleErrorLine'),
101 |     \   'exit_cb': function('s:HandleExit'),
102 |     \}
103 | 
104 |     let l:prepared = ale#job#PrepareCommand(a:buffer, l:shell_command)
105 |     let s:job_id = ale#job#Start(l:prepared, l:callbacks)
106 | endfunction
 99 | 
100 | " Begin recording audio, starting the speech client job first if needed.
101 | " Throws an error when there is no running job to send the command to.
102 | function! vim_speech#StartRecording() abort
103 |     call s:StartJobIfNeeded(bufnr(''))
104 | 
105 |     if s:job_id <= 0
106 |         throw 'Failed to start speech client!'
107 |     endif
108 | 
109 |     call ale#job#SendRaw(s:job_id, "record\n")
110 |     let g:vim_speech_info.recording = 1
111 | endfunction
112 | 
113 | " Stop recording audio, and ask the client for the transcribed text.
114 | "
115 | " The current buffer is remembered as the one to write the text to. The
116 | " text is only inserted if that buffer is still current when it arrives.
117 | function! vim_speech#StopRecording() abort
118 |     " Always clear the flag, even when the job has already gone away.
119 |     " Otherwise vim_speech#ToggleRecording() would keep trying to stop a
120 |     " recording that no longer exists, and could never start a new one.
121 |     let g:vim_speech_info.recording = 0
122 | 
123 |     if s:job_id > 0
124 |         let s:buffer_to_write_to = bufnr('')
125 |         call ale#job#SendRaw(s:job_id, "stop\n")
126 |     endif
127 | endfunction
122 | 
123 | " Flip between recording and not recording, for easier keybinds.
124 | function! vim_speech#ToggleRecording() abort
125 |     if !get(g:vim_speech_info, 'recording', 0)
126 |         call vim_speech#StartRecording()
127 |         return
128 |     endif
129 |     call vim_speech#StopRecording()
130 | endfunction
131 | 
132 | " Tell the speech client job to shut down, if there is one.
133 | function! vim_speech#Quit() abort
134 |     if s:job_id > 0
135 |         " Send a command to shutdown safely.
136 |         call ale#job#SendRaw(s:job_id, "quit\n")
137 |         " Assume the job will close later and forget the ID now.
138 |         let s:job_id = 0
139 |     endif
140 | 
141 |     " Nothing can be recording once the client has been told to quit, so
142 |     " don't leave the status line or vim_speech#ToggleRecording() stuck in
143 |     " the recording state.
144 |     let g:vim_speech_info.recording = 0
145 | endfunction
140 | 


--------------------------------------------------------------------------------
/autoload/vim_speech/statusline.vim:
--------------------------------------------------------------------------------
 1 | scriptencoding utf-8
 2 | 
 3 | " Return the text to show in a status line: a marker while audio is being
 4 | " recorded, and an empty string otherwise.
 5 | function! vim_speech#statusline#GetStatus() abort
 6 |     let l:info = get(g:, 'vim_speech_info', {})
 7 | 
 8 |     return get(l:info, 'recording') ? '◉ REC' : ''
 9 | endfunction
10 | 


--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #
 3 | # Set up vim-speech on Ubuntu: install the Google Cloud SDK and PortAudio
 4 | # packages with apt when they are missing, then build the Python 2.7
 5 | # virtualenv in plugin/venv and install requirements.txt into it.
 6 | 
 7 | set -e
 8 | 
 9 | # Every path below is relative to the project root, so switch to the
10 | # directory holding this script no matter where it was started from.
11 | cd "$(dirname "${BASH_SOURCE[0]}")"
12 | 
13 | CLOUD_SDK_REPO="cloud-sdk-$(lsb_release -c -s)"
14 | export CLOUD_SDK_REPO
15 | 
16 | # Install what we need via apt, if we need to.
17 | if ! [ -f /etc/apt/sources.list.d/google-cloud-sdk.list ]; then
18 |     echo 'Adding a new apt source...'
19 |     echo "deb http://packages.cloud.google.com/apt $CLOUD_SDK_REPO main" \
20 |         | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
21 |     echo "Adding Google's gpg key via apt-key..."
22 |     curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg \
23 |         | sudo apt-key add -
24 |     echo 'Running apt to install things...'
25 |     sudo apt update
26 |     sudo apt install google-cloud-sdk google-cloud-sdk-app-engine-python
27 | fi
28 | 
29 | # dpkg -s fails when the package isn't installed. Its output, including
30 | # the error message, is not interesting here.
31 | if ! dpkg -s libportaudio2 > /dev/null 2>&1; then
32 |     echo 'Installing libportaudio2...'
33 |     sudo apt install libportaudio2
34 | fi
35 | 
36 | # The header file is used to tell if the development package is installed.
37 | if ! [ -f /usr/include/portaudio.h ]; then
38 |     echo 'Installing portaudio19-dev...'
39 |     sudo apt install portaudio19-dev
40 | fi
41 | 
42 | if ! [ -d plugin/venv ]; then
43 |     virtualenv -p python2.7 plugin/venv
44 | fi
45 | 
46 | # Presumably the activate script can read unset variables, so unset
47 | # variable checks are kept off while sourcing it, and turned on after.
48 | set +u
49 | source plugin/venv/bin/activate
50 | set -u
51 | 
52 | pip install -q pip==10.0.1 wheel==0.31.1
53 | pip install -q -r requirements.txt
54 | 
55 | echo 'Everything has probably been installed.'
43 | 


--------------------------------------------------------------------------------
/plugin/speech_to_text_client.py:
--------------------------------------------------------------------------------
  1 | #!plugin/venv/bin/python
  2 | from __future__ import absolute_import, print_function, unicode_literals
  3 | 
  4 | import os
  5 | import select
  6 | import sys
  7 | import wave
  8 | import signal
  9 | from io import BytesIO
 10 | 
 11 | import pyaudio
 12 | 
 13 | SAMPLE_RATE = 16000
 14 | CHANNELS = 1
 15 | CHUNK_SIZE = 1024
 16 | AUDIO_FORMAT = pyaudio.paInt16
 17 | 
 18 | 
 19 | def print_and_flush(*args, **kwargs):
 20 |     """
 21 |     print() doesn't flush, and we need to flush for the Vim plugin to work.
 22 |     """
 23 |     print(*args, **kwargs)
 24 |     sys.stdout.flush()
 25 | 
 26 | 
 27 | class RecordingClient(object):
 28 |     def __init__(self):
 29 |         self.frames = []
 30 |         self.audio_context = None
 31 |         self.stream = None
 32 |         self.signit_sent = False
 33 | 
 34 |     def trap_sigint(self):
 35 |         def signal_handler(*args, **kwargs):
 36 |             self.signit_sent = True
 37 | 
 38 |         signal.signal(signal.SIGINT, signal_handler)
 39 | 
 40 |     def start_recording(self):
 41 |         self.frames = []
 42 | 
 43 |         self.audio_context = pyaudio.PyAudio()
 44 |         self.stream = self.audio_context.open(
 45 |             format=AUDIO_FORMAT,
 46 |             channels=CHANNELS,
 47 |             rate=SAMPLE_RATE,
 48 |             input=True,
 49 |             frames_per_buffer=CHUNK_SIZE,
 50 |         )
 51 | 
 52 |     def save_frames(self):
 53 |         if self.stream is not None:
 54 |             data = self.stream.read(CHUNK_SIZE)
 55 |             self.frames.append(data)
 56 | 
 57 |     def cleanup(self):
 58 |         if self.stream is not None:
 59 |             self.stream.stop_stream()
 60 |             self.stream.close()
 61 |             self.stream = None
 62 | 
 63 |         if self.audio_context is not None:
 64 |             self.audio_context.terminate()
 65 | 
 66 |         self.frames = []
 67 | 
 68 |     def stop_recording(self):
 69 |         if (
 70 |             self.stream is None
 71 |             or self.audio_context is None
 72 |             or not self.frames
 73 |         ):
 74 |             return b''
 75 | 
 76 |         self.stream.stop_stream()
 77 |         self.stream.close()
 78 |         self.audio_context.terminate()
 79 | 
 80 |         output_file = BytesIO()
 81 | 
 82 |         wf = wave.open(output_file, 'wb')
 83 |         wf.setnchannels(CHANNELS)
 84 |         wf.setsampwidth(self.audio_context.get_sample_size(AUDIO_FORMAT))
 85 |         wf.setframerate(SAMPLE_RATE)
 86 |         wf.writeframes(b''.join(self.frames))
 87 |         wf.close()
 88 | 
 89 |         self.stream = None
 90 |         self.audio_context = None
 91 |         self.frames = []
 92 | 
 93 |         return output_file.getvalue()
 94 | 
 95 | 
 96 | def transcribe_file(content):
 97 |     from google.cloud import speech
 98 |     from google.cloud.speech import enums
 99 |     from google.cloud.speech import types
100 | 
101 |     if not content:
102 |         return ''
103 | 
104 |     client = speech.SpeechClient()
105 | 
106 |     audio = types.RecognitionAudio(content=content)
107 |     config = types.RecognitionConfig(
108 |         encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
109 |         sample_rate_hertz=SAMPLE_RATE,
110 |         language_code='en-US',
111 |     )
112 | 
113 |     response = client.recognize(config, audio)
114 |     lines = []
115 | 
116 |     for result in response.results:
117 |         lines.append(result.alternatives[0].transcript)
118 | 
119 |     return ' '.join(lines)
120 | 
121 | 
122 | def stdin_has_data():
123 |     try:
124 |         return sys.stdin in select.select([sys.stdin], [], [], 0)[0]
125 |     except select.error:
126 |         return False
127 | 
128 | 
129 | def main():
130 |     # Stop early if the environment variable isn't set.
131 |     if not os.environ.get('GOOGLE_APPLICATION_CREDENTIALS'):
132 |         sys.exit(
133 |             'You must set GOOGLE_APPLICATION_CREDENTIALS'
134 |             ' to your JSON credentials filename.'
135 |         )
136 | 
137 |     client = RecordingClient()
138 |     client.trap_sigint()
139 | 
140 |     while True:
141 |         if client.signit_sent:
142 |             break
143 | 
144 |         if stdin_has_data():
145 |             line = sys.stdin.readline()
146 | 
147 |             if line:
148 |                 message = line.lower().strip()
149 | 
150 |                 if client.signit_sent:
151 |                     break
152 | 
153 |                 if message == 'record':
154 |                     print_and_flush('record start')
155 |                     client.start_recording()
156 |                 elif message == 'stop':
157 |                     print_and_flush('record end')
158 |                     audio_content = client.stop_recording()
159 |                     print_and_flush('speech', transcribe_file(audio_content))
160 |                 elif message == 'quit':
161 |                     break
162 | 
163 |         client.save_frames()
164 | 
165 |     if client.signit_sent:
166 |         # Print a line if we caught SIGINT, for the benefit of terminals.
167 |         print_and_flush()
168 | 
169 |     client.cleanup()
170 | 
171 | 
172 | if __name__ == "__main__":
173 |     main()
174 | 


--------------------------------------------------------------------------------
/plugin/vim-speech.vim:
--------------------------------------------------------------------------------
 1 | "Author: w0rp <devw0rp@gmail.com>
 2 | "Description: A plugin for implementing Vim speech-to-text control.
 3 | 
 4 | " Load the plugin only once.
 5 | if exists('g:loaded_vim_speech') | finish | endif
 6 | let g:loaded_vim_speech = 1
 7 | 
 8 | " The directory holding this file, the Python client, and the virtualenv.
 9 | let g:vim_speech_dir = fnamemodify(resolve(expand('<sfile>:p')), ':h')
10 | 
11 | " Commands for controlling the recording of speech.
12 | command! -bar SpeechRecord call vim_speech#StartRecording()
13 | command! -bar SpeechStop call vim_speech#StopRecording()
14 | command! -bar SpeechToggle call vim_speech#ToggleRecording()
15 | command! -bar SpeechQuit call vim_speech#Quit()
16 | 
17 | " <Plug> mappings for commands
18 | nnoremap <silent> <Plug>(vim_speech_record) :SpeechRecord<CR>
19 | nnoremap <silent> <Plug>(vim_speech_stop) :SpeechStop<CR>
20 | nnoremap <silent> <Plug>(vim_speech_toggle) :SpeechToggle<CR>
21 | nnoremap <silent> <Plug>(vim_speech_quit) :SpeechQuit<CR>
22 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | google-cloud==0.33.1
2 | PyAudio==0.2.11
3 | 


--------------------------------------------------------------------------------