├── .gitignore ├── README.md ├── apt-packages.txt ├── dict └── cmu07a.dic ├── hmm └── .vc ├── lm └── .vc └── recognizer.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | !percept/bin 15 | var 16 | sdist 17 | develop-eggs 18 | .installed.cfg 19 | lib 20 | lib64 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | .tox 28 | nosetests.xml 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | 38 | #pycharm 39 | .idea 40 | 41 | #project 42 | *.wav 43 | hmm 44 | lm 45 | !.vc 46 | 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Scribe 2 | ------------------------------- 3 | 4 | Simple speech recognition for Python. Run the script, say some things into your microphone, and then see what you said (or an approximation). 5 | 6 | Powered by [pyaudio](http://people.csail.mit.edu/hubert/pyaudio/) and [Sphinx](http://cmusphinx.sourceforge.net/). 7 | 8 | Installation 9 | -------------------------------- 10 | 11 | ### Sphinxbase 12 | 13 | Download [sphinxbase](http://sourceforge.net/projects/cmusphinx/files/sphinxbase/0.8) and extract the files. 14 | 15 | Now, run: 16 | ``` 17 | cd sphinxbase 18 | ./configure;make clean all;make install 19 | cd python 20 | python setup.py install 21 | ``` 22 | 23 | You may need to use sudo for make install or python setup.py install. 24 | 25 | ### Pocketsphinx 26 | 27 | Download [pocketsphinx](http://sourceforge.net/projects/cmusphinx/files/pocketsphinx/0.8) and extract the files. 
28 | 29 | Now, run: 30 | ``` 31 | cd pocketsphinx 32 | ./configure;make clean all;make install 33 | cd python 34 | python setup.py install 35 | ``` 36 | 37 | ### Packages (Linux only) 38 | 39 | Now, run: 40 | 41 | ``` 42 | cd speech-recognizer 43 | sudo xargs -a apt-packages.txt apt-get install 44 | ``` 45 | 46 | ### Pyaudio 47 | 48 | Now, download the right version of [pyaudio](http://people.csail.mit.edu/hubert/pyaudio/) and install it. 49 | 50 | ### Language files 51 | 52 | If you want to speak english, you need to get the [english language model](http://sourceforge.net/projects/cmusphinx/files/Acoustic%20and%20Language%20Models/US%20English%20Generic%20Language%20Model/) and the [english acoustic model](http://sourceforge.net/projects/cmusphinx/files/Acoustic%20and%20Language%20Models/US%20English%20Generic%20Acoustic%20Model/). 53 | 54 | You will need to put the acoustic model into `scribe/hmm`, and the language model into `scribe/lm`. 55 | 56 | The filetree should look like this for english: 57 | 58 | ``` 59 | scribe 60 | ├── dict 61 | │   └── cmu07a.dic 62 | ├── hmm 63 | │   ├── feat.params 64 | │   ├── feature_transform 65 | │   ├── mdef 66 | │   ├── means 67 | │   ├── mixture_weights 68 | │   ├── noisedict 69 | │   ├── README 70 | │   ├── transition_matrices 71 | │   └── variances 72 | ├── lm 73 | │   └── cmusphinx-5.0-en-us.lm.dmp 74 | ``` 75 | 76 | For other languages, [check here](http://sourceforge.net/projects/cmusphinx/files/Acoustic%20and%20Language%20Models/), or see below on training your own model. 
If you use different language models, acoustic models, or dictionaries, you will want to change these paths in `recognizer.py`: 77 | 78 | ``` 79 | HMDIR = os.path.join(BASE_PATH, "hmm") 80 | LMDIR = os.path.join(BASE_PATH, "lm/cmusphinx-5.0-en-us.lm.dmp") 81 | DICTD = os.path.join(BASE_PATH, "dict/cmu07a.dic") 82 | ``` 83 | 84 | Run 85 | ------------------------------- 86 | 87 | To run, you just have to: 88 | 89 | ``` 90 | cd speech-recognizer 91 | python recognizer.py 92 | ``` 93 | 94 | You should be able to talk for a few seconds, after which it will spend some time processing, and then show you what you said. 95 | 96 | Configure 97 | --------------------------------- 98 | 99 | There are some options that you can modify at the top of `recognizer.py`. The easiest one to modify is `RECORD_SECONDS`. 100 | 101 | More reading 102 | ---------------------------------- 103 | 104 | To find out more, read up on [sphinx](http://cmusphinx.sourceforge.net/wiki/). 105 | 106 | You can train the language models to make them more accurate, use unsupported languages, or be more domain-specific. 
-------------------------------------------------------------------------------- /apt-packages.txt: -------------------------------------------------------------------------------- 1 | pocketsphinx-hmm-wsj1 2 | pocketsphinx-lm-wsj 3 | libasound-dev -------------------------------------------------------------------------------- /hmm/.vc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/scribe/48428d42871677910eb6e56ad5892a366922af26/hmm/.vc -------------------------------------------------------------------------------- /lm/.vc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VikParuchuri/scribe/48428d42871677910eb6e56ad5892a366922af26/lm/.vc -------------------------------------------------------------------------------- /recognizer.py: -------------------------------------------------------------------------------- 1 | import pyaudio 2 | import wave 3 | import sphinxbase 4 | import os 5 | 6 | # Import sometimes fails first time around because of a Cython issue. 7 | try: 8 | import pocketsphinx 9 | except ValueError: 10 | import pocketsphinx 11 | 12 | # Paths 13 | BASE_PATH = os.path.dirname(os.path.realpath(__file__)) 14 | HMDIR = os.path.join(BASE_PATH, "hmm") 15 | LMDIR = os.path.join(BASE_PATH, "lm/cmusphinx-5.0-en-us.lm.dmp") 16 | DICTD = os.path.join(BASE_PATH, "dict/cmu07a.dic") 17 | 18 | # Options 19 | CHUNK = 128 # The size of each audio chunk coming from the input device. 20 | FORMAT = pyaudio.paInt16 # Should not be changed, as this format is best for speech recognition. 21 | RATE = 16000 # Speech recognition only works well with this rate. Don't change unless your microphone demands it. 22 | RECORD_SECONDS = 5 # Number of seconds to record, can be changed. 23 | WAVE_OUTPUT_FILENAME = "output.wav" # Where to save the recording from the microphone. 
def find_device(p, tags):
    """
    Find an audio input device whose name matches one of the given tags.

    :param p: an initialized pyaudio.PyAudio instance.
    :param tags: iterable of lowercase keywords to look for in device names.
    :return: index of the first device whose name contains a keyword, or
             None if nothing matches (None means "use the default input"
             when passed as input_device_index to PyAudio.open).
    """
    device_index = None
    for i in range(p.get_device_count()):
        devinfo = p.get_device_info_by_index(i)
        print("Device %d: %s" % (i, devinfo["name"]))

        for keyword in tags:
            if keyword in devinfo["name"].lower():
                print("Found an input: device %d - %s" % (i, devinfo["name"]))
                device_index = i
                return device_index

    if device_index is None:
        print("No preferred input found; using default input device.")

    return device_index

def save_audio(wav_file):
    """
    Record RECORD_SECONDS of audio from an input device and save it as WAV.

    :param wav_file: path of the WAV file to write.
    """
    p = pyaudio.PyAudio()

    device = find_device(p, ["input", "mic", "audio"])
    # BUG FIX: find_device may return None when no device name matches; the
    # original passed None to get_device_info_by_index, which raises. Fall
    # back to the default input device's info instead.
    if device is None:
        device_info = p.get_default_input_device_info()
    else:
        device_info = p.get_device_info_by_index(device)
    channels = int(device_info['maxInputChannels'])

    stream = p.open(
        format=FORMAT,
        channels=channels,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK,
        input_device_index=device  # None selects the default input device.
    )

    print("* recording")

    frames = []

    # RATE / CHUNK chunks per second, for RECORD_SECONDS seconds.
    for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)

    print("* done recording")

    stream.stop_stream()
    stream.close()

    p.terminate()

    # BUG FIX: close the writer even if a write fails (original leaked the
    # handle on error).
    wf = wave.open(wav_file, 'wb')
    try:
        wf.setnchannels(channels)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
    finally:
        wf.close()

def recognize(wav_file):
    """
    Run pocketsphinx speech recognition on a saved audio file.

    :param wav_file: path of the raw/WAV audio file to decode.
    :return: the pocketsphinx hypothesis from Decoder.get_hyp()
             (text first, then utterance id and score).
    """
    speech_rec = pocketsphinx.Decoder(hmm=HMDIR, lm=LMDIR, dict=DICTD)
    # BUG FIX: the original used the Python 2-only builtin file(), which does
    # not exist on Python 3; open() in a context manager also guarantees the
    # handle is closed.
    with open(wav_file, 'rb') as audio_fh:
        speech_rec.decode_raw(audio_fh)
    result = speech_rec.get_hyp()
    return result

# Run the thing!
# Record from the microphone, then decode what was said.
if __name__ == '__main__':
    save_audio(WAVE_OUTPUT_FILENAME)
    result = recognize(WAVE_OUTPUT_FILENAME)
    # get_hyp() returns the recognized text first; index 0 is the transcript.
    # BUG FIX: the original used a Python 2 print statement, which is a
    # SyntaxError on Python 3 and inconsistent with the print() calls used
    # everywhere else in this file.
    print("You just said: {0}".format(result[0]))