├── auth.cfg.sample ├── transcribe_with_sphinx.sh ├── fetch_podcast.sh ├── google-transcribe.py ├── convert-to-html.py ├── README.rst └── watson-transcribe.py /auth.cfg.sample: -------------------------------------------------------------------------------- 1 | [watson] 2 | username = UUID 3 | password = pass 4 | -------------------------------------------------------------------------------- /transcribe_with_sphinx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eux 4 | 5 | FILE=${FILE:-podcast.wav} 6 | 7 | pocketsphinx_continuous -dict /usr/share/pocketsphinx/model/en-us/cmudict-en-us.dict -lm /usr/share/pocketsphinx/model/en-us/en-us.lm.bin -infile $FILE 2> sphinx-voice-debug.log | tee sphinx-transcription.log 8 | -------------------------------------------------------------------------------- /fetch_podcast.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eux 4 | 5 | PODCAST_URL=${PODCAST_URL:-http://audio.commonwealthclub.org/audio/podcast/cc_20170323_Zip_Code_Not_Genetic_Code_Podcast.mp3} 6 | FILE=$(basename $PODCAST_URL) 7 | 8 | wget -O $FILE $PODCAST_URL 9 | 10 | echo "Converting to flac for cloud services" 11 | ffmpeg -i $FILE -ar 16000 podcast.flac 12 | 13 | echo "Converting to wav for CMU Sphinx" 14 | ffmpeg -i $FILE -ar 16000 podcast.wav 15 | -------------------------------------------------------------------------------- /google-transcribe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import time 5 | 6 | from google.cloud import storage 7 | from google.cloud import exceptions as gex 8 | from google.cloud.gapic.speech.v1 import speech_client 9 | from google.cloud.gapic.speech.v1 import enums 10 | from google.cloud.proto.speech.v1 import cloud_speech_pb2 11 | from google.protobuf import json_format 12 | 13 | BUCKET="transcribe-test" 14 | 
FILE="podcast.flac" 15 | 16 | os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "auth.json" 17 | 18 | print("Creating bucket") 19 | 20 | client = storage.Client() 21 | bucket = client.lookup_bucket(BUCKET) 22 | if not bucket: 23 | bucket = client.create_bucket(BUCKET) 24 | 25 | print("Uploading Podcast") 26 | blob = storage.Blob(FILE, bucket) 27 | blob.upload_from_filename(FILE) 28 | 29 | sclient = speech_client.SpeechClient() 30 | encoding = enums.RecognitionConfig.AudioEncoding.FLAC 31 | sample_rate_hertz = 16000 32 | language_code = 'en-US' 33 | config = cloud_speech_pb2.RecognitionConfig( 34 | encoding=encoding, 35 | sample_rate_hertz=sample_rate_hertz, 36 | language_code=language_code) 37 | uri = 'gs://%s/%s' % (BUCKET, FILE) 38 | audio = cloud_speech_pb2.RecognitionAudio(uri=uri) 39 | response = sclient.long_running_recognize(config, audio) 40 | 41 | def callback(operation_future): 42 | # Handle result. 43 | result = operation_future.result() 44 | with open("google-transcript.json", "w") as f: 45 | f.write(json_format.MessageToJson(result)) 46 | print("Done!") 47 | 48 | response.add_done_callback(callback) 49 | print("Running speech recognition") 50 | -------------------------------------------------------------------------------- /convert-to-html.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # This is a simple script to convert the watson transcript to 4 | # something a little easier to skim. It color codes the words based on 5 | # individual confidences, and puts delay markers in for every 100 ms 6 | # of speech delay. This makes it a bit easier to parse than the run on 7 | # blocks. 
8 | 9 | import json 10 | 11 | data = json.load(open('watson-transcript.json')) 12 | 13 | # print(data) 14 | 15 | words = [] 16 | 17 | for r in data["results"]: 18 | conf = r["alternatives"][0]["word_confidence"] 19 | times = r["alternatives"][0]["timestamps"] 20 | for i in range(len(conf)): 21 | conf[i].extend((times[i][1], times[i][2])) 22 | words.extend(conf) 23 | 24 | print """ 25 | 26 |
27 | 41 |
42 | 43 |
44 | """ 45 | 46 | lastend = 0 47 | 48 | for word in words: 49 | w, score, start, end = word 50 | space = start - lastend 51 | if 0.1 < space <= 0.3: 52 | print ", " 53 | elif 0.3 < space: 54 | num = int(space * 10) 55 | print "." * num 56 | 57 | print '%s ' % (score, start, end, score * 10, w) 58 | lastend = end 59 | 60 | print """ 61 |
62 | 63 | 64 | """ 65 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ================================ 2 | Transcribe Podcast Audio Testing 3 | ================================ 4 | 5 | This is a repository demonstrating transcribing podcast audio with a 6 | few different services: IBM Watson Speech to Text, Google Cloud 7 | Speech, and CMU Sphinx. 8 | 9 | Getting Started 10 | =============== 11 | 12 | First run ``fetch_podcast.sh`` to pull some content down for 13 | converting. The example from my blog post is used, though you can 14 | replace the URL if you would like to try other content. 15 | 16 | Running CMU Sphinx 17 | ================== 18 | 19 | Ensure that sphinx is installed. On Ubuntu this is done with 20 | 21 | :: 22 | 23 | sudo apt-get install pocketsphinx pocketsphinx-en-us 24 | 25 | Then run ``transcribe_with_sphinx.sh``. It will produce 26 | ``sphinx-transcription.log`` as output. 27 | 28 | Running Watson 29 | ============== 30 | 31 | Sign up for IBM Bluemix and create a Watson Speech to Text instance. 32 | 33 | Copy ``auth.cfg.sample`` to ``auth.cfg`` and put your username/password in the ``[watson]`` section. 34 | 35 | Install python prereqs 36 | 37 | :: 38 | 39 | pip install watson-developer-cloud 40 | 41 | Then run it with 42 | 43 | :: 44 | 45 | watson-transcribe.py podcast.flac 46 | 47 | Optionally specify the id of a pretrained custom language model 48 | 49 | :: 50 | 51 | watson-transcribe.py --customization a13780b0-52b7-1fe7-fbb7-77471c70c949 podcast.flac 52 | 53 | .. note:: 54 | 55 | It will take 30 - 45 minutes to run; it processes slightly faster 56 | than realtime. 57 | 58 | Running Google 59 | ============== 60 | 61 | Sign up for Google Cloud and create a project and a service key. 
Go 62 | through all the setup around authentication - 63 | https://googlecloudplatform.github.io/google-cloud-python/stable/google-cloud-auth.html 64 | 65 | Create a key for the service and put that JSON in ``auth.json`` in this 66 | directory. 67 | 68 | Install python prereqs 69 | 70 | :: 71 | 72 | pip install google-cloud-storage google-cloud-speech 73 | 74 | Then run it as 75 | 76 | :: 77 | 78 | google-transcribe.py 79 | 80 | .. note:: 81 | 82 | It will take 20 - 30 minutes to run; it processes slightly faster 83 | than realtime. 84 | -------------------------------------------------------------------------------- /watson-transcribe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright 2017 IBM 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may 6 | # not use this file except in compliance with the License. You may obtain 7 | # a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 14 | # License for the specific language governing permissions and limitations 15 | # under the License. 

import argparse
import json
from os.path import join, dirname

try:
    # Python 3 name of the module.
    import configparser
except ImportError:
    # Python 2 fallback; RawConfigParser has the same API under both names.
    import ConfigParser as configparser

from watson_developer_cloud import SpeechToTextV1

# Credentials file read from the current working directory.
_AUTH_FILE = 'auth.cfg'
# Output file; convert-to-html.py reads this exact name.
_TRANSCRIPT_FILE = 'watson-transcript.json'


def get_auth():
    """Return the ``(username, password)`` pair from the ``[watson]`` section.

    The values are read from ``auth.cfg`` in the current working directory.
    A missing file or section surfaces as the config parser's own
    ``NoSectionError`` / ``NoOptionError``.
    """
    config = configparser.RawConfigParser()
    config.read(_AUTH_FILE)
    user = config.get('watson', 'username')
    password = config.get('watson', 'password')
    return (user, password)


def parse_args():
    """Parse the command line.

    Returns an ``argparse.Namespace`` with ``file`` (path of the audio file
    to send) and ``customization`` (optional customized model id, ``None``
    when not given).
    """
    parser = argparse.ArgumentParser(
        description='Transcribe audio with watson')
    parser.add_argument('file')
    parser.add_argument('--customization',
                        help="Process using a customized model id")
    return parser.parse_args()


def main():
    """Send the audio file to Watson and save the JSON transcript.

    The result of ``recognize`` (requested with timestamps and per-word
    confidence) is written, pretty-printed, to ``watson-transcript.json``.
    """
    args = parse_args()
    (user, passwd) = get_auth()
    speech_to_text = SpeechToTextV1(
        username=user,
        password=passwd,
        x_watson_learning_opt_out=False
    )

    # The return value is unused; presumably an early check that the
    # credentials and model are reachable before the long upload - confirm.
    speech_to_text.get_model('en-US_BroadbandModel')

    with open(args.file, 'rb') as audio_file:
        print("Sending audio to watson to recognize, this will take 30+ minutes")
        if args.customization is not None:
            print("Using custom model {0}".format(args.customization))
        print("Please be patient and don't kill this process while running")
        output = speech_to_text.recognize(
            audio_file, content_type='audio/flac', timestamps=True,
            customization_id=args.customization,
            word_confidence=True)

    with open(_TRANSCRIPT_FILE, "w") as out:
        out.write(json.dumps(output, indent=2))
    # Report only after the write succeeded, and name the file actually
    # written (the old message said "watson-transcription.json").
    print("Transcription done, written to {0}".format(_TRANSCRIPT_FILE))


if __name__ == "__main__":
    main()