├── auth.cfg.sample
├── transcribe_with_sphinx.sh
├── fetch_podcast.sh
├── google-transcribe.py
├── convert-to-html.py
├── README.rst
└── watson-transcribe.py


/auth.cfg.sample:
--------------------------------------------------------------------------------
1 | [watson]
2 | username = UUID
3 | password = pass
4 | 


--------------------------------------------------------------------------------
/transcribe_with_sphinx.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Transcribe $FILE (default: podcast.wav) with pocketsphinx and the en-us model.
3 | set -euxo pipefail
4 | # pipefail: otherwise tee's exit status would hide a pocketsphinx failure.
5 | FILE=${FILE:-podcast.wav}
6 | 
7 | pocketsphinx_continuous -dict /usr/share/pocketsphinx/model/en-us/cmudict-en-us.dict -lm /usr/share/pocketsphinx/model/en-us/en-us.lm.bin -infile "$FILE" 2> sphinx-voice-debug.log | tee sphinx-transcription.log
8 | 


--------------------------------------------------------------------------------
/fetch_podcast.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Download a podcast episode and convert it to 16 kHz FLAC and WAV.
 3 | set -eux
 4 | 
 5 | PODCAST_URL=${PODCAST_URL:-http://audio.commonwealthclub.org/audio/podcast/cc_20170323_Zip_Code_Not_Genetic_Code_Podcast.mp3}
 6 | FILE=$(basename "$PODCAST_URL")
 7 | # Expansions are quoted so URLs containing ?, * or spaces are passed through intact.
 8 | wget -O "$FILE" "$PODCAST_URL"
 9 | 
10 | echo "Converting to flac for cloud services"
11 | ffmpeg -i "$FILE" -ar 16000 podcast.flac
12 | 
13 | echo "Converting to wav for CMU Sphinx"
14 | ffmpeg -i "$FILE" -ar 16000 podcast.wav
15 | 


--------------------------------------------------------------------------------
/google-transcribe.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import os
 4 | import time
 5 | 
 6 | from google.cloud import storage
 7 | from google.cloud import exceptions as gex
 8 | from google.cloud.gapic.speech.v1 import speech_client
 9 | from google.cloud.gapic.speech.v1 import enums
10 | from google.cloud.proto.speech.v1 import cloud_speech_pb2
11 | from google.protobuf import json_format
12 | 
13 | BUCKET="transcribe-test"
14 | FILE="podcast.flac"
15 | 
16 | os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "auth.json"
17 | 
18 | print("Creating bucket")
19 | 
20 | client = storage.Client()
21 | bucket = client.lookup_bucket(BUCKET)
22 | if not bucket:
23 |     bucket = client.create_bucket(BUCKET)
24 | 
25 | print("Uploading Podcast")
26 | blob = storage.Blob(FILE, bucket)
27 | blob.upload_from_filename(FILE)
28 | 
29 | sclient = speech_client.SpeechClient()
30 | encoding = enums.RecognitionConfig.AudioEncoding.FLAC
31 | sample_rate_hertz = 16000
32 | language_code = 'en-US'
33 | config = cloud_speech_pb2.RecognitionConfig(
34 |     encoding=encoding,
35 |     sample_rate_hertz=sample_rate_hertz,
36 |     language_code=language_code)
37 | uri = 'gs://%s/%s' % (BUCKET, FILE)
38 | audio = cloud_speech_pb2.RecognitionAudio(uri=uri)
39 | response = sclient.long_running_recognize(config, audio)
40 | 
41 | def callback(operation_future):
42 |     # Handle result.
43 |     result = operation_future.result()
44 |     with open("google-transcript.json", "w") as f:
45 |         f.write(json_format.MessageToJson(result))
46 |     print("Done!")
47 | 
48 | response.add_done_callback(callback)
49 | print("Running speech recognition")
50 | 


--------------------------------------------------------------------------------
/convert-to-html.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #
 3 | # This is a simple script to convert the watson transcript to
 4 | # something a little easier to skim. It color codes the words based on
 5 | # individual confidences, and puts delay markers in for every 100 ms
 6 | # of speech delay. This makes it a bit easier to parse than the run on
 7 | # blocks.
 8 | 
 9 | import json
10 | 
11 | data = json.load(open('watson-transcript.json'))
12 | 
13 | # print(data)
14 | 
15 | words = []
16 | 
17 | for r in data["results"]:
18 |     conf = r["alternatives"][0]["word_confidence"]
19 |     times = r["alternatives"][0]["timestamps"]
20 |     for i in range(len(conf)):
21 |         conf[i].extend((times[i][1], times[i][2]))
22 |     words.extend(conf)
23 | 
24 | print """
25 | <html>
26 | <header>
27 | <style>
28 | .confidence_10 { color: #000000 }
29 | .confidence_9 { color: #220000 }
30 | .confidence_8 { color: #440000}
31 | .confidence_7 { color: #660000 }
32 | .confidence_6 { color: #880000 }
33 | .confidence_5 { color: #aa0000}
34 | .confidence_4 { color: #cc0000 }
35 | .confidence_3 { color: #ee0000 }
36 | .confidence_2 { color: #ff2200 }
37 | .confidence_1 { color: #ff4400 }
38 | .confidence_0 { color: #ff6600 }
39 | .main {width: 480px; margin:0 auto;}
40 | </style>
41 | </header>
42 | <body>
43 | <div class="main">
44 | """
45 | 
46 | lastend = 0
47 | 
48 | for word in words:
49 |     w, score, start, end = word
50 |     space = start - lastend
51 |     if 0.1 < space <= 0.3:
52 |         print ", "
53 |     elif 0.3 < space:
54 |         num = int(space * 10)
55 |         print "." * num
56 | 
57 |     print '<span title="Confidence %f; Timestamp %.2fs-%.2fs" class="confidence_%d">%s</span> ' % (score, start, end, score * 10, w)
58 |     lastend = end
59 | 
60 | print """
61 | </div>
62 | </body>
63 | </html>
64 | """
65 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | ==================================
 2 |  Transcribe Podcast Audio Testing
 3 | ==================================
 4 | 
 5 | This is a repository demonstrating transcribing podcast audio with a
 6 | few different services: IBM Watson Speech to Text, Google Cloud
 7 | Speech, and CMU Sphinx.
 8 | 
 9 | Getting Started
10 | ===============
11 | 
12 | First run ``fetch_podcast.sh`` to download some audio to transcribe.
13 | The example from my blog post is used by default; set the
14 | ``PODCAST_URL`` environment variable to try other content.
15 | 
16 | Running CMU Sphinx
17 | ==================
18 | 
19 | Ensure that sphinx is installed. On Ubuntu this is done with
20 | 
21 | ::
22 | 
23 |    sudo apt-get install pocketsphinx pocketsphinx-en-us
24 | 
25 | Then run ``transcribe_with_sphinx.sh``. It will produce
26 | ``sphinx-transcription.log`` as output.
27 | 
28 | Running Watson
29 | ==============
30 | 
31 | Sign up for IBM Bluemix and create a Watson Speech to Text instance.
32 | 
33 | Copy ``auth.cfg.sample`` to ``auth.cfg`` and put the service username and password in the ``[watson]`` section.
34 | 
35 | Install python prereqs
36 | 
37 | ::
38 | 
39 |    pip install watson-developer-cloud
40 | 
41 | Then run it with
42 | 
43 | ::
44 | 
45 |    watson-transcribe.py podcast.flac
46 | 
47 | Optionally specify the id of a previously trained custom language model
48 | 
49 | ::
50 | 
51 |    watson-transcribe.py --customization a13780b0-52b7-1fe7-fbb7-77471c70c949 podcast.flac
52 | 
53 | .. note::
54 | 
55 |    It will take 30 - 45 minutes to run; it processes slightly faster
56 |    than realtime.
57 | 
58 | Running Google
59 | ==============
60 | 
61 | Sign up for Google Cloud and create a project and a service key. Go
62 | through all the setup around authentication -
63 | https://googlecloudplatform.github.io/google-cloud-python/stable/google-cloud-auth.html
64 | 
65 | Create a key for the service, put that json in ``auth.json`` in this
66 | directory.
67 | 
68 | Install python prereqs
69 | 
70 | ::
71 | 
72 |    pip install google-cloud-storage google-cloud-speech
73 | 
74 | Then run it as
75 | 
76 | ::
77 | 
78 |    google-transcribe.py
79 | 
80 | .. note::
81 | 
82 |    It will take 20 - 30 minutes to run; it processes slightly faster
83 |    than realtime.
84 | 


--------------------------------------------------------------------------------
/watson-transcribe.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #
 3 | # Copyright 2017 IBM
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
 6 | # not use this file except in compliance with the License. You may obtain
 7 | # a copy of the License at
 8 | #
 9 | #    http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 | # License for the specific language governing permissions and limitations
15 | # under the License.
16 | 
17 | import argparse
18 | import ConfigParser
19 | import json
20 | from os.path import join, dirname
21 | from watson_developer_cloud import SpeechToTextV1
22 | 
23 | 
24 | def get_auth():
25 |     config = ConfigParser.RawConfigParser()
26 |     config.read('auth.cfg')
27 |     user = config.get('watson', 'username')
28 |     password = config.get('watson', 'password')
29 |     return (user, password)
30 | 
31 | 
32 | def parse_args():
33 |     parser = argparse.ArgumentParser(
34 |         description='Transcribe audio with watson')
35 |     parser.add_argument('file')
36 |     parser.add_argument('--customization', help="Process using a customized model id")
37 |     args = parser.parse_args()
38 |     return args
39 | 
40 | def main():
41 |     args = parse_args()
42 |     (user, passwd) = get_auth()
43 |     speech_to_text = SpeechToTextV1(
44 |         username=user,
45 |         password=passwd,
46 |         x_watson_learning_opt_out=False
47 |     )
48 | 
49 |     speech_to_text.get_model('en-US_BroadbandModel')
50 | 
51 |     with open(args.file, 'rb') as audio_file:
52 |         print("Sending audio to watson to recognize, this will take 30+ minutes")
53 |         if args.customization is not None:
54 |             print("Using custom model {0}".format(args.customization))
55 |         print("Please be patient and don't kill this process while running")
56 |         output = speech_to_text.recognize(
57 |             audio_file, content_type='audio/flac', timestamps=True,
58 |             customization_id=args.customization,
59 |             word_confidence=True)
60 |         with open("watson-transcript.json", "w") as out:
61 |             print("Transcription done, written to watson-transcription.json")
62 |             out.write(json.dumps(output, indent=2))
63 | 
64 | if __name__ == "__main__":
65 |     main()
66 | 


--------------------------------------------------------------------------------