├── auth.cfg.sample
├── transcribe_with_sphinx.sh
├── fetch_podcast.sh
├── google-transcribe.py
├── convert-to-html.py
├── README.rst
└── watson-transcribe.py
/auth.cfg.sample:
--------------------------------------------------------------------------------
1 | [watson]
2 | username = UUID
3 | password = pass
4 |
--------------------------------------------------------------------------------
/transcribe_with_sphinx.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Transcribe a WAV file with CMU PocketSphinx.
4 | #
5 | # Input is taken from $FILE (default: podcast.wav). The transcript is echoed
6 | # to stdout and saved to sphinx-transcription.log; the decoder's stderr goes
7 | # to sphinx-voice-debug.log.
8 |
9 | # pipefail: without it the pipeline's exit status is tee's, so a failing
10 | # decoder would go unnoticed despite `set -e`.
11 | set -euxo pipefail
12 |
13 | FILE=${FILE:-podcast.wav}
14 | MODEL_DIR=/usr/share/pocketsphinx/model/en-us
15 |
16 | # "$FILE" is quoted so paths with spaces or glob characters survive intact.
17 | pocketsphinx_continuous \
18 |     -dict "$MODEL_DIR/cmudict-en-us.dict" \
19 |     -lm "$MODEL_DIR/en-us.lm.bin" \
20 |     -infile "$FILE" \
21 |     2> sphinx-voice-debug.log | tee sphinx-transcription.log
22 |
--------------------------------------------------------------------------------
/fetch_podcast.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Download a podcast episode and convert it to the 16 kHz FLAC and WAV files
4 | # consumed by the transcription scripts. Set PODCAST_URL to fetch other content.
5 |
6 | set -eux
7 |
8 | PODCAST_URL=${PODCAST_URL:-http://audio.commonwealthclub.org/audio/podcast/cc_20170323_Zip_Code_Not_Genetic_Code_Podcast.mp3}
9 | # Local name for the download: the last path component of the URL.
10 | FILE=$(basename "$PODCAST_URL")
11 |
12 | # Expansions are quoted so a URL containing ?, * or spaces is passed through
13 | # verbatim instead of being glob-expanded or word-split by the shell.
14 | wget -O "$FILE" "$PODCAST_URL"
15 |
16 | echo "Converting to flac for cloud services"
17 | ffmpeg -i "$FILE" -ar 16000 podcast.flac
18 |
19 | echo "Converting to wav for CMU Sphinx"
20 | ffmpeg -i "$FILE" -ar 16000 podcast.wav
21 |
--------------------------------------------------------------------------------
/google-transcribe.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import os
4 | import time
5 |
6 | from google.cloud import storage
7 | from google.cloud import exceptions as gex
8 | from google.cloud.gapic.speech.v1 import speech_client
9 | from google.cloud.gapic.speech.v1 import enums
10 | from google.cloud.proto.speech.v1 import cloud_speech_pb2
11 | from google.protobuf import json_format
12 |
13 | BUCKET="transcribe-test"
14 | FILE="podcast.flac"
15 |
16 | os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "auth.json"
17 |
18 | print("Creating bucket")
19 |
20 | client = storage.Client()
21 | bucket = client.lookup_bucket(BUCKET)
22 | if not bucket:
23 | bucket = client.create_bucket(BUCKET)
24 |
25 | print("Uploading Podcast")
26 | blob = storage.Blob(FILE, bucket)
27 | blob.upload_from_filename(FILE)
28 |
29 | sclient = speech_client.SpeechClient()
30 | encoding = enums.RecognitionConfig.AudioEncoding.FLAC
31 | sample_rate_hertz = 16000
32 | language_code = 'en-US'
33 | config = cloud_speech_pb2.RecognitionConfig(
34 | encoding=encoding,
35 | sample_rate_hertz=sample_rate_hertz,
36 | language_code=language_code)
37 | uri = 'gs://%s/%s' % (BUCKET, FILE)
38 | audio = cloud_speech_pb2.RecognitionAudio(uri=uri)
39 | response = sclient.long_running_recognize(config, audio)
40 |
41 | def callback(operation_future):
42 | # Handle result.
43 | result = operation_future.result()
44 | with open("google-transcript.json", "w") as f:
45 | f.write(json_format.MessageToJson(result))
46 | print("Done!")
47 |
48 | response.add_done_callback(callback)
49 | print("Running speech recognition")
50 |
--------------------------------------------------------------------------------
/convert-to-html.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # This is a simple script to convert the watson transcript to
4 | # something a little easier to skim. It color codes the words based on
5 | # individual confidences, and puts delay markers in for every 100 ms
6 | # of speech delay. This makes it a bit easier to parse than the run on
7 | # blocks.
8 |
9 | import json
10 |
11 | data = json.load(open('watson-transcript.json'))
12 |
13 | # print(data)
14 |
15 | words = []
16 |
17 | for r in data["results"]:
18 | conf = r["alternatives"][0]["word_confidence"]
19 | times = r["alternatives"][0]["timestamps"]
20 | for i in range(len(conf)):
21 | conf[i].extend((times[i][1], times[i][2]))
22 | words.extend(conf)
23 |
24 | print """
25 |
26 |
42 |
43 |
44 | """
45 |
46 | lastend = 0
47 |
48 | for word in words:
49 | w, score, start, end = word
50 | space = start - lastend
51 | if 0.1 < space <= 0.3:
52 | print ", "
53 | elif 0.3 < space:
54 | num = int(space * 10)
55 | print "." * num
56 |
57 | print '%s ' % (score, start, end, score * 10, w)
58 | lastend = end
59 |
60 | print """
61 |
62 |
63 |
64 | """
65 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | ================================
2 | Transcribe Podcast Audio Testing
3 | ================================
4 |
5 | This is a repository demonstrating transcribing podcast audio with a
6 | few different services: IBM Watson Speech to Text, Google Cloud
7 | Speech, and CMU Sphinx.
8 |
9 | Getting Started
10 | ===============
11 |
12 | First run ``fetch_podcast.sh`` to pull some content down for
13 | converting. The example from my blog post is used by default; set the
14 | ``PODCAST_URL`` environment variable to try other content.
15 |
16 | Running CMU Sphinx
17 | ==================
18 |
19 | Ensure that sphinx is installed. On Ubuntu this is done with
20 |
21 | ::
22 |
23 | sudo apt-get install pocketsphinx pocketsphinx-en-us
24 |
25 | Then run ``transcribe_with_sphinx.sh``. It will produce
26 | ``sphinx-transcription.log`` as output.
27 |
28 | Running Watson
29 | ==============
30 |
31 | Sign up for IBM Bluemix and create a Watson Speech to Text instance.
32 |
33 | Copy ``auth.cfg.sample`` to ``auth.cfg`` and put your username and password in the ``[watson]`` section.
34 |
35 | Install python prereqs
36 |
37 | ::
38 |
39 | pip install watson-developer-cloud
40 |
41 | Then run it with
42 |
43 | ::
44 |
45 | watson-transcribe.py podcast.flac
46 |
47 | Optionally specify the id of a pretrained custom language model
48 |
49 | ::
50 |
51 | watson-transcribe.py --customization a13780b0-52b7-1fe7-fbb7-77471c70c949 podcast.flac
52 |
53 | .. note::
54 |
55 | It will take 30 - 45 minutes to run; it processes slightly faster
56 | than realtime.
57 |
58 | Running Google
59 | ==============
60 |
61 | Sign up for Google Cloud and create a project and a service key. Go
62 | through all the setup around authentication -
63 | https://googlecloudplatform.github.io/google-cloud-python/stable/google-cloud-auth.html
64 |
65 | Create a key for the service, put that json in ``auth.json`` in this
66 | directory.
67 |
68 | Install python prereqs
69 |
70 | ::
71 |
72 | pip install google-cloud-storage google-cloud-speech
73 |
74 | Then run it as
75 |
76 | ::
77 |
78 | google-transcribe.py
79 |
80 | .. note::
81 |
82 | It will take 20 - 30 minutes to run; it processes slightly faster
83 | than realtime.
84 |
--------------------------------------------------------------------------------
/watson-transcribe.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright 2017 IBM
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
6 | # not use this file except in compliance with the License. You may obtain
7 | # a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 | # License for the specific language governing permissions and limitations
15 | # under the License.
16 |
17 | import argparse
18 | import ConfigParser
19 | import json
20 | from os.path import join, dirname
21 | from watson_developer_cloud import SpeechToTextV1
22 |
23 |
24 | def get_auth():
25 | config = ConfigParser.RawConfigParser()
26 | config.read('auth.cfg')
27 | user = config.get('watson', 'username')
28 | password = config.get('watson', 'password')
29 | return (user, password)
30 |
31 |
32 | def parse_args():
33 | parser = argparse.ArgumentParser(
34 | description='Transcribe audio with watson')
35 | parser.add_argument('file')
36 | parser.add_argument('--customization', help="Process using a customized model id")
37 | args = parser.parse_args()
38 | return args
39 |
40 | def main():
41 | args = parse_args()
42 | (user, passwd) = get_auth()
43 | speech_to_text = SpeechToTextV1(
44 | username=user,
45 | password=passwd,
46 | x_watson_learning_opt_out=False
47 | )
48 |
49 | speech_to_text.get_model('en-US_BroadbandModel')
50 |
51 | with open(args.file, 'rb') as audio_file:
52 | print("Sending audio to watson to recognize, this will take 30+ minutes")
53 | if args.customization is not None:
54 | print("Using custom model {0}".format(args.customization))
55 | print("Please be patient and don't kill this process while running")
56 | output = speech_to_text.recognize(
57 | audio_file, content_type='audio/flac', timestamps=True,
58 | customization_id=args.customization,
59 | word_confidence=True)
60 | with open("watson-transcript.json", "w") as out:
61 | print("Transcription done, written to watson-transcription.json")
62 | out.write(json.dumps(output, indent=2))
63 |
64 | if __name__ == "__main__":
65 | main()
66 |
--------------------------------------------------------------------------------