├── README.md
├── AWS_transcribe.ipynb
├── AWS_transcribe_with_Speaker_diarization.ipynb
├── Google_Longaudio_API_without_speaker_diarization.ipynb
└── Google_Longaudio_API_speaker_diarization.ipynb


/README.md:
--------------------------------------------------------------------------------
1 | # Speech-to-text
2 | 
3 | Convert speech to text using different cloud APIs (AWS Transcribe and Google Cloud Speech-to-Text). Works well for long audio files, which are transcribed asynchronously.
4 | 


--------------------------------------------------------------------------------
/AWS_transcribe.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "filepath = \"~/audio_wav/\"\n",
 12 |     "output_filepath = \"~/Transcripts/\""
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 2,
 18 |    "metadata": {
 19 |     "collapsed": true
 20 |    },
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "from __future__ import print_function\n",
 24 |     "import time\n",
 25 |     "import boto3\n",
 26 |     "import json\n",
 27 |     "import os\n",
 28 |     "import botocore\n",
 29 |     "\n",
 30 |     "bucketName = \"audiofiles\""
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 3,
 36 |    "metadata": {
 37 |     "collapsed": true
 38 |    },
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "def upload_file_to_s3(audio_file_name):\n",
 42 |     "    \n",
 43 |     "    Key = filepath + audio_file_name\n",
 44 |     "    outPutname = audio_file_name\n",
 45 |     "\n",
 46 |     "    s3 = boto3.client('s3')\n",
 47 |     "    s3.upload_file(Key,bucketName,outPutname)"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 4,
 53 |    "metadata": {
 54 |     "collapsed": true
 55 |    },
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "def download_file_from_s3(audio_file_name):\n",
 59 |     "    \n",
 60 |     "    s3 = boto3.resource('s3')\n",
 61 |     "    \n",
 62 |     "    Key = outPutname = audio_file_name.split('.')[0] + '.json'\n",
 63 |     "    \n",
 64 |     "    try:\n",
 65 |     "        s3.Bucket(bucketName).download_file(Key, outPutname)\n",
 66 |     "    except botocore.exceptions.ClientError as e:\n",
 67 |     "        if e.response['Error']['Code'] == \"404\":\n",
 68 |     "            print(\"The object does not exist.\")\n",
 69 |     "        else:\n",
 70 |     "            raise"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": 5,
 76 |    "metadata": {
 77 |     "collapsed": true
 78 |    },
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "def delete_file_from_s3(audio_file_name):\n",
 82 |     "    \n",
 83 |     "    s3 = boto3.resource('s3')\n",
 84 |     "    s3.Object(bucketName, audio_file_name).delete()\n",
 85 |     "    s3.Object(bucketName, audio_file_name.split('.')[0] + '.json').delete()"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": 6,
 91 |    "metadata": {
 92 |     "collapsed": true
 93 |    },
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "def transcribe(audio_file_name):\n",
 97 |     "    \n",
 98 |     "    transcripts = ''\n",
 99 |     "    \n",
100 |     "    upload_file_to_s3(audio_file_name)\n",
101 |     "    \n",
102 |     "    transcribe = boto3.client('transcribe', region_name='us-east-2')\n",
103 |     "    job_name = audio_file_name.split('.')[0]\n",
104 |     "    job_uri = \"https://s3.us-east-2.amazonaws.com/\" + bucketName + \"/\" + audio_file_name\n",
105 |     "    transcribe.start_transcription_job(\n",
106 |     "        TranscriptionJobName=job_name,\n",
107 |     "        Media={'MediaFileUri': job_uri},\n",
108 |     "        MediaFormat='wav',\n",
109 |     "        LanguageCode='en-US',\n",
110 |     "        OutputBucketName=bucketName\n",
111 |     "    )\n",
112 |     "    while True:\n",
113 |     "        status = transcribe.get_transcription_job(TranscriptionJobName=job_name)\n",
114 |     "        if status['TranscriptionJob']['TranscriptionJobStatus'] in ['COMPLETED', 'FAILED']:\n",
115 |     "            break\n",
116 |     "        time.sleep(5)\n",
117 |     "    \n",
118 |     "    download_file_from_s3(audio_file_name)\n",
119 |     "    \n",
120 |     "    transcribe.delete_transcription_job(TranscriptionJobName=job_name)\n",
121 |     "    \n",
122 |     "    delete_file_from_s3(audio_file_name)\n",
123 |     "    \n",
124 |     "    with open(audio_file_name.split('.')[0] + '.json') as f:\n",
125 |     "        text = json.load(f)\n",
126 |     "    \n",
127 |     "    for i in text['results']['transcripts']:\n",
128 |     "        transcripts += i['transcript']\n",
129 |     "    \n",
130 |     "    os.remove(audio_file_name.split('.')[0] + '.json')\n",
131 |     "    \n",
132 |     "    return transcripts"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": 7,
138 |    "metadata": {
139 |     "collapsed": true
140 |    },
141 |    "outputs": [],
142 |    "source": [
143 |     "def write_transcripts(transcript_filename,transcript):\n",
144 |     "    f= open(output_filepath + transcript_filename,\"w+\")\n",
145 |     "    f.write(transcript)\n",
146 |     "    f.close() "
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": null,
152 |    "metadata": {
153 |     "collapsed": true
154 |    },
155 |    "outputs": [],
156 |    "source": [
157 |     "if __name__ == \"__main__\":\n",
158 |     "    files = [f for f in os.listdir(filepath) if f.endswith(\".wav\")]\n",
159 |     "    for audio_file_name in files:\n",
160 |     "        exists = os.path.isfile(output_filepath + audio_file_name.split('.')[0] + '.txt')\n",
161 |     "        if exists:\n",
162 |     "            pass\n",
163 |     "        else:\n",
164 |     "            print(audio_file_name)\n",
165 |     "            transcript = transcribe(audio_file_name)\n",
166 |     "            transcript_filename = audio_file_name.split('.')[0] + '.txt'\n",
167 |     "            write_transcripts(transcript_filename,transcript)"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": null,
173 |    "metadata": {
174 |     "collapsed": true
175 |    },
176 |    "outputs": [],
177 |    "source": []
178 |   }
179 |  ],
180 |  "metadata": {
181 |   "kernelspec": {
182 |    "display_name": "Python 3",
183 |    "language": "python",
184 |    "name": "python3"
185 |   },
186 |   "language_info": {
187 |    "codemirror_mode": {
188 |     "name": "ipython",
189 |     "version": 3
190 |    },
191 |    "file_extension": ".py",
192 |    "mimetype": "text/x-python",
193 |    "name": "python",
194 |    "nbconvert_exporter": "python",
195 |    "pygments_lexer": "ipython3",
196 |    "version": "3.5.2"
197 |   }
198 |  },
199 |  "nbformat": 4,
200 |  "nbformat_minor": 2
201 | }
202 | 


--------------------------------------------------------------------------------
/AWS_transcribe_with_Speaker_diarization.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 6,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "filepath = \"~/audio_wav/\"\n",
 12 |     "output_filepath = \"~/Transcripts/\""
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 7,
 18 |    "metadata": {
 19 |     "collapsed": true
 20 |    },
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "from __future__ import print_function\n",
 24 |     "import time\n",
 25 |     "import boto3\n",
 26 |     "import json\n",
 27 |     "import os\n",
 28 |     "import botocore\n",
 29 |     "\n",
 30 |     "bucketName = \"audiofiles\""
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 8,
 36 |    "metadata": {
 37 |     "collapsed": true
 38 |    },
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "def upload_file_to_s3(audio_file_name):\n",
 42 |     "    \n",
 43 |     "    Key = filepath + audio_file_name\n",
 44 |     "    outPutname = audio_file_name\n",
 45 |     "\n",
 46 |     "    s3 = boto3.client('s3')\n",
 47 |     "    s3.upload_file(Key,bucketName,outPutname)"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 9,
 53 |    "metadata": {
 54 |     "collapsed": true
 55 |    },
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "def download_file_from_s3(audio_file_name):\n",
 59 |     "    \n",
 60 |     "    s3 = boto3.resource('s3')\n",
 61 |     "    \n",
 62 |     "    Key = outPutname = audio_file_name.split('.')[0] + '.json'\n",
 63 |     "    \n",
 64 |     "    try:\n",
 65 |     "        s3.Bucket(bucketName).download_file(Key, outPutname)\n",
 66 |     "    except botocore.exceptions.ClientError as e:\n",
 67 |     "        if e.response['Error']['Code'] == \"404\":\n",
 68 |     "            print(\"The object does not exist.\")\n",
 69 |     "        else:\n",
 70 |     "            raise"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": 10,
 76 |    "metadata": {
 77 |     "collapsed": true
 78 |    },
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "def delete_file_from_s3(audio_file_name):\n",
 82 |     "    \n",
 83 |     "    s3 = boto3.resource('s3')\n",
 84 |     "    s3.Object(bucketName, audio_file_name).delete()\n",
 85 |     "    s3.Object(bucketName, audio_file_name.split('.')[0] + '.json').delete()"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": 15,
 91 |    "metadata": {
 92 |     "collapsed": true
 93 |    },
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "def transcribe(audio_file_name):\n",
 97 |     "    \n",
 98 |     "    transcripts = ''\n",
 99 |     "    \n",
100 |     "    upload_file_to_s3(audio_file_name)\n",
101 |     "    \n",
102 |     "    transcribe = boto3.client('transcribe', region_name='us-east-2')\n",
103 |     "    job_name = audio_file_name.split('.')[0]\n",
104 |     "    job_uri = \"https://s3.us-east-2.amazonaws.com/\" + bucketName + \"/\" + audio_file_name\n",
105 |     "    transcribe.start_transcription_job(\n",
106 |     "        TranscriptionJobName=job_name,\n",
107 |     "        Media={'MediaFileUri': job_uri},\n",
108 |     "        MediaFormat='wav',\n",
109 |     "        LanguageCode='en-US',\n",
110 |     "        Settings={'MaxSpeakerLabels':2,'ShowSpeakerLabels':True},\n",
111 |     "        OutputBucketName=bucketName\n",
112 |     "    )\n",
113 |     "    while True:\n",
114 |     "        status = transcribe.get_transcription_job(TranscriptionJobName=job_name)\n",
115 |     "        if status['TranscriptionJob']['TranscriptionJobStatus'] in ['COMPLETED', 'FAILED']:\n",
116 |     "            break\n",
117 |     "        time.sleep(5)\n",
118 |     "    \n",
119 |     "    download_file_from_s3(audio_file_name)\n",
120 |     "    \n",
121 |     "    transcribe.delete_transcription_job(TranscriptionJobName=job_name)\n",
122 |     "    \n",
123 |     "    delete_file_from_s3(audio_file_name)\n",
124 |     "    \n",
125 |     "    with open(audio_file_name.split('.')[0] + '.json') as f:\n",
126 |     "        text = json.load(f)\n",
127 |     "    \n",
128 |     "    for i in text['results']['transcripts']:\n",
129 |     "        transcripts += i['transcript']\n",
130 |     "    \n",
131 |     "    #os.remove(audio_file_name.split('.')[0] + '.json')\n",
132 |     "    \n",
133 |     "    return transcripts"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": 16,
139 |    "metadata": {
140 |     "collapsed": true
141 |    },
142 |    "outputs": [],
143 |    "source": [
144 |     "def write_transcripts(transcript_filename,transcript):\n",
145 |     "    f= open(output_filepath + transcript_filename,\"w+\")\n",
146 |     "    f.write(transcript)\n",
147 |     "    f.close() "
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": 17,
153 |    "metadata": {},
154 |    "outputs": [
155 |     {
156 |      "name": "stdout",
157 |      "output_type": "stream",
158 |      "text": [
159 |       "path.wav\n",
160 |       "Test.wav\n"
161 |      ]
162 |     }
163 |    ],
164 |    "source": [
165 |     "if __name__ == \"__main__\":\n",
166 |     "    files = [f for f in os.listdir(filepath) if f.endswith(\".wav\")]\n",
167 |     "    for audio_file_name in files:\n",
168 |     "        exists = os.path.isfile(output_filepath + audio_file_name.split('.')[0] + '.txt')\n",
169 |     "        if exists:\n",
170 |     "            pass\n",
171 |     "        else:\n",
172 |     "            print(audio_file_name)\n",
173 |     "            transcript = transcribe(audio_file_name)\n",
174 |     "            transcript_filename = audio_file_name.split('.')[0] + '.txt'\n",
175 |     "            write_transcripts(transcript_filename,transcript)"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": null,
181 |    "metadata": {
182 |     "collapsed": true
183 |    },
184 |    "outputs": [],
185 |    "source": []
186 |   }
187 |  ],
188 |  "metadata": {
189 |   "kernelspec": {
190 |    "display_name": "Python 3",
191 |    "language": "python",
192 |    "name": "python3"
193 |   },
194 |   "language_info": {
195 |    "codemirror_mode": {
196 |     "name": "ipython",
197 |     "version": 3
198 |    },
199 |    "file_extension": ".py",
200 |    "mimetype": "text/x-python",
201 |    "name": "python",
202 |    "nbconvert_exporter": "python",
203 |    "pygments_lexer": "ipython3",
204 |    "version": "3.5.2"
205 |   }
206 |  },
207 |  "nbformat": 4,
208 |  "nbformat_minor": 2
209 | }
210 | 


--------------------------------------------------------------------------------
/Google_Longaudio_API_without_speaker_diarization.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "filepath = \"~/audio_wav/\"\n",
 12 |     "output_filepath = \"~/Transcripts/\""
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 2,
 18 |    "metadata": {
 19 |     "collapsed": true
 20 |    },
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "from pydub import AudioSegment\n",
 24 |     "import io\n",
 25 |     "import os\n",
 26 |     "from google.cloud import speech\n",
 27 |     "from google.cloud.speech import enums\n",
 28 |     "from google.cloud.speech import types\n",
 29 |     "import wave\n",
 30 |     "from google.cloud import storage"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 3,
 36 |    "metadata": {
 37 |     "collapsed": true
 38 |    },
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "def mp3_to_wav(audio_file_name):\n",
 42 |     "    if audio_file_name.split('.')[1] == 'mp3':    \n",
 43 |     "        sound = AudioSegment.from_mp3(audio_file_name)\n",
 44 |     "        audio_file_name = audio_file_name.split('.')[0] + '.wav'\n",
 45 |     "        sound.export(audio_file_name, format=\"wav\")"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": 4,
 51 |    "metadata": {
 52 |     "collapsed": true
 53 |    },
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "def frame_rate_channel(audio_file_name):\n",
 57 |     "    with wave.open(audio_file_name, \"rb\") as wave_file:\n",
 58 |     "        frame_rate = wave_file.getframerate()\n",
 59 |     "        channels = wave_file.getnchannels()\n",
 60 |     "        return frame_rate,channels"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 5,
 66 |    "metadata": {
 67 |     "collapsed": true
 68 |    },
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "def stereo_to_mono(audio_file_name):\n",
 72 |     "    sound = AudioSegment.from_wav(audio_file_name)\n",
 73 |     "    sound = sound.set_channels(1)\n",
 74 |     "    sound.export(audio_file_name, format=\"wav\")"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 6,
 80 |    "metadata": {
 81 |     "collapsed": true
 82 |    },
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "def upload_blob(bucket_name, source_file_name, destination_blob_name):\n",
 86 |     "    \"\"\"Uploads a file to the bucket.\"\"\"\n",
 87 |     "    storage_client = storage.Client()\n",
 88 |     "    bucket = storage_client.get_bucket(bucket_name)\n",
 89 |     "    blob = bucket.blob(destination_blob_name)\n",
 90 |     "\n",
 91 |     "    blob.upload_from_filename(source_file_name)"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 7,
 97 |    "metadata": {
 98 |     "collapsed": true
 99 |    },
100 |    "outputs": [],
101 |    "source": [
102 |     "def delete_blob(bucket_name, blob_name):\n",
103 |     "    \"\"\"Deletes a blob from the bucket.\"\"\"\n",
104 |     "    storage_client = storage.Client()\n",
105 |     "    bucket = storage_client.get_bucket(bucket_name)\n",
106 |     "    blob = bucket.blob(blob_name)\n",
107 |     "\n",
108 |     "    blob.delete()"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 8,
114 |    "metadata": {
115 |     "collapsed": true
116 |    },
117 |    "outputs": [],
118 |    "source": [
119 |     "def google_transcribe(audio_file_name):\n",
120 |     "    \n",
121 |     "    file_name = filepath + audio_file_name\n",
122 |     "    mp3_to_wav(file_name)\n",
123 |     "\n",
124 |     "    # Read the sample rate and channel count from the WAV header\n",
125 |     "    \n",
126 |     "    frame_rate, channels = frame_rate_channel(file_name)\n",
127 |     "    \n",
128 |     "    if channels > 1:\n",
129 |     "        stereo_to_mono(file_name)\n",
130 |     "    \n",
131 |     "    bucket_name = 'callsaudiofiles'\n",
132 |     "    source_file_name = filepath + audio_file_name\n",
133 |     "    destination_blob_name = audio_file_name\n",
134 |     "    \n",
135 |     "    upload_blob(bucket_name, source_file_name, destination_blob_name)\n",
136 |     "    \n",
137 |     "    gcs_uri = 'gs://callsaudiofiles/' + audio_file_name\n",
138 |     "    transcript = ''\n",
139 |     "    \n",
140 |     "    client = speech.SpeechClient()\n",
141 |     "    audio = types.RecognitionAudio(uri=gcs_uri)\n",
142 |     "\n",
143 |     "    config = types.RecognitionConfig(\n",
144 |     "    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,\n",
145 |     "    sample_rate_hertz=frame_rate,\n",
146 |     "    language_code='en-US')\n",
147 |     "\n",
148 |     "    # Detects speech in the audio file\n",
149 |     "    operation = client.long_running_recognize(config, audio)\n",
150 |     "    response = operation.result(timeout=10000)\n",
151 |     "\n",
152 |     "    for result in response.results:\n",
153 |     "        transcript += result.alternatives[0].transcript\n",
154 |     "    \n",
155 |     "    delete_blob(bucket_name, destination_blob_name)\n",
156 |     "    return transcript"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "code",
161 |    "execution_count": 9,
162 |    "metadata": {
163 |     "collapsed": true
164 |    },
165 |    "outputs": [],
166 |    "source": [
167 |     "def write_transcripts(transcript_filename,transcript):\n",
168 |     "    f= open(output_filepath + transcript_filename,\"w+\")\n",
169 |     "    f.write(transcript)\n",
170 |     "    f.close() "
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": 10,
176 |    "metadata": {
177 |     "collapsed": true
178 |    },
179 |    "outputs": [],
180 |    "source": [
181 |     "if __name__ == \"__main__\":\n",
182 |     "    for audio_file_name in os.listdir(filepath):\n",
183 |     "        transcript = google_transcribe(audio_file_name)\n",
184 |     "        transcript_filename = audio_file_name.split('.')[0] + '.txt'\n",
185 |     "        write_transcripts(transcript_filename,transcript)"
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "code",
190 |    "execution_count": null,
191 |    "metadata": {
192 |     "collapsed": true
193 |    },
194 |    "outputs": [],
195 |    "source": []
196 |   }
197 |  ],
198 |  "metadata": {
199 |   "kernelspec": {
200 |    "display_name": "Python 3",
201 |    "language": "python",
202 |    "name": "python3"
203 |   },
204 |   "language_info": {
205 |    "codemirror_mode": {
206 |     "name": "ipython",
207 |     "version": 3
208 |    },
209 |    "file_extension": ".py",
210 |    "mimetype": "text/x-python",
211 |    "name": "python",
212 |    "nbconvert_exporter": "python",
213 |    "pygments_lexer": "ipython3",
214 |    "version": "3.5.2"
215 |   }
216 |  },
217 |  "nbformat": 4,
218 |  "nbformat_minor": 2
219 | }
220 | 


--------------------------------------------------------------------------------
/Google_Longaudio_API_speaker_diarization.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "filepath = \"~/audio_wav/\"\n",
 12 |     "output_filepath = \"~/Transcripts/\""
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 2,
 18 |    "metadata": {
 19 |     "collapsed": true
 20 |    },
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "from pydub import AudioSegment\n",
 24 |     "import io\n",
 25 |     "import os\n",
 26 |     "from google.cloud import speech_v1p1beta1 as speech\n",
 27 |     "from google.cloud.speech_v1p1beta1 import enums\n",
 28 |     "from google.cloud.speech_v1p1beta1 import types\n",
 29 |     "import wave\n",
 30 |     "from google.cloud import storage"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 3,
 36 |    "metadata": {
 37 |     "collapsed": true
 38 |    },
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "def mp3_to_wav(audio_file_name):\n",
 42 |     "    if audio_file_name.split('.')[1] == 'mp3':    \n",
 43 |     "        sound = AudioSegment.from_mp3(audio_file_name)\n",
 44 |     "        audio_file_name = audio_file_name.split('.')[0] + '.wav'\n",
 45 |     "        sound.export(audio_file_name, format=\"wav\")"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": 4,
 51 |    "metadata": {
 52 |     "collapsed": true
 53 |    },
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "def frame_rate_channel(audio_file_name):\n",
 57 |     "    with wave.open(audio_file_name, \"rb\") as wave_file:\n",
 58 |     "        frame_rate = wave_file.getframerate()\n",
 59 |     "        channels = wave_file.getnchannels()\n",
 60 |     "        return frame_rate,channels"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 5,
 66 |    "metadata": {
 67 |     "collapsed": true
 68 |    },
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "def stereo_to_mono(audio_file_name):\n",
 72 |     "    sound = AudioSegment.from_wav(audio_file_name)\n",
 73 |     "    sound = sound.set_channels(1)\n",
 74 |     "    sound.export(audio_file_name, format=\"wav\")"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 6,
 80 |    "metadata": {
 81 |     "collapsed": true
 82 |    },
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "def upload_blob(bucket_name, source_file_name, destination_blob_name):\n",
 86 |     "    \"\"\"Uploads a file to the bucket.\"\"\"\n",
 87 |     "    storage_client = storage.Client()\n",
 88 |     "    bucket = storage_client.get_bucket(bucket_name)\n",
 89 |     "    blob = bucket.blob(destination_blob_name)\n",
 90 |     "\n",
 91 |     "    blob.upload_from_filename(source_file_name)"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 7,
 97 |    "metadata": {
 98 |     "collapsed": true
 99 |    },
100 |    "outputs": [],
101 |    "source": [
102 |     "def delete_blob(bucket_name, blob_name):\n",
103 |     "    \"\"\"Deletes a blob from the bucket.\"\"\"\n",
104 |     "    storage_client = storage.Client()\n",
105 |     "    bucket = storage_client.get_bucket(bucket_name)\n",
106 |     "    blob = bucket.blob(blob_name)\n",
107 |     "\n",
108 |     "    blob.delete()"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 8,
114 |    "metadata": {
115 |     "collapsed": true
116 |    },
117 |    "outputs": [],
118 |    "source": [
119 |     "def google_transcribe(audio_file_name):\n",
120 |     "    \n",
121 |     "    file_name = filepath + audio_file_name\n",
122 |     "    mp3_to_wav(file_name)\n",
123 |     "\n",
124 |     "    # Read the sample rate and channel count from the WAV header\n",
125 |     "    \n",
126 |     "    frame_rate, channels = frame_rate_channel(file_name)\n",
127 |     "    \n",
128 |     "    if channels > 1:\n",
129 |     "        stereo_to_mono(file_name)\n",
130 |     "    \n",
131 |     "    bucket_name = 'callsaudiofiles'\n",
132 |     "    source_file_name = filepath + audio_file_name\n",
133 |     "    destination_blob_name = audio_file_name\n",
134 |     "    \n",
135 |     "    upload_blob(bucket_name, source_file_name, destination_blob_name)\n",
136 |     "    \n",
137 |     "    gcs_uri = 'gs://callsaudiofiles/' + audio_file_name\n",
138 |     "    transcript = ''\n",
139 |     "    \n",
140 |     "    client = speech.SpeechClient()\n",
141 |     "    audio = types.RecognitionAudio(uri=gcs_uri)\n",
142 |     "\n",
143 |     "    config = types.RecognitionConfig(\n",
144 |     "    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,\n",
145 |     "    sample_rate_hertz=frame_rate,\n",
146 |     "    language_code='en-US',\n",
147 |     "    enable_speaker_diarization=True,\n",
148 |     "    diarization_speaker_count=2)\n",
149 |     "\n",
150 |     "    # Detects speech in the audio file\n",
151 |     "    operation = client.long_running_recognize(config, audio)\n",
152 |     "    response = operation.result(timeout=10000)\n",
153 |     "    result = response.results[-1]\n",
154 |     "    words_info = result.alternatives[0].words\n",
155 |     "    \n",
156 |     "    tag=1\n",
157 |     "    speaker=\"\"\n",
158 |     "\n",
159 |     "    for word_info in words_info:\n",
160 |     "        if word_info.speaker_tag==tag:\n",
161 |     "            speaker=speaker+\" \"+word_info.word\n",
162 |     "        else:\n",
163 |     "            transcript += \"speaker {}: {}\".format(tag,speaker) + '\\n'\n",
164 |     "            tag=word_info.speaker_tag\n",
165 |     "            speaker=\"\"+word_info.word\n",
166 |     "\n",
167 |     "    transcript += \"speaker {}: {}\".format(tag,speaker)\n",
168 |     "    \n",
169 |     "    delete_blob(bucket_name, destination_blob_name)\n",
170 |     "    return transcript"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": 9,
176 |    "metadata": {
177 |     "collapsed": true
178 |    },
179 |    "outputs": [],
180 |    "source": [
181 |     "def write_transcripts(transcript_filename,transcript):\n",
182 |     "    f= open(output_filepath + transcript_filename,\"w+\")\n",
183 |     "    f.write(transcript)\n",
184 |     "    f.close() "
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "code",
189 |    "execution_count": 10,
190 |    "metadata": {
191 |     "collapsed": true
192 |    },
193 |    "outputs": [],
194 |    "source": [
195 |     "if __name__ == \"__main__\":\n",
196 |     "    for audio_file_name in os.listdir(filepath):\n",
197 |     "        exists = os.path.isfile(output_filepath + audio_file_name.split('.')[0] + '.txt')\n",
198 |     "        if exists:\n",
199 |     "            pass\n",
200 |     "        else:\n",
201 |     "            transcript = google_transcribe(audio_file_name)\n",
202 |     "            transcript_filename = audio_file_name.split('.')[0] + '.txt'\n",
203 |     "            write_transcripts(transcript_filename,transcript)"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": null,
209 |    "metadata": {
210 |     "collapsed": true
211 |    },
212 |    "outputs": [],
213 |    "source": []
214 |   },
215 |   {
216 |    "cell_type": "code",
217 |    "execution_count": null,
218 |    "metadata": {
219 |     "collapsed": true
220 |    },
221 |    "outputs": [],
222 |    "source": []
223 |   }
224 |  ],
225 |  "metadata": {
226 |   "kernelspec": {
227 |    "display_name": "Python 3",
228 |    "language": "python",
229 |    "name": "python3"
230 |   },
231 |   "language_info": {
232 |    "codemirror_mode": {
233 |     "name": "ipython",
234 |     "version": 3
235 |    },
236 |    "file_extension": ".py",
237 |    "mimetype": "text/x-python",
238 |    "name": "python",
239 |    "nbconvert_exporter": "python",
240 |    "pygments_lexer": "ipython3",
241 |    "version": "3.5.2"
242 |   }
243 |  },
244 |  "nbformat": 4,
245 |  "nbformat_minor": 2
246 | }
247 | 


--------------------------------------------------------------------------------