├── README.md
├── speech_text_whisper_offline.py
├── templates
│   ├── conversation.html
│   └── ai_assistant_button.html
├── speech_text_whisper.py
├── terminal_print_elements.py
├── app_flask_basic.py
├── geo_location_point.py
├── audio_play.py
├── audio_get_channels.py
├── terminal_spectrum_viz.py
├── terminal_print.py
├── app.flask_speak.py
├── audio_record.py
├── speech_text_offline.py
├── geo_google.py
├── ai_doc_creator.py
├── pilot.py
├── pilot_functions.py
├── conversation.ipynb
├── terminal_chat_func.py
├── terminal_audio_spectrum.ipynb
├── app_functions_map.py
├── conversation.py
├── pilot_terminal.py
└── nlp_labelling.ipynb

/README.md:
--------------------------------------------------------------------------------
1 | # ai_assistent
2 | This repository contains scripts that use AI to assist you.

--------------------------------------------------------------------------------
/speech_text_whisper_offline.py:
--------------------------------------------------------------------------------
1 | #%%
2 | # Offline transcription with the open-source Whisper package
3 | import whisper
4 | 
5 | model = whisper.load_model("base")
6 | result = model.transcribe("output.wav")
7 | print(result)
8 | #%%
9 | 
--------------------------------------------------------------------------------
/templates/conversation.html:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | Conversation
6 | 
12 | 
13 | 

14 | Conversation Page
15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /speech_text_whisper.py: -------------------------------------------------------------------------------- 1 | # import openai 2 | import openai 3 | import json 4 | import pandas as pd 5 | import credentials 6 | import os 7 | 8 | script_dir = os.path.dirname(os.path.abspath(__file__)) 9 | filename = os.path.join(script_dir, "audio_output.wav") 10 | 11 | #%% 12 | def get_transcript_whisper(): 13 | openai.api_key = credentials.api_key 14 | file = open(filename, "rb") 15 | transcription = openai.Audio.transcribe("whisper-1", file, response_format="json") 16 | text = transcription["text"] 17 | return text 18 | 19 | # Main code 20 | output = get_transcript_whisper() 21 | print(output) 22 | print(type(output)) 23 | 24 | #%% 25 | -------------------------------------------------------------------------------- /terminal_print_elements.py: -------------------------------------------------------------------------------- 1 | # Rich text function to display list or dictionarys in panel form 2 | 3 | from rich import print 4 | from rich.console import Console 5 | from rich.columns import Columns 6 | from rich.panel import Panel 7 | import json 8 | 9 | # Test data with three columns 'Name' 'Age' 'City' and 10 persons 10 | 11 | data = [["John Smith", 33, "New York"], 12 | ["Jane Doe", 32, "Paris"], 13 | ["Sam Doe", 35, "London"], 14 | ["Susan", 33, "New York"], 15 | ["John Smith", 33, "Sydney"], 16 | ["Jane Doe", 32, "Amsterdam"], 17 | ["Sam Doe", 35, "London"], 18 | ["Susan", 33, "New York"], 19 | ["John Smith", 33, "Sydney"]] 20 | 21 | json_data = json.dumps(data) -------------------------------------------------------------------------------- /app_flask_basic.py: -------------------------------------------------------------------------------- 1 | # app.py 2 | 3 | '''This is a basic Flask app that uses the terminal_chat_func.py file to run the chatbot functions. It also uses the gTTS library to convert the chatbot's answer to speech and the pygame library to play the audio. 
The speak_answer function is called in a new thread so that the chatbot can continue to run while the audio is playing.''' 4 | 5 | from flask import Flask, render_template, request 6 | from terminal_chat_func import run_all_functions 7 | 8 | app = Flask(__name__) 9 | 10 | @app.route('/', methods=['GET', 'POST']) 11 | def index(): 12 | if request.method == 'POST': 13 | transcript, answer = run_all_functions() 14 | return render_template('index.html', transcript=transcript, answer=answer) 15 | return render_template('index.html') 16 | 17 | 18 | if __name__ == '__main__': 19 | app.run(debug=True) 20 | 21 | #%% 22 | 23 | #%% 24 | -------------------------------------------------------------------------------- /geo_location_point.py: -------------------------------------------------------------------------------- 1 | #%% 2 | # importing geopy library 3 | from geopy.geocoders import Nominatim 4 | import json 5 | from urllib.request import urlopen 6 | import pandas as pd 7 | 8 | 9 | #%% 10 | def get_geo_location(city_name): 11 | # calling the Nominatim tool 12 | loc = Nominatim(user_agent="GetLoc") 13 | 14 | # entering the location name 15 | getLoc = loc.geocode(city_name) 16 | 17 | # printing address 18 | print(getLoc.address) 19 | 20 | # printing latitude and longitude 21 | print("Latitude = ", getLoc.latitude, "\n") 22 | print("Longitude = ", getLoc.longitude) 23 | 24 | return getLoc 25 | 26 | 27 | def get_location(): 28 | url = "http://ipinfo.io/json" 29 | response = urlopen(url) 30 | data = json.load(response) 31 | # Get location 32 | lng = data['loc'].split(',')[1] 33 | lat = data['loc'].split(',')[0] 34 | location = {'lat': lat, 'lng': lng} 35 | return location 36 | 37 | -------------------------------------------------------------------------------- /audio_play.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import pyaudio 3 | import wave 4 | from audio_get_channels import get_speaker 5 | import os 6 | 7 | script_dir = os.path.dirname(os.path.abspath(__file__)) 8 | 9 | def play_recording(): 10 | filename = os.path.join(script_dir, "audio_output.wav") 11 | sound = wave.open(filename) 12 | p = pyaudio.PyAudio() 13 | print(f"Start playing {sound.getnchannels()} channels at {sound.getframerate()} Hz") 14 | chunk = 1024 15 | stream = p.open(format=p.get_format_from_width(sound.getsampwidth()), 16 | channels=sound.getnchannels(), 17 | rate=sound.getframerate(), 18 | output=True, 19 | output_device_index=get_speaker()) 20 | 21 | data = sound.readframes(chunk) 22 | while True: 23 | if data != '': 24 | stream.write(data) 25 | data = sound.readframes(chunk) 26 | 27 | if data == b'': 28 | break 29 | 30 | stream.stop_stream() 31 | stream.close() 32 | p.terminate() 33 | 34 | print("Finished playing") 35 | 36 | play_recording() 37 | 38 | #%% 39 | -------------------------------------------------------------------------------- /audio_get_channels.py: -------------------------------------------------------------------------------- 1 | # pyaudio get channels and device index 2 | import pyaudio 3 | import pandas as pd 4 | 5 | # Get all channels printed 6 | def get_all_channels(): 7 | p = pyaudio.PyAudio() 8 | channels = {} 9 | for i in range(p.get_device_count()): 10 | dev = p.get_device_info_by_index(i) 11 | rate = p.get_device_info_by_index(0)['defaultSampleRate'] 12 | 13 | print((i, dev['name'], dev['maxInputChannels']), rate) 14 | 15 | # Get all channels printed in a df 16 | def get_all_channels_df(): 17 | p = pyaudio.PyAudio() 18 | channels = {} 19 | 
for i in range(p.get_device_count()): 20 | dev = p.get_device_info_by_index(i) 21 | channels[i] = i, dev['name'], dev['maxInputChannels'] 22 | 23 | return pd.DataFrame(channels).T 24 | 25 | # Get channel Macbook Pro Microphone 26 | def get_cur_mic(): 27 | p = pyaudio.PyAudio() 28 | for i in range(p.get_device_count()): 29 | dev = p.get_device_info_by_index(i) 30 | if 'pro microphone' in dev['name'].lower(): 31 | return i 32 | return None 33 | 34 | def get_speaker(): 35 | p = pyaudio.PyAudio() 36 | for i in range(p.get_device_count()): 37 | dev = p.get_device_info_by_index(i) 38 | if 'speaker' in dev['name'].lower(): 39 | return i 40 | return None 41 | 42 | #%% 43 | -------------------------------------------------------------------------------- /terminal_spectrum_viz.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import pyaudio 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import time 6 | import struct 7 | import tkinter as tk 8 | from tkinter import TclError 9 | 10 | # https://www.youtube.com/watch?v=AShHJdSIxkY 11 | 12 | #%% 13 | chunk = 4410 14 | sample_format = pyaudio.paInt16 15 | channels = 1 # Settings with screen = 1 | Settings without screen = 1 16 | fs = 44100 17 | seconds = 10 18 | device_index = 1 # Settings with screen = 2 | Settings without screen = 1 | Settings uwithout Wifi = 0 19 | filename = "audio_spectrum.wav" 20 | 21 | p = pyaudio.PyAudio() # Create an interface to PortAudio 22 | # Implement a linebreak inn the print 23 | 24 | 25 | print(f'\n... Recording {seconds} seconds of audio initialized ...\n') 26 | 27 | stream = p.open(format=sample_format, 28 | channels=channels, 29 | rate=fs, 30 | input_device_index = device_index, 31 | frames_per_buffer=chunk, 32 | output=True, 33 | input=True) 34 | 35 | data = stream.read(chunk) 36 | #print(data) 37 | 38 | print(len(data)) 39 | data_int = np.array(struct.unpack(str(2*chunk) + 'B', data), dtype= 'b')[::2] + 128 40 | 41 | fix, ax = plt.subplots() 42 | ax.plot(data_int, '-') 43 | plt.show() 44 | 45 | #%% 46 | -------------------------------------------------------------------------------- /terminal_print.py: -------------------------------------------------------------------------------- 1 | # Print function to slowly print the text except when the user presses Enter 2 | # With pressing enter, the full answer is printed instantly 3 | from rich import print 4 | from rich.console import Console 5 | from pynput import keyboard 6 | import time 7 | import sys 8 | 9 | console = Console() 10 | 11 | def print_answer(answer): 12 | print_complete = False 13 | break_program = False 14 | 15 | def on_press(key): 16 | nonlocal break_program 17 | if key == keyboard.Key.enter: 18 | console.print('Printout activated', style='bold red') 19 | break_program = True 20 | return False 21 | 22 | listener_thread = keyboard.Listener(on_press=on_press) 23 | listener_thread.start() 24 | 25 | try: 26 | for line in answer.splitlines(): 27 | for word in line.split(): 28 | console.print(word, end=' ') 29 | sys.stdout.flush() 30 | time.sleep(0.30) 31 | if break_program: 32 | break 33 | 34 | sys.stdout.write('\n') 35 | 36 | if line == answer.splitlines()[-1] and word == line.split()[-1]: 37 | console.print('\nPrintout completed', style='bold green') 38 | print_complete = True 39 | break_program = True 40 | if print_complete: 41 | break 42 | 43 | finally: 44 | listener_thread.join() 45 | 46 | -------------------------------------------------------------------------------- /app.flask_speak.py: 
-------------------------------------------------------------------------------- 1 | # app.py 2 | 3 | '''This is a basic Flask app that uses the terminal_chat_func.py file to run the chatbot functions. It also uses the gTTS library to convert the chatbot's answer to speech and the pygame library to play the audio. The speak_answer function is called in a new thread so that the chatbot can continue to run while the audio is playing.''' 4 | 5 | from flask import Flask, render_template, request 6 | from terminal_chat_func import run_all_functions 7 | from gtts import gTTS 8 | import pygame 9 | import threading 10 | import tempfile 11 | 12 | # Initialize pygame mixer 13 | pygame.mixer.init() 14 | 15 | def speak_answer(answer): 16 | tts = gTTS(text=answer, lang='en') 17 | with tempfile.NamedTemporaryFile(delete=True) as f: 18 | tts.save(f.name) 19 | pygame.mixer.music.load(f.name) 20 | pygame.mixer.music.play() 21 | while pygame.mixer.music.get_busy(): 22 | pygame.time.Clock().tick(10) 23 | 24 | app = Flask(__name__) 25 | 26 | @app.route('/', methods=['GET', 'POST']) 27 | def index(): 28 | if request.method == 'POST': 29 | transcript, answer = run_all_functions() 30 | # Call the speak_answer function in a new thread 31 | t = threading.Thread(target=speak_answer, args=(answer,)) 32 | t.start() 33 | return render_template('index.html', transcript=transcript, answer=answer) 34 | return render_template('index.html') 35 | 36 | if __name__ == '__main__': 37 | app.run(debug=True) -------------------------------------------------------------------------------- /audio_record.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import pyaudio 3 | import wave 4 | from audio_get_channels import get_cur_mic 5 | import os 6 | 7 | # Audio channels device_index need to be adjusted to the current settings: 8 | # chanel settings with screen = 1 | Settings without screen = 1 9 | # device_settings with screen = 2 | Settings without screen = 1 | Settings without Wifi = 0 10 | 11 | # (0, 'IPhone 14NJ Microphone', 1) 12 | # (1, 'External Microphone', 1) 13 | # (2, 'External Headphones', 0) 14 | # (3, 'MacBook Pro Microphone', 1) 15 | # (4, 'MacBook Pro Speakers', 0) 16 | # (5, 'Microsoft Teams Audio', 2) 17 | 18 | script_dir = os.path.dirname(os.path.abspath(__file__)) 19 | 20 | def audio_rec(num_seconds): 21 | chunk = 4410 22 | fs = 44100 23 | channels = 1 # Adjust as mentioned above 24 | seconds = num_seconds 25 | device_index = get_cur_mic() # Adjust as mentioned above 26 | sample_format = pyaudio.paInt16 27 | filename = os.path.join(script_dir, "audio_output.wav") 28 | 29 | p = pyaudio.PyAudio() # Create an interface to PortAudio 30 | 31 | print(f'\n... Recording {seconds} seconds of audio initialized ...\n') 32 | 33 | stream = p.open(format=sample_format, 34 | channels=channels, 35 | rate=fs, 36 | input_device_index=device_index, 37 | frames_per_buffer=chunk, 38 | input=True) 39 | 40 | frames = [] # Initialize array to store frames 41 | 42 | # While the for loop is running and recording, print countdown in seconds 43 | second_tracking = 0 44 | second_count = 0 45 | for i in range(0, int(fs/chunk*seconds)): 46 | data = stream.read(chunk) 47 | frames.append(data) 48 | second_tracking += 1 49 | if second_tracking == fs/chunk: 50 | second_count += 1 51 | second_tracking = 0 52 | print(f'... 
Time left: {seconds - second_count} seconds') 53 | # Stop and close the stream 54 | stream.stop_stream() 55 | stream.close() 56 | # Terminate the PortAudio interface 57 | p.terminate() 58 | 59 | print('\n... Finished recording ...') 60 | 61 | # Save the recorded data as a WAV file 62 | wf = wave.open(filename, 'wb') 63 | wf.setnchannels(channels) 64 | wf.setsampwidth(p.get_sample_size(sample_format)) 65 | wf.setframerate(fs) 66 | wf.writeframes(b''.join(frames)) 67 | wf.close() 68 | 69 | audio_rec(5) 70 | 71 | #%% 72 | -------------------------------------------------------------------------------- /speech_text_offline.py: -------------------------------------------------------------------------------- 1 | import pyaudio 2 | import wave 3 | 4 | chunk = 1024 5 | sample_format = pyaudio.paInt16 6 | channels = 1 7 | fs = 44100 8 | seconds = 3 9 | device_index = 2 10 | filename = "data/output.wav" 11 | 12 | p = pyaudio.PyAudio() 13 | 14 | stream = p.open(format=sample_format, 15 | channels=channels, 16 | rate=fs, 17 | frames_per_buffer=chunk, 18 | input_device_index=device_index, 19 | input=True) 20 | 21 | frames = [] 22 | 23 | for i in range(0, int(fs / chunk * seconds)): 24 | data = stream.read(chunk) 25 | frames.append(data) 26 | 27 | stream.stop_stream() 28 | stream.close() 29 | p.terminate() 30 | 31 | wf = wave.open(filename, 'wb') 32 | wf.setnchannels(channels) 33 | wf.setsampwidth(p.get_sample_size(sample_format)) 34 | wf.setframerate(fs) 35 | wf.writeframes(b''.join(frames)) 36 | wf.close() 37 | #%% 38 | # Path: speach-to-text.py 39 | import speech_recognition as sr 40 | 41 | r = sr.Recognizer() 42 | with sr.AudioFile('data/output.wav') as source: 43 | audio = r.record(source) 44 | 45 | try: 46 | print("Text: " + r.recognize_google(audio)) 47 | except Exception as e: 48 | print("Exception: " + str(e)) 49 | 50 | 51 | # Path: text-to-speech.py 52 | from gtts import gTTS 53 | import os 54 | 55 | #%% 56 | import pyaudio 57 | p = pyaudio.PyAudio() 58 | for i in range(p.get_device_count()): 59 | dev = p.get_device_info_by_index(i) 60 | print((i,dev['name'],dev['maxInputChannels'])) 61 | 62 | #%% SECOND TRY 63 | #%% 64 | # speech recognition 65 | import speech_recognition as sr 66 | # transcribe audio file 67 | AUDIO_FILE = "data/output.wav" 68 | 69 | # use the audio file as the audio source 70 | 71 | r = sr.Recognizer() 72 | with sr.AudioFile(AUDIO_FILE) as source: 73 | audio = r.record(source) # read the entire audio file 74 | 75 | # recognize speech using Google Speech Recognition 76 | try: 77 | recognised = r.recognize_google(audio, language="de-DE") 78 | # print recognised text and confidence 79 | print(recognised) 80 | 81 | except sr.UnknownValueError: 82 | print("Google Speech Recognition could not understand audio") 83 | except sr.RequestError as e: 84 | print("Could not request results from Google Speech Recognition service; {0}".format(e)) 85 | 86 | #%% 87 | print(recognised) 88 | 89 | # If recognised text contains the word "Barcelona", then play the pilot_functions.py script 90 | if "Barcelona" in recognised: 91 | print("Barcelona") 92 | import pilot_functions 93 | #pilot.get_story() 94 | -------------------------------------------------------------------------------- /geo_google.py: -------------------------------------------------------------------------------- 1 | #%% 2 | # Libraries 3 | import requests 4 | import credentials 5 | from geo_location_point import get_location 6 | import pandas as pd 7 | import time 8 | 9 | # Get surroundings by google maps api 10 | api_key = 
credentials.google_api_key 11 | 12 | lat = 41.3828 13 | lng = 2.1824 14 | radius = 1000 # -33.8670522,151.1957362&radius=500& 15 | types = '' 16 | search_name = '' 17 | 18 | #%% Get location function 19 | 20 | def get_surroundings(lat, lng, radius, types, search_name): 21 | api_key = credentials.google_api_key 22 | # Google Maps directions API endpoint 23 | prompt = f'https://maps.googleapis.com/maps/api/place/nearbysearch/json?location={lat},{lng}&radius={radius}&types={types}&name={search_name}&key={api_key}' 24 | # Send request and get response 25 | response = requests.get(prompt) 26 | # Get response data as Python object 27 | data = response.json() 28 | results = data['results'] 29 | # Parse the results to a dataframe 30 | df = pd.DataFrame.from_dict(results) 31 | df = df[['geometry', 'name', 'place_id', 'types', 'vicinity', 'business_status', 'rating', 'user_ratings_total', 'opening_hours']] 32 | df['lat'] = df['geometry'].apply(lambda x: x['location']['lat']) 33 | df['lng'] = df['geometry'].apply(lambda x: x['location']['lng']) 34 | df = df.drop(columns=['geometry']) 35 | return df 36 | 37 | get_surroundings(lat, lng, radius, types, search_name) 38 | 39 | #%% 40 | # Get surroundings with pagination 3*20 41 | 42 | 43 | 44 | def get_surroundings_pagination(lat, lng, radius, types, search_name): 45 | api_key = credentials.google_api_key 46 | # Paginated request 47 | df_list = [] 48 | for i in range(0, 3): 49 | # Google Maps directions API endpoint 50 | prompt = f'https://maps.googleapis.com/maps/api/place/nearbysearch/json?location={lat},{lng}&radius={radius}&types={types}&name={search_name}&key={api_key}&pagetoken={i}' 51 | # Send request and get response 52 | response = requests.get(prompt) 53 | # Get response data as Python object 54 | data = response.json() 55 | results = data['results'] 56 | # Parse the results to a dataframe 57 | df_list.append(pd.DataFrame.from_dict(results)) 58 | time.sleep(1) # Add a 1-second delay before the next request 59 | df = pd.concat(df_list, ignore_index=True) 60 | df = df[['geometry', 'name', 'place_id', 'types', 'vicinity', 'business_status', 'rating', 'user_ratings_total', 'opening_hours']] 61 | df['lat'] = df['geometry'].apply(lambda x: x['location']['lat']) 62 | df['lng'] = df['geometry'].apply(lambda x: x['location']['lng']) 63 | df = df.drop(columns=['geometry']) 64 | return df 65 | 66 | 67 | #%% 68 | -------------------------------------------------------------------------------- /ai_doc_creator.py: -------------------------------------------------------------------------------- 1 | 2 | # --- MANUAL TEMPLATE --- 3 | # Import libraries 4 | from docx import Document 5 | from docx.shared import Inches 6 | import openai 7 | import credentials 8 | import os 9 | 10 | script_dir = os.path.dirname(os.path.abspath(__file__)) 11 | startfile = os.path.join(script_dir, "template.docx") 12 | 13 | #%% 14 | # --- CHATGPT PROMPT --- 15 | header = input("\nTitle of the document: ") 16 | openai.api_key = credentials.api_key 17 | number_paragraphs = int(input("Number of Paragraphs?: ")) 18 | 19 | docu_list = [] 20 | 21 | for i in range(number_paragraphs): 22 | prompt = input(f"Question {i+1}/{number_paragraphs}: ") 23 | docu_list.append(prompt) 24 | 25 | def gpt_docu(prompt): 26 | try: 27 | print(f"----- {prompt} -----") 28 | completion = openai.ChatCompletion.create( 29 | model="gpt-4", 30 | messages=[ 31 | {"role": "user", "content": prompt} 32 | ] 33 | ) 34 | 35 | answer = completion.choices[0].message.content 36 | for i in range(2): 37 | try: 38 | if 
answer[0] == "\n": 39 | answer = answer[1:] 40 | except: 41 | pass 42 | 43 | 44 | return answer 45 | except: 46 | print("Something went wrong. Please try again.") 47 | 48 | 49 | # append all answers to a list 50 | answer_list = [] 51 | try: 52 | for i in range(len(docu_list)): 53 | print(f"\n----- Generating answer for: Question {i+1}/{len(docu_list)} -----") 54 | answer = gpt_docu(docu_list[i]) 55 | answer_list.append(answer) 56 | print(f"----- Answer : {answer} -----\n") 57 | except: 58 | print("Something went wrong. Please try again.") 59 | 60 | #%% 61 | # --- CREATE DOCUMENT --- 62 | # Open template document `hello_world.docx` 63 | document = Document(startfile) 64 | # Clear document 65 | document._body.clear_content() 66 | 67 | # Add title 68 | document.add_heading(header, 1) 69 | 70 | for i in range(len(answer_list)): 71 | # Add header first 72 | document.add_heading(docu_list[i], 2) 73 | # Add paragraph 74 | p = document.add_paragraph(answer_list[i]) 75 | 76 | # Save document with the first 4 words from the header if the exist. 77 | 78 | try: 79 | document.save('/Users/erictak/Desktop/' + header.split()[0] +' '+ header.split()[1] +' '+ header.split()[2] + header.split()[3] + '.docx') 80 | except: 81 | try: 82 | document.save('/Users/erictak/Desktop/' + header.split()[0] +' '+ header.split()[1] +' '+ header.split()[2] + '.docx') 83 | except: 84 | try: 85 | document.save('/Users/erictak/Desktop/' + header.split()[0] +' '+ header.split()[1] + '.docx') 86 | except: 87 | try: 88 | document.save('/Users/erictak/Desktop/' + header.split()[0] + '.docx') 89 | except: 90 | document.save('/Users/erictak/Desktop/Untitled.docx') 91 | 92 | -------------------------------------------------------------------------------- /pilot.py: -------------------------------------------------------------------------------- 1 | # GEOSPATIALLY SUPPORTED STORYTELLER 2 | # possible names: fun_fact, storymapper, ... 3 | 4 | # Based on the current location, the storyteller will tell a fun fact or anecdote about nearby places 5 | # Possible sources: Wikipedia, Google, ... 
6 | # Possine output: country, city, street information; historical events; future events and local ; stores and restaurants; 7 | # famous people; myths and legends; weather incidents; 8 | 9 | 10 | 11 | #%% IMPORTING LIBRARIES 12 | import pandas as pd 13 | import numpy as np 14 | import matplotlib.pyplot as plt 15 | import folium as fl 16 | import wikipedia 17 | import gtts 18 | from numpy import random 19 | from playsound import playsound 20 | import pyttsx3 21 | import numpy.random 22 | 23 | 24 | # SETTINGS 25 | 26 | #%% CHECK GPS 27 | 28 | # Define GPS Coordinates of Barcelona 29 | lat_input = 11.390 30 | log_input = 11.154 31 | 32 | # Define the radius of the circle 33 | radius = 0.5 34 | 35 | # Load worldcities dataset zip 'https://simplemaps.com/data/world-cities' 36 | worldcities = pd.read_csv('simplemaps_worldcities_basicv1.75/worldcities.csv') 37 | 38 | print(worldcities.tail(10)) 39 | 40 | 41 | #%% MATCH LOCATION 42 | 43 | # Find closest city to the input coordinates 44 | # Calculate the distance between the input coordinates and the coordinates of the cities 45 | worldcities['distance'] = np.sqrt((worldcities['lat'] - lat_input)**2 + (worldcities['lng'] - log_input)**2) 46 | 47 | # Find the closest cit 48 | closest_cities = worldcities.sort_values(by='distance').head(10) 49 | closest_city = worldcities.loc[worldcities['distance'].idxmin()] 50 | 51 | # Print the closest city 52 | print(closest_city) 53 | 54 | #%% INFORMATION SEARCH 55 | 56 | # Search for the closest city on wikipedia 57 | try: 58 | 59 | page = wikipedia.page(closest_city['city']) 60 | wikipedia.set_lang("de") 61 | print(page.summary) 62 | except: 63 | print('No information found') 64 | 65 | # Search for the closest city with country on wikipedia 66 | try: 67 | page = wikipedia.page(closest_city['city'], ',' , closest_city['country']) 68 | print(page.summary) 69 | except: 70 | print('No information found') 71 | 72 | # Possible source: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/6ZFC0V 73 | 74 | #%% PLOT CITY MAP (FOLIUM as html in directory 75 | 76 | # City Map with folium 77 | # https://python-visualization.github.io/folium/ 78 | 79 | # Create a map of the city 80 | city_map = fl.Map(location=[closest_city['lat'], closest_city['lng']], zoom_start=10) 81 | 82 | # Add a marker to the map 83 | fl.Marker([closest_city['lat'], closest_city['lng']], popup=closest_city['city']).add_to(city_map) 84 | 85 | # Display the map 86 | 87 | city_map.save(f"data/{closest_city['city']}_map.html") # <- HTML file in directory 88 | 89 | #%% Synthesize the story verbally 90 | 91 | from gtts import gTTS 92 | 93 | lang = gtts.lang.tts_langs() 94 | rand_lang = random.choice(list(lang.keys())) # random.randint(0, 9) 95 | print(type(lang)) 96 | story = f"The closest city is {closest_city}{page.summary}" 97 | my_tts = story 98 | tts = gTTS(text=my_tts, lang=rand_lang, slow=False) 99 | tts.save(f"data/{closest_city['city']}_story.mp3") 100 | print(lang) 101 | print(rand_lang) 102 | -------------------------------------------------------------------------------- /pilot_functions.py: -------------------------------------------------------------------------------- 1 | # Location Based Storyteller 2 | 3 | # %% IMPORTING LIBRARIES 4 | import pandas as pd 5 | import numpy as np 6 | import wikipedia 7 | import pyttsx3 8 | import random 9 | 10 | # %% CHECK GPS 11 | 12 | def get_story(): 13 | ''' Parameters: lat_input, log_input 14 | Returns: closest_city, page.summary ''' 15 | lat_input = random.uniform(40, 45) 16 | 
log_input = random.uniform(0, 5) 17 | radius = 0.5 18 | 19 | # Load worldcities dataset and calculate closest city 20 | worldcities = pd.read_csv('data/worldcities.csv') 21 | worldcities['distance'] = np.sqrt((worldcities['lat'] - lat_input) ** 2 + (worldcities['lng'] - log_input) ** 2) 22 | 23 | # Selection of one or more cities 24 | closest_cities = worldcities.sort_values(by='distance').head(10) 25 | closest_city = worldcities.loc[worldcities['distance'].idxmin()] 26 | 27 | # Wikipedia search 28 | try: 29 | page = wikipedia.page(closest_city['city'], ',', closest_city['country']) 30 | print(page.summary) 31 | except: 32 | try: 33 | page = wikipedia.page(closest_city['city']) 34 | print(page.summary) 35 | except: 36 | print('No information found') 37 | 38 | # Create the story 39 | story = 'The closest city to your location is ' + closest_city['city'] + ' in ' + closest_city['country'] + '. ' + page.summary 40 | shortstory = str(story.split('.', 4)[0:4]) # Cut story after 4th sentence if possible and convert to string 41 | shortstory = shortstory.replace('[', '').replace(']', '').replace("'", '') 42 | 43 | # Synthesize story with pyttsx3 44 | tts = pyttsx3.init() 45 | tts.setProperty('rate', 80) 46 | tts.say(shortstory) 47 | tts.runAndWait() 48 | 49 | get_story() 50 | #%% 51 | import pandas as pd 52 | import wikipedia 53 | import pyttsx3 54 | 55 | def get_location_story(city_name): 56 | ''' Parameters: city_name 57 | Returns: closest_city, page.summary ''' 58 | radius = 0.5 59 | 60 | # Load worldcities dataset and retrieve latitude and longitude values of the city 61 | worldcities = pd.read_csv('data/worldcities.csv') 62 | city_data = worldcities[worldcities['city_ascii'] == city_name] 63 | if city_data.empty: 64 | print(f"No data found for city: {city_name}") 65 | return 66 | 67 | lat_input = city_data['lat'].values[0] 68 | log_input = city_data['lng'].values[0] 69 | 70 | # Calculate closest city 71 | worldcities['distance'] = ((worldcities['lat'] - lat_input) ** 2 + (worldcities['lng'] - log_input) ** 2).apply(np.sqrt) 72 | closest_city = worldcities.loc[worldcities['distance'].idxmin()] 73 | 74 | # Wikipedia search 75 | try: 76 | page = wikipedia.page(closest_city['city'], ',', closest_city['country']) 77 | print(page.summary) 78 | except: 79 | try: 80 | page = wikipedia.page(closest_city['city']) 81 | print(page.summary) 82 | except: 83 | print('No information found') 84 | 85 | # Create the story 86 | story = 'The closest city to your location is ' + closest_city['city'] + ' in ' + closest_city['country'] + '. ' + page.summary 87 | shortstory = str(story.split('.', 4)[0:4]) # Cut story after 4th sentence if possible and convert to string 88 | shortstory = shortstory.replace('[', '').replace(']', '').replace("'", '') 89 | 90 | # Synthesize story with pyttsx3 91 | tts = pyttsx3.init() 92 | tts.setProperty('rate', 80) 93 | tts.say(shortstory) 94 | tts.runAndWait() 95 | #%% 96 | -------------------------------------------------------------------------------- /templates/ai_assistant_button.html: -------------------------------------------------------------------------------- 1 | // this is the html file that will be rendered and give a button to make a conversation with gpt-4 model. 2 | 3 | 4 | 5 | 6 | 7 | Conversational AI 8 | 66 | 67 | 68 |

Conversational AI
69 | How may I assist you today?
70 | 
71 | 
72 | 
73 | 
74 | {% if request.method == 'POST' %}
75 | 
76 | Transcript:
77 | {{ transcript }}
78 | Answer:
79 | {{ answer }}
80 | 
81 | {% endif %} 82 | 83 | 103 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /conversation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [ 10 | { 11 | "ename": "IndentationError", 12 | "evalue": "expected an indented block after 'if' statement on line 78 (606879098.py, line 80)", 13 | "output_type": "error", 14 | "traceback": [ 15 | "\u001B[0;36m Cell \u001B[0;32mIn[1], line 80\u001B[0;36m\u001B[0m\n\u001B[0;31m print(f\"\\n------------------------- Storyteller assistant stopped -----------------------------------\")\u001B[0m\n\u001B[0m ^\u001B[0m\n\u001B[0;31mIndentationError\u001B[0m\u001B[0;31m:\u001B[0m expected an indented block after 'if' statement on line 78\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "# Import libraries\n", 21 | "import os\n", 22 | "import openai\n", 23 | "import credentials\n", 24 | "import sys\n", 25 | "import pyttsx3\n", 26 | "from audio_record import audio_rec\n", 27 | "from speech_text_whisper import get_transcript_whisper\n", 28 | "\n", 29 | "openai.api_key = credentials.api_key\n", 30 | "import time\n", 31 | "\n", 32 | "\n", 33 | "def get_transcript_whisper():\n", 34 | " '''Get transcript of audio file with whisper api'''\n", 35 | " openai.api_key = credentials.api_key\n", 36 | " file = open(\"audio_output.wav\", \"rb\")\n", 37 | " transcription = openai.Audio.transcribe(\"whisper-1\", file, response_format=\"json\")\n", 38 | " text = transcription[\"text\"]\n", 39 | " return text\n", 40 | "\n", 41 | "\n", 42 | "def run_GPT3(prompt):\n", 43 | " '''Run GPT-3 with the prompt and return the response'''\n", 44 | " response = openai.Completion.create(\n", 45 | " engine=\"davinci\",\n", 46 | " prompt=prompt,\n", 47 | " temperature=0.7,\n", 48 | " max_tokens=256,\n", 49 | " top_p=1,\n", 50 | " frequency_penalty=0,\n", 51 | " presence_penalty=0\n", 52 | " )\n", 53 | " print(response)\n", 54 | " return response\n", 55 | "\n", 56 | "\n", 57 | "def run_chatGPT():\n", 58 | " '''Run chatGPT with the prompt and return the response'''\n", 59 | " completion = openai.ChatCompletion.create(\n", 60 | " model=\"gpt-3.5-turbo\",\n", 61 | " messages=[\n", 62 | " {\"role\": \"user\", \"content\": text_output}\n", 63 | " ]\n", 64 | " )\n", 65 | "\n", 66 | " return completion.choices[0].message.content\n", 67 | "\n", 68 | "\n", 69 | "print(f\"\\n------------------------- Storyteller assistant started -----------------------------------\")\n", 70 | "# Record audio file with function improted from audio_record.py\n", 71 | "audio_rec(10)\n", 72 | "\n", 73 | "# Get transcript of audio file with whisper api\n", 74 | "print(f\"\\n... Transcribing audio file ...\")\n", 75 | "\n", 76 | "text_output = get_transcript_whisper()\n", 77 | "\n", 78 | "print(f\"\\n... Text recognized: {text_output}\")\n", 79 | "\n", 80 | "print(f\"\\n... Asking ChatGPT ...\")\n", 81 | "answer = run_chatGPT()\n", 82 | "\n", 83 | "print(f\"\\n... Answer from ChatGPT: {answer}\")\n", 84 | "\n", 85 | "print(f\"\\n... Synthesizing answer ...\")\n", 86 | "tts = pyttsx3.init()\n", 87 | "tts.setProperty('rate', 120)\n", 88 | "tts.say(answer)\n", 89 | "tts.runAndWait()\n", 90 | "\n", 91 | "print(f\"\\n... Listening to answer for 3 seconds ...\")\n", 92 | "audio_rec(3)\n", 93 | "\n", 94 | "text_output = get_transcript_whisper()\n", 95 | "print(f\"\\n... 
Text recognized: {text_output}\")\n", 96 | "\n", 97 | "if \"yes\" in text_output:\n", 98 | "\n", 99 | "print(f\"\\n------------------------- Storyteller assistant stopped -----------------------------------\")" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "outputs": [], 106 | "source": [], 107 | "metadata": { 108 | "collapsed": false 109 | } 110 | } 111 | ], 112 | "metadata": { 113 | "kernelspec": { 114 | "display_name": "Python 3", 115 | "language": "python", 116 | "name": "python3" 117 | }, 118 | "language_info": { 119 | "codemirror_mode": { 120 | "name": "ipython", 121 | "version": 2 122 | }, 123 | "file_extension": ".py", 124 | "mimetype": "text/x-python", 125 | "name": "python", 126 | "nbconvert_exporter": "python", 127 | "pygments_lexer": "ipython2", 128 | "version": "2.7.6" 129 | } 130 | }, 131 | "nbformat": 4, 132 | "nbformat_minor": 0 133 | } 134 | -------------------------------------------------------------------------------- /terminal_chat_func.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import plotext 3 | import numpy as np 4 | import pyaudio 5 | import struct 6 | import wave 7 | import time 8 | from audio_get_channels import get_cur_mic 9 | from scipy.fftpack import fft 10 | import openai 11 | import credentials 12 | import os 13 | import pyttsx3 14 | import threading 15 | import sys 16 | from audio_get_channels import get_speaker 17 | from geopy.geocoders import Nominatim 18 | import json 19 | from urllib.request import urlopen 20 | 21 | script_dir = os.path.dirname(os.path.abspath(__file__)) 22 | filename = os.path.join(script_dir, "audio_output.wav") 23 | preprompt = "You are a Ai audio guide. in the following prompt, look for the name of a city or a location, and give a one line discription of this place. Start with the location as a header." 
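# Note on the prompt pattern above: `preprompt` is simply concatenated with the
# user's transcript inside run_chatGPT() below. The legacy openai ChatCompletion
# API used here also accepts a dedicated "system" message, which keeps the
# instruction separate from the user input; a minimal sketch of that alternative
# (an assumption, not code from this project):
#
#   completion = openai.ChatCompletion.create(
#       model="gpt-4",
#       messages=[
#           {"role": "system", "content": preprompt},
#           {"role": "user", "content": prompt},
#       ],
#   )
#   answer = completion.choices[0].message.content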
24 | 25 | def get_geo_location(city_name): 26 | # calling the Nominatim tool 27 | loc = Nominatim(user_agent="GetLoc") 28 | 29 | # entering the location name 30 | getLoc = loc.geocode(city_name) 31 | 32 | # printing address 33 | print(getLoc.address) 34 | 35 | # printing latitude and longitude 36 | print("Latitude = ", getLoc.latitude, "\n") 37 | print("Longitude = ", getLoc.longitude) 38 | 39 | return getLoc 40 | 41 | 42 | def get_location(): 43 | url = "http://ipinfo.io/json" 44 | response = urlopen(url) 45 | data = json.load(response) 46 | # Get location 47 | lng = data['loc'].split(',')[1] 48 | lat = data['loc'].split(',')[0] 49 | location = {'lat': lat, 'lng': lng} 50 | return location 51 | 52 | 53 | 54 | def run_chatGPT(prompt): 55 | '''Run chatGPT with the prompt and return the response''' 56 | completion = openai.ChatCompletion.create( 57 | model="gpt-4", 58 | messages=[ 59 | {"role": "user", "content": preprompt + prompt}, 60 | ] 61 | ) 62 | answer = completion.choices[0].message.content 63 | 64 | return answer 65 | 66 | 67 | def speak_answer(answer): 68 | engine = pyttsx3.init() 69 | engine.setProperty('rate', 110) 70 | engine.say(answer) 71 | engine.runAndWait() 72 | 73 | 74 | def print_answer(answer): 75 | for word in answer.split(): 76 | print(word, end=' ', flush=True) 77 | 78 | 79 | def print_transcript(transcript): 80 | for word in transcript.split(): 81 | time.sleep(0.27) 82 | print(word, end=' ', flush=True) 83 | 84 | 85 | def get_transcript_whisper(): 86 | openai.api_key = credentials.api_key 87 | file = open(filename, "rb") 88 | transcription = openai.Audio.transcribe("whisper-1", file, response_format="json") 89 | transcribed_text = transcription["text"] 90 | return transcribed_text 91 | 92 | 93 | def audio_spectrum(num_seconds): 94 | script_dir = os.path.dirname(os.path.abspath(__file__)) 95 | 96 | chunk = 2205 97 | channels = 1 98 | fs = 44100 99 | seconds = max(num_seconds, 0.1) 100 | sample_format = pyaudio.paInt16 101 | filename = os.path.join(script_dir, "audio_output.wav") 102 | 103 | print(f'\n... Recording {seconds} seconds of audio initialized ...\n') 104 | 105 | p = pyaudio.PyAudio() 106 | stream = p.open(format=sample_format, 107 | channels=channels, 108 | rate=fs, 109 | input_device_index=get_cur_mic(), 110 | frames_per_buffer=chunk, 111 | input=True) 112 | 113 | 114 | frames = [] 115 | for i in range(0, int(fs / chunk * seconds)): 116 | data = stream.read(chunk, False) 117 | frames.append(data) 118 | 119 | 120 | # Stop and close the stream 121 | stream.stop_stream() 122 | stream.close() 123 | p.terminate() 124 | 125 | # Save the recorded data as a WAV file 126 | wf = wave.open(filename, 'wb') 127 | wf.setnchannels(channels) 128 | wf.setsampwidth(p.get_sample_size(sample_format)) 129 | wf.setframerate(fs) 130 | wf.writeframes(b''.join(frames)) 131 | wf.close() 132 | 133 | 134 | def run_all_functions(): 135 | try: 136 | audio_spectrum(6) 137 | except KeyboardInterrupt: 138 | pass 139 | 140 | transcript = get_transcript_whisper() 141 | 142 | # If text contains one word of a stopwordlist then the script will stop 143 | if any(word in transcript for word in ['stop', 'Stop', 'exit', 'quit', 'end']): 144 | print('... 
Script stopped by user') 145 | exit() 146 | 147 | transcript = f' {transcript}' 148 | 149 | #print_transcript(transcript) 150 | 151 | answer = run_chatGPT(transcript) 152 | # Split answer into answer and location 153 | 154 | #print_answer(answer) 155 | 156 | #speak_answer(answer) 157 | 158 | return transcript, answer 159 | 160 | 161 | 162 | #run_all_functions() 163 | # ---------------------------------------------------------------- 164 | if __name__ == "__main__": 165 | script_start = time.time() 166 | script_dir = os.path.dirname(os.path.abspath(__file__)) 167 | filename = os.path.join(script_dir, "audio_output.wav") 168 | run_all_functions() 169 | 170 | 171 | 172 | # Create threads for speaking and printing the transcript 173 | #speak_thread = threading.Thread(target=speak_answer) 174 | #print_thread = threading.Thread(target=print_transcript) 175 | 176 | # Start both threads 177 | #print_thread.start() 178 | #speak_thread.start() 179 | 180 | # Wait for both threads to finish 181 | #threading.wait_for(lambda: not speak_thread.is_alive()and not print_thread.is_alive()) 182 | 183 | # Wait for both threads to finish 184 | #speak_thread.join() 185 | #print_thread.join() 186 | #print_answer() 187 | 188 | # ---------------------------------------------------------------- 189 | # Restart the script 190 | 191 | 192 | -------------------------------------------------------------------------------- /terminal_audio_spectrum.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# Version 1\n", 12 | "import plotext\n", 13 | "import numpy as np\n", 14 | "import pyaudio\n", 15 | "import struct\n", 16 | "import wave\n", 17 | "import time\n", 18 | "\n", 19 | "chunk = 4410\n", 20 | "sample_format = pyaudio.paInt16\n", 21 | "channels = 1\n", 22 | "fs = 44100\n", 23 | "seconds = 10\n", 24 | "device_index = 2\n", 25 | "filename = \"audio_spectrum.wav\"\n", 26 | "\n", 27 | "print(f'\\n... 
Recording {seconds} seconds of audio initialized ...\\n')\n", 28 | "print(\n", 29 | " f'\\n\\n>>> RECORDING PLEASE SPEAK CLEARLY <<< >>> RECORDING PLEASE SPEAK CLEARLY <<< >>> RECORDING PLEASE SPEAK CLEARLY <<<\\n\\n')\n", 30 | "\n", 31 | "p = pyaudio.PyAudio()\n", 32 | "stream = p.open(format=sample_format,\n", 33 | " channels=channels,\n", 34 | " rate=fs,\n", 35 | " input_device_index=device_index,\n", 36 | " frames_per_buffer=chunk,\n", 37 | " input=True,\n", 38 | " output=True,)\n", 39 | "\n", 40 | "plotext.title(\"Spectrum\")\n", 41 | "plotext.clc()\n", 42 | "\n", 43 | "x = np.arange(0, chunk, 1)\n", 44 | "\n", 45 | "frames = []\n", 46 | "start_time = time.time()\n", 47 | "\n", 48 | "while time.time() - start_time < seconds:\n", 49 | " plotext.clt()\n", 50 | " plotext.cld()\n", 51 | "\n", 52 | " y = np.array(struct.unpack(str(2 * chunk) + 'B', stream.read(chunk)), dtype='b')[::2] + 128\n", 53 | " plotext.plot(x, y, color=\"green\", marker=\"_\", label=\"Spectrum\")\n", 54 | " plotext.sleep(0.01)\n", 55 | " plotext.xlabel(f'Frequency (Hz) - {round(time.time() - start_time, 2)} seconds')\n", 56 | " plotext.ylim(0, 260)\n", 57 | " plotext.xlim(0, 4500)\n", 58 | " plotext.show()\n", 59 | "\n", 60 | " # Append audio frames to the list\n", 61 | " frames.append(y.tobytes())\n", 62 | "\n", 63 | "# Stop and close the stream\n", 64 | "stream.stop_stream()\n", 65 | "stream.close()\n", 66 | "p.terminate()\n", 67 | "\n", 68 | "# Save the recorded data as a WAV file\n", 69 | "wf = wave.open(filename, 'wb')\n", 70 | "wf.setnchannels(channels)\n", 71 | "wf.setsampwidth(p.get_sample_size(sample_format))\n", 72 | "wf.setframerate(fs)\n", 73 | "wf.writeframes(b''.join(frames))\n", 74 | "wf.close()\n", 75 | "\n", 76 | "plotext.clear_data()\n" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "outputs": [], 83 | "source": [ 84 | "# Version 2\n", 85 | "import plotext\n", 86 | "import numpy as np\n", 87 | "import pyaudio\n", 88 | "import struct\n", 89 | "import wave\n", 90 | "\n", 91 | "chunk = 4410\n", 92 | "sample_format = pyaudio.paInt16\n", 93 | "channels = 1\n", 94 | "fs = 44100\n", 95 | "seconds = 10\n", 96 | "device_index = 2\n", 97 | "filename = \"audio_spectrum.wav\"\n", 98 | "\n", 99 | "print(f'\\n... 
Recording {seconds} seconds of audio initialized ...\\n')\n", 100 | "print(f'\\n\\n>>> RECORDING PLEASE SPEAK CLEARLY <<< >>> RECORDING PLEASE SPEAK CLEARLY <<< >>> RECORDING PLEASE SPEAK CLEARLY <<<\\n\\n')\n", 101 | "\n", 102 | "p = pyaudio.PyAudio()\n", 103 | "stream = p.open(format=sample_format,\n", 104 | " channels=channels,\n", 105 | " rate=fs,\n", 106 | " input_device_index=device_index,\n", 107 | " frames_per_buffer=chunk,\n", 108 | " input=True,\n", 109 | " output=True,)\n", 110 | "\n", 111 | "plotext.title(\"Spectrum\")\n", 112 | "plotext.clc()\n", 113 | "\n", 114 | "x = np.arange(0, chunk, 1)\n", 115 | "\n", 116 | "frames = []\n", 117 | "for i in range(fs):\n", 118 | " plotext.clt()\n", 119 | " plotext.cld()\n", 120 | "\n", 121 | " y = np.array(struct.unpack(str(2 * chunk) + 'B', stream.read(chunk)), dtype='b')[::2] + 128\n", 122 | " plotext.plot(x, y, color=\"green\", marker=\"_\", label=\"Spectrum\")\n", 123 | " plotext.sleep(0.01)\n", 124 | " plotext.xlabel(\"Frequency\")\n", 125 | " plotext.ylabel(\"Amplitude\")\n", 126 | " plotext.ylim(0, 260)\n", 127 | " plotext.xlim(0, 4500)\n", 128 | " plotext.show()\n", 129 | "\n", 130 | " # Append audio frames to the list\n", 131 | " frames.append(y.tobytes())\n", 132 | "\n", 133 | "# Stop and close the stream\n", 134 | "stream.stop_stream()\n", 135 | "stream.close()\n", 136 | "p.terminate()\n", 137 | "\n", 138 | "# Save the recorded data as a WAV file\n", 139 | "wf = wave.open(filename, 'wb')\n", 140 | "wf.setnchannels(channels)\n", 141 | "wf.setsampwidth(p.get_sample_size(sample_format))\n", 142 | "wf.setframerate(fs)\n", 143 | "wf.writeframes(b''.join(frames))\n", 144 | "wf.close()\n", 145 | "\n", 146 | "plotext.clear_data()" 147 | ], 148 | "metadata": { 149 | "collapsed": false 150 | } 151 | } 152 | ], 153 | "metadata": { 154 | "kernelspec": { 155 | "display_name": "Python 3", 156 | "language": "python", 157 | "name": "python3" 158 | }, 159 | "language_info": { 160 | "codemirror_mode": { 161 | "name": "ipython", 162 | "version": 2 163 | }, 164 | "file_extension": ".py", 165 | "mimetype": "text/x-python", 166 | "name": "python", 167 | "nbconvert_exporter": "python", 168 | "pygments_lexer": "ipython2", 169 | "version": "2.7.6" 170 | } 171 | }, 172 | "nbformat": 4, 173 | "nbformat_minor": 0 174 | } 175 | -------------------------------------------------------------------------------- /app_functions_map.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import folium 3 | from gtts import gTTS 4 | import pygame 5 | import tempfile 6 | import pyaudio 7 | import wave 8 | import time 9 | from audio_get_channels import get_cur_mic 10 | import openai 11 | import credentials 12 | import os 13 | from geopy.geocoders import Nominatim 14 | import json 15 | from urllib.request import urlopen 16 | import pandas as pd 17 | from geo_google import get_surroundings 18 | 19 | 20 | '''these functions are used to run the chatbot and audio functions in the flask app''' 21 | 22 | script_dir = os.path.dirname(os.path.abspath(__file__)) 23 | filename = os.path.join(script_dir, "audio_output.wav") 24 | preprompt = "You are a Ai audio guide. in the following prompt, look for the name of a city or a location, and give a one line discription of this place. Start with the location as a header." 
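# Note on speak_answer() further down: gTTS saves the mp3 into a
# NamedTemporaryFile(delete=True), which is removed as soon as the `with` block
# exits and, on Windows, cannot be reopened by gTTS or pygame while the original
# handle is still open. A sketch of a more defensive variant (an assumption, not
# this project's tested code; pygame.mixer.music.unload() requires pygame >= 2.0):
#
#   import os, tempfile
#
#   def speak_answer_safe(answer):
#       if not pygame.mixer.get_init():
#           pygame.mixer.init()
#       fd, path = tempfile.mkstemp(suffix=".mp3")
#       os.close(fd)                      # release the OS handle before gTTS writes
#       gTTS(text=answer, lang="en").save(path)
#       pygame.mixer.music.load(path)
#       pygame.mixer.music.play()
#       while pygame.mixer.music.get_busy():
#           pygame.time.Clock().tick(10)  # poll at ~10 Hz until playback ends
#       pygame.mixer.music.unload()
#       os.remove(path)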
25 | 26 | 27 | #%% AUDIO FUNCTIONS 28 | def audio_spectrum(num_seconds): 29 | ''' Record audio for num_seconds and save it to a wav file''' 30 | 31 | script_dir = os.path.dirname(os.path.abspath(__file__)) 32 | 33 | chunk = 2205 34 | channels = 1 35 | fs = 44100 36 | seconds = max(num_seconds, 0.1) 37 | sample_format = pyaudio.paInt16 38 | filename = os.path.join(script_dir, "audio_output.wav") 39 | 40 | print(f'\n... Recording {seconds} seconds of audio initialized ...\n') 41 | 42 | p = pyaudio.PyAudio() 43 | stream = p.open(format=sample_format, 44 | channels=channels, 45 | rate=fs, 46 | input_device_index=get_cur_mic(), 47 | frames_per_buffer=chunk, 48 | input=True) 49 | 50 | 51 | frames = [] 52 | for i in range(0, int(fs / chunk * seconds)): 53 | data = stream.read(chunk, False) 54 | frames.append(data) 55 | 56 | 57 | # Stop and close the stream 58 | stream.stop_stream() 59 | stream.close() 60 | p.terminate() 61 | 62 | # Save the recorded data as a WAV file 63 | wf = wave.open(filename, 'wb') 64 | wf.setnchannels(channels) 65 | wf.setsampwidth(p.get_sample_size(sample_format)) 66 | wf.setframerate(fs) 67 | wf.writeframes(b''.join(frames)) 68 | wf.close() 69 | 70 | #%% TRANSCRIPT FUNCTIONS 71 | def get_transcript_whisper(): 72 | '''Get the transcript of the audio file''' 73 | openai.api_key = credentials.api_key 74 | file = open(filename, "rb") 75 | transcription = openai.Audio.transcribe("whisper-1", file, response_format="json") 76 | transcribed_text = transcription["text"] 77 | 78 | return transcribed_text 79 | 80 | #%% TEXT RECOGNITION FUNCTIONS 81 | def recognize_infos(transcribed_text): 82 | ''' If transcribed text contains a city name from worldcities.csv, return the city name''' 83 | df = pd.read_csv('worldcities.csv') 84 | df = df.dropna() 85 | df = df.drop_duplicates(subset=['city_ascii']) 86 | df = df.reset_index(drop=True) 87 | df['city_ascii'] = df['city_ascii'].str.lower() 88 | 89 | # Convert transcribed_text to lowercase for proper comparison 90 | transcribed_text = transcribed_text.lower() 91 | 92 | for city in df['city_ascii']: 93 | if city in transcribed_text: 94 | city = df[df['city'] == city.lower()]['city'].values[0] 95 | lng = df[df['city_ascii'] == city.lower()]['lng'].values[0] 96 | lat = df[df['city_ascii'] == city.lower()]['lat'].values[0] 97 | location = {'lat': lat, 'lng': lng} 98 | return city, location 99 | 100 | 101 | #%% CHATGPT FUNCTIONS 102 | def run_chatGPT(prompt): 103 | '''Run chatGPT with the prompt and return the response''' 104 | completion = openai.ChatCompletion.create( 105 | model="gpt-4", 106 | messages=[ 107 | {"role": "user", "content": preprompt + prompt}, 108 | ] 109 | ) 110 | answer = completion.choices[0].message.content 111 | 112 | return answer 113 | 114 | #%% SPEAK FUNCTIONS 115 | def speak_answer(answer): 116 | '''Speak the answer''' 117 | tts = gTTS(text=answer, lang='en') 118 | with tempfile.NamedTemporaryFile(delete=True) as f: 119 | tts.save(f.name) 120 | pygame.mixer.music.load(f.name) 121 | pygame.mixer.music.play() 122 | while pygame.mixer.music.get_busy(): 123 | pygame.time.Clock().tick(10) 124 | 125 | 126 | #%% CREATE MAP 127 | #load map in browser 128 | def create_map(location, df): 129 | map = folium.Map(location=[location['lat'], location['lng']], zoom_start=15, tiles='Stamen Toner') 130 | for i in range(0, len(df)): 131 | button_html = f'' 132 | marker_popup = folium.Popup(f''' 133 | Name: {df.iloc[i]['name']} 134 | Vicinity: {df.iloc[i]['vicinity']} 135 | Business status: {df.iloc[i]['business_status']} 136 | Rating: 
{df.iloc[i]['rating']} 137 | User ratings total: {df.iloc[i]['user_ratings_total']} 138 | Opening hours: {df.iloc[i]['opening_hours']} 139 | {button_html} 140 | ''') 141 | folium.Marker([df.iloc[i]['lat'], df.iloc[i]['lng']], popup=marker_popup, 142 | icon=folium.Icon(color='red', icon='')).add_to(map) 143 | return map._repr_html_() 144 | 145 | 146 | def run_all_functions(): 147 | try: 148 | audio_spectrum(6) 149 | except KeyboardInterrupt: 150 | pass 151 | 152 | transcript = get_transcript_whisper() 153 | recognized_city = recognize_infos(transcript) 154 | city = recognized_city[0] 155 | location = recognized_city[1] 156 | df = get_surroundings(location['lat'], location['lng'], 500, 'tourist_attraction', '') 157 | map = create_map(location, df) 158 | 159 | 160 | # If text contains one word of a stopwordlist then the script will stop 161 | if any(word in transcript for word in ['stop', 'Stop', 'exit', 'quit', 'end']): 162 | print('... Script stopped by user') 163 | exit() 164 | 165 | transcript = f' {transcript}' 166 | 167 | answer = run_chatGPT(transcript) 168 | speak_answer(answer) 169 | 170 | return transcript, answer, city, map 171 | 172 | 173 | #run_all_functions() 174 | # ---------------------------------------------------------------- 175 | if __name__ == "__main__": 176 | script_start = time.time() 177 | script_dir = os.path.dirname(os.path.abspath(__file__)) 178 | filename = os.path.join(script_dir, "audio_output.wav") 179 | run_all_functions() 180 | 181 | -------------------------------------------------------------------------------- /conversation.py: -------------------------------------------------------------------------------- 1 | # Import libraries 2 | import os 3 | import openai 4 | import credentials 5 | import sys 6 | import pyttsx3 7 | from speech_text_whisper import get_transcript_whisper 8 | openai.api_key = credentials.api_key 9 | import time 10 | import plotext 11 | import numpy as np 12 | import pyaudio 13 | import struct 14 | import wave 15 | from audio_get_channels import get_cur_mic 16 | from scipy.fftpack import fft 17 | 18 | 19 | #%% FUNCTIONS TO BE STORED AWAY 20 | 21 | def audio_spectrum(num_seconds): 22 | script_dir = os.path.dirname(os.path.abspath(__file__)) 23 | 24 | chunk = 4410 25 | channels = 1 26 | fs = 44100 27 | seconds = num_seconds 28 | sample_format = pyaudio.paInt16 29 | filename = os.path.join(script_dir, "audio_output.wav") 30 | 31 | print(f'\n... 
Recording {seconds} seconds of audio initialized ...\n') 32 | 33 | p = pyaudio.PyAudio() 34 | stream = p.open(format=sample_format, 35 | channels=channels, 36 | rate=fs, 37 | input_device_index=get_cur_mic(), 38 | frames_per_buffer=chunk, 39 | input=True) 40 | 41 | 42 | x = np.arange(0, chunk) 43 | x_fft = np.linspace(0, fs / 2, chunk // 2 + 1) 44 | 45 | frames = [] 46 | start_time = time.time() 47 | 48 | while time.time() - start_time < seconds: 49 | plotext.clt() 50 | plotext.cld() 51 | plotext.clc() 52 | 53 | data = stream.read(chunk, False) 54 | frames.append(data) 55 | data_int = struct.unpack(str(2 * chunk) + 'B', data) 56 | data_np = np.array(data_int, dtype='b')[::2] + 128 57 | 58 | y_freq = data_np 59 | spec = fft(data_int) 60 | y_spec = np.abs(np.fft.rfft(data_int)) / chunk 61 | 62 | # plotext.subplots(2, 1) 63 | # plotext.subplot(1, 1) 64 | plotext.plot(x, y_freq, color="white", marker="braille") 65 | # marker braille, fhd, hd, sd, dot, dollar,euro, bitcoin, at, heart, smile, queen, king, 66 | 67 | plotext.plot_size(200, 15) 68 | plotext.ylim(0, 300) 69 | plotext.xlabel(f' {seconds} seconds recording | Elapsed time: {round(time.time() - start_time, 1)} seconds, Time left: {round(seconds - (time.time() - start_time), 1)} seconds') 70 | plotext.yfrequency(2) 71 | plotext.xfrequency(0) 72 | plotext.xlim(0, 4410) 73 | plotext.horizontal_line(128, color="red", yside="top") 74 | 75 | # plotext.subplot(2, 1) 76 | # plotext.plot_size(200, 15) 77 | # plotext.plot(x_fft, y_spec, color="white", marker="braille") 78 | # plotext.ylim(0, 1) 79 | # plotext.xfrequency(2) 80 | # plotext.yfrequency(2) 81 | # plotext.xaxes("log") 82 | plotext.show() 83 | 84 | 85 | # Stop and close the stream 86 | stream.stop_stream() 87 | stream.close() 88 | p.terminate() 89 | 90 | # Save the recorded data as a WAV file 91 | wf = wave.open(filename, 'wb') 92 | wf.setnchannels(channels) 93 | wf.setsampwidth(p.get_sample_size(sample_format)) 94 | wf.setframerate(fs) 95 | wf.writeframes(b''.join(frames)) 96 | wf.close() 97 | 98 | print('\n... Finished recording ...') 99 | 100 | 101 | def get_transcript_whisper(): 102 | '''Get transcript of audio file with whisper api''' 103 | openai.api_key = credentials.api_key 104 | file = open("audio_output.wav", "rb") 105 | transcription = openai.Audio.transcribe("whisper-1", file, response_format="json") 106 | text = transcription["text"] 107 | return text 108 | 109 | def run_GPT3(prompt): 110 | '''Run GPT-3 with the prompt and return the response''' 111 | response = openai.Completion.create( 112 | engine="davinci", 113 | prompt=prompt, 114 | temperature=0.7, 115 | max_tokens=256, 116 | top_p=1, 117 | frequency_penalty=0, 118 | presence_penalty=0 119 | ) 120 | print(response) 121 | return response 122 | 123 | def run_chatGPT(prompt): 124 | '''Run chatGPT with the prompt and return the response''' 125 | completion = openai.ChatCompletion.create( 126 | model="gpt-3.5-turbo", 127 | messages=[ 128 | {"role": "user", "content": prompt} 129 | ] 130 | ) 131 | answer = completion.choices[0].message.content 132 | return answer 133 | 134 | 135 | 136 | #%% CONVERSATION FUNCTIONS 137 | 138 | def start_conversation(): 139 | # Record audio file with function imported from audio_record.py 140 | audio_spectrum(10) 141 | # Get transcript of audio file with whisper api 142 | print(f"\n... Audio transcription ...") 143 | text_output = get_transcript_whisper() 144 | time.sleep(1) 145 | print(f"\n... Text recognition ... \n Text recongized: {text_output}") 146 | print(f"\n... 
ChatGPT prompt ...") 147 | answer = run_chatGPT(text_output) 148 | print(f"\n... ChatGPT answer ... {answer}") 149 | print(f"\n... Text to speech ...") 150 | tts = pyttsx3.init() 151 | tts.setProperty('rate', 110) 152 | tts.say(answer) 153 | tts.runAndWait() 154 | 155 | def feedback_question(): 156 | print(f'... Feedback question ...') 157 | question = f"Do you want to know more about this topic or another one?" 158 | tts = pyttsx3.init() 159 | tts.setProperty('rate', 110) 160 | tts.say(question) 161 | tts.runAndWait() 162 | 163 | def stop_sequence(): 164 | print('... Stop sequence ...') 165 | stop_phrase = f'Well, if you have more questions, you know where to find me. And next time prepare your questions a little better. Goodbye!' 166 | tts = pyttsx3.init() 167 | tts.setProperty('rate', 110) 168 | tts.say(stop_phrase) 169 | tts.runAndWait() 170 | 171 | 172 | def recognize_answer(): 173 | print(f"\n... Listening to answer for 3 seconds ...") 174 | audio_spectrum(3) 175 | text_output = get_transcript_whisper() 176 | print(f"\n... Text recognition \nText recognized: {text_output}") 177 | # if text contains yes restart conversation else stop assistant 178 | restart_keywords = ["yes", "another", "topic", "more", "Yes", "Another", "Topic", "More"] 179 | if any(x in text_output for x in restart_keywords): 180 | print('... Feedback sequence ...') 181 | feedback_phrase = f'Seems like you are interested today. What do you want to know more about?' 182 | tts = pyttsx3.init() 183 | tts.setProperty('rate', 110) 184 | tts.say(feedback_phrase) 185 | tts.runAndWait() 186 | 187 | start_conversation() 188 | feedback_question() 189 | recognize_answer() 190 | else: 191 | stop_sequence() 192 | 193 | 194 | 195 | #%% Start assistant 196 | 197 | print(f"\n--------------------------------- Storyteller assistant started ----------------------------------------- \n") 198 | start_conversation() 199 | time.sleep(1) 200 | feedback_question() 201 | time.sleep(1) 202 | recognize_answer() 203 | print(f"\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx Storyteller assistant stopped xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n") 204 | 205 | #%% Stop assistant 206 | -------------------------------------------------------------------------------- /pilot_terminal.py: -------------------------------------------------------------------------------- 1 | import plotext 2 | import numpy as np 3 | import pyaudio 4 | import struct 5 | import wave 6 | import time 7 | 8 | from audio_get_channels import get_cur_mic 9 | from audio_get_channels import get_speaker 10 | from scipy.fftpack import fft 11 | import openai 12 | import credentials 13 | import os 14 | import pyttsx3 15 | import threading 16 | import sys 17 | 18 | from rich import print 19 | from rich.progress import track 20 | from rich.console import Console 21 | from rich.columns import Columns 22 | from rich.panel import Panel 23 | 24 | console = Console() 25 | 26 | # ---------------------------------------------------------------- 27 | console.print(f'''\n... RECORDING AUDIO \n\n''', style="bold red") 28 | print(f'''... Testline1\n''') 29 | print(f'''... Testline2\n''') 30 | print(f'''... 
Testline3\n''') 31 | 32 | script_start = time.time() 33 | script_dir = os.path.dirname(os.path.abspath(__file__)) 34 | filename = os.path.join(script_dir, "audio_output.wav") 35 | 36 | # ---------------------------------------------------------------- 37 | def audio_spectrum(num_seconds): 38 |     script_dir = os.path.dirname(os.path.abspath(__file__)) 39 | 40 |     chunk = 2205 41 |     channels = 1 42 |     fs = 44100 43 |     seconds = num_seconds 44 |     sample_format = pyaudio.paInt16 45 |     filename = os.path.join(script_dir, "audio_output.wav") 46 | 47 |     console.print(f'\n... Recording {seconds} seconds of audio initialized ...\n') 48 | 49 |     p = pyaudio.PyAudio() 50 |     stream = p.open(format=sample_format, 51 |                     channels=channels, 52 |                     rate=fs, 53 |                     input_device_index=get_cur_mic(), 54 |                     frames_per_buffer=chunk, 55 |                     input=True) 56 | 57 | 58 |     x = np.arange(0, chunk) 59 |     x_fft = np.linspace(0, fs / 2, chunk // 2 + 1) 60 | 61 |     frames = [] 62 |     start_time = time.time() 63 | 64 | 65 |     while time.time() - start_time < seconds: 66 |         plotext.clear_terminal(lines=10) 67 |         plotext.clear_data() 68 |         plotext.clear_figure() 69 |         plotext.clear_color() 70 | 71 |         # plotext.clt() # to clear the terminal 72 |         # plotext.cld() # to clear the data only 73 |         # plotext.clf() # to clear the figure 74 |         # plotext.clc() # to clear color 75 | 76 |         data = stream.read(chunk, False) 77 |         frames.append(data) 78 |         data_int = struct.unpack(str(2 * chunk) + 'B', data) 79 |         data_np = np.array(data_int, dtype='b')[::2] + 128 80 | 81 |         y_freq = data_np 82 |         spec = fft(data_int) 83 |         y_spec = np.abs(np.fft.rfft(data_int)) / chunk 84 | 85 |         # plotext.subplots(2, 1) 86 |         # plotext.subplot(1, 1) 87 |         plotext.plot(x, y_freq, color="white", marker="braille") 88 |         plotext.title(f'[ {round(seconds - (time.time() - start_time), 1)}s | {seconds}s ]') 89 |         # marker braille, fhd, hd, sd, dot, dollar, euro, bitcoin, at, heart, smile, queen, king, 90 | 91 |         plotext.plot_size(200, 10) 92 |         plotext.ylim(0, 300) 93 | 94 |         plotext.yfrequency(2) 95 |         plotext.xfrequency(0) 96 |         plotext.xlim(0, 2205) 97 |         plotext.horizontal_line(128, color="red", yside="top") 98 | 99 |         # plotext.subplot(2, 1) 100 |         # plotext.plot_size(200, 15) 101 |         # plotext.plot(x_fft, y_spec, color="white", marker="braille") 102 |         # plotext.ylim(0, 1) 103 |         # plotext.xfrequency(2) 104 |         # plotext.yfrequency(2) 105 |         # plotext.xaxes("log") 106 |         plotext.show() 107 | 108 | 109 |     # Stop and close the stream 110 |     stream.stop_stream() 111 |     stream.close() 112 |     p.terminate() 113 | 114 |     # Save the recorded data as a WAV file 115 |     wf = wave.open(filename, 'wb') 116 |     wf.setnchannels(channels) 117 |     wf.setsampwidth(p.get_sample_size(sample_format)) 118 |     wf.setframerate(fs) 119 |     wf.writeframes(b''.join(frames)) 120 |     wf.close() 121 | 122 | 123 | try: 124 |     audio_spectrum(6) 125 | except KeyboardInterrupt: 126 |     pass 127 | 128 | # ----------------------------------------------------------------------------------------- 129 | script_check_1 = time.time() 130 | check_1 = script_check_1 - script_start 131 | console.print(f'\n... Time: {round(check_1, 3)} seconds | Recording finished successfully! 
\n') 132 | # ----------------------------------------------------------------------------------------- 133 | 134 | def get_transcript_whisper(): 135 |     openai.api_key = credentials.api_key 136 |     with open(filename, "rb") as file: 137 |         transcription = openai.Audio.transcribe("whisper-1", file, response_format="json") 138 |     transcribed_text = transcription["text"] 139 |     return transcribed_text 140 | 141 | 142 | text = get_transcript_whisper() 143 | 144 | # Simple substring check: if the text contains a word from the stopword list, the script stops 145 | if any(word in text.lower() for word in ['stop', 'exit', 'quit', 'end', 'no']): 146 |     print(f'''\n... SCRIPT STOPPED BY STOPWORD\n''') 147 |     print(f'''__________________________________________________________________________________________________\n\n''') 148 |     exit() 149 | 150 | 151 | 152 | # ----------------------------------------------------------------------------------------- 153 | script_check_2 = time.time() 154 | check_2 = script_check_2 - script_start 155 | console.print(f'\n... Time: {round(check_2, 3)} seconds | Registered text: \n') 156 | # ---------------------------------------------------------------- 157 | text = f' {text}'  # leading space for a nicer printout 158 | 159 | def print_transcript(): 160 |     for word in text.split(): 161 |         time.sleep(0.27) 162 |         print(word, end=' ', flush=True) 163 | 164 | print_thread = threading.Thread(target=print_transcript)  # pass the function itself, do not call it 165 | print_thread.start() 166 | 167 | def run_chatGPT(prompt): 168 |     '''Run chatGPT with the prompt and return the response''' 169 |     completion = openai.ChatCompletion.create( 170 |         model="gpt-3.5-turbo", 171 |         messages=[ 172 |             {"role": "user", "content": prompt} 173 |         ] 174 |     ) 175 |     answer = completion.choices[0].message.content 176 | 177 |     return answer 178 | 179 | answer = run_chatGPT(text) 180 | print_thread.join()  # make sure the transcript printout has finished before the answer is shown 181 | 182 | # ----------------------------------------------------------------------------------------- 183 | script_check_3 = time.time() 184 | check_3 = script_check_3 - script_start 185 | console.print(f'\n\n... 
Time: {round(check_3, 3)} seconds | ChatGPT Answer: \n') 186 | # ----------------------------------------------------------------------------------------- 187 | 188 | 189 | # Initialize the pyttsx3 engine 190 | engine = pyttsx3.init() 191 | engine.setProperty('rate', 110) 192 | 193 | 194 | # Define a function for speaking the answer 195 | def speak_answer(): 196 |     engine.say(answer) 197 |     engine.runAndWait() 198 | 199 | # Define a function for printing the answer 200 | # It prints the answer word by word; pressing Enter prints the rest of the answer instantly 201 | # (rich, time and sys are already imported at the top of the script) 202 | from pynput import keyboard 203 | 204 | def print_answer(answer): 205 |     break_program = False 206 | 207 |     def on_press(key): 208 |         nonlocal break_program 209 |         if key == keyboard.Key.enter: 210 |             console.print('Printout activated', style='bold red') 211 |             break_program = True 212 |             return False 213 | 214 |     listener = keyboard.Listener(on_press=on_press) 215 |     listener.start() 216 | 217 |     try: 218 |         words = answer.split() 219 |         for i, word in enumerate(words): 220 |             if break_program: 221 |                 # Enter was pressed: print the rest of the answer at once 222 |                 console.print(' '.join(words[i:])) 223 |                 break 224 |             console.print(word, end=' ') 225 |             sys.stdout.flush() 226 |             time.sleep(0.30) 227 |         console.print('\nPrintout completed', style='bold green') 228 |     finally: 229 |         listener.stop() 230 |         listener.join() 231 | 232 | 233 | # Speak the answer in a background thread while it is printed in the foreground 234 | speak_thread = threading.Thread(target=speak_answer) 235 | speak_thread.start() 236 | 237 | print_answer(answer) 238 | speak_thread.join()  # wait until the spoken answer has finished 239 | 240 | # ----------------------------------------------------------------------------------------- 241 | script_check_4 = time.time() 242 | check_4 = script_check_4 - script_start 243 | console.print(f'''\n\n... Time: {round(check_4, 3)} seconds | Chat finished! 
\n''') 267 | 268 | # ----------------------------------------------------------------------------------------- 269 | time.sleep(2) 270 | 271 | # Restart the script 272 | # while True: 273 | 274 | # Restart the program 275 | # python = sys.executable 276 | #os.execl(python, python, *sys.argv, ) 277 | 278 | 279 | -------------------------------------------------------------------------------- /nlp_labelling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "id": "032fea65", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import spacy" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 9, 16 | "id": "ac1e8c4d", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "nlp = spacy.load(\"en_core_web_sm\")" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 15, 26 | "id": "f61ed47b", 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/plain": [ 32 | "spacy.tokens.doc.Doc" 33 | ] 34 | }, 35 | "execution_count": 15, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "story=\"The closest city to your location is Wajir in Kenya. Wajir (Somali: Wajeer) is the capital of the Wajir County of Kenya. It is situated in the former North Eastern Province\"\n", 42 | "doc = nlp(story)\n", 43 | "type(doc)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 16, 49 | "id": "81cf56c2", 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "The DET det\n", 57 | "closest ADJ amod\n", 58 | "city NOUN nsubj\n", 59 | "to ADP prep\n", 60 | "your PRON poss\n", 61 | "location NOUN pobj\n", 62 | "is AUX ROOT\n", 63 | "Wajir PROPN attr\n", 64 | "in ADP prep\n", 65 | "Kenya PROPN pobj\n", 66 | ". PUNCT punct\n", 67 | "Wajir PROPN nsubj\n", 68 | "( PUNCT punct\n", 69 | "Somali PROPN nmod\n", 70 | ": PUNCT punct\n", 71 | "Wajeer PROPN appos\n", 72 | ") PUNCT punct\n", 73 | "is AUX ROOT\n", 74 | "the DET det\n", 75 | "capital NOUN attr\n", 76 | "of ADP prep\n", 77 | "the DET det\n", 78 | "Wajir PROPN compound\n", 79 | "County PROPN pobj\n", 80 | "of ADP prep\n", 81 | "Kenya PROPN pobj\n", 82 | ". PUNCT punct\n", 83 | "It PRON nsubjpass\n", 84 | "is AUX auxpass\n", 85 | "situated VERB ROOT\n", 86 | "in ADP prep\n", 87 | "the DET det\n", 88 | "former ADJ amod\n", 89 | "North PROPN compound\n", 90 | "Eastern PROPN compound\n", 91 | "Province PROPN pobj\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "for token in doc:\n", 97 | " print(token.text, token.pos_, token.dep_)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 17, 103 | "id": "05288415", 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "'determiner'" 110 | ] 111 | }, 112 | "execution_count": 17, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "spacy.explain(\"det\")" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 20, 124 | "id": "131f158a", 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/html": [ 130 | "
\n", 131 | "\n", 144 | "\n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 
416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | "
TEXTLEMMAPOSDESCTAGDEPSHAPEALPHASTOP
0ThetheDETdeterminerDTdetXxxTrueTrue
1closestcloseADJadjectiveJJSamodxxxxTrueFalse
2citycityNOUNnounNNnsubjxxxxTrueFalse
3totoADPadpositionINprepxxTrueTrue
4youryourPRONpronounPRP$possxxxxTrueTrue
5locationlocationNOUNnounNNpobjxxxxTrueFalse
6isbeAUXauxiliaryVBZROOTxxTrueTrue
7WajirWajirPROPNproper nounNNPattrXxxxxTrueFalse
8ininADPadpositionINprepxxTrueTrue
9KenyaKenyaPROPNproper nounNNPpobjXxxxxTrueFalse
10..PUNCTpunctuation.punct.FalseFalse
11WajirWajirPROPNproper nounNNPnsubjXxxxxTrueFalse
12((PUNCTpunctuation-LRB-punct(FalseFalse
13SomaliSomaliPROPNproper nounNNPnmodXxxxxTrueFalse
14::PUNCTpunctuation:punct:FalseFalse
15WajeerWajeerPROPNproper nounNNPapposXxxxxTrueFalse
16))PUNCTpunctuation-RRB-punct)FalseFalse
17isbeAUXauxiliaryVBZROOTxxTrueTrue
18thetheDETdeterminerDTdetxxxTrueTrue
19capitalcapitalNOUNnounNNattrxxxxTrueFalse
20ofofADPadpositionINprepxxTrueTrue
21thetheDETdeterminerDTdetxxxTrueTrue
22WajirWajirPROPNproper nounNNPcompoundXxxxxTrueFalse
23CountyCountyPROPNproper nounNNPpobjXxxxxTrueFalse
24ofofADPadpositionINprepxxTrueTrue
25KenyaKenyaPROPNproper nounNNPpobjXxxxxTrueFalse
26..PUNCTpunctuation.punct.FalseFalse
27ItitPRONpronounPRPnsubjpassXxTrueTrue
28isbeAUXauxiliaryVBZauxpassxxTrueTrue
29situatedsituateVERBverbVBNROOTxxxxTrueFalse
30ininADPadpositionINprepxxTrueTrue
31thetheDETdeterminerDTdetxxxTrueTrue
32formerformerADJadjectiveJJamodxxxxTrueTrue
33NorthNorthPROPNproper nounNNPcompoundXxxxxTrueFalse
34EasternEasternPROPNproper nounNNPcompoundXxxxxTrueFalse
35ProvinceProvincePROPNproper nounNNPpobjXxxxxTrueFalse
\n", 594 | "
" 595 | ], 596 | "text/plain": [ 597 | " TEXT LEMMA POS DESC TAG DEP SHAPE ALPHA \\\n", 598 | "0 The the DET determiner DT det Xxx True \n", 599 | "1 closest close ADJ adjective JJS amod xxxx True \n", 600 | "2 city city NOUN noun NN nsubj xxxx True \n", 601 | "3 to to ADP adposition IN prep xx True \n", 602 | "4 your your PRON pronoun PRP$ poss xxxx True \n", 603 | "5 location location NOUN noun NN pobj xxxx True \n", 604 | "6 is be AUX auxiliary VBZ ROOT xx True \n", 605 | "7 Wajir Wajir PROPN proper noun NNP attr Xxxxx True \n", 606 | "8 in in ADP adposition IN prep xx True \n", 607 | "9 Kenya Kenya PROPN proper noun NNP pobj Xxxxx True \n", 608 | "10 . . PUNCT punctuation . punct . False \n", 609 | "11 Wajir Wajir PROPN proper noun NNP nsubj Xxxxx True \n", 610 | "12 ( ( PUNCT punctuation -LRB- punct ( False \n", 611 | "13 Somali Somali PROPN proper noun NNP nmod Xxxxx True \n", 612 | "14 : : PUNCT punctuation : punct : False \n", 613 | "15 Wajeer Wajeer PROPN proper noun NNP appos Xxxxx True \n", 614 | "16 ) ) PUNCT punctuation -RRB- punct ) False \n", 615 | "17 is be AUX auxiliary VBZ ROOT xx True \n", 616 | "18 the the DET determiner DT det xxx True \n", 617 | "19 capital capital NOUN noun NN attr xxxx True \n", 618 | "20 of of ADP adposition IN prep xx True \n", 619 | "21 the the DET determiner DT det xxx True \n", 620 | "22 Wajir Wajir PROPN proper noun NNP compound Xxxxx True \n", 621 | "23 County County PROPN proper noun NNP pobj Xxxxx True \n", 622 | "24 of of ADP adposition IN prep xx True \n", 623 | "25 Kenya Kenya PROPN proper noun NNP pobj Xxxxx True \n", 624 | "26 . . PUNCT punctuation . punct . False \n", 625 | "27 It it PRON pronoun PRP nsubjpass Xx True \n", 626 | "28 is be AUX auxiliary VBZ auxpass xx True \n", 627 | "29 situated situate VERB verb VBN ROOT xxxx True \n", 628 | "30 in in ADP adposition IN prep xx True \n", 629 | "31 the the DET determiner DT det xxx True \n", 630 | "32 former former ADJ adjective JJ amod xxxx True \n", 631 | "33 North North PROPN proper noun NNP compound Xxxxx True \n", 632 | "34 Eastern Eastern PROPN proper noun NNP compound Xxxxx True \n", 633 | "35 Province Province PROPN proper noun NNP pobj Xxxxx True \n", 634 | "\n", 635 | " STOP \n", 636 | "0 True \n", 637 | "1 False \n", 638 | "2 False \n", 639 | "3 True \n", 640 | "4 True \n", 641 | "5 False \n", 642 | "6 True \n", 643 | "7 False \n", 644 | "8 True \n", 645 | "9 False \n", 646 | "10 False \n", 647 | "11 False \n", 648 | "12 False \n", 649 | "13 False \n", 650 | "14 False \n", 651 | "15 False \n", 652 | "16 False \n", 653 | "17 True \n", 654 | "18 True \n", 655 | "19 False \n", 656 | "20 True \n", 657 | "21 True \n", 658 | "22 False \n", 659 | "23 False \n", 660 | "24 True \n", 661 | "25 False \n", 662 | "26 False \n", 663 | "27 True \n", 664 | "28 True \n", 665 | "29 False \n", 666 | "30 True \n", 667 | "31 True \n", 668 | "32 True \n", 669 | "33 False \n", 670 | "34 False \n", 671 | "35 False " 672 | ] 673 | }, 674 | "execution_count": 20, 675 | "metadata": {}, 676 | "output_type": "execute_result" 677 | } 678 | ], 679 | "source": [ 680 | "import pandas as pd\n", 681 | "d = []\n", 682 | "for token in doc:\n", 683 | " d.append({'TEXT': token.text, 'LEMMA': token.lemma_, \n", 684 | " 'POS': token.pos_, 'DESC':spacy.explain(token.pos_),\n", 685 | " 'TAG': token.tag_,\n", 686 | " 'DEP': token.dep_, 'SHAPE': token.shape_,\n", 687 | " 'ALPHA': token.is_alpha, 'STOP': token.is_stop})\n", 688 | "\n", 689 | "spacy_dataframe = pd.DataFrame(d)\n", 690 | "spacy_dataframe" 691 | ] 692 | }, 
693 | { 694 | "cell_type": "code", 695 | "execution_count": 21, 696 | "id": "1cfe12a1", 697 | "metadata": {}, 698 | "outputs": [ 699 | { 700 | "data": { 701 | "text/html": [ 702 | "(displacy dependency-parse SVG omitted: its markup was stripped during export; it visualises the POS tags and dependency arcs already listed in the token table above)\n", 1081 | "" 1082 | ], 1083 | "text/plain": [ 1084 | "<IPython.core.display.HTML object>" 1085 | ] 1086 | }, 1087 | "metadata": {}, 1088 | "output_type": "display_data" 1089 | } 1090 | ], 1091 | "source": [ 1092 | "from spacy import displacy\n", 1093 | "\n", 1094 | "# Show POS tags and syntactic dependencies\n", 1095 | "displacy.render(doc, style=\"dep\", jupyter=True, options={'distance': 100})" 1096 | ] 1097 | }, 1098 | { 1099 | "cell_type": "markdown", 1100 | "id": "67356e31", 1101 | "metadata": {}, 1102 | "source": [ 1103 | "# Categorised Words" 1104 | ] 1105 | }, 1106 | { 1107 | "cell_type": "code", 1108 | "execution_count": 22, 1109 | "id": "73f46389", 1110 | "metadata": {}, 1111 | "outputs": [ 1112 | { 1113 | "name": "stdout", 1114 | "output_type": "stream", 1115 | "text": [ 1116 | "Wajir 37 42 ORG\n", 1117 | "Kenya 46 51 GPE\n", 1118 | "Wajir 53 58 PERSON\n", 1119 | "the Wajir County 94 110 GPE\n", 1120 | "Kenya 114 119 GPE\n", 1121 | "North Eastern Province 150 172 GPE\n" 1122 | ] 1123 | } 1124 | ], 1125 | "source": [ 1126 | "for ent in doc.ents:\n", 1127 | "    print(ent.text, ent.start_char, ent.end_char, ent.label_)" 1128 | ] 1129 | }, 1130 | { 1131 | "cell_type": "code", 1132 | "execution_count": 23, 1133 | "id": "5614619f", 1134 | "metadata": {}, 1135 | "outputs": [ 1136 | { 1137 | "data": { 1138 | "text/html": [ 1139 | "(displacy entity rendering omitted: its markup was stripped during export; it highlights the same ORG, GPE and PERSON spans that the previous cell prints)\n", 1169 | "
" 1170 | ], 1171 | "text/plain": [ 1172 | "" 1173 | ] 1174 | }, 1175 | "metadata": {}, 1176 | "output_type": "display_data" 1177 | } 1178 | ], 1179 | "source": [ 1180 | "displacy.render(doc, style=\"ent\", jupyter=True)" 1181 | ] 1182 | }, 1183 | { 1184 | "cell_type": "code", 1185 | "execution_count": 27, 1186 | "id": "7bd1e6de", 1187 | "metadata": {}, 1188 | "outputs": [], 1189 | "source": [ 1190 | "import spacy\n", 1191 | "nlp2 = spacy.load(\"en_core_web_md\")\n" 1192 | ] 1193 | }, 1194 | { 1195 | "cell_type": "code", 1196 | "execution_count": 38, 1197 | "id": "9923cb4c", 1198 | "metadata": {}, 1199 | "outputs": [], 1200 | "source": [ 1201 | "#Needed Functions\n", 1202 | "from collections import Counter\n", 1203 | "import spacy\n", 1204 | "from spacy.lang.en.stop_words import STOP_WORDS\n", 1205 | "\n", 1206 | "def lemmatize(doc):\n", 1207 | " return [token.lemma_ for token in doc if not\n", 1208 | " (token.is_punct or token.is_space or token.lower_ in STOP_WORDS)]\n", 1209 | "\n", 1210 | "def tf(w, doc):\n", 1211 | " c = 0\n", 1212 | " for token in doc:\n", 1213 | " if token.lemma_ == w:\n", 1214 | " c = c + 1\n", 1215 | " return c\n", 1216 | "\n", 1217 | "def idf(w,docs):\n", 1218 | " c = 0\n", 1219 | " num_docs = len(docs)\n", 1220 | " for doc in docs:\n", 1221 | " if tf(w, doc) > 0:\n", 1222 | " c = c + 1\n", 1223 | " return c / num_docs" 1224 | ] 1225 | }, 1226 | { 1227 | "cell_type": "code", 1228 | "execution_count": 39, 1229 | "id": "c5373530", 1230 | "metadata": {}, 1231 | "outputs": [ 1232 | { 1233 | "name": "stdout", 1234 | "output_type": "stream", 1235 | "text": [ 1236 | "The closest city to your location is Wajir in Kenya. Wajir (Somali: Wajeer) is the capital of the Wajir County of Kenya. It is situated in the former North Eastern Province\n" 1237 | ] 1238 | } 1239 | ], 1240 | "source": [ 1241 | "print(doc)" 1242 | ] 1243 | }, 1244 | { 1245 | "cell_type": "code", 1246 | "execution_count": 44, 1247 | "id": "3f8aec89", 1248 | "metadata": {}, 1249 | "outputs": [ 1250 | { 1251 | "data": { 1252 | "text/plain": [ 1253 | "2" 1254 | ] 1255 | }, 1256 | "execution_count": 44, 1257 | "metadata": {}, 1258 | "output_type": "execute_result" 1259 | } 1260 | ], 1261 | "source": [ 1262 | "#lemmatize(doc)\n", 1263 | "tf(\"Kenya\",doc)" 1264 | ] 1265 | }, 1266 | { 1267 | "cell_type": "code", 1268 | "execution_count": 56, 1269 | "id": "0cf32902", 1270 | "metadata": {}, 1271 | "outputs": [], 1272 | "source": [ 1273 | "from string import punctuation\n", 1274 | "from collections import Counter\n", 1275 | "from heapq import nlargest" 1276 | ] 1277 | }, 1278 | { 1279 | "cell_type": "code", 1280 | "execution_count": 61, 1281 | "id": "17da1522", 1282 | "metadata": {}, 1283 | "outputs": [ 1284 | { 1285 | "data": { 1286 | "text/plain": [ 1287 | "3" 1288 | ] 1289 | }, 1290 | "execution_count": 61, 1291 | "metadata": {}, 1292 | "output_type": "execute_result" 1293 | } 1294 | ], 1295 | "source": [ 1296 | "#How many Sentences\n", 1297 | "len(list(doc.sents))" 1298 | ] 1299 | }, 1300 | { 1301 | "cell_type": "code", 1302 | "execution_count": 63, 1303 | "id": "db1a54dd", 1304 | "metadata": {}, 1305 | "outputs": [], 1306 | "source": [ 1307 | "keyword = [ ]\n", 1308 | "stopwords = list(STOP_WORDS)\n", 1309 | "pos_tag = [ 'PROPN','ADJ', 'NOUN', 'VERB']\n", 1310 | "for token in doc:\n", 1311 | " if(token.text in stopwords or token.text in punctuation):\n", 1312 | " continue\n", 1313 | " if(token.pos_ in pos_tag):\n", 1314 | " keyword. append (token. 
text)" 1315 | ] 1316 | }, 1317 | { 1318 | "cell_type": "code", 1319 | "execution_count": 68, 1320 | "id": "1e5bc7f0", 1321 | "metadata": {}, 1322 | "outputs": [ 1323 | { 1324 | "data": { 1325 | "text/plain": [ 1326 | "Counter({The: 1,\n", 1327 | " closest: 1,\n", 1328 | " city: 1,\n", 1329 | " to: 1,\n", 1330 | " your: 1,\n", 1331 | " location: 1,\n", 1332 | " is: 1,\n", 1333 | " Wajir: 1,\n", 1334 | " in: 1,\n", 1335 | " Kenya: 1,\n", 1336 | " .: 1,\n", 1337 | " Wajir: 1,\n", 1338 | " (: 1,\n", 1339 | " Somali: 1,\n", 1340 | " :: 1,\n", 1341 | " Wajeer: 1,\n", 1342 | " ): 1,\n", 1343 | " is: 1,\n", 1344 | " the: 1,\n", 1345 | " capital: 1,\n", 1346 | " of: 1,\n", 1347 | " the: 1,\n", 1348 | " Wajir: 1,\n", 1349 | " County: 1,\n", 1350 | " of: 1,\n", 1351 | " Kenya: 1,\n", 1352 | " .: 1,\n", 1353 | " It: 1,\n", 1354 | " is: 1,\n", 1355 | " situated: 1,\n", 1356 | " in: 1,\n", 1357 | " the: 1,\n", 1358 | " former: 1,\n", 1359 | " North: 1,\n", 1360 | " Eastern: 1,\n", 1361 | " Province: 1})" 1362 | ] 1363 | }, 1364 | "execution_count": 68, 1365 | "metadata": {}, 1366 | "output_type": "execute_result" 1367 | } 1368 | ], 1369 | "source": [ 1370 | "freq=Counter(doc)\n", 1371 | "freq" 1372 | ] 1373 | }, 1374 | { 1375 | "cell_type": "code", 1376 | "execution_count": null, 1377 | "id": "877feb87", 1378 | "metadata": {}, 1379 | "outputs": [], 1380 | "source": [] 1381 | } 1382 | ], 1383 | "metadata": { 1384 | "kernelspec": { 1385 | "display_name": "Python 3 (ipykernel)", 1386 | "language": "python", 1387 | "name": "python3" 1388 | }, 1389 | "language_info": { 1390 | "codemirror_mode": { 1391 | "name": "ipython", 1392 | "version": 3 1393 | }, 1394 | "file_extension": ".py", 1395 | "mimetype": "text/x-python", 1396 | "name": "python", 1397 | "nbconvert_exporter": "python", 1398 | "pygments_lexer": "ipython3", 1399 | "version": "3.10.7" 1400 | } 1401 | }, 1402 | "nbformat": 4, 1403 | "nbformat_minor": 5 1404 | } 1405 | --------------------------------------------------------------------------------